git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "png.h"
  42
  43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  44 uint32_t ff_squareTbl[512] = {0, };
  45
  46 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  47 #define pb_7f (~0UL/255 * 0x7f)
  48 #define pb_80 (~0UL/255 * 0x80)
  49
  50 const uint8_t ff_zigzag_direct[64] = {
  51     0,   1,  8, 16,  9,  2,  3, 10,
  52     17, 24, 32, 25, 18, 11,  4,  5,
  53     12, 19, 26, 33, 40, 48, 41, 34,
  54     27, 20, 13,  6,  7, 14, 21, 28,
  55     35, 42, 49, 56, 57, 50, 43, 36,
  56     29, 22, 15, 23, 30, 37, 44, 51,
  57     58, 59, 52, 45, 38, 31, 39, 46,
  58     53, 60, 61, 54, 47, 55, 62, 63
  59 };
  60
  61 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  62    specification, we interleave the fields */
  63 const uint8_t ff_zigzag248_direct[64] = {
  64      0,  8,  1,  9, 16, 24,  2, 10,
  65     17, 25, 32, 40, 48, 56, 33, 41,
  66     18, 26,  3, 11,  4, 12, 19, 27,
  67     34, 42, 49, 57, 50, 58, 35, 43,
  68     20, 28,  5, 13,  6, 14, 21, 29,
  69     36, 44, 51, 59, 52, 60, 37, 45,
  70     22, 30,  7, 15, 23, 31, 38, 46,
  71     53, 61, 54, 62, 39, 47, 55, 63,
  72 };
  73
/* Inverse of ff_zigzag_direct (not permuted), plus 1, for the MMX quantizer */
  75 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  76
  77 const uint8_t ff_alternate_horizontal_scan[64] = {
  78     0,  1,   2,  3,  8,  9, 16, 17,
  79     10, 11,  4,  5,  6,  7, 15, 14,
  80     13, 12, 19, 18, 24, 25, 32, 33,
  81     26, 27, 20, 21, 22, 23, 28, 29,
  82     30, 31, 34, 35, 40, 41, 48, 49,
  83     42, 43, 36, 37, 38, 39, 44, 45,
  84     46, 47, 50, 51, 56, 57, 58, 59,
  85     52, 53, 54, 55, 60, 61, 62, 63,
  86 };
  87
  88 const uint8_t ff_alternate_vertical_scan[64] = {
  89     0,  8,  16, 24,  1,  9,  2, 10,
  90     17, 25, 32, 40, 48, 56, 57, 49,
  91     41, 33, 26, 18,  3, 11,  4, 12,
  92     19, 27, 34, 42, 50, 58, 35, 43,
  93     51, 59, 20, 28,  5, 13,  6, 14,
  94     21, 29, 36, 44, 52, 60, 37, 45,
  95     53, 61, 22, 30,  7, 15, 23, 31,
  96     38, 46, 54, 62, 39, 47, 55, 63,
  97 };
  98
  99 /* Input permutation for the simple_idct_mmx */
 100 static const uint8_t simple_mmx_permutation[64]={
 101         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 102         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 103         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 104         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 105         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 106         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 107         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 108         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 109 };
 110
 111 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 112
 113 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 114     int i;
 115     int end;
 116
 117     st->scantable= src_scantable;
 118
 119     for(i=0; i<64; i++){
 120         int j;
 121         j = src_scantable[i];
 122         st->permutated[i] = permutation[j];
 123 #if ARCH_PPC
 124         st->inverse[j] = i;
 125 #endif
 126     }
 127
 128     end=-1;
 129     for(i=0; i<64; i++){
 130         int j;
 131         j = st->permutated[i];
 132         if(j>end) end=j;
 133         st->raster_end[i]= end;
 134     }
 135 }
 136
 137 static int pix_sum_c(uint8_t * pix, int line_size)
 138 {
 139     int s, i, j;
 140
 141     s = 0;
 142     for (i = 0; i < 16; i++) {
 143         for (j = 0; j < 16; j += 8) {
 144             s += pix[0];
 145             s += pix[1];
 146             s += pix[2];
 147             s += pix[3];
 148             s += pix[4];
 149             s += pix[5];
 150             s += pix[6];
 151             s += pix[7];
 152             pix += 8;
 153         }
 154         pix += line_size - 16;
 155     }
 156     return s;
 157 }
 158
 159 static int pix_norm1_c(uint8_t * pix, int line_size)
 160 {
 161     int s, i, j;
 162     uint32_t *sq = ff_squareTbl + 256;
 163
 164     s = 0;
 165     for (i = 0; i < 16; i++) {
 166         for (j = 0; j < 16; j += 8) {
 167 #if 0
 168             s += sq[pix[0]];
 169             s += sq[pix[1]];
 170             s += sq[pix[2]];
 171             s += sq[pix[3]];
 172             s += sq[pix[4]];
 173             s += sq[pix[5]];
 174             s += sq[pix[6]];
 175             s += sq[pix[7]];
 176 #else
 177 #if LONG_MAX > 2147483647
 178             register uint64_t x=*(uint64_t*)pix;
 179             s += sq[x&0xff];
 180             s += sq[(x>>8)&0xff];
 181             s += sq[(x>>16)&0xff];
 182             s += sq[(x>>24)&0xff];
 183             s += sq[(x>>32)&0xff];
 184             s += sq[(x>>40)&0xff];
 185             s += sq[(x>>48)&0xff];
 186             s += sq[(x>>56)&0xff];
 187 #else
 188             register uint32_t x=*(uint32_t*)pix;
 189             s += sq[x&0xff];
 190             s += sq[(x>>8)&0xff];
 191             s += sq[(x>>16)&0xff];
 192             s += sq[(x>>24)&0xff];
 193             x=*(uint32_t*)(pix+4);
 194             s += sq[x&0xff];
 195             s += sq[(x>>8)&0xff];
 196             s += sq[(x>>16)&0xff];
 197             s += sq[(x>>24)&0xff];
 198 #endif
 199 #endif
 200             pix += 8;
 201         }
 202         pix += line_size - 16;
 203     }
 204     return s;
 205 }
 206
 207 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 208     int i;
 209
 210     for(i=0; i+8<=w; i+=8){
 211         dst[i+0]= av_bswap32(src[i+0]);
 212         dst[i+1]= av_bswap32(src[i+1]);
 213         dst[i+2]= av_bswap32(src[i+2]);
 214         dst[i+3]= av_bswap32(src[i+3]);
 215         dst[i+4]= av_bswap32(src[i+4]);
 216         dst[i+5]= av_bswap32(src[i+5]);
 217         dst[i+6]= av_bswap32(src[i+6]);
 218         dst[i+7]= av_bswap32(src[i+7]);
 219     }
 220     for(;i<w; i++){
 221         dst[i+0]= av_bswap32(src[i+0]);
 222     }
 223 }
 224
 225 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 226 {
 227     while (len--)
 228         *dst++ = av_bswap16(*src++);
 229 }
 230
 231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 232 {
 233     int s, i;
 234     uint32_t *sq = ff_squareTbl + 256;
 235
 236     s = 0;
 237     for (i = 0; i < h; i++) {
 238         s += sq[pix1[0] - pix2[0]];
 239         s += sq[pix1[1] - pix2[1]];
 240         s += sq[pix1[2] - pix2[2]];
 241         s += sq[pix1[3] - pix2[3]];
 242         pix1 += line_size;
 243         pix2 += line_size;
 244     }
 245     return s;
 246 }
 247
 248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 249 {
 250     int s, i;
 251     uint32_t *sq = ff_squareTbl + 256;
 252
 253     s = 0;
 254     for (i = 0; i < h; i++) {
 255         s += sq[pix1[0] - pix2[0]];
 256         s += sq[pix1[1] - pix2[1]];
 257         s += sq[pix1[2] - pix2[2]];
 258         s += sq[pix1[3] - pix2[3]];
 259         s += sq[pix1[4] - pix2[4]];
 260         s += sq[pix1[5] - pix2[5]];
 261         s += sq[pix1[6] - pix2[6]];
 262         s += sq[pix1[7] - pix2[7]];
 263         pix1 += line_size;
 264         pix2 += line_size;
 265     }
 266     return s;
 267 }
 268
 269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 270 {
 271     int s, i;
 272     uint32_t *sq = ff_squareTbl + 256;
 273
 274     s = 0;
 275     for (i = 0; i < h; i++) {
 276         s += sq[pix1[ 0] - pix2[ 0]];
 277         s += sq[pix1[ 1] - pix2[ 1]];
 278         s += sq[pix1[ 2] - pix2[ 2]];
 279         s += sq[pix1[ 3] - pix2[ 3]];
 280         s += sq[pix1[ 4] - pix2[ 4]];
 281         s += sq[pix1[ 5] - pix2[ 5]];
 282         s += sq[pix1[ 6] - pix2[ 6]];
 283         s += sq[pix1[ 7] - pix2[ 7]];
 284         s += sq[pix1[ 8] - pix2[ 8]];
 285         s += sq[pix1[ 9] - pix2[ 9]];
 286         s += sq[pix1[10] - pix2[10]];
 287         s += sq[pix1[11] - pix2[11]];
 288         s += sq[pix1[12] - pix2[12]];
 289         s += sq[pix1[13] - pix2[13]];
 290         s += sq[pix1[14] - pix2[14]];
 291         s += sq[pix1[15] - pix2[15]];
 292
 293         pix1 += line_size;
 294         pix2 += line_size;
 295     }
 296     return s;
 297 }
 298
 299 /* draw the edges of width 'w' of an image of size width, height */
 300 //FIXME check that this is ok for mpeg4 interlaced
 301 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w, int sides)
 302 {
 303     uint8_t *ptr, *last_line;
 304     int i;
 305
 306     last_line = buf + (height - 1) * wrap;
 307     for(i=0;i<w;i++) {
 308         /* top and bottom */
 309         if (sides&EDGE_TOP)    memcpy(buf - (i + 1) * wrap, buf, width);
 310         if (sides&EDGE_BOTTOM) memcpy(last_line + (i + 1) * wrap, last_line, width);
 311     }
 312     /* left and right */
 313     ptr = buf;
 314     for(i=0;i<height;i++) {
 315         memset(ptr - w, ptr[0], w);
 316         memset(ptr + width, ptr[width-1], w);
 317         ptr += wrap;
 318     }
 319     /* corners */
 320     for(i=0;i<w;i++) {
 321         if (sides&EDGE_TOP) {
 322             memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 323             memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 324         }
 325
 326         if (sides&EDGE_BOTTOM) {
 327             memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 328             memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 329         }
 330     }
 331 }
 332
 333 /**
 334  * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 335  * @param buf destination buffer
 336  * @param src source buffer
 337  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 338  * @param block_w width of block
 339  * @param block_h height of block
 340  * @param src_x x coordinate of the top left sample of the block in the source buffer
 341  * @param src_y y coordinate of the top left sample of the block in the source buffer
 342  * @param w width of the source buffer
 343  * @param h height of the source buffer
 344  */
 345 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
 346                                     int src_x, int src_y, int w, int h){
 347     int x, y;
 348     int start_y, start_x, end_y, end_x;
 349
 350     if(src_y>= h){
 351         src+= (h-1-src_y)*linesize;
 352         src_y=h-1;
 353     }else if(src_y<=-block_h){
 354         src+= (1-block_h-src_y)*linesize;
 355         src_y=1-block_h;
 356     }
 357     if(src_x>= w){
 358         src+= (w-1-src_x);
 359         src_x=w-1;
 360     }else if(src_x<=-block_w){
 361         src+= (1-block_w-src_x);
 362         src_x=1-block_w;
 363     }
 364
 365     start_y= FFMAX(0, -src_y);
 366     start_x= FFMAX(0, -src_x);
 367     end_y= FFMIN(block_h, h-src_y);
 368     end_x= FFMIN(block_w, w-src_x);
 369     assert(start_y < end_y && block_h);
 370     assert(start_x < end_x && block_w);
 371
 372     w    = end_x - start_x;
 373     src += start_y*linesize + start_x;
 374     buf += start_x;
 375
 376     //top
 377     for(y=0; y<start_y; y++){
 378         memcpy(buf, src, w);
 379         buf += linesize;
 380     }
 381
 382     // copy existing part
 383     for(; y<end_y; y++){
 384         memcpy(buf, src, w);
 385         src += linesize;
 386         buf += linesize;
 387     }
 388
 389     //bottom
 390     src -= linesize;
 391     for(; y<block_h; y++){
 392         memcpy(buf, src, w);
 393         buf += linesize;
 394     }
 395
 396     buf -= block_h * linesize + start_x;
 397     while (block_h--){
 398        //left
 399         for(x=0; x<start_x; x++){
 400             buf[x] = buf[start_x];
 401         }
 402
 403        //right
 404         for(x=end_x; x<block_w; x++){
 405             buf[x] = buf[end_x - 1];
 406         }
 407         buf += linesize;
 408     }
 409 }
 410
 411 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 412 {
 413     int i;
 414
 415     /* read the pixels */
 416     for(i=0;i<8;i++) {
 417         block[0] = pixels[0];
 418         block[1] = pixels[1];
 419         block[2] = pixels[2];
 420         block[3] = pixels[3];
 421         block[4] = pixels[4];
 422         block[5] = pixels[5];
 423         block[6] = pixels[6];
 424         block[7] = pixels[7];
 425         pixels += line_size;
 426         block += 8;
 427     }
 428 }
 429
 430 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 431                           const uint8_t *s2, int stride){
 432     int i;
 433
 434     /* read the pixels */
 435     for(i=0;i<8;i++) {
 436         block[0] = s1[0] - s2[0];
 437         block[1] = s1[1] - s2[1];
 438         block[2] = s1[2] - s2[2];
 439         block[3] = s1[3] - s2[3];
 440         block[4] = s1[4] - s2[4];
 441         block[5] = s1[5] - s2[5];
 442         block[6] = s1[6] - s2[6];
 443         block[7] = s1[7] - s2[7];
 444         s1 += stride;
 445         s2 += stride;
 446         block += 8;
 447     }
 448 }
 449
 450
 451 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 452                              int line_size)
 453 {
 454     int i;
 455     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 456
 457     /* read the pixels */
 458     for(i=0;i<8;i++) {
 459         pixels[0] = cm[block[0]];
 460         pixels[1] = cm[block[1]];
 461         pixels[2] = cm[block[2]];
 462         pixels[3] = cm[block[3]];
 463         pixels[4] = cm[block[4]];
 464         pixels[5] = cm[block[5]];
 465         pixels[6] = cm[block[6]];
 466         pixels[7] = cm[block[7]];
 467
 468         pixels += line_size;
 469         block += 8;
 470     }
 471 }
 472
 473 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 474                                  int line_size)
 475 {
 476     int i;
 477     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 478
 479     /* read the pixels */
 480     for(i=0;i<4;i++) {
 481         pixels[0] = cm[block[0]];
 482         pixels[1] = cm[block[1]];
 483         pixels[2] = cm[block[2]];
 484         pixels[3] = cm[block[3]];
 485
 486         pixels += line_size;
 487         block += 8;
 488     }
 489 }
 490
 491 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 492                                  int line_size)
 493 {
 494     int i;
 495     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 496
 497     /* read the pixels */
 498     for(i=0;i<2;i++) {
 499         pixels[0] = cm[block[0]];
 500         pixels[1] = cm[block[1]];
 501
 502         pixels += line_size;
 503         block += 8;
 504     }
 505 }
 506
 507 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
 508                                     uint8_t *restrict pixels,
 509                                     int line_size)
 510 {
 511     int i, j;
 512
 513     for (i = 0; i < 8; i++) {
 514         for (j = 0; j < 8; j++) {
 515             if (*block < -128)
 516                 *pixels = 0;
 517             else if (*block > 127)
 518                 *pixels = 255;
 519             else
 520                 *pixels = (uint8_t)(*block + 128);
 521             block++;
 522             pixels++;
 523         }
 524         pixels += (line_size - 8);
 525     }
 526 }
 527
 528 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 529                                     int line_size)
 530 {
 531     int i;
 532
 533     /* read the pixels */
 534     for(i=0;i<8;i++) {
 535         pixels[0] = block[0];
 536         pixels[1] = block[1];
 537         pixels[2] = block[2];
 538         pixels[3] = block[3];
 539         pixels[4] = block[4];
 540         pixels[5] = block[5];
 541         pixels[6] = block[6];
 542         pixels[7] = block[7];
 543
 544         pixels += line_size;
 545         block += 8;
 546     }
 547 }
 548
 549 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 550                              int line_size)
 551 {
 552     int i;
 553     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 554
 555     /* read the pixels */
 556     for(i=0;i<8;i++) {
 557         pixels[0] = cm[pixels[0] + block[0]];
 558         pixels[1] = cm[pixels[1] + block[1]];
 559         pixels[2] = cm[pixels[2] + block[2]];
 560         pixels[3] = cm[pixels[3] + block[3]];
 561         pixels[4] = cm[pixels[4] + block[4]];
 562         pixels[5] = cm[pixels[5] + block[5]];
 563         pixels[6] = cm[pixels[6] + block[6]];
 564         pixels[7] = cm[pixels[7] + block[7]];
 565         pixels += line_size;
 566         block += 8;
 567     }
 568 }
 569
 570 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 571                           int line_size)
 572 {
 573     int i;
 574     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 575
 576     /* read the pixels */
 577     for(i=0;i<4;i++) {
 578         pixels[0] = cm[pixels[0] + block[0]];
 579         pixels[1] = cm[pixels[1] + block[1]];
 580         pixels[2] = cm[pixels[2] + block[2]];
 581         pixels[3] = cm[pixels[3] + block[3]];
 582         pixels += line_size;
 583         block += 8;
 584     }
 585 }
 586
 587 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 588                           int line_size)
 589 {
 590     int i;
 591     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 592
 593     /* read the pixels */
 594     for(i=0;i<2;i++) {
 595         pixels[0] = cm[pixels[0] + block[0]];
 596         pixels[1] = cm[pixels[1] + block[1]];
 597         pixels += line_size;
 598         block += 8;
 599     }
 600 }
 601
 602 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 603 {
 604     int i;
 605     for(i=0;i<8;i++) {
 606         pixels[0] += block[0];
 607         pixels[1] += block[1];
 608         pixels[2] += block[2];
 609         pixels[3] += block[3];
 610         pixels[4] += block[4];
 611         pixels[5] += block[5];
 612         pixels[6] += block[6];
 613         pixels[7] += block[7];
 614         pixels += line_size;
 615         block += 8;
 616     }
 617 }
 618
 619 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 620 {
 621     int i;
 622     for(i=0;i<4;i++) {
 623         pixels[0] += block[0];
 624         pixels[1] += block[1];
 625         pixels[2] += block[2];
 626         pixels[3] += block[3];
 627         pixels += line_size;
 628         block += 4;
 629     }
 630 }
 631
 632 static int sum_abs_dctelem_c(DCTELEM *block)
 633 {
 634     int sum=0, i;
 635     for(i=0; i<64; i++)
 636         sum+= FFABS(block[i]);
 637     return sum;
 638 }
 639
 640 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 641 {
 642     int i;
 643
 644     for (i = 0; i < h; i++) {
 645         memset(block, value, 16);
 646         block += line_size;
 647     }
 648 }
 649
 650 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 651 {
 652     int i;
 653
 654     for (i = 0; i < h; i++) {
 655         memset(block, value, 8);
 656         block += line_size;
 657     }
 658 }
 659
 660 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 661 {
 662     int i, j;
 663     uint16_t *dst1 = (uint16_t *) dst;
 664     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 665
 666     for (j = 0; j < 8; j++) {
 667         for (i = 0; i < 8; i++) {
 668             dst1[i] = dst2[i] = src[i] * 0x0101;
 669         }
 670         src  += 8;
 671         dst1 += linesize;
 672         dst2 += linesize;
 673     }
 674 }
 675
 676 #if 0
 677
 678 #define PIXOP2(OPNAME, OP) \
 679 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 680 {\
 681     int i;\
 682     for(i=0; i<h; i++){\
 683         OP(*((uint64_t*)block), AV_RN64(pixels));\
 684         pixels+=line_size;\
 685         block +=line_size;\
 686     }\
 687 }\
 688 \
 689 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 690 {\
 691     int i;\
 692     for(i=0; i<h; i++){\
 693         const uint64_t a= AV_RN64(pixels  );\
 694         const uint64_t b= AV_RN64(pixels+1);\
 695         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 696         pixels+=line_size;\
 697         block +=line_size;\
 698     }\
 699 }\
 700 \
 701 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 702 {\
 703     int i;\
 704     for(i=0; i<h; i++){\
 705         const uint64_t a= AV_RN64(pixels  );\
 706         const uint64_t b= AV_RN64(pixels+1);\
 707         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 708         pixels+=line_size;\
 709         block +=line_size;\
 710     }\
 711 }\
 712 \
 713 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 714 {\
 715     int i;\
 716     for(i=0; i<h; i++){\
 717         const uint64_t a= AV_RN64(pixels          );\
 718         const uint64_t b= AV_RN64(pixels+line_size);\
 719         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 720         pixels+=line_size;\
 721         block +=line_size;\
 722     }\
 723 }\
 724 \
 725 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 726 {\
 727     int i;\
 728     for(i=0; i<h; i++){\
 729         const uint64_t a= AV_RN64(pixels          );\
 730         const uint64_t b= AV_RN64(pixels+line_size);\
 731         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 732         pixels+=line_size;\
 733         block +=line_size;\
 734     }\
 735 }\
 736 \
 737 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 738 {\
 739         int i;\
 740         const uint64_t a= AV_RN64(pixels  );\
 741         const uint64_t b= AV_RN64(pixels+1);\
 742         uint64_t l0=  (a&0x0303030303030303ULL)\
 743                     + (b&0x0303030303030303ULL)\
 744                     + 0x0202020202020202ULL;\
 745         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 746                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 747         uint64_t l1,h1;\
 748 \
 749         pixels+=line_size;\
 750         for(i=0; i<h; i+=2){\
 751             uint64_t a= AV_RN64(pixels  );\
 752             uint64_t b= AV_RN64(pixels+1);\
 753             l1=  (a&0x0303030303030303ULL)\
 754                + (b&0x0303030303030303ULL);\
 755             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 756               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 757             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 758             pixels+=line_size;\
 759             block +=line_size;\
 760             a= AV_RN64(pixels  );\
 761             b= AV_RN64(pixels+1);\
 762             l0=  (a&0x0303030303030303ULL)\
 763                + (b&0x0303030303030303ULL)\
 764                + 0x0202020202020202ULL;\
 765             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 766               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 767             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 768             pixels+=line_size;\
 769             block +=line_size;\
 770         }\
 771 }\
 772 \
 773 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 774 {\
 775         int i;\
 776         const uint64_t a= AV_RN64(pixels  );\
 777         const uint64_t b= AV_RN64(pixels+1);\
 778         uint64_t l0=  (a&0x0303030303030303ULL)\
 779                     + (b&0x0303030303030303ULL)\
 780                     + 0x0101010101010101ULL;\
 781         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 782                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 783         uint64_t l1,h1;\
 784 \
 785         pixels+=line_size;\
 786         for(i=0; i<h; i+=2){\
 787             uint64_t a= AV_RN64(pixels  );\
 788             uint64_t b= AV_RN64(pixels+1);\
 789             l1=  (a&0x0303030303030303ULL)\
 790                + (b&0x0303030303030303ULL);\
 791             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 792               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 793             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 794             pixels+=line_size;\
 795             block +=line_size;\
 796             a= AV_RN64(pixels  );\
 797             b= AV_RN64(pixels+1);\
 798             l0=  (a&0x0303030303030303ULL)\
 799                + (b&0x0303030303030303ULL)\
 800                + 0x0101010101010101ULL;\
 801             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 802               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 803             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 804             pixels+=line_size;\
 805             block +=line_size;\
 806         }\
 807 }\
 808 \
 809 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 810 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 811 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 812 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 813 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 814 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 815 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 816
 817 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 818 #else // 64 bit variant
 819
 820 #define PIXOP2(OPNAME, OP) \
 821 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 822     int i;\
 823     for(i=0; i<h; i++){\
 824         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 825         pixels+=line_size;\
 826         block +=line_size;\
 827     }\
 828 }\
 829 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 830     int i;\
 831     for(i=0; i<h; i++){\
 832         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 833         pixels+=line_size;\
 834         block +=line_size;\
 835     }\
 836 }\
 837 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 838     int i;\
 839     for(i=0; i<h; i++){\
 840         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 841         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 842         pixels+=line_size;\
 843         block +=line_size;\
 844     }\
 845 }\
 846 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 847     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 848 }\
 849 \
 850 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 851                                                 int src_stride1, int src_stride2, int h){\
 852     int i;\
 853     for(i=0; i<h; i++){\
 854         uint32_t a,b;\
 855         a= AV_RN32(&src1[i*src_stride1  ]);\
 856         b= AV_RN32(&src2[i*src_stride2  ]);\
 857         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 858         a= AV_RN32(&src1[i*src_stride1+4]);\
 859         b= AV_RN32(&src2[i*src_stride2+4]);\
 860         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 861     }\
 862 }\
 863 \
 864 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 865                                                 int src_stride1, int src_stride2, int h){\
 866     int i;\
 867     for(i=0; i<h; i++){\
 868         uint32_t a,b;\
 869         a= AV_RN32(&src1[i*src_stride1  ]);\
 870         b= AV_RN32(&src2[i*src_stride2  ]);\
 871         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 872         a= AV_RN32(&src1[i*src_stride1+4]);\
 873         b= AV_RN32(&src2[i*src_stride2+4]);\
 874         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 875     }\
 876 }\
 877 \
 878 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 879                                                 int src_stride1, int src_stride2, int h){\
 880     int i;\
 881     for(i=0; i<h; i++){\
 882         uint32_t a,b;\
 883         a= AV_RN32(&src1[i*src_stride1  ]);\
 884         b= AV_RN32(&src2[i*src_stride2  ]);\
 885         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 886     }\
 887 }\
 888 \
 889 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 890                                                 int src_stride1, int src_stride2, int h){\
 891     int i;\
 892     for(i=0; i<h; i++){\
 893         uint32_t a,b;\
 894         a= AV_RN16(&src1[i*src_stride1  ]);\
 895         b= AV_RN16(&src2[i*src_stride2  ]);\
 896         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 897     }\
 898 }\
 899 \
 900 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 901                                                 int src_stride1, int src_stride2, int h){\
 902     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 903     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 904 }\
 905 \
 906 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 907                                                 int src_stride1, int src_stride2, int h){\
 908     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 909     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 910 }\
 911 \
 912 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 913     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 914 }\
 915 \
 916 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 917     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 918 }\
 919 \
 920 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 921     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 922 }\
 923 \
 924 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 925     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 926 }\
 927 \
 928 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 929                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 930     int i;\
 931     for(i=0; i<h; i++){\
 932         uint32_t a, b, c, d, l0, l1, h0, h1;\
 933         a= AV_RN32(&src1[i*src_stride1]);\
 934         b= AV_RN32(&src2[i*src_stride2]);\
 935         c= AV_RN32(&src3[i*src_stride3]);\
 936         d= AV_RN32(&src4[i*src_stride4]);\
 937         l0=  (a&0x03030303UL)\
 938            + (b&0x03030303UL)\
 939            + 0x02020202UL;\
 940         h0= ((a&0xFCFCFCFCUL)>>2)\
 941           + ((b&0xFCFCFCFCUL)>>2);\
 942         l1=  (c&0x03030303UL)\
 943            + (d&0x03030303UL);\
 944         h1= ((c&0xFCFCFCFCUL)>>2)\
 945           + ((d&0xFCFCFCFCUL)>>2);\
 946         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 947         a= AV_RN32(&src1[i*src_stride1+4]);\
 948         b= AV_RN32(&src2[i*src_stride2+4]);\
 949         c= AV_RN32(&src3[i*src_stride3+4]);\
 950         d= AV_RN32(&src4[i*src_stride4+4]);\
 951         l0=  (a&0x03030303UL)\
 952            + (b&0x03030303UL)\
 953            + 0x02020202UL;\
 954         h0= ((a&0xFCFCFCFCUL)>>2)\
 955           + ((b&0xFCFCFCFCUL)>>2);\
 956         l1=  (c&0x03030303UL)\
 957            + (d&0x03030303UL);\
 958         h1= ((c&0xFCFCFCFCUL)>>2)\
 959           + ((d&0xFCFCFCFCUL)>>2);\
 960         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 961     }\
 962 }\
 963 \
 964 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 965     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 966 }\
 967 \
 968 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 969     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 970 }\
 971 \
 972 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 973     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 974 }\
 975 \
 976 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 977     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 978 }\
 979 \
 980 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 981                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 982     int i;\
 983     for(i=0; i<h; i++){\
 984         uint32_t a, b, c, d, l0, l1, h0, h1;\
 985         a= AV_RN32(&src1[i*src_stride1]);\
 986         b= AV_RN32(&src2[i*src_stride2]);\
 987         c= AV_RN32(&src3[i*src_stride3]);\
 988         d= AV_RN32(&src4[i*src_stride4]);\
 989         l0=  (a&0x03030303UL)\
 990            + (b&0x03030303UL)\
 991            + 0x01010101UL;\
 992         h0= ((a&0xFCFCFCFCUL)>>2)\
 993           + ((b&0xFCFCFCFCUL)>>2);\
 994         l1=  (c&0x03030303UL)\
 995            + (d&0x03030303UL);\
 996         h1= ((c&0xFCFCFCFCUL)>>2)\
 997           + ((d&0xFCFCFCFCUL)>>2);\
 998         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 999         a= AV_RN32(&src1[i*src_stride1+4]);\
1000         b= AV_RN32(&src2[i*src_stride2+4]);\
1001         c= AV_RN32(&src3[i*src_stride3+4]);\
1002         d= AV_RN32(&src4[i*src_stride4+4]);\
1003         l0=  (a&0x03030303UL)\
1004            + (b&0x03030303UL)\
1005            + 0x01010101UL;\
1006         h0= ((a&0xFCFCFCFCUL)>>2)\
1007           + ((b&0xFCFCFCFCUL)>>2);\
1008         l1=  (c&0x03030303UL)\
1009            + (d&0x03030303UL);\
1010         h1= ((c&0xFCFCFCFCUL)>>2)\
1011           + ((d&0xFCFCFCFCUL)>>2);\
1012         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1013     }\
1014 }\
1015 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1016                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1017     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1018     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1019 }\
1020 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1021                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1022     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1023     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1024 }\
1025 \
1026 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1027 {\
1028         int i, a0, b0, a1, b1;\
1029         a0= pixels[0];\
1030         b0= pixels[1] + 2;\
1031         a0 += b0;\
1032         b0 += pixels[2];\
1033 \
1034         pixels+=line_size;\
1035         for(i=0; i<h; i+=2){\
1036             a1= pixels[0];\
1037             b1= pixels[1];\
1038             a1 += b1;\
1039             b1 += pixels[2];\
1040 \
1041             block[0]= (a1+a0)>>2; /* FIXME non put */\
1042             block[1]= (b1+b0)>>2;\
1043 \
1044             pixels+=line_size;\
1045             block +=line_size;\
1046 \
1047             a0= pixels[0];\
1048             b0= pixels[1] + 2;\
1049             a0 += b0;\
1050             b0 += pixels[2];\
1051 \
1052             block[0]= (a1+a0)>>2;\
1053             block[1]= (b1+b0)>>2;\
1054             pixels+=line_size;\
1055             block +=line_size;\
1056         }\
1057 }\
1058 \
1059 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1060 {\
1061         int i;\
1062         const uint32_t a= AV_RN32(pixels  );\
1063         const uint32_t b= AV_RN32(pixels+1);\
1064         uint32_t l0=  (a&0x03030303UL)\
1065                     + (b&0x03030303UL)\
1066                     + 0x02020202UL;\
1067         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1068                    + ((b&0xFCFCFCFCUL)>>2);\
1069         uint32_t l1,h1;\
1070 \
1071         pixels+=line_size;\
1072         for(i=0; i<h; i+=2){\
1073             uint32_t a= AV_RN32(pixels  );\
1074             uint32_t b= AV_RN32(pixels+1);\
1075             l1=  (a&0x03030303UL)\
1076                + (b&0x03030303UL);\
1077             h1= ((a&0xFCFCFCFCUL)>>2)\
1078               + ((b&0xFCFCFCFCUL)>>2);\
1079             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1080             pixels+=line_size;\
1081             block +=line_size;\
1082             a= AV_RN32(pixels  );\
1083             b= AV_RN32(pixels+1);\
1084             l0=  (a&0x03030303UL)\
1085                + (b&0x03030303UL)\
1086                + 0x02020202UL;\
1087             h0= ((a&0xFCFCFCFCUL)>>2)\
1088               + ((b&0xFCFCFCFCUL)>>2);\
1089             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1090             pixels+=line_size;\
1091             block +=line_size;\
1092         }\
1093 }\
1094 \
1095 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1096 {\
1097     int j;\
1098     for(j=0; j<2; j++){\
1099         int i;\
1100         const uint32_t a= AV_RN32(pixels  );\
1101         const uint32_t b= AV_RN32(pixels+1);\
1102         uint32_t l0=  (a&0x03030303UL)\
1103                     + (b&0x03030303UL)\
1104                     + 0x02020202UL;\
1105         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1106                    + ((b&0xFCFCFCFCUL)>>2);\
1107         uint32_t l1,h1;\
1108 \
1109         pixels+=line_size;\
1110         for(i=0; i<h; i+=2){\
1111             uint32_t a= AV_RN32(pixels  );\
1112             uint32_t b= AV_RN32(pixels+1);\
1113             l1=  (a&0x03030303UL)\
1114                + (b&0x03030303UL);\
1115             h1= ((a&0xFCFCFCFCUL)>>2)\
1116               + ((b&0xFCFCFCFCUL)>>2);\
1117             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1118             pixels+=line_size;\
1119             block +=line_size;\
1120             a= AV_RN32(pixels  );\
1121             b= AV_RN32(pixels+1);\
1122             l0=  (a&0x03030303UL)\
1123                + (b&0x03030303UL)\
1124                + 0x02020202UL;\
1125             h0= ((a&0xFCFCFCFCUL)>>2)\
1126               + ((b&0xFCFCFCFCUL)>>2);\
1127             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1128             pixels+=line_size;\
1129             block +=line_size;\
1130         }\
1131         pixels+=4-line_size*(h+1);\
1132         block +=4-line_size*h;\
1133     }\
1134 }\
1135 \
1136 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1137 {\
1138     int j;\
1139     for(j=0; j<2; j++){\
1140         int i;\
1141         const uint32_t a= AV_RN32(pixels  );\
1142         const uint32_t b= AV_RN32(pixels+1);\
1143         uint32_t l0=  (a&0x03030303UL)\
1144                     + (b&0x03030303UL)\
1145                     + 0x01010101UL;\
1146         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1147                    + ((b&0xFCFCFCFCUL)>>2);\
1148         uint32_t l1,h1;\
1149 \
1150         pixels+=line_size;\
1151         for(i=0; i<h; i+=2){\
1152             uint32_t a= AV_RN32(pixels  );\
1153             uint32_t b= AV_RN32(pixels+1);\
1154             l1=  (a&0x03030303UL)\
1155                + (b&0x03030303UL);\
1156             h1= ((a&0xFCFCFCFCUL)>>2)\
1157               + ((b&0xFCFCFCFCUL)>>2);\
1158             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1159             pixels+=line_size;\
1160             block +=line_size;\
1161             a= AV_RN32(pixels  );\
1162             b= AV_RN32(pixels+1);\
1163             l0=  (a&0x03030303UL)\
1164                + (b&0x03030303UL)\
1165                + 0x01010101UL;\
1166             h0= ((a&0xFCFCFCFCUL)>>2)\
1167               + ((b&0xFCFCFCFCUL)>>2);\
1168             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1169             pixels+=line_size;\
1170             block +=line_size;\
1171         }\
1172         pixels+=4-line_size*(h+1);\
1173         block +=4-line_size*h;\
1174     }\
1175 }\
1176 \
1177 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1178 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1179 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1180 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1181 av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1182 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1183 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1184 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1185
1186 #define op_avg(a, b) a = rnd_avg32(a, b)
1187 #endif
/* Store operation for the "put" family: overwrite the destination with the
 * computed value. */
#define op_put(a, b) a = b

/* Expand PIXOP2 once for the avg family and once for the put family, each
 * with its own store operation, then drop the helper macros (they are
 * defined again with different bodies further down in this file). */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* The full-pel put "no_rnd" names are plain aliases of the regular put
 * functions. */
#define put_no_rnd_pixels8_c  put_pixels8_c
#define put_no_rnd_pixels16_c put_pixels16_c
1197
/* Rounded integer averages of two and of four values (halves round up).
 * Every argument is parenthesized so that an operand containing an operator
 * that binds less tightly than '+' (|, &, ^, shifts, ?:) is averaged as
 * written instead of being split apart by the expansion. Each argument is
 * still evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1200
/**
 * Single-stride front end for put_no_rnd_pixels16_l2().
 *
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride line stride, applied to dst, a and b alike
 * @param h      number of rows
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    /* destination and both sources advance by the same stride */
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride,
                           h);
}
1204
/**
 * Single-stride front end for put_no_rnd_pixels8_l2().
 *
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride line stride, applied to dst, a and b alike
 * @param h      number of rows
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    /* destination and both sources advance by the same stride */
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride,
                          h);
}
1208
/**
 * Bilinear interpolation of an 8-pixel-wide block.
 *
 * Each output pixel is a weighted sum of the 2x2 source neighbourhood whose
 * top-left corner is the pixel at the same position. The four weights are
 * products of x16 / (16 - x16) and y16 / (16 - y16); the sum has `rounder`
 * added and is then shifted right by 8.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source; columns 0..8 of rows 0..h are read
 * @param stride  line stride shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbour
 * @param y16     vertical weight of the neighbour one row below
 * @param rounder constant added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int inv_x = 16 - x16;
    const int inv_y = 16 - y16;
    const int w_tl  = inv_x * inv_y;   /* top-left     */
    const int w_tr  = x16   * inv_y;   /* top-right    */
    const int w_bl  = inv_x * y16;     /* bottom-left  */
    const int w_br  = x16   * y16;     /* bottom-right */
    int row;

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]   + w_tr * src[col + 1] +
                        w_bl * below[col] + w_br * below[col + 1] + rounder) >> 8;
        }
    }
}
1231
/**
 * Affine motion compensation of an 8-pixel-wide block.
 *
 * For every output pixel a source position (vx, vy) is tracked. After
 * dropping its low 16 bits, the next `shift` bits are the fractional part
 * and the remaining upper bits the integer sample coordinate.
 *
 * A coordinate counts as inside when 0 <= coordinate < size - 1; anything
 * else is clamped to [0, size - 1]:
 *  - both inside:  bilinear blend of the 2x2 neighbourhood;
 *  - only x inside: horizontal blend on the clamped row;
 *  - only y inside: vertical blend on the clamped column;
 *  - neither:       the clamped sample is copied unchanged.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source picture
 * @param stride  line stride shared by dst and src
 * @param h       number of output rows
 * @param ox,oy   source position of the first output pixel
 * @param dxx,dyx position increment per output column
 * @param dxy,dyy position increment per output row
 * @param shift   number of fractional bits
 * @param r       constant added before the final right shift by 2*shift
 * @param width   source width used for the bounds test and clamping
 * @param height  source height used for the bounds test and clamping
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one    = 1 << shift;   /* weight corresponding to 1.0 */
    const int fmask  = one - 1;
    const int last_x = width  - 1;
    const int last_y = height - 1;
    int row;

    for (row = 0; row < h; row++, ox += dxy, oy += dyy) {
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++, vx += dxx, vy += dyx) { //XXX FIXME optimize
            const int px   = vx >> 16;
            const int py   = vy >> 16;
            const int fx   = px & fmask;
            const int fy   = py & fmask;
            const int sx   = px >> shift;
            const int sy   = py >> shift;
            /* the unsigned compare rejects negative coordinates as well */
            const int x_ok = (unsigned)sx < last_x;
            const int y_ok = (unsigned)sy < last_y;
            const int base = (x_ok ? sx : av_clip(sx, 0, last_x))
                           + (y_ok ? sy : av_clip(sy, 0, last_y)) * stride;
            uint8_t *out   = &dst[row * stride + col];

            if (x_ok && y_ok) {
                *out = (  (  src[base             ] * (one - fx)
                           + src[base          + 1] *        fx ) * (one - fy)
                        + (  src[base + stride    ] * (one - fx)
                           + src[base + stride + 1] *        fx ) *        fy
                        + r) >> (shift * 2);
            } else if (x_ok) {
                *out = (  (  src[base    ] * (one - fx)
                           + src[base + 1] *        fx ) * one
                        + r) >> (shift * 2);
            } else if (y_ok) {
                *out = (  (  src[base         ] * (one - fy)
                           + src[base + stride] *        fy ) * one
                        + r) >> (shift * 2);
            } else {
                *out = src[base];
            }
        }
    }
}
1289
/**
 * Full-pel "put" case of the tpel family: forwards to the put_pixelsN_c
 * routine matching the block width. Widths other than 2, 4, 8 and 16 are
 * ignored and nothing is written.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1298
/**
 * Horizontal blend with weights 2:1 (current pixel : right neighbour).
 * The weighted sum plus 1 is scaled by 683/2048, which is roughly 1/3.
 * Reads one column beyond `width` on every row.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col] + src[col + 1] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1309
/**
 * Horizontal blend with weights 1:2 (current pixel : right neighbour).
 * The weighted sum plus 1 is scaled by 683/2048, which is roughly 1/3.
 * Reads one column beyond `width` on every row.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2 * src[col + 1] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1320
/**
 * Vertical blend with weights 2:1 (current pixel : pixel one row below).
 * The weighted sum plus 1 is scaled by 683/2048, which is roughly 1/3.
 * Reads one row beyond `height`.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col] + src[col + stride] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1331
/**
 * 2x2 blend with weights 4 (current), 3 (right), 3 (below), 2 (below-right).
 * The weighted sum plus 6 is scaled by 2731/32768, which is roughly 1/12.
 * Reads one column beyond `width` and one row beyond `height`.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4 * src[col]          + 3 * src[col + 1]
                          + 3 * src[col + stride] + 2 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1342
/**
 * 2x2 blend with weights 3 (current), 2 (right), 4 (below), 3 (below-right).
 * The weighted sum plus 6 is scaled by 2731/32768, which is roughly 1/12.
 * Reads one column beyond `width` and one row beyond `height`.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]          + 2 * src[col + 1]
                          + 4 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1353
/**
 * Vertical blend with weights 1:2 (current pixel : pixel one row below).
 * The weighted sum plus 1 is scaled by 683/2048, which is roughly 1/3.
 * Reads one row beyond `height`.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2 * src[col + stride] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1364
/**
 * 2x2 blend with weights 3 (current), 4 (right), 2 (below), 3 (below-right).
 * The weighted sum plus 6 is scaled by 2731/32768, which is roughly 1/12.
 * Reads one column beyond `width` and one row beyond `height`.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]          + 4 * src[col + 1]
                          + 2 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1375
/**
 * 2x2 blend with weights 2 (current), 3 (right), 3 (below), 4 (below-right).
 * The weighted sum plus 6 is scaled by 2731/32768, which is roughly 1/12.
 * Reads one column beyond `width` and one row beyond `height`.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col]          + 3 * src[col + 1]
                          + 3 * src[col + stride] + 4 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1386
/**
 * Full-pel "avg" case of the tpel family: forwards to the avg_pixelsN_c
 * routine matching the block width. Widths other than 2, 4, 8 and 16 are
 * ignored and nothing is written.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1395
/**
 * Averaging variant of the 2:1 horizontal blend (current : right neighbour).
 * The blended value (sum plus 1, scaled by 683/2048) is combined with the
 * existing destination pixel as (dst + value + 1) >> 1.
 * Reads one column beyond `width` on every row.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1406
/**
 * Averaging variant of the 1:2 horizontal blend (current : right neighbour).
 * The blended value (sum plus 1, scaled by 683/2048) is combined with the
 * existing destination pixel as (dst + value + 1) >> 1.
 * Reads one column beyond `width` on every row.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1417
/**
 * Averaging variant of the 2:1 vertical blend (current : one row below).
 * The blended value (sum plus 1, scaled by 683/2048) is combined with the
 * existing destination pixel as (dst + value + 1) >> 1.
 * Reads one row beyond `height`.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1428
/**
 * Averaging variant of the 2x2 blend with weights 4 (current), 3 (right),
 * 3 (below), 2 (below-right). The blended value (sum plus 6, scaled by
 * 2731/32768) is combined with the existing destination pixel as
 * (dst + value + 1) >> 1.
 * Reads one column beyond `width` and one row beyond `height`.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4 * src[col]          + 3 * src[col + 1]
                          + 3 * src[col + stride] + 2 * src[col + stride + 1] + 6;
            const int interp = (2731 * sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1439
/**
 * Averaging variant of the 2x2 blend with weights 3 (current), 2 (right),
 * 4 (below), 3 (below-right). The blended value (sum plus 6, scaled by
 * 2731/32768) is combined with the existing destination pixel as
 * (dst + value + 1) >> 1.
 * Reads one column beyond `width` and one row beyond `height`.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]          + 2 * src[col + 1]
                          + 4 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            const int interp = (2731 * sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1450
/**
 * Averaging variant of the 1:2 vertical blend (current : one row below).
 * The blended value (sum plus 1, scaled by 683/2048) is combined with the
 * existing destination pixel as (dst + value + 1) >> 1.
 * Reads one row beyond `height`.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1461
/**
 * Averaging variant of the 2x2 blend with weights 3 (current), 4 (right),
 * 2 (below), 3 (below-right). The blended value (sum plus 6, scaled by
 * 2731/32768) is combined with the existing destination pixel as
 * (dst + value + 1) >> 1.
 * Reads one column beyond `width` and one row beyond `height`.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]          + 4 * src[col + 1]
                          + 2 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            const int interp = (2731 * sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1472
/**
 * Averaging variant of the 2x2 blend with weights 2 (current), 3 (right),
 * 3 (below), 4 (below-right). The blended value (sum plus 6, scaled by
 * 2731/32768) is combined with the existing destination pixel as
 * (dst + value + 1) >> 1.
 * Reads one column beyond `width` and one row beyond `height`.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col]          + 3 * src[col + 1]
                          + 3 * src[col + stride] + 4 * src[col + stride + 1] + 6;
            const int interp = (2731 * sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/* Disabled template: TPEL_WIDTH(width) would emit one fixed-width wrapper per
 * put_tpel_pixels_mcXY_c routine, each forwarding to the generic routine with
 * `width` filled in. The wrapper bodies are plain calls; a leading `void` in
 * front of a call would turn it into an invalid declaration. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1504
/**
 * Template for the C chroma motion-compensation routines of width 2, 4 and 8.
 *
 * Each generated function blends the 2x2 source neighbourhood of every pixel
 * with the weights
 *     A = (8-x)*(8-y)   current pixel
 *     B =    x *(8-y)   right neighbour
 *     C = (8-x)*   y    pixel one row below
 *     D =    x *   y    pixel below-right
 * which always add up to 64, and hands the unscaled sum to OP together with
 * the destination pixel. x and y must lie in 0..7 (asserted).
 *
 * Three cases are distinguished so that only contributing samples are read:
 *  - D != 0          : both offsets non-zero, full 2x2 blend;
 *  - D == 0, B+C != 0: one offset is zero, two-tap blend along the other
 *                      direction (`step` selects right or below neighbour);
 *  - B+C == 0        : x == 0 and y == 0, only the current pixel is used, so
 *                      no byte to the right of or below the block is touched.
 *
 * @param OPNAME prefix of the generated function names
 * @param OP     store operation taking (destination lvalue, unscaled sum)
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else if(B + C){\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* x == 0 && y == 0: no neighbour contributes, so none is read */\
        for(i=0; i<h; i++){\
            OP(dst[0], A*src[0]);\
            OP(dst[1], A*src[1]);\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else if(B + C){\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* x == 0 && y == 0: no neighbour contributes, so none is read */\
        for(i=0; i<h; i++){\
            OP(dst[0], A*src[0]);\
            OP(dst[1], A*src[1]);\
            OP(dst[2], A*src[2]);\
            OP(dst[3], A*src[3]);\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else if(B + C){\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* x == 0 && y == 0: no neighbour contributes, so none is read */\
        for(i=0; i<h; i++){\
            OP(dst[0], A*src[0]);\
            OP(dst[1], A*src[1]);\
            OP(dst[2], A*src[2]);\
            OP(dst[3], A*src[3]);\
            OP(dst[4], A*src[4]);\
            OP(dst[5], A*src[5]);\
            OP(dst[6], A*src[6]);\
            OP(dst[7], A*src[7]);\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

/* Store operations for the chroma template: b is the weighted sum scaled by
 * 64, so it is rounded and shifted right by 6 first. "put" overwrites the
 * destination; "avg" then takes the rounded mean with the existing pixel. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1613
/**
 * Generate the C quarter-pel motion compensation functions (mpeg4_qpel
 * family) for one store operation.
 *
 * @param r      not referenced anywhere in the macro body
 * @param OPNAME prefix pasted onto every generated function name and onto the
 *               pixels8 / pixels16 _l2 and _l4 helpers that do the final
 *               store into dst
 * @param RND    token pasted after "put" for every intermediate pass, so the
 *               temporary buffers are always written by a put variant
 * @param OP     statement macro OP(dst, sum) expanded once per output sample
 *               with the unnormalised filter sum; it is expanded inside the
 *               lowpass functions, where the local pointer cm
 *               (ff_cropTbl + MAX_NEG_CROP) is in scope
 *
 * Generated functions:
 *  - OPNAME mpeg4_qpel{8,16}_h_lowpass(dst, src, dstStride, srcStride, h):
 *    filters h rows of 8 / 16 outputs; each row reads src[0..8] / src[0..16].
 *  - OPNAME mpeg4_qpel{8,16}_v_lowpass(dst, src, dstStride, srcStride):
 *    filters 8 / 16 columns; each column reads 9 / 17 rows of src and
 *    writes 8 / 16 rows of dst.
 *  - static OPNAME qpel{8,16}_mcXY_c(dst, src, stride) for XY in
 *    10 20 30 01 02 03 11 21 31 12 22 32 13 23 33.
 *  - non-static ff_ OPNAME qpel{8,16}_mc{11,31,13,33,12,32}_old_c, which keep
 *    a separate halfV buffer and combine the planes with pixels_l4 (diagonal
 *    cases) or pixels_l2 (mc12 / mc32).
 *
 * Filter: the interior taps are (-1, 3, -6, 20, 20, -6, 3, -1), which sum to
 * 32. In the first and last few outputs of each row / column the taps that
 * would fall outside the 9 / 17 input samples are applied to in-range samples
 * instead, which is why indices repeat in those OP() lines.
 *
 * Temporaries: full[] is filled by copy_block9 / copy_block17 (defined
 * elsewhere) with a destination stride of 16 / 24; halfH[] receives the
 * h_lowpass output for 9 / 17 rows at stride 8 / 16, so halfH+8 / halfH+16
 * addresses its second row and full+16 / full+24 the second row of full[].
 *
 * NOTE(review): judging from which planes are combined (mc20 is h_lowpass
 * only, mc02 is v_lowpass only), X looks like the horizontal and Y like the
 * vertical quarter-sample offset - confirm against the function tables that
 * reference these names.
 */
1614 #define QPEL_MC(r, OPNAME, RND, OP) \
1615 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1616     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1617     int i;\
1618     for(i=0; i<h; i++)\
1619     {\
1620         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1621         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1622         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1623         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1624         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1625         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1626         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1627         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1628         dst+=dstStride;\
1629         src+=srcStride;\
1630     }\
1631 }\
1632 \
1633 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1634     const int w=8;\
1635     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1636     int i;\
1637     for(i=0; i<w; i++)\
1638     {\
1639         const int src0= src[0*srcStride];\
1640         const int src1= src[1*srcStride];\
1641         const int src2= src[2*srcStride];\
1642         const int src3= src[3*srcStride];\
1643         const int src4= src[4*srcStride];\
1644         const int src5= src[5*srcStride];\
1645         const int src6= src[6*srcStride];\
1646         const int src7= src[7*srcStride];\
1647         const int src8= src[8*srcStride];\
1648         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1649         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1650         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1651         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1652         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1653         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1654         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1655         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1656         dst++;\
1657         src++;\
1658     }\
1659 }\
1660 \
1661 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1662     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1663     int i;\
1664     \
1665     for(i=0; i<h; i++)\
1666     {\
1667         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1668         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1669         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1670         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1671         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1672         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1673         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1674         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1675         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1676         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1677         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1678         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1679         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1680         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1681         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1682         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1683         dst+=dstStride;\
1684         src+=srcStride;\
1685     }\
1686 }\
1687 \
1688 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1689     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1690     int i;\
1691     const int w=16;\
1692     for(i=0; i<w; i++)\
1693     {\
1694         const int src0= src[0*srcStride];\
1695         const int src1= src[1*srcStride];\
1696         const int src2= src[2*srcStride];\
1697         const int src3= src[3*srcStride];\
1698         const int src4= src[4*srcStride];\
1699         const int src5= src[5*srcStride];\
1700         const int src6= src[6*srcStride];\
1701         const int src7= src[7*srcStride];\
1702         const int src8= src[8*srcStride];\
1703         const int src9= src[9*srcStride];\
1704         const int src10= src[10*srcStride];\
1705         const int src11= src[11*srcStride];\
1706         const int src12= src[12*srcStride];\
1707         const int src13= src[13*srcStride];\
1708         const int src14= src[14*srcStride];\
1709         const int src15= src[15*srcStride];\
1710         const int src16= src[16*srcStride];\
1711         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1712         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1713         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1714         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1715         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1716         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1717         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1718         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1719         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1720         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1721         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1722         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1723         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1724         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1725         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1726         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1727         dst++;\
1728         src++;\
1729     }\
1730 }\
1731 \
1732 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1733     uint8_t half[64];\
1734     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1735     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1736 }\
1737 \
1738 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1739     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1740 }\
1741 \
1742 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1743     uint8_t half[64];\
1744     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1745     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1746 }\
1747 \
1748 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1749     uint8_t full[16*9];\
1750     uint8_t half[64];\
1751     copy_block9(full, src, 16, stride, 9);\
1752     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1753     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1754 }\
1755 \
1756 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1757     uint8_t full[16*9];\
1758     copy_block9(full, src, 16, stride, 9);\
1759     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1760 }\
1761 \
1762 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1763     uint8_t full[16*9];\
1764     uint8_t half[64];\
1765     copy_block9(full, src, 16, stride, 9);\
1766     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1767     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1768 }\
1769 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1770     uint8_t full[16*9];\
1771     uint8_t halfH[72];\
1772     uint8_t halfV[64];\
1773     uint8_t halfHV[64];\
1774     copy_block9(full, src, 16, stride, 9);\
1775     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1776     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1777     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1778     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1779 }\
1780 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1781     uint8_t full[16*9];\
1782     uint8_t halfH[72];\
1783     uint8_t halfHV[64];\
1784     copy_block9(full, src, 16, stride, 9);\
1785     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1786     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1787     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1788     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1789 }\
1790 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1791     uint8_t full[16*9];\
1792     uint8_t halfH[72];\
1793     uint8_t halfV[64];\
1794     uint8_t halfHV[64];\
1795     copy_block9(full, src, 16, stride, 9);\
1796     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1797     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1798     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1799     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1800 }\
1801 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1802     uint8_t full[16*9];\
1803     uint8_t halfH[72];\
1804     uint8_t halfHV[64];\
1805     copy_block9(full, src, 16, stride, 9);\
1806     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1807     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1808     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1809     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1810 }\
1811 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1812     uint8_t full[16*9];\
1813     uint8_t halfH[72];\
1814     uint8_t halfV[64];\
1815     uint8_t halfHV[64];\
1816     copy_block9(full, src, 16, stride, 9);\
1817     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1818     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1819     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1820     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1821 }\
1822 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1823     uint8_t full[16*9];\
1824     uint8_t halfH[72];\
1825     uint8_t halfHV[64];\
1826     copy_block9(full, src, 16, stride, 9);\
1827     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1828     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1829     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1830     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1831 }\
1832 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1833     uint8_t full[16*9];\
1834     uint8_t halfH[72];\
1835     uint8_t halfV[64];\
1836     uint8_t halfHV[64];\
1837     copy_block9(full, src, 16, stride, 9);\
1838     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1839     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1840     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1841     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1842 }\
1843 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1844     uint8_t full[16*9];\
1845     uint8_t halfH[72];\
1846     uint8_t halfHV[64];\
1847     copy_block9(full, src, 16, stride, 9);\
1848     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1849     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1850     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1851     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1852 }\
1853 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1854     uint8_t halfH[72];\
1855     uint8_t halfHV[64];\
1856     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1857     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1858     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1859 }\
1860 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1861     uint8_t halfH[72];\
1862     uint8_t halfHV[64];\
1863     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1864     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1865     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1866 }\
1867 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1868     uint8_t full[16*9];\
1869     uint8_t halfH[72];\
1870     uint8_t halfV[64];\
1871     uint8_t halfHV[64];\
1872     copy_block9(full, src, 16, stride, 9);\
1873     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1874     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1875     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1876     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1877 }\
1878 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1879     uint8_t full[16*9];\
1880     uint8_t halfH[72];\
1881     copy_block9(full, src, 16, stride, 9);\
1882     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1883     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1884     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1885 }\
1886 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1887     uint8_t full[16*9];\
1888     uint8_t halfH[72];\
1889     uint8_t halfV[64];\
1890     uint8_t halfHV[64];\
1891     copy_block9(full, src, 16, stride, 9);\
1892     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1893     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1894     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1895     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1896 }\
1897 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1898     uint8_t full[16*9];\
1899     uint8_t halfH[72];\
1900     copy_block9(full, src, 16, stride, 9);\
1901     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1902     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1903     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1904 }\
1905 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1906     uint8_t halfH[72];\
1907     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1908     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1909 }\
1910 \
1911 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1912     uint8_t half[256];\
1913     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1914     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1915 }\
1916 \
1917 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1918     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1919 }\
1920 \
1921 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1922     uint8_t half[256];\
1923     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1924     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1925 }\
1926 \
1927 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1928     uint8_t full[24*17];\
1929     uint8_t half[256];\
1930     copy_block17(full, src, 24, stride, 17);\
1931     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1932     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1933 }\
1934 \
1935 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1936     uint8_t full[24*17];\
1937     copy_block17(full, src, 24, stride, 17);\
1938     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1939 }\
1940 \
1941 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1942     uint8_t full[24*17];\
1943     uint8_t half[256];\
1944     copy_block17(full, src, 24, stride, 17);\
1945     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1946     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1947 }\
1948 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1949     uint8_t full[24*17];\
1950     uint8_t halfH[272];\
1951     uint8_t halfV[256];\
1952     uint8_t halfHV[256];\
1953     copy_block17(full, src, 24, stride, 17);\
1954     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1955     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1956     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1957     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1958 }\
1959 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1960     uint8_t full[24*17];\
1961     uint8_t halfH[272];\
1962     uint8_t halfHV[256];\
1963     copy_block17(full, src, 24, stride, 17);\
1964     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1965     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1966     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1967     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1968 }\
1969 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1970     uint8_t full[24*17];\
1971     uint8_t halfH[272];\
1972     uint8_t halfV[256];\
1973     uint8_t halfHV[256];\
1974     copy_block17(full, src, 24, stride, 17);\
1975     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1976     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1977     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1978     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1979 }\
1980 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1981     uint8_t full[24*17];\
1982     uint8_t halfH[272];\
1983     uint8_t halfHV[256];\
1984     copy_block17(full, src, 24, stride, 17);\
1985     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1986     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1987     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1988     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1989 }\
1990 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1991     uint8_t full[24*17];\
1992     uint8_t halfH[272];\
1993     uint8_t halfV[256];\
1994     uint8_t halfHV[256];\
1995     copy_block17(full, src, 24, stride, 17);\
1996     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1997     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1998     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1999     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2000 }\
2001 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2002     uint8_t full[24*17];\
2003     uint8_t halfH[272];\
2004     uint8_t halfHV[256];\
2005     copy_block17(full, src, 24, stride, 17);\
2006     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2007     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2008     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2009     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2010 }\
2011 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2012     uint8_t full[24*17];\
2013     uint8_t halfH[272];\
2014     uint8_t halfV[256];\
2015     uint8_t halfHV[256];\
2016     copy_block17(full, src, 24, stride, 17);\
2017     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2018     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2019     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2020     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2021 }\
2022 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2023     uint8_t full[24*17];\
2024     uint8_t halfH[272];\
2025     uint8_t halfHV[256];\
2026     copy_block17(full, src, 24, stride, 17);\
2027     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2028     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2029     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2030     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2031 }\
2032 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2033     uint8_t halfH[272];\
2034     uint8_t halfHV[256];\
2035     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2036     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2037     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2038 }\
2039 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2040     uint8_t halfH[272];\
2041     uint8_t halfHV[256];\
2042     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2043     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2044     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2045 }\
2046 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2047     uint8_t full[24*17];\
2048     uint8_t halfH[272];\
2049     uint8_t halfV[256];\
2050     uint8_t halfHV[256];\
2051     copy_block17(full, src, 24, stride, 17);\
2052     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2053     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2054     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2055     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2056 }\
2057 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2058     uint8_t full[24*17];\
2059     uint8_t halfH[272];\
2060     copy_block17(full, src, 24, stride, 17);\
2061     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2062     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2063     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2064 }\
2065 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2066     uint8_t full[24*17];\
2067     uint8_t halfH[272];\
2068     uint8_t halfV[256];\
2069     uint8_t halfHV[256];\
2070     copy_block17(full, src, 24, stride, 17);\
2071     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2072     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2073     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2074     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2075 }\
2076 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2077     uint8_t full[24*17];\
2078     uint8_t halfH[272];\
2079     copy_block17(full, src, 24, stride, 17);\
2080     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2081     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2082     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2083 }\
2084 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2085     uint8_t halfH[272];\
2086     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2087     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2088 }
2089
/*
 * Store operations plugged into QPEL_MC as its OP argument.
 *   a - destination sample (assigned to, so it must be an lvalue)
 *   b - unnormalised filter sum; the QPEL_MC taps add up to 32, hence >>5
 * "+ 16" rounds the division by 32 to nearest, while the no_rnd forms add 15
 * instead. The shifted value is looked up in cm, which must be a local
 * variable at the expansion site (QPEL_MC sets it to
 * ff_cropTbl + MAX_NEG_CROP; presumably a clamp-to-uint8 table - its contents
 * are initialised elsewhere).
 * The avg forms additionally average the result with the value already in a:
 * op_avg adds 1 before halving, op_avg_no_rnd does not.
 */
2090 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2091 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2092 #define op_put(a, b) a = cm[((b) + 16)>>5]
2093 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2094
/*
 * Instantiate the qpel function families: put_, put_no_rnd_ and avg_.
 * For avg_ the RND argument is "_", so its intermediate passes use the
 * rounding put_ helpers. The avg_no_rnd instantiation below is commented
 * out, which leaves op_avg_no_rnd unused here.
 */
2095 QPEL_MC(0, put_       , _       , op_put)
2096 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2097 QPEL_MC(0, avg_       , _       , op_avg)
2098 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The op_* names are only meaningful for the expansions above; drop them. */
2099 #undef op_avg
2100 #undef op_avg_no_rnd
2101 #undef op_put
2102 #undef op_put_no_rnd
2103
/*
 * QPEL_MC generates no mc00 functions, so the mc00 names are aliased to the
 * plain 8x8 / 16x16 pixel block functions (defined elsewhere in the project).
 * The put_no_rnd_ aliases map to the same functions as the put_ ones;
 * presumably because no filtering, and hence no rounding, is involved at the
 * whole-sample position - NOTE(review): confirm.
 */
2104 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
2105 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
2106 #define put_qpel16_mc00_c ff_put_pixels16x16_c
2107 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2108 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
2109 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
2110
2111 #if 1
2112 #define H264_LOWPASS(OPNAME, OP, OP2) \
2113 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2114     const int h=2;\
2115     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2116     int i;\
2117     for(i=0; i<h; i++)\
2118     {\
2119         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2120         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2121         dst+=dstStride;\
2122         src+=srcStride;\
2123     }\
2124 }\
2125 \
2126 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2127     const int w=2;\
2128     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2129     int i;\
2130     for(i=0; i<w; i++)\
2131     {\
2132         const int srcB= src[-2*srcStride];\
2133         const int srcA= src[-1*srcStride];\
2134         const int src0= src[0 *srcStride];\
2135         const int src1= src[1 *srcStride];\
2136         const int src2= src[2 *srcStride];\
2137         const int src3= src[3 *srcStride];\
2138         const int src4= src[4 *srcStride];\
2139         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2140         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2141         dst++;\
2142         src++;\
2143     }\
2144 }\
2145 \
2146 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2147     const int h=2;\
2148     const int w=2;\
2149     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2150     int i;\
2151     src -= 2*srcStride;\
2152     for(i=0; i<h+5; i++)\
2153     {\
2154         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2155         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2156         tmp+=tmpStride;\
2157         src+=srcStride;\
2158     }\
2159     tmp -= tmpStride*(h+5-2);\
2160     for(i=0; i<w; i++)\
2161     {\
2162         const int tmpB= tmp[-2*tmpStride];\
2163         const int tmpA= tmp[-1*tmpStride];\
2164         const int tmp0= tmp[0 *tmpStride];\
2165         const int tmp1= tmp[1 *tmpStride];\
2166         const int tmp2= tmp[2 *tmpStride];\
2167         const int tmp3= tmp[3 *tmpStride];\
2168         const int tmp4= tmp[4 *tmpStride];\
2169         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2170         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2171         dst++;\
2172         tmp++;\
2173     }\
2174 }\
2175 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2176     const int h=4;\
2177     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2178     int i;\
2179     for(i=0; i<h; i++)\
2180     {\
2181         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2182         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2183         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2184         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2185         dst+=dstStride;\
2186         src+=srcStride;\
2187     }\
2188 }\
2189 \
2190 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2191     const int w=4;\
2192     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2193     int i;\
2194     for(i=0; i<w; i++)\
2195     {\
2196         const int srcB= src[-2*srcStride];\
2197         const int srcA= src[-1*srcStride];\
2198         const int src0= src[0 *srcStride];\
2199         const int src1= src[1 *srcStride];\
2200         const int src2= src[2 *srcStride];\
2201         const int src3= src[3 *srcStride];\
2202         const int src4= src[4 *srcStride];\
2203         const int src5= src[5 *srcStride];\
2204         const int src6= src[6 *srcStride];\
2205         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2206         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2207         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2208         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2209         dst++;\
2210         src++;\
2211     }\
2212 }\
2213 \
2214 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2215     const int h=4;\
2216     const int w=4;\
2217     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2218     int i;\
2219     src -= 2*srcStride;\
2220     for(i=0; i<h+5; i++)\
2221     {\
2222         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2223         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2224         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2225         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2226         tmp+=tmpStride;\
2227         src+=srcStride;\
2228     }\
2229     tmp -= tmpStride*(h+5-2);\
2230     for(i=0; i<w; i++)\
2231     {\
2232         const int tmpB= tmp[-2*tmpStride];\
2233         const int tmpA= tmp[-1*tmpStride];\
2234         const int tmp0= tmp[0 *tmpStride];\
2235         const int tmp1= tmp[1 *tmpStride];\
2236         const int tmp2= tmp[2 *tmpStride];\
2237         const int tmp3= tmp[3 *tmpStride];\
2238         const int tmp4= tmp[4 *tmpStride];\
2239         const int tmp5= tmp[5 *tmpStride];\
2240         const int tmp6= tmp[6 *tmpStride];\
2241         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2242         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2243         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2244         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2245         dst++;\
2246         tmp++;\
2247     }\
2248 }\
2249 \
2250 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2251     const int h=8;\
2252     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2253     int i;\
2254     for(i=0; i<h; i++)\
2255     {\
2256         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2257         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2258         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2259         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2260         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2261         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2262         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2263         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2264         dst+=dstStride;\
2265         src+=srcStride;\
2266     }\
2267 }\
2268 \
2269 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2270     const int w=8;\
2271     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2272     int i;\
2273     for(i=0; i<w; i++)\
2274     {\
2275         const int srcB= src[-2*srcStride];\
2276         const int srcA= src[-1*srcStride];\
2277         const int src0= src[0 *srcStride];\
2278         const int src1= src[1 *srcStride];\
2279         const int src2= src[2 *srcStride];\
2280         const int src3= src[3 *srcStride];\
2281         const int src4= src[4 *srcStride];\
2282         const int src5= src[5 *srcStride];\
2283         const int src6= src[6 *srcStride];\
2284         const int src7= src[7 *srcStride];\
2285         const int src8= src[8 *srcStride];\
2286         const int src9= src[9 *srcStride];\
2287         const int src10=src[10*srcStride];\
2288         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2289         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2290         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2291         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2292         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2293         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2294         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2295         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2296         dst++;\
2297         src++;\
2298     }\
2299 }\
2300 \
2301 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2302     const int h=8;\
2303     const int w=8;\
2304     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2305     int i;\
2306     src -= 2*srcStride;\
2307     for(i=0; i<h+5; i++)\
2308     {\
2309         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2310         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2311         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2312         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2313         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2314         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2315         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2316         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2317         tmp+=tmpStride;\
2318         src+=srcStride;\
2319     }\
2320     tmp -= tmpStride*(h+5-2);\
2321     for(i=0; i<w; i++)\
2322     {\
2323         const int tmpB= tmp[-2*tmpStride];\
2324         const int tmpA= tmp[-1*tmpStride];\
2325         const int tmp0= tmp[0 *tmpStride];\
2326         const int tmp1= tmp[1 *tmpStride];\
2327         const int tmp2= tmp[2 *tmpStride];\
2328         const int tmp3= tmp[3 *tmpStride];\
2329         const int tmp4= tmp[4 *tmpStride];\
2330         const int tmp5= tmp[5 *tmpStride];\
2331         const int tmp6= tmp[6 *tmpStride];\
2332         const int tmp7= tmp[7 *tmpStride];\
2333         const int tmp8= tmp[8 *tmpStride];\
2334         const int tmp9= tmp[9 *tmpStride];\
2335         const int tmp10=tmp[10*tmpStride];\
2336         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2337         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2338         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2339         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2340         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2341         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2342         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2343         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2344         dst++;\
2345         tmp++;\
2346     }\
2347 }\
2348 \
2349 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2350     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2351     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2352     src += 8*srcStride;\
2353     dst += 8*dstStride;\
2354     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2355     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2356 }\
2357 \
2358 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2359     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2360     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2361     src += 8*srcStride;\
2362     dst += 8*dstStride;\
2363     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2364     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2365 }\
2366 \
2367 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2368     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2369     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2370     src += 8*srcStride;\
2371     dst += 8*dstStride;\
2372     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2373     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2374 }\
2375
/**
 * Generate the H.264 quarter-pel motion compensation entry points
 * OPNAME##h264_qpel##SIZE##_mcXY_c for all sixteen XY combinations
 * (X selects the horizontal kernel usage, Y the vertical one, as can be
 * seen from mc10 using only the h kernel and mc01 only the v kernel).
 *
 * What the generated bodies do, as written in this macro:
 *  - mc00 forwards to OPNAME##pixels##SIZE##_c with SIZE as last argument.
 *  - mc20 / mc02 / mc22 store the h / v / hv lowpass result straight into
 *    dst using the OPNAME variant of the kernel.
 *  - every other position first computes one or two SIZE x SIZE
 *    intermediate blocks with the put_ kernels and then passes them (or
 *    src / the copied source rows) to OPNAME##pixels##SIZE##_l2, which is
 *    defined elsewhere in this file.
 *  - vertical filtering never reads src directly: copy_block##SIZE copies
 *    SIZE+5 rows, starting two rows above src, into the packed buffer
 *    "full" (row pitch SIZE); full_mid points at the row matching src.
 *  - "tmp" is the int16_t scratch area handed to the hv kernel.
 */
2376 #define H264_MC(OPNAME, SIZE) \
2377 static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2378     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2379 }\
2380 \
2381 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2382     uint8_t half[SIZE*SIZE];\
2383     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2384     OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2385 }\
2386 \
2387 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2388     OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2389 }\
2390 \
2391 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2392     uint8_t half[SIZE*SIZE];\
2393     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2394     OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2395 }\
2396 \
2397 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2398     uint8_t full[SIZE*(SIZE+5)];\
2399     uint8_t * const full_mid= full + SIZE*2;\
2400     uint8_t half[SIZE*SIZE];\
2401     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2402     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2403     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2404 }\
2405 \
2406 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2407     uint8_t full[SIZE*(SIZE+5)];\
2408     uint8_t * const full_mid= full + SIZE*2;\
2409     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2410     OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2411 }\
2412 \
2413 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2414     uint8_t full[SIZE*(SIZE+5)];\
2415     uint8_t * const full_mid= full + SIZE*2;\
2416     uint8_t half[SIZE*SIZE];\
2417     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2418     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2419     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2420 }\
2421 \
2422 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2423     uint8_t full[SIZE*(SIZE+5)];\
2424     uint8_t * const full_mid= full + SIZE*2;\
2425     uint8_t halfH[SIZE*SIZE];\
2426     uint8_t halfV[SIZE*SIZE];\
2427     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2428     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2429     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2430     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2431 }\
2432 \
2433 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2434     uint8_t full[SIZE*(SIZE+5)];\
2435     uint8_t * const full_mid= full + SIZE*2;\
2436     uint8_t halfH[SIZE*SIZE];\
2437     uint8_t halfV[SIZE*SIZE];\
2438     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2439     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2440     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2441     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2442 }\
2443 \
2444 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2445     uint8_t full[SIZE*(SIZE+5)];\
2446     uint8_t * const full_mid= full + SIZE*2;\
2447     uint8_t halfH[SIZE*SIZE];\
2448     uint8_t halfV[SIZE*SIZE];\
2449     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2450     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2451     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2452     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2453 }\
2454 \
2455 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2456     uint8_t full[SIZE*(SIZE+5)];\
2457     uint8_t * const full_mid= full + SIZE*2;\
2458     uint8_t halfH[SIZE*SIZE];\
2459     uint8_t halfV[SIZE*SIZE];\
2460     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2461     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2462     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2463     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2464 }\
2465 \
2466 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2467     int16_t tmp[SIZE*(SIZE+5)];\
2468     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2469 }\
2470 \
2471 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2472     int16_t tmp[SIZE*(SIZE+5)];\
2473     uint8_t halfH[SIZE*SIZE];\
2474     uint8_t halfHV[SIZE*SIZE];\
2475     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2476     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2477     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2478 }\
2479 \
2480 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2481     int16_t tmp[SIZE*(SIZE+5)];\
2482     uint8_t halfH[SIZE*SIZE];\
2483     uint8_t halfHV[SIZE*SIZE];\
2484     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2485     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2486     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2487 }\
2488 \
2489 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2490     uint8_t full[SIZE*(SIZE+5)];\
2491     uint8_t * const full_mid= full + SIZE*2;\
2492     int16_t tmp[SIZE*(SIZE+5)];\
2493     uint8_t halfV[SIZE*SIZE];\
2494     uint8_t halfHV[SIZE*SIZE];\
2495     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2496     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2497     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2498     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2499 }\
2500 \
2501 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2502     uint8_t full[SIZE*(SIZE+5)];\
2503     uint8_t * const full_mid= full + SIZE*2;\
2504     int16_t tmp[SIZE*(SIZE+5)];\
2505     uint8_t halfV[SIZE*SIZE];\
2506     uint8_t halfHV[SIZE*SIZE];\
2507     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2508     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2509     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2510     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2511 }\
2512
/*
 * Store operators handed to H264_LOWPASS below. They expand inside the
 * lowpass kernels, where "cm" is the local pointer
 * ff_cropTbl + MAX_NEG_CROP.
 *   op_put / op_avg  : add 16, shift right by 5, then look the value up in cm
 *   op2_put / op2_avg: add 512, shift right by 10, then look the value up in cm
 * The *_avg forms additionally average the looked-up value with the value
 * already stored in the destination, rounding up ((a + x + 1) >> 1).
 */
2513 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2514 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2515 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2516 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2517 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2518
/*
 * Instantiate the lowpass kernels for the put_ and avg_ prefixes, then the
 * mcXY entry points per block size. Note that put_ is generated for sizes
 * 2, 4, 8 and 16 while avg_ is generated for 4, 8 and 16 only.
 */
2519 H264_LOWPASS(put_       , op_put, op2_put)
2520 H264_LOWPASS(avg_       , op_avg, op2_avg)
2521 H264_MC(put_, 2)
2522 H264_MC(put_, 4)
2523 H264_MC(put_, 8)
2524 H264_MC(put_, 16)
2525 H264_MC(avg_, 4)
2526 H264_MC(avg_, 8)
2527 H264_MC(avg_, 16)
2528
/* The store operators are not needed past the expansions above. */
2529 #undef op_avg
2530 #undef op_put
2531 #undef op2_avg
2532 #undef op2_put
2533 #endif
2534
/*
 * From here on, the 8x8 and 16x16 mc00 identifiers resolve to the exported
 * ff_{put,avg}_pixels{8x8,16x16}_c functions defined further down. The
 * static mc00 functions emitted by H264_MC for these sizes are declared
 * av_unused, so leaving them unreferenced is expected.
 */
2535 #define put_h264_qpel8_mc00_c  ff_put_pixels8x8_c
2536 #define avg_h264_qpel8_mc00_c  ff_avg_pixels8x8_c
2537 #define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
2538 #define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
2539
2540 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2541     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2542     int i;
2543
2544     for(i=0; i<h; i++){
2545         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2546         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2547         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2548         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2549         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2550         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2551         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2552         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2553         dst+=dstStride;
2554         src+=srcStride;
2555     }
2556 }
2557
/**
 * Fixed-size wrapper: forwards to put_pixels8_c() with 8 as the last
 * argument (presumably the row count, giving an 8x8 block).
 */
2558 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2559     put_pixels8_c(dst, src, stride, 8);
2560 }
/**
 * Fixed-size wrapper: forwards to avg_pixels8_c() with 8 as the last
 * argument (presumably the row count, giving an 8x8 block).
 */
2561 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2562     avg_pixels8_c(dst, src, stride, 8);
2563 }
/**
 * Fixed-size wrapper: forwards to put_pixels16_c() with 16 as the last
 * argument (presumably the row count, giving a 16x16 block).
 */
2564 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2565     put_pixels16_c(dst, src, stride, 16);
2566 }
/**
 * Fixed-size wrapper: forwards to avg_pixels16_c() with 16 as the last
 * argument (presumably the row count, giving a 16x16 block).
 */
2567 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2568     avg_pixels16_c(dst, src, stride, 16);
2569 }
2570
2571 #if CONFIG_RV40_DECODER
/** RV40 16x16 mc33 entry point: forwards to put_pixels16_xy2_c() with 16 as last argument. */
2572 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2573     put_pixels16_xy2_c(dst, src, stride, 16);
2574 }
/** RV40 16x16 mc33 entry point: forwards to avg_pixels16_xy2_c() with 16 as last argument. */
2575 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2576     avg_pixels16_xy2_c(dst, src, stride, 16);
2577 }
/** RV40 8x8 mc33 entry point: forwards to put_pixels8_xy2_c() with 8 as last argument. */
2578 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2579     put_pixels8_xy2_c(dst, src, stride, 8);
2580 }
/** RV40 8x8 mc33 entry point: forwards to avg_pixels8_xy2_c() with 8 as last argument. */
2581 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2582     avg_pixels8_xy2_c(dst, src, stride, 8);
2583 }
2584 #endif /* CONFIG_RV40_DECODER */
2585
2586 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2587     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2588     int i;
2589
2590     for(i=0; i<w; i++){
2591         const int src_1= src[ -srcStride];
2592         const int src0 = src[0          ];
2593         const int src1 = src[  srcStride];
2594         const int src2 = src[2*srcStride];
2595         const int src3 = src[3*srcStride];
2596         const int src4 = src[4*srcStride];
2597         const int src5 = src[5*srcStride];
2598         const int src6 = src[6*srcStride];
2599         const int src7 = src[7*srcStride];
2600         const int src8 = src[8*srcStride];
2601         const int src9 = src[9*srcStride];
2602         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2603         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2604         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2605         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2606         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2607         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2608         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2609         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2610         src++;
2611         dst++;
2612     }
2613 }
2614
/**
 * mspel 8x8, position "10": horizontally filter src into a packed 8x8
 * scratch block, then pass src and that block to put_pixels8_l2().
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2620
/** mspel 8x8, position "20": the horizontal filter output is written straight to dst (8 rows). */
2621 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2622     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2623 }
2624
/**
 * mspel 8x8, position "30": horizontally filter src into a packed 8x8
 * scratch block, then pass src+1 (one pixel to the right) and that block
 * to put_pixels8_l2().
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src + 1, filtered, stride, stride, 8, 8);
}
2630
/** mspel 8x8, position "02": the vertical filter output is written straight to dst (8 columns). */
2631 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2632     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2633 }
2634
/**
 * mspel 8x8, position "12".
 *
 * Builds two packed 8x8 blocks and hands them to put_pixels8_l2():
 *  - vert: src filtered vertically,
 *  - both: src filtered horizontally (11 rows starting one row above src,
 *    so the vertical filter has its rows -1 .. 9) and then vertically.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[8 * 11];
    uint8_t vert[8 * 8];
    uint8_t both[8 * 8];

    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    /* rows_h + 8 skips the extra top row, i.e. points at block row 0 */
    wmv2_mspel8_v_lowpass(both, rows_h + 8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel 8x8, position "32".
 *
 * Same as position "12" except that the purely vertical block is computed
 * from src+1 (one pixel to the right).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[8 * 11];
    uint8_t vert[8 * 8];
    uint8_t both[8 * 8];

    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src + 1, 8, stride, 8);
    /* rows_h + 8 skips the extra top row, i.e. points at block row 0 */
    wmv2_mspel8_v_lowpass(both, rows_h + 8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel 8x8, position "22": horizontal filter into an 11-row scratch buffer
 * (starting one row above src), followed by the vertical filter writing
 * directly to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[8 * 11];

    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    /* rows_h + 8 skips the extra top row, i.e. points at block row 0 */
    wmv2_mspel8_v_lowpass(dst, rows_h + 8, stride, 8, 8);
}
2658
2659 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2660     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2661     int x;
2662     const int strength= ff_h263_loop_filter_strength[qscale];
2663
2664     for(x=0; x<8; x++){
2665         int d1, d2, ad1;
2666         int p0= src[x-2*stride];
2667         int p1= src[x-1*stride];
2668         int p2= src[x+0*stride];
2669         int p3= src[x+1*stride];
2670         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2671
2672         if     (d<-2*strength) d1= 0;
2673         else if(d<-  strength) d1=-2*strength - d;
2674         else if(d<   strength) d1= d;
2675         else if(d< 2*strength) d1= 2*strength - d;
2676         else                   d1= 0;
2677
2678         p1 += d1;
2679         p2 -= d1;
2680         if(p1&256) p1= ~(p1>>31);
2681         if(p2&256) p2= ~(p2>>31);
2682
2683         src[x-1*stride] = p1;
2684         src[x+0*stride] = p2;
2685
2686         ad1= FFABS(d1)>>1;
2687
2688         d2= av_clip((p0-p3)/4, -ad1, ad1);
2689
2690         src[x-2*stride] = p0 - d2;
2691         src[x+  stride] = p3 + d2;
2692     }
2693     }
2694 }
2695
2696 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2697     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2698     int y;
2699     const int strength= ff_h263_loop_filter_strength[qscale];
2700
2701     for(y=0; y<8; y++){
2702         int d1, d2, ad1;
2703         int p0= src[y*stride-2];
2704         int p1= src[y*stride-1];
2705         int p2= src[y*stride+0];
2706         int p3= src[y*stride+1];
2707         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2708
2709         if     (d<-2*strength) d1= 0;
2710         else if(d<-  strength) d1=-2*strength - d;
2711         else if(d<   strength) d1= d;
2712         else if(d< 2*strength) d1= 2*strength - d;
2713         else                   d1= 0;
2714
2715         p1 += d1;
2716         p2 -= d1;
2717         if(p1&256) p1= ~(p1>>31);
2718         if(p2&256) p2= ~(p2>>31);
2719
2720         src[y*stride-1] = p1;
2721         src[y*stride+0] = p2;
2722
2723         ad1= FFABS(d1)>>1;
2724
2725         d2= av_clip((p0-p3)/4, -ad1, ad1);
2726
2727         src[y*stride-2] = p0 - d2;
2728         src[y*stride+1] = p3 + d2;
2729     }
2730     }
2731 }
2732
/**
 * In-place separable (1,2,1) smoothing of an 8x8 block.
 *
 * Vertical pass: rows 1..6 get above + 2*centre + below; rows 0 and 7 are
 * only scaled by 4 so that every intermediate carries the same gain.
 * Horizontal pass: columns 1..6 get (left + 2*centre + right + 8) >> 4;
 * columns 0 and 7 only undo the vertical gain with (v + 2) >> 2.
 *
 * @param src    top-left sample of the block, overwritten with the result
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8 * 8];
    int row, col;

    /* vertical pass: src -> vert */
    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row * stride;
        int *out            = vert + row * 8;

        for (col = 0; col < 8; col++) {
            if (row == 0 || row == 7)
                out[col] = 4 * line[col];
            else
                out[col] = line[col - stride] + 2 * line[col] + line[col + stride];
        }
    }

    /* horizontal pass: vert -> src */
    for (row = 0; row < 8; row++) {
        const int *in = vert + row * 8;
        uint8_t *line = src + row * stride;

        line[0] = (in[0] + 2) >> 2;
        line[7] = (in[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            line[col] = (in[col - 1] + 2 * in[col] + in[col + 1] + 8) >> 4;
    }
}
2759
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, used for both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16*h samples
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2787
/**
 * Sum of absolute differences between a 16-wide block and a reference in
 * which each sample is combined with its right neighbour through avg2().
 * Reads pix2[0..16] on every row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2815
/**
 * Sum of absolute differences between a 16-wide block and a reference in
 * which each sample is combined with the sample one row below through
 * avg2(). Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2845
/**
 * Sum of absolute differences between a 16-wide block and a reference in
 * which each sample is combined with its right, lower and lower-right
 * neighbours through avg4(). Reads 17 columns and h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2875
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, used for both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 8*h samples
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2895
/**
 * Sum of absolute differences between an 8-wide block and a reference in
 * which each sample is combined with its right neighbour through avg2().
 * Reads pix2[0..8] on every row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2915
/**
 * Sum of absolute differences between an 8-wide block and a reference in
 * which each sample is combined with the sample one row below through
 * avg2(). Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2937
/**
 * Sum of absolute differences between an 8-wide block and a reference in
 * which each sample is combined with its right, lower and lower-right
 * neighbours through avg4(). Reads 9 columns and h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2959
2960 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2961     MpegEncContext *c = v;
2962     int score1=0;
2963     int score2=0;
2964     int x,y;
2965
2966     for(y=0; y<h; y++){
2967         for(x=0; x<16; x++){
2968             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2969         }
2970         if(y+1<h){
2971             for(x=0; x<15; x++){
2972                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
2973                              - s1[x+1] + s1[x+1+stride])
2974                         -FFABS(  s2[x  ] - s2[x  +stride]
2975                              - s2[x+1] + s2[x+1+stride]);
2976             }
2977         }
2978         s1+= stride;
2979         s2+= stride;
2980     }
2981
2982     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
2983     else  return score1 + FFABS(score2)*8;
2984 }
2985
2986 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2987     MpegEncContext *c = v;
2988     int score1=0;
2989     int score2=0;
2990     int x,y;
2991
2992     for(y=0; y<h; y++){
2993         for(x=0; x<8; x++){
2994             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2995         }
2996         if(y+1<h){
2997             for(x=0; x<7; x++){
2998                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
2999                              - s1[x+1] + s1[x+1+stride])
3000                         -FFABS(  s2[x  ] - s2[x  +stride]
3001                              - s2[x+1] + s2[x+1+stride]);
3002             }
3003         }
3004         s1+= stride;
3005         s2+= stride;
3006     }
3007
3008     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3009     else  return score1 + FFABS(score2)*8;
3010 }
3011
3012 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3013     int i;
3014     unsigned int sum=0;
3015
3016     for(i=0; i<8*8; i++){
3017         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3018         int w= weight[i];
3019         b>>= RECON_SHIFT;
3020         assert(-512<b && b<512);
3021
3022         sum += (w*b)*(w*b)>>4;
3023     }
3024     return sum>>2;
3025 }
3026
3027 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3028     int i;
3029
3030     for(i=0; i<8*8; i++){
3031         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3032     }
3033 }
3034
3035 /**
3036  * permutes an 8x8 block.
3037  * @param block the block which will be permuted according to the given permutation vector
3038  * @param permutation the permutation vector
3039  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3040  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3041  *                  (inverse) permutated to scantable order!
3042  */
3043 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3044 {
3045     int i;
3046     DCTELEM temp[64];
3047
3048     if(last<=0) return;
3049     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3050
3051     for(i=0; i<=last; i++){
3052         const int j= scantable[i];
3053         temp[j]= block[j];
3054         block[j]=0;
3055     }
3056
3057     for(i=0; i<=last; i++){
3058         const int j= scantable[i];
3059         const int perm_j= permutation[j];
3060         block[perm_j]= temp[j];
3061     }
3062 }
3063
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
3067
3068 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3069     int i;
3070
3071     memset(cmp, 0, sizeof(void*)*6);
3072
3073     for(i=0; i<6; i++){
3074         switch(type&0xFF){
3075         case FF_CMP_SAD:
3076             cmp[i]= c->sad[i];
3077             break;
3078         case FF_CMP_SATD:
3079             cmp[i]= c->hadamard8_diff[i];
3080             break;
3081         case FF_CMP_SSE:
3082             cmp[i]= c->sse[i];
3083             break;
3084         case FF_CMP_DCT:
3085             cmp[i]= c->dct_sad[i];
3086             break;
3087         case FF_CMP_DCT264:
3088             cmp[i]= c->dct264_sad[i];
3089             break;
3090         case FF_CMP_DCTMAX:
3091             cmp[i]= c->dct_max[i];
3092             break;
3093         case FF_CMP_PSNR:
3094             cmp[i]= c->quant_psnr[i];
3095             break;
3096         case FF_CMP_BIT:
3097             cmp[i]= c->bit[i];
3098             break;
3099         case FF_CMP_RD:
3100             cmp[i]= c->rd[i];
3101             break;
3102         case FF_CMP_VSAD:
3103             cmp[i]= c->vsad[i];
3104             break;
3105         case FF_CMP_VSSE:
3106             cmp[i]= c->vsse[i];
3107             break;
3108         case FF_CMP_ZERO:
3109             cmp[i]= zero_cmp;
3110             break;
3111         case FF_CMP_NSSE:
3112             cmp[i]= c->nsse[i];
3113             break;
3114 #if CONFIG_DWT
3115         case FF_CMP_W53:
3116             cmp[i]= c->w53[i];
3117             break;
3118         case FF_CMP_W97:
3119             cmp[i]= c->w97[i];
3120             break;
3121 #endif
3122         default:
3123             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3124         }
3125     }
3126 }
3127
3128 static void clear_block_c(DCTELEM *block)
3129 {
3130     memset(block, 0, sizeof(DCTELEM)*64);
3131 }
3132
3133 /**
3134  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3135  */
3136 static void clear_blocks_c(DCTELEM *blocks)
3137 {
3138     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3139 }
3140
3141 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3142     long i;
3143     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3144         long a = *(long*)(src+i);
3145         long b = *(long*)(dst+i);
3146         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3147     }
3148     for(; i<w; i++)
3149         dst[i+0] += src[i+0];
3150 }
3151
3152 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3153     long i;
3154     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3155         long a = *(long*)(src1+i);
3156         long b = *(long*)(src2+i);
3157         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3158     }
3159     for(; i<w; i++)
3160         dst[i] = src1[i]+src2[i];
3161 }
3162
3163 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3164     long i;
3165 #if !HAVE_FAST_UNALIGNED
3166     if((long)src2 & (sizeof(long)-1)){
3167         for(i=0; i+7<w; i+=8){
3168             dst[i+0] = src1[i+0]-src2[i+0];
3169             dst[i+1] = src1[i+1]-src2[i+1];
3170             dst[i+2] = src1[i+2]-src2[i+2];
3171             dst[i+3] = src1[i+3]-src2[i+3];
3172             dst[i+4] = src1[i+4]-src2[i+4];
3173             dst[i+5] = src1[i+5]-src2[i+5];
3174             dst[i+6] = src1[i+6]-src2[i+6];
3175             dst[i+7] = src1[i+7]-src2[i+7];
3176         }
3177     }else
3178 #endif
3179     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3180         long a = *(long*)(src1+i);
3181         long b = *(long*)(src2+i);
3182         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3183     }
3184     for(; i<w; i++)
3185         dst[i+0] = src1[i+0]-src2[i+0];
3186 }
3187
/**
 * Reconstruct a row from median-prediction residuals.
 *
 * For every position the predictor is mid_pred() of the running left value,
 * the src1 sample, and (left + src1 sample - top-left) masked to 8 bits.
 * The residual diff[i] is added to the predictor (truncated to 8 bits) and
 * stored in dst.
 *
 * @param dst      output row
 * @param src1     reference row used for the "top" samples
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: initial left value; out: last reconstructed value
 * @param left_top in: initial top-left value; out: last src1 sample read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur        = *left;
    uint8_t above_left = *left_top;
    int x;

    for(x=0; x<w; x++){
        const uint8_t above = src1[x];
        const int     grad  = (cur + above - above_left)&0xFF;

        cur        = mid_pred(cur, above, grad) + diff[x];
        above_left = above;
        dst[x]     = cur;
    }

    *left     = cur;
    *left_top = above_left;
}
3204
/**
 * Compute median-prediction residuals for a row.
 *
 * For every position the predictor is mid_pred() of the previous src2
 * sample (the running left value), the src1 sample, and
 * (left + src1 sample - top-left) masked to 8 bits. dst receives the src2
 * sample minus that predictor, truncated to 8 bits.
 *
 * @param dst      output residuals
 * @param src1     reference row used for the "top" samples
 * @param src2     row being predicted
 * @param w        number of samples
 * @param left     in: initial left value; out: last src2 sample read
 * @param left_top in: initial top-left value; out: last src1 sample read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur        = *left;
    uint8_t above_left = *left_top;
    int x;

    for(x=0; x<w; x++){
        const uint8_t above = src1[x];
        const int     pred  = mid_pred(cur, above, (cur + above - above_left)&0xFF);

        above_left = above;
        cur        = src2[x];
        dst[x]     = cur - pred;
    }

    *left     = cur;
    *left_top = above_left;
}
3222
/**
 * Running-sum ("left") prediction: each output byte is the low 8 bits of
 * acc plus all src bytes up to and including that position.
 *
 * @param dst output row
 * @param src residuals
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the accumulator after the last sample (not truncated to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int x;

    for(x=0; x<w; x++){
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
3241
/* byte offsets of the four channels inside one 32-bit pixel */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Running-sum ("left") prediction on 4-byte pixels, one independent
 * accumulator per channel. Each output byte is the low 8 bits of its
 * channel's accumulator.
 *
 * @param dst   output pixels, 4 bytes each
 * @param src   residual pixels, 4 bytes each
 * @param w     number of pixels
 * @param red   in: initial red accumulator; out: accumulator after the row
 * @param green same for green
 * @param blue  same for blue
 * @param alpha same for alpha
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int sum_r = *red;
    int sum_g = *green;
    int sum_b = *blue;
    int sum_a = *alpha;
    int left;

    for(left=w; left>0; left--, src+=4, dst+=4){
        sum_b += src[B];
        sum_g += src[G];
        sum_r += src[R];
        sum_a += src[A];

        dst[B] = sum_b;
        dst[G] = sum_g;
        dst[R] = sum_r;
        dst[A] = sum_a;
    }

    *red   = sum_r;
    *green = sum_g;
    *blue  = sum_b;
    *alpha = sum_a;
}
#undef B
#undef G
#undef R
#undef A
3282
/* Two-input butterfly on fresh inputs: o1 receives i1+i2, o2 receives i1-i2.
 * Expands to two bare statements with no enclosing braces, so it must not be
 * used as the sole body of an unbraced if/else/for. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x+y and y becomes (old x)-y.
 * The expansion declares block-local temporaries named a and b, so the
 * arguments must not be expressions that refer to identifiers a or b. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y|: the summed magnitudes of both butterfly outputs, computed
 * without storing them. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3297
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform of
 * the difference src - dst.
 *
 * @param s      unused
 * @param dst    first block (subtracted)
 * @param src    second block
 * @param stride offset between consecutive rows of both blocks
 * @param h      must be 8
 * @return sum of the absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int total = 0;
    int row, col, k;

    assert(h==8);

    /* horizontal pass: three butterfly stages on every row of differences */
    for(row=0; row<8; row++){
        //FIXME try pointer walks
        int           *t = coef + 8*row;
        const uint8_t *p = src + stride*row;
        const uint8_t *q = dst + stride*row;

        for(k=0; k<8; k+=2){
            BUTTERFLY2(t[k], t[k+1], p[k]-q[k], p[k+1]-q[k+1]);
        }

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass: two stored stages, the third is folded into the sum */
    for(col=0; col<8; col++){
        int *t = coef + col;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        total += BUTTERFLYA(t[8*0], t[8*4])
                +BUTTERFLYA(t[8*1], t[8*5])
                +BUTTERFLYA(t[8*2], t[8*6])
                +BUTTERFLYA(t[8*3], t[8*7]);
    }

    return total;
}
3349
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform of
 * src itself, with the magnitude of the mean (DC) term subtracted.
 *
 * @param s      unused
 * @param src    block to analyse
 * @param dummy  unused
 * @param stride offset between consecutive rows
 * @param h      must be 8
 * @return sum of the absolute transformed samples minus the DC magnitude
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int total = 0;
    int row, col, k;

    assert(h==8);

    /* horizontal pass: three butterfly stages on every row */
    for(row=0; row<8; row++){
        //FIXME try pointer walks
        int           *t = coef + 8*row;
        const uint8_t *p = src + stride*row;

        for(k=0; k<8; k+=2){
            BUTTERFLY2(t[k], t[k+1], p[k], p[k+1]);
        }

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass: two stored stages, the third is folded into the sum */
    for(col=0; col<8; col++){
        int *t = coef + col;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        total += BUTTERFLYA(t[8*0], t[8*4])
                +BUTTERFLYA(t[8*1], t[8*5])
                +BUTTERFLYA(t[8*2], t[8*6])
                +BUTTERFLYA(t[8*3], t[8*7]);
    }

    total -= FFABS(coef[8*0] + coef[8*4]); // -mean

    return total;
}
3397
3398 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3399     MpegEncContext * const s= (MpegEncContext *)c;
3400     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3401
3402     assert(h==8);
3403
3404     s->dsp.diff_pixels(temp, src1, src2, stride);
3405     s->dsp.fdct(temp);
3406     return s->dsp.sum_abs_dctelem(temp);
3407 }
3408
#if CONFIG_GPL
/*
 * One 8-point 1-D integer transform pass built only from additions,
 * subtractions and right shifts (presumably the H.264 8x8 transform, going
 * by the "264" in the user's name -- confirm).
 *
 * Before expanding it the caller must define
 *   SRC(x)   - expression reading input sample x (0..7)
 *   DST(x,v) - statement/expression consuming output x with value v
 * SRC() is evaluated more than once per input, and all inputs are read into
 * locals before any DST() runs, so SRC and DST may refer to the same storage.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * 8x8 comparison using the DCT8_1D integer transform.
 *
 * dsp.diff_pixels fills the local 8x8 array from src1/src2 (presumably with
 * their pixel difference -- the helper is not visible here). The first pass
 * transforms every row in place; the second pass transforms every column and
 * adds up the absolute values of its outputs without storing them.
 *
 * Note that h is not used and, unlike the neighbouring 8x8 metrics, is not
 * asserted to be 8.
 *
 * @param c      MpegEncContext providing dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride offset between consecutive rows of both blocks
 * @param h      unused
 * @return sum of the absolute transform outputs
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* pass 1: rows, written back in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* pass 2: columns, outputs only accumulated */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3461
3462 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3463     MpegEncContext * const s= (MpegEncContext *)c;
3464     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3465     int sum=0, i;
3466
3467     assert(h==8);
3468
3469     s->dsp.diff_pixels(temp, src1, src2, stride);
3470     s->dsp.fdct(temp);
3471
3472     for(i=0; i<64; i++)
3473         sum= FFMAX(sum, FFABS(temp[i]));
3474
3475     return sum;
3476 }
3477
3478 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3479     MpegEncContext * const s= (MpegEncContext *)c;
3480     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3481     DCTELEM * const bak = temp+64;
3482     int sum=0, i;
3483
3484     assert(h==8);
3485     s->mb_intra=0;
3486
3487     s->dsp.diff_pixels(temp, src1, src2, stride);
3488
3489     memcpy(bak, temp, 64*sizeof(DCTELEM));
3490
3491     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3492     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3493     ff_simple_idct(temp); //FIXME
3494
3495     for(i=0; i<64; i++)
3496         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3497
3498     return sum;
3499 }
3500
3501 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3502     MpegEncContext * const s= (MpegEncContext *)c;
3503     const uint8_t *scantable= s->intra_scantable.permutated;
3504     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3505     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3506     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3507     int i, last, run, bits, level, distortion, start_i;
3508     const int esc_length= s->ac_esc_length;
3509     uint8_t * length;
3510     uint8_t * last_length;
3511
3512     assert(h==8);
3513
3514     copy_block8(lsrc1, src1, 8, stride, 8);
3515     copy_block8(lsrc2, src2, 8, stride, 8);
3516
3517     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3518
3519     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3520
3521     bits=0;
3522
3523     if (s->mb_intra) {
3524         start_i = 1;
3525         length     = s->intra_ac_vlc_length;
3526         last_length= s->intra_ac_vlc_last_length;
3527         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3528     } else {
3529         start_i = 0;
3530         length     = s->inter_ac_vlc_length;
3531         last_length= s->inter_ac_vlc_last_length;
3532     }
3533
3534     if(last>=start_i){
3535         run=0;
3536         for(i=start_i; i<last; i++){
3537             int j= scantable[i];
3538             level= temp[j];
3539
3540             if(level){
3541                 level+=64;
3542                 if((level&(~127)) == 0){
3543                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3544                 }else
3545                     bits+= esc_length;
3546                 run=0;
3547             }else
3548                 run++;
3549         }
3550         i= scantable[last];
3551
3552         level= temp[i] + 64;
3553
3554         assert(level - 64);
3555
3556         if((level&(~127)) == 0){
3557             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3558         }else
3559             bits+= esc_length;
3560
3561     }
3562
3563     if(last>=0){
3564         if(s->mb_intra)
3565             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3566         else
3567             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3568     }
3569
3570     s->dsp.idct_add(lsrc2, 8, temp);
3571
3572     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3573
3574     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3575 }
3576
3577 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3578     MpegEncContext * const s= (MpegEncContext *)c;
3579     const uint8_t *scantable= s->intra_scantable.permutated;
3580     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3581     int i, last, run, bits, level, start_i;
3582     const int esc_length= s->ac_esc_length;
3583     uint8_t * length;
3584     uint8_t * last_length;
3585
3586     assert(h==8);
3587
3588     s->dsp.diff_pixels(temp, src1, src2, stride);
3589
3590     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3591
3592     bits=0;
3593
3594     if (s->mb_intra) {
3595         start_i = 1;
3596         length     = s->intra_ac_vlc_length;
3597         last_length= s->intra_ac_vlc_last_length;
3598         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3599     } else {
3600         start_i = 0;
3601         length     = s->inter_ac_vlc_length;
3602         last_length= s->inter_ac_vlc_last_length;
3603     }
3604
3605     if(last>=start_i){
3606         run=0;
3607         for(i=start_i; i<last; i++){
3608             int j= scantable[i];
3609             level= temp[j];
3610
3611             if(level){
3612                 level+=64;
3613                 if((level&(~127)) == 0){
3614                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3615                 }else
3616                     bits+= esc_length;
3617                 run=0;
3618             }else
3619                 run++;
3620         }
3621         i= scantable[last];
3622
3623         level= temp[i] + 64;
3624
3625         assert(level - 64);
3626
3627         if((level&(~127)) == 0){
3628             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3629         }else
3630             bits+= esc_length;
3631     }
3632
3633     return bits;
3634 }
3635
/*
 * Generates vsad_intra<size>_c(): the sum of absolute differences between
 * every pixel and the pixel directly below it, over a block <size> pixels
 * wide and h rows high (h-1 row pairs). Columns are consumed four at a
 * time. The c and dummy arguments are unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                          \
    int row, col;                                                           \
                                                                            \
    for(row=1; row<h; row++, s+=stride){                                    \
        for(col=0; col<size; col+=4){                                       \
            total += FFABS(s[col  ] - s[col  +stride]);                     \
            total += FFABS(s[col+1] - s[col+1+stride]);                     \
            total += FFABS(s[col+2] - s[col+2+stride]);                     \
            total += FFABS(s[col+3] - s[col+3+stride]);                     \
        }                                                                   \
    }                                                                       \
                                                                            \
    return total;                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3653
/**
 * Vertical SAD of the difference signal over a block 16 pixels wide: for
 * every pixel, the difference s1 - s2 is compared with the same difference
 * one row below and the absolute changes are summed.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows of both blocks
 * @param h      number of rows (h-1 row pairs are evaluated)
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for(row=1; row<h; row++, s1+=stride, s2+=stride){
        for(col=0; col<16; col++){
            const int here  = s1[col       ] - s2[col       ];
            const int below = s1[col+stride] - s2[col+stride];
            total += FFABS(here - below);
        }
    }

    return total;
}
3668
/* square of a; note that the argument is evaluated twice */
#define SQ(a) ((a)*(a))
/*
 * Generates vsse_intra<size>_c(): the sum of squared differences between
 * every pixel and the pixel directly below it, over a block <size> pixels
 * wide and h rows high (h-1 row pairs). Columns are consumed four at a
 * time. The c and dummy arguments are unused.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                          \
    int row, col;                                                           \
                                                                            \
    for(row=1; row<h; row++, s+=stride){                                    \
        for(col=0; col<size; col+=4){                                       \
            total += SQ(s[col  ] - s[col  +stride]);                        \
            total += SQ(s[col+1] - s[col+1+stride]);                        \
            total += SQ(s[col+2] - s[col+2+stride]);                        \
            total += SQ(s[col+3] - s[col+3+stride]);                        \
        }                                                                   \
    }                                                                       \
                                                                            \
    return total;                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

/**
 * Vertical SSE of the difference signal over a block 16 pixels wide: for
 * every pixel, the difference s1 - s2 is compared with the same difference
 * one row below and the squared changes are summed.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows of both blocks
 * @param h      number of rows (h-1 row pairs are evaluated)
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for(row=1; row<h; row++, s1+=stride, s2+=stride){
        for(col=0; col<16; col++){
            const int here  = s1[col       ] - s2[col       ];
            const int below = s1[col+stride] - s2[col+stride];
            total += SQ(here - below);
        }
    }

    return total;
}
3702
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements compared
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for(k=0; k<size; k++){
        const int d = pix1[k]-pix2[k];
        total += d*d;
    }
    return total;
}
3711
/*
 * Instantiate the 16-wide variants of the 8x8 metrics above.
 * WRAPPER8_16_SQ is defined elsewhere in this file; presumably it emits a
 * function named by its second argument that evaluates the 8x8 function
 * named by its first argument on each 8x8 quadrant -- confirm against the
 * macro definition.
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
/* dct264_sad8x8_c only exists in GPL builds */
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3722
/**
 * Element-wise product: dst[k] = src0[k] * src1[k] for k in [0, len).
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for(k=0; k<len; k++){
        dst[k] = src0[k] * src1[k];
    }
}
3728
/**
 * Element-wise product with the second operand read back to front:
 * dst[k] = src0[k] * src1[len-1-k] for k in [0, len).
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for(k=0; k<len; k++){
        dst[k] = src0[k] * src1[len-1-k];
    }
}
3735
/**
 * Multiply-add: dst[k] = src0[k] * src1[k] + src2[k] for k in [0, len).
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k;

    for(k=0; k<len; k++){
        dst[k] = src0[k] * src1[k] + src2[k];
    }
}
3741
/**
 * Windowed overlap of two half-blocks.
 *
 * win and dst hold 2*len elements, src0 and src1 hold len elements. Working
 * from both ends towards the middle, for k in [0, len) with m = 2*len-1-k:
 *   dst[k] = src0[k]*win[m] - src1[len-1-k]*win[k]
 *   dst[m] = src0[k]*win[k] + src1[len-1-k]*win[m]
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int k;

    for(k=0; k<len; k++) {
        const int m = 2*len-1-k;
        /* read everything before writing, as the outputs come in pairs */
        float s0 = src0[k];
        float s1 = src1[len-1-k];
        float wi = win[k];
        float wj = win[m];
        dst[k] = s0*wj - s1*wi;
        dst[m] = s0*wi + s1*wj;
    }
}
3758
/**
 * Scale a vector: dst[k] = src[k] * mul for k in [0, len).
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
3766
/**
 * Multiply src by a sequence of 2-element vectors and a scalar.
 *
 * Output pair number p (elements 2p and 2p+1) is src times the two elements
 * of sv[p], times mul. Elements are written in pairs while the index is
 * below len.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;

    for (k = 0; k < len; k += 2) {
        const float *vec = sv[k >> 1];

        dst[k  ] = src[k  ] * vec[0] * mul;
        dst[k+1] = src[k+1] * vec[1] * mul;
    }
}
3776
/**
 * Multiply src by a sequence of 4-element vectors and a scalar.
 *
 * Output group number g (elements 4g .. 4g+3) is src times the four elements
 * of sv[g], times mul. Elements are written in groups of four while the
 * index is below len.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;

    for (k = 0; k < len; k += 4) {
        const float *vec = sv[k >> 2];

        dst[k  ] = src[k  ] * vec[0] * mul;
        dst[k+1] = src[k+1] * vec[1] * mul;
        dst[k+2] = src[k+2] * vec[2] * mul;
        dst[k+3] = src[k+3] * vec[3] * mul;
    }
}
3788
/**
 * Expand a sequence of 2-element vectors, scaled by mul, into dst.
 *
 * Output pair number p (elements 2p and 2p+1) is sv[p] times mul. Elements
 * are written in pairs while the index is below len.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int k;

    for (k = 0; k < len; k += 2) {
        const float *vec = sv[k >> 1];

        dst[k  ] = vec[0] * mul;
        dst[k+1] = vec[1] * mul;
    }
}
3798
/**
 * Expand a sequence of 4-element vectors, scaled by mul, into dst.
 *
 * Output group number g (elements 4g .. 4g+3) is sv[g] times mul. Elements
 * are written in groups of four while the index is below len.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int k;

    for (k = 0; k < len; k += 4) {
        const float *vec = sv[k >> 2];

        dst[k  ] = vec[0] * mul;
        dst[k+1] = vec[1] * mul;
        dst[k+2] = vec[2] * mul;
        dst[k+3] = vec[3] * mul;
    }
}
3810
/**
 * In-place butterfly on two vectors: afterwards v1[k] holds the sum and
 * v2[k] the difference (old v1[k] - old v2[k]) for k in [0, len).
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float x = v1[k];
        const float y = v2[k];

        v1[k] = x + y;
        v2[k] = x - y;
    }
}
3821
/**
 * Dot product of two float vectors, accumulated in a float from the first
 * element to the last.
 *
 * @return sum of v1[k] * v2[k] for k in [0, len); 0 when len <= 0
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
3832
/**
 * Clip one value using unsigned integer compares only.
 *
 * All arguments are the raw 32-bit patterns that
 * vector_clipf_c_opposite_sign() reads out of float objects.
 *
 * @param a        value to clip
 * @param mini     lower bound
 * @param maxi     upper bound
 * @param maxisign maxi with bit 31 flipped
 * @return mini if a compares above mini as an unsigned integer, maxi if a
 *         with bit 31 flipped compares above maxisign, otherwise a
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    /* 1U, not 1: shifting a signed 1 into bit 31 is undefined behaviour */
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/**
 * Clip a float vector to [*min, *max] by working on the raw 32-bit patterns
 * of the values. Intended for bounds of opposite sign (*min negative, *max
 * positive), as the name says.
 *
 * Elements are processed in groups of eight while the index is below len,
 * so len is expected to be a multiple of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    /* 1U, not 1: shifting a signed 1 into bit 31 is undefined behaviour */
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip a float vector to [min, max].
 *
 * When the bounds have opposite signs (min < 0 < max) the work is handed to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied element by
 * element. Elements are processed in groups of eight while the index is
 * below len, so len is expected to be a multiple of 8.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for(base=0; base < len; base+=8) {
        for(k=0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
3877
/**
 * Dot product of two int16_t vectors, with every individual product shifted
 * right by shift before it is accumulated.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to each product
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k;

    for (k = 0; order != 0; order--, k++)
        acc += (v1[k] * v2[k]) >> shift;

    return acc;
}
3887
/**
 * Dot product of v1 and v2, combined with an in-place update of v1.
 *
 * Each element of v1 first contributes v1[k]*v2[k] to the result and is then
 * incremented by mul*v3[k] (stored back as int16_t).
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector of the dot product
 * @param v3    vector added to v1 after scaling
 * @param order number of elements
 * @param mul   scale applied to v3
 * @return dot product of the original v1 with v2
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k;

    for (k = 0; order != 0; order--, k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
3897
3898 #define W0 2048
3899 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3900 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3901 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3902 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3903 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3904 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3905 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
3906
3907 static void wmv2_idct_row(short * b)
3908 {
3909     int s1,s2;
3910     int a0,a1,a2,a3,a4,a5,a6,a7;
3911     /*step 1*/
3912     a1 = W1*b[1]+W7*b[7];
3913     a7 = W7*b[1]-W1*b[7];
3914     a5 = W5*b[5]+W3*b[3];
3915     a3 = W3*b[5]-W5*b[3];
3916     a2 = W2*b[2]+W6*b[6];
3917     a6 = W6*b[2]-W2*b[6];
3918     a0 = W0*b[0]+W0*b[4];
3919     a4 = W0*b[0]-W0*b[4];
3920     /*step 2*/
3921     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3922     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3923     /*step 3*/
3924     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3925     b[1] = (a4+a6 +s1   + (1<<7))>>8;
3926     b[2] = (a4-a6 +s2   + (1<<7))>>8;
3927     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3928     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3929     b[5] = (a4-a6 -s2   + (1<<7))>>8;
3930     b[6] = (a4+a6 -s1   + (1<<7))>>8;
3931     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
3932 }
3933 static void wmv2_idct_col(short * b)
3934 {
3935     int s1,s2;
3936     int a0,a1,a2,a3,a4,a5,a6,a7;
3937     /*step 1, with extended precision*/
3938     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3939     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3940     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3941     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3942     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3943     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3944     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
3945     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
3946     /*step 2*/
3947     s1 = (181*(a1-a5+a7-a3)+128)>>8;
3948     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3949     /*step 3*/
3950     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3951     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
3952     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
3953     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3954
3955     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3956     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
3957     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
3958     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
3959 }
/**
 * In-place WMV2 inverse DCT of an 8x8 block: all eight rows are
 * transformed first, then all eight columns.
 *
 * @param block 64 coefficients, 8 per row
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    /* row pass: rows start every 8 elements */
    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    /* column pass: columns start at consecutive elements of the first row */
    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
3970 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3971  converted */
3972 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
3973 {
3974     ff_wmv2_idct_c(block);
3975     ff_put_pixels_clamped_c(block, dest, line_size);
3976 }
3977 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
3978 {
3979     ff_wmv2_idct_c(block);
3980     ff_add_pixels_clamped_c(block, dest, line_size);
3981 }
3982 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3983 {
3984     j_rev_dct (block);
3985     ff_put_pixels_clamped_c(block, dest, line_size);
3986 }
3987 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3988 {
3989     j_rev_dct (block);
3990     ff_add_pixels_clamped_c(block, dest, line_size);
3991 }
3992
3993 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3994 {
3995     j_rev_dct4 (block);
3996     put_pixels_clamped4_c(block, dest, line_size);
3997 }
3998 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3999 {
4000     j_rev_dct4 (block);
4001     add_pixels_clamped4_c(block, dest, line_size);
4002 }
4003
4004 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4005 {
4006     j_rev_dct2 (block);
4007     put_pixels_clamped2_c(block, dest, line_size);
4008 }
4009 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4010 {
4011     j_rev_dct2 (block);
4012     add_pixels_clamped2_c(block, dest, line_size);
4013 }
4014
4015 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4016 {
4017     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4018
4019     dest[0] = cm[(block[0] + 4)>>3];
4020 }
4021 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4022 {
4023     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4024
4025     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4026 }
4027
4028 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4029
4030 /* init static data */
4031 av_cold void dsputil_static_init(void)
4032 {
4033     int i;
4034
4035     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4036     for(i=0;i<MAX_NEG_CROP;i++) {
4037         ff_cropTbl[i] = 0;
4038         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4039     }
4040
4041     for(i=0;i<512;i++) {
4042         ff_squareTbl[i] = (i - 256) * (i - 256);
4043     }
4044
4045     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4046 }
4047
4048 int ff_check_alignment(void){
4049     static int did_fail=0;
4050     DECLARE_ALIGNED(16, int, aligned);
4051
4052     if((intptr_t)&aligned & 15){
4053         if(!did_fail){
4054 #if HAVE_MMX || HAVE_ALTIVEC
4055             av_log(NULL, AV_LOG_ERROR,
4056                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4057                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4058                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4059                 "Do not report crashes to FFmpeg developers.\n");
4060 #endif
4061             did_fail=1;
4062         }
4063         return -1;
4064     }
4065     return 0;
4066 }
4067
4068 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4069 {
4070     int i;
4071
4072     ff_check_alignment();
4073
4074 #if CONFIG_ENCODERS
4075     if(avctx->dct_algo==FF_DCT_FASTINT) {
4076         c->fdct = fdct_ifast;
4077         c->fdct248 = fdct_ifast248;
4078     }
4079     else if(avctx->dct_algo==FF_DCT_FAAN) {
4080         c->fdct = ff_faandct;
4081         c->fdct248 = ff_faandct248;
4082     }
4083     else {
4084         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4085         c->fdct248 = ff_fdct248_islow;
4086     }
4087 #endif //CONFIG_ENCODERS
4088
4089     if(avctx->lowres==1){
4090         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4091             c->idct_put= ff_jref_idct4_put;
4092             c->idct_add= ff_jref_idct4_add;
4093         }else{
4094             c->idct_put= ff_h264_lowres_idct_put_c;
4095             c->idct_add= ff_h264_lowres_idct_add_c;
4096         }
4097         c->idct    = j_rev_dct4;
4098         c->idct_permutation_type= FF_NO_IDCT_PERM;
4099     }else if(avctx->lowres==2){
4100         c->idct_put= ff_jref_idct2_put;
4101         c->idct_add= ff_jref_idct2_add;
4102         c->idct    = j_rev_dct2;
4103         c->idct_permutation_type= FF_NO_IDCT_PERM;
4104     }else if(avctx->lowres==3){
4105         c->idct_put= ff_jref_idct1_put;
4106         c->idct_add= ff_jref_idct1_add;
4107         c->idct    = j_rev_dct1;
4108         c->idct_permutation_type= FF_NO_IDCT_PERM;
4109     }else{
4110         if(avctx->idct_algo==FF_IDCT_INT){
4111             c->idct_put= ff_jref_idct_put;
4112             c->idct_add= ff_jref_idct_add;
4113             c->idct    = j_rev_dct;
4114             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4115         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4116                 avctx->idct_algo==FF_IDCT_VP3){
4117             c->idct_put= ff_vp3_idct_put_c;
4118             c->idct_add= ff_vp3_idct_add_c;
4119             c->idct    = ff_vp3_idct_c;
4120             c->idct_permutation_type= FF_NO_IDCT_PERM;
4121         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4122             c->idct_put= ff_wmv2_idct_put_c;
4123             c->idct_add= ff_wmv2_idct_add_c;
4124             c->idct    = ff_wmv2_idct_c;
4125             c->idct_permutation_type= FF_NO_IDCT_PERM;
4126         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4127             c->idct_put= ff_faanidct_put;
4128             c->idct_add= ff_faanidct_add;
4129             c->idct    = ff_faanidct;
4130             c->idct_permutation_type= FF_NO_IDCT_PERM;
4131         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4132             c->idct_put= ff_ea_idct_put_c;
4133             c->idct_permutation_type= FF_NO_IDCT_PERM;
4134         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4135             c->idct     = ff_bink_idct_c;
4136             c->idct_add = ff_bink_idct_add_c;
4137             c->idct_put = ff_bink_idct_put_c;
4138             c->idct_permutation_type = FF_NO_IDCT_PERM;
4139         }else{ //accurate/default
4140             c->idct_put= ff_simple_idct_put;
4141             c->idct_add= ff_simple_idct_add;
4142             c->idct    = ff_simple_idct;
4143             c->idct_permutation_type= FF_NO_IDCT_PERM;
4144         }
4145     }
4146
4147     c->get_pixels = get_pixels_c;
4148     c->diff_pixels = diff_pixels_c;
4149     c->put_pixels_clamped = ff_put_pixels_clamped_c;
4150     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
4151     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4152     c->add_pixels_clamped = ff_add_pixels_clamped_c;
4153     c->add_pixels8 = add_pixels8_c;
4154     c->add_pixels4 = add_pixels4_c;
4155     c->sum_abs_dctelem = sum_abs_dctelem_c;
4156     c->emulated_edge_mc = ff_emulated_edge_mc;
4157     c->gmc1 = gmc1_c;
4158     c->gmc = ff_gmc_c;
4159     c->clear_block = clear_block_c;
4160     c->clear_blocks = clear_blocks_c;
4161     c->pix_sum = pix_sum_c;
4162     c->pix_norm1 = pix_norm1_c;
4163
4164     c->fill_block_tab[0] = fill_block16_c;
4165     c->fill_block_tab[1] = fill_block8_c;
4166     c->scale_block = scale_block_c;
4167
4168     /* TODO [0] 16  [1] 8 */
4169     c->pix_abs[0][0] = pix_abs16_c;
4170     c->pix_abs[0][1] = pix_abs16_x2_c;
4171     c->pix_abs[0][2] = pix_abs16_y2_c;
4172     c->pix_abs[0][3] = pix_abs16_xy2_c;
4173     c->pix_abs[1][0] = pix_abs8_c;
4174     c->pix_abs[1][1] = pix_abs8_x2_c;
4175     c->pix_abs[1][2] = pix_abs8_y2_c;
4176     c->pix_abs[1][3] = pix_abs8_xy2_c;
4177
4178 #define dspfunc(PFX, IDX, NUM) \
4179     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4180     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4181     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4182     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4183
4184     dspfunc(put, 0, 16);
4185     dspfunc(put_no_rnd, 0, 16);
4186     dspfunc(put, 1, 8);
4187     dspfunc(put_no_rnd, 1, 8);
4188     dspfunc(put, 2, 4);
4189     dspfunc(put, 3, 2);
4190
4191     dspfunc(avg, 0, 16);
4192     dspfunc(avg_no_rnd, 0, 16);
4193     dspfunc(avg, 1, 8);
4194     dspfunc(avg_no_rnd, 1, 8);
4195     dspfunc(avg, 2, 4);
4196     dspfunc(avg, 3, 2);
4197 #undef dspfunc
4198
4199     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4200     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4201
4202     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4203     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4204     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4205     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4206     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4207     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4208     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4209     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4210     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4211
4212     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4213     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4214     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4215     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4216     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4217     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4218     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4219     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4220     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4221
4222 #define dspfunc(PFX, IDX, NUM) \
4223     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4224     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4225     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4226     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4227     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4228     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4229     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4230     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4231     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4232     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4233     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4234     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4235     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4236     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4237     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4238     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4239
4240     dspfunc(put_qpel, 0, 16);
4241     dspfunc(put_no_rnd_qpel, 0, 16);
4242
4243     dspfunc(avg_qpel, 0, 16);
4244     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4245
4246     dspfunc(put_qpel, 1, 8);
4247     dspfunc(put_no_rnd_qpel, 1, 8);
4248
4249     dspfunc(avg_qpel, 1, 8);
4250     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4251
4252     dspfunc(put_h264_qpel, 0, 16);
4253     dspfunc(put_h264_qpel, 1, 8);
4254     dspfunc(put_h264_qpel, 2, 4);
4255     dspfunc(put_h264_qpel, 3, 2);
4256     dspfunc(avg_h264_qpel, 0, 16);
4257     dspfunc(avg_h264_qpel, 1, 8);
4258     dspfunc(avg_h264_qpel, 2, 4);
4259
4260 #undef dspfunc
4261     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4262     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4263     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4264     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4265     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4266     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4267
4268     c->draw_edges = draw_edges_c;
4269
4270 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4271     ff_mlp_init(c, avctx);
4272 #endif
4273 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4274     ff_intrax8dsp_init(c,avctx);
4275 #endif
4276 #if CONFIG_RV30_DECODER
4277     ff_rv30dsp_init(c,avctx);
4278 #endif
4279 #if CONFIG_RV40_DECODER
4280     ff_rv40dsp_init(c,avctx);
4281     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4282     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4283     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4284     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4285 #endif
4286
4287     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
4288     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4289     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4290     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4291     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4292     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4293     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4294     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4295
4296 #define SET_CMP_FUNC(name) \
4297     c->name[0]= name ## 16_c;\
4298     c->name[1]= name ## 8x8_c;
4299
4300     SET_CMP_FUNC(hadamard8_diff)
4301     c->hadamard8_diff[4]= hadamard8_intra16_c;
4302     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4303     SET_CMP_FUNC(dct_sad)
4304     SET_CMP_FUNC(dct_max)
4305 #if CONFIG_GPL
4306     SET_CMP_FUNC(dct264_sad)
4307 #endif
4308     c->sad[0]= pix_abs16_c;
4309     c->sad[1]= pix_abs8_c;
4310     c->sse[0]= sse16_c;
4311     c->sse[1]= sse8_c;
4312     c->sse[2]= sse4_c;
4313     SET_CMP_FUNC(quant_psnr)
4314     SET_CMP_FUNC(rd)
4315     SET_CMP_FUNC(bit)
4316     c->vsad[0]= vsad16_c;
4317     c->vsad[4]= vsad_intra16_c;
4318     c->vsad[5]= vsad_intra8_c;
4319     c->vsse[0]= vsse16_c;
4320     c->vsse[4]= vsse_intra16_c;
4321     c->vsse[5]= vsse_intra8_c;
4322     c->nsse[0]= nsse16_c;
4323     c->nsse[1]= nsse8_c;
4324 #if CONFIG_DWT
4325     ff_dsputil_init_dwt(c);
4326 #endif
4327
4328     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4329
4330     c->add_bytes= add_bytes_c;
4331     c->add_bytes_l2= add_bytes_l2_c;
4332     c->diff_bytes= diff_bytes_c;
4333     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4334     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4335     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4336     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4337     c->bswap_buf= bswap_buf;
4338     c->bswap16_buf = bswap16_buf;
4339 #if CONFIG_PNG_DECODER
4340     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4341 #endif
4342
4343     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4344         c->h263_h_loop_filter= h263_h_loop_filter_c;
4345         c->h263_v_loop_filter= h263_v_loop_filter_c;
4346     }
4347
4348     if (CONFIG_VP3_DECODER) {
4349         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4350         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4351         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4352     }
4353
4354     c->h261_loop_filter= h261_loop_filter_c;
4355
4356     c->try_8x8basis= try_8x8basis_c;
4357     c->add_8x8basis= add_8x8basis_c;
4358
4359 #if CONFIG_VORBIS_DECODER
4360     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4361 #endif
4362 #if CONFIG_AC3_DECODER
4363     c->ac3_downmix = ff_ac3_downmix_c;
4364 #endif
4365     c->vector_fmul = vector_fmul_c;
4366     c->vector_fmul_reverse = vector_fmul_reverse_c;
4367     c->vector_fmul_add = vector_fmul_add_c;
4368     c->vector_fmul_window = vector_fmul_window_c;
4369     c->vector_clipf = vector_clipf_c;
4370     c->scalarproduct_int16 = scalarproduct_int16_c;
4371     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4372     c->scalarproduct_float = scalarproduct_float_c;
4373     c->butterflies_float = butterflies_float_c;
4374     c->vector_fmul_scalar = vector_fmul_scalar_c;
4375
4376     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4377     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4378
4379     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4380     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4381
4382     c->shrink[0]= av_image_copy_plane;
4383     c->shrink[1]= ff_shrink22;
4384     c->shrink[2]= ff_shrink44;
4385     c->shrink[3]= ff_shrink88;
4386
4387     c->prefetch= just_return;
4388
4389     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4390     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4391
4392     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4393     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4394     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4395     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4396     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4397     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4398     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4399     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4400     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4401
4402     for(i=0; i<64; i++){
4403         if(!c->put_2tap_qpel_pixels_tab[0][i])
4404             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4405         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4406             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4407     }
4408
4409     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4410     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4411     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4412     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4413
4414     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4415     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4416     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4417     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4418
4419     switch(c->idct_permutation_type){
4420     case FF_NO_IDCT_PERM:
4421         for(i=0; i<64; i++)
4422             c->idct_permutation[i]= i;
4423         break;
4424     case FF_LIBMPEG2_IDCT_PERM:
4425         for(i=0; i<64; i++)
4426             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4427         break;
4428     case FF_SIMPLE_IDCT_PERM:
4429         for(i=0; i<64; i++)
4430             c->idct_permutation[i]= simple_mmx_permutation[i];
4431         break;
4432     case FF_TRANSPOSE_IDCT_PERM:
4433         for(i=0; i<64; i++)
4434             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4435         break;
4436     case FF_PARTTRANS_IDCT_PERM:
4437         for(i=0; i<64; i++)
4438             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4439         break;
4440     case FF_SSE2_IDCT_PERM:
4441         for(i=0; i<64; i++)
4442             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4443         break;
4444     default:
4445         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4446     }
4447 }
4448