git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavutil/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "png.h"
  42
/*
 * Lookup table used by the *_clamped_c routines in this file. They read it
 * through a pointer biased by MAX_NEG_CROP (cm = ff_cropTbl + MAX_NEG_CROP),
 * so indices from -MAX_NEG_CROP up to 255 + MAX_NEG_CROP stay inside the array.
 * Only zero-initialised here; NOTE(review): presumably filled with the
 * saturating 0..255 mapping by init code outside this chunk - confirm.
 */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/*
 * Lookup table read through ff_squareTbl + 256 by pix_norm1_c and the sse*_c
 * functions, indexed by a pixel value or by a difference of two pixels
 * (i.e. -255..255). Only zero-initialised here; NOTE(review): presumably
 * filled with squares by init code outside this chunk - confirm.
 */
uint32_t ff_squareTbl[512] = {0, };
  45
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// ~0UL/255 has the value 0x01 in every byte of an unsigned long; multiplying it
// by a byte value replicates that byte into every byte position.
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
  49
  50 const uint8_t ff_zigzag_direct[64] = {
  51     0,   1,  8, 16,  9,  2,  3, 10,
  52     17, 24, 32, 25, 18, 11,  4,  5,
  53     12, 19, 26, 33, 40, 48, 41, 34,
  54     27, 20, 13,  6,  7, 14, 21, 28,
  55     35, 42, 49, 56, 57, 50, 43, 36,
  56     29, 22, 15, 23, 30, 37, 44, 51,
  57     58, 59, 52, 45, 38, 31, 39, 46,
  58     53, 60, 61, 54, 47, 55, 62, 63
  59 };
  60
  61 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  62    specification, we interleave the fields */
  63 const uint8_t ff_zigzag248_direct[64] = {
  64      0,  8,  1,  9, 16, 24,  2, 10,
  65     17, 25, 32, 40, 48, 56, 33, 41,
  66     18, 26,  3, 11,  4, 12, 19, 27,
  67     34, 42, 49, 57, 50, 58, 35, 43,
  68     20, 28,  5, 13,  6, 14, 21, 29,
  69     36, 44, 51, 59, 52, 60, 37, 45,
  70     22, 30,  7, 15, 23, 31, 38, 46,
  71     53, 61, 54, 62, 39, 47, 55, 63,
  72 };
  73
/* Inverse of ff_zigzag_direct, not permuted, with 1 added to every entry;
 * used by the MMX quantizer. Declared with 16-byte alignment and no
 * initialiser; NOTE(review): the entries are presumably filled in by init
 * code outside this chunk - confirm. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  76
  77 const uint8_t ff_alternate_horizontal_scan[64] = {
  78     0,  1,   2,  3,  8,  9, 16, 17,
  79     10, 11,  4,  5,  6,  7, 15, 14,
  80     13, 12, 19, 18, 24, 25, 32, 33,
  81     26, 27, 20, 21, 22, 23, 28, 29,
  82     30, 31, 34, 35, 40, 41, 48, 49,
  83     42, 43, 36, 37, 38, 39, 44, 45,
  84     46, 47, 50, 51, 56, 57, 58, 59,
  85     52, 53, 54, 55, 60, 61, 62, 63,
  86 };
  87
  88 const uint8_t ff_alternate_vertical_scan[64] = {
  89     0,  8,  16, 24,  1,  9,  2, 10,
  90     17, 25, 32, 40, 48, 56, 57, 49,
  91     41, 33, 26, 18,  3, 11,  4, 12,
  92     19, 27, 34, 42, 50, 58, 35, 43,
  93     51, 59, 20, 28,  5, 13,  6, 14,
  94     21, 29, 36, 44, 52, 60, 37, 45,
  95     53, 61, 22, 30,  7, 15, 23, 31,
  96     38, 46, 54, 62, 39, 47, 55, 63,
  97 };
  98
  99 /* Input permutation for the simple_idct_mmx */
 100 static const uint8_t simple_mmx_permutation[64]={
 101         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 102         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 103         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 104         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 105         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 106         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 107         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 108         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 109 };
 110
 111 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 112
 113 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 114     int i;
 115     int end;
 116
 117     st->scantable= src_scantable;
 118
 119     for(i=0; i<64; i++){
 120         int j;
 121         j = src_scantable[i];
 122         st->permutated[i] = permutation[j];
 123 #if ARCH_PPC
 124         st->inverse[j] = i;
 125 #endif
 126     }
 127
 128     end=-1;
 129     for(i=0; i<64; i++){
 130         int j;
 131         j = st->permutated[i];
 132         if(j>end) end=j;
 133         st->raster_end[i]= end;
 134     }
 135 }
 136
 137 static int pix_sum_c(uint8_t * pix, int line_size)
 138 {
 139     int s, i, j;
 140
 141     s = 0;
 142     for (i = 0; i < 16; i++) {
 143         for (j = 0; j < 16; j += 8) {
 144             s += pix[0];
 145             s += pix[1];
 146             s += pix[2];
 147             s += pix[3];
 148             s += pix[4];
 149             s += pix[5];
 150             s += pix[6];
 151             s += pix[7];
 152             pix += 8;
 153         }
 154         pix += line_size - 16;
 155     }
 156     return s;
 157 }
 158
 159 static int pix_norm1_c(uint8_t * pix, int line_size)
 160 {
 161     int s, i, j;
 162     uint32_t *sq = ff_squareTbl + 256;
 163
 164     s = 0;
 165     for (i = 0; i < 16; i++) {
 166         for (j = 0; j < 16; j += 8) {
 167 #if 0
 168             s += sq[pix[0]];
 169             s += sq[pix[1]];
 170             s += sq[pix[2]];
 171             s += sq[pix[3]];
 172             s += sq[pix[4]];
 173             s += sq[pix[5]];
 174             s += sq[pix[6]];
 175             s += sq[pix[7]];
 176 #else
 177 #if LONG_MAX > 2147483647
 178             register uint64_t x=*(uint64_t*)pix;
 179             s += sq[x&0xff];
 180             s += sq[(x>>8)&0xff];
 181             s += sq[(x>>16)&0xff];
 182             s += sq[(x>>24)&0xff];
 183             s += sq[(x>>32)&0xff];
 184             s += sq[(x>>40)&0xff];
 185             s += sq[(x>>48)&0xff];
 186             s += sq[(x>>56)&0xff];
 187 #else
 188             register uint32_t x=*(uint32_t*)pix;
 189             s += sq[x&0xff];
 190             s += sq[(x>>8)&0xff];
 191             s += sq[(x>>16)&0xff];
 192             s += sq[(x>>24)&0xff];
 193             x=*(uint32_t*)(pix+4);
 194             s += sq[x&0xff];
 195             s += sq[(x>>8)&0xff];
 196             s += sq[(x>>16)&0xff];
 197             s += sq[(x>>24)&0xff];
 198 #endif
 199 #endif
 200             pix += 8;
 201         }
 202         pix += line_size - 16;
 203     }
 204     return s;
 205 }
 206
 207 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 208     int i;
 209
 210     for(i=0; i+8<=w; i+=8){
 211         dst[i+0]= av_bswap32(src[i+0]);
 212         dst[i+1]= av_bswap32(src[i+1]);
 213         dst[i+2]= av_bswap32(src[i+2]);
 214         dst[i+3]= av_bswap32(src[i+3]);
 215         dst[i+4]= av_bswap32(src[i+4]);
 216         dst[i+5]= av_bswap32(src[i+5]);
 217         dst[i+6]= av_bswap32(src[i+6]);
 218         dst[i+7]= av_bswap32(src[i+7]);
 219     }
 220     for(;i<w; i++){
 221         dst[i+0]= av_bswap32(src[i+0]);
 222     }
 223 }
 224
 225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 226 {
 227     int s, i;
 228     uint32_t *sq = ff_squareTbl + 256;
 229
 230     s = 0;
 231     for (i = 0; i < h; i++) {
 232         s += sq[pix1[0] - pix2[0]];
 233         s += sq[pix1[1] - pix2[1]];
 234         s += sq[pix1[2] - pix2[2]];
 235         s += sq[pix1[3] - pix2[3]];
 236         pix1 += line_size;
 237         pix2 += line_size;
 238     }
 239     return s;
 240 }
 241
 242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 243 {
 244     int s, i;
 245     uint32_t *sq = ff_squareTbl + 256;
 246
 247     s = 0;
 248     for (i = 0; i < h; i++) {
 249         s += sq[pix1[0] - pix2[0]];
 250         s += sq[pix1[1] - pix2[1]];
 251         s += sq[pix1[2] - pix2[2]];
 252         s += sq[pix1[3] - pix2[3]];
 253         s += sq[pix1[4] - pix2[4]];
 254         s += sq[pix1[5] - pix2[5]];
 255         s += sq[pix1[6] - pix2[6]];
 256         s += sq[pix1[7] - pix2[7]];
 257         pix1 += line_size;
 258         pix2 += line_size;
 259     }
 260     return s;
 261 }
 262
 263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 264 {
 265     int s, i;
 266     uint32_t *sq = ff_squareTbl + 256;
 267
 268     s = 0;
 269     for (i = 0; i < h; i++) {
 270         s += sq[pix1[ 0] - pix2[ 0]];
 271         s += sq[pix1[ 1] - pix2[ 1]];
 272         s += sq[pix1[ 2] - pix2[ 2]];
 273         s += sq[pix1[ 3] - pix2[ 3]];
 274         s += sq[pix1[ 4] - pix2[ 4]];
 275         s += sq[pix1[ 5] - pix2[ 5]];
 276         s += sq[pix1[ 6] - pix2[ 6]];
 277         s += sq[pix1[ 7] - pix2[ 7]];
 278         s += sq[pix1[ 8] - pix2[ 8]];
 279         s += sq[pix1[ 9] - pix2[ 9]];
 280         s += sq[pix1[10] - pix2[10]];
 281         s += sq[pix1[11] - pix2[11]];
 282         s += sq[pix1[12] - pix2[12]];
 283         s += sq[pix1[13] - pix2[13]];
 284         s += sq[pix1[14] - pix2[14]];
 285         s += sq[pix1[15] - pix2[15]];
 286
 287         pix1 += line_size;
 288         pix2 += line_size;
 289     }
 290     return s;
 291 }
 292
 293 /* draw the edges of width 'w' of an image of size width, height */
 294 //FIXME check that this is ok for mpeg4 interlaced
 295 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 296 {
 297     uint8_t *ptr, *last_line;
 298     int i;
 299
 300     last_line = buf + (height - 1) * wrap;
 301     for(i=0;i<w;i++) {
 302         /* top and bottom */
 303         memcpy(buf - (i + 1) * wrap, buf, width);
 304         memcpy(last_line + (i + 1) * wrap, last_line, width);
 305     }
 306     /* left and right */
 307     ptr = buf;
 308     for(i=0;i<height;i++) {
 309         memset(ptr - w, ptr[0], w);
 310         memset(ptr + width, ptr[width-1], w);
 311         ptr += wrap;
 312     }
 313     /* corners */
 314     for(i=0;i<w;i++) {
 315         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 316         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 317         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 318         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 319     }
 320 }
 321
 322 /**
 323  * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 324  * @param buf destination buffer
 325  * @param src source buffer
 326  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 327  * @param block_w width of block
 328  * @param block_h height of block
 329  * @param src_x x coordinate of the top left sample of the block in the source buffer
 330  * @param src_y y coordinate of the top left sample of the block in the source buffer
 331  * @param w width of the source buffer
 332  * @param h height of the source buffer
 333  */
 334 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
 335                                     int src_x, int src_y, int w, int h){
 336     int x, y;
 337     int start_y, start_x, end_y, end_x;
 338
 339     if(src_y>= h){
 340         src+= (h-1-src_y)*linesize;
 341         src_y=h-1;
 342     }else if(src_y<=-block_h){
 343         src+= (1-block_h-src_y)*linesize;
 344         src_y=1-block_h;
 345     }
 346     if(src_x>= w){
 347         src+= (w-1-src_x);
 348         src_x=w-1;
 349     }else if(src_x<=-block_w){
 350         src+= (1-block_w-src_x);
 351         src_x=1-block_w;
 352     }
 353
 354     start_y= FFMAX(0, -src_y);
 355     start_x= FFMAX(0, -src_x);
 356     end_y= FFMIN(block_h, h-src_y);
 357     end_x= FFMIN(block_w, w-src_x);
 358     assert(start_y < end_y && block_h);
 359     assert(start_x < end_x && block_w);
 360
 361     w    = end_x - start_x;
 362     src += start_y*linesize + start_x;
 363     buf += start_x;
 364
 365     //top
 366     for(y=0; y<start_y; y++){
 367         memcpy(buf, src, w);
 368         buf += linesize;
 369     }
 370
 371     // copy existing part
 372     for(; y<end_y; y++){
 373         memcpy(buf, src, w);
 374         src += linesize;
 375         buf += linesize;
 376     }
 377
 378     //bottom
 379     src -= linesize;
 380     for(; y<block_h; y++){
 381         memcpy(buf, src, w);
 382         buf += linesize;
 383     }
 384
 385     buf -= block_h * linesize + start_x;
 386     while (block_h--){
 387        //left
 388         for(x=0; x<start_x; x++){
 389             buf[x] = buf[start_x];
 390         }
 391
 392        //right
 393         for(x=end_x; x<block_w; x++){
 394             buf[x] = buf[end_x - 1];
 395         }
 396         buf += linesize;
 397     }
 398 }
 399
 400 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 401 {
 402     int i;
 403
 404     /* read the pixels */
 405     for(i=0;i<8;i++) {
 406         block[0] = pixels[0];
 407         block[1] = pixels[1];
 408         block[2] = pixels[2];
 409         block[3] = pixels[3];
 410         block[4] = pixels[4];
 411         block[5] = pixels[5];
 412         block[6] = pixels[6];
 413         block[7] = pixels[7];
 414         pixels += line_size;
 415         block += 8;
 416     }
 417 }
 418
 419 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 420                           const uint8_t *s2, int stride){
 421     int i;
 422
 423     /* read the pixels */
 424     for(i=0;i<8;i++) {
 425         block[0] = s1[0] - s2[0];
 426         block[1] = s1[1] - s2[1];
 427         block[2] = s1[2] - s2[2];
 428         block[3] = s1[3] - s2[3];
 429         block[4] = s1[4] - s2[4];
 430         block[5] = s1[5] - s2[5];
 431         block[6] = s1[6] - s2[6];
 432         block[7] = s1[7] - s2[7];
 433         s1 += stride;
 434         s2 += stride;
 435         block += 8;
 436     }
 437 }
 438
 439
 440 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 441                                  int line_size)
 442 {
 443     int i;
 444     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 445
 446     /* read the pixels */
 447     for(i=0;i<8;i++) {
 448         pixels[0] = cm[block[0]];
 449         pixels[1] = cm[block[1]];
 450         pixels[2] = cm[block[2]];
 451         pixels[3] = cm[block[3]];
 452         pixels[4] = cm[block[4]];
 453         pixels[5] = cm[block[5]];
 454         pixels[6] = cm[block[6]];
 455         pixels[7] = cm[block[7]];
 456
 457         pixels += line_size;
 458         block += 8;
 459     }
 460 }
 461
 462 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 463                                  int line_size)
 464 {
 465     int i;
 466     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 467
 468     /* read the pixels */
 469     for(i=0;i<4;i++) {
 470         pixels[0] = cm[block[0]];
 471         pixels[1] = cm[block[1]];
 472         pixels[2] = cm[block[2]];
 473         pixels[3] = cm[block[3]];
 474
 475         pixels += line_size;
 476         block += 8;
 477     }
 478 }
 479
 480 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 481                                  int line_size)
 482 {
 483     int i;
 484     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 485
 486     /* read the pixels */
 487     for(i=0;i<2;i++) {
 488         pixels[0] = cm[block[0]];
 489         pixels[1] = cm[block[1]];
 490
 491         pixels += line_size;
 492         block += 8;
 493     }
 494 }
 495
 496 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 497                                         uint8_t *restrict pixels,
 498                                         int line_size)
 499 {
 500     int i, j;
 501
 502     for (i = 0; i < 8; i++) {
 503         for (j = 0; j < 8; j++) {
 504             if (*block < -128)
 505                 *pixels = 0;
 506             else if (*block > 127)
 507                 *pixels = 255;
 508             else
 509                 *pixels = (uint8_t)(*block + 128);
 510             block++;
 511             pixels++;
 512         }
 513         pixels += (line_size - 8);
 514     }
 515 }
 516
 517 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 518                                     int line_size)
 519 {
 520     int i;
 521
 522     /* read the pixels */
 523     for(i=0;i<8;i++) {
 524         pixels[0] = block[0];
 525         pixels[1] = block[1];
 526         pixels[2] = block[2];
 527         pixels[3] = block[3];
 528         pixels[4] = block[4];
 529         pixels[5] = block[5];
 530         pixels[6] = block[6];
 531         pixels[7] = block[7];
 532
 533         pixels += line_size;
 534         block += 8;
 535     }
 536 }
 537
 538 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 539                           int line_size)
 540 {
 541     int i;
 542     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 543
 544     /* read the pixels */
 545     for(i=0;i<8;i++) {
 546         pixels[0] = cm[pixels[0] + block[0]];
 547         pixels[1] = cm[pixels[1] + block[1]];
 548         pixels[2] = cm[pixels[2] + block[2]];
 549         pixels[3] = cm[pixels[3] + block[3]];
 550         pixels[4] = cm[pixels[4] + block[4]];
 551         pixels[5] = cm[pixels[5] + block[5]];
 552         pixels[6] = cm[pixels[6] + block[6]];
 553         pixels[7] = cm[pixels[7] + block[7]];
 554         pixels += line_size;
 555         block += 8;
 556     }
 557 }
 558
 559 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 560                           int line_size)
 561 {
 562     int i;
 563     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 564
 565     /* read the pixels */
 566     for(i=0;i<4;i++) {
 567         pixels[0] = cm[pixels[0] + block[0]];
 568         pixels[1] = cm[pixels[1] + block[1]];
 569         pixels[2] = cm[pixels[2] + block[2]];
 570         pixels[3] = cm[pixels[3] + block[3]];
 571         pixels += line_size;
 572         block += 8;
 573     }
 574 }
 575
 576 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 577                           int line_size)
 578 {
 579     int i;
 580     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 581
 582     /* read the pixels */
 583     for(i=0;i<2;i++) {
 584         pixels[0] = cm[pixels[0] + block[0]];
 585         pixels[1] = cm[pixels[1] + block[1]];
 586         pixels += line_size;
 587         block += 8;
 588     }
 589 }
 590
 591 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 592 {
 593     int i;
 594     for(i=0;i<8;i++) {
 595         pixels[0] += block[0];
 596         pixels[1] += block[1];
 597         pixels[2] += block[2];
 598         pixels[3] += block[3];
 599         pixels[4] += block[4];
 600         pixels[5] += block[5];
 601         pixels[6] += block[6];
 602         pixels[7] += block[7];
 603         pixels += line_size;
 604         block += 8;
 605     }
 606 }
 607
 608 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 609 {
 610     int i;
 611     for(i=0;i<4;i++) {
 612         pixels[0] += block[0];
 613         pixels[1] += block[1];
 614         pixels[2] += block[2];
 615         pixels[3] += block[3];
 616         pixels += line_size;
 617         block += 4;
 618     }
 619 }
 620
 621 static int sum_abs_dctelem_c(DCTELEM *block)
 622 {
 623     int sum=0, i;
 624     for(i=0; i<64; i++)
 625         sum+= FFABS(block[i]);
 626     return sum;
 627 }
 628
 629 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 630 {
 631     int i;
 632
 633     for (i = 0; i < h; i++) {
 634         memset(block, value, 16);
 635         block += line_size;
 636     }
 637 }
 638
 639 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 640 {
 641     int i;
 642
 643     for (i = 0; i < h; i++) {
 644         memset(block, value, 8);
 645         block += line_size;
 646     }
 647 }
 648
 649 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 650 {
 651     int i, j;
 652     uint16_t *dst1 = (uint16_t *) dst;
 653     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 654
 655     for (j = 0; j < 8; j++) {
 656         for (i = 0; i < 8; i++) {
 657             dst1[i] = dst2[i] = src[i] * 0x0101;
 658         }
 659         src  += 8;
 660         dst1 += linesize;
 661         dst2 += linesize;
 662     }
 663 }
 664
 665 #if 0
 666
 667 #define PIXOP2(OPNAME, OP) \
 668 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 669 {\
 670     int i;\
 671     for(i=0; i<h; i++){\
 672         OP(*((uint64_t*)block), AV_RN64(pixels));\
 673         pixels+=line_size;\
 674         block +=line_size;\
 675     }\
 676 }\
 677 \
 678 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 679 {\
 680     int i;\
 681     for(i=0; i<h; i++){\
 682         const uint64_t a= AV_RN64(pixels  );\
 683         const uint64_t b= AV_RN64(pixels+1);\
 684         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 685         pixels+=line_size;\
 686         block +=line_size;\
 687     }\
 688 }\
 689 \
 690 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 691 {\
 692     int i;\
 693     for(i=0; i<h; i++){\
 694         const uint64_t a= AV_RN64(pixels  );\
 695         const uint64_t b= AV_RN64(pixels+1);\
 696         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 697         pixels+=line_size;\
 698         block +=line_size;\
 699     }\
 700 }\
 701 \
 702 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 703 {\
 704     int i;\
 705     for(i=0; i<h; i++){\
 706         const uint64_t a= AV_RN64(pixels          );\
 707         const uint64_t b= AV_RN64(pixels+line_size);\
 708         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 709         pixels+=line_size;\
 710         block +=line_size;\
 711     }\
 712 }\
 713 \
 714 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 715 {\
 716     int i;\
 717     for(i=0; i<h; i++){\
 718         const uint64_t a= AV_RN64(pixels          );\
 719         const uint64_t b= AV_RN64(pixels+line_size);\
 720         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 721         pixels+=line_size;\
 722         block +=line_size;\
 723     }\
 724 }\
 725 \
 726 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 727 {\
 728         int i;\
 729         const uint64_t a= AV_RN64(pixels  );\
 730         const uint64_t b= AV_RN64(pixels+1);\
 731         uint64_t l0=  (a&0x0303030303030303ULL)\
 732                     + (b&0x0303030303030303ULL)\
 733                     + 0x0202020202020202ULL;\
 734         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 735                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 736         uint64_t l1,h1;\
 737 \
 738         pixels+=line_size;\
 739         for(i=0; i<h; i+=2){\
 740             uint64_t a= AV_RN64(pixels  );\
 741             uint64_t b= AV_RN64(pixels+1);\
 742             l1=  (a&0x0303030303030303ULL)\
 743                + (b&0x0303030303030303ULL);\
 744             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 745               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 746             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 747             pixels+=line_size;\
 748             block +=line_size;\
 749             a= AV_RN64(pixels  );\
 750             b= AV_RN64(pixels+1);\
 751             l0=  (a&0x0303030303030303ULL)\
 752                + (b&0x0303030303030303ULL)\
 753                + 0x0202020202020202ULL;\
 754             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 755               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 756             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 757             pixels+=line_size;\
 758             block +=line_size;\
 759         }\
 760 }\
 761 \
 762 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 763 {\
 764         int i;\
 765         const uint64_t a= AV_RN64(pixels  );\
 766         const uint64_t b= AV_RN64(pixels+1);\
 767         uint64_t l0=  (a&0x0303030303030303ULL)\
 768                     + (b&0x0303030303030303ULL)\
 769                     + 0x0101010101010101ULL;\
 770         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 771                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 772         uint64_t l1,h1;\
 773 \
 774         pixels+=line_size;\
 775         for(i=0; i<h; i+=2){\
 776             uint64_t a= AV_RN64(pixels  );\
 777             uint64_t b= AV_RN64(pixels+1);\
 778             l1=  (a&0x0303030303030303ULL)\
 779                + (b&0x0303030303030303ULL);\
 780             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 781               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 782             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 783             pixels+=line_size;\
 784             block +=line_size;\
 785             a= AV_RN64(pixels  );\
 786             b= AV_RN64(pixels+1);\
 787             l0=  (a&0x0303030303030303ULL)\
 788                + (b&0x0303030303030303ULL)\
 789                + 0x0101010101010101ULL;\
 790             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 791               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 792             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 793             pixels+=line_size;\
 794             block +=line_size;\
 795         }\
 796 }\
 797 \
 798 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 799 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 800 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 801 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 802 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 803 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 804 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 805
 806 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 807 #else // 64 bit variant
 808
 809 #define PIXOP2(OPNAME, OP) \
 810 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 811     int i;\
 812     for(i=0; i<h; i++){\
 813         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 814         pixels+=line_size;\
 815         block +=line_size;\
 816     }\
 817 }\
 818 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 819     int i;\
 820     for(i=0; i<h; i++){\
 821         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 822         pixels+=line_size;\
 823         block +=line_size;\
 824     }\
 825 }\
 826 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 827     int i;\
 828     for(i=0; i<h; i++){\
 829         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 830         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 831         pixels+=line_size;\
 832         block +=line_size;\
 833     }\
 834 }\
 835 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 836     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 837 }\
 838 \
 839 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 840                                                 int src_stride1, int src_stride2, int h){\
 841     int i;\
 842     for(i=0; i<h; i++){\
 843         uint32_t a,b;\
 844         a= AV_RN32(&src1[i*src_stride1  ]);\
 845         b= AV_RN32(&src2[i*src_stride2  ]);\
 846         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 847         a= AV_RN32(&src1[i*src_stride1+4]);\
 848         b= AV_RN32(&src2[i*src_stride2+4]);\
 849         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 850     }\
 851 }\
 852 \
 853 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 854                                                 int src_stride1, int src_stride2, int h){\
 855     int i;\
 856     for(i=0; i<h; i++){\
 857         uint32_t a,b;\
 858         a= AV_RN32(&src1[i*src_stride1  ]);\
 859         b= AV_RN32(&src2[i*src_stride2  ]);\
 860         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 861         a= AV_RN32(&src1[i*src_stride1+4]);\
 862         b= AV_RN32(&src2[i*src_stride2+4]);\
 863         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 864     }\
 865 }\
 866 \
 867 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 868                                                 int src_stride1, int src_stride2, int h){\
 869     int i;\
 870     for(i=0; i<h; i++){\
 871         uint32_t a,b;\
 872         a= AV_RN32(&src1[i*src_stride1  ]);\
 873         b= AV_RN32(&src2[i*src_stride2  ]);\
 874         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 875     }\
 876 }\
 877 \
 878 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 879                                                 int src_stride1, int src_stride2, int h){\
 880     int i;\
 881     for(i=0; i<h; i++){\
 882         uint32_t a,b;\
 883         a= AV_RN16(&src1[i*src_stride1  ]);\
 884         b= AV_RN16(&src2[i*src_stride2  ]);\
 885         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 886     }\
 887 }\
 888 \
 889 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 890                                                 int src_stride1, int src_stride2, int h){\
 891     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 892     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 893 }\
 894 \
 895 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 896                                                 int src_stride1, int src_stride2, int h){\
 897     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 898     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 899 }\
 900 \
 901 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 902     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 903 }\
 904 \
 905 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 906     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 907 }\
 908 \
 909 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 910     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 911 }\
 912 \
 913 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 914     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 915 }\
 916 \
 917 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 918                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 919     int i;\
 920     for(i=0; i<h; i++){\
 921         uint32_t a, b, c, d, l0, l1, h0, h1;\
 922         a= AV_RN32(&src1[i*src_stride1]);\
 923         b= AV_RN32(&src2[i*src_stride2]);\
 924         c= AV_RN32(&src3[i*src_stride3]);\
 925         d= AV_RN32(&src4[i*src_stride4]);\
 926         l0=  (a&0x03030303UL)\
 927            + (b&0x03030303UL)\
 928            + 0x02020202UL;\
 929         h0= ((a&0xFCFCFCFCUL)>>2)\
 930           + ((b&0xFCFCFCFCUL)>>2);\
 931         l1=  (c&0x03030303UL)\
 932            + (d&0x03030303UL);\
 933         h1= ((c&0xFCFCFCFCUL)>>2)\
 934           + ((d&0xFCFCFCFCUL)>>2);\
 935         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 936         a= AV_RN32(&src1[i*src_stride1+4]);\
 937         b= AV_RN32(&src2[i*src_stride2+4]);\
 938         c= AV_RN32(&src3[i*src_stride3+4]);\
 939         d= AV_RN32(&src4[i*src_stride4+4]);\
 940         l0=  (a&0x03030303UL)\
 941            + (b&0x03030303UL)\
 942            + 0x02020202UL;\
 943         h0= ((a&0xFCFCFCFCUL)>>2)\
 944           + ((b&0xFCFCFCFCUL)>>2);\
 945         l1=  (c&0x03030303UL)\
 946            + (d&0x03030303UL);\
 947         h1= ((c&0xFCFCFCFCUL)>>2)\
 948           + ((d&0xFCFCFCFCUL)>>2);\
 949         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 950     }\
 951 }\
 952 \
 953 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 954     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 955 }\
 956 \
 957 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 958     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 959 }\
 960 \
 961 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 962     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 963 }\
 964 \
 965 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 966     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 967 }\
 968 \
 969 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 970                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 971     int i;\
 972     for(i=0; i<h; i++){\
 973         uint32_t a, b, c, d, l0, l1, h0, h1;\
 974         a= AV_RN32(&src1[i*src_stride1]);\
 975         b= AV_RN32(&src2[i*src_stride2]);\
 976         c= AV_RN32(&src3[i*src_stride3]);\
 977         d= AV_RN32(&src4[i*src_stride4]);\
 978         l0=  (a&0x03030303UL)\
 979            + (b&0x03030303UL)\
 980            + 0x01010101UL;\
 981         h0= ((a&0xFCFCFCFCUL)>>2)\
 982           + ((b&0xFCFCFCFCUL)>>2);\
 983         l1=  (c&0x03030303UL)\
 984            + (d&0x03030303UL);\
 985         h1= ((c&0xFCFCFCFCUL)>>2)\
 986           + ((d&0xFCFCFCFCUL)>>2);\
 987         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 988         a= AV_RN32(&src1[i*src_stride1+4]);\
 989         b= AV_RN32(&src2[i*src_stride2+4]);\
 990         c= AV_RN32(&src3[i*src_stride3+4]);\
 991         d= AV_RN32(&src4[i*src_stride4+4]);\
 992         l0=  (a&0x03030303UL)\
 993            + (b&0x03030303UL)\
 994            + 0x01010101UL;\
 995         h0= ((a&0xFCFCFCFCUL)>>2)\
 996           + ((b&0xFCFCFCFCUL)>>2);\
 997         l1=  (c&0x03030303UL)\
 998            + (d&0x03030303UL);\
 999         h1= ((c&0xFCFCFCFCUL)>>2)\
1000           + ((d&0xFCFCFCFCUL)>>2);\
1001         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1002     }\
1003 }\
1004 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1005                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1006     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1007     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1008 }\
1009 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1010                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1011     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1012     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1013 }\
1014 \
1015 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1016 {\
1017         int i, a0, b0, a1, b1;\
1018         a0= pixels[0];\
1019         b0= pixels[1] + 2;\
1020         a0 += b0;\
1021         b0 += pixels[2];\
1022 \
1023         pixels+=line_size;\
1024         for(i=0; i<h; i+=2){\
1025             a1= pixels[0];\
1026             b1= pixels[1];\
1027             a1 += b1;\
1028             b1 += pixels[2];\
1029 \
1030             block[0]= (a1+a0)>>2; /* FIXME non put */\
1031             block[1]= (b1+b0)>>2;\
1032 \
1033             pixels+=line_size;\
1034             block +=line_size;\
1035 \
1036             a0= pixels[0];\
1037             b0= pixels[1] + 2;\
1038             a0 += b0;\
1039             b0 += pixels[2];\
1040 \
1041             block[0]= (a1+a0)>>2;\
1042             block[1]= (b1+b0)>>2;\
1043             pixels+=line_size;\
1044             block +=line_size;\
1045         }\
1046 }\
1047 \
1048 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1049 {\
1050         int i;\
1051         const uint32_t a= AV_RN32(pixels  );\
1052         const uint32_t b= AV_RN32(pixels+1);\
1053         uint32_t l0=  (a&0x03030303UL)\
1054                     + (b&0x03030303UL)\
1055                     + 0x02020202UL;\
1056         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1057                    + ((b&0xFCFCFCFCUL)>>2);\
1058         uint32_t l1,h1;\
1059 \
1060         pixels+=line_size;\
1061         for(i=0; i<h; i+=2){\
1062             uint32_t a= AV_RN32(pixels  );\
1063             uint32_t b= AV_RN32(pixels+1);\
1064             l1=  (a&0x03030303UL)\
1065                + (b&0x03030303UL);\
1066             h1= ((a&0xFCFCFCFCUL)>>2)\
1067               + ((b&0xFCFCFCFCUL)>>2);\
1068             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1069             pixels+=line_size;\
1070             block +=line_size;\
1071             a= AV_RN32(pixels  );\
1072             b= AV_RN32(pixels+1);\
1073             l0=  (a&0x03030303UL)\
1074                + (b&0x03030303UL)\
1075                + 0x02020202UL;\
1076             h0= ((a&0xFCFCFCFCUL)>>2)\
1077               + ((b&0xFCFCFCFCUL)>>2);\
1078             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1079             pixels+=line_size;\
1080             block +=line_size;\
1081         }\
1082 }\
1083 \
1084 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1085 {\
1086     int j;\
1087     for(j=0; j<2; j++){\
1088         int i;\
1089         const uint32_t a= AV_RN32(pixels  );\
1090         const uint32_t b= AV_RN32(pixels+1);\
1091         uint32_t l0=  (a&0x03030303UL)\
1092                     + (b&0x03030303UL)\
1093                     + 0x02020202UL;\
1094         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1095                    + ((b&0xFCFCFCFCUL)>>2);\
1096         uint32_t l1,h1;\
1097 \
1098         pixels+=line_size;\
1099         for(i=0; i<h; i+=2){\
1100             uint32_t a= AV_RN32(pixels  );\
1101             uint32_t b= AV_RN32(pixels+1);\
1102             l1=  (a&0x03030303UL)\
1103                + (b&0x03030303UL);\
1104             h1= ((a&0xFCFCFCFCUL)>>2)\
1105               + ((b&0xFCFCFCFCUL)>>2);\
1106             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1107             pixels+=line_size;\
1108             block +=line_size;\
1109             a= AV_RN32(pixels  );\
1110             b= AV_RN32(pixels+1);\
1111             l0=  (a&0x03030303UL)\
1112                + (b&0x03030303UL)\
1113                + 0x02020202UL;\
1114             h0= ((a&0xFCFCFCFCUL)>>2)\
1115               + ((b&0xFCFCFCFCUL)>>2);\
1116             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1117             pixels+=line_size;\
1118             block +=line_size;\
1119         }\
1120         pixels+=4-line_size*(h+1);\
1121         block +=4-line_size*h;\
1122     }\
1123 }\
1124 \
1125 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1126 {\
1127     int j;\
1128     for(j=0; j<2; j++){\
1129         int i;\
1130         const uint32_t a= AV_RN32(pixels  );\
1131         const uint32_t b= AV_RN32(pixels+1);\
1132         uint32_t l0=  (a&0x03030303UL)\
1133                     + (b&0x03030303UL)\
1134                     + 0x01010101UL;\
1135         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1136                    + ((b&0xFCFCFCFCUL)>>2);\
1137         uint32_t l1,h1;\
1138 \
1139         pixels+=line_size;\
1140         for(i=0; i<h; i+=2){\
1141             uint32_t a= AV_RN32(pixels  );\
1142             uint32_t b= AV_RN32(pixels+1);\
1143             l1=  (a&0x03030303UL)\
1144                + (b&0x03030303UL);\
1145             h1= ((a&0xFCFCFCFCUL)>>2)\
1146               + ((b&0xFCFCFCFCUL)>>2);\
1147             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1148             pixels+=line_size;\
1149             block +=line_size;\
1150             a= AV_RN32(pixels  );\
1151             b= AV_RN32(pixels+1);\
1152             l0=  (a&0x03030303UL)\
1153                + (b&0x03030303UL)\
1154                + 0x01010101UL;\
1155             h0= ((a&0xFCFCFCFCUL)>>2)\
1156               + ((b&0xFCFCFCFCUL)>>2);\
1157             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1158             pixels+=line_size;\
1159             block +=line_size;\
1160         }\
1161         pixels+=4-line_size*(h+1);\
1162         block +=4-line_size*h;\
1163     }\
1164 }\
1165 \
1166 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1167 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1168 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1169 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1170 av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1171 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1172 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1173 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1174
/* Store operation handed to PIXOP2(avg, ...) below: the destination is
 * combined with the new value through rnd_avg32(). */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
/* Store operation handed to PIXOP2(put, ...) below: plain assignment. */
#define op_put(a, b) a = b

/* Expand the PIXOP2 template once per store operation.  The first argument
 * is pasted in front of every generated function name
 * (e.g. avg_pixels16_l2, put_pixels8_xy2_c). */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* From this point on these two names expand to the put_pixels functions. */
#define put_no_rnd_pixels8_c  put_pixels8_c
#define put_no_rnd_pixels16_c put_pixels16_c
1186
/* Average of two values with upward rounding: (a + b + 1) >> 1.
 * Every argument is parenthesized so that an argument built from operators
 * binding weaker than '+' (such as &, |, ?:) is summed as a whole. */
#define avg2(a,b) (((a)+(b)+1)>>1)
/* Average of four values with rounding: (a + b + c + d + 2) >> 2. */
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1189
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the same stride is
 * used for the destination and for both sources.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride,  /* dst_stride  */
                           stride,  /* src_stride1 */
                           stride,  /* src_stride2 */
                           h);
}
1193
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the same stride is
 * used for the destination and for both sources.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride,  /* dst_stride  */
                          stride,  /* src_stride1 */
                          stride,  /* src_stride2 */
                          h);
}
1197
/**
 * Bilinear blend of an 8-pixel-wide block.
 *
 * Each output pixel is a weighted sum of the source pixel, its right
 * neighbour and the two pixels one line below them.  The four weights are
 * products of (16 - x16 | x16) and (16 - y16 | y16); the sum has rounder
 * added and is shifted right by 8.
 *
 * @param dst     destination, 8 pixels written per line
 * @param src     source; 9 pixels on each of h+1 lines are read
 * @param stride  line distance in bytes, shared by dst and src
 * @param h       number of lines to produce
 * @param x16     horizontal weight of the right-hand neighbours
 * @param y16     vertical weight of the line below
 * @param rounder constant added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16);   /* top-left     */
    const int w_tr = (     x16) * (16 - y16);   /* top-right    */
    const int w_bl = (16 - x16) * (     y16);   /* bottom-left  */
    const int w_br = (     x16) * (     y16);   /* bottom-right */
    int line;

    for (line = 0; line < h; line++, dst += stride, src += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]   + w_tr * src[col + 1] +
                        w_bl * below[col] + w_br * below[col + 1] + rounder) >> 8;
        }
    }
}
1220
/**
 * Fill an 8-pixel-wide, h-line block by sampling src along a linearly
 * stepped position.
 *
 * The sampling position starts at (ox, oy) for the first output pixel, moves
 * by (dxx, dyx) per output pixel and by (dxy, dyy) per output line.  The
 * position is shifted right by 16; of the result, the low `shift` bits are
 * the interpolation fraction and the remaining bits the integer pixel
 * coordinate.
 *
 * A coordinate outside [0, width-1) / [0, height-1) is passed through
 * av_clip() and interpolation along that axis is dropped.
 *
 * @param dst    destination block
 * @param src    source plane
 * @param stride line distance in bytes, shared by dst and src
 * @param h      number of output lines
 * @param shift  number of fractional bits; results are shifted by 2*shift
 * @param r      constant added before the final shift
 * @param width  source width  (width-1  is the highest usable x)
 * @param height source height (height-1 is the highest usable y)
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s = 1<<shift;
    int y;

    width--;
    height--;

    for (y = 0; y < h; y++, ox += dxy, oy += dyy) {
        uint8_t *out = dst + y*stride;
        int vx = ox;
        int vy = oy;
        int x;

        for (x = 0; x < 8; x++, vx += dxx, vy += dyx) { //XXX FIXME optimize
            int src_x = vx>>16;
            int src_y = vy>>16;
            const int frac_x = src_x&(s-1);
            const int frac_y = src_y&(s-1);
            int index;

            src_x >>= shift;
            src_y >>= shift;

            if ((unsigned)src_x < width && (unsigned)src_y < height) {
                /* fully inside: interpolate in both directions */
                int upper, lower;

                index = src_x + src_y*stride;
                upper = src[index         ]*(s-frac_x)
                      + src[index       +1]*   frac_x;
                lower = src[index+stride  ]*(s-frac_x)
                      + src[index+stride+1]*   frac_x;
                out[x] = (upper*(s-frac_y) + lower*frac_y + r)>>(shift*2);
            } else if ((unsigned)src_x < width) {
                /* only x inside: horizontal interpolation on a clipped line */
                int upper;

                index = src_x + av_clip(src_y, 0, height)*stride;
                upper = src[index         ]*(s-frac_x)
                      + src[index       +1]*   frac_x;
                out[x] = (upper*s + r)>>(shift*2);
            } else if ((unsigned)src_y < height) {
                /* only y inside: vertical interpolation on a clipped column */
                int left;

                index = av_clip(src_x, 0, width) + src_y*stride;
                left  = src[index         ]*(s-frac_y)
                      + src[index+stride  ]*   frac_y;
                out[x] = (left*s + r)>>(shift*2);
            } else {
                /* both outside: plain copy of the clipped sample */
                index = av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                out[x] = src[index];
            }
        }
    }
}
1278
/**
 * Dispatch to the put_pixels function matching the block width.
 * Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1287
/**
 * Horizontal blend weighted 2:1 towards the current pixel.
 * Each output is (683 * (2*src[x] + src[x+1] + 1)) >> 11; 683/2048 is
 * approximately 1/3.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1298
/**
 * Horizontal blend weighted 1:2 towards the right-hand neighbour.
 * Each output is (683 * (src[x] + 2*src[x+1] + 1)) >> 11.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    while (height-- > 0) {
        int col;

        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1309
/**
 * Vertical blend weighted 2:1 towards the current line.
 * Each output is (683 * (2*src[x] + src[x+stride] + 1)) >> 11.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + below[col] + 1;
            dst[col] = (683*sum) >> 11;
        }
        src  = below;
        dst += stride;
    }
}
1320
/**
 * Two-dimensional blend of a 2x2 neighbourhood with weights
 *   4 3
 *   3 2
 * Each output is (2731 * (weighted sum + 6)) >> 15; 2731/32768 is
 * approximately 1/12.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1331
/**
 * Two-dimensional blend of a 2x2 neighbourhood with weights
 *   3 2
 *   4 3
 * Each output is (2731 * (weighted sum + 6)) >> 15.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    while (height-- > 0) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
        src  = below;
        dst += stride;
    }
}
1342
/**
 * Vertical blend weighted 1:2 towards the line below.
 * Each output is (683 * (src[x] + 2*src[x+stride] + 1)) >> 11.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*below[col] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1353
/**
 * Two-dimensional blend of a 2x2 neighbourhood with weights
 *   3 4
 *   2 3
 * Each output is (2731 * (weighted sum + 6)) >> 15.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
        src  = below;
        dst += stride;
    }
}
1364
/**
 * Two-dimensional blend of a 2x2 neighbourhood with weights
 *   2 3
 *   3 4
 * Each output is (2731 * (weighted sum + 6)) >> 15.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1375
/**
 * Dispatch to the avg_pixels function matching the block width.
 * Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1384
/**
 * Averaging variant of the 2:1 horizontal blend: the interpolated value
 * (683 * (2*src[x] + src[x+1] + 1)) >> 11 is averaged, rounding up, with
 * the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1395
/**
 * Averaging variant of the 1:2 horizontal blend: the interpolated value
 * (683 * (src[x] + 2*src[x+1] + 1)) >> 11 is averaged, rounding up, with
 * the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    while (height-- > 0) {
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1406
/**
 * Averaging variant of the 2:1 vertical blend: the interpolated value
 * (683 * (2*src[x] + src[x+stride] + 1)) >> 11 is averaged, rounding up,
 * with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + below[col] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src  = below;
        dst += stride;
    }
}
1417
/**
 * Averaging variant of the 2x2 blend with weights
 *   4 3
 *   3 2
 * The interpolated value (2731 * (weighted sum + 6)) >> 15 is averaged,
 * rounding up, with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int pred = (2731*(4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1428
/**
 * Averaging variant of the 2x2 blend with weights
 *   3 2
 *   4 3
 * The interpolated value (2731 * (weighted sum + 6)) >> 15 is averaged,
 * rounding up, with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    while (height-- > 0) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src  = below;
        dst += stride;
    }
}
1439
/**
 * Averaging variant of the 1:2 vertical blend: the interpolated value
 * (683 * (src[x] + 2*src[x+stride] + 1)) >> 11 is averaged, rounding up,
 * with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*below[col] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1450
/**
 * Averaging variant of the 2x2 blend with weights
 *   3 4
 *   2 3
 * The interpolated value (2731 * (weighted sum + 6)) >> 15 is averaged,
 * rounding up, with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src  = below;
        dst += stride;
    }
}
1461
/**
 * Averaging variant of the 2x2 blend with weights
 *   2 3
 *   3 4
 * The interpolated value (2731 * (weighted sum + 6)) >> 15 is averaged,
 * rounding up, with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int pred = (2731*(2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
/* Disabled code: TPEL_WIDTH(width) would emit fixed-width wrappers
 * (put_tpel_pixels<width>_mcXY_c) that forward to the generic
 * put_tpel_pixels_mcXY_c functions with the width filled in.
 * Each body is a plain call; a leading "void" in front of the callee name
 * would turn the statement into a declaration and the wrapper would do
 * nothing. */
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1493
/*
 * Generator for one chroma interpolation function of width W.
 *
 * The emitted function, OPNAME h264_chroma_mc<W>_c, blends each source pixel
 * with its right, lower and lower-right neighbours using the weights
 *   A = (8-x)*(8-y)   B = x*(8-y)
 *   C = (8-x)*y       D = x*y
 * and hands the unscaled sum to OP together with the destination pixel.
 * x and y must lie in 0..7 (asserted).
 *
 * When D is zero at most one direction needs interpolating, so only two taps
 * are used: weight A on the pixel itself and weight E = B+C on the neighbour
 * `step` bytes away (the line below when C is non-zero, otherwise the pixel
 * to the right).
 */
#define H264_CHROMA_MC_FUNC(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<W; col++){\
                OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
            }\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<W; col++){\
                OP(dst[col], (A*src[col] + E*src[step+col]));\
            }\
        }\
    }\
}

/* Emit the 2-, 4- and 8-pixel-wide variants for one store operation. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 2)\
\
H264_CHROMA_MC_FUNC(OPNAME, OP, 4)\
\
H264_CHROMA_MC_FUNC(OPNAME, OP, 8)

/* Both store operations scale the weighted sum b by (b + 32) >> 6;
 * op_avg additionally averages the result, rounding up, with the
 * destination value. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1602
1603 #define QPEL_MC(r, OPNAME, RND, OP) \
1604 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1605     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1606     int i;\
1607     for(i=0; i<h; i++)\
1608     {\
1609         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1610         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1611         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1612         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1613         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1614         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1615         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1616         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1617         dst+=dstStride;\
1618         src+=srcStride;\
1619     }\
1620 }\
1621 \
1622 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1623     const int w=8;\
1624     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1625     int i;\
1626     for(i=0; i<w; i++)\
1627     {\
1628         const int src0= src[0*srcStride];\
1629         const int src1= src[1*srcStride];\
1630         const int src2= src[2*srcStride];\
1631         const int src3= src[3*srcStride];\
1632         const int src4= src[4*srcStride];\
1633         const int src5= src[5*srcStride];\
1634         const int src6= src[6*srcStride];\
1635         const int src7= src[7*srcStride];\
1636         const int src8= src[8*srcStride];\
1637         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1638         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1639         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1640         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1641         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1642         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1643         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1644         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1645         dst++;\
1646         src++;\
1647     }\
1648 }\
1649 \
1650 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1651     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1652     int i;\
1653     \
1654     for(i=0; i<h; i++)\
1655     {\
1656         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1657         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1658         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1659         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1660         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1661         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1662         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1663         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1664         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1665         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1666         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1667         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1668         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1669         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1670         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1671         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1672         dst+=dstStride;\
1673         src+=srcStride;\
1674     }\
1675 }\
1676 \
1677 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1678     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1679     int i;\
1680     const int w=16;\
1681     for(i=0; i<w; i++)\
1682     {\
1683         const int src0= src[0*srcStride];\
1684         const int src1= src[1*srcStride];\
1685         const int src2= src[2*srcStride];\
1686         const int src3= src[3*srcStride];\
1687         const int src4= src[4*srcStride];\
1688         const int src5= src[5*srcStride];\
1689         const int src6= src[6*srcStride];\
1690         const int src7= src[7*srcStride];\
1691         const int src8= src[8*srcStride];\
1692         const int src9= src[9*srcStride];\
1693         const int src10= src[10*srcStride];\
1694         const int src11= src[11*srcStride];\
1695         const int src12= src[12*srcStride];\
1696         const int src13= src[13*srcStride];\
1697         const int src14= src[14*srcStride];\
1698         const int src15= src[15*srcStride];\
1699         const int src16= src[16*srcStride];\
1700         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1701         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1702         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1703         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1704         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1705         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1706         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1707         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1708         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1709         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1710         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1711         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1712         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1713         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1714         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1715         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1716         dst++;\
1717         src++;\
1718     }\
1719 }\
1720 \
1721 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1722     uint8_t half[64];\
1723     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1724     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1725 }\
1726 \
1727 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1728     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1729 }\
1730 \
1731 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1732     uint8_t half[64];\
1733     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1734     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1735 }\
1736 \
1737 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1738     uint8_t full[16*9];\
1739     uint8_t half[64];\
1740     copy_block9(full, src, 16, stride, 9);\
1741     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1742     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1743 }\
1744 \
1745 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1746     uint8_t full[16*9];\
1747     copy_block9(full, src, 16, stride, 9);\
1748     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1749 }\
1750 \
1751 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1752     uint8_t full[16*9];\
1753     uint8_t half[64];\
1754     copy_block9(full, src, 16, stride, 9);\
1755     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1756     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1757 }\
1758 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1759     uint8_t full[16*9];\
1760     uint8_t halfH[72];\
1761     uint8_t halfV[64];\
1762     uint8_t halfHV[64];\
1763     copy_block9(full, src, 16, stride, 9);\
1764     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1765     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1766     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1767     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1768 }\
1769 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1770     uint8_t full[16*9];\
1771     uint8_t halfH[72];\
1772     uint8_t halfHV[64];\
1773     copy_block9(full, src, 16, stride, 9);\
1774     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1775     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1776     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1777     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1778 }\
1779 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1780     uint8_t full[16*9];\
1781     uint8_t halfH[72];\
1782     uint8_t halfV[64];\
1783     uint8_t halfHV[64];\
1784     copy_block9(full, src, 16, stride, 9);\
1785     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1786     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1787     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1788     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1789 }\
1790 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1791     uint8_t full[16*9];\
1792     uint8_t halfH[72];\
1793     uint8_t halfHV[64];\
1794     copy_block9(full, src, 16, stride, 9);\
1795     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1796     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1797     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1798     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1799 }\
1800 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1801     uint8_t full[16*9];\
1802     uint8_t halfH[72];\
1803     uint8_t halfV[64];\
1804     uint8_t halfHV[64];\
1805     copy_block9(full, src, 16, stride, 9);\
1806     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1807     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1808     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1809     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1810 }\
1811 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1812     uint8_t full[16*9];\
1813     uint8_t halfH[72];\
1814     uint8_t halfHV[64];\
1815     copy_block9(full, src, 16, stride, 9);\
1816     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1817     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1818     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1819     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1820 }\
1821 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1822     uint8_t full[16*9];\
1823     uint8_t halfH[72];\
1824     uint8_t halfV[64];\
1825     uint8_t halfHV[64];\
1826     copy_block9(full, src, 16, stride, 9);\
1827     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1828     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1829     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1830     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1831 }\
1832 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1833     uint8_t full[16*9];\
1834     uint8_t halfH[72];\
1835     uint8_t halfHV[64];\
1836     copy_block9(full, src, 16, stride, 9);\
1837     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1838     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1839     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1840     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1841 }\
1842 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1843     uint8_t halfH[72];\
1844     uint8_t halfHV[64];\
1845     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1846     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1847     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1848 }\
1849 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1850     uint8_t halfH[72];\
1851     uint8_t halfHV[64];\
1852     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1853     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1854     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1855 }\
1856 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1857     uint8_t full[16*9];\
1858     uint8_t halfH[72];\
1859     uint8_t halfV[64];\
1860     uint8_t halfHV[64];\
1861     copy_block9(full, src, 16, stride, 9);\
1862     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1863     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1864     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1865     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1866 }\
1867 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1868     uint8_t full[16*9];\
1869     uint8_t halfH[72];\
1870     copy_block9(full, src, 16, stride, 9);\
1871     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1872     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1873     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1874 }\
1875 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1876     uint8_t full[16*9];\
1877     uint8_t halfH[72];\
1878     uint8_t halfV[64];\
1879     uint8_t halfHV[64];\
1880     copy_block9(full, src, 16, stride, 9);\
1881     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1882     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1883     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1884     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1885 }\
1886 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1887     uint8_t full[16*9];\
1888     uint8_t halfH[72];\
1889     copy_block9(full, src, 16, stride, 9);\
1890     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1891     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1892     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1893 }\
1894 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1895     uint8_t halfH[72];\
1896     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1897     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1898 }\
1899 \
1900 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1901     uint8_t half[256];\
1902     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1903     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1904 }\
1905 \
1906 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1907     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1908 }\
1909 \
1910 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1911     uint8_t half[256];\
1912     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1913     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1914 }\
1915 \
1916 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1917     uint8_t full[24*17];\
1918     uint8_t half[256];\
1919     copy_block17(full, src, 24, stride, 17);\
1920     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1921     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1922 }\
1923 \
1924 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1925     uint8_t full[24*17];\
1926     copy_block17(full, src, 24, stride, 17);\
1927     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1928 }\
1929 \
1930 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1931     uint8_t full[24*17];\
1932     uint8_t half[256];\
1933     copy_block17(full, src, 24, stride, 17);\
1934     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1935     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1936 }\
1937 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1938     uint8_t full[24*17];\
1939     uint8_t halfH[272];\
1940     uint8_t halfV[256];\
1941     uint8_t halfHV[256];\
1942     copy_block17(full, src, 24, stride, 17);\
1943     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1944     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1945     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1946     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1947 }\
1948 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1949     uint8_t full[24*17];\
1950     uint8_t halfH[272];\
1951     uint8_t halfHV[256];\
1952     copy_block17(full, src, 24, stride, 17);\
1953     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1954     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1955     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1956     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1957 }\
1958 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1959     uint8_t full[24*17];\
1960     uint8_t halfH[272];\
1961     uint8_t halfV[256];\
1962     uint8_t halfHV[256];\
1963     copy_block17(full, src, 24, stride, 17);\
1964     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1965     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1966     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1967     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1968 }\
1969 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1970     uint8_t full[24*17];\
1971     uint8_t halfH[272];\
1972     uint8_t halfHV[256];\
1973     copy_block17(full, src, 24, stride, 17);\
1974     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1975     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1976     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1977     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1978 }\
1979 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1980     uint8_t full[24*17];\
1981     uint8_t halfH[272];\
1982     uint8_t halfV[256];\
1983     uint8_t halfHV[256];\
1984     copy_block17(full, src, 24, stride, 17);\
1985     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1986     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1987     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1988     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1989 }\
1990 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1991     uint8_t full[24*17];\
1992     uint8_t halfH[272];\
1993     uint8_t halfHV[256];\
1994     copy_block17(full, src, 24, stride, 17);\
1995     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1996     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1997     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1998     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1999 }\
2000 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2001     uint8_t full[24*17];\
2002     uint8_t halfH[272];\
2003     uint8_t halfV[256];\
2004     uint8_t halfHV[256];\
2005     copy_block17(full, src, 24, stride, 17);\
2006     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2007     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2008     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2009     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2010 }\
2011 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2012     uint8_t full[24*17];\
2013     uint8_t halfH[272];\
2014     uint8_t halfHV[256];\
2015     copy_block17(full, src, 24, stride, 17);\
2016     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2017     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2018     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2019     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2020 }\
2021 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2022     uint8_t halfH[272];\
2023     uint8_t halfHV[256];\
2024     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2025     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2026     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2027 }\
2028 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2029     uint8_t halfH[272];\
2030     uint8_t halfHV[256];\
2031     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2032     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2033     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2034 }\
2035 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2036     uint8_t full[24*17];\
2037     uint8_t halfH[272];\
2038     uint8_t halfV[256];\
2039     uint8_t halfHV[256];\
2040     copy_block17(full, src, 24, stride, 17);\
2041     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2042     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2043     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2044     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2045 }\
2046 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2047     uint8_t full[24*17];\
2048     uint8_t halfH[272];\
2049     copy_block17(full, src, 24, stride, 17);\
2050     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2051     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2052     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2053 }\
2054 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2055     uint8_t full[24*17];\
2056     uint8_t halfH[272];\
2057     uint8_t halfV[256];\
2058     uint8_t halfHV[256];\
2059     copy_block17(full, src, 24, stride, 17);\
2060     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2061     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2062     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2063     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2064 }\
2065 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2066     uint8_t full[24*17];\
2067     uint8_t halfH[272];\
2068     copy_block17(full, src, 24, stride, 17);\
2069     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2070     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2071     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2072 }\
2073 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2074     uint8_t halfH[272];\
2075     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2076     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2077 }
2078
/*
 * Store operators handed to QPEL_MC as its OP argument. Each one receives the
 * destination lvalue `a` and the raw (unscaled) filter sum `b`.
 *
 * The sums built by the mpeg4_qpel*_lowpass bodies use the tap weights
 * 20, 20, -6, -6, 3, 3, -1, -1, which add up to 32; `>>5` therefore divides
 * the sum by 32. The rounding variants add 16 before the shift, the no_rnd
 * variants add 15. The shifted value is then used as an index into `cm`.
 *
 * `cm` is NOT a macro parameter: every function that expands one of these
 * operators must have a local named `cm` in scope (the lowpass functions
 * declare it as ff_cropTbl + MAX_NEG_CROP).
 *
 * These are bare assignment expressions without a trailing semicolon; the
 * call sites write `OP(dst[i], sum);`.
 */
/* Average the scaled result with the value already stored in `a`; the +1 rounds the average up. */
2079 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
/* As op_avg, but with bias 15 and no +1 in the average. No QPEL_MC instantiation below passes it. */
2080 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
/* Overwrite `a` with the scaled result (bias 16). */
2081 #define op_put(a, b) a = cm[((b) + 16)>>5]
/* Overwrite `a` with the scaled result (bias 15). */
2082 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2083
/*
 * Instantiate the qpel function families. As used inside the macro body:
 *   2nd argument (OPNAME) - prefix pasted onto every generated function name;
 *   3rd argument (RND)    - pasted into `put ## RND ## ...` to select the
 *                           put_ / put_no_rnd_ helpers that fill the
 *                           intermediate half/halfH/halfV/halfHV buffers;
 *   4th argument (OP)     - one of the store operators above.
 * NOTE(review): the name and purpose of the 1st argument (0/1) are not visible
 * from this part of the file - confirm against the QPEL_MC header.
 *
 * The avg_ family passes RND = `_`, so its intermediate passes call the
 * put_mpeg4_qpel*_lowpass functions generated by the first instantiation.
 */
2084 QPEL_MC(0, put_       , _       , op_put)
2085 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2086 QPEL_MC(0, avg_       , _       , op_avg)
/* Disabled instantiation; note that it names op_avg, not op_avg_no_rnd. */
2087 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are only meant for the expansions above; remove them so the names do not leak further. */
2088 #undef op_avg
2089 #undef op_avg_no_rnd
2090 #undef op_put
2091 #undef op_put_no_rnd
2092
/*
 * Aliases that provide the `mc00` member of each qpel function family.
 * No qpelN_mc00_c function appears among the QPEL_MC-generated functions
 * visible in this part of the file (they start at mc10 / mc01), so the names
 * are mapped onto existing whole-block routines instead.
 * NOTE(review): presumably mc00 is the position with no sub-pel offset, where
 * no filtering is needed - confirm against the tables that reference these names.
 */
2093 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
2094 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
2095 #define put_qpel16_mc00_c ff_put_pixels16x16_c
2096 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
/* The no_rnd names map to the same put routines as the rounding ones above. */
2097 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
2098 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
2099
2100 #if 1
2101 #define H264_LOWPASS(OPNAME, OP, OP2) \
2102 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2103     const int h=2;\
2104     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2105     int i;\
2106     for(i=0; i<h; i++)\
2107     {\
2108         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2109         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2110         dst+=dstStride;\
2111         src+=srcStride;\
2112     }\
2113 }\
2114 \
2115 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2116     const int w=2;\
2117     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2118     int i;\
2119     for(i=0; i<w; i++)\
2120     {\
2121         const int srcB= src[-2*srcStride];\
2122         const int srcA= src[-1*srcStride];\
2123         const int src0= src[0 *srcStride];\
2124         const int src1= src[1 *srcStride];\
2125         const int src2= src[2 *srcStride];\
2126         const int src3= src[3 *srcStride];\
2127         const int src4= src[4 *srcStride];\
2128         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2129         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2130         dst++;\
2131         src++;\
2132     }\
2133 }\
2134 \
2135 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2136     const int h=2;\
2137     const int w=2;\
2138     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2139     int i;\
2140     src -= 2*srcStride;\
2141     for(i=0; i<h+5; i++)\
2142     {\
2143         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2144         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2145         tmp+=tmpStride;\
2146         src+=srcStride;\
2147     }\
2148     tmp -= tmpStride*(h+5-2);\
2149     for(i=0; i<w; i++)\
2150     {\
2151         const int tmpB= tmp[-2*tmpStride];\
2152         const int tmpA= tmp[-1*tmpStride];\
2153         const int tmp0= tmp[0 *tmpStride];\
2154         const int tmp1= tmp[1 *tmpStride];\
2155         const int tmp2= tmp[2 *tmpStride];\
2156         const int tmp3= tmp[3 *tmpStride];\
2157         const int tmp4= tmp[4 *tmpStride];\
2158         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2159         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2160         dst++;\
2161         tmp++;\
2162     }\
2163 }\
2164 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2165     const int h=4;\
2166     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2167     int i;\
2168     for(i=0; i<h; i++)\
2169     {\
2170         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2171         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2172         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2173         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2174         dst+=dstStride;\
2175         src+=srcStride;\
2176     }\
2177 }\
2178 \
2179 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2180     const int w=4;\
2181     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2182     int i;\
2183     for(i=0; i<w; i++)\
2184     {\
2185         const int srcB= src[-2*srcStride];\
2186         const int srcA= src[-1*srcStride];\
2187         const int src0= src[0 *srcStride];\
2188         const int src1= src[1 *srcStride];\
2189         const int src2= src[2 *srcStride];\
2190         const int src3= src[3 *srcStride];\
2191         const int src4= src[4 *srcStride];\
2192         const int src5= src[5 *srcStride];\
2193         const int src6= src[6 *srcStride];\
2194         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2195         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2196         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2197         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2198         dst++;\
2199         src++;\
2200     }\
2201 }\
2202 \
2203 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2204     const int h=4;\
2205     const int w=4;\
2206     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2207     int i;\
2208     src -= 2*srcStride;\
2209     for(i=0; i<h+5; i++)\
2210     {\
2211         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2212         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2213         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2214         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2215         tmp+=tmpStride;\
2216         src+=srcStride;\
2217     }\
2218     tmp -= tmpStride*(h+5-2);\
2219     for(i=0; i<w; i++)\
2220     {\
2221         const int tmpB= tmp[-2*tmpStride];\
2222         const int tmpA= tmp[-1*tmpStride];\
2223         const int tmp0= tmp[0 *tmpStride];\
2224         const int tmp1= tmp[1 *tmpStride];\
2225         const int tmp2= tmp[2 *tmpStride];\
2226         const int tmp3= tmp[3 *tmpStride];\
2227         const int tmp4= tmp[4 *tmpStride];\
2228         const int tmp5= tmp[5 *tmpStride];\
2229         const int tmp6= tmp[6 *tmpStride];\
2230         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2231         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2232         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2233         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2234         dst++;\
2235         tmp++;\
2236     }\
2237 }\
2238 \
2239 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2240     const int h=8;\
2241     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2242     int i;\
2243     for(i=0; i<h; i++)\
2244     {\
2245         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2246         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2247         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2248         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2249         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2250         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2251         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2252         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2253         dst+=dstStride;\
2254         src+=srcStride;\
2255     }\
2256 }\
2257 \
2258 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2259     const int w=8;\
2260     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2261     int i;\
2262     for(i=0; i<w; i++)\
2263     {\
2264         const int srcB= src[-2*srcStride];\
2265         const int srcA= src[-1*srcStride];\
2266         const int src0= src[0 *srcStride];\
2267         const int src1= src[1 *srcStride];\
2268         const int src2= src[2 *srcStride];\
2269         const int src3= src[3 *srcStride];\
2270         const int src4= src[4 *srcStride];\
2271         const int src5= src[5 *srcStride];\
2272         const int src6= src[6 *srcStride];\
2273         const int src7= src[7 *srcStride];\
2274         const int src8= src[8 *srcStride];\
2275         const int src9= src[9 *srcStride];\
2276         const int src10=src[10*srcStride];\
2277         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2278         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2279         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2280         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2281         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2282         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2283         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2284         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2285         dst++;\
2286         src++;\
2287     }\
2288 }\
2289 \
2290 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2291     const int h=8;\
2292     const int w=8;\
2293     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2294     int i;\
2295     src -= 2*srcStride;\
2296     for(i=0; i<h+5; i++)\
2297     {\
2298         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2299         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2300         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2301         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2302         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2303         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2304         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2305         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2306         tmp+=tmpStride;\
2307         src+=srcStride;\
2308     }\
2309     tmp -= tmpStride*(h+5-2);\
2310     for(i=0; i<w; i++)\
2311     {\
2312         const int tmpB= tmp[-2*tmpStride];\
2313         const int tmpA= tmp[-1*tmpStride];\
2314         const int tmp0= tmp[0 *tmpStride];\
2315         const int tmp1= tmp[1 *tmpStride];\
2316         const int tmp2= tmp[2 *tmpStride];\
2317         const int tmp3= tmp[3 *tmpStride];\
2318         const int tmp4= tmp[4 *tmpStride];\
2319         const int tmp5= tmp[5 *tmpStride];\
2320         const int tmp6= tmp[6 *tmpStride];\
2321         const int tmp7= tmp[7 *tmpStride];\
2322         const int tmp8= tmp[8 *tmpStride];\
2323         const int tmp9= tmp[9 *tmpStride];\
2324         const int tmp10=tmp[10*tmpStride];\
2325         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2326         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2327         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2328         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2329         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2330         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2331         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2332         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2333         dst++;\
2334         tmp++;\
2335     }\
2336 }\
2337 \
2338 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2339     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2340     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2341     src += 8*srcStride;\
2342     dst += 8*dstStride;\
2343     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2344     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2345 }\
2346 \
2347 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2348     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2349     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2350     src += 8*srcStride;\
2351     dst += 8*dstStride;\
2352     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2353     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2354 }\
2355 \
2356 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2357     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2358     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2359     src += 8*srcStride;\
2360     dst += 8*dstStride;\
2361     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2362     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2363 }\
2364
/*
 * H264_MC(OPNAME, SIZE) expands to the sixteen entry points
 * OPNAME##h264_qpel<SIZE>_mcXY_c for a SIZE x SIZE block.
 *
 * mc00 calls the plain pixel routine; mc20, mc02 and mc22 store a single
 * lowpass result straight into dst. Every other position builds one or two
 * SIZE x SIZE planes (a source copy and/or put_ lowpass output) and merges them
 * into dst with OPNAME##pixels<SIZE>_l2.
 *
 * Vertical filtering of a copied source uses a SIZE x (SIZE+5) buffer that
 * starts two rows above the block, so the block itself begins at offset SIZE*2.
 */
#define H264_MC(OPNAME, SIZE) \
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* mc10: source merged with the horizontal lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rowFilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(rowFilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, rowFilt, stride, stride, SIZE, SIZE);\
}\
\
/* mc20: horizontal lowpass only */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* mc30: source shifted one sample right, merged with the horizontal lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rowFilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(rowFilt, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, rowFilt, stride, stride, SIZE, SIZE);\
}\
\
/* mc01: copied source merged with its vertical lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t colSrc[SIZE*(SIZE+5)];\
    uint8_t colFilt[SIZE*SIZE];\
    copy_block ## SIZE (colSrc, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(colFilt, colSrc + SIZE*2, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, colSrc + SIZE*2, colFilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc02: vertical lowpass only, run on a copy of the source */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t colSrc[SIZE*(SIZE+5)];\
    copy_block ## SIZE (colSrc, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, colSrc + SIZE*2, stride, SIZE);\
}\
\
/* mc03: copied source shifted one row down, merged with the vertical lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t colSrc[SIZE*(SIZE+5)];\
    uint8_t colFilt[SIZE*SIZE];\
    copy_block ## SIZE (colSrc, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(colFilt, colSrc + SIZE*2, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, colSrc + SIZE*3, colFilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc11: horizontal lowpass merged with vertical lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t colSrc[SIZE*(SIZE+5)];\
    uint8_t rowFilt[SIZE*SIZE];\
    uint8_t colFilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(rowFilt, src, SIZE, stride);\
    copy_block ## SIZE (colSrc, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(colFilt, colSrc + SIZE*2, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rowFilt, colFilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc31: as mc11, with the vertical lowpass taken one column to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t colSrc[SIZE*(SIZE+5)];\
    uint8_t rowFilt[SIZE*SIZE];\
    uint8_t colFilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(rowFilt, src, SIZE, stride);\
    copy_block ## SIZE (colSrc, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(colFilt, colSrc + SIZE*2, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rowFilt, colFilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc13: as mc11, with the horizontal lowpass taken one row down */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t colSrc[SIZE*(SIZE+5)];\
    uint8_t rowFilt[SIZE*SIZE];\
    uint8_t colFilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(rowFilt, src + stride, SIZE, stride);\
    copy_block ## SIZE (colSrc, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(colFilt, colSrc + SIZE*2, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rowFilt, colFilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc33: horizontal lowpass one row down, vertical lowpass one column to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t colSrc[SIZE*(SIZE+5)];\
    uint8_t rowFilt[SIZE*SIZE];\
    uint8_t colFilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(rowFilt, src + stride, SIZE, stride);\
    copy_block ## SIZE (colSrc, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(colFilt, colSrc + SIZE*2, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rowFilt, colFilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc22: two-pass (horizontal then vertical) lowpass only */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
/* mc21: horizontal lowpass merged with the two-pass lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t rowFilt[SIZE*SIZE];\
    uint8_t midFilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(rowFilt, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(midFilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rowFilt, midFilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc23: as mc21, with the horizontal lowpass taken one row down */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t rowFilt[SIZE*SIZE];\
    uint8_t midFilt[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(rowFilt, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(midFilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rowFilt, midFilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc12: vertical lowpass merged with the two-pass lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t colSrc[SIZE*(SIZE+5)];\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t colFilt[SIZE*SIZE];\
    uint8_t midFilt[SIZE*SIZE];\
    copy_block ## SIZE (colSrc, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(colFilt, colSrc + SIZE*2, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(midFilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, colFilt, midFilt, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc32: as mc12, with the vertical lowpass taken one column to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t colSrc[SIZE*(SIZE+5)];\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t colFilt[SIZE*SIZE];\
    uint8_t midFilt[SIZE*SIZE];\
    copy_block ## SIZE (colSrc, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(colFilt, colSrc + SIZE*2, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(midFilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, colFilt, midFilt, stride, SIZE, SIZE, SIZE);\
}\
2501
/*
 * Store operators plugged into the lowpass templates. They index a pointer
 * named `cm`, which the template functions set to ff_cropTbl + MAX_NEG_CROP.
 *   op_*  : one filter pass; rounds with +16 and scales with >>5
 *           (the taps 1,-5,20,20,-5,1 sum to 32).
 *   op2_* : two cascaded passes; rounds with +512 and scales with >>10
 *           (32*32 = 1024).
 * The avg forms additionally average the new value with the current content
 * of `a`, rounding up.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the lowpass filters and the mcXY entry points; note there is no avg_ variant for size 2. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* The store operators are only meant for the instantiations above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

/* From here on the 8x8 and 16x16 mc00 names resolve to the exported ff_*_pixels helpers. */
#define put_h264_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_h264_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
2528
2529 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2530     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2531     int i;
2532
2533     for(i=0; i<h; i++){
2534         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2535         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2536         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2537         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2538         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2539         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2540         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2541         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2542         dst+=dstStride;
2543         src+=srcStride;
2544     }
2545 }
2546
/* Fixed-size entry points: each forwards to the matching variable-height pixel routine with the block height filled in. */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;
    put_pixels8_c(dst, src, stride, rows);
}

void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;
    avg_pixels8_c(dst, src, stride, rows);
}

void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;
    put_pixels16_c(dst, src, stride, rows);
}

void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;
    avg_pixels16_c(dst, src, stride, rows);
}
2559
#if CONFIG_RV40_DECODER
/* RV40 mc33 entry points: each forwards to the corresponding xy2 pixel routine with the block height filled in. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;
    put_pixels16_xy2_c(dst, src, stride, rows);
}

static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;
    avg_pixels16_xy2_c(dst, src, stride, rows);
}

static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;
    put_pixels8_xy2_c(dst, src, stride, rows);
}

static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;
    avg_pixels8_xy2_c(dst, src, stride, rows);
}
#endif /* CONFIG_RV40_DECODER */
2574
2575 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2576     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2577     int i;
2578
2579     for(i=0; i<w; i++){
2580         const int src_1= src[ -srcStride];
2581         const int src0 = src[0          ];
2582         const int src1 = src[  srcStride];
2583         const int src2 = src[2*srcStride];
2584         const int src3 = src[3*srcStride];
2585         const int src4 = src[4*srcStride];
2586         const int src5 = src[5*srcStride];
2587         const int src6 = src[6*srcStride];
2588         const int src7 = src[7*srcStride];
2589         const int src8 = src[8*srcStride];
2590         const int src9 = src[9*srcStride];
2591         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2592         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2593         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2594         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2595         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2596         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2597         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2598         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2599         src++;
2600         dst++;
2601     }
2602 }
2603
/* mc10: 8x8 source merged with its horizontal lowpass through put_pixels8_l2. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowFilt[8*8];

    wmv2_mspel8_h_lowpass(rowFilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src, rowFilt, stride, stride, 8, 8);
}
2609
/* mc20: horizontal lowpass of 8 rows written directly to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2613
/* mc30: 8x8 source shifted one sample right, merged with the horizontal lowpass through put_pixels8_l2. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowFilt[8*8];

    wmv2_mspel8_h_lowpass(rowFilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src + 1, rowFilt, stride, stride, 8, 8);
}
2619
/* mc02: vertical lowpass of 8 columns written directly to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int cols = 8;
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, cols);
}
2623
/*
 * mc12: merges the vertical lowpass of the source with the vertical lowpass of
 * the horizontally filtered source. The horizontal pass covers 11 rows starting
 * one row above the block, so the block's own first row is at offset 8.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowFilt[8*11];
    uint8_t colFilt[8*8];
    uint8_t bothFilt[8*8];

    wmv2_mspel8_h_lowpass(rowFilt, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(colFilt, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(bothFilt, rowFilt + 8, 8, 8, 8);
    put_pixels8_l2(dst, colFilt, bothFilt, stride, 8, 8, 8);
}
/*
 * mc32: same as mc12 except that the plain vertical lowpass is taken one
 * column to the right (src+1). The horizontal pass covers 11 rows starting one
 * row above the block, so the block's own first row is at offset 8.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowFilt[8*11];
    uint8_t colFilt[8*8];
    uint8_t bothFilt[8*8];

    wmv2_mspel8_h_lowpass(rowFilt, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(colFilt, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(bothFilt, rowFilt + 8, 8, 8, 8);
    put_pixels8_l2(dst, colFilt, bothFilt, stride, 8, 8, 8);
}
/* mc22: horizontal lowpass over 11 rows (starting one row above the block), then vertical lowpass of that into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rowFilt[8*11];

    wmv2_mspel8_h_lowpass(rowFilt, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rowFilt + 8, stride, 8, 8);
}
2647
2648 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2649     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2650     int x;
2651     const int strength= ff_h263_loop_filter_strength[qscale];
2652
2653     for(x=0; x<8; x++){
2654         int d1, d2, ad1;
2655         int p0= src[x-2*stride];
2656         int p1= src[x-1*stride];
2657         int p2= src[x+0*stride];
2658         int p3= src[x+1*stride];
2659         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2660
2661         if     (d<-2*strength) d1= 0;
2662         else if(d<-  strength) d1=-2*strength - d;
2663         else if(d<   strength) d1= d;
2664         else if(d< 2*strength) d1= 2*strength - d;
2665         else                   d1= 0;
2666
2667         p1 += d1;
2668         p2 -= d1;
2669         if(p1&256) p1= ~(p1>>31);
2670         if(p2&256) p2= ~(p2>>31);
2671
2672         src[x-1*stride] = p1;
2673         src[x+0*stride] = p2;
2674
2675         ad1= FFABS(d1)>>1;
2676
2677         d2= av_clip((p0-p3)/4, -ad1, ad1);
2678
2679         src[x-2*stride] = p0 - d2;
2680         src[x+  stride] = p3 + d2;
2681     }
2682     }
2683 }
2684
2685 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2686     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2687     int y;
2688     const int strength= ff_h263_loop_filter_strength[qscale];
2689
2690     for(y=0; y<8; y++){
2691         int d1, d2, ad1;
2692         int p0= src[y*stride-2];
2693         int p1= src[y*stride-1];
2694         int p2= src[y*stride+0];
2695         int p3= src[y*stride+1];
2696         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2697
2698         if     (d<-2*strength) d1= 0;
2699         else if(d<-  strength) d1=-2*strength - d;
2700         else if(d<   strength) d1= d;
2701         else if(d< 2*strength) d1= 2*strength - d;
2702         else                   d1= 0;
2703
2704         p1 += d1;
2705         p2 -= d1;
2706         if(p1&256) p1= ~(p1>>31);
2707         if(p2&256) p2= ~(p2>>31);
2708
2709         src[y*stride-1] = p1;
2710         src[y*stride+0] = p2;
2711
2712         ad1= FFABS(d1)>>1;
2713
2714         d2= av_clip((p0-p3)/4, -ad1, ad1);
2715
2716         src[y*stride-2] = p0 - d2;
2717         src[y*stride+1] = p3 + d2;
2718     }
2719     }
2720 }
2721
/**
 * In-place separable (1,2,1) smoothing of an 8x8 block.
 *
 * Pass 1 filters vertically into a scratch plane; the top and bottom rows are
 * not filtered and are stored as 4*sample so every entry carries the same gain.
 * Pass 2 filters horizontally back into src; the left and right columns only
 * undo the vertical gain (+2, >>2), interior samples use (+8, >>4).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8*8];
    int row, col;

    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row*stride;
        int *out = vert + row*8;

        if (row == 0 || row == 7) {
            for (col = 0; col < 8; col++)
                out[col] = 4*line[col];
        } else {
            for (col = 0; col < 8; col++)
                out[col] = line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    for (row = 0; row < 8; row++) {
        uint8_t *line = src + row*stride;
        const int *in = vert + row*8;

        line[0] = (in[0] + 2)>>2;
        line[7] = (in[7] + 2)>>2;
        for (col = 1; col < 7; col++)
            line[col] = (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2748
/**
 * Sum of absolute differences between two blocks that are 16 samples wide and
 * h rows tall; both blocks use line_size as row stride. v is not used.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2776
/**
 * Sum of absolute differences between pix1 and pix2 where every pix2 sample is
 * first combined with its right-hand neighbour through avg2(). 16 samples wide,
 * h rows tall; reads 17 samples per pix2 row. v is not used.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2804
/**
 * Sum of absolute differences between pix1 and pix2 where every pix2 sample is
 * first combined with the sample one row below it through avg2(). 16 samples
 * wide, h rows tall; reads h+1 rows of pix2. v is not used.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2834
/**
 * Sum of absolute differences between pix1 and pix2 where every pix2 sample is
 * first combined with its right, lower and lower-right neighbours through
 * avg4(). 16 samples wide, h rows tall; reads 17 samples on h+1 rows of pix2.
 * v is not used.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2864
/**
 * Sum of absolute differences between two blocks that are 8 samples wide and
 * h rows tall; both blocks use line_size as row stride. v is not used.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2884
/**
 * Sum of absolute differences between pix1 and pix2 where every pix2 sample is
 * first combined with its right-hand neighbour through avg2(). 8 samples wide,
 * h rows tall; reads 9 samples per pix2 row. v is not used.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2904
/**
 * Sum of absolute differences between pix1 and pix2 where every pix2 sample is
 * first combined with the sample one row below it through avg2(). 8 samples
 * wide, h rows tall; reads h+1 rows of pix2. v is not used.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2926
/**
 * Sum of absolute differences between pix1 and pix2 where every pix2 sample is
 * first combined with its right, lower and lower-right neighbours through
 * avg4(). 8 samples wide, h rows tall; reads 9 samples on h+1 rows of pix2.
 * v is not used.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2948
2949 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2950     MpegEncContext *c = v;
2951     int score1=0;
2952     int score2=0;
2953     int x,y;
2954
2955     for(y=0; y<h; y++){
2956         for(x=0; x<16; x++){
2957             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2958         }
2959         if(y+1<h){
2960             for(x=0; x<15; x++){
2961                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
2962                              - s1[x+1] + s1[x+1+stride])
2963                         -FFABS(  s2[x  ] - s2[x  +stride]
2964                              - s2[x+1] + s2[x+1+stride]);
2965             }
2966         }
2967         s1+= stride;
2968         s2+= stride;
2969     }
2970
2971     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
2972     else  return score1 + FFABS(score2)*8;
2973 }
2974
2975 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2976     MpegEncContext *c = v;
2977     int score1=0;
2978     int score2=0;
2979     int x,y;
2980
2981     for(y=0; y<h; y++){
2982         for(x=0; x<8; x++){
2983             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2984         }
2985         if(y+1<h){
2986             for(x=0; x<7; x++){
2987                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
2988                              - s1[x+1] + s1[x+1+stride])
2989                         -FFABS(  s2[x  ] - s2[x  +stride]
2990                              - s2[x+1] + s2[x+1+stride]);
2991             }
2992         }
2993         s1+= stride;
2994         s2+= stride;
2995     }
2996
2997     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
2998     else  return score1 + FFABS(score2)*8;
2999 }
3000
3001 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3002     int i;
3003     unsigned int sum=0;
3004
3005     for(i=0; i<8*8; i++){
3006         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3007         int w= weight[i];
3008         b>>= RECON_SHIFT;
3009         assert(-512<b && b<512);
3010
3011         sum += (w*b)*(w*b)>>4;
3012     }
3013     return sum>>2;
3014 }
3015
3016 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3017     int i;
3018
3019     for(i=0; i<8*8; i++){
3020         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3021     }
3022 }
3023
3024 /**
3025  * permutes an 8x8 block.
3026  * @param block the block which will be permuted according to the given permutation vector
3027  * @param permutation the permutation vector
3028  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3029  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3030  *                  (inverse) permutated to scantable order!
3031  */
3032 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3033 {
3034     int i;
3035     DCTELEM temp[64];
3036
3037     if(last<=0) return;
3038     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3039
3040     for(i=0; i<=last; i++){
3041         const int j= scantable[i];
3042         temp[j]= block[j];
3043         block[j]=0;
3044     }
3045
3046     for(i=0; i<=last; i++){
3047         const int j= scantable[i];
3048         const int perm_j= permutation[j];
3049         block[perm_j]= temp[j];
3050     }
3051 }
3052
/**
 * Comparison function that ignores all of its arguments.
 *
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
3056
3057 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3058     int i;
3059
3060     memset(cmp, 0, sizeof(void*)*6);
3061
3062     for(i=0; i<6; i++){
3063         switch(type&0xFF){
3064         case FF_CMP_SAD:
3065             cmp[i]= c->sad[i];
3066             break;
3067         case FF_CMP_SATD:
3068             cmp[i]= c->hadamard8_diff[i];
3069             break;
3070         case FF_CMP_SSE:
3071             cmp[i]= c->sse[i];
3072             break;
3073         case FF_CMP_DCT:
3074             cmp[i]= c->dct_sad[i];
3075             break;
3076         case FF_CMP_DCT264:
3077             cmp[i]= c->dct264_sad[i];
3078             break;
3079         case FF_CMP_DCTMAX:
3080             cmp[i]= c->dct_max[i];
3081             break;
3082         case FF_CMP_PSNR:
3083             cmp[i]= c->quant_psnr[i];
3084             break;
3085         case FF_CMP_BIT:
3086             cmp[i]= c->bit[i];
3087             break;
3088         case FF_CMP_RD:
3089             cmp[i]= c->rd[i];
3090             break;
3091         case FF_CMP_VSAD:
3092             cmp[i]= c->vsad[i];
3093             break;
3094         case FF_CMP_VSSE:
3095             cmp[i]= c->vsse[i];
3096             break;
3097         case FF_CMP_ZERO:
3098             cmp[i]= zero_cmp;
3099             break;
3100         case FF_CMP_NSSE:
3101             cmp[i]= c->nsse[i];
3102             break;
3103 #if CONFIG_DWT
3104         case FF_CMP_W53:
3105             cmp[i]= c->w53[i];
3106             break;
3107         case FF_CMP_W97:
3108             cmp[i]= c->w97[i];
3109             break;
3110 #endif
3111         default:
3112             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3113         }
3114     }
3115 }
3116
3117 static void clear_block_c(DCTELEM *block)
3118 {
3119     memset(block, 0, sizeof(DCTELEM)*64);
3120 }
3121
3122 /**
3123  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3124  */
3125 static void clear_blocks_c(DCTELEM *blocks)
3126 {
3127     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3128 }
3129
3130 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3131     long i;
3132     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3133         long a = *(long*)(src+i);
3134         long b = *(long*)(dst+i);
3135         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3136     }
3137     for(; i<w; i++)
3138         dst[i+0] += src[i+0];
3139 }
3140
3141 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3142     long i;
3143     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3144         long a = *(long*)(src1+i);
3145         long b = *(long*)(src2+i);
3146         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3147     }
3148     for(; i<w; i++)
3149         dst[i] = src1[i]+src2[i];
3150 }
3151
3152 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3153     long i;
3154 #if !HAVE_FAST_UNALIGNED
3155     if((long)src2 & (sizeof(long)-1)){
3156         for(i=0; i+7<w; i+=8){
3157             dst[i+0] = src1[i+0]-src2[i+0];
3158             dst[i+1] = src1[i+1]-src2[i+1];
3159             dst[i+2] = src1[i+2]-src2[i+2];
3160             dst[i+3] = src1[i+3]-src2[i+3];
3161             dst[i+4] = src1[i+4]-src2[i+4];
3162             dst[i+5] = src1[i+5]-src2[i+5];
3163             dst[i+6] = src1[i+6]-src2[i+6];
3164             dst[i+7] = src1[i+7]-src2[i+7];
3165         }
3166     }else
3167 #endif
3168     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3169         long a = *(long*)(src1+i);
3170         long b = *(long*)(src2+i);
3171         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3172     }
3173     for(; i<w; i++)
3174         dst[i+0] = src1[i+0]-src2[i+0];
3175 }
3176
/**
 * Reconstruct a row from residuals using median prediction.
 *
 * For each position the predictor is mid_pred() of the previously
 * reconstructed value, src1[i], and (left + src1[i] - left_top) & 0xFF;
 * the residual diff[i] is added to it (modulo 256) and stored in dst[i].
 *
 * @param dst      reconstructed output row
 * @param src1     reference row supplying the second predictor input
 * @param diff     residuals to add
 * @param w        number of samples
 * @param left     in: value left of the first sample; out: last reconstructed value
 * @param left_top in: value left of src1[0]; out: last value read from src1
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top  = src1[x];
        const int grad = (cur + top - top_left) & 0xFF;

        cur      = mid_pred(cur, top, grad) + diff[x];
        top_left = top;
        dst[x]   = cur;
    }

    *left     = cur;
    *left_top = top_left;
}
3193
/**
 * Compute residuals of a row against its median prediction.
 *
 * For each position the predictor is mid_pred() of the previous src2
 * value, src1[i], and (left + src1[i] - left_top) & 0xFF; dst[i] receives
 * src2[i] minus that predictor (modulo 256).
 *
 * @param dst      residual output row
 * @param src1     reference row supplying the second predictor input
 * @param src2     row being predicted
 * @param w        number of samples
 * @param left     in: value left of the first sample; out: last value read from src2
 * @param left_top in: value left of src1[0]; out: last value read from src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top  = src1[x];
        const int grad = (cur + top - top_left) & 0xFF;
        const int pred = mid_pred(cur, top, grad);

        top_left = top;
        cur      = src2[x];
        dst[x]   = cur - pred;
    }

    *left     = cur;
    *left_top = top_left;
}
3211
/**
 * Running sum over a row: every output byte is the low 8 bits of the
 * accumulator after adding the matching input byte.
 *
 * @param dst output row
 * @param src input row
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the accumulator after the last sample (not reduced to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int x;

    for (x = 0; x < w; x++) {
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
3230
/* Byte offset of each channel inside a 4-byte pixel; depends on endianness. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-channel running sum over a row of 4-byte pixels.
 *
 * Every output byte is the low 8 bits of its channel accumulator after
 * adding the matching input byte.
 *
 * @param dst   output row (4 bytes per pixel)
 * @param src   input row (4 bytes per pixel)
 * @param w     number of pixels
 * @param red   in: initial red accumulator; out: its final value
 * @param green in: initial green accumulator; out: its final value
 * @param blue  in: initial blue accumulator; out: its final value
 * @param alpha in: initial alpha accumulator; out: its final value
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++, src += 4, dst += 4) {
        acc_b += src[B];
        acc_g += src[G];
        acc_r += src[R];
        acc_a += src[A];

        dst[B] = acc_b;
        dst[G] = acc_g;
        dst[R] = acc_r;
        dst[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
3271
/* Butterfly from two inputs into two separate outputs:
 * o1 = i1 + i2, o2 = i1 - i2.
 * Expands to two bare statements (not a single block) and evaluates
 * each input twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x + y and y becomes x - y.
 * Expands to a braced block that declares the temporaries a and b, so the
 * arguments must not themselves refer to identifiers named a or b. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Magnitude of a final butterfly stage without storing it:
 * |x + y| + |x - y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3286
/**
 * Sum of absolute butterfly-transformed differences of an 8x8 block.
 *
 * The difference src - dst is run through three butterfly stages along
 * every row, then three stages along every column; the absolute values
 * of the results of the last column stage are summed.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block the difference is taken from
 * @param stride distance in bytes between two rows (same for both blocks)
 * @param h      must be 8
 * @return the sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* horizontal pass, one row of differences at a time */
    for (i = 0; i < 8; i++) {
        int *row           = temp + 8*i;
        const uint8_t *cur = src + stride*i;
        const uint8_t *ref = dst + stride*i;

        BUTTERFLY2(row[0], row[1], cur[0]-ref[0], cur[1]-ref[1]);
        BUTTERFLY2(row[2], row[3], cur[2]-ref[2], cur[3]-ref[3]);
        BUTTERFLY2(row[4], row[5], cur[4]-ref[4], cur[5]-ref[5]);
        BUTTERFLY2(row[6], row[7], cur[6]-ref[6], cur[7]-ref[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass; the last stage is folded into the absolute sum */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }

    return sum;
}
3338
/**
 * Sum of absolute butterfly-transformed samples of a single 8x8 block,
 * with the mean (DC) term removed.
 *
 * The samples are run through three butterfly stages along every row,
 * then three stages along every column; the absolute values of the
 * results of the last column stage are summed, and the contribution of
 * the first coefficient is subtracted again at the end.
 *
 * @param s      unused
 * @param src    the block
 * @param dummy  unused
 * @param stride distance in bytes between two rows
 * @param h      must be 8
 * @return the sum of absolute transformed samples minus the DC term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* horizontal pass, one row of samples at a time */
    for (i = 0; i < 8; i++) {
        int *row           = temp + 8*i;
        const uint8_t *pix = src + stride*i;

        BUTTERFLY2(row[0], row[1], pix[0], pix[1]);
        BUTTERFLY2(row[2], row[3], pix[2], pix[3]);
        BUTTERFLY2(row[4], row[5], pix[4], pix[5]);
        BUTTERFLY2(row[6], row[7], pix[6], pix[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass; the last stage is folded into the absolute sum */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }

    /* remove the mean: temp[0] + temp[32] is the first coefficient */
    sum -= FFABS(temp[8*0] + temp[8*4]);

    return sum;
}
3386
3387 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3388     MpegEncContext * const s= (MpegEncContext *)c;
3389     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3390
3391     assert(h==8);
3392
3393     s->dsp.diff_pixels(temp, src1, src2, stride);
3394     s->dsp.fdct(temp);
3395     return s->dsp.sum_abs_dctelem(temp);
3396 }
3397
#if CONFIG_GPL
/* One 8-point integer transform pass built from adds and shifts only.
 * The caller defines SRC(x) to read input x and DST(x,v) to consume
 * output x with value v before expanding this macro.
 * (Presumably the H.264-style 8x8 transform, going by the function
 * name below -- confirm.) */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute DCT8_1D-transformed differences of two 8x8 blocks.
 *
 * The difference is transformed along every row in place, then along
 * every column; the column outputs are not stored, their absolute values
 * are accumulated directly.
 *
 * @param c      MpegEncContext pointer (its dsp.diff_pixels is used)
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between two rows
 * @param h      not examined by this function
 * @return the sum of absolute transformed differences
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc = c;
    DCTELEM blk[8][8];
    int total = 0;
    int i;

    enc->dsp.diff_pixels(blk[0], src1, src2, stride);

    /* pass 1: transform each row, results written back in place */
#define SRC(x) blk[i][x]
#define DST(x,v) blk[i][x]= v
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

    /* pass 2: transform each column, accumulating absolute outputs */
#define SRC(x) blk[x][i]
#define DST(x,v) total += FFABS(v)
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

    return total;
}
#endif
3450
3451 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3452     MpegEncContext * const s= (MpegEncContext *)c;
3453     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3454     int sum=0, i;
3455
3456     assert(h==8);
3457
3458     s->dsp.diff_pixels(temp, src1, src2, stride);
3459     s->dsp.fdct(temp);
3460
3461     for(i=0; i<64; i++)
3462         sum= FFMAX(sum, FFABS(temp[i]));
3463
3464     return sum;
3465 }
3466
3467 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3468     MpegEncContext * const s= (MpegEncContext *)c;
3469     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3470     DCTELEM * const bak = temp+64;
3471     int sum=0, i;
3472
3473     assert(h==8);
3474     s->mb_intra=0;
3475
3476     s->dsp.diff_pixels(temp, src1, src2, stride);
3477
3478     memcpy(bak, temp, 64*sizeof(DCTELEM));
3479
3480     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3481     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3482     ff_simple_idct(temp); //FIXME
3483
3484     for(i=0; i<64; i++)
3485         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3486
3487     return sum;
3488 }
3489
3490 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3491     MpegEncContext * const s= (MpegEncContext *)c;
3492     const uint8_t *scantable= s->intra_scantable.permutated;
3493     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3494     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3495     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3496     int i, last, run, bits, level, distortion, start_i;
3497     const int esc_length= s->ac_esc_length;
3498     uint8_t * length;
3499     uint8_t * last_length;
3500
3501     assert(h==8);
3502
3503     copy_block8(lsrc1, src1, 8, stride, 8);
3504     copy_block8(lsrc2, src2, 8, stride, 8);
3505
3506     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3507
3508     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3509
3510     bits=0;
3511
3512     if (s->mb_intra) {
3513         start_i = 1;
3514         length     = s->intra_ac_vlc_length;
3515         last_length= s->intra_ac_vlc_last_length;
3516         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3517     } else {
3518         start_i = 0;
3519         length     = s->inter_ac_vlc_length;
3520         last_length= s->inter_ac_vlc_last_length;
3521     }
3522
3523     if(last>=start_i){
3524         run=0;
3525         for(i=start_i; i<last; i++){
3526             int j= scantable[i];
3527             level= temp[j];
3528
3529             if(level){
3530                 level+=64;
3531                 if((level&(~127)) == 0){
3532                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3533                 }else
3534                     bits+= esc_length;
3535                 run=0;
3536             }else
3537                 run++;
3538         }
3539         i= scantable[last];
3540
3541         level= temp[i] + 64;
3542
3543         assert(level - 64);
3544
3545         if((level&(~127)) == 0){
3546             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3547         }else
3548             bits+= esc_length;
3549
3550     }
3551
3552     if(last>=0){
3553         if(s->mb_intra)
3554             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3555         else
3556             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3557     }
3558
3559     s->dsp.idct_add(lsrc2, 8, temp);
3560
3561     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3562
3563     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3564 }
3565
3566 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3567     MpegEncContext * const s= (MpegEncContext *)c;
3568     const uint8_t *scantable= s->intra_scantable.permutated;
3569     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3570     int i, last, run, bits, level, start_i;
3571     const int esc_length= s->ac_esc_length;
3572     uint8_t * length;
3573     uint8_t * last_length;
3574
3575     assert(h==8);
3576
3577     s->dsp.diff_pixels(temp, src1, src2, stride);
3578
3579     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3580
3581     bits=0;
3582
3583     if (s->mb_intra) {
3584         start_i = 1;
3585         length     = s->intra_ac_vlc_length;
3586         last_length= s->intra_ac_vlc_last_length;
3587         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3588     } else {
3589         start_i = 0;
3590         length     = s->inter_ac_vlc_length;
3591         last_length= s->inter_ac_vlc_last_length;
3592     }
3593
3594     if(last>=start_i){
3595         run=0;
3596         for(i=start_i; i<last; i++){
3597             int j= scantable[i];
3598             level= temp[j];
3599
3600             if(level){
3601                 level+=64;
3602                 if((level&(~127)) == 0){
3603                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3604                 }else
3605                     bits+= esc_length;
3606                 run=0;
3607             }else
3608                 run++;
3609         }
3610         i= scantable[last];
3611
3612         level= temp[i] + 64;
3613
3614         assert(level - 64);
3615
3616         if((level&(~127)) == 0){
3617             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3618         }else
3619             bits+= esc_length;
3620     }
3621
3622     return bits;
3623 }
3624
/* Generates vsad_intra<size>_c(): the sum of absolute differences between
 * vertically adjacent samples of one block that is <size> pixels wide and
 * h rows high. The c and dummy arguments are unused. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int col, row;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++, s += stride) {                                                            \
        for (col = 0; col < size; col++) {                                                                  \
            total += FFABS(s[col] - s[col + stride]);                                                       \
        }                                                                                                   \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3642
/**
 * Sum of absolute vertical gradients of the difference of two
 * 16-pixel-wide blocks: for every pair of adjacent rows, the difference
 * s1 - s2 of the upper row minus that of the lower row.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between two rows (same for both blocks)
 * @param h      number of rows
 * @return the accumulated score
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int upper = s1[col] - s2[col];
            const int lower = s1[col + stride] - s2[col + stride];

            total += FFABS(upper - lower);
        }
    }

    return total;
}
3657
/* Square of a value; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/* Generates vsse_intra<size>_c(): the sum of squared differences between
 * vertically adjacent samples of one block that is <size> pixels wide and
 * h rows high. The c and dummy arguments are unused. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int col, row;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++, s += stride) {                                                            \
        for (col = 0; col < size; col++) {                                                                  \
            total += SQ(s[col] - s[col + stride]);                                                          \
        }                                                                                                   \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
3676
/**
 * Sum of squared vertical gradients of the difference of two
 * 16-pixel-wide blocks: for every pair of adjacent rows, the difference
 * s1 - s2 of the upper row minus that of the lower row, squared.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between two rows (same for both blocks)
 * @param h      number of rows
 * @return the accumulated score
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int upper = s1[col] - s2[col];
            const int lower = s1[col + stride] - s2[col + stride];

            total += SQ(upper - lower);
        }
    }

    return total;
}
3691
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements compared
 * @return the sum of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k = 0;

    while (k < size) {
        const int d = pix1[k] - pix2[k];

        total += d * d;
        k++;
    }

    return total;
}
3700
/* Instantiate the 16-pixel variants of the 8x8 comparison functions above.
 * WRAPPER8_16_SQ is defined outside this section; presumably it emits a
 * function with the second name that applies the first one to the 8x8
 * sub-blocks of a 16-wide area and sums the results -- confirm against
 * its definition. The dct264 variant exists only when CONFIG_GPL is set,
 * matching the guard around dct264_sad8x8_c. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3711
/**
 * Element-wise product of two float vectors: dst[i] = src0[i] * src1[i].
 *
 * @param dst  output vector
 * @param src0 first factor
 * @param src1 second factor
 * @param len  number of elements
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    const float *a = src0;
    const float *b = src1;
    float *out = dst;
    int left;

    for (left = len; left > 0; left--)
        *out++ = *a++ * *b++;
}
3717
/**
 * Element-wise product with the second vector read backwards:
 * dst[i] = src0[i] * src1[len-1-i].
 *
 * @param dst  output vector
 * @param src0 first factor, read forwards
 * @param src1 second factor, read from its last element downwards
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int fwd, rev;

    for (fwd = 0, rev = len - 1; fwd < len; fwd++, rev--)
        dst[fwd] = src0[fwd] * src1[rev];
}
3724
/**
 * Element-wise multiply-add: dst[i] = src0[i] * src1[i] + src2[i].
 *
 * @param dst  output vector
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of elements
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
3730
/**
 * Windowed overlap of two half-blocks.
 *
 * dst and win hold 2*len elements, src0 and src1 hold len elements. The
 * output is filled from both ends towards the middle: for every k in
 * 0..len-1, with m = 2*len-1-k,
 *   dst[k] = src0[k]*win[m] - src1[len-1-k]*win[k]
 *   dst[m] = src0[k]*win[k] + src1[len-1-k]*win[m]
 *
 * @param dst  output, 2*len elements
 * @param src0 first input, len elements
 * @param src1 second input, len elements (read backwards)
 * @param win  window, 2*len elements
 * @param len  half of the output length
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int lo, hi;

    for (lo = 0, hi = 2*len - 1; lo < len; lo++, hi--) {
        /* read everything first, then write */
        const float s0   = src0[lo];
        const float s1   = src1[hi - len];
        const float w_lo = win[lo];
        const float w_hi = win[hi];

        dst[lo] = s0*w_hi - s1*w_lo;
        dst[hi] = s0*w_lo + s1*w_hi;
    }
}
3747
/**
 * Multiply a float vector by a scalar: dst[i] = src[i] * mul.
 *
 * @param dst output vector
 * @param src input vector
 * @param mul scalar factor
 * @param len number of elements
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    const float *in = src;
    float *out = dst;
    int left;

    for (left = len; left > 0; left--)
        *out++ = *in++ * mul;
}
3755
/**
 * Multiply src by a sequence of 2-element vectors and a scalar.
 *
 * Output elements are produced in pairs, each pair using the next
 * pointer from sv: dst[i+k] = src[i+k] * sv[i/2][k] * mul for k in 0..1.
 * Elements are processed two at a time, so an odd len still writes a
 * full final pair.
 *
 * @param dst output vector
 * @param src input vector
 * @param sv  array of pointers to 2-element vectors, one per output pair
 * @param mul scalar factor
 * @param len number of elements
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int base;

    for (base = 0; base < len; base += 2) {
        const float *pair = *sv++;

        dst[base    ] = src[base    ] * pair[0] * mul;
        dst[base + 1] = src[base + 1] * pair[1] * mul;
    }
}
3765
/**
 * Multiply src by a sequence of 4-element vectors and a scalar.
 *
 * Output elements are produced in groups of four, each group using the
 * next pointer from sv: dst[i+k] = src[i+k] * sv[i/4][k] * mul for
 * k in 0..3. Elements are processed four at a time, so a len that is not
 * a multiple of 4 still writes a full final group.
 *
 * @param dst output vector
 * @param src input vector
 * @param sv  array of pointers to 4-element vectors, one per output group
 * @param mul scalar factor
 * @param len number of elements
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int base;

    for (base = 0; base < len; base += 4) {
        const float *quad = *sv++;

        dst[base    ] = src[base    ] * quad[0] * mul;
        dst[base + 1] = src[base + 1] * quad[1] * mul;
        dst[base + 2] = src[base + 2] * quad[2] * mul;
        dst[base + 3] = src[base + 3] * quad[3] * mul;
    }
}
3777
/**
 * Concatenate a sequence of 2-element vectors, scaled by a scalar.
 *
 * Output elements are produced in pairs, each pair using the next
 * pointer from sv: dst[i+k] = sv[i/2][k] * mul for k in 0..1. Elements
 * are processed two at a time, so an odd len still writes a full final
 * pair.
 *
 * @param dst output vector
 * @param sv  array of pointers to 2-element vectors, one per output pair
 * @param mul scalar factor
 * @param len number of elements
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int base;

    for (base = 0; base < len; base += 2) {
        const float *pair = *sv++;

        dst[base    ] = pair[0] * mul;
        dst[base + 1] = pair[1] * mul;
    }
}
3787
/**
 * Concatenate a sequence of 4-element vectors, scaled by a scalar.
 *
 * Output elements are produced in groups of four, each group using the
 * next pointer from sv: dst[i+k] = sv[i/4][k] * mul for k in 0..3.
 * Elements are processed four at a time, so a len that is not a multiple
 * of 4 still writes a full final group.
 *
 * @param dst output vector
 * @param sv  array of pointers to 4-element vectors, one per output group
 * @param mul scalar factor
 * @param len number of elements
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int base;

    for (base = 0; base < len; base += 4) {
        const float *quad = *sv++;

        dst[base    ] = quad[0] * mul;
        dst[base + 1] = quad[1] * mul;
        dst[base + 2] = quad[2] * mul;
        dst[base + 3] = quad[3] * mul;
    }
}
3799
/**
 * In-place butterfly of two float vectors: afterwards v1 holds the
 * element-wise sum and v2 the element-wise difference (old v1 - old v2).
 *
 * @param v1  first vector, receives the sums
 * @param v2  second vector, receives the differences
 * @param len number of elements
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
3810
/**
 * Dot product of two float vectors, accumulated in index order in a
 * float.
 *
 * @param v1  first vector
 * @param v2  second vector
 * @param len number of elements
 * @return the sum of v1[i] * v2[i]
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
3821
/**
 * Clip one float, given as its raw 32-bit pattern, to [min, max] where
 * min is negative and max is positive, using integer comparisons only.
 *
 * Compared as unsigned integers, a pattern with the sign bit set that is
 * larger than mini is a negative value below the minimum; with the sign
 * bit flipped on both sides, a pattern larger than maxisign is a
 * positive value above the maximum.
 * (Assumes the usual IEEE-754 single-precision layout -- confirm for
 * exotic targets.)
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the (negative) minimum
 * @param maxi     bit pattern of the (positive) maximum
 * @param maxisign maxi with its sign bit flipped
 * @return the bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    /* 1U: shifting a signed 1 into bit 31 is undefined behaviour */
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
3830
/**
 * Clip a float vector to [*min, *max] for the case where *min is negative
 * and *max is positive, working on the raw 32-bit patterns through
 * clipf_c_one().
 *
 * Elements are processed eight at a time, so a len that is not a
 * multiple of 8 still reads and writes a full final group of eight.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the (negative) lower bound
 * @param max pointer to the (positive) upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    /* 1U: shifting a signed 1 into bit 31 is undefined behaviour */
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip a float vector to [min, max].
 *
 * When min is negative and max is positive the integer-comparison path
 * vector_clipf_c_opposite_sign() is used; otherwise every element goes
 * through av_clipf(). Elements are processed eight at a time, so a len
 * that is not a multiple of 8 still reads and writes a full final group
 * of eight.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
3866
/**
 * Dot product of two int16_t vectors with every product shifted right
 * by shift before it is accumulated.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to each product
 * @return the sum of (v1[i] * v2[i]) >> shift
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k = 0;

    for (; order != 0; order--, k++)
        acc += (v1[k] * v2[k]) >> shift;

    return acc;
}
3876
/**
 * Dot product of v1 and v2, combined with an in-place update of v1.
 *
 * For every element the product v1[i] * v2[i] is accumulated using the
 * value of v1[i] before the update, then mul * v3[i] is added to v1[i].
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector of the dot product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements
 * @param mul   factor applied to v3
 * @return the sum of (old v1[i]) * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k = 0;

    for (; order != 0; order--, k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }

    return acc;
}
3886
/* Fixed-point cosine factors used by the WMV2 IDCT row and column passes. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * One-dimensional WMV2 inverse DCT over 8 consecutive coefficients,
 * in place. Results are rounded and scaled down by 8 bits.
 *
 * @param b the 8 coefficients of one row
 */
static void wmv2_idct_row(short * b)
{
    /* step 1: rotate the input pairs */
    const int a1 = W1*b[1]+W7*b[7];
    const int a7 = W7*b[1]-W1*b[7];
    const int a5 = W5*b[5]+W3*b[3];
    const int a3 = W3*b[5]-W5*b[3];
    const int a2 = W2*b[2]+W6*b[6];
    const int a6 = W6*b[2]-W2*b[6];
    const int a0 = W0*b[0]+W0*b[4];
    const int a4 = W0*b[0]-W0*b[4];
    /* step 2: combine the odd terms (inputs 1, 3, 5, 7); 181/256 ~ 1/sqrt(2) */
    const int s1 = (181*(a1-a5+a7-a3)+128)>>8;
    const int s2 = (181*(a1-a5-a7+a3)+128)>>8;

    /* step 3: final butterflies with rounding */
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
3922 static void wmv2_idct_col(short * b)
3923 {
3924     int s1,s2;
3925     int a0,a1,a2,a3,a4,a5,a6,a7;
3926     /*step 1, with extended precision*/
3927     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3928     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3929     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3930     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3931     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3932     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3933     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
3934     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
3935     /*step 2*/
3936     s1 = (181*(a1-a5+a7-a3)+128)>>8;
3937     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3938     /*step 3*/
3939     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3940     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
3941     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
3942     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3943
3944     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3945     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
3946     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
3947     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
3948 }
/**
 * WMV2 inverse DCT of an 8x8 block, in place: all eight rows are
 * transformed first, then all eight columns.
 *
 * @param block the 64 coefficients, row after row
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
3959 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3960  converted */
3961 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
3962 {
3963     ff_wmv2_idct_c(block);
3964     put_pixels_clamped_c(block, dest, line_size);
3965 }
3966 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
3967 {
3968     ff_wmv2_idct_c(block);
3969     add_pixels_clamped_c(block, dest, line_size);
3970 }
3971 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3972 {
3973     j_rev_dct (block);
3974     put_pixels_clamped_c(block, dest, line_size);
3975 }
3976 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3977 {
3978     j_rev_dct (block);
3979     add_pixels_clamped_c(block, dest, line_size);
3980 }
3981
3982 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3983 {
3984     j_rev_dct4 (block);
3985     put_pixels_clamped4_c(block, dest, line_size);
3986 }
3987 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3988 {
3989     j_rev_dct4 (block);
3990     add_pixels_clamped4_c(block, dest, line_size);
3991 }
3992
3993 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3994 {
3995     j_rev_dct2 (block);
3996     put_pixels_clamped2_c(block, dest, line_size);
3997 }
3998 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3999 {
4000     j_rev_dct2 (block);
4001     add_pixels_clamped2_c(block, dest, line_size);
4002 }
4003
4004 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4005 {
4006     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4007
4008     dest[0] = cm[(block[0] + 4)>>3];
4009 }
4010 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4011 {
4012     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4013
4014     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4015 }
4016
4017 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4018
4019 /* init static data */
4020 av_cold void dsputil_static_init(void)
4021 {
4022     int i;
4023
4024     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4025     for(i=0;i<MAX_NEG_CROP;i++) {
4026         ff_cropTbl[i] = 0;
4027         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4028     }
4029
4030     for(i=0;i<512;i++) {
4031         ff_squareTbl[i] = (i - 256) * (i - 256);
4032     }
4033
4034     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4035 }
4036
4037 int ff_check_alignment(void){
4038     static int did_fail=0;
4039     DECLARE_ALIGNED(16, int, aligned);
4040
4041     if((intptr_t)&aligned & 15){
4042         if(!did_fail){
4043 #if HAVE_MMX || HAVE_ALTIVEC
4044             av_log(NULL, AV_LOG_ERROR,
4045                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4046                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4047                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4048                 "Do not report crashes to FFmpeg developers.\n");
4049 #endif
4050             did_fail=1;
4051         }
4052         return -1;
4053     }
4054     return 0;
4055 }
4056
4057 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4058 {
4059     int i;
4060
4061     ff_check_alignment();
4062
4063 #if CONFIG_ENCODERS
4064     if(avctx->dct_algo==FF_DCT_FASTINT) {
4065         c->fdct = fdct_ifast;
4066         c->fdct248 = fdct_ifast248;
4067     }
4068     else if(avctx->dct_algo==FF_DCT_FAAN) {
4069         c->fdct = ff_faandct;
4070         c->fdct248 = ff_faandct248;
4071     }
4072     else {
4073         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4074         c->fdct248 = ff_fdct248_islow;
4075     }
4076 #endif //CONFIG_ENCODERS
4077
4078     if(avctx->lowres==1){
4079         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4080             c->idct_put= ff_jref_idct4_put;
4081             c->idct_add= ff_jref_idct4_add;
4082         }else{
4083             c->idct_put= ff_h264_lowres_idct_put_c;
4084             c->idct_add= ff_h264_lowres_idct_add_c;
4085         }
4086         c->idct    = j_rev_dct4;
4087         c->idct_permutation_type= FF_NO_IDCT_PERM;
4088     }else if(avctx->lowres==2){
4089         c->idct_put= ff_jref_idct2_put;
4090         c->idct_add= ff_jref_idct2_add;
4091         c->idct    = j_rev_dct2;
4092         c->idct_permutation_type= FF_NO_IDCT_PERM;
4093     }else if(avctx->lowres==3){
4094         c->idct_put= ff_jref_idct1_put;
4095         c->idct_add= ff_jref_idct1_add;
4096         c->idct    = j_rev_dct1;
4097         c->idct_permutation_type= FF_NO_IDCT_PERM;
4098     }else{
4099         if(avctx->idct_algo==FF_IDCT_INT){
4100             c->idct_put= ff_jref_idct_put;
4101             c->idct_add= ff_jref_idct_add;
4102             c->idct    = j_rev_dct;
4103             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4104         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4105                 avctx->idct_algo==FF_IDCT_VP3){
4106             c->idct_put= ff_vp3_idct_put_c;
4107             c->idct_add= ff_vp3_idct_add_c;
4108             c->idct    = ff_vp3_idct_c;
4109             c->idct_permutation_type= FF_NO_IDCT_PERM;
4110         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4111             c->idct_put= ff_wmv2_idct_put_c;
4112             c->idct_add= ff_wmv2_idct_add_c;
4113             c->idct    = ff_wmv2_idct_c;
4114             c->idct_permutation_type= FF_NO_IDCT_PERM;
4115         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4116             c->idct_put= ff_faanidct_put;
4117             c->idct_add= ff_faanidct_add;
4118             c->idct    = ff_faanidct;
4119             c->idct_permutation_type= FF_NO_IDCT_PERM;
4120         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4121             c->idct_put= ff_ea_idct_put_c;
4122             c->idct_permutation_type= FF_NO_IDCT_PERM;
4123         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4124             c->idct     = ff_bink_idct_c;
4125             c->idct_add = ff_bink_idct_add_c;
4126             c->idct_put = ff_bink_idct_put_c;
4127             c->idct_permutation_type = FF_NO_IDCT_PERM;
4128         }else{ //accurate/default
4129             c->idct_put= ff_simple_idct_put;
4130             c->idct_add= ff_simple_idct_add;
4131             c->idct    = ff_simple_idct;
4132             c->idct_permutation_type= FF_NO_IDCT_PERM;
4133         }
4134     }
4135
4136     c->get_pixels = get_pixels_c;
4137     c->diff_pixels = diff_pixels_c;
4138     c->put_pixels_clamped = put_pixels_clamped_c;
4139     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4140     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4141     c->add_pixels_clamped = add_pixels_clamped_c;
4142     c->add_pixels8 = add_pixels8_c;
4143     c->add_pixels4 = add_pixels4_c;
4144     c->sum_abs_dctelem = sum_abs_dctelem_c;
4145     c->emulated_edge_mc = ff_emulated_edge_mc;
4146     c->gmc1 = gmc1_c;
4147     c->gmc = ff_gmc_c;
4148     c->clear_block = clear_block_c;
4149     c->clear_blocks = clear_blocks_c;
4150     c->pix_sum = pix_sum_c;
4151     c->pix_norm1 = pix_norm1_c;
4152
4153     c->fill_block_tab[0] = fill_block16_c;
4154     c->fill_block_tab[1] = fill_block8_c;
4155     c->scale_block = scale_block_c;
4156
4157     /* TODO [0] 16  [1] 8 */
4158     c->pix_abs[0][0] = pix_abs16_c;
4159     c->pix_abs[0][1] = pix_abs16_x2_c;
4160     c->pix_abs[0][2] = pix_abs16_y2_c;
4161     c->pix_abs[0][3] = pix_abs16_xy2_c;
4162     c->pix_abs[1][0] = pix_abs8_c;
4163     c->pix_abs[1][1] = pix_abs8_x2_c;
4164     c->pix_abs[1][2] = pix_abs8_y2_c;
4165     c->pix_abs[1][3] = pix_abs8_xy2_c;
4166
4167 #define dspfunc(PFX, IDX, NUM) \
4168     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4169     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4170     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4171     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4172
4173     dspfunc(put, 0, 16);
4174     dspfunc(put_no_rnd, 0, 16);
4175     dspfunc(put, 1, 8);
4176     dspfunc(put_no_rnd, 1, 8);
4177     dspfunc(put, 2, 4);
4178     dspfunc(put, 3, 2);
4179
4180     dspfunc(avg, 0, 16);
4181     dspfunc(avg_no_rnd, 0, 16);
4182     dspfunc(avg, 1, 8);
4183     dspfunc(avg_no_rnd, 1, 8);
4184     dspfunc(avg, 2, 4);
4185     dspfunc(avg, 3, 2);
4186 #undef dspfunc
4187
4188     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4189     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4190
4191     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4192     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4193     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4194     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4195     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4196     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4197     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4198     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4199     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4200
4201     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4202     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4203     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4204     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4205     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4206     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4207     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4208     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4209     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4210
4211 #define dspfunc(PFX, IDX, NUM) \
4212     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4213     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4214     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4215     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4216     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4217     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4218     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4219     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4220     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4221     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4222     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4223     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4224     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4225     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4226     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4227     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4228
4229     dspfunc(put_qpel, 0, 16);
4230     dspfunc(put_no_rnd_qpel, 0, 16);
4231
4232     dspfunc(avg_qpel, 0, 16);
4233     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4234
4235     dspfunc(put_qpel, 1, 8);
4236     dspfunc(put_no_rnd_qpel, 1, 8);
4237
4238     dspfunc(avg_qpel, 1, 8);
4239     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4240
4241     dspfunc(put_h264_qpel, 0, 16);
4242     dspfunc(put_h264_qpel, 1, 8);
4243     dspfunc(put_h264_qpel, 2, 4);
4244     dspfunc(put_h264_qpel, 3, 2);
4245     dspfunc(avg_h264_qpel, 0, 16);
4246     dspfunc(avg_h264_qpel, 1, 8);
4247     dspfunc(avg_h264_qpel, 2, 4);
4248
4249 #undef dspfunc
4250     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4251     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4252     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4253     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4254     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4255     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4256
4257     c->draw_edges = draw_edges_c;
4258
4259 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4260     ff_mlp_init(c, avctx);
4261 #endif
4262 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4263     ff_intrax8dsp_init(c,avctx);
4264 #endif
4265 #if CONFIG_RV30_DECODER
4266     ff_rv30dsp_init(c,avctx);
4267 #endif
4268 #if CONFIG_RV40_DECODER
4269     ff_rv40dsp_init(c,avctx);
4270     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4271     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4272     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4273     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4274 #endif
4275
4276     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
4277     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4278     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4279     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4280     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4281     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4282     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4283     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4284
4285 #define SET_CMP_FUNC(name) \
4286     c->name[0]= name ## 16_c;\
4287     c->name[1]= name ## 8x8_c;
4288
4289     SET_CMP_FUNC(hadamard8_diff)
4290     c->hadamard8_diff[4]= hadamard8_intra16_c;
4291     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4292     SET_CMP_FUNC(dct_sad)
4293     SET_CMP_FUNC(dct_max)
4294 #if CONFIG_GPL
4295     SET_CMP_FUNC(dct264_sad)
4296 #endif
4297     c->sad[0]= pix_abs16_c;
4298     c->sad[1]= pix_abs8_c;
4299     c->sse[0]= sse16_c;
4300     c->sse[1]= sse8_c;
4301     c->sse[2]= sse4_c;
4302     SET_CMP_FUNC(quant_psnr)
4303     SET_CMP_FUNC(rd)
4304     SET_CMP_FUNC(bit)
4305     c->vsad[0]= vsad16_c;
4306     c->vsad[4]= vsad_intra16_c;
4307     c->vsad[5]= vsad_intra8_c;
4308     c->vsse[0]= vsse16_c;
4309     c->vsse[4]= vsse_intra16_c;
4310     c->vsse[5]= vsse_intra8_c;
4311     c->nsse[0]= nsse16_c;
4312     c->nsse[1]= nsse8_c;
4313 #if CONFIG_DWT
4314     ff_dsputil_init_dwt(c);
4315 #endif
4316
4317     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4318
4319     c->add_bytes= add_bytes_c;
4320     c->add_bytes_l2= add_bytes_l2_c;
4321     c->diff_bytes= diff_bytes_c;
4322     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4323     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4324     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4325     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4326     c->bswap_buf= bswap_buf;
4327 #if CONFIG_PNG_DECODER
4328     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4329 #endif
4330
4331     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4332         c->h263_h_loop_filter= h263_h_loop_filter_c;
4333         c->h263_v_loop_filter= h263_v_loop_filter_c;
4334     }
4335
4336     if (CONFIG_VP3_DECODER) {
4337         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4338         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4339         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4340     }
4341
4342     c->h261_loop_filter= h261_loop_filter_c;
4343
4344     c->try_8x8basis= try_8x8basis_c;
4345     c->add_8x8basis= add_8x8basis_c;
4346
4347 #if CONFIG_VORBIS_DECODER
4348     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4349 #endif
4350 #if CONFIG_AC3_DECODER
4351     c->ac3_downmix = ff_ac3_downmix_c;
4352 #endif
4353     c->vector_fmul = vector_fmul_c;
4354     c->vector_fmul_reverse = vector_fmul_reverse_c;
4355     c->vector_fmul_add = vector_fmul_add_c;
4356     c->vector_fmul_window = vector_fmul_window_c;
4357     c->vector_clipf = vector_clipf_c;
4358     c->scalarproduct_int16 = scalarproduct_int16_c;
4359     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4360     c->scalarproduct_float = scalarproduct_float_c;
4361     c->butterflies_float = butterflies_float_c;
4362     c->vector_fmul_scalar = vector_fmul_scalar_c;
4363
4364     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4365     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4366
4367     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4368     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4369
4370     c->shrink[0]= av_image_copy_plane;
4371     c->shrink[1]= ff_shrink22;
4372     c->shrink[2]= ff_shrink44;
4373     c->shrink[3]= ff_shrink88;
4374
4375     c->prefetch= just_return;
4376
4377     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4378     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4379
4380     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4381     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4382     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4383     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4384     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4385     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4386     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4387     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4388     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4389
4390     for(i=0; i<64; i++){
4391         if(!c->put_2tap_qpel_pixels_tab[0][i])
4392             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4393         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4394             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4395     }
4396
4397     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4398     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4399     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4400     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4401
4402     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4403     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4404     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4405     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4406
4407     switch(c->idct_permutation_type){
4408     case FF_NO_IDCT_PERM:
4409         for(i=0; i<64; i++)
4410             c->idct_permutation[i]= i;
4411         break;
4412     case FF_LIBMPEG2_IDCT_PERM:
4413         for(i=0; i<64; i++)
4414             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4415         break;
4416     case FF_SIMPLE_IDCT_PERM:
4417         for(i=0; i<64; i++)
4418             c->idct_permutation[i]= simple_mmx_permutation[i];
4419         break;
4420     case FF_TRANSPOSE_IDCT_PERM:
4421         for(i=0; i<64; i++)
4422             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4423         break;
4424     case FF_PARTTRANS_IDCT_PERM:
4425         for(i=0; i<64; i++)
4426             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4427         break;
4428     case FF_SSE2_IDCT_PERM:
4429         for(i=0; i<64; i++)
4430             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4431         break;
4432     default:
4433         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4434     }
4435 }
4436