git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "libavcore/imgutils.h"
  31 #include "avcodec.h"
  32 #include "dsputil.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "faanidct.h"
  36 #include "mathops.h"
  37 #include "mpegvideo.h"
  38 #include "config.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "png.h"
  42
/* Lookup table used by the *_clamped pixel helpers in this file. They index it
 * through ff_cropTbl + MAX_NEG_CROP so that negative and too-large values are
 * still valid indices. It is only zero-initialized here; presumably it is
 * filled in by init code outside this part of the file -- TODO confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table read through ff_squareTbl + 256 by pix_norm1_c and the sse*_c
 * helpers, which index it with pixel values and (possibly negative) pixel
 * differences. Only zero-initialized here; presumably filled with squares by
 * init code elsewhere -- TODO confirm. */
uint32_t ff_squareTbl[512] = {0, };
  45
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 is the byte 0x01 repeated across an unsigned long; multiplying it
// by a byte value repeats that value in every byte position)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
  49
/* Zigzag scan order for a 64-entry block: entry i is the position (row * 8 +
 * column when the block is read as 8x8) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  60
/* Zigzag scan table specific to the 248 IDCT. NOTE: unlike the
   specification, the two fields are interleaved in this table. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  73
/* Inverse of zigzag_direct without any IDCT permutation applied, with 1 added
 * to every entry; intended for the MMX quantizer. The table is only declared
 * here (16-byte aligned); its contents are written elsewhere. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  76
/* Alternate scan order that favours horizontal runs: 64 entries in the range
 * 0..63, in the same position encoding as ff_zigzag_direct. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  87
/* Alternate scan order that favours vertical runs: 64 entries in the range
 * 0..63, in the same position encoding as ff_zigzag_direct. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
  98
/* Input permutation for the simple_idct_mmx: 64 entries in the range
 * 0x00..0x3F, one per coefficient position. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 110
/* Eight-entry index permutation; judging by the name it reorders positions
 * within a row for the SSE2 IDCT -- its user is outside this part of the
 * file, TODO confirm. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 112
 113 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 114     int i;
 115     int end;
 116
 117     st->scantable= src_scantable;
 118
 119     for(i=0; i<64; i++){
 120         int j;
 121         j = src_scantable[i];
 122         st->permutated[i] = permutation[j];
 123 #if ARCH_PPC
 124         st->inverse[j] = i;
 125 #endif
 126     }
 127
 128     end=-1;
 129     for(i=0; i<64; i++){
 130         int j;
 131         j = st->permutated[i];
 132         if(j>end) end=j;
 133         st->raster_end[i]= end;
 134     }
 135 }
 136
 137 static int pix_sum_c(uint8_t * pix, int line_size)
 138 {
 139     int s, i, j;
 140
 141     s = 0;
 142     for (i = 0; i < 16; i++) {
 143         for (j = 0; j < 16; j += 8) {
 144             s += pix[0];
 145             s += pix[1];
 146             s += pix[2];
 147             s += pix[3];
 148             s += pix[4];
 149             s += pix[5];
 150             s += pix[6];
 151             s += pix[7];
 152             pix += 8;
 153         }
 154         pix += line_size - 16;
 155     }
 156     return s;
 157 }
 158
 159 static int pix_norm1_c(uint8_t * pix, int line_size)
 160 {
 161     int s, i, j;
 162     uint32_t *sq = ff_squareTbl + 256;
 163
 164     s = 0;
 165     for (i = 0; i < 16; i++) {
 166         for (j = 0; j < 16; j += 8) {
 167 #if 0
 168             s += sq[pix[0]];
 169             s += sq[pix[1]];
 170             s += sq[pix[2]];
 171             s += sq[pix[3]];
 172             s += sq[pix[4]];
 173             s += sq[pix[5]];
 174             s += sq[pix[6]];
 175             s += sq[pix[7]];
 176 #else
 177 #if LONG_MAX > 2147483647
 178             register uint64_t x=*(uint64_t*)pix;
 179             s += sq[x&0xff];
 180             s += sq[(x>>8)&0xff];
 181             s += sq[(x>>16)&0xff];
 182             s += sq[(x>>24)&0xff];
 183             s += sq[(x>>32)&0xff];
 184             s += sq[(x>>40)&0xff];
 185             s += sq[(x>>48)&0xff];
 186             s += sq[(x>>56)&0xff];
 187 #else
 188             register uint32_t x=*(uint32_t*)pix;
 189             s += sq[x&0xff];
 190             s += sq[(x>>8)&0xff];
 191             s += sq[(x>>16)&0xff];
 192             s += sq[(x>>24)&0xff];
 193             x=*(uint32_t*)(pix+4);
 194             s += sq[x&0xff];
 195             s += sq[(x>>8)&0xff];
 196             s += sq[(x>>16)&0xff];
 197             s += sq[(x>>24)&0xff];
 198 #endif
 199 #endif
 200             pix += 8;
 201         }
 202         pix += line_size - 16;
 203     }
 204     return s;
 205 }
 206
 207 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 208     int i;
 209
 210     for(i=0; i+8<=w; i+=8){
 211         dst[i+0]= av_bswap32(src[i+0]);
 212         dst[i+1]= av_bswap32(src[i+1]);
 213         dst[i+2]= av_bswap32(src[i+2]);
 214         dst[i+3]= av_bswap32(src[i+3]);
 215         dst[i+4]= av_bswap32(src[i+4]);
 216         dst[i+5]= av_bswap32(src[i+5]);
 217         dst[i+6]= av_bswap32(src[i+6]);
 218         dst[i+7]= av_bswap32(src[i+7]);
 219     }
 220     for(;i<w; i++){
 221         dst[i+0]= av_bswap32(src[i+0]);
 222     }
 223 }
 224
 225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 226 {
 227     int s, i;
 228     uint32_t *sq = ff_squareTbl + 256;
 229
 230     s = 0;
 231     for (i = 0; i < h; i++) {
 232         s += sq[pix1[0] - pix2[0]];
 233         s += sq[pix1[1] - pix2[1]];
 234         s += sq[pix1[2] - pix2[2]];
 235         s += sq[pix1[3] - pix2[3]];
 236         pix1 += line_size;
 237         pix2 += line_size;
 238     }
 239     return s;
 240 }
 241
 242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 243 {
 244     int s, i;
 245     uint32_t *sq = ff_squareTbl + 256;
 246
 247     s = 0;
 248     for (i = 0; i < h; i++) {
 249         s += sq[pix1[0] - pix2[0]];
 250         s += sq[pix1[1] - pix2[1]];
 251         s += sq[pix1[2] - pix2[2]];
 252         s += sq[pix1[3] - pix2[3]];
 253         s += sq[pix1[4] - pix2[4]];
 254         s += sq[pix1[5] - pix2[5]];
 255         s += sq[pix1[6] - pix2[6]];
 256         s += sq[pix1[7] - pix2[7]];
 257         pix1 += line_size;
 258         pix2 += line_size;
 259     }
 260     return s;
 261 }
 262
 263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 264 {
 265     int s, i;
 266     uint32_t *sq = ff_squareTbl + 256;
 267
 268     s = 0;
 269     for (i = 0; i < h; i++) {
 270         s += sq[pix1[ 0] - pix2[ 0]];
 271         s += sq[pix1[ 1] - pix2[ 1]];
 272         s += sq[pix1[ 2] - pix2[ 2]];
 273         s += sq[pix1[ 3] - pix2[ 3]];
 274         s += sq[pix1[ 4] - pix2[ 4]];
 275         s += sq[pix1[ 5] - pix2[ 5]];
 276         s += sq[pix1[ 6] - pix2[ 6]];
 277         s += sq[pix1[ 7] - pix2[ 7]];
 278         s += sq[pix1[ 8] - pix2[ 8]];
 279         s += sq[pix1[ 9] - pix2[ 9]];
 280         s += sq[pix1[10] - pix2[10]];
 281         s += sq[pix1[11] - pix2[11]];
 282         s += sq[pix1[12] - pix2[12]];
 283         s += sq[pix1[13] - pix2[13]];
 284         s += sq[pix1[14] - pix2[14]];
 285         s += sq[pix1[15] - pix2[15]];
 286
 287         pix1 += line_size;
 288         pix2 += line_size;
 289     }
 290     return s;
 291 }
 292
 293 /* draw the edges of width 'w' of an image of size width, height */
 294 //FIXME check that this is ok for mpeg4 interlaced
 295 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 296 {
 297     uint8_t *ptr, *last_line;
 298     int i;
 299
 300     last_line = buf + (height - 1) * wrap;
 301     for(i=0;i<w;i++) {
 302         /* top and bottom */
 303         memcpy(buf - (i + 1) * wrap, buf, width);
 304         memcpy(last_line + (i + 1) * wrap, last_line, width);
 305     }
 306     /* left and right */
 307     ptr = buf;
 308     for(i=0;i<height;i++) {
 309         memset(ptr - w, ptr[0], w);
 310         memset(ptr + width, ptr[width-1], w);
 311         ptr += wrap;
 312     }
 313     /* corners */
 314     for(i=0;i<w;i++) {
 315         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 316         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 317         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 318         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 319     }
 320 }
 321
 322 /**
 323  * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 324  * @param buf destination buffer
 325  * @param src source buffer
 326  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 327  * @param block_w width of block
 328  * @param block_h height of block
 329  * @param src_x x coordinate of the top left sample of the block in the source buffer
 330  * @param src_y y coordinate of the top left sample of the block in the source buffer
 331  * @param w width of the source buffer
 332  * @param h height of the source buffer
 333  */
 334 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
 335                                     int src_x, int src_y, int w, int h){
 336     int x, y;
 337     int start_y, start_x, end_y, end_x;
 338
 339     if(src_y>= h){
 340         src+= (h-1-src_y)*linesize;
 341         src_y=h-1;
 342     }else if(src_y<=-block_h){
 343         src+= (1-block_h-src_y)*linesize;
 344         src_y=1-block_h;
 345     }
 346     if(src_x>= w){
 347         src+= (w-1-src_x);
 348         src_x=w-1;
 349     }else if(src_x<=-block_w){
 350         src+= (1-block_w-src_x);
 351         src_x=1-block_w;
 352     }
 353
 354     start_y= FFMAX(0, -src_y);
 355     start_x= FFMAX(0, -src_x);
 356     end_y= FFMIN(block_h, h-src_y);
 357     end_x= FFMIN(block_w, w-src_x);
 358
 359     // copy existing part
 360     for(y=start_y; y<end_y; y++){
 361         for(x=start_x; x<end_x; x++){
 362             buf[x + y*linesize]= src[x + y*linesize];
 363         }
 364     }
 365
 366     //top
 367     for(y=0; y<start_y; y++){
 368         for(x=start_x; x<end_x; x++){
 369             buf[x + y*linesize]= buf[x + start_y*linesize];
 370         }
 371     }
 372
 373     //bottom
 374     for(y=end_y; y<block_h; y++){
 375         for(x=start_x; x<end_x; x++){
 376             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 377         }
 378     }
 379
 380     for(y=0; y<block_h; y++){
 381        //left
 382         for(x=0; x<start_x; x++){
 383             buf[x + y*linesize]= buf[start_x + y*linesize];
 384         }
 385
 386        //right
 387         for(x=end_x; x<block_w; x++){
 388             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 389         }
 390     }
 391 }
 392
 393 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 394 {
 395     int i;
 396
 397     /* read the pixels */
 398     for(i=0;i<8;i++) {
 399         block[0] = pixels[0];
 400         block[1] = pixels[1];
 401         block[2] = pixels[2];
 402         block[3] = pixels[3];
 403         block[4] = pixels[4];
 404         block[5] = pixels[5];
 405         block[6] = pixels[6];
 406         block[7] = pixels[7];
 407         pixels += line_size;
 408         block += 8;
 409     }
 410 }
 411
 412 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 413                           const uint8_t *s2, int stride){
 414     int i;
 415
 416     /* read the pixels */
 417     for(i=0;i<8;i++) {
 418         block[0] = s1[0] - s2[0];
 419         block[1] = s1[1] - s2[1];
 420         block[2] = s1[2] - s2[2];
 421         block[3] = s1[3] - s2[3];
 422         block[4] = s1[4] - s2[4];
 423         block[5] = s1[5] - s2[5];
 424         block[6] = s1[6] - s2[6];
 425         block[7] = s1[7] - s2[7];
 426         s1 += stride;
 427         s2 += stride;
 428         block += 8;
 429     }
 430 }
 431
 432
 433 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 434                                  int line_size)
 435 {
 436     int i;
 437     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 438
 439     /* read the pixels */
 440     for(i=0;i<8;i++) {
 441         pixels[0] = cm[block[0]];
 442         pixels[1] = cm[block[1]];
 443         pixels[2] = cm[block[2]];
 444         pixels[3] = cm[block[3]];
 445         pixels[4] = cm[block[4]];
 446         pixels[5] = cm[block[5]];
 447         pixels[6] = cm[block[6]];
 448         pixels[7] = cm[block[7]];
 449
 450         pixels += line_size;
 451         block += 8;
 452     }
 453 }
 454
 455 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 456                                  int line_size)
 457 {
 458     int i;
 459     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 460
 461     /* read the pixels */
 462     for(i=0;i<4;i++) {
 463         pixels[0] = cm[block[0]];
 464         pixels[1] = cm[block[1]];
 465         pixels[2] = cm[block[2]];
 466         pixels[3] = cm[block[3]];
 467
 468         pixels += line_size;
 469         block += 8;
 470     }
 471 }
 472
 473 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 474                                  int line_size)
 475 {
 476     int i;
 477     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 478
 479     /* read the pixels */
 480     for(i=0;i<2;i++) {
 481         pixels[0] = cm[block[0]];
 482         pixels[1] = cm[block[1]];
 483
 484         pixels += line_size;
 485         block += 8;
 486     }
 487 }
 488
 489 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 490                                         uint8_t *restrict pixels,
 491                                         int line_size)
 492 {
 493     int i, j;
 494
 495     for (i = 0; i < 8; i++) {
 496         for (j = 0; j < 8; j++) {
 497             if (*block < -128)
 498                 *pixels = 0;
 499             else if (*block > 127)
 500                 *pixels = 255;
 501             else
 502                 *pixels = (uint8_t)(*block + 128);
 503             block++;
 504             pixels++;
 505         }
 506         pixels += (line_size - 8);
 507     }
 508 }
 509
 510 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 511                                     int line_size)
 512 {
 513     int i;
 514
 515     /* read the pixels */
 516     for(i=0;i<8;i++) {
 517         pixels[0] = block[0];
 518         pixels[1] = block[1];
 519         pixels[2] = block[2];
 520         pixels[3] = block[3];
 521         pixels[4] = block[4];
 522         pixels[5] = block[5];
 523         pixels[6] = block[6];
 524         pixels[7] = block[7];
 525
 526         pixels += line_size;
 527         block += 8;
 528     }
 529 }
 530
 531 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 532                           int line_size)
 533 {
 534     int i;
 535     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 536
 537     /* read the pixels */
 538     for(i=0;i<8;i++) {
 539         pixels[0] = cm[pixels[0] + block[0]];
 540         pixels[1] = cm[pixels[1] + block[1]];
 541         pixels[2] = cm[pixels[2] + block[2]];
 542         pixels[3] = cm[pixels[3] + block[3]];
 543         pixels[4] = cm[pixels[4] + block[4]];
 544         pixels[5] = cm[pixels[5] + block[5]];
 545         pixels[6] = cm[pixels[6] + block[6]];
 546         pixels[7] = cm[pixels[7] + block[7]];
 547         pixels += line_size;
 548         block += 8;
 549     }
 550 }
 551
 552 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 553                           int line_size)
 554 {
 555     int i;
 556     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 557
 558     /* read the pixels */
 559     for(i=0;i<4;i++) {
 560         pixels[0] = cm[pixels[0] + block[0]];
 561         pixels[1] = cm[pixels[1] + block[1]];
 562         pixels[2] = cm[pixels[2] + block[2]];
 563         pixels[3] = cm[pixels[3] + block[3]];
 564         pixels += line_size;
 565         block += 8;
 566     }
 567 }
 568
 569 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 570                           int line_size)
 571 {
 572     int i;
 573     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 574
 575     /* read the pixels */
 576     for(i=0;i<2;i++) {
 577         pixels[0] = cm[pixels[0] + block[0]];
 578         pixels[1] = cm[pixels[1] + block[1]];
 579         pixels += line_size;
 580         block += 8;
 581     }
 582 }
 583
 584 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 585 {
 586     int i;
 587     for(i=0;i<8;i++) {
 588         pixels[0] += block[0];
 589         pixels[1] += block[1];
 590         pixels[2] += block[2];
 591         pixels[3] += block[3];
 592         pixels[4] += block[4];
 593         pixels[5] += block[5];
 594         pixels[6] += block[6];
 595         pixels[7] += block[7];
 596         pixels += line_size;
 597         block += 8;
 598     }
 599 }
 600
 601 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 602 {
 603     int i;
 604     for(i=0;i<4;i++) {
 605         pixels[0] += block[0];
 606         pixels[1] += block[1];
 607         pixels[2] += block[2];
 608         pixels[3] += block[3];
 609         pixels += line_size;
 610         block += 4;
 611     }
 612 }
 613
 614 static int sum_abs_dctelem_c(DCTELEM *block)
 615 {
 616     int sum=0, i;
 617     for(i=0; i<64; i++)
 618         sum+= FFABS(block[i]);
 619     return sum;
 620 }
 621
 622 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 623 {
 624     int i;
 625
 626     for (i = 0; i < h; i++) {
 627         memset(block, value, 16);
 628         block += line_size;
 629     }
 630 }
 631
 632 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 633 {
 634     int i;
 635
 636     for (i = 0; i < h; i++) {
 637         memset(block, value, 8);
 638         block += line_size;
 639     }
 640 }
 641
 642 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 643 {
 644     int i, j;
 645     uint16_t *dst1 = (uint16_t *) dst;
 646     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 647
 648     for (j = 0; j < 8; j++) {
 649         for (i = 0; i < 8; i++) {
 650             dst1[i] = dst2[i] = src[i] * 0x0101;
 651         }
 652         src  += 8;
 653         dst1 += linesize;
 654         dst2 += linesize;
 655     }
 656 }
 657
 658 #if 0
 659
 660 #define PIXOP2(OPNAME, OP) \
 661 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 662 {\
 663     int i;\
 664     for(i=0; i<h; i++){\
 665         OP(*((uint64_t*)block), AV_RN64(pixels));\
 666         pixels+=line_size;\
 667         block +=line_size;\
 668     }\
 669 }\
 670 \
 671 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 672 {\
 673     int i;\
 674     for(i=0; i<h; i++){\
 675         const uint64_t a= AV_RN64(pixels  );\
 676         const uint64_t b= AV_RN64(pixels+1);\
 677         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 678         pixels+=line_size;\
 679         block +=line_size;\
 680     }\
 681 }\
 682 \
 683 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 684 {\
 685     int i;\
 686     for(i=0; i<h; i++){\
 687         const uint64_t a= AV_RN64(pixels  );\
 688         const uint64_t b= AV_RN64(pixels+1);\
 689         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 690         pixels+=line_size;\
 691         block +=line_size;\
 692     }\
 693 }\
 694 \
 695 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 696 {\
 697     int i;\
 698     for(i=0; i<h; i++){\
 699         const uint64_t a= AV_RN64(pixels          );\
 700         const uint64_t b= AV_RN64(pixels+line_size);\
 701         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 702         pixels+=line_size;\
 703         block +=line_size;\
 704     }\
 705 }\
 706 \
 707 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 708 {\
 709     int i;\
 710     for(i=0; i<h; i++){\
 711         const uint64_t a= AV_RN64(pixels          );\
 712         const uint64_t b= AV_RN64(pixels+line_size);\
 713         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 714         pixels+=line_size;\
 715         block +=line_size;\
 716     }\
 717 }\
 718 \
 719 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 720 {\
 721         int i;\
 722         const uint64_t a= AV_RN64(pixels  );\
 723         const uint64_t b= AV_RN64(pixels+1);\
 724         uint64_t l0=  (a&0x0303030303030303ULL)\
 725                     + (b&0x0303030303030303ULL)\
 726                     + 0x0202020202020202ULL;\
 727         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 728                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 729         uint64_t l1,h1;\
 730 \
 731         pixels+=line_size;\
 732         for(i=0; i<h; i+=2){\
 733             uint64_t a= AV_RN64(pixels  );\
 734             uint64_t b= AV_RN64(pixels+1);\
 735             l1=  (a&0x0303030303030303ULL)\
 736                + (b&0x0303030303030303ULL);\
 737             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 738               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 739             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 740             pixels+=line_size;\
 741             block +=line_size;\
 742             a= AV_RN64(pixels  );\
 743             b= AV_RN64(pixels+1);\
 744             l0=  (a&0x0303030303030303ULL)\
 745                + (b&0x0303030303030303ULL)\
 746                + 0x0202020202020202ULL;\
 747             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 748               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 749             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 750             pixels+=line_size;\
 751             block +=line_size;\
 752         }\
 753 }\
 754 \
 755 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 756 {\
 757         int i;\
 758         const uint64_t a= AV_RN64(pixels  );\
 759         const uint64_t b= AV_RN64(pixels+1);\
 760         uint64_t l0=  (a&0x0303030303030303ULL)\
 761                     + (b&0x0303030303030303ULL)\
 762                     + 0x0101010101010101ULL;\
 763         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 764                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 765         uint64_t l1,h1;\
 766 \
 767         pixels+=line_size;\
 768         for(i=0; i<h; i+=2){\
 769             uint64_t a= AV_RN64(pixels  );\
 770             uint64_t b= AV_RN64(pixels+1);\
 771             l1=  (a&0x0303030303030303ULL)\
 772                + (b&0x0303030303030303ULL);\
 773             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 774               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 775             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 776             pixels+=line_size;\
 777             block +=line_size;\
 778             a= AV_RN64(pixels  );\
 779             b= AV_RN64(pixels+1);\
 780             l0=  (a&0x0303030303030303ULL)\
 781                + (b&0x0303030303030303ULL)\
 782                + 0x0101010101010101ULL;\
 783             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 784               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 785             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 786             pixels+=line_size;\
 787             block +=line_size;\
 788         }\
 789 }\
 790 \
 791 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 792 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 793 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 794 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 795 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 796 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 797 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 798
 799 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 800 #else // 64 bit variant
 801
 802 #define PIXOP2(OPNAME, OP) \
 803 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 804     int i;\
 805     for(i=0; i<h; i++){\
 806         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 807         pixels+=line_size;\
 808         block +=line_size;\
 809     }\
 810 }\
 811 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 812     int i;\
 813     for(i=0; i<h; i++){\
 814         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 815         pixels+=line_size;\
 816         block +=line_size;\
 817     }\
 818 }\
 819 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 820     int i;\
 821     for(i=0; i<h; i++){\
 822         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 823         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 824         pixels+=line_size;\
 825         block +=line_size;\
 826     }\
 827 }\
 828 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 829     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 830 }\
 831 \
 832 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 833                                                 int src_stride1, int src_stride2, int h){\
 834     int i;\
 835     for(i=0; i<h; i++){\
 836         uint32_t a,b;\
 837         a= AV_RN32(&src1[i*src_stride1  ]);\
 838         b= AV_RN32(&src2[i*src_stride2  ]);\
 839         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 840         a= AV_RN32(&src1[i*src_stride1+4]);\
 841         b= AV_RN32(&src2[i*src_stride2+4]);\
 842         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 843     }\
 844 }\
 845 \
 846 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 847                                                 int src_stride1, int src_stride2, int h){\
 848     int i;\
 849     for(i=0; i<h; i++){\
 850         uint32_t a,b;\
 851         a= AV_RN32(&src1[i*src_stride1  ]);\
 852         b= AV_RN32(&src2[i*src_stride2  ]);\
 853         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 854         a= AV_RN32(&src1[i*src_stride1+4]);\
 855         b= AV_RN32(&src2[i*src_stride2+4]);\
 856         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 857     }\
 858 }\
 859 \
 860 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 861                                                 int src_stride1, int src_stride2, int h){\
 862     int i;\
 863     for(i=0; i<h; i++){\
 864         uint32_t a,b;\
 865         a= AV_RN32(&src1[i*src_stride1  ]);\
 866         b= AV_RN32(&src2[i*src_stride2  ]);\
 867         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 868     }\
 869 }\
 870 \
 871 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 872                                                 int src_stride1, int src_stride2, int h){\
 873     int i;\
 874     for(i=0; i<h; i++){\
 875         uint32_t a,b;\
 876         a= AV_RN16(&src1[i*src_stride1  ]);\
 877         b= AV_RN16(&src2[i*src_stride2  ]);\
 878         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 879     }\
 880 }\
 881 \
 882 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 883                                                 int src_stride1, int src_stride2, int h){\
 884     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 885     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 886 }\
 887 \
 888 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 889                                                 int src_stride1, int src_stride2, int h){\
 890     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 891     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 892 }\
 893 \
 894 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 895     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 896 }\
 897 \
 898 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 899     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 900 }\
 901 \
 902 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 903     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 904 }\
 905 \
 906 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 907     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 908 }\
 909 \
 910 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 911                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 912     int i;\
 913     for(i=0; i<h; i++){\
 914         uint32_t a, b, c, d, l0, l1, h0, h1;\
 915         a= AV_RN32(&src1[i*src_stride1]);\
 916         b= AV_RN32(&src2[i*src_stride2]);\
 917         c= AV_RN32(&src3[i*src_stride3]);\
 918         d= AV_RN32(&src4[i*src_stride4]);\
 919         l0=  (a&0x03030303UL)\
 920            + (b&0x03030303UL)\
 921            + 0x02020202UL;\
 922         h0= ((a&0xFCFCFCFCUL)>>2)\
 923           + ((b&0xFCFCFCFCUL)>>2);\
 924         l1=  (c&0x03030303UL)\
 925            + (d&0x03030303UL);\
 926         h1= ((c&0xFCFCFCFCUL)>>2)\
 927           + ((d&0xFCFCFCFCUL)>>2);\
 928         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 929         a= AV_RN32(&src1[i*src_stride1+4]);\
 930         b= AV_RN32(&src2[i*src_stride2+4]);\
 931         c= AV_RN32(&src3[i*src_stride3+4]);\
 932         d= AV_RN32(&src4[i*src_stride4+4]);\
 933         l0=  (a&0x03030303UL)\
 934            + (b&0x03030303UL)\
 935            + 0x02020202UL;\
 936         h0= ((a&0xFCFCFCFCUL)>>2)\
 937           + ((b&0xFCFCFCFCUL)>>2);\
 938         l1=  (c&0x03030303UL)\
 939            + (d&0x03030303UL);\
 940         h1= ((c&0xFCFCFCFCUL)>>2)\
 941           + ((d&0xFCFCFCFCUL)>>2);\
 942         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 943     }\
 944 }\
 945 \
 946 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 947     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 948 }\
 949 \
 950 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 951     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 952 }\
 953 \
 954 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 955     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 956 }\
 957 \
 958 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 959     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 960 }\
 961 \
 962 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 963                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 964     int i;\
 965     for(i=0; i<h; i++){\
 966         uint32_t a, b, c, d, l0, l1, h0, h1;\
 967         a= AV_RN32(&src1[i*src_stride1]);\
 968         b= AV_RN32(&src2[i*src_stride2]);\
 969         c= AV_RN32(&src3[i*src_stride3]);\
 970         d= AV_RN32(&src4[i*src_stride4]);\
 971         l0=  (a&0x03030303UL)\
 972            + (b&0x03030303UL)\
 973            + 0x01010101UL;\
 974         h0= ((a&0xFCFCFCFCUL)>>2)\
 975           + ((b&0xFCFCFCFCUL)>>2);\
 976         l1=  (c&0x03030303UL)\
 977            + (d&0x03030303UL);\
 978         h1= ((c&0xFCFCFCFCUL)>>2)\
 979           + ((d&0xFCFCFCFCUL)>>2);\
 980         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 981         a= AV_RN32(&src1[i*src_stride1+4]);\
 982         b= AV_RN32(&src2[i*src_stride2+4]);\
 983         c= AV_RN32(&src3[i*src_stride3+4]);\
 984         d= AV_RN32(&src4[i*src_stride4+4]);\
 985         l0=  (a&0x03030303UL)\
 986            + (b&0x03030303UL)\
 987            + 0x01010101UL;\
 988         h0= ((a&0xFCFCFCFCUL)>>2)\
 989           + ((b&0xFCFCFCFCUL)>>2);\
 990         l1=  (c&0x03030303UL)\
 991            + (d&0x03030303UL);\
 992         h1= ((c&0xFCFCFCFCUL)>>2)\
 993           + ((d&0xFCFCFCFCUL)>>2);\
 994         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 995     }\
 996 }\
 997 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
 998                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 999     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1000     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1001 }\
1002 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1003                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1004     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1005     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1006 }\
1007 \
1008 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1009 {\
1010         int i, a0, b0, a1, b1;\
1011         a0= pixels[0];\
1012         b0= pixels[1] + 2;\
1013         a0 += b0;\
1014         b0 += pixels[2];\
1015 \
1016         pixels+=line_size;\
1017         for(i=0; i<h; i+=2){\
1018             a1= pixels[0];\
1019             b1= pixels[1];\
1020             a1 += b1;\
1021             b1 += pixels[2];\
1022 \
1023             block[0]= (a1+a0)>>2; /* FIXME non put */\
1024             block[1]= (b1+b0)>>2;\
1025 \
1026             pixels+=line_size;\
1027             block +=line_size;\
1028 \
1029             a0= pixels[0];\
1030             b0= pixels[1] + 2;\
1031             a0 += b0;\
1032             b0 += pixels[2];\
1033 \
1034             block[0]= (a1+a0)>>2;\
1035             block[1]= (b1+b0)>>2;\
1036             pixels+=line_size;\
1037             block +=line_size;\
1038         }\
1039 }\
1040 \
1041 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1042 {\
1043         int i;\
1044         const uint32_t a= AV_RN32(pixels  );\
1045         const uint32_t b= AV_RN32(pixels+1);\
1046         uint32_t l0=  (a&0x03030303UL)\
1047                     + (b&0x03030303UL)\
1048                     + 0x02020202UL;\
1049         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1050                    + ((b&0xFCFCFCFCUL)>>2);\
1051         uint32_t l1,h1;\
1052 \
1053         pixels+=line_size;\
1054         for(i=0; i<h; i+=2){\
1055             uint32_t a= AV_RN32(pixels  );\
1056             uint32_t b= AV_RN32(pixels+1);\
1057             l1=  (a&0x03030303UL)\
1058                + (b&0x03030303UL);\
1059             h1= ((a&0xFCFCFCFCUL)>>2)\
1060               + ((b&0xFCFCFCFCUL)>>2);\
1061             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1062             pixels+=line_size;\
1063             block +=line_size;\
1064             a= AV_RN32(pixels  );\
1065             b= AV_RN32(pixels+1);\
1066             l0=  (a&0x03030303UL)\
1067                + (b&0x03030303UL)\
1068                + 0x02020202UL;\
1069             h0= ((a&0xFCFCFCFCUL)>>2)\
1070               + ((b&0xFCFCFCFCUL)>>2);\
1071             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1072             pixels+=line_size;\
1073             block +=line_size;\
1074         }\
1075 }\
1076 \
1077 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1078 {\
1079     int j;\
1080     for(j=0; j<2; j++){\
1081         int i;\
1082         const uint32_t a= AV_RN32(pixels  );\
1083         const uint32_t b= AV_RN32(pixels+1);\
1084         uint32_t l0=  (a&0x03030303UL)\
1085                     + (b&0x03030303UL)\
1086                     + 0x02020202UL;\
1087         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1088                    + ((b&0xFCFCFCFCUL)>>2);\
1089         uint32_t l1,h1;\
1090 \
1091         pixels+=line_size;\
1092         for(i=0; i<h; i+=2){\
1093             uint32_t a= AV_RN32(pixels  );\
1094             uint32_t b= AV_RN32(pixels+1);\
1095             l1=  (a&0x03030303UL)\
1096                + (b&0x03030303UL);\
1097             h1= ((a&0xFCFCFCFCUL)>>2)\
1098               + ((b&0xFCFCFCFCUL)>>2);\
1099             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1100             pixels+=line_size;\
1101             block +=line_size;\
1102             a= AV_RN32(pixels  );\
1103             b= AV_RN32(pixels+1);\
1104             l0=  (a&0x03030303UL)\
1105                + (b&0x03030303UL)\
1106                + 0x02020202UL;\
1107             h0= ((a&0xFCFCFCFCUL)>>2)\
1108               + ((b&0xFCFCFCFCUL)>>2);\
1109             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1110             pixels+=line_size;\
1111             block +=line_size;\
1112         }\
1113         pixels+=4-line_size*(h+1);\
1114         block +=4-line_size*h;\
1115     }\
1116 }\
1117 \
1118 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1119 {\
1120     int j;\
1121     for(j=0; j<2; j++){\
1122         int i;\
1123         const uint32_t a= AV_RN32(pixels  );\
1124         const uint32_t b= AV_RN32(pixels+1);\
1125         uint32_t l0=  (a&0x03030303UL)\
1126                     + (b&0x03030303UL)\
1127                     + 0x01010101UL;\
1128         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1129                    + ((b&0xFCFCFCFCUL)>>2);\
1130         uint32_t l1,h1;\
1131 \
1132         pixels+=line_size;\
1133         for(i=0; i<h; i+=2){\
1134             uint32_t a= AV_RN32(pixels  );\
1135             uint32_t b= AV_RN32(pixels+1);\
1136             l1=  (a&0x03030303UL)\
1137                + (b&0x03030303UL);\
1138             h1= ((a&0xFCFCFCFCUL)>>2)\
1139               + ((b&0xFCFCFCFCUL)>>2);\
1140             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1141             pixels+=line_size;\
1142             block +=line_size;\
1143             a= AV_RN32(pixels  );\
1144             b= AV_RN32(pixels+1);\
1145             l0=  (a&0x03030303UL)\
1146                + (b&0x03030303UL)\
1147                + 0x01010101UL;\
1148             h0= ((a&0xFCFCFCFCUL)>>2)\
1149               + ((b&0xFCFCFCFCUL)>>2);\
1150             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1151             pixels+=line_size;\
1152             block +=line_size;\
1153         }\
1154         pixels+=4-line_size*(h+1);\
1155         block +=4-line_size*h;\
1156     }\
1157 }\
1158 \
1159 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1160 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1161 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1162 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1163 av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1164 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1165 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1166 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1167
1168 #define op_avg(a, b) a = rnd_avg32(a, b)
1169 #endif
1170 #define op_put(a, b) a = b
1171
1172 PIXOP2(avg, op_avg)
1173 PIXOP2(put, op_put)
1174 #undef op_avg
1175 #undef op_put
1176
/* The "no rounding" full-pel put functions are plain aliases of the regular
 * put functions (presumably because a straight copy involves no averaging,
 * so the rounding mode cannot matter). */
1177 #define put_no_rnd_pixels8_c  put_pixels8_c
1178 #define put_no_rnd_pixels16_c put_pixels16_c
1179
/* Average of two values, rounding halves up.  Every argument is wrapped in
 * parentheses so that callers may pass expressions containing operators of
 * lower precedence than '+' (e.g. '|' or '?:') without being mis-grouped.
 * Each argument is evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
/* Average of four values, rounded to nearest (adds 2 before dividing by 4).
 * Arguments are parenthesized for the same reason as in avg2(). */
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1182
/* Fixed-signature wrapper around put_no_rnd_pixels16_l2(): the single
 * 'stride' argument is used as the destination stride and as the stride of
 * both source blocks 'a' and 'b'; 'h' is the number of rows. */
1183 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1184     put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1185 }
1186
/* Fixed-signature wrapper around put_no_rnd_pixels8_l2(): the single
 * 'stride' argument is used as the destination stride and as the stride of
 * both source blocks 'a' and 'b'; 'h' is the number of rows. */
1187 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1188     put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1189 }
1190
/**
 * Bilinear interpolation of an 8-pixel-wide block using one fractional
 * offset for the whole block.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source; columns 0..8 of the current and the next row are read
 * @param stride  distance in bytes between rows of both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbour, in 1/16 units
 * @param y16     vertical weight of the lower neighbour, in 1/16 units
 * @param rounder value added before the final shift
 *
 * The four weights always add up to 16*16 = 256, hence the shift by 8.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16);
    const int w_tr = (     x16) * (16 - y16);
    const int w_bl = (16 - x16) * (     y16);
    const int w_br = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]   + w_tr * src[col + 1] +
                        w_bl * below[col] + w_br * below[col + 1] + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1213
/**
 * Warped motion compensation of an 8-pixel-wide block.
 *
 * Every output pixel has its own source position.  The position of the first
 * pixel of the first row is (ox, oy); it advances by (dxx, dyx) per column
 * and the row start advances by (dxy, dyy) per row.  After dropping the low
 * 16 bits, the position still carries 'shift' fractional bits that drive a
 * bilinear interpolation.
 *
 * Positions whose integer part falls outside [0, width-1) horizontally or
 * [0, height-1) vertically are clamped in that direction with av_clip() and
 * interpolated only along the other axis; if both are outside, the clamped
 * sample is copied unchanged.
 *
 * @param dst    destination block (8 bytes per row)
 * @param src    source plane
 * @param stride row distance in bytes of both dst and src
 * @param h      number of rows to produce
 * @param shift  number of fractional bits left after the >>16
 * @param r      rounding constant added before the final shift by 2*shift
 * @param width  source width used for the range check
 * @param height source height used for the range check
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one   = 1 << shift;   /* weight corresponding to 1.0 */
    const int max_x = width  - 1;   /* last column that still has a right neighbour is max_x-1 */
    const int max_y = height - 1;   /* last row that still has a lower neighbour is max_y-1 */
    int row;

    for (row = 0; row < h; row++) {
        uint8_t *out = dst + row * stride;
        int pos_x = ox;
        int pos_y = oy;
        int col;

        for (col = 0; col < 8; col++) { //XXX FIXME optimize
            int ix = pos_x >> 16;
            int iy = pos_y >> 16;
            const int fx = ix & (one - 1);
            const int fy = iy & (one - 1);
            int at;

            ix >>= shift;
            iy >>= shift;

            /* the unsigned casts reject negative coordinates in the same compare */
            if ((unsigned)ix < max_x && (unsigned)iy < max_y) {
                /* fully inside: 2-D bilinear interpolation */
                at = ix + iy * stride;
                out[col] = (  (  src[at             ] * (one - fx)
                               + src[at          + 1] *        fx ) * (one - fy)
                            + (  src[at + stride    ] * (one - fx)
                               + src[at + stride + 1] *        fx ) *        fy
                            + r) >> (shift * 2);
            } else if ((unsigned)ix < max_x) {
                /* row out of range: clamp it, interpolate horizontally only */
                at = ix + av_clip(iy, 0, max_y) * stride;
                out[col] = ( (  src[at    ] * (one - fx)
                              + src[at + 1] *        fx ) * one
                            + r) >> (shift * 2);
            } else if ((unsigned)iy < max_y) {
                /* column out of range: clamp it, interpolate vertically only */
                at = av_clip(ix, 0, max_x) + iy * stride;
                out[col] = ( (  src[at         ] * (one - fy)
                              + src[at + stride] *        fy ) * one
                            + r) >> (shift * 2);
            } else {
                /* both out of range: nearest clamped sample */
                at = av_clip(ix, 0, max_x) + av_clip(iy, 0, max_y) * stride;
                out[col] = src[at];
            }

            pos_x += dxx;
            pos_y += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1271
/**
 * Full-pel case of the tpel "put" family: dispatches to the fixed-width copy
 * routine matching 'width'.  Widths other than 2, 4, 8 or 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1280
/**
 * Horizontal interpolation weighting the current sample 2:1 against its
 * right-hand neighbour (presumably the 1/3-pel position).
 * The division by 3 is done as a multiply by 683 and a shift by 11
 * (683/2048 is roughly 1/3).
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1291
/**
 * Horizontal interpolation weighting the current sample 1:2 against its
 * right-hand neighbour (presumably the 2/3-pel position).
 * The division by 3 is done as a multiply by 683 and a shift by 11.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1302
/**
 * Vertical interpolation weighting the current sample 2:1 against the sample
 * one row below (presumably the 1/3-pel position).
 * The division by 3 is done as a multiply by 683 and a shift by 11.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + below[col] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1313
/**
 * 2-D interpolation over the 2x2 neighbourhood with weights
 *   4 3
 *   3 2
 * (current, right / below, below-right).  The weights sum to 12; the division
 * by 12 is done as a multiply by 2731 and a shift by 15.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1324
/**
 * 2-D interpolation over the 2x2 neighbourhood with weights
 *   3 2
 *   4 3
 * (current, right / below, below-right).  The weights sum to 12; the division
 * by 12 is done as a multiply by 2731 and a shift by 15.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1335
/**
 * Vertical interpolation weighting the current sample 1:2 against the sample
 * one row below (presumably the 2/3-pel position).
 * The division by 3 is done as a multiply by 683 and a shift by 11.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*below[col] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1346
/**
 * 2-D interpolation over the 2x2 neighbourhood with weights
 *   3 4
 *   2 3
 * (current, right / below, below-right).  The weights sum to 12; the division
 * by 12 is done as a multiply by 2731 and a shift by 15.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1357
/**
 * 2-D interpolation over the 2x2 neighbourhood with weights
 *   2 3
 *   3 4
 * (current, right / below, below-right).  The weights sum to 12; the division
 * by 12 is done as a multiply by 2731 and a shift by 15.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1368
/**
 * Full-pel case of the tpel "avg" family: dispatches to the fixed-width
 * averaging routine matching 'width'.  Widths other than 2, 4, 8 or 16 do
 * nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1377
/**
 * Averaging variant of the 2:1 horizontal interpolation: the interpolated
 * value (current sample weighted 2:1 against its right-hand neighbour,
 * divided by 3 via *683 >> 11) is averaged, rounding up, with the value
 * already stored in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1388
/**
 * Averaging variant of the 1:2 horizontal interpolation: the interpolated
 * value (current sample weighted 1:2 against its right-hand neighbour,
 * divided by 3 via *683 >> 11) is averaged, rounding up, with the value
 * already stored in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1399
/**
 * Averaging variant of the 2:1 vertical interpolation: the interpolated
 * value (current sample weighted 2:1 against the sample one row below,
 * divided by 3 via *683 >> 11) is averaged, rounding up, with the value
 * already stored in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + below[col] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1410
/**
 * Averaging variant of the 2-D interpolation with weights
 *   4 3
 *   3 2
 * (sum 12, divided via *2731 >> 15).  The interpolated value is averaged,
 * rounding up, with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1421
/**
 * Averaging variant of the 2-D interpolation with weights
 *   3 2
 *   4 3
 * (sum 12, divided via *2731 >> 15).  The interpolated value is averaged,
 * rounding up, with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1432
/**
 * Averaging variant of the 1:2 vertical interpolation: the interpolated
 * value (current sample weighted 1:2 against the sample one row below,
 * divided by 3 via *683 >> 11) is averaged, rounding up, with the value
 * already stored in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*below[col] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1443
/**
 * Averaging variant of the 2-D interpolation with weights
 *   3 4
 *   2 3
 * (sum 12, divided via *2731 >> 15).  The interpolated value is averaged,
 * rounding up, with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1454
/**
 * Averaging variant of the 2-D interpolation with weights
 *   2 3
 *   3 4
 * (sum 12, divided via *2731 >> 15).  The interpolated value is averaged,
 * rounding up, with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/* Currently disabled.  TPEL_WIDTH(width) generates fixed-width "put" entry
 * points (signature without a width argument) for all nine tpel positions;
 * each wrapper simply forwards to the generic helper with 'width' baked in.
 * The forwarding statements are plain calls (a leading 'void' would turn
 * them into invalid declarations and break the build once enabled). */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1486
/**
 * Generator for the h264_chroma_mc{2,4,8}_c functions.
 *
 * H264_CHROMA_MC_FUNC(OPNAME, OP, W) emits one function
 * OPNAME ## h264_chroma_mc ## W ## _c that processes W pixels per row over h
 * rows.  x and y (each asserted to be in 0..7) select the bilinear weights
 *   A=(8-x)(8-y)  B=x(8-y)  C=(8-x)y  D=xy
 * for the current, right, lower and lower-right sample.  The weighted sum is
 * handed unscaled to OP(dst_pixel, sum), which is responsible for rounding,
 * scaling and storing.
 *
 * When D is zero, x or y is zero, so at most one of B and C is non-zero and
 * the filter collapses to two taps: E = B+C is the remaining neighbour
 * weight and 'step' selects that neighbour (next row if C is non-zero,
 * next column otherwise).
 *
 * H264_CHROMA_MC(OPNAME, OP) instantiates the generator for widths 2, 4, 8.
 */
#define H264_CHROMA_MC_FUNC(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        /* both offsets non-zero: full four-tap filter */\
        for(i=0; i<h; i++){\
            for(j=0; j<W; j++){\
                OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* at most one direction is fractional: two-tap filter */\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            for(j=0; j<W; j++){\
                OP(dst[j], (A*src[j] + E*src[step+j]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 2)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 4)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 8)
1587
/* Store operators for H264_CHROMA_MC.  'b' is the unscaled weighted sum; the
 * four weights used there add up to 64, so (b + 32) >> 6 scales it back with
 * rounding to nearest.
 *   op_put: store the scaled value.
 *   op_avg: average the scaled value with the pixel already in 'a', rounding up.
 * Both macros are temporary and are undefined again right after the two
 * instantiations (put_ and avg_ function families). */
1588 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1589 #define op_put(a, b) a = (((b) + 32)>>6)
1590
1591 H264_CHROMA_MC(put_       , op_put)
1592 H264_CHROMA_MC(avg_       , op_avg)
1593 #undef op_avg
1594 #undef op_put
1595
/**
 * 8-pixel-wide bilinear chroma interpolation with reduced rounding.
 *
 * x and y (each asserted to be in 0..7) select the weights of the current,
 * right, lower and lower-right sample; the weights add up to 64.  The bias
 * added before the shift by 6 is 32 - 4 = 28 rather than 32, so results
 * are rounded slightly downwards ("no_rnd").
 *
 * @param dst    destination, 8 bytes written per row
 * @param src    source; columns 0..8 of the current and next row are read
 * @param stride row distance in bytes of both dst and src
 * @param h      number of rows
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w_cur   = (8 - x) * (8 - y);
    const int w_right = (    x) * (8 - y);
    const int w_down  = (8 - x) * (    y);
    const int w_diag  = (    x) * (    y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_cur  * src[col]          + w_right * src[col + 1] +
                        w_down * src[stride + col] + w_diag  * src[stride + col + 1] + 32 - 4) >> 6;
        }
        dst += stride;
        src += stride;
    }
}
1619
/**
 * Averaging variant of the 8-pixel-wide reduced-rounding chroma
 * interpolation.
 *
 * The bilinear value is computed exactly as in the "put" variant (weights
 * from x and y adding up to 64, bias 32 - 4, shift by 6) and is then
 * combined with the pixel already present in dst through avg2().
 *
 * @param dst    destination, 8 bytes read and written per row
 * @param src    source; columns 0..8 of the current and next row are read
 * @param stride row distance in bytes of both dst and src
 * @param h      number of rows
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w_cur   = (8 - x) * (8 - y);
    const int w_right = (    x) * (8 - y);
    const int w_down  = (8 - x) * (    y);
    const int w_diag  = (    x) * (    y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = avg2(dst[col], ((w_cur  * src[col]          + w_right * src[col + 1] +
                                        w_down * src[stride + col] + w_diag  * src[stride + col + 1] + 32 - 4) >> 6));
        }
        dst += stride;
        src += stride;
    }
}
1643
/**
 * Generate the C reference MPEG-4 quarter-pel motion compensation functions
 * for one operation flavour.
 *
 * @param r      not referenced anywhere in the macro body
 * @param OPNAME prefix pasted onto every generated function name and onto the
 *               pixelsN_l2 / pixelsN_l4 helpers that produce the final output
 * @param RND    infix pasted after "put" to select which put* lowpass and
 *               pixelsN_l2 helpers build the intermediate planes
 * @param OP     two-argument store macro OP(dst_lvalue, filter_sum); it is
 *               expanded where a local `cm` table pointer is in scope
 *
 * The lowpass kernels apply a symmetric 8-tap filter whose tap pairs are
 * weighted 20, -6, 3 and -1 (the weights sum to 32). Taps that would fall
 * outside the N+1 input samples are reflected back inside, which is why the
 * first and last few outputs of each kernel use irregular indices.
 *
 * The qpelN_mcXY_c entry points combine those kernels; judging from the
 * bodies, X and Y are the horizontal and vertical offsets in quarter samples
 * (mc20 is the bare horizontal filter, mc02 the bare vertical one).
 * copy_block9/17 and pixelsN_l2/l4 are defined outside this macro; they are
 * presumably a strided block copy and a 2-/4-source average respectively.
 */
1644 #define QPEL_MC(r, OPNAME, RND, OP) \
/* Horizontal 8-tap lowpass: h rows, 8 outputs per row from src[0..8]. */\
1645 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1646     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1647     int i;\
1648     for(i=0; i<h; i++)\
1649     {\
1650         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1651         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1652         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1653         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1654         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1655         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1656         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1657         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1658         dst+=dstStride;\
1659         src+=srcStride;\
1660     }\
1661 }\
1662 \
/* Vertical 8-tap lowpass: 8 columns, 8 output rows from 9 input rows. */\
1663 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1664     const int w=8;\
1665     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1666     int i;\
1667     for(i=0; i<w; i++)\
1668     {\
1669         const int src0= src[0*srcStride];\
1670         const int src1= src[1*srcStride];\
1671         const int src2= src[2*srcStride];\
1672         const int src3= src[3*srcStride];\
1673         const int src4= src[4*srcStride];\
1674         const int src5= src[5*srcStride];\
1675         const int src6= src[6*srcStride];\
1676         const int src7= src[7*srcStride];\
1677         const int src8= src[8*srcStride];\
1678         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1679         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1680         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1681         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1682         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1683         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1684         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1685         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1686         dst++;\
1687         src++;\
1688     }\
1689 }\
1690 \
/* Horizontal 8-tap lowpass: h rows, 16 outputs per row from src[0..16]. */\
1691 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1692     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1693     int i;\
1694     \
1695     for(i=0; i<h; i++)\
1696     {\
1697         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1698         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1699         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1700         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1701         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1702         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1703         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1704         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1705         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1706         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1707         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1708         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1709         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1710         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1711         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1712         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1713         dst+=dstStride;\
1714         src+=srcStride;\
1715     }\
1716 }\
1717 \
/* Vertical 8-tap lowpass: 16 columns, 16 output rows from 17 input rows. */\
1718 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1719     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1720     int i;\
1721     const int w=16;\
1722     for(i=0; i<w; i++)\
1723     {\
1724         const int src0= src[0*srcStride];\
1725         const int src1= src[1*srcStride];\
1726         const int src2= src[2*srcStride];\
1727         const int src3= src[3*srcStride];\
1728         const int src4= src[4*srcStride];\
1729         const int src5= src[5*srcStride];\
1730         const int src6= src[6*srcStride];\
1731         const int src7= src[7*srcStride];\
1732         const int src8= src[8*srcStride];\
1733         const int src9= src[9*srcStride];\
1734         const int src10= src[10*srcStride];\
1735         const int src11= src[11*srcStride];\
1736         const int src12= src[12*srcStride];\
1737         const int src13= src[13*srcStride];\
1738         const int src14= src[14*srcStride];\
1739         const int src15= src[15*srcStride];\
1740         const int src16= src[16*srcStride];\
1741         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1742         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1743         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1744         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1745         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1746         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1747         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1748         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1749         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1750         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1751         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1752         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1753         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1754         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1755         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1756         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1757         dst++;\
1758         src++;\
1759     }\
1760 }\
1761 \
/* 8x8 entry points. mcX0: horizontal filter only; X=1/3 average it with src / src+1. */\
1762 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1763     uint8_t half[64];\
1764     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1765     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1766 }\
1767 \
1768 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1769     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1770 }\
1771 \
1772 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1773     uint8_t half[64];\
1774     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1775     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1776 }\
1777 \
/* mc0Y: vertical filter applied to a local copy (row stride 16) made by copy_block9. */\
1778 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1779     uint8_t full[16*9];\
1780     uint8_t half[64];\
1781     copy_block9(full, src, 16, stride, 9);\
1782     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1783     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1784 }\
1785 \
1786 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1787     uint8_t full[16*9];\
1788     copy_block9(full, src, 16, stride, 9);\
1789     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1790 }\
1791 \
1792 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1793     uint8_t full[16*9];\
1794     uint8_t half[64];\
1795     copy_block9(full, src, 16, stride, 9);\
1796     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1797     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1798 }\
/* ff_*_old_c variants are non-static and build halfH, halfV and halfHV as separate planes. */\
1799 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1800     uint8_t full[16*9];\
1801     uint8_t halfH[72];\
1802     uint8_t halfV[64];\
1803     uint8_t halfHV[64];\
1804     copy_block9(full, src, 16, stride, 9);\
1805     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1806     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1807     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1808     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1809 }\
/* mc11: halfH (8 wide, 9 rows) is combined with the copied source in place, then filtered vertically. */\
1810 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1811     uint8_t full[16*9];\
1812     uint8_t halfH[72];\
1813     uint8_t halfHV[64];\
1814     copy_block9(full, src, 16, stride, 9);\
1815     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1816     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1817     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1818     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1819 }\
1820 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1821     uint8_t full[16*9];\
1822     uint8_t halfH[72];\
1823     uint8_t halfV[64];\
1824     uint8_t halfHV[64];\
1825     copy_block9(full, src, 16, stride, 9);\
1826     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1827     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1828     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1829     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1830 }\
1831 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1832     uint8_t full[16*9];\
1833     uint8_t halfH[72];\
1834     uint8_t halfHV[64];\
1835     copy_block9(full, src, 16, stride, 9);\
1836     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1837     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1838     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1839     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1840 }\
1841 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1842     uint8_t full[16*9];\
1843     uint8_t halfH[72];\
1844     uint8_t halfV[64];\
1845     uint8_t halfHV[64];\
1846     copy_block9(full, src, 16, stride, 9);\
1847     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1848     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1849     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1850     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1851 }\
1852 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1853     uint8_t full[16*9];\
1854     uint8_t halfH[72];\
1855     uint8_t halfHV[64];\
1856     copy_block9(full, src, 16, stride, 9);\
1857     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1858     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1859     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1860     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1861 }\
1862 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1863     uint8_t full[16*9];\
1864     uint8_t halfH[72];\
1865     uint8_t halfV[64];\
1866     uint8_t halfHV[64];\
1867     copy_block9(full, src, 16, stride, 9);\
1868     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1869     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1870     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1871     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1872 }\
1873 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1874     uint8_t full[16*9];\
1875     uint8_t halfH[72];\
1876     uint8_t halfHV[64];\
1877     copy_block9(full, src, 16, stride, 9);\
1878     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1879     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1880     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1881     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1882 }\
1883 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1884     uint8_t halfH[72];\
1885     uint8_t halfHV[64];\
1886     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1887     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1888     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1889 }\
1890 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1891     uint8_t halfH[72];\
1892     uint8_t halfHV[64];\
1893     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1894     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1895     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1896 }\
1897 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1898     uint8_t full[16*9];\
1899     uint8_t halfH[72];\
1900     uint8_t halfV[64];\
1901     uint8_t halfHV[64];\
1902     copy_block9(full, src, 16, stride, 9);\
1903     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1904     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1905     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1906     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1907 }\
1908 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1909     uint8_t full[16*9];\
1910     uint8_t halfH[72];\
1911     copy_block9(full, src, 16, stride, 9);\
1912     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1913     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1914     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1915 }\
1916 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1917     uint8_t full[16*9];\
1918     uint8_t halfH[72];\
1919     uint8_t halfV[64];\
1920     uint8_t halfHV[64];\
1921     copy_block9(full, src, 16, stride, 9);\
1922     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1923     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1924     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1925     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1926 }\
1927 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1928     uint8_t full[16*9];\
1929     uint8_t halfH[72];\
1930     copy_block9(full, src, 16, stride, 9);\
1931     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1932     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1933     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1934 }\
1935 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1936     uint8_t halfH[72];\
1937     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1938     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1939 }\
1940 \
/* 16x16 entry points: same structure as the 8x8 ones, with 17-row local copies at row stride 24. */\
1941 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1942     uint8_t half[256];\
1943     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1944     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1945 }\
1946 \
1947 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1948     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1949 }\
1950 \
1951 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1952     uint8_t half[256];\
1953     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1954     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1955 }\
1956 \
1957 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1958     uint8_t full[24*17];\
1959     uint8_t half[256];\
1960     copy_block17(full, src, 24, stride, 17);\
1961     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1962     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1963 }\
1964 \
1965 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1966     uint8_t full[24*17];\
1967     copy_block17(full, src, 24, stride, 17);\
1968     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1969 }\
1970 \
1971 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1972     uint8_t full[24*17];\
1973     uint8_t half[256];\
1974     copy_block17(full, src, 24, stride, 17);\
1975     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1976     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1977 }\
1978 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1979     uint8_t full[24*17];\
1980     uint8_t halfH[272];\
1981     uint8_t halfV[256];\
1982     uint8_t halfHV[256];\
1983     copy_block17(full, src, 24, stride, 17);\
1984     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1985     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1986     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1987     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1988 }\
1989 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1990     uint8_t full[24*17];\
1991     uint8_t halfH[272];\
1992     uint8_t halfHV[256];\
1993     copy_block17(full, src, 24, stride, 17);\
1994     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1995     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1996     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1997     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1998 }\
1999 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2000     uint8_t full[24*17];\
2001     uint8_t halfH[272];\
2002     uint8_t halfV[256];\
2003     uint8_t halfHV[256];\
2004     copy_block17(full, src, 24, stride, 17);\
2005     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2006     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2007     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2008     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2009 }\
2010 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2011     uint8_t full[24*17];\
2012     uint8_t halfH[272];\
2013     uint8_t halfHV[256];\
2014     copy_block17(full, src, 24, stride, 17);\
2015     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2016     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2017     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2018     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2019 }\
2020 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2021     uint8_t full[24*17];\
2022     uint8_t halfH[272];\
2023     uint8_t halfV[256];\
2024     uint8_t halfHV[256];\
2025     copy_block17(full, src, 24, stride, 17);\
2026     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2027     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2028     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2029     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2030 }\
2031 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2032     uint8_t full[24*17];\
2033     uint8_t halfH[272];\
2034     uint8_t halfHV[256];\
2035     copy_block17(full, src, 24, stride, 17);\
2036     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2037     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2038     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2039     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2040 }\
2041 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2042     uint8_t full[24*17];\
2043     uint8_t halfH[272];\
2044     uint8_t halfV[256];\
2045     uint8_t halfHV[256];\
2046     copy_block17(full, src, 24, stride, 17);\
2047     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2048     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2049     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2050     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2051 }\
2052 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2053     uint8_t full[24*17];\
2054     uint8_t halfH[272];\
2055     uint8_t halfHV[256];\
2056     copy_block17(full, src, 24, stride, 17);\
2057     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2058     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2059     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2060     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2061 }\
2062 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2063     uint8_t halfH[272];\
2064     uint8_t halfHV[256];\
2065     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2066     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2067     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2068 }\
2069 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2070     uint8_t halfH[272];\
2071     uint8_t halfHV[256];\
2072     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2073     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2074     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2075 }\
2076 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2077     uint8_t full[24*17];\
2078     uint8_t halfH[272];\
2079     uint8_t halfV[256];\
2080     uint8_t halfHV[256];\
2081     copy_block17(full, src, 24, stride, 17);\
2082     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2083     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2084     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2085     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2086 }\
2087 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2088     uint8_t full[24*17];\
2089     uint8_t halfH[272];\
2090     copy_block17(full, src, 24, stride, 17);\
2091     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2092     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2093     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2094 }\
2095 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2096     uint8_t full[24*17];\
2097     uint8_t halfH[272];\
2098     uint8_t halfV[256];\
2099     uint8_t halfHV[256];\
2100     copy_block17(full, src, 24, stride, 17);\
2101     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2102     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2103     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2104     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2105 }\
2106 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2107     uint8_t full[24*17];\
2108     uint8_t halfH[272];\
2109     copy_block17(full, src, 24, stride, 17);\
2110     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2111     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2112     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2113 }\
2114 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2115     uint8_t halfH[272];\
2116     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2117     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2118 }
2119
/*
 * Store operations handed to QPEL_MC as its OP argument. `b` is a raw filter
 * sum (the tap weights add up to 32, hence the shift by 5); `cm` is not a
 * parameter and must be in scope at the expansion site - the lowpass
 * functions generated by QPEL_MC declare it as ff_cropTbl + MAX_NEG_CROP,
 * presumably a clamp-to-byte lookup table.
 *
 *   op_put         bias 16, shift, table lookup, store into a
 *   op_put_no_rnd  same with bias 15
 *   op_avg         as op_put, then averaged with the old a, rounding up (+1)
 *   op_avg_no_rnd  as op_put_no_rnd, then averaged with the old a, no +1
 */
2120 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2121 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2122 #define op_put(a, b) a = cm[((b) + 16)>>5]
2123 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2124
/* Instantiate the put, put_no_rnd and avg function families. */
2125 QPEL_MC(0, put_       , _       , op_put)
2126 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2127 QPEL_MC(0, avg_       , _       , op_avg)
/* The avg_no_rnd family is disabled; op_avg_no_rnd is therefore not used by any instantiation above. */
2128 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The op_* helpers are only meaningful for the instantiations above, so drop them again. */
2129 #undef op_avg
2130 #undef op_avg_no_rnd
2131 #undef op_put
2132 #undef op_put_no_rnd
2133
/*
 * QPEL_MC generates no mc00 functions; the mc00 names are aliased to the
 * ff_{put,avg}_pixels{8x8,16x16}_c functions instead.
 * Both put_no_rnd aliases map to the same ff_put_pixels* functions as the
 * rounding put aliases.
 */
2134 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
2135 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
2136 #define put_qpel16_mc00_c ff_put_pixels16x16_c
2137 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2138 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
2139 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
2140
2141 #if 1
2142 #define H264_LOWPASS(OPNAME, OP, OP2) \
2143 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2144     const int h=2;\
2145     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2146     int i;\
2147     for(i=0; i<h; i++)\
2148     {\
2149         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2150         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2151         dst+=dstStride;\
2152         src+=srcStride;\
2153     }\
2154 }\
2155 \
2156 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2157     const int w=2;\
2158     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2159     int i;\
2160     for(i=0; i<w; i++)\
2161     {\
2162         const int srcB= src[-2*srcStride];\
2163         const int srcA= src[-1*srcStride];\
2164         const int src0= src[0 *srcStride];\
2165         const int src1= src[1 *srcStride];\
2166         const int src2= src[2 *srcStride];\
2167         const int src3= src[3 *srcStride];\
2168         const int src4= src[4 *srcStride];\
2169         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2170         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2171         dst++;\
2172         src++;\
2173     }\
2174 }\
2175 \
2176 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2177     const int h=2;\
2178     const int w=2;\
2179     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2180     int i;\
2181     src -= 2*srcStride;\
2182     for(i=0; i<h+5; i++)\
2183     {\
2184         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2185         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2186         tmp+=tmpStride;\
2187         src+=srcStride;\
2188     }\
2189     tmp -= tmpStride*(h+5-2);\
2190     for(i=0; i<w; i++)\
2191     {\
2192         const int tmpB= tmp[-2*tmpStride];\
2193         const int tmpA= tmp[-1*tmpStride];\
2194         const int tmp0= tmp[0 *tmpStride];\
2195         const int tmp1= tmp[1 *tmpStride];\
2196         const int tmp2= tmp[2 *tmpStride];\
2197         const int tmp3= tmp[3 *tmpStride];\
2198         const int tmp4= tmp[4 *tmpStride];\
2199         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2200         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2201         dst++;\
2202         tmp++;\
2203     }\
2204 }\
2205 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2206     const int h=4;\
2207     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2208     int i;\
2209     for(i=0; i<h; i++)\
2210     {\
2211         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2212         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2213         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2214         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2215         dst+=dstStride;\
2216         src+=srcStride;\
2217     }\
2218 }\
2219 \
2220 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2221     const int w=4;\
2222     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2223     int i;\
2224     for(i=0; i<w; i++)\
2225     {\
2226         const int srcB= src[-2*srcStride];\
2227         const int srcA= src[-1*srcStride];\
2228         const int src0= src[0 *srcStride];\
2229         const int src1= src[1 *srcStride];\
2230         const int src2= src[2 *srcStride];\
2231         const int src3= src[3 *srcStride];\
2232         const int src4= src[4 *srcStride];\
2233         const int src5= src[5 *srcStride];\
2234         const int src6= src[6 *srcStride];\
2235         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2236         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2237         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2238         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2239         dst++;\
2240         src++;\
2241     }\
2242 }\
2243 \
2244 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2245     const int h=4;\
2246     const int w=4;\
2247     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2248     int i;\
2249     src -= 2*srcStride;\
2250     for(i=0; i<h+5; i++)\
2251     {\
2252         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2253         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2254         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2255         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2256         tmp+=tmpStride;\
2257         src+=srcStride;\
2258     }\
2259     tmp -= tmpStride*(h+5-2);\
2260     for(i=0; i<w; i++)\
2261     {\
2262         const int tmpB= tmp[-2*tmpStride];\
2263         const int tmpA= tmp[-1*tmpStride];\
2264         const int tmp0= tmp[0 *tmpStride];\
2265         const int tmp1= tmp[1 *tmpStride];\
2266         const int tmp2= tmp[2 *tmpStride];\
2267         const int tmp3= tmp[3 *tmpStride];\
2268         const int tmp4= tmp[4 *tmpStride];\
2269         const int tmp5= tmp[5 *tmpStride];\
2270         const int tmp6= tmp[6 *tmpStride];\
2271         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2272         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2273         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2274         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2275         dst++;\
2276         tmp++;\
2277     }\
2278 }\
2279 \
2280 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2281     const int h=8;\
2282     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2283     int i;\
2284     for(i=0; i<h; i++)\
2285     {\
2286         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2287         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2288         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2289         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2290         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2291         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2292         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2293         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2294         dst+=dstStride;\
2295         src+=srcStride;\
2296     }\
2297 }\
2298 \
2299 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2300     const int w=8;\
2301     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2302     int i;\
2303     for(i=0; i<w; i++)\
2304     {\
2305         const int srcB= src[-2*srcStride];\
2306         const int srcA= src[-1*srcStride];\
2307         const int src0= src[0 *srcStride];\
2308         const int src1= src[1 *srcStride];\
2309         const int src2= src[2 *srcStride];\
2310         const int src3= src[3 *srcStride];\
2311         const int src4= src[4 *srcStride];\
2312         const int src5= src[5 *srcStride];\
2313         const int src6= src[6 *srcStride];\
2314         const int src7= src[7 *srcStride];\
2315         const int src8= src[8 *srcStride];\
2316         const int src9= src[9 *srcStride];\
2317         const int src10=src[10*srcStride];\
2318         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2319         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2320         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2321         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2322         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2323         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2324         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2325         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2326         dst++;\
2327         src++;\
2328     }\
2329 }\
2330 \
2331 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2332     const int h=8;\
2333     const int w=8;\
2334     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2335     int i;\
2336     src -= 2*srcStride;\
2337     for(i=0; i<h+5; i++)\
2338     {\
2339         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2340         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2341         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2342         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2343         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2344         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2345         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2346         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2347         tmp+=tmpStride;\
2348         src+=srcStride;\
2349     }\
2350     tmp -= tmpStride*(h+5-2);\
2351     for(i=0; i<w; i++)\
2352     {\
2353         const int tmpB= tmp[-2*tmpStride];\
2354         const int tmpA= tmp[-1*tmpStride];\
2355         const int tmp0= tmp[0 *tmpStride];\
2356         const int tmp1= tmp[1 *tmpStride];\
2357         const int tmp2= tmp[2 *tmpStride];\
2358         const int tmp3= tmp[3 *tmpStride];\
2359         const int tmp4= tmp[4 *tmpStride];\
2360         const int tmp5= tmp[5 *tmpStride];\
2361         const int tmp6= tmp[6 *tmpStride];\
2362         const int tmp7= tmp[7 *tmpStride];\
2363         const int tmp8= tmp[8 *tmpStride];\
2364         const int tmp9= tmp[9 *tmpStride];\
2365         const int tmp10=tmp[10*tmpStride];\
2366         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2367         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2368         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2369         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2370         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2371         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2372         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2373         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2374         dst++;\
2375         tmp++;\
2376     }\
2377 }\
2378 \
2379 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2380     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2381     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2382     src += 8*srcStride;\
2383     dst += 8*dstStride;\
2384     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2385     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2386 }\
2387 \
2388 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2389     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2390     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2391     src += 8*srcStride;\
2392     dst += 8*dstStride;\
2393     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2394     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2395 }\
2396 \
2397 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2398     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2399     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2400     src += 8*srcStride;\
2401     dst += 8*dstStride;\
2402     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2403     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2404 }\
2405
/**
 * Generate the sixteen motion-compensation entry points
 * OPNAME##h264_qpel##SIZE##_mcXY_c for a SIZE x SIZE block, where X is the
 * horizontal and Y the vertical quarter-sample offset (0..3).
 *
 * Each generated function combines at most two of the following planes with
 * OPNAME##pixels##SIZE##_l2: the source itself, the horizontally filtered
 * plane, the vertically filtered plane and the 2-D (hv) filtered plane.
 * Intermediate planes are always produced with the put_ lowpass variants;
 * only the final store goes through OPNAME.
 *
 * The vertical filter reads two rows above and three rows below the block,
 * so the source is first copied into a SIZE-wide, (SIZE+5)-row scratch
 * buffer whose row 2 corresponds to the first row of the block.
 */
#define H264_MC(OPNAME, SIZE) \
/* (0,0): integer position */\
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* (1,0): source combined with the horizontal plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hpel[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hpel, stride, stride, SIZE, SIZE);\
}\
\
/* (2,0): horizontal plane only */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* (3,0): right neighbour combined with the horizontal plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hpel[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hpel, stride, stride, SIZE, SIZE);\
}\
\
/* (0,1): source combined with the vertical plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t vsrc[SIZE*(SIZE+5)];\
    uint8_t * const vsrc_mid = vsrc + SIZE*2;\
    uint8_t hpel[SIZE*SIZE];\
\
    copy_block ## SIZE (vsrc, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hpel, vsrc_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vsrc_mid, hpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (0,2): vertical plane only */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t vsrc[SIZE*(SIZE+5)];\
    uint8_t * const vsrc_mid = vsrc + SIZE*2;\
\
    copy_block ## SIZE (vsrc, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, vsrc_mid, stride, SIZE);\
}\
\
/* (0,3): lower neighbour combined with the vertical plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t vsrc[SIZE*(SIZE+5)];\
    uint8_t * const vsrc_mid = vsrc + SIZE*2;\
    uint8_t hpel[SIZE*SIZE];\
\
    copy_block ## SIZE (vsrc, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hpel, vsrc_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vsrc_mid+SIZE, hpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,1): horizontal plane of this row + vertical plane of this column */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t vsrc[SIZE*(SIZE+5)];\
    uint8_t * const vsrc_mid = vsrc + SIZE*2;\
    uint8_t hpel_h[SIZE*SIZE];\
    uint8_t hpel_v[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel_h, src, SIZE, stride);\
    copy_block ## SIZE (vsrc, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hpel_v, vsrc_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel_h, hpel_v, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,1): horizontal plane of this row + vertical plane of the next column */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t vsrc[SIZE*(SIZE+5)];\
    uint8_t * const vsrc_mid = vsrc + SIZE*2;\
    uint8_t hpel_h[SIZE*SIZE];\
    uint8_t hpel_v[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel_h, src, SIZE, stride);\
    copy_block ## SIZE (vsrc, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hpel_v, vsrc_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel_h, hpel_v, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,3): horizontal plane of the next row + vertical plane of this column */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t vsrc[SIZE*(SIZE+5)];\
    uint8_t * const vsrc_mid = vsrc + SIZE*2;\
    uint8_t hpel_h[SIZE*SIZE];\
    uint8_t hpel_v[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel_h, src + stride, SIZE, stride);\
    copy_block ## SIZE (vsrc, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hpel_v, vsrc_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel_h, hpel_v, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,3): horizontal plane of the next row + vertical plane of the next column */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t vsrc[SIZE*(SIZE+5)];\
    uint8_t * const vsrc_mid = vsrc + SIZE*2;\
    uint8_t hpel_h[SIZE*SIZE];\
    uint8_t hpel_v[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel_h, src + stride, SIZE, stride);\
    copy_block ## SIZE (vsrc, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hpel_v, vsrc_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel_h, hpel_v, stride, SIZE, SIZE, SIZE);\
}\
\
/* (2,2): 2-D filtered plane only */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t inter[SIZE*(SIZE+5)];\
\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, inter, src, stride, SIZE, stride);\
}\
\
/* (2,1): horizontal plane of this row + 2-D plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t inter[SIZE*(SIZE+5)];\
    uint8_t hpel_h[SIZE*SIZE];\
    uint8_t hpel_hv[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel_h, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hpel_hv, inter, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel_h, hpel_hv, stride, SIZE, SIZE, SIZE);\
}\
\
/* (2,3): horizontal plane of the next row + 2-D plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t inter[SIZE*(SIZE+5)];\
    uint8_t hpel_h[SIZE*SIZE];\
    uint8_t hpel_hv[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel_h, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hpel_hv, inter, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel_h, hpel_hv, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,2): vertical plane of this column + 2-D plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t vsrc[SIZE*(SIZE+5)];\
    uint8_t * const vsrc_mid = vsrc + SIZE*2;\
    int16_t inter[SIZE*(SIZE+5)];\
    uint8_t hpel_v[SIZE*SIZE];\
    uint8_t hpel_hv[SIZE*SIZE];\
\
    copy_block ## SIZE (vsrc, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hpel_v, vsrc_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hpel_hv, inter, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel_v, hpel_hv, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,2): vertical plane of the next column + 2-D plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t vsrc[SIZE*(SIZE+5)];\
    uint8_t * const vsrc_mid = vsrc + SIZE*2;\
    int16_t inter[SIZE*(SIZE+5)];\
    uint8_t hpel_v[SIZE*SIZE];\
    uint8_t hpel_hv[SIZE*SIZE];\
\
    copy_block ## SIZE (vsrc, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hpel_v, vsrc_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hpel_hv, inter, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel_v, hpel_hv, stride, SIZE, SIZE, SIZE);\
}
2542
/*
 * Store operators handed to H264_LOWPASS below. 'a' is the destination
 * lvalue and 'b' an un-normalised filter sum; a crop-table pointer named
 * 'cm' must be in scope wherever these expand.
 *
 *   op_put / op_avg   - single-pass sums: the taps (20,20,-5,-5,1,1) add up
 *                       to 32, hence "+16 >> 5" (round, then divide by 32).
 *   op2_put / op2_avg - sums filtered in both directions (32*32 = 1024),
 *                       hence "+512 >> 10".
 *
 * The *_avg forms additionally average the result with the value already
 * stored in 'a', rounding up.
 */
2543 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2544 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2545 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2546 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2547 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2548
/* Instantiate the lowpass kernels for the put_ and avg_ store operators
 * (arguments are presumably OPNAME, OP, OP2 -- the macro header is not
 * visible here), then the per-position MC functions for each block size.
 * Note that no avg_ variant is generated for size 2. */
2549 H264_LOWPASS(put_       , op_put, op2_put)
2550 H264_LOWPASS(avg_       , op_avg, op2_avg)
2551 H264_MC(put_, 2)
2552 H264_MC(put_, 4)
2553 H264_MC(put_, 8)
2554 H264_MC(put_, 16)
2555 H264_MC(avg_, 4)
2556 H264_MC(avg_, 8)
2557 H264_MC(avg_, 16)
2558
/* The operator names are reused elsewhere in this file; drop them here. */
2559 #undef op_avg
2560 #undef op_put
2561 #undef op2_avg
2562 #undef op2_put
2563 #endif
2564
/* From here on, the 8x8 and 16x16 integer-position (mc00) names resolve to
 * the non-static ff_*_pixels*_c wrappers instead of the static functions
 * generated by H264_MC. */
2565 #define put_h264_qpel8_mc00_c  ff_put_pixels8x8_c
2566 #define avg_h264_qpel8_mc00_c  ff_avg_pixels8x8_c
2567 #define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
2568 #define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
2569
2570 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2571     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2572     int i;
2573
2574     for(i=0; i<h; i++){
2575         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2576         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2577         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2578         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2579         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2580         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2581         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2582         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2583         dst+=dstStride;
2584         src+=srcStride;
2585     }
2586 }
2587
/* Fixed-size wrappers: each forwards to the generic pixel routine with a
 * constant last argument (presumably the row count) matching the block size. */

/** 8x8 wrapper around put_pixels8_c(). */
2588 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2589     put_pixels8_c(dst, src, stride, 8);
2590 }
/** 8x8 wrapper around avg_pixels8_c(). */
2591 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2592     avg_pixels8_c(dst, src, stride, 8);
2593 }
/** 16x16 wrapper around put_pixels16_c(). */
2594 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2595     put_pixels16_c(dst, src, stride, 16);
2596 }
/** 16x16 wrapper around avg_pixels16_c(). */
2597 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2598     avg_pixels16_c(dst, src, stride, 16);
2599 }
2600
/* RV40 (3,3) position handlers, only built when CONFIG_RV40_DECODER is set.
 * Each one simply forwards to the matching *_pixels*_xy2_c routine with the
 * block size as last argument (presumably the row count). */
2601 #if CONFIG_RV40_DECODER
/** 16x16 put variant: delegates to put_pixels16_xy2_c(). */
2602 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2603     put_pixels16_xy2_c(dst, src, stride, 16);
2604 }
/** 16x16 avg variant: delegates to avg_pixels16_xy2_c(). */
2605 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2606     avg_pixels16_xy2_c(dst, src, stride, 16);
2607 }
/** 8x8 put variant: delegates to put_pixels8_xy2_c(). */
2608 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2609     put_pixels8_xy2_c(dst, src, stride, 8);
2610 }
/** 8x8 avg variant: delegates to avg_pixels8_xy2_c(). */
2611 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2612     avg_pixels8_xy2_c(dst, src, stride, 8);
2613 }
2614 #endif /* CONFIG_RV40_DECODER */
2615
2616 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2617     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2618     int i;
2619
2620     for(i=0; i<w; i++){
2621         const int src_1= src[ -srcStride];
2622         const int src0 = src[0          ];
2623         const int src1 = src[  srcStride];
2624         const int src2 = src[2*srcStride];
2625         const int src3 = src[3*srcStride];
2626         const int src4 = src[4*srcStride];
2627         const int src5 = src[5*srcStride];
2628         const int src6 = src[6*srcStride];
2629         const int src7 = src[7*srcStride];
2630         const int src8 = src[8*srcStride];
2631         const int src9 = src[9*srcStride];
2632         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2633         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2634         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2635         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2636         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2637         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2638         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2639         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2640         src++;
2641         dst++;
2642     }
2643 }
2644
/**
 * 8x8 mspel position (1,0): combine the source block with its horizontally
 * filtered version through put_pixels8_l2().
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[8*8]; /* packed 8x8 plane, row pitch 8 */

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hfilt, stride, stride, 8, 8);
}
2650
/** 8x8 mspel position (2,0): the horizontally filtered block is written straight to dst. */
2651 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2652     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2653 }
2654
/**
 * 8x8 mspel position (3,0): combine the source block shifted one pixel to
 * the right with the horizontally filtered block through put_pixels8_l2().
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[8*8]; /* packed 8x8 plane, row pitch 8 */

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, hfilt, stride, stride, 8, 8);
}
2660
/** 8x8 mspel position (0,2): the vertically filtered block is written straight to dst. */
2661 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2662     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2663 }
2664
/**
 * 8x8 mspel position (1,2): combine the vertically filtered source with the
 * horizontally-then-vertically filtered source through put_pixels8_l2().
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hrows[11*8];  /* 11 horizontally filtered rows, starting one row above the block */
    uint8_t vfilt[8*8];
    uint8_t hvfilt[8*8];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src, 8, stride, 8);
    /* hrows+8 is the row aligned with the top of the block */
    wmv2_mspel8_v_lowpass(hvfilt, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/**
 * 8x8 mspel position (3,2): like (1,2), but the vertical-only plane is taken
 * from the source shifted one pixel to the right.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hrows[11*8];  /* 11 horizontally filtered rows, starting one row above the block */
    uint8_t vfilt[8*8];
    uint8_t hvfilt[8*8];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src+1, 8, stride, 8);
    /* hrows+8 is the row aligned with the top of the block */
    wmv2_mspel8_v_lowpass(hvfilt, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/**
 * 8x8 mspel position (2,2): horizontal filter into a scratch plane, then
 * vertical filter of that plane directly into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hrows[11*8];  /* 11 horizontally filtered rows, starting one row above the block */

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    /* hrows+8 is the row aligned with the top of the block */
    wmv2_mspel8_v_lowpass(dst, hrows+8, stride, 8, 8);
}
2688
2689 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2690     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2691     int x;
2692     const int strength= ff_h263_loop_filter_strength[qscale];
2693
2694     for(x=0; x<8; x++){
2695         int d1, d2, ad1;
2696         int p0= src[x-2*stride];
2697         int p1= src[x-1*stride];
2698         int p2= src[x+0*stride];
2699         int p3= src[x+1*stride];
2700         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2701
2702         if     (d<-2*strength) d1= 0;
2703         else if(d<-  strength) d1=-2*strength - d;
2704         else if(d<   strength) d1= d;
2705         else if(d< 2*strength) d1= 2*strength - d;
2706         else                   d1= 0;
2707
2708         p1 += d1;
2709         p2 -= d1;
2710         if(p1&256) p1= ~(p1>>31);
2711         if(p2&256) p2= ~(p2>>31);
2712
2713         src[x-1*stride] = p1;
2714         src[x+0*stride] = p2;
2715
2716         ad1= FFABS(d1)>>1;
2717
2718         d2= av_clip((p0-p3)/4, -ad1, ad1);
2719
2720         src[x-2*stride] = p0 - d2;
2721         src[x+  stride] = p3 + d2;
2722     }
2723     }
2724 }
2725
2726 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2727     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2728     int y;
2729     const int strength= ff_h263_loop_filter_strength[qscale];
2730
2731     for(y=0; y<8; y++){
2732         int d1, d2, ad1;
2733         int p0= src[y*stride-2];
2734         int p1= src[y*stride-1];
2735         int p2= src[y*stride+0];
2736         int p3= src[y*stride+1];
2737         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2738
2739         if     (d<-2*strength) d1= 0;
2740         else if(d<-  strength) d1=-2*strength - d;
2741         else if(d<   strength) d1= d;
2742         else if(d< 2*strength) d1= 2*strength - d;
2743         else                   d1= 0;
2744
2745         p1 += d1;
2746         p2 -= d1;
2747         if(p1&256) p1= ~(p1>>31);
2748         if(p2&256) p2= ~(p2>>31);
2749
2750         src[y*stride-1] = p1;
2751         src[y*stride+0] = p2;
2752
2753         ad1= FFABS(d1)>>1;
2754
2755         d2= av_clip((p0-p3)/4, -ad1, ad1);
2756
2757         src[y*stride-2] = p0 - d2;
2758         src[y*stride+1] = p3 + d2;
2759     }
2760     }
2761 }
2762
/**
 * In-place separable (1,2,1)/4 smoothing of an 8x8 block.
 *
 * The vertical pass is accumulated at 4x scale into a scratch plane; the
 * top and bottom rows are passed through unfiltered (scaled by 4). The
 * horizontal pass then filters columns 1..6 (total divisor 16) and passes
 * columns 0 and 7 through (divisor 4), rounding to nearest.
 *
 * @param src    top-left pixel of the 8x8 block, modified in place
 * @param stride byte distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vfilt[8*8];
    int row, col;

    /* vertical pass: src -> vfilt (all reads happen before any write to src) */
    for (row = 0; row < 8; row++) {
        const uint8_t *in  = src   + row*stride;
        int           *out = vfilt + row*8;

        if (row == 0 || row == 7) {
            for (col = 0; col < 8; col++)
                out[col] = 4*in[col];
        } else {
            for (col = 0; col < 8; col++)
                out[col] = in[col - stride] + 2*in[col] + in[col + stride];
        }
    }

    /* horizontal pass: vfilt -> src */
    for (row = 0; row < 8; row++) {
        const int *in  = vfilt + row*8;
        uint8_t   *out = src   + row*stride;

        out[0] = (in[0] + 2) >> 2;
        out[7] = (in[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            out[col] = (in[col - 1] + 2*in[col] + in[col + 1] + 8) >> 4;
    }
}
2789
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte distance between rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16 x h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2817
/**
 * Sum of absolute differences between a 16-wide block and the reference
 * block formed by avg2() of each pix2 pixel with its right neighbour.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; 17 pixels are read per row
 * @param line_size byte distance between rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated absolute difference over 16 x h pixels
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2845
/**
 * Sum of absolute differences between a 16-wide block and the reference
 * block formed by avg2() of each pix2 pixel with the pixel one row below.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; h+1 rows are read
 * @param line_size byte distance between rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated absolute difference over 16 x h pixels
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2875
/**
 * Sum of absolute differences between a 16-wide block and the reference
 * block formed by avg4() of each 2x2 neighbourhood of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; 17 pixels per row and h+1 rows are read
 * @param line_size byte distance between rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated absolute difference over 16 x h pixels
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2905
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte distance between rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 8 x h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2925
/**
 * Sum of absolute differences between an 8-wide block and the reference
 * block formed by avg2() of each pix2 pixel with its right neighbour.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; 9 pixels are read per row
 * @param line_size byte distance between rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated absolute difference over 8 x h pixels
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2945
/**
 * Sum of absolute differences between an 8-wide block and the reference
 * block formed by avg2() of each pix2 pixel with the pixel one row below.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; h+1 rows are read
 * @param line_size byte distance between rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated absolute difference over 8 x h pixels
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2967
/**
 * Sum of absolute differences between an 8-wide block and the reference
 * block formed by avg4() of each 2x2 neighbourhood of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; 9 pixels per row and h+1 rows are read
 * @param line_size byte distance between rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated absolute difference over 8 x h pixels
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2989
2990 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2991     MpegEncContext *c = v;
2992     int score1=0;
2993     int score2=0;
2994     int x,y;
2995
2996     for(y=0; y<h; y++){
2997         for(x=0; x<16; x++){
2998             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2999         }
3000         if(y+1<h){
3001             for(x=0; x<15; x++){
3002                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3003                              - s1[x+1] + s1[x+1+stride])
3004                         -FFABS(  s2[x  ] - s2[x  +stride]
3005                              - s2[x+1] + s2[x+1+stride]);
3006             }
3007         }
3008         s1+= stride;
3009         s2+= stride;
3010     }
3011
3012     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3013     else  return score1 + FFABS(score2)*8;
3014 }
3015
3016 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3017     MpegEncContext *c = v;
3018     int score1=0;
3019     int score2=0;
3020     int x,y;
3021
3022     for(y=0; y<h; y++){
3023         for(x=0; x<8; x++){
3024             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3025         }
3026         if(y+1<h){
3027             for(x=0; x<7; x++){
3028                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3029                              - s1[x+1] + s1[x+1+stride])
3030                         -FFABS(  s2[x  ] - s2[x  +stride]
3031                              - s2[x+1] + s2[x+1+stride]);
3032             }
3033         }
3034         s1+= stride;
3035         s2+= stride;
3036     }
3037
3038     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3039     else  return score1 + FFABS(score2)*8;
3040 }
3041
3042 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3043     int i;
3044     unsigned int sum=0;
3045
3046     for(i=0; i<8*8; i++){
3047         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3048         int w= weight[i];
3049         b>>= RECON_SHIFT;
3050         assert(-512<b && b<512);
3051
3052         sum += (w*b)*(w*b)>>4;
3053     }
3054     return sum>>2;
3055 }
3056
3057 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3058     int i;
3059
3060     for(i=0; i<8*8; i++){
3061         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3062     }
3063 }
3064
3065 /**
3066  * permutes an 8x8 block.
3067  * @param block the block which will be permuted according to the given permutation vector
3068  * @param permutation the permutation vector
3069  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3070  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3071  *                  (inverse) permutated to scantable order!
3072  */
3073 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3074 {
3075     int i;
3076     DCTELEM temp[64];
3077
3078     if(last<=0) return;
3079     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3080
3081     for(i=0; i<=last; i++){
3082         const int j= scantable[i];
3083         temp[j]= block[j];
3084         block[j]=0;
3085     }
3086
3087     for(i=0; i<=last; i++){
3088         const int j= scantable[i];
3089         const int perm_j= permutation[j];
3090         block[perm_j]= temp[j];
3091     }
3092 }
3093
/** Comparison function that ignores all of its arguments and always scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
3097
3098 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3099     int i;
3100
3101     memset(cmp, 0, sizeof(void*)*6);
3102
3103     for(i=0; i<6; i++){
3104         switch(type&0xFF){
3105         case FF_CMP_SAD:
3106             cmp[i]= c->sad[i];
3107             break;
3108         case FF_CMP_SATD:
3109             cmp[i]= c->hadamard8_diff[i];
3110             break;
3111         case FF_CMP_SSE:
3112             cmp[i]= c->sse[i];
3113             break;
3114         case FF_CMP_DCT:
3115             cmp[i]= c->dct_sad[i];
3116             break;
3117         case FF_CMP_DCT264:
3118             cmp[i]= c->dct264_sad[i];
3119             break;
3120         case FF_CMP_DCTMAX:
3121             cmp[i]= c->dct_max[i];
3122             break;
3123         case FF_CMP_PSNR:
3124             cmp[i]= c->quant_psnr[i];
3125             break;
3126         case FF_CMP_BIT:
3127             cmp[i]= c->bit[i];
3128             break;
3129         case FF_CMP_RD:
3130             cmp[i]= c->rd[i];
3131             break;
3132         case FF_CMP_VSAD:
3133             cmp[i]= c->vsad[i];
3134             break;
3135         case FF_CMP_VSSE:
3136             cmp[i]= c->vsse[i];
3137             break;
3138         case FF_CMP_ZERO:
3139             cmp[i]= zero_cmp;
3140             break;
3141         case FF_CMP_NSSE:
3142             cmp[i]= c->nsse[i];
3143             break;
3144 #if CONFIG_DWT
3145         case FF_CMP_W53:
3146             cmp[i]= c->w53[i];
3147             break;
3148         case FF_CMP_W97:
3149             cmp[i]= c->w97[i];
3150             break;
3151 #endif
3152         default:
3153             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3154         }
3155     }
3156 }
3157
3158 static void clear_block_c(DCTELEM *block)
3159 {
3160     memset(block, 0, sizeof(DCTELEM)*64);
3161 }
3162
3163 /**
3164  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3165  */
3166 static void clear_blocks_c(DCTELEM *blocks)
3167 {
3168     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3169 }
3170
/**
 * dst[i] += src[i] for i in [0, w), each byte wrapping modulo 256.
 *
 * Whole machine words are processed first, adding all bytes of a long in
 * parallel; the remaining bytes are handled one at a time.
 *
 * @param dst destination, updated in place
 * @param src bytes to add
 * @param w   number of bytes; values <= 0 do nothing
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    /* 0x7f / 0x80 replicated into every byte of a long (same values as pb_7f / pb_80) */
    const unsigned long mask_7f = ~0UL/255 * 0x7f;
    const unsigned long mask_80 = ~0UL/255 * 0x80;
    long i;

    /* The (int) cast is essential: sizeof yields an unsigned type, so without
     * it "w - sizeof(long)" wraps to a huge value whenever w < sizeof(long)
     * and the word loop would run far past the end of both buffers. */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        /* add the low 7 bits of each byte, then patch in the top bit without carrying across bytes */
        *(long*)(dst+i) = ((a&mask_7f) + (b&mask_7f)) ^ ((a^b)&mask_80);
    }
    for(; i<w; i++)
        dst[i] += src[i];
}
3181
/**
 * dst[i] = src1[i] + src2[i] for i in [0, w), each byte wrapping modulo 256.
 *
 * Whole machine words are processed first, adding all bytes of a long in
 * parallel; the remaining bytes are handled one at a time.
 *
 * @param dst  destination
 * @param src1 first addend
 * @param src2 second addend
 * @param w    number of bytes; values <= 0 do nothing
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    /* 0x7f / 0x80 replicated into every byte of a long (same values as pb_7f / pb_80) */
    const unsigned long mask_7f = ~0UL/255 * 0x7f;
    const unsigned long mask_80 = ~0UL/255 * 0x80;
    long i;

    /* The (int) cast is essential: sizeof yields an unsigned type, so without
     * it "w - sizeof(long)" wraps to a huge value whenever w < sizeof(long)
     * and the word loop would run far past the end of the buffers. */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        /* add the low 7 bits of each byte, then patch in the top bit without carrying across bytes */
        *(long*)(dst+i) = ((a&mask_7f) + (b&mask_7f)) ^ ((a^b)&mask_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}
3192
/**
 * dst[i] = src1[i] - src2[i] for i in [0, w), each byte wrapping modulo 256.
 *
 * When unaligned access is not known to be fast and src2 is not long-aligned,
 * an 8x unrolled byte loop is used; otherwise whole machine words are
 * processed, subtracting all bytes of a long in parallel. Leftover bytes are
 * handled one at a time in both cases.
 *
 * @param dst  destination
 * @param src1 minuend
 * @param src2 subtrahend
 * @param w    number of bytes; values <= 0 do nothing
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    /* 0x7f / 0x80 replicated into every byte of a long (same values as pb_7f / pb_80) */
    const unsigned long mask_7f = ~0UL/255 * 0x7f;
    const unsigned long mask_80 = ~0UL/255 * 0x80;
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    /* The (int) cast is essential: sizeof yields an unsigned type, so without
     * it "w - sizeof(long)" wraps to a huge value whenever w < sizeof(long)
     * and the word loop would run far past the end of the buffers. */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        /* force each byte's top bit on so the 7-bit subtraction cannot borrow across bytes, then fix the top bit */
        *(long*)(dst+i) = ((a|mask_80) - (b&mask_7f)) ^ ((a^b^mask_80)&mask_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
3217
/**
 * Reconstruct a row from median-prediction residuals.
 *
 * Each output byte is mid_pred(left, top, (left + top - topleft) & 0xFF)
 * plus the residual, truncated to 8 bits.
 *
 * @param dst      reconstructed row
 * @param src1     row above (the "top" samples)
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: sample left of the row start; out: last reconstructed sample
 * @param left_top in: sample above-left of the row start; out: last top sample
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const uint8_t top = src1[i];

        cur      = mid_pred(cur, top, (cur + top - top_left)&0xFF) + diff[i];
        top_left = top;
        dst[i]   = cur;
    }

    *left     = cur;
    *left_top = top_left;
}
3234
/**
 * Compute median-prediction residuals for a row.
 *
 * Each output byte is the source sample minus
 * mid_pred(left, top, (left + top - topleft) & 0xFF), truncated to 8 bits.
 *
 * @param dst      residuals
 * @param src1     row above (the "top" samples)
 * @param src2     row being predicted
 * @param w        number of samples
 * @param left     in: sample left of the row start; out: last sample of src2
 * @param left_top in: sample above-left of the row start; out: last sample of src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const uint8_t top  = src1[i];
        const int     pred = mid_pred(cur, top, (cur + top - top_left)&0xFF);

        top_left = top;
        cur      = src2[i];
        dst[i]   = cur - pred;
    }

    *left     = cur;
    *left_top = top_left;
}
3252
/**
 * Reconstruct a row from left-prediction residuals (running sum).
 *
 * @param dst receives the low 8 bits of the running sum after each sample
 * @param src residuals
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the accumulator after the last sample (not truncated to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    const uint8_t *in  = src;
    uint8_t       *out = dst;
    int remaining;

    for (remaining = w; remaining > 0; remaining--) {
        acc   += *in++;
        *out++ = acc;
    }

    return acc;
}
3271
/* Byte offsets of the colour components inside one 32-bit pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Reconstruct a row of 32-bit pixels from per-component left-prediction
 * residuals (an independent running sum per component).
 *
 * @param dst   reconstructed pixels, 4 bytes each
 * @param src   residuals, 4 bytes per pixel
 * @param w     number of pixels
 * @param red   in: initial red accumulator; out: accumulator after the row
 * @param green same for green
 * @param blue  same for blue
 * @param alpha same for alpha
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int i;

    for (i = 0; i < w; i++) {
        const uint8_t *in  = src + 4*i;
        uint8_t       *out = dst + 4*i;

        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
3312
/*
 * Butterfly helpers for the Hadamard transforms below. Every statement-like
 * macro is wrapped in do { } while (0) so it behaves as a single statement
 * (safe under an unbraced if/else), and its temporaries use names unlikely to
 * collide with the caller's arguments.
 */

/* o1 = i1 + i2, o2 = i1 - i2; both inputs are read before either output is written. */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    const int bf2_in1= (i1);\
    const int bf2_in2= (i2);\
    (o1)= bf2_in1+bf2_in2;\
    (o2)= bf2_in1-bf2_in2;\
} while (0)

/* In-place butterfly: x becomes x + y and y becomes x - y (using the old values). */
#define BUTTERFLY1(x,y) \
do {\
    const int bf1_in1= (x);\
    const int bf1_in2= (y);\
    (x)= bf1_in1+bf1_in2;\
    (y)= bf1_in1-bf1_in2;\
} while (0)

/* |x + y| + |x - y|, i.e. the absolute sum of a final butterfly stage. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3327
/**
 * Sum of absolute 8x8 Hadamard-transformed differences between two blocks.
 *
 * The difference src - dst is transformed along rows, then along columns, and
 * the absolute values of the resulting coefficients are summed.
 *
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* Row pass: three butterfly stages on each line of the difference. */
    for (i = 0; i < 8; i++) {
        const uint8_t *p = src + stride*i;
        const uint8_t *q = dst + stride*i;
        int *row = coef + 8*i;

        BUTTERFLY2(row[0], row[1], p[0]-q[0], p[1]-q[1]);
        BUTTERFLY2(row[2], row[3], p[2]-q[2], p[3]-q[3]);
        BUTTERFLY2(row[4], row[5], p[4]-q[4], p[5]-q[5]);
        BUTTERFLY2(row[6], row[7], p[6]-q[6], p[7]-q[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* Column pass: two in-place stages, the third is folded into the abs sum. */
    for (i = 0; i < 8; i++) {
        int *col = coef + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
3379
/**
 * Sum of absolute 8x8 Hadamard coefficients of a single block, with the
 * mean (DC) term removed.
 *
 * @param s      unused
 * @param src    block to transform
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* Row pass: three butterfly stages on each line of pixels. */
    for (i = 0; i < 8; i++) {
        const uint8_t *p = src + stride*i;
        int *row = coef + 8*i;

        BUTTERFLY2(row[0], row[1], p[0], p[1]);
        BUTTERFLY2(row[2], row[3], p[2], p[3]);
        BUTTERFLY2(row[4], row[5], p[4], p[5]);
        BUTTERFLY2(row[6], row[7], p[6], p[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* Column pass: two in-place stages, the third is folded into the abs sum. */
    for (i = 0; i < 8; i++) {
        int *col = coef + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }

    /* remove the mean: the DC coefficient is coef[0] + coef[32] after the stages above */
    sum -= FFABS(coef[8*0] + coef[8*4]);

    return sum;
}
3427
3428 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3429     MpegEncContext * const s= (MpegEncContext *)c;
3430     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3431
3432     assert(h==8);
3433
3434     s->dsp.diff_pixels(temp, src1, src2, stride);
3435     s->dsp.fdct(temp);
3436     return s->dsp.sum_abs_dctelem(temp);
3437 }
3438
#if CONFIG_GPL
/*
 * One 8-point integer transform pass. The user must define SRC(x) to read
 * input x and DST(x,v) to consume output x; all inputs are read into
 * constants before any DST() is evaluated.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute coefficients of the DCT8_1D integer transform applied
 * (rows, then columns) to the difference between two 8x8 blocks.
 *
 * @param c      MpegEncContext whose dsp.diff_pixels is used
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows
 * @param h      not used by this function
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc = (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int sum = 0;
    int i;

    enc->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* Row pass: transform each row in place. */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* Column pass: outputs are not stored, only their magnitudes accumulated. */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST
    return sum;
}
#endif
3491
3492 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3493     MpegEncContext * const s= (MpegEncContext *)c;
3494     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3495     int sum=0, i;
3496
3497     assert(h==8);
3498
3499     s->dsp.diff_pixels(temp, src1, src2, stride);
3500     s->dsp.fdct(temp);
3501
3502     for(i=0; i<64; i++)
3503         sum= FFMAX(sum, FFABS(temp[i]));
3504
3505     return sum;
3506 }
3507
3508 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3509     MpegEncContext * const s= (MpegEncContext *)c;
3510     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3511     DCTELEM * const bak = temp+64;
3512     int sum=0, i;
3513
3514     assert(h==8);
3515     s->mb_intra=0;
3516
3517     s->dsp.diff_pixels(temp, src1, src2, stride);
3518
3519     memcpy(bak, temp, 64*sizeof(DCTELEM));
3520
3521     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3522     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3523     ff_simple_idct(temp); //FIXME
3524
3525     for(i=0; i<64; i++)
3526         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3527
3528     return sum;
3529 }
3530
3531 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3532     MpegEncContext * const s= (MpegEncContext *)c;
3533     const uint8_t *scantable= s->intra_scantable.permutated;
3534     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3535     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3536     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3537     int i, last, run, bits, level, distortion, start_i;
3538     const int esc_length= s->ac_esc_length;
3539     uint8_t * length;
3540     uint8_t * last_length;
3541
3542     assert(h==8);
3543
3544     copy_block8(lsrc1, src1, 8, stride, 8);
3545     copy_block8(lsrc2, src2, 8, stride, 8);
3546
3547     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3548
3549     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3550
3551     bits=0;
3552
3553     if (s->mb_intra) {
3554         start_i = 1;
3555         length     = s->intra_ac_vlc_length;
3556         last_length= s->intra_ac_vlc_last_length;
3557         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3558     } else {
3559         start_i = 0;
3560         length     = s->inter_ac_vlc_length;
3561         last_length= s->inter_ac_vlc_last_length;
3562     }
3563
3564     if(last>=start_i){
3565         run=0;
3566         for(i=start_i; i<last; i++){
3567             int j= scantable[i];
3568             level= temp[j];
3569
3570             if(level){
3571                 level+=64;
3572                 if((level&(~127)) == 0){
3573                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3574                 }else
3575                     bits+= esc_length;
3576                 run=0;
3577             }else
3578                 run++;
3579         }
3580         i= scantable[last];
3581
3582         level= temp[i] + 64;
3583
3584         assert(level - 64);
3585
3586         if((level&(~127)) == 0){
3587             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3588         }else
3589             bits+= esc_length;
3590
3591     }
3592
3593     if(last>=0){
3594         if(s->mb_intra)
3595             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3596         else
3597             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3598     }
3599
3600     s->dsp.idct_add(lsrc2, 8, temp);
3601
3602     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3603
3604     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3605 }
3606
3607 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3608     MpegEncContext * const s= (MpegEncContext *)c;
3609     const uint8_t *scantable= s->intra_scantable.permutated;
3610     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3611     int i, last, run, bits, level, start_i;
3612     const int esc_length= s->ac_esc_length;
3613     uint8_t * length;
3614     uint8_t * last_length;
3615
3616     assert(h==8);
3617
3618     s->dsp.diff_pixels(temp, src1, src2, stride);
3619
3620     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3621
3622     bits=0;
3623
3624     if (s->mb_intra) {
3625         start_i = 1;
3626         length     = s->intra_ac_vlc_length;
3627         last_length= s->intra_ac_vlc_last_length;
3628         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3629     } else {
3630         start_i = 0;
3631         length     = s->inter_ac_vlc_length;
3632         last_length= s->inter_ac_vlc_last_length;
3633     }
3634
3635     if(last>=start_i){
3636         run=0;
3637         for(i=start_i; i<last; i++){
3638             int j= scantable[i];
3639             level= temp[j];
3640
3641             if(level){
3642                 level+=64;
3643                 if((level&(~127)) == 0){
3644                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3645                 }else
3646                     bits+= esc_length;
3647                 run=0;
3648             }else
3649                 run++;
3650         }
3651         i= scantable[last];
3652
3653         level= temp[i] + 64;
3654
3655         assert(level - 64);
3656
3657         if((level&(~127)) == 0){
3658             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3659         }else
3660             bits+= esc_length;
3661     }
3662
3663     return bits;
3664 }
3665
/*
 * Generates vsad_intra<size>_c(): the sum of absolute differences between
 * every pixel and the pixel directly below it, over a block <size> pixels
 * wide and h rows high (h-1 row pairs). c and dummy are unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                      \
    int row, col;                                                       \
                                                                        \
    for (row = 1; row < h; row++) {                                     \
        const uint8_t *below = s + stride;                              \
                                                                        \
        for (col = 0; col < size; col++)                                \
            total += FFABS(s[col] - below[col]);                        \
        s += stride;                                                    \
    }                                                                   \
                                                                        \
    return total;                                                       \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3683
/**
 * Vertical SAD of the difference between two 16-pixel-wide blocks: for each
 * pair of adjacent rows, sums |(s1 - s2) - (s1_below - s2_below)|.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows (h-1 row pairs are evaluated)
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *below1 = s1 + stride;
        const uint8_t *below2 = s2 + stride;

        for (col = 0; col < 16; col++)
            total += FFABS(s1[col] - s2[col] - below1[col] + below2[col]);

        s1 += stride;
        s2 += stride;
    }

    return total;
}
3698
/* Square of an expression; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * Generates vsse_intra<size>_c(): the sum of squared differences between
 * every pixel and the pixel directly below it, over a block <size> pixels
 * wide and h rows high (h-1 row pairs). c and dummy are unused.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                      \
    int row, col;                                                       \
                                                                        \
    for (row = 1; row < h; row++) {                                     \
        const uint8_t *below = s + stride;                              \
                                                                        \
        for (col = 0; col < size; col++)                                \
            total += SQ(s[col] - below[col]);                           \
        s += stride;                                                    \
    }                                                                   \
                                                                        \
    return total;                                                       \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
3717
/**
 * Vertical SSE of the difference between two 16-pixel-wide blocks: for each
 * pair of adjacent rows, sums ((s1 - s2) - (s1_below - s2_below))^2.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows (h-1 row pairs are evaluated)
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *below1 = s1 + stride;
        const uint8_t *below2 = s2 + stride;

        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - below1[col] + below2[col];

            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3732
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements to compare
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int i;

    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];

        total += d * d;
    }
    return total;
}
3741
3742 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3743 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3744 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3745 #if CONFIG_GPL
3746 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3747 #endif
3748 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3749 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3750 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3751 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3752
/** Element-wise product: dst[i] = src0[i] * src1[i] for i in [0, len). */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k];
        k++;
    }
}
3758
/**
 * Product with the second operand reversed:
 * dst[i] = src0[i] * src1[len-1-i] for i in [0, len).
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    const int last = len - 1;
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[last - k];
}
3765
/** Multiply-add: dst[i] = src0[i] * src1[i] + src2[i] for i in [0, len). */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
3771
/**
 * Windowed overlap of two half-blocks.
 *
 * For k in [0, len), with lo = k and hi = 2*len-1-k:
 *   dst[lo] = src0[k]*win[hi] - src1[len-1-k]*win[lo] + add_bias
 *   dst[hi] = src0[k]*win[lo] + src1[len-1-k]*win[hi] + add_bias
 *
 * @param dst      output, 2*len elements
 * @param src0     first input, len elements
 * @param src1     second input, len elements (read back to front)
 * @param win      window, 2*len elements
 * @param add_bias constant added to every output
 * @param len      half the output length
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int k;

    for (k = 0; k < len; k++) {
        const int lo = k;
        const int hi = 2*len - 1 - k;
        /* read all four operands before writing either output */
        const float s0 = src0[k];
        const float s1 = src1[len - 1 - k];
        const float wi = win[lo];
        const float wj = win[hi];

        dst[lo] = s0*wj - s1*wi + add_bias;
        dst[hi] = s0*wi + s1*wj + add_bias;
    }
}
3786
/** Scale a vector: dst[i] = src[i] * mul for i in [0, len). */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
3794
/**
 * Multiply src by a sequence of 2-element sub-vectors and a scalar:
 * for each pair of outputs, dst = src * sv[n][0..1] * mul, where one entry
 * of sv is consumed per pair.
 *
 * @param len number of outputs; processed two at a time
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int base;

    for (base = 0; base < len; base += 2) {
        const float *vec = *sv++;

        dst[base    ] = src[base    ] * vec[0] * mul;
        dst[base + 1] = src[base + 1] * vec[1] * mul;
    }
}
3804
/**
 * Multiply src by a sequence of 4-element sub-vectors and a scalar:
 * for each group of four outputs, dst = src * sv[n][0..3] * mul, where one
 * entry of sv is consumed per group.
 *
 * @param len number of outputs; processed four at a time
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int base;

    for (base = 0; base < len; base += 4) {
        const float *vec = *sv++;

        dst[base    ] = src[base    ] * vec[0] * mul;
        dst[base + 1] = src[base + 1] * vec[1] * mul;
        dst[base + 2] = src[base + 2] * vec[2] * mul;
        dst[base + 3] = src[base + 3] * vec[3] * mul;
    }
}
3816
/**
 * Scale a sequence of 2-element sub-vectors into a flat array:
 * each pair of outputs is sv[n][0..1] * mul, one entry of sv per pair.
 *
 * @param len number of outputs; processed two at a time
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int base;

    for (base = 0; base < len; base += 2) {
        const float *vec = *sv++;

        dst[base    ] = vec[0] * mul;
        dst[base + 1] = vec[1] * mul;
    }
}
3826
/**
 * Scale a sequence of 4-element sub-vectors into a flat array:
 * each group of four outputs is sv[n][0..3] * mul, one entry of sv per group.
 *
 * @param len number of outputs; processed four at a time
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int base;

    for (base = 0; base < len; base += 4) {
        const float *vec = *sv++;

        dst[base    ] = vec[0] * mul;
        dst[base + 1] = vec[1] * mul;
        dst[base + 2] = vec[2] * mul;
        dst[base + 3] = vec[3] * mul;
    }
}
3838
/**
 * In-place butterfly on two vectors: afterwards v1[i] holds the old
 * v1[i] + v2[i] and v2[i] holds the old v1[i] - v2[i].
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
3849
/**
 * Dot product of two float vectors, accumulated in a float in index order.
 *
 * @return sum of v1[i] * v2[i] for i in [0, len); 0 when len <= 0
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
3860
/** Convert integers to float and scale: dst[i] = src[i] * mul for i in [0, len). */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
3866
/**
 * Clip one float, given as its 32-bit pattern, to [min, max] where min is
 * negative and max is positive, using only unsigned integer comparisons.
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the (negative) lower bound
 * @param maxi     bit pattern of the (positive) upper bound
 * @param maxisign maxi with its sign bit flipped
 * @return bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    /* patterns above mini are negative values below the lower bound */
    if(a > mini) return mini;
    /* 1U, not 1: shifting a signed 1 into the sign bit is undefined behaviour */
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/**
 * Clip a float vector to [*min, *max] for the case *min < 0 < *max, working
 * on the bit patterns of the floats.
 *
 * Elements are processed in groups of 8, so len is effectively rounded up to
 * a multiple of 8.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, k;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);   /* unsigned shift, see clipf_c_one() */
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;

    for(i=0; i<len; i+=8) {
        for(k=0; k<8; k++)
            dsti[i + k] = clipf_c_one(srci[i + k], mini, maxi, maxisign);
    }
}
/**
 * Clip every element of src to [min, max] and store the result in dst.
 *
 * When the bounds have opposite signs the integer-compare variant is used;
 * otherwise av_clipf() is applied per element. Elements are processed in
 * groups of 8, so len is effectively rounded up to a multiple of 8.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
3911
3912 static av_always_inline int float_to_int16_one(const float *src){
3913     return av_clip_int16(lrintf(*src));
3914 }
3915
/**
 * Convert len floats to int16_t, rounding and clipping each one with
 * float_to_int16_one().
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int k = 0;

    while (k < len) {
        dst[k] = float_to_int16_one(&src[k]);
        k++;
    }
}
3921
/**
 * Convert planar float channels to one interleaved int16 stream.
 *
 * Sample n of channel ch ends up at dst[n*channels + ch]. The two-channel
 * case has its own loop that writes both channels of a frame together; any
 * other channel count is converted one channel at a time.
 *
 * @param dst       interleaved output, len*channels samples
 * @param src       array of `channels` pointers, each to len floats
 * @param len       samples per channel
 * @param channels  number of channels
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int ch, n, out;

    if (channels != 2) {
        for (ch = 0; ch < channels; ch++) {
            out = ch;
            for (n = 0; n < len; n++) {
                dst[out] = float_to_int16_one(&src[ch][n]);
                out += channels;
            }
        }
        return;
    }

    for (n = 0; n < len; n++) {
        dst[2*n]     = float_to_int16_one(&src[0][n]);
        dst[2*n + 1] = float_to_int16_one(&src[1][n]);
    }
}
3935
/**
 * Dot product of two int16 vectors with a per-term right shift.
 *
 * @param v1     first vector, `order` elements
 * @param v2     second vector, `order` elements
 * @param order  number of elements to multiply
 * @param shift  right shift applied to every product before it is summed
 * @return sum over k of (v1[k] * v2[k]) >> shift
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;

    for (; order != 0; order--, v1++, v2++)
        acc += (*v1 * *v2) >> shift;

    return acc;
}
3945
/**
 * Dot product of v1 and v2, updating v1 in place as it goes.
 *
 * For each element the product v1[k]*v2[k] is accumulated using the value of
 * v1[k] from before the update, and then mul*v3[k] is added to v1[k].
 *
 * @param v1     first vector, read and then modified, `order` elements
 * @param v2     second vector, `order` elements
 * @param v3     vector scaled by mul and added into v1, `order` elements
 * @param order  number of elements
 * @param mul    multiplier applied to v3
 * @return sum of the products v1[k]*v2[k]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int sum = 0;

    for (; order != 0; order--, v1++, v2++, v3++) {
        sum += *v1 * *v2;
        *v1 += mul * *v3;
    }

    return sum;
}
3955
/* Fixed-point IDCT weights: Wk = 2048*sqrt(2)*cos(k*pi/16), W0 = 2048 */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/* One-dimensional 8-point WMV2 inverse transform over the 8 consecutive
 * coefficients at b, done in place. Results are rounded (+128) and shifted
 * right by 8. */
static void wmv2_idct_row(short * b)
{
    const int in0 = b[0], in1 = b[1], in2 = b[2], in3 = b[3];
    const int in4 = b[4], in5 = b[5], in6 = b[6], in7 = b[7];

    /* stage 1: weighted pairs */
    const int a1 = W1 * in1 + W7 * in7;
    const int a7 = W7 * in1 - W1 * in7;
    const int a5 = W5 * in5 + W3 * in3;
    const int a3 = W3 * in5 - W5 * in3;
    const int a2 = W2 * in2 + W6 * in6;
    const int a6 = W6 * in2 - W2 * in6;
    const int a0 = W0 * in0 + W0 * in4;
    const int a4 = W0 * in0 - W0 * in4;

    /* stage 2: odd-term combinations scaled by 181/256 */
    const int s1 = (181 * (a1 - a5 + a7 - a3) + 128) >> 8;
    const int s2 = (181 * (a1 - a5 - a7 + a3) + 128) >> 8;

    /* stage 3: butterflies with rounding */
    const int rnd = 1 << 7;

    b[0] = (a0 + a2 + a1 + a5 + rnd) >> 8;
    b[1] = (a4 + a6 + s1      + rnd) >> 8;
    b[2] = (a4 - a6 + s2      + rnd) >> 8;
    b[3] = (a0 - a2 + a7 + a3 + rnd) >> 8;
    b[4] = (a0 - a2 - a7 - a3 + rnd) >> 8;
    b[5] = (a4 - a6 - s2      + rnd) >> 8;
    b[6] = (a4 + a6 - s1      + rnd) >> 8;
    b[7] = (a0 + a2 - a1 - a5 + rnd) >> 8;
}
3991 static void wmv2_idct_col(short * b)
3992 {
3993     int s1,s2;
3994     int a0,a1,a2,a3,a4,a5,a6,a7;
3995     /*step 1, with extended precision*/
3996     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3997     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3998     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3999     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4000     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4001     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4002     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4003     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4004     /*step 2*/
4005     s1 = (181*(a1-a5+a7-a3)+128)>>8;
4006     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4007     /*step 3*/
4008     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4009     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4010     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4011     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4012
4013     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4014     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4015     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4016     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4017 }
/* In-place 8x8 WMV2 inverse DCT on a 64-element block: wmv2_idct_row() on
 * each of the 8 rows first, then wmv2_idct_col() on each of the 8 columns. */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(&block[8 * row]);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(&block[col]);
}
4028 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4029  converted */
/* Runs ff_wmv2_idct_c() on block in place, then passes block, dest and
 * line_size to put_pixels_clamped_c().
 * NOTE(review): put_pixels_clamped_c is defined outside this excerpt;
 * presumably it stores the clamped result at dest -- confirm. */
4030 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4031 {
4032     ff_wmv2_idct_c(block);
4033     put_pixels_clamped_c(block, dest, line_size);
4034 }
/* Runs ff_wmv2_idct_c() on block in place, then passes block, dest and
 * line_size to add_pixels_clamped_c().
 * NOTE(review): add_pixels_clamped_c is defined outside this excerpt;
 * presumably it adds the result to the pixels at dest with clamping --
 * confirm. */
4035 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4036 {
4037     ff_wmv2_idct_c(block);
4038     add_pixels_clamped_c(block, dest, line_size);
4039 }
/* Calls j_rev_dct() on block, then passes block, dest and line_size to
 * put_pixels_clamped_c(). Both callees are defined outside this excerpt.
 * NOTE(review): presumably an in-place inverse DCT followed by a clamped
 * store to dest -- confirm. */
4040 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4041 {
4042     j_rev_dct (block);
4043     put_pixels_clamped_c(block, dest, line_size);
4044 }
/* Calls j_rev_dct() on block, then passes block, dest and line_size to
 * add_pixels_clamped_c(). Both callees are defined outside this excerpt.
 * NOTE(review): presumably an in-place inverse DCT followed by a clamped
 * add into dest -- confirm. */
4045 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4046 {
4047     j_rev_dct (block);
4048     add_pixels_clamped_c(block, dest, line_size);
4049 }
4050
/* Calls j_rev_dct4() on block, then passes block, dest and line_size to
 * put_pixels_clamped4_c(). dsputil_init() installs this as idct_put for
 * lowres == 1. Both callees are defined outside this excerpt.
 * NOTE(review): presumably the 4x4 reduced-resolution variant -- confirm. */
4051 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4052 {
4053     j_rev_dct4 (block);
4054     put_pixels_clamped4_c(block, dest, line_size);
4055 }
/* Calls j_rev_dct4() on block, then passes block, dest and line_size to
 * add_pixels_clamped4_c(). dsputil_init() installs this as idct_add for
 * lowres == 1. Both callees are defined outside this excerpt.
 * NOTE(review): presumably the 4x4 reduced-resolution variant -- confirm. */
4056 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4057 {
4058     j_rev_dct4 (block);
4059     add_pixels_clamped4_c(block, dest, line_size);
4060 }
4061
/* Calls j_rev_dct2() on block, then passes block, dest and line_size to
 * put_pixels_clamped2_c(). dsputil_init() installs this as idct_put for
 * lowres == 2. Both callees are defined outside this excerpt.
 * NOTE(review): presumably the 2x2 reduced-resolution variant -- confirm. */
4062 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4063 {
4064     j_rev_dct2 (block);
4065     put_pixels_clamped2_c(block, dest, line_size);
4066 }
/* Calls j_rev_dct2() on block, then passes block, dest and line_size to
 * add_pixels_clamped2_c(). dsputil_init() installs this as idct_add for
 * lowres == 2. Both callees are defined outside this excerpt.
 * NOTE(review): presumably the 2x2 reduced-resolution variant -- confirm. */
4067 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4068 {
4069     j_rev_dct2 (block);
4070     add_pixels_clamped2_c(block, dest, line_size);
4071 }
4072
4073 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4074 {
4075     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4076
4077     dest[0] = cm[(block[0] + 4)>>3];
4078 }
4079 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4080 {
4081     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4082
4083     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4084 }
4085
4086 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4087
4088 /* init static data */
4089 av_cold void dsputil_static_init(void)
4090 {
4091     int i;
4092
4093     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4094     for(i=0;i<MAX_NEG_CROP;i++) {
4095         ff_cropTbl[i] = 0;
4096         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4097     }
4098
4099     for(i=0;i<512;i++) {
4100         ff_squareTbl[i] = (i - 256) * (i - 256);
4101     }
4102
4103     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4104 }
4105
4106 int ff_check_alignment(void){
4107     static int did_fail=0;
4108     DECLARE_ALIGNED(16, int, aligned);
4109
4110     if((intptr_t)&aligned & 15){
4111         if(!did_fail){
4112 #if HAVE_MMX || HAVE_ALTIVEC
4113             av_log(NULL, AV_LOG_ERROR,
4114                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4115                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4116                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4117                 "Do not report crashes to FFmpeg developers.\n");
4118 #endif
4119             did_fail=1;
4120         }
4121         return -1;
4122     }
4123     return 0;
4124 }
4125
4126 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4127 {
4128     int i;
4129
4130     ff_check_alignment();
4131
4132 #if CONFIG_ENCODERS
4133     if(avctx->dct_algo==FF_DCT_FASTINT) {
4134         c->fdct = fdct_ifast;
4135         c->fdct248 = fdct_ifast248;
4136     }
4137     else if(avctx->dct_algo==FF_DCT_FAAN) {
4138         c->fdct = ff_faandct;
4139         c->fdct248 = ff_faandct248;
4140     }
4141     else {
4142         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4143         c->fdct248 = ff_fdct248_islow;
4144     }
4145 #endif //CONFIG_ENCODERS
4146
4147     if(avctx->lowres==1){
4148         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4149             c->idct_put= ff_jref_idct4_put;
4150             c->idct_add= ff_jref_idct4_add;
4151         }else{
4152             c->idct_put= ff_h264_lowres_idct_put_c;
4153             c->idct_add= ff_h264_lowres_idct_add_c;
4154         }
4155         c->idct    = j_rev_dct4;
4156         c->idct_permutation_type= FF_NO_IDCT_PERM;
4157     }else if(avctx->lowres==2){
4158         c->idct_put= ff_jref_idct2_put;
4159         c->idct_add= ff_jref_idct2_add;
4160         c->idct    = j_rev_dct2;
4161         c->idct_permutation_type= FF_NO_IDCT_PERM;
4162     }else if(avctx->lowres==3){
4163         c->idct_put= ff_jref_idct1_put;
4164         c->idct_add= ff_jref_idct1_add;
4165         c->idct    = j_rev_dct1;
4166         c->idct_permutation_type= FF_NO_IDCT_PERM;
4167     }else{
4168         if(avctx->idct_algo==FF_IDCT_INT){
4169             c->idct_put= ff_jref_idct_put;
4170             c->idct_add= ff_jref_idct_add;
4171             c->idct    = j_rev_dct;
4172             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4173         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4174                 avctx->idct_algo==FF_IDCT_VP3){
4175             c->idct_put= ff_vp3_idct_put_c;
4176             c->idct_add= ff_vp3_idct_add_c;
4177             c->idct    = ff_vp3_idct_c;
4178             c->idct_permutation_type= FF_NO_IDCT_PERM;
4179         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4180             c->idct_put= ff_wmv2_idct_put_c;
4181             c->idct_add= ff_wmv2_idct_add_c;
4182             c->idct    = ff_wmv2_idct_c;
4183             c->idct_permutation_type= FF_NO_IDCT_PERM;
4184         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4185             c->idct_put= ff_faanidct_put;
4186             c->idct_add= ff_faanidct_add;
4187             c->idct    = ff_faanidct;
4188             c->idct_permutation_type= FF_NO_IDCT_PERM;
4189         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4190             c->idct_put= ff_ea_idct_put_c;
4191             c->idct_permutation_type= FF_NO_IDCT_PERM;
4192         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4193             c->idct     = ff_bink_idct_c;
4194             c->idct_add = ff_bink_idct_add_c;
4195             c->idct_put = ff_bink_idct_put_c;
4196             c->idct_permutation_type = FF_NO_IDCT_PERM;
4197         }else{ //accurate/default
4198             c->idct_put= ff_simple_idct_put;
4199             c->idct_add= ff_simple_idct_add;
4200             c->idct    = ff_simple_idct;
4201             c->idct_permutation_type= FF_NO_IDCT_PERM;
4202         }
4203     }
4204
4205     c->get_pixels = get_pixels_c;
4206     c->diff_pixels = diff_pixels_c;
4207     c->put_pixels_clamped = put_pixels_clamped_c;
4208     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4209     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4210     c->add_pixels_clamped = add_pixels_clamped_c;
4211     c->add_pixels8 = add_pixels8_c;
4212     c->add_pixels4 = add_pixels4_c;
4213     c->sum_abs_dctelem = sum_abs_dctelem_c;
4214     c->gmc1 = gmc1_c;
4215     c->gmc = ff_gmc_c;
4216     c->clear_block = clear_block_c;
4217     c->clear_blocks = clear_blocks_c;
4218     c->pix_sum = pix_sum_c;
4219     c->pix_norm1 = pix_norm1_c;
4220
4221     c->fill_block_tab[0] = fill_block16_c;
4222     c->fill_block_tab[1] = fill_block8_c;
4223     c->scale_block = scale_block_c;
4224
4225     /* TODO [0] 16  [1] 8 */
4226     c->pix_abs[0][0] = pix_abs16_c;
4227     c->pix_abs[0][1] = pix_abs16_x2_c;
4228     c->pix_abs[0][2] = pix_abs16_y2_c;
4229     c->pix_abs[0][3] = pix_abs16_xy2_c;
4230     c->pix_abs[1][0] = pix_abs8_c;
4231     c->pix_abs[1][1] = pix_abs8_x2_c;
4232     c->pix_abs[1][2] = pix_abs8_y2_c;
4233     c->pix_abs[1][3] = pix_abs8_xy2_c;
4234
4235 #define dspfunc(PFX, IDX, NUM) \
4236     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4237     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4238     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4239     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4240
4241     dspfunc(put, 0, 16);
4242     dspfunc(put_no_rnd, 0, 16);
4243     dspfunc(put, 1, 8);
4244     dspfunc(put_no_rnd, 1, 8);
4245     dspfunc(put, 2, 4);
4246     dspfunc(put, 3, 2);
4247
4248     dspfunc(avg, 0, 16);
4249     dspfunc(avg_no_rnd, 0, 16);
4250     dspfunc(avg, 1, 8);
4251     dspfunc(avg_no_rnd, 1, 8);
4252     dspfunc(avg, 2, 4);
4253     dspfunc(avg, 3, 2);
4254 #undef dspfunc
4255
4256     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4257     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4258
4259     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4260     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4261     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4262     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4263     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4264     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4265     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4266     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4267     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4268
4269     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4270     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4271     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4272     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4273     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4274     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4275     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4276     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4277     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4278
4279 #define dspfunc(PFX, IDX, NUM) \
4280     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4281     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4282     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4283     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4284     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4285     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4286     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4287     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4288     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4289     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4290     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4291     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4292     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4293     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4294     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4295     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4296
4297     dspfunc(put_qpel, 0, 16);
4298     dspfunc(put_no_rnd_qpel, 0, 16);
4299
4300     dspfunc(avg_qpel, 0, 16);
4301     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4302
4303     dspfunc(put_qpel, 1, 8);
4304     dspfunc(put_no_rnd_qpel, 1, 8);
4305
4306     dspfunc(avg_qpel, 1, 8);
4307     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4308
4309     dspfunc(put_h264_qpel, 0, 16);
4310     dspfunc(put_h264_qpel, 1, 8);
4311     dspfunc(put_h264_qpel, 2, 4);
4312     dspfunc(put_h264_qpel, 3, 2);
4313     dspfunc(avg_h264_qpel, 0, 16);
4314     dspfunc(avg_h264_qpel, 1, 8);
4315     dspfunc(avg_h264_qpel, 2, 4);
4316
4317 #undef dspfunc
4318     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4319     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4320     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4321     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4322     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4323     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4324     c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4325     c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4326
4327     c->draw_edges = draw_edges_c;
4328
4329 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4330     ff_mlp_init(c, avctx);
4331 #endif
4332 #if CONFIG_VC1_DECODER
4333     ff_vc1dsp_init(c,avctx);
4334 #endif
4335 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4336     ff_intrax8dsp_init(c,avctx);
4337 #endif
4338 #if CONFIG_RV30_DECODER
4339     ff_rv30dsp_init(c,avctx);
4340 #endif
4341 #if CONFIG_RV40_DECODER
4342     ff_rv40dsp_init(c,avctx);
4343     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4344     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4345     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4346     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4347 #endif
4348
4349     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
4350     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4351     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4352     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4353     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4354     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4355     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4356     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4357
4358 #define SET_CMP_FUNC(name) \
4359     c->name[0]= name ## 16_c;\
4360     c->name[1]= name ## 8x8_c;
4361
4362     SET_CMP_FUNC(hadamard8_diff)
4363     c->hadamard8_diff[4]= hadamard8_intra16_c;
4364     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4365     SET_CMP_FUNC(dct_sad)
4366     SET_CMP_FUNC(dct_max)
4367 #if CONFIG_GPL
4368     SET_CMP_FUNC(dct264_sad)
4369 #endif
4370     c->sad[0]= pix_abs16_c;
4371     c->sad[1]= pix_abs8_c;
4372     c->sse[0]= sse16_c;
4373     c->sse[1]= sse8_c;
4374     c->sse[2]= sse4_c;
4375     SET_CMP_FUNC(quant_psnr)
4376     SET_CMP_FUNC(rd)
4377     SET_CMP_FUNC(bit)
4378     c->vsad[0]= vsad16_c;
4379     c->vsad[4]= vsad_intra16_c;
4380     c->vsad[5]= vsad_intra8_c;
4381     c->vsse[0]= vsse16_c;
4382     c->vsse[4]= vsse_intra16_c;
4383     c->vsse[5]= vsse_intra8_c;
4384     c->nsse[0]= nsse16_c;
4385     c->nsse[1]= nsse8_c;
4386 #if CONFIG_DWT
4387     ff_dsputil_init_dwt(c);
4388 #endif
4389
4390     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4391
4392     c->add_bytes= add_bytes_c;
4393     c->add_bytes_l2= add_bytes_l2_c;
4394     c->diff_bytes= diff_bytes_c;
4395     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4396     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4397     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4398     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4399     c->bswap_buf= bswap_buf;
4400 #if CONFIG_PNG_DECODER
4401     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4402 #endif
4403
4404     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4405         c->h263_h_loop_filter= h263_h_loop_filter_c;
4406         c->h263_v_loop_filter= h263_v_loop_filter_c;
4407     }
4408
4409     if (CONFIG_VP3_DECODER) {
4410         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4411         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4412         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4413     }
4414
4415     c->h261_loop_filter= h261_loop_filter_c;
4416
4417     c->try_8x8basis= try_8x8basis_c;
4418     c->add_8x8basis= add_8x8basis_c;
4419
4420 #if CONFIG_VORBIS_DECODER
4421     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4422 #endif
4423 #if CONFIG_AC3_DECODER
4424     c->ac3_downmix = ff_ac3_downmix_c;
4425 #endif
4426     c->vector_fmul = vector_fmul_c;
4427     c->vector_fmul_reverse = vector_fmul_reverse_c;
4428     c->vector_fmul_add = vector_fmul_add_c;
4429     c->vector_fmul_window = ff_vector_fmul_window_c;
4430     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4431     c->vector_clipf = vector_clipf_c;
4432     c->float_to_int16 = ff_float_to_int16_c;
4433     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4434     c->scalarproduct_int16 = scalarproduct_int16_c;
4435     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4436     c->scalarproduct_float = scalarproduct_float_c;
4437     c->butterflies_float = butterflies_float_c;
4438     c->vector_fmul_scalar = vector_fmul_scalar_c;
4439
4440     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4441     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4442
4443     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4444     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4445
4446     c->shrink[0]= av_image_copy_plane;
4447     c->shrink[1]= ff_shrink22;
4448     c->shrink[2]= ff_shrink44;
4449     c->shrink[3]= ff_shrink88;
4450
4451     c->prefetch= just_return;
4452
4453     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4454     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4455
4456     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4457     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4458     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4459     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4460     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4461     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4462     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4463     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4464     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4465
4466     for(i=0; i<64; i++){
4467         if(!c->put_2tap_qpel_pixels_tab[0][i])
4468             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4469         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4470             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4471     }
4472
4473     c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4474     c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4475     c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4476     c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4477
4478     c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4479     c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4480     c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4481     c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4482
4483     switch(c->idct_permutation_type){
4484     case FF_NO_IDCT_PERM:
4485         for(i=0; i<64; i++)
4486             c->idct_permutation[i]= i;
4487         break;
4488     case FF_LIBMPEG2_IDCT_PERM:
4489         for(i=0; i<64; i++)
4490             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4491         break;
4492     case FF_SIMPLE_IDCT_PERM:
4493         for(i=0; i<64; i++)
4494             c->idct_permutation[i]= simple_mmx_permutation[i];
4495         break;
4496     case FF_TRANSPOSE_IDCT_PERM:
4497         for(i=0; i<64; i++)
4498             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4499         break;
4500     case FF_PARTTRANS_IDCT_PERM:
4501         for(i=0; i<64; i++)
4502             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4503         break;
4504     case FF_SSE2_IDCT_PERM:
4505         for(i=0; i<64; i++)
4506             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4507         break;
4508     default:
4509         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4510     }
4511 }
4512