git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  *
  20  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  21  */
  22
  23 /**
  24  * @file dsputil.c
  25  * DSP utils
  26  */
  27
  28 #include "avcodec.h"
  29 #include "dsputil.h"
  30 #include "mpegvideo.h"
  31 #include "simple_idct.h"
  32 #include "faandct.h"
  33 #include "snow.h"
  34
  35 /* snow.c */
  36 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  37
  38 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  39 uint32_t squareTbl[512] = {0, };
  40
  41 const uint8_t ff_zigzag_direct[64] = {
  42     0,   1,  8, 16,  9,  2,  3, 10,
  43     17, 24, 32, 25, 18, 11,  4,  5,
  44     12, 19, 26, 33, 40, 48, 41, 34,
  45     27, 20, 13,  6,  7, 14, 21, 28,
  46     35, 42, 49, 56, 57, 50, 43, 36,
  47     29, 22, 15, 23, 30, 37, 44, 51,
  48     58, 59, 52, 45, 38, 31, 39, 46,
  49     53, 60, 61, 54, 47, 55, 62, 63
  50 };
  51
  52 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  53    specification, we interleave the fields */
  54 const uint8_t ff_zigzag248_direct[64] = {
  55      0,  8,  1,  9, 16, 24,  2, 10,
  56     17, 25, 32, 40, 48, 56, 33, 41,
  57     18, 26,  3, 11,  4, 12, 19, 27,
  58     34, 42, 49, 57, 50, 58, 35, 43,
  59     20, 28,  5, 13,  6, 14, 21, 29,
  60     36, 44, 51, 59, 52, 60, 37, 45,
  61     22, 30,  7, 15, 23, 31, 38, 46,
  62     53, 61, 54, 62, 39, 47, 55, 63,
  63 };
  64
  65 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  66 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  67
  68 const uint8_t ff_alternate_horizontal_scan[64] = {
  69     0,  1,   2,  3,  8,  9, 16, 17,
  70     10, 11,  4,  5,  6,  7, 15, 14,
  71     13, 12, 19, 18, 24, 25, 32, 33,
  72     26, 27, 20, 21, 22, 23, 28, 29,
  73     30, 31, 34, 35, 40, 41, 48, 49,
  74     42, 43, 36, 37, 38, 39, 44, 45,
  75     46, 47, 50, 51, 56, 57, 58, 59,
  76     52, 53, 54, 55, 60, 61, 62, 63,
  77 };
  78
  79 const uint8_t ff_alternate_vertical_scan[64] = {
  80     0,  8,  16, 24,  1,  9,  2, 10,
  81     17, 25, 32, 40, 48, 56, 57, 49,
  82     41, 33, 26, 18,  3, 11,  4, 12,
  83     19, 27, 34, 42, 50, 58, 35, 43,
  84     51, 59, 20, 28,  5, 13,  6, 14,
  85     21, 29, 36, 44, 52, 60, 37, 45,
  86     53, 61, 22, 30,  7, 15, 23, 31,
  87     38, 46, 54, 62, 39, 47, 55, 63,
  88 };
  89
  90 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
  91 const uint32_t inverse[256]={
  92          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
  93  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
  94  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
  95  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
  96  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
  97  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  98   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  99   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 100   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 101   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 102   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 103   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 104   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 105   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 106   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 107   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 108   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 109   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 110   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 111   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 112   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 113   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 114   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 115   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 116   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 117   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 118   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 119   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 120   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 121   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 122   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 123   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 124 };
 125
 126 /* Input permutation for the simple_idct_mmx */
 127 static const uint8_t simple_mmx_permutation[64]={
 128         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 129         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 130         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 131         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 132         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 133         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 134         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 135         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 136 };
 137
 138 static int pix_sum_c(uint8_t * pix, int line_size)
 139 {
 140     int s, i, j;
 141
 142     s = 0;
 143     for (i = 0; i < 16; i++) {
 144         for (j = 0; j < 16; j += 8) {
 145             s += pix[0];
 146             s += pix[1];
 147             s += pix[2];
 148             s += pix[3];
 149             s += pix[4];
 150             s += pix[5];
 151             s += pix[6];
 152             s += pix[7];
 153             pix += 8;
 154         }
 155         pix += line_size - 16;
 156     }
 157     return s;
 158 }
 159
 160 static int pix_norm1_c(uint8_t * pix, int line_size)
 161 {
 162     int s, i, j;
 163     uint32_t *sq = squareTbl + 256;
 164
 165     s = 0;
 166     for (i = 0; i < 16; i++) {
 167         for (j = 0; j < 16; j += 8) {
 168 #if 0
 169             s += sq[pix[0]];
 170             s += sq[pix[1]];
 171             s += sq[pix[2]];
 172             s += sq[pix[3]];
 173             s += sq[pix[4]];
 174             s += sq[pix[5]];
 175             s += sq[pix[6]];
 176             s += sq[pix[7]];
 177 #else
 178 #if LONG_MAX > 2147483647
 179             register uint64_t x=*(uint64_t*)pix;
 180             s += sq[x&0xff];
 181             s += sq[(x>>8)&0xff];
 182             s += sq[(x>>16)&0xff];
 183             s += sq[(x>>24)&0xff];
 184             s += sq[(x>>32)&0xff];
 185             s += sq[(x>>40)&0xff];
 186             s += sq[(x>>48)&0xff];
 187             s += sq[(x>>56)&0xff];
 188 #else
 189             register uint32_t x=*(uint32_t*)pix;
 190             s += sq[x&0xff];
 191             s += sq[(x>>8)&0xff];
 192             s += sq[(x>>16)&0xff];
 193             s += sq[(x>>24)&0xff];
 194             x=*(uint32_t*)(pix+4);
 195             s += sq[x&0xff];
 196             s += sq[(x>>8)&0xff];
 197             s += sq[(x>>16)&0xff];
 198             s += sq[(x>>24)&0xff];
 199 #endif
 200 #endif
 201             pix += 8;
 202         }
 203         pix += line_size - 16;
 204     }
 205     return s;
 206 }
 207
 208 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 209     int i;
 210
 211     for(i=0; i+8<=w; i+=8){
 212         dst[i+0]= bswap_32(src[i+0]);
 213         dst[i+1]= bswap_32(src[i+1]);
 214         dst[i+2]= bswap_32(src[i+2]);
 215         dst[i+3]= bswap_32(src[i+3]);
 216         dst[i+4]= bswap_32(src[i+4]);
 217         dst[i+5]= bswap_32(src[i+5]);
 218         dst[i+6]= bswap_32(src[i+6]);
 219         dst[i+7]= bswap_32(src[i+7]);
 220     }
 221     for(;i<w; i++){
 222         dst[i+0]= bswap_32(src[i+0]);
 223     }
 224 }
 225
 226 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 227 {
 228     int s, i;
 229     uint32_t *sq = squareTbl + 256;
 230
 231     s = 0;
 232     for (i = 0; i < h; i++) {
 233         s += sq[pix1[0] - pix2[0]];
 234         s += sq[pix1[1] - pix2[1]];
 235         s += sq[pix1[2] - pix2[2]];
 236         s += sq[pix1[3] - pix2[3]];
 237         pix1 += line_size;
 238         pix2 += line_size;
 239     }
 240     return s;
 241 }
 242
 243 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 244 {
 245     int s, i;
 246     uint32_t *sq = squareTbl + 256;
 247
 248     s = 0;
 249     for (i = 0; i < h; i++) {
 250         s += sq[pix1[0] - pix2[0]];
 251         s += sq[pix1[1] - pix2[1]];
 252         s += sq[pix1[2] - pix2[2]];
 253         s += sq[pix1[3] - pix2[3]];
 254         s += sq[pix1[4] - pix2[4]];
 255         s += sq[pix1[5] - pix2[5]];
 256         s += sq[pix1[6] - pix2[6]];
 257         s += sq[pix1[7] - pix2[7]];
 258         pix1 += line_size;
 259         pix2 += line_size;
 260     }
 261     return s;
 262 }
 263
 264 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 265 {
 266     int s, i;
 267     uint32_t *sq = squareTbl + 256;
 268
 269     s = 0;
 270     for (i = 0; i < h; i++) {
 271         s += sq[pix1[ 0] - pix2[ 0]];
 272         s += sq[pix1[ 1] - pix2[ 1]];
 273         s += sq[pix1[ 2] - pix2[ 2]];
 274         s += sq[pix1[ 3] - pix2[ 3]];
 275         s += sq[pix1[ 4] - pix2[ 4]];
 276         s += sq[pix1[ 5] - pix2[ 5]];
 277         s += sq[pix1[ 6] - pix2[ 6]];
 278         s += sq[pix1[ 7] - pix2[ 7]];
 279         s += sq[pix1[ 8] - pix2[ 8]];
 280         s += sq[pix1[ 9] - pix2[ 9]];
 281         s += sq[pix1[10] - pix2[10]];
 282         s += sq[pix1[11] - pix2[11]];
 283         s += sq[pix1[12] - pix2[12]];
 284         s += sq[pix1[13] - pix2[13]];
 285         s += sq[pix1[14] - pix2[14]];
 286         s += sq[pix1[15] - pix2[15]];
 287
 288         pix1 += line_size;
 289         pix2 += line_size;
 290     }
 291     return s;
 292 }
 293
 294
 295 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
 296 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 297     int s, i, j;
 298     const int dec_count= w==8 ? 3 : 4;
 299     int tmp[32*32];
 300     int level, ori;
 301     static const int scale[2][2][4][4]={
 302       {
 303         {
 304             // 9/7 8x8 dec=3
 305             {268, 239, 239, 213},
 306             {  0, 224, 224, 152},
 307             {  0, 135, 135, 110},
 308         },{
 309             // 9/7 16x16 or 32x32 dec=4
 310             {344, 310, 310, 280},
 311             {  0, 320, 320, 228},
 312             {  0, 175, 175, 136},
 313             {  0, 129, 129, 102},
 314         }
 315       },{
 316         {
 317             // 5/3 8x8 dec=3
 318             {275, 245, 245, 218},
 319             {  0, 230, 230, 156},
 320             {  0, 138, 138, 113},
 321         },{
 322             // 5/3 16x16 or 32x32 dec=4
 323             {352, 317, 317, 286},
 324             {  0, 328, 328, 233},
 325             {  0, 180, 180, 140},
 326             {  0, 132, 132, 105},
 327         }
 328       }
 329     };
 330
 331     for (i = 0; i < h; i++) {
 332         for (j = 0; j < w; j+=4) {
 333             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 334             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 335             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 336             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 337         }
 338         pix1 += line_size;
 339         pix2 += line_size;
 340     }
 341
 342     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 343
 344     s=0;
 345     assert(w==h);
 346     for(level=0; level<dec_count; level++){
 347         for(ori= level ? 1 : 0; ori<4; ori++){
 348             int size= w>>(dec_count-level);
 349             int sx= (ori&1) ? size : 0;
 350             int stride= 32<<(dec_count-level);
 351             int sy= (ori&2) ? stride>>1 : 0;
 352
 353             for(i=0; i<size; i++){
 354                 for(j=0; j<size; j++){
 355                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 356                     s += ABS(v);
 357                 }
 358             }
 359         }
 360     }
 361     assert(s>=0);
 362     return s>>9;
 363 }
 364
 365 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 366     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 367 }
 368
 369 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 370     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 371 }
 372
 373 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 374     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 375 }
 376
 377 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 378     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 379 }
 380
 381 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 382     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 383 }
 384
 385 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 386     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 387 }
 388 #endif
 389
 390 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 391 {
 392     int i;
 393
 394     /* read the pixels */
 395     for(i=0;i<8;i++) {
 396         block[0] = pixels[0];
 397         block[1] = pixels[1];
 398         block[2] = pixels[2];
 399         block[3] = pixels[3];
 400         block[4] = pixels[4];
 401         block[5] = pixels[5];
 402         block[6] = pixels[6];
 403         block[7] = pixels[7];
 404         pixels += line_size;
 405         block += 8;
 406     }
 407 }
 408
 409 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 410                           const uint8_t *s2, int stride){
 411     int i;
 412
 413     /* read the pixels */
 414     for(i=0;i<8;i++) {
 415         block[0] = s1[0] - s2[0];
 416         block[1] = s1[1] - s2[1];
 417         block[2] = s1[2] - s2[2];
 418         block[3] = s1[3] - s2[3];
 419         block[4] = s1[4] - s2[4];
 420         block[5] = s1[5] - s2[5];
 421         block[6] = s1[6] - s2[6];
 422         block[7] = s1[7] - s2[7];
 423         s1 += stride;
 424         s2 += stride;
 425         block += 8;
 426     }
 427 }
 428
 429
 430 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 431                                  int line_size)
 432 {
 433     int i;
 434     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 435
 436     /* read the pixels */
 437     for(i=0;i<8;i++) {
 438         pixels[0] = cm[block[0]];
 439         pixels[1] = cm[block[1]];
 440         pixels[2] = cm[block[2]];
 441         pixels[3] = cm[block[3]];
 442         pixels[4] = cm[block[4]];
 443         pixels[5] = cm[block[5]];
 444         pixels[6] = cm[block[6]];
 445         pixels[7] = cm[block[7]];
 446
 447         pixels += line_size;
 448         block += 8;
 449     }
 450 }
 451
 452 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 453                                  int line_size)
 454 {
 455     int i;
 456     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 457
 458     /* read the pixels */
 459     for(i=0;i<4;i++) {
 460         pixels[0] = cm[block[0]];
 461         pixels[1] = cm[block[1]];
 462         pixels[2] = cm[block[2]];
 463         pixels[3] = cm[block[3]];
 464
 465         pixels += line_size;
 466         block += 8;
 467     }
 468 }
 469
 470 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 471                                  int line_size)
 472 {
 473     int i;
 474     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 475
 476     /* read the pixels */
 477     for(i=0;i<2;i++) {
 478         pixels[0] = cm[block[0]];
 479         pixels[1] = cm[block[1]];
 480
 481         pixels += line_size;
 482         block += 8;
 483     }
 484 }
 485
 486 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 487                                         uint8_t *restrict pixels,
 488                                         int line_size)
 489 {
 490     int i, j;
 491
 492     for (i = 0; i < 8; i++) {
 493         for (j = 0; j < 8; j++) {
 494             if (*block < -128)
 495                 *pixels = 0;
 496             else if (*block > 127)
 497                 *pixels = 255;
 498             else
 499                 *pixels = (uint8_t)(*block + 128);
 500             block++;
 501             pixels++;
 502         }
 503         pixels += (line_size - 8);
 504     }
 505 }
 506
 507 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 508                           int line_size)
 509 {
 510     int i;
 511     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 512
 513     /* read the pixels */
 514     for(i=0;i<8;i++) {
 515         pixels[0] = cm[pixels[0] + block[0]];
 516         pixels[1] = cm[pixels[1] + block[1]];
 517         pixels[2] = cm[pixels[2] + block[2]];
 518         pixels[3] = cm[pixels[3] + block[3]];
 519         pixels[4] = cm[pixels[4] + block[4]];
 520         pixels[5] = cm[pixels[5] + block[5]];
 521         pixels[6] = cm[pixels[6] + block[6]];
 522         pixels[7] = cm[pixels[7] + block[7]];
 523         pixels += line_size;
 524         block += 8;
 525     }
 526 }
 527
 528 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 529                           int line_size)
 530 {
 531     int i;
 532     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 533
 534     /* read the pixels */
 535     for(i=0;i<4;i++) {
 536         pixels[0] = cm[pixels[0] + block[0]];
 537         pixels[1] = cm[pixels[1] + block[1]];
 538         pixels[2] = cm[pixels[2] + block[2]];
 539         pixels[3] = cm[pixels[3] + block[3]];
 540         pixels += line_size;
 541         block += 8;
 542     }
 543 }
 544
 545 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 546                           int line_size)
 547 {
 548     int i;
 549     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 550
 551     /* read the pixels */
 552     for(i=0;i<2;i++) {
 553         pixels[0] = cm[pixels[0] + block[0]];
 554         pixels[1] = cm[pixels[1] + block[1]];
 555         pixels += line_size;
 556         block += 8;
 557     }
 558 }
 559
 560 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 561 {
 562     int i;
 563     for(i=0;i<8;i++) {
 564         pixels[0] += block[0];
 565         pixels[1] += block[1];
 566         pixels[2] += block[2];
 567         pixels[3] += block[3];
 568         pixels[4] += block[4];
 569         pixels[5] += block[5];
 570         pixels[6] += block[6];
 571         pixels[7] += block[7];
 572         pixels += line_size;
 573         block += 8;
 574     }
 575 }
 576
 577 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 578 {
 579     int i;
 580     for(i=0;i<4;i++) {
 581         pixels[0] += block[0];
 582         pixels[1] += block[1];
 583         pixels[2] += block[2];
 584         pixels[3] += block[3];
 585         pixels += line_size;
 586         block += 4;
 587     }
 588 }
 589
 590 #if 0
 591
/*
 * Compiled-out variant of PIXOP2 (this region sits under "#if 0").
 *
 * For a given OPNAME prefix and store operation OP(dst, value) it generates
 * 8-pixel-wide block helpers that handle one row with a single 64-bit access
 * (LD64 load, uint64_t store):
 *   - _pixels              : plain OP of the source row
 *   - _pixels_x2 / _y2     : per-byte rounded-up average of the source with
 *                            its right / lower neighbour
 *   - _no_rnd_pixels_x2/_y2: same, with the average rounded down
 *   - _pixels_xy2 and _no_rnd_pixels_xy2: four-sample average of a 2x2
 *                            neighbourhood, with a per-byte bias of 2
 *                            (rounding) or 1 (no_rnd); two output rows are
 *                            produced per loop iteration (i += 2)
 * The trailing CALL_2X_PIXELS lines build the 16-wide versions from the
 * 8-wide ones.
 *
 * NOTE(review): the first generated function is named OPNAME##_pixels, while
 * the CALL_2X_PIXELS list refers to OPNAME##_pixels_c; the names would need
 * reconciling before this branch could be enabled.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* 64-bit "average" store op: replaces a with the per-byte rounded-up average
 * of a and b, treating both as eight packed bytes. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 732 #else // 64 bit variant
 733
 734 #define PIXOP2(OPNAME, OP) \
 735 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 736     int i;\
 737     for(i=0; i<h; i++){\
 738         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 739         pixels+=line_size;\
 740         block +=line_size;\
 741     }\
 742 }\
 743 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 744     int i;\
 745     for(i=0; i<h; i++){\
 746         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 747         pixels+=line_size;\
 748         block +=line_size;\
 749     }\
 750 }\
 751 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 752     int i;\
 753     for(i=0; i<h; i++){\
 754         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 755         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 756         pixels+=line_size;\
 757         block +=line_size;\
 758     }\
 759 }\
 760 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 761     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 762 }\
 763 \
 764 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 765                                                 int src_stride1, int src_stride2, int h){\
 766     int i;\
 767     for(i=0; i<h; i++){\
 768         uint32_t a,b;\
 769         a= LD32(&src1[i*src_stride1  ]);\
 770         b= LD32(&src2[i*src_stride2  ]);\
 771         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 772         a= LD32(&src1[i*src_stride1+4]);\
 773         b= LD32(&src2[i*src_stride2+4]);\
 774         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 775     }\
 776 }\
 777 \
 778 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 779                                                 int src_stride1, int src_stride2, int h){\
 780     int i;\
 781     for(i=0; i<h; i++){\
 782         uint32_t a,b;\
 783         a= LD32(&src1[i*src_stride1  ]);\
 784         b= LD32(&src2[i*src_stride2  ]);\
 785         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 786         a= LD32(&src1[i*src_stride1+4]);\
 787         b= LD32(&src2[i*src_stride2+4]);\
 788         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 789     }\
 790 }\
 791 \
 792 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 793                                                 int src_stride1, int src_stride2, int h){\
 794     int i;\
 795     for(i=0; i<h; i++){\
 796         uint32_t a,b;\
 797         a= LD32(&src1[i*src_stride1  ]);\
 798         b= LD32(&src2[i*src_stride2  ]);\
 799         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 800     }\
 801 }\
 802 \
 803 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 804                                                 int src_stride1, int src_stride2, int h){\
 805     int i;\
 806     for(i=0; i<h; i++){\
 807         uint32_t a,b;\
 808         a= LD16(&src1[i*src_stride1  ]);\
 809         b= LD16(&src2[i*src_stride2  ]);\
 810         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 811     }\
 812 }\
 813 \
 814 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 815                                                 int src_stride1, int src_stride2, int h){\
 816     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 817     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 818 }\
 819 \
 820 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 821                                                 int src_stride1, int src_stride2, int h){\
 822     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 823     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 824 }\
 825 \
 826 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 827     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 828 }\
 829 \
 830 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 831     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 832 }\
 833 \
 834 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 835     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 836 }\
 837 \
 838 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 839     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 840 }\
 841 \
 842 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 843                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 844     int i;\
 845     for(i=0; i<h; i++){\
 846         uint32_t a, b, c, d, l0, l1, h0, h1;\
 847         a= LD32(&src1[i*src_stride1]);\
 848         b= LD32(&src2[i*src_stride2]);\
 849         c= LD32(&src3[i*src_stride3]);\
 850         d= LD32(&src4[i*src_stride4]);\
 851         l0=  (a&0x03030303UL)\
 852            + (b&0x03030303UL)\
 853            + 0x02020202UL;\
 854         h0= ((a&0xFCFCFCFCUL)>>2)\
 855           + ((b&0xFCFCFCFCUL)>>2);\
 856         l1=  (c&0x03030303UL)\
 857            + (d&0x03030303UL);\
 858         h1= ((c&0xFCFCFCFCUL)>>2)\
 859           + ((d&0xFCFCFCFCUL)>>2);\
 860         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 861         a= LD32(&src1[i*src_stride1+4]);\
 862         b= LD32(&src2[i*src_stride2+4]);\
 863         c= LD32(&src3[i*src_stride3+4]);\
 864         d= LD32(&src4[i*src_stride4+4]);\
 865         l0=  (a&0x03030303UL)\
 866            + (b&0x03030303UL)\
 867            + 0x02020202UL;\
 868         h0= ((a&0xFCFCFCFCUL)>>2)\
 869           + ((b&0xFCFCFCFCUL)>>2);\
 870         l1=  (c&0x03030303UL)\
 871            + (d&0x03030303UL);\
 872         h1= ((c&0xFCFCFCFCUL)>>2)\
 873           + ((d&0xFCFCFCFCUL)>>2);\
 874         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 875     }\
 876 }\
 877 \
 878 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 879     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 880 }\
 881 \
 882 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 883     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 884 }\
 885 \
 886 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 887     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 888 }\
 889 \
 890 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 891     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 892 }\
 893 \
 894 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 895                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 896     int i;\
 897     for(i=0; i<h; i++){\
 898         uint32_t a, b, c, d, l0, l1, h0, h1;\
 899         a= LD32(&src1[i*src_stride1]);\
 900         b= LD32(&src2[i*src_stride2]);\
 901         c= LD32(&src3[i*src_stride3]);\
 902         d= LD32(&src4[i*src_stride4]);\
 903         l0=  (a&0x03030303UL)\
 904            + (b&0x03030303UL)\
 905            + 0x01010101UL;\
 906         h0= ((a&0xFCFCFCFCUL)>>2)\
 907           + ((b&0xFCFCFCFCUL)>>2);\
 908         l1=  (c&0x03030303UL)\
 909            + (d&0x03030303UL);\
 910         h1= ((c&0xFCFCFCFCUL)>>2)\
 911           + ((d&0xFCFCFCFCUL)>>2);\
 912         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 913         a= LD32(&src1[i*src_stride1+4]);\
 914         b= LD32(&src2[i*src_stride2+4]);\
 915         c= LD32(&src3[i*src_stride3+4]);\
 916         d= LD32(&src4[i*src_stride4+4]);\
 917         l0=  (a&0x03030303UL)\
 918            + (b&0x03030303UL)\
 919            + 0x01010101UL;\
 920         h0= ((a&0xFCFCFCFCUL)>>2)\
 921           + ((b&0xFCFCFCFCUL)>>2);\
 922         l1=  (c&0x03030303UL)\
 923            + (d&0x03030303UL);\
 924         h1= ((c&0xFCFCFCFCUL)>>2)\
 925           + ((d&0xFCFCFCFCUL)>>2);\
 926         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 927     }\
 928 }\
 929 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 930                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 931     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 932     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 933 }\
 934 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 935                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 936     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 937     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 938 }\
 939 \
 940 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 941 {\
 942         int i, a0, b0, a1, b1;\
 943         a0= pixels[0];\
 944         b0= pixels[1] + 2;\
 945         a0 += b0;\
 946         b0 += pixels[2];\
 947 \
 948         pixels+=line_size;\
 949         for(i=0; i<h; i+=2){\
 950             a1= pixels[0];\
 951             b1= pixels[1];\
 952             a1 += b1;\
 953             b1 += pixels[2];\
 954 \
 955             block[0]= (a1+a0)>>2; /* FIXME non put */\
 956             block[1]= (b1+b0)>>2;\
 957 \
 958             pixels+=line_size;\
 959             block +=line_size;\
 960 \
 961             a0= pixels[0];\
 962             b0= pixels[1] + 2;\
 963             a0 += b0;\
 964             b0 += pixels[2];\
 965 \
 966             block[0]= (a1+a0)>>2;\
 967             block[1]= (b1+b0)>>2;\
 968             pixels+=line_size;\
 969             block +=line_size;\
 970         }\
 971 }\
 972 \
 973 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 974 {\
 975         int i;\
 976         const uint32_t a= LD32(pixels  );\
 977         const uint32_t b= LD32(pixels+1);\
 978         uint32_t l0=  (a&0x03030303UL)\
 979                     + (b&0x03030303UL)\
 980                     + 0x02020202UL;\
 981         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 982                    + ((b&0xFCFCFCFCUL)>>2);\
 983         uint32_t l1,h1;\
 984 \
 985         pixels+=line_size;\
 986         for(i=0; i<h; i+=2){\
 987             uint32_t a= LD32(pixels  );\
 988             uint32_t b= LD32(pixels+1);\
 989             l1=  (a&0x03030303UL)\
 990                + (b&0x03030303UL);\
 991             h1= ((a&0xFCFCFCFCUL)>>2)\
 992               + ((b&0xFCFCFCFCUL)>>2);\
 993             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 994             pixels+=line_size;\
 995             block +=line_size;\
 996             a= LD32(pixels  );\
 997             b= LD32(pixels+1);\
 998             l0=  (a&0x03030303UL)\
 999                + (b&0x03030303UL)\
1000                + 0x02020202UL;\
1001             h0= ((a&0xFCFCFCFCUL)>>2)\
1002               + ((b&0xFCFCFCFCUL)>>2);\
1003             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1004             pixels+=line_size;\
1005             block +=line_size;\
1006         }\
1007 }\
1008 \
1009 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1010 {\
1011     int j;\
1012     for(j=0; j<2; j++){\
1013         int i;\
1014         const uint32_t a= LD32(pixels  );\
1015         const uint32_t b= LD32(pixels+1);\
1016         uint32_t l0=  (a&0x03030303UL)\
1017                     + (b&0x03030303UL)\
1018                     + 0x02020202UL;\
1019         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1020                    + ((b&0xFCFCFCFCUL)>>2);\
1021         uint32_t l1,h1;\
1022 \
1023         pixels+=line_size;\
1024         for(i=0; i<h; i+=2){\
1025             uint32_t a= LD32(pixels  );\
1026             uint32_t b= LD32(pixels+1);\
1027             l1=  (a&0x03030303UL)\
1028                + (b&0x03030303UL);\
1029             h1= ((a&0xFCFCFCFCUL)>>2)\
1030               + ((b&0xFCFCFCFCUL)>>2);\
1031             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1032             pixels+=line_size;\
1033             block +=line_size;\
1034             a= LD32(pixels  );\
1035             b= LD32(pixels+1);\
1036             l0=  (a&0x03030303UL)\
1037                + (b&0x03030303UL)\
1038                + 0x02020202UL;\
1039             h0= ((a&0xFCFCFCFCUL)>>2)\
1040               + ((b&0xFCFCFCFCUL)>>2);\
1041             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1042             pixels+=line_size;\
1043             block +=line_size;\
1044         }\
1045         pixels+=4-line_size*(h+1);\
1046         block +=4-line_size*h;\
1047     }\
1048 }\
1049 \
1050 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1051 {\
1052     int j;\
1053     for(j=0; j<2; j++){\
1054         int i;\
1055         const uint32_t a= LD32(pixels  );\
1056         const uint32_t b= LD32(pixels+1);\
1057         uint32_t l0=  (a&0x03030303UL)\
1058                     + (b&0x03030303UL)\
1059                     + 0x01010101UL;\
1060         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1061                    + ((b&0xFCFCFCFCUL)>>2);\
1062         uint32_t l1,h1;\
1063 \
1064         pixels+=line_size;\
1065         for(i=0; i<h; i+=2){\
1066             uint32_t a= LD32(pixels  );\
1067             uint32_t b= LD32(pixels+1);\
1068             l1=  (a&0x03030303UL)\
1069                + (b&0x03030303UL);\
1070             h1= ((a&0xFCFCFCFCUL)>>2)\
1071               + ((b&0xFCFCFCFCUL)>>2);\
1072             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1073             pixels+=line_size;\
1074             block +=line_size;\
1075             a= LD32(pixels  );\
1076             b= LD32(pixels+1);\
1077             l0=  (a&0x03030303UL)\
1078                + (b&0x03030303UL)\
1079                + 0x01010101UL;\
1080             h0= ((a&0xFCFCFCFCUL)>>2)\
1081               + ((b&0xFCFCFCFCUL)>>2);\
1082             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1083             pixels+=line_size;\
1084             block +=line_size;\
1085         }\
1086         pixels+=4-line_size*(h+1);\
1087         block +=4-line_size*h;\
1088     }\
1089 }\
1090 \
1091 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1092 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1093 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1094 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1095 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1096 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1098 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1099
1100 #define op_avg(a, b) a = rnd_avg32(a, b)
1101 #endif
1102 #define op_put(a, b) a = b
1103
1104 PIXOP2(avg, op_avg)
1105 PIXOP2(put, op_put)
1106 #undef op_avg
1107 #undef op_put
1108
/* Rounded-up integer averages of two and of four values.
 * Every argument is parenthesized so that an operand containing an operator
 * of lower precedence than '+' (e.g. '|', '&', '?:', shifts) is still
 * averaged as a whole. Arguments are evaluated exactly once each. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1111
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the one stride
 * argument is used for the destination and for both sources.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride  = stride;
    const int src_stride1 = stride;
    const int src_stride2 = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_stride, src_stride1, src_stride2, h);
}
1115
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the one stride
 * argument is used for the destination and for both sources.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride  = stride;
    const int src_stride1 = stride;
    const int src_stride2 = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_stride, src_stride1, src_stride2, h);
}
1119
/**
 * Bilinear blend of an 8 pixel wide, h row high block.
 *
 * Each output pixel is a weighted sum of the source pixel, its right
 * neighbour, and the two pixels one stride below them. The four weights are
 * products of (16-x16 | x16) and (16-y16 | y16), so they add up to 256; the
 * sum plus rounder is shifted right by 8.
 *
 * Reads 9 columns and h+1 rows from src. dst and src advance by the same
 * stride.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int inv_x = 16 - x16;
    const int inv_y = 16 - y16;
    const int w_tl  = inv_x * inv_y;   /* current pixel      */
    const int w_tr  = x16   * inv_y;   /* right neighbour    */
    const int w_bl  = inv_x * y16;     /* pixel below        */
    const int w_br  = x16   * y16;     /* below-right pixel  */
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]   + w_tr * src[col + 1]
                      + w_bl * below[col] + w_br * below[col + 1] + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1142
/**
 * Warped fetch of an 8 pixel wide, h row high block with bilinear filtering.
 *
 * For every output pixel a source position (vx, vy) is tracked: it starts at
 * (ox, oy) for the row, advances by (dxx, dyx) per column, and the row start
 * advances by (dxy, dyy) per row. The low 16 bits of the position are
 * dropped; of the remainder, the low `shift` bits are the fractional weight
 * and the bits above are the integer sample coordinate.
 *
 * Coordinates outside [0, width-1) horizontally or [0, height-1) vertically
 * are clamped with clip() and interpolation is skipped along that axis, so a
 * pixel outside on both axes is a plain copy of the clamped sample.
 *
 * r is added before the final right shift by 2*shift.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    const int max_x     = width  - 1;
    const int max_y     = height - 1;
    int row;

    for (row = 0; row < h; row++, ox += dxy, oy += dyy) {
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++, vx += dxx, vy += dyx) { //XXX FIXME optimize
            const int pos_x  = vx >> 16;
            const int pos_y  = vy >> 16;
            const int frac_x = pos_x & frac_mask;
            const int frac_y = pos_y & frac_mask;
            const int src_x  = pos_x >> shift;
            const int src_y  = pos_y >> shift;
            /* the unsigned compare rejects negative coordinates as well */
            const int in_x   = (unsigned)src_x < (unsigned)max_x;
            const int in_y   = (unsigned)src_y < (unsigned)max_y;
            uint8_t *out     = &dst[row*stride + col];
            int index;

            if (in_x && in_y) {
                /* fully inside: interpolate along both axes */
                index = src_x + src_y*stride;
                *out = (  (  src[index         ]*(s-frac_x)
                           + src[index       +1]*   frac_x )*(s-frac_y)
                        + (  src[index+stride  ]*(s-frac_x)
                           + src[index+stride+1]*   frac_x )*   frac_y
                        + r) >> (shift*2);
            } else if (in_x) {
                /* row clamped: horizontal interpolation only */
                index = src_x + clip(src_y, 0, max_y)*stride;
                *out = ( (  src[index         ]*(s-frac_x)
                          + src[index       +1]*   frac_x )*s
                        + r) >> (shift*2);
            } else if (in_y) {
                /* column clamped: vertical interpolation only */
                index = clip(src_x, 0, max_x) + src_y*stride;
                *out = (  (  src[index         ]*(s-frac_y)
                           + src[index+stride  ]*   frac_y )*s
                        + r) >> (shift*2);
            } else {
                /* clamped on both axes: copy the nearest sample */
                index = clip(src_x, 0, max_x) + clip(src_y, 0, max_y)*stride;
                *out = src[index];
            }
        }
    }
}
1200
/**
 * Unfiltered block copy: forwards to the put_pixels{2,4,8,16}_c routine that
 * matches width. Any other width does nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1209
/**
 * Horizontal blend, weights 2:1 for src[x] : src[x+1].
 * The division by 3 is done as *683 >> 11 (683/2048 is about 1/3).
 * Reads one column to the right of the width x height block.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int sum = 2*src[x] + src[x+1] + 1;
            dst[x] = (683*sum) >> 11;
        }
    }
}
1220
/**
 * Horizontal blend, weights 1:2 for src[x] : src[x+1].
 * The division by 3 is done as *683 >> 11 (683/2048 is about 1/3).
 * Reads one column to the right of the width x height block.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int sum = src[x] + 2*src[x+1] + 1;
            dst[x] = (683*sum) >> 11;
        }
    }
}
1231
/**
 * Vertical blend, weights 2:1 for src[x] : src[x+stride].
 * The division by 3 is done as *683 >> 11 (683/2048 is about 1/3).
 * Reads one row below the width x height block.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int sum = 2*src[x] + src[x+stride] + 1;
            dst[x] = (683*sum) >> 11;
        }
    }
}
1242
/**
 * 2-D blend of a 2x2 neighbourhood with weights
 *     4 3
 *     3 2
 * (sum 12). The division by 12 is done as *2731 >> 15 after adding 6.
 * Reads one column to the right of and one row below the block.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int sum = 4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6;
            dst[x] = (2731*sum) >> 15;
        }
    }
}
1253
/**
 * 2-D blend of a 2x2 neighbourhood with weights
 *     3 2
 *     4 3
 * (sum 12). The division by 12 is done as *2731 >> 15 after adding 6.
 * Reads one column to the right of and one row below the block.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int sum = 3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6;
            dst[x] = (2731*sum) >> 15;
        }
    }
}
1264
/**
 * Vertical blend, weights 1:2 for src[x] : src[x+stride].
 * The division by 3 is done as *683 >> 11 (683/2048 is about 1/3).
 * Reads one row below the width x height block.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int sum = src[x] + 2*src[x+stride] + 1;
            dst[x] = (683*sum) >> 11;
        }
    }
}
1275
/**
 * 2-D blend of a 2x2 neighbourhood with weights
 *     3 4
 *     2 3
 * (sum 12). The division by 12 is done as *2731 >> 15 after adding 6.
 * Reads one column to the right of and one row below the block.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int sum = 3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6;
            dst[x] = (2731*sum) >> 15;
        }
    }
}
1286
/**
 * 2-D blend of a 2x2 neighbourhood with weights
 *     2 3
 *     3 4
 * (sum 12). The division by 12 is done as *2731 >> 15 after adding 6.
 * Reads one column to the right of and one row below the block.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int sum = 2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6;
            dst[x] = (2731*sum) >> 15;
        }
    }
}
1297
/**
 * Unfiltered averaging: forwards to the avg_pixels{2,4,8,16}_c routine that
 * matches width. Any other width does nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1306
/**
 * Horizontal 2:1 blend of src[x] : src[x+1] (divided by 3 via *683 >> 11),
 * then averaged with the value already in dst, rounding up.
 * Reads one column to the right of the width x height block.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (683*(2*src[x] + src[x+1] + 1)) >> 11;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1317
/**
 * Horizontal 1:2 blend of src[x] : src[x+1] (divided by 3 via *683 >> 11),
 * then averaged with the value already in dst, rounding up.
 * Reads one column to the right of the width x height block.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (683*(src[x] + 2*src[x+1] + 1)) >> 11;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1328
/**
 * Vertical 2:1 blend of src[x] : src[x+stride] (divided by 3 via
 * *683 >> 11), then averaged with the value already in dst, rounding up.
 * Reads one row below the width x height block.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (683*(2*src[x] + src[x+stride] + 1)) >> 11;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1339
/**
 * 2-D blend of a 2x2 neighbourhood with weights
 *     4 3
 *     3 2
 * (divided by 12 via +6, *2731 >> 15), then averaged with the value already
 * in dst, rounding up.
 * Reads one column to the right of and one row below the block.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (2731*(4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1350
/**
 * 2-D blend of a 2x2 neighbourhood with weights
 *     3 2
 *     4 3
 * (divided by 12 via +6, *2731 >> 15), then averaged with the value already
 * in dst, rounding up.
 * Reads one column to the right of and one row below the block.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (2731*(3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1361
/**
 * Vertical 1:2 blend of src[x] : src[x+stride] (divided by 3 via
 * *683 >> 11), then averaged with the value already in dst, rounding up.
 * Reads one row below the width x height block.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (683*(src[x] + 2*src[x+stride] + 1)) >> 11;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1372
/**
 * 2-D blend of a 2x2 neighbourhood with weights
 *     3 4
 *     2 3
 * (divided by 12 via +6, *2731 >> 15), then averaged with the value already
 * in dst, rounding up.
 * Reads one column to the right of and one row below the block.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (2731*(3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1383
/**
 * 2-D blend of a 2x2 neighbourhood with weights
 *     2 3
 *     3 4
 * (divided by 12 via +6, *2731 >> 15), then averaged with the value already
 * in dst, rounding up.
 * Reads one column to the right of and one row below the block.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (2731*(2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
#if 0
/* Disabled. TPEL_WIDTH(width) would generate fixed-width put_tpel_pixels
 * wrappers (one per mcXY position) that forward to the generic
 * put_tpel_pixels_mcXY_c() routines with the width baked in.
 * Each body is a plain call; a leading `void` would turn it into an
 * ill-formed declaration and break the build as soon as this is enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1415
/**
 * Generates OPNAME h264_chroma_mc{2,4,8}_c(): bilinear blends of a 2, 4 or
 * 8 pixel wide block, h rows high.
 *
 * x and y must be in 0..7 (asserted). The four weights are products of
 * (8-x | x) and (8-y | y) and add up to 64. The raw weighted sum is handed
 * to OP, which is responsible for rounding, scaling and storing.
 * Each function reads one column to the right of and one row below the block.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<2; col++){\
            OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<4; col++){\
            OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<8; col++){\
            OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}

/* b is the weighted sum scaled by 64: add 32 and shift by 6 to round it back
 * to pixel range; the avg form then averages with the old dst value,
 * rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1486
/**
 * Copies a block 2 bytes wide and h rows high, one 16-bit LD16/ST16
 * transfer per row. Source and destination have independent strides.
 */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST16(dst, LD16(src));
    }
}
1497
/**
 * Copies a block 4 bytes wide and h rows high, one 32-bit LD32/ST32
 * transfer per row. Source and destination have independent strides.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST32(dst, LD32(src));
    }
}
1508
/**
 * Copies a block 8 bytes wide and h rows high as two 32-bit LD32/ST32
 * transfers per row. Source and destination have independent strides.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
1520
/**
 * Copies a block 16 bytes wide and h rows high as four 32-bit LD32/ST32
 * transfers per row. Source and destination have independent strides.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
1534
/**
 * Copies a block 17 bytes wide and h rows high: four 32-bit LD32/ST32
 * transfers per row followed by a single byte for the 17th column.
 * Source and destination have independent strides.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[16] = src[16];   /* odd trailing column */
    }
}
1549
/**
 * Copies a block 9 bytes wide and h rows high: two 32-bit LD32/ST32
 * transfers per row followed by a single byte for the 9th column.
 * Source and destination have independent strides.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[8] = src[8];   /* odd trailing column */
    }
}
1562
1563
/**
 * Generates the C implementations of the mpeg4_qpel* lowpass filters and the
 * qpel8 / qpel16 motion compensation functions built on top of them.
 *
 * Macro parameters:
 *   r      - not referenced anywhere in the macro body.
 *   OPNAME - prefix pasted onto every generated function name (for example
 *            OPNAME ## qpel8_mc20_c). It also selects the
 *            OPNAME ## pixelsN_c, _l2 and _l4 helpers that write the final
 *            result to dst.
 *   RND    - token pasted between "put" and a helper name
 *            (put ## RND ## mpeg4_qpel8_h_lowpass, put ## RND ## pixels8_l2)
 *            to select the "put" flavour that fills intermediate buffers.
 *   OP     - OP(sample, sum): stores one filtered sample. It is handed the
 *            unnormalised filter sum. Every lowpass function declares a
 *            local `cm` (cropTbl + MAX_NEG_CROP) that OP may use.
 *
 * Lowpass filters (OPNAME ## mpeg4_qpel8/16_h/v_lowpass):
 *   Output i is
 *     (s[i]+s[i+1])*20 - (s[i-1]+s[i+2])*6 + (s[i-2]+s[i+3])*3 - (s[i-3]+s[i+4])
 *   that is, taps (-1, 3, -6, 20, 20, -6, 3, -1), which add up to 32.
 *   Only s[0..8] (8-wide) or s[0..16] (16-wide) are read: near both ends the
 *   taps that would fall outside that range are replaced by mirrored
 *   in-range samples, as the explicit indices show.
 *   The _h_ variants filter along a row and process h rows. The _v_ variants
 *   filter along a column (samples srcStride apart) and always process 8 or
 *   16 columns.
 *
 * Motion compensation functions (OPNAME ## qpel8/16_mcXY_c):
 *   Judging by the bodies, X is the horizontal and Y the vertical
 *   quarter-sample phase (0..3). mc00 only forwards to OPNAME ## pixelsN_c,
 *   mc20 and mc02 apply just the horizontal or vertical filter, and the odd
 *   phases combine a filtered plane with a neighbouring plane through the
 *   pixelsN_l2 helper (presumably a two-source average; it is defined
 *   outside this macro, so confirm there).
 *   Every case with Y != 0, except the X == 2 ones, first copies the source
 *   into the local `full` buffer with copy_block9 / copy_block17 (row stride
 *   16 / 24) and works from that copy; the X == 2 cases filter src directly.
 *   Buffer sizes: full is 9 rows of stride 16 (or 17 rows of stride 24),
 *   halfH is 9x8 (or 17x16) because the horizontal filter is run on one row
 *   more than the block height, and half / halfV / halfHV are 8x8 (16x16).
 *
 * The non-static ff_ ## OPNAME ## qpel8/16_mcXY_old_c functions are
 * alternative versions of the mc11, mc31, mc13, mc33, mc12 and mc32 cases.
 * They keep separate halfH, halfV and halfHV planes and combine them at the
 * end (pixelsN_l4 for the four corner cases, pixelsN_l2 of halfV and halfHV
 * for mc12 / mc32), whereas the static versions first blend halfH with
 * `full` in place and then run the vertical filter on the blended plane.
 */
1564 #define QPEL_MC(r, OPNAME, RND, OP) \
1565 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1566     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1567     int i;\
1568     for(i=0; i<h; i++)\
1569     {\
1570         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1571         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1572         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1573         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1574         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1575         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1576         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1577         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1578         dst+=dstStride;\
1579         src+=srcStride;\
1580     }\
1581 }\
1582 \
1583 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1584     const int w=8;\
1585     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1586     int i;\
1587     for(i=0; i<w; i++)\
1588     {\
1589         const int src0= src[0*srcStride];\
1590         const int src1= src[1*srcStride];\
1591         const int src2= src[2*srcStride];\
1592         const int src3= src[3*srcStride];\
1593         const int src4= src[4*srcStride];\
1594         const int src5= src[5*srcStride];\
1595         const int src6= src[6*srcStride];\
1596         const int src7= src[7*srcStride];\
1597         const int src8= src[8*srcStride];\
1598         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1599         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1600         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1601         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1602         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1603         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1604         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1605         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1606         dst++;\
1607         src++;\
1608     }\
1609 }\
1610 \
1611 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1612     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1613     int i;\
1614     \
1615     for(i=0; i<h; i++)\
1616     {\
1617         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1618         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1619         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1620         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1621         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1622         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1623         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1624         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1625         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1626         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1627         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1628         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1629         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1630         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1631         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1632         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1633         dst+=dstStride;\
1634         src+=srcStride;\
1635     }\
1636 }\
1637 \
1638 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1639     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1640     int i;\
1641     const int w=16;\
1642     for(i=0; i<w; i++)\
1643     {\
1644         const int src0= src[0*srcStride];\
1645         const int src1= src[1*srcStride];\
1646         const int src2= src[2*srcStride];\
1647         const int src3= src[3*srcStride];\
1648         const int src4= src[4*srcStride];\
1649         const int src5= src[5*srcStride];\
1650         const int src6= src[6*srcStride];\
1651         const int src7= src[7*srcStride];\
1652         const int src8= src[8*srcStride];\
1653         const int src9= src[9*srcStride];\
1654         const int src10= src[10*srcStride];\
1655         const int src11= src[11*srcStride];\
1656         const int src12= src[12*srcStride];\
1657         const int src13= src[13*srcStride];\
1658         const int src14= src[14*srcStride];\
1659         const int src15= src[15*srcStride];\
1660         const int src16= src[16*srcStride];\
1661         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1662         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1663         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1664         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1665         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1666         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1667         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1668         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1669         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1670         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1671         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1672         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1673         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1674         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1675         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1676         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1677         dst++;\
1678         src++;\
1679     }\
1680 }\
1681 \
1682 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1683     OPNAME ## pixels8_c(dst, src, stride, 8);\
1684 }\
1685 \
1686 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1687     uint8_t half[64];\
1688     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1689     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1690 }\
1691 \
1692 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1693     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1694 }\
1695 \
1696 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1697     uint8_t half[64];\
1698     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1699     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1700 }\
1701 \
1702 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1703     uint8_t full[16*9];\
1704     uint8_t half[64];\
1705     copy_block9(full, src, 16, stride, 9);\
1706     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1707     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1708 }\
1709 \
1710 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1711     uint8_t full[16*9];\
1712     copy_block9(full, src, 16, stride, 9);\
1713     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1714 }\
1715 \
1716 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1717     uint8_t full[16*9];\
1718     uint8_t half[64];\
1719     copy_block9(full, src, 16, stride, 9);\
1720     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1721     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1722 }\
1723 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1724     uint8_t full[16*9];\
1725     uint8_t halfH[72];\
1726     uint8_t halfV[64];\
1727     uint8_t halfHV[64];\
1728     copy_block9(full, src, 16, stride, 9);\
1729     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1730     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1731     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1732     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1733 }\
1734 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1735     uint8_t full[16*9];\
1736     uint8_t halfH[72];\
1737     uint8_t halfHV[64];\
1738     copy_block9(full, src, 16, stride, 9);\
1739     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1740     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1741     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1742     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1743 }\
1744 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1745     uint8_t full[16*9];\
1746     uint8_t halfH[72];\
1747     uint8_t halfV[64];\
1748     uint8_t halfHV[64];\
1749     copy_block9(full, src, 16, stride, 9);\
1750     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1751     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1752     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1753     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1754 }\
1755 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1756     uint8_t full[16*9];\
1757     uint8_t halfH[72];\
1758     uint8_t halfHV[64];\
1759     copy_block9(full, src, 16, stride, 9);\
1760     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1761     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1762     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1763     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1764 }\
1765 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1766     uint8_t full[16*9];\
1767     uint8_t halfH[72];\
1768     uint8_t halfV[64];\
1769     uint8_t halfHV[64];\
1770     copy_block9(full, src, 16, stride, 9);\
1771     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1772     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1773     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1774     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1775 }\
1776 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1777     uint8_t full[16*9];\
1778     uint8_t halfH[72];\
1779     uint8_t halfHV[64];\
1780     copy_block9(full, src, 16, stride, 9);\
1781     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1782     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1783     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1784     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1785 }\
1786 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1787     uint8_t full[16*9];\
1788     uint8_t halfH[72];\
1789     uint8_t halfV[64];\
1790     uint8_t halfHV[64];\
1791     copy_block9(full, src, 16, stride, 9);\
1792     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1793     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1794     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1795     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1796 }\
1797 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1798     uint8_t full[16*9];\
1799     uint8_t halfH[72];\
1800     uint8_t halfHV[64];\
1801     copy_block9(full, src, 16, stride, 9);\
1802     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1803     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1804     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1805     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1806 }\
1807 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1808     uint8_t halfH[72];\
1809     uint8_t halfHV[64];\
1810     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1811     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1812     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1813 }\
1814 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1815     uint8_t halfH[72];\
1816     uint8_t halfHV[64];\
1817     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1818     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1819     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1820 }\
1821 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1822     uint8_t full[16*9];\
1823     uint8_t halfH[72];\
1824     uint8_t halfV[64];\
1825     uint8_t halfHV[64];\
1826     copy_block9(full, src, 16, stride, 9);\
1827     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1828     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1829     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1830     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1831 }\
1832 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1833     uint8_t full[16*9];\
1834     uint8_t halfH[72];\
1835     copy_block9(full, src, 16, stride, 9);\
1836     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1837     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1838     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1839 }\
1840 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1841     uint8_t full[16*9];\
1842     uint8_t halfH[72];\
1843     uint8_t halfV[64];\
1844     uint8_t halfHV[64];\
1845     copy_block9(full, src, 16, stride, 9);\
1846     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1847     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1848     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1849     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1850 }\
1851 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1852     uint8_t full[16*9];\
1853     uint8_t halfH[72];\
1854     copy_block9(full, src, 16, stride, 9);\
1855     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1856     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1857     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1858 }\
1859 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1860     uint8_t halfH[72];\
1861     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1862     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1863 }\
1864 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1865     OPNAME ## pixels16_c(dst, src, stride, 16);\
1866 }\
1867 \
1868 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1869     uint8_t half[256];\
1870     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1871     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1872 }\
1873 \
1874 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1875     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1876 }\
1877 \
1878 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1879     uint8_t half[256];\
1880     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1881     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1882 }\
1883 \
1884 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1885     uint8_t full[24*17];\
1886     uint8_t half[256];\
1887     copy_block17(full, src, 24, stride, 17);\
1888     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1889     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1890 }\
1891 \
1892 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1893     uint8_t full[24*17];\
1894     copy_block17(full, src, 24, stride, 17);\
1895     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1896 }\
1897 \
1898 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1899     uint8_t full[24*17];\
1900     uint8_t half[256];\
1901     copy_block17(full, src, 24, stride, 17);\
1902     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1903     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1904 }\
1905 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1906     uint8_t full[24*17];\
1907     uint8_t halfH[272];\
1908     uint8_t halfV[256];\
1909     uint8_t halfHV[256];\
1910     copy_block17(full, src, 24, stride, 17);\
1911     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1912     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1913     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1914     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1915 }\
1916 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1917     uint8_t full[24*17];\
1918     uint8_t halfH[272];\
1919     uint8_t halfHV[256];\
1920     copy_block17(full, src, 24, stride, 17);\
1921     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1922     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1923     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1924     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1925 }\
1926 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1927     uint8_t full[24*17];\
1928     uint8_t halfH[272];\
1929     uint8_t halfV[256];\
1930     uint8_t halfHV[256];\
1931     copy_block17(full, src, 24, stride, 17);\
1932     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1933     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1934     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1935     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1936 }\
1937 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1938     uint8_t full[24*17];\
1939     uint8_t halfH[272];\
1940     uint8_t halfHV[256];\
1941     copy_block17(full, src, 24, stride, 17);\
1942     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1943     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1944     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1945     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1946 }\
1947 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1948     uint8_t full[24*17];\
1949     uint8_t halfH[272];\
1950     uint8_t halfV[256];\
1951     uint8_t halfHV[256];\
1952     copy_block17(full, src, 24, stride, 17);\
1953     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1954     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1955     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1956     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1957 }\
1958 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1959     uint8_t full[24*17];\
1960     uint8_t halfH[272];\
1961     uint8_t halfHV[256];\
1962     copy_block17(full, src, 24, stride, 17);\
1963     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1964     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1965     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1966     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1967 }\
1968 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1969     uint8_t full[24*17];\
1970     uint8_t halfH[272];\
1971     uint8_t halfV[256];\
1972     uint8_t halfHV[256];\
1973     copy_block17(full, src, 24, stride, 17);\
1974     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1975     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1976     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1977     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1978 }\
1979 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1980     uint8_t full[24*17];\
1981     uint8_t halfH[272];\
1982     uint8_t halfHV[256];\
1983     copy_block17(full, src, 24, stride, 17);\
1984     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1985     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1986     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1987     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1988 }\
1989 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1990     uint8_t halfH[272];\
1991     uint8_t halfHV[256];\
1992     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1993     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1994     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1995 }\
1996 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1997     uint8_t halfH[272];\
1998     uint8_t halfHV[256];\
1999     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2000     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2001     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2002 }\
2003 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2004     uint8_t full[24*17];\
2005     uint8_t halfH[272];\
2006     uint8_t halfV[256];\
2007     uint8_t halfHV[256];\
2008     copy_block17(full, src, 24, stride, 17);\
2009     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2010     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2011     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2012     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2013 }\
2014 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2015     uint8_t full[24*17];\
2016     uint8_t halfH[272];\
2017     copy_block17(full, src, 24, stride, 17);\
2018     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2019     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2020     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2021 }\
2022 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2023     uint8_t full[24*17];\
2024     uint8_t halfH[272];\
2025     uint8_t halfV[256];\
2026     uint8_t halfHV[256];\
2027     copy_block17(full, src, 24, stride, 17);\
2028     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2029     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2030     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2031     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2032 }\
2033 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2034     uint8_t full[24*17];\
2035     uint8_t halfH[272];\
2036     copy_block17(full, src, 24, stride, 17);\
2037     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2038     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2039     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2040 }\
2041 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2042     uint8_t halfH[272];\
2043     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2044     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2045 }
2046
/*
 * Store operations handed to QPEL_MC as its OP argument.
 *   a - destination sample, used as an lvalue (and read back by the avg forms).
 *   b - raw filter sum; the QPEL_MC filter taps add up to 32, hence the >>5.
 * `cm` is not a parameter: it must be in scope where the macro expands. The
 * QPEL_MC lowpass functions declare it as cropTbl + MAX_NEG_CROP (presumably
 * a clamping lookup table; confirm against the table's initialisation).
 *   op_put        : a = cm[(b + 16) >> 5], +16 rounds to nearest.
 *   op_put_no_rnd : a = cm[(b + 15) >> 5], bias one lower than op_put.
 *   op_avg        : mean of the old a and the op_put value, +1 before >>1.
 *   op_avg_no_rnd : mean of the old a and the op_put_no_rnd value, no +1.
 */
2047 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2048 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2049 #define op_put(a, b) a = cm[((b) + 16)>>5]
2050 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2051
/*
 * Instantiate the qpel function families. RND picks the put flavour used for
 * intermediate buffers: "_" yields put_mpeg4_..., "_no_rnd_" yields
 * put_no_rnd_mpeg4_... . The avg_ family therefore builds its intermediates
 * with the put_mpeg4_* lowpass functions that the first instantiation defines.
 * op_avg_no_rnd is not used by any active instantiation in this block; the
 * disabled line below passes op_avg.
 */
2052 QPEL_MC(0, put_       , _       , op_put)
2053 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2054 QPEL_MC(0, avg_       , _       , op_avg)
2055 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* Remove the helper macros again once the instantiations are done. */
2056 #undef op_avg
2057 #undef op_avg_no_rnd
2058 #undef op_put
2059 #undef op_put_no_rnd
2060
2061 #if 1
2062 #define H264_LOWPASS(OPNAME, OP, OP2) \
2063 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2064     const int h=2;\
2065     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2066     int i;\
2067     for(i=0; i<h; i++)\
2068     {\
2069         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2070         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2071         dst+=dstStride;\
2072         src+=srcStride;\
2073     }\
2074 }\
2075 \
2076 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2077     const int w=2;\
2078     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2079     int i;\
2080     for(i=0; i<w; i++)\
2081     {\
2082         const int srcB= src[-2*srcStride];\
2083         const int srcA= src[-1*srcStride];\
2084         const int src0= src[0 *srcStride];\
2085         const int src1= src[1 *srcStride];\
2086         const int src2= src[2 *srcStride];\
2087         const int src3= src[3 *srcStride];\
2088         const int src4= src[4 *srcStride];\
2089         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2090         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2091         dst++;\
2092         src++;\
2093     }\
2094 }\
2095 \
2096 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2097     const int h=2;\
2098     const int w=2;\
2099     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2100     int i;\
2101     src -= 2*srcStride;\
2102     for(i=0; i<h+5; i++)\
2103     {\
2104         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2105         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2106         tmp+=tmpStride;\
2107         src+=srcStride;\
2108     }\
2109     tmp -= tmpStride*(h+5-2);\
2110     for(i=0; i<w; i++)\
2111     {\
2112         const int tmpB= tmp[-2*tmpStride];\
2113         const int tmpA= tmp[-1*tmpStride];\
2114         const int tmp0= tmp[0 *tmpStride];\
2115         const int tmp1= tmp[1 *tmpStride];\
2116         const int tmp2= tmp[2 *tmpStride];\
2117         const int tmp3= tmp[3 *tmpStride];\
2118         const int tmp4= tmp[4 *tmpStride];\
2119         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2120         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2121         dst++;\
2122         tmp++;\
2123     }\
2124 }\
2125 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2126     const int h=4;\
2127     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2128     int i;\
2129     for(i=0; i<h; i++)\
2130     {\
2131         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2132         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2133         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2134         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2135         dst+=dstStride;\
2136         src+=srcStride;\
2137     }\
2138 }\
2139 \
2140 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2141     const int w=4;\
2142     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2143     int i;\
2144     for(i=0; i<w; i++)\
2145     {\
2146         const int srcB= src[-2*srcStride];\
2147         const int srcA= src[-1*srcStride];\
2148         const int src0= src[0 *srcStride];\
2149         const int src1= src[1 *srcStride];\
2150         const int src2= src[2 *srcStride];\
2151         const int src3= src[3 *srcStride];\
2152         const int src4= src[4 *srcStride];\
2153         const int src5= src[5 *srcStride];\
2154         const int src6= src[6 *srcStride];\
2155         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2156         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2157         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2158         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2159         dst++;\
2160         src++;\
2161     }\
2162 }\
2163 \
2164 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2165     const int h=4;\
2166     const int w=4;\
2167     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2168     int i;\
2169     src -= 2*srcStride;\
2170     for(i=0; i<h+5; i++)\
2171     {\
2172         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2173         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2174         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2175         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2176         tmp+=tmpStride;\
2177         src+=srcStride;\
2178     }\
2179     tmp -= tmpStride*(h+5-2);\
2180     for(i=0; i<w; i++)\
2181     {\
2182         const int tmpB= tmp[-2*tmpStride];\
2183         const int tmpA= tmp[-1*tmpStride];\
2184         const int tmp0= tmp[0 *tmpStride];\
2185         const int tmp1= tmp[1 *tmpStride];\
2186         const int tmp2= tmp[2 *tmpStride];\
2187         const int tmp3= tmp[3 *tmpStride];\
2188         const int tmp4= tmp[4 *tmpStride];\
2189         const int tmp5= tmp[5 *tmpStride];\
2190         const int tmp6= tmp[6 *tmpStride];\
2191         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2192         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2193         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2194         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2195         dst++;\
2196         tmp++;\
2197     }\
2198 }\
2199 \
2200 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2201     const int h=8;\
2202     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2203     int i;\
2204     for(i=0; i<h; i++)\
2205     {\
2206         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2207         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2208         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2209         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2210         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2211         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2212         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2213         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2214         dst+=dstStride;\
2215         src+=srcStride;\
2216     }\
2217 }\
2218 \
2219 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2220     const int w=8;\
2221     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2222     int i;\
2223     for(i=0; i<w; i++)\
2224     {\
2225         const int srcB= src[-2*srcStride];\
2226         const int srcA= src[-1*srcStride];\
2227         const int src0= src[0 *srcStride];\
2228         const int src1= src[1 *srcStride];\
2229         const int src2= src[2 *srcStride];\
2230         const int src3= src[3 *srcStride];\
2231         const int src4= src[4 *srcStride];\
2232         const int src5= src[5 *srcStride];\
2233         const int src6= src[6 *srcStride];\
2234         const int src7= src[7 *srcStride];\
2235         const int src8= src[8 *srcStride];\
2236         const int src9= src[9 *srcStride];\
2237         const int src10=src[10*srcStride];\
2238         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2239         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2240         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2241         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2242         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2243         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2244         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2245         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2246         dst++;\
2247         src++;\
2248     }\
2249 }\
2250 \
2251 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2252     const int h=8;\
2253     const int w=8;\
2254     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2255     int i;\
2256     src -= 2*srcStride;\
2257     for(i=0; i<h+5; i++)\
2258     {\
2259         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2260         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2261         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2262         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2263         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2264         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2265         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2266         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2267         tmp+=tmpStride;\
2268         src+=srcStride;\
2269     }\
2270     tmp -= tmpStride*(h+5-2);\
2271     for(i=0; i<w; i++)\
2272     {\
2273         const int tmpB= tmp[-2*tmpStride];\
2274         const int tmpA= tmp[-1*tmpStride];\
2275         const int tmp0= tmp[0 *tmpStride];\
2276         const int tmp1= tmp[1 *tmpStride];\
2277         const int tmp2= tmp[2 *tmpStride];\
2278         const int tmp3= tmp[3 *tmpStride];\
2279         const int tmp4= tmp[4 *tmpStride];\
2280         const int tmp5= tmp[5 *tmpStride];\
2281         const int tmp6= tmp[6 *tmpStride];\
2282         const int tmp7= tmp[7 *tmpStride];\
2283         const int tmp8= tmp[8 *tmpStride];\
2284         const int tmp9= tmp[9 *tmpStride];\
2285         const int tmp10=tmp[10*tmpStride];\
2286         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2287         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2288         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2289         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2290         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2291         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2292         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2293         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2294         dst++;\
2295         tmp++;\
2296     }\
2297 }\
2298 \
2299 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2300     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2301     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2302     src += 8*srcStride;\
2303     dst += 8*dstStride;\
2304     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2305     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2306 }\
2307 \
2308 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2309     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2310     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2311     src += 8*srcStride;\
2312     dst += 8*dstStride;\
2313     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2314     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2315 }\
2316 \
2317 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2318     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2319     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2320     src += 8*srcStride;\
2321     dst += 8*dstStride;\
2322     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2323     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2324 }\
2325
/**
 * Generate the sixteen motion-compensation entry points
 * OPNAME h264_qpel SIZE _mcXY_c (X, Y in 0..3) for one block size.
 *
 * OPNAME is pasted in front of the final store routine (the instantiations
 * in this file use put_ and avg_); SIZE is the block width/height and also
 * sizes the stack temporaries.
 *
 * What each variant does, as far as this macro shows:
 *  - mc00 forwards to the plain OPNAME pixels SIZE _c routine;
 *  - mc20 / mc02 / mc22 write the h / v / hv lowpass result straight to dst;
 *  - every other variant builds one or two intermediate SIZE x SIZE planes
 *    and merges two sources through OPNAME pixels SIZE _l2
 *    (defined elsewhere; presumably a rounded average of its two inputs).
 *
 * Intermediate planes are always produced with the put_ lowpass variants;
 * OPNAME only affects the final write to dst.
 *
 * For the vertical filters the source is first copied into "full":
 * SIZE+5 rows starting two rows above src, so "full_mid" (row 2 of the copy)
 * corresponds to src itself.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    /* merge the unfiltered source with the horizontally filtered plane */\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    /* same as mc10 but merged with the source one pixel to the right */\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    /* same as mc01 but merged with the copied source one row further down */\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    /* vertical plane is taken one column to the right (+ 1) */\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    /* horizontal plane is taken one row further down (+ stride) */\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    /* 16-bit scratch for the unscaled horizontal pass: SIZE+5 rows of SIZE */\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2462
/*
 * Store operators handed to H264_LOWPASS below.
 *
 * The lowpass filters use the taps (1, -5, 20, 20, -5, 1), which sum to 32.
 * A single filter pass is therefore rounded with +16 and scaled with >>5;
 * the two-pass (hv) result is scaled by 32*32, hence +512 and >>10.
 * "cm" is the crop-table pointer that every lowpass function declares
 * locally, so the lookup yields a value that fits a pixel.
 *
 * The *_put forms overwrite the destination; the *_avg forms combine the
 * new value with what is already stored there, rounding upwards.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the lowpass filters for both store modes, then the mcXY entry
 * points for each block size.  Note that no avg_ variant is generated for
 * size 2. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* The store operators are only meaningful for the expansions above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2483 #endif
2484
/*
 * Per-pixel helpers expanded inside the functions generated by H264_WEIGHT;
 * they rely on that function's locals and parameters.
 *
 * op_scale1(x): weight block[x] in place:
 *               clip_uint8((block[x]*weight + offset) >> log2_denom)
 * op_scale2(x): blend src[x] and dst[x] with separate weights into dst[x];
 *               the shift is one bit larger than for op_scale1.
 */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/**
 * Generate the weighted-prediction helpers for one W x H block size:
 *
 *   weight_h264_pixels<W>x<H>_c   - scale "block" in place by "weight",
 *                                   add "offset", shift by log2_denom.
 *   biweight_h264_pixels<W>x<H>_c - blend "src" into "dst" using the
 *                                   weights "weights" / "weightd".
 *
 * W must be 2, 4, 8 or 16: each row is handled by an unrolled list of
 * op_scale1 / op_scale2 expansions, and the "if(W==n) continue;" checks
 * (compile-time constants after expansion) cut the list off at the block
 * width.  Both pointers advance by "stride" per row.
 *
 * The offset is pre-scaled by multiplying with (1 << log2_denom) rather than
 * with a left shift: the offset may be negative, and left-shifting a
 * negative int is undefined behaviour in C.  For non-negative offsets the
 * result is identical to the shift.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    /* bring the offset to the scale of the weighted sample */ \
    offset *= 1 << log2_denom; \
    /* rounding term for the final >> log2_denom (none needed for a shift of 0) */ \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    /* ((offset + 1) | 1) folds the rounding bit into the offset before scaling */ \
    offset = ((offset + 1) | 1) * (1 << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}
2539
/* Instantiate weight_/biweight_h264_pixels<W>x<H>_c for every block shape
 * listed here (widths 16, 8, 4 and 2). */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

/* The generator and its per-pixel helpers are not needed past this point. */
#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2554
2555 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2556     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2557     int i;
2558
2559     for(i=0; i<h; i++){
2560         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2561         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2562         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2563         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2564         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2565         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2566         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2567         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2568         dst+=dstStride;
2569         src+=srcStride;
2570     }
2571 }
2572
/* AVS specific */
/* Prototype only; the definition is not part of this section of the file. */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2575
/** CAVS qpel 8x8, position (0,0): forwards to put_pixels8_c() with height 8. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
/** CAVS qpel 8x8, position (0,0): forwards to avg_pixels8_c() with height 8. */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
/** CAVS qpel 16x16, position (0,0): forwards to put_pixels16_c() with height 16. */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
/** CAVS qpel 16x16, position (0,0): forwards to avg_pixels16_c() with height 16. */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
2588
2589 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2590     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2591     int i;
2592
2593     for(i=0; i<w; i++){
2594         const int src_1= src[ -srcStride];
2595         const int src0 = src[0          ];
2596         const int src1 = src[  srcStride];
2597         const int src2 = src[2*srcStride];
2598         const int src3 = src[3*srcStride];
2599         const int src4 = src[4*srcStride];
2600         const int src5 = src[5*srcStride];
2601         const int src6 = src[6*srcStride];
2602         const int src7 = src[7*srcStride];
2603         const int src8 = src[8*srcStride];
2604         const int src9 = src[9*srcStride];
2605         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2606         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2607         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2608         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2609         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2610         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2611         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2612         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2613         src++;
2614         dst++;
2615     }
2616 }
2617
/** mspel 8x8, position (0,0): forwards to put_pixels8_c() with height 8, no filtering. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2621
/**
 * mspel 8x8, position (1,0): the horizontally filtered block is combined
 * with the unfiltered source through put_pixels8_l2()
 * (defined elsewhere; presumably a rounded average of its two inputs).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2627
/** mspel 8x8, position (2,0): writes the horizontal lowpass of 8 rows straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2631
/**
 * mspel 8x8, position (3,0): like mc10, but the horizontally filtered block
 * is combined with the source shifted one pixel to the right.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2637
/** mspel 8x8, position (0,2): writes the vertical lowpass of 8 columns straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2641
/**
 * mspel 8x8, position (1,2): combines the vertically filtered source with
 * the horizontally-then-vertically filtered source via put_pixels8_l2().
 *
 * The horizontal pass covers 11 rows starting one row above src, which is
 * exactly what the following vertical pass reads (rows -1 .. 9).
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t vert[8*8];
    uint8_t horiz[8*11];
    uint8_t both[8*8];

    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(horiz, src-stride, 8, stride, 11);
    /* horiz+8 skips the extra top row so that it lines up with src */
    wmv2_mspel8_v_lowpass(both, horiz+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel 8x8, position (3,2): like mc12, but the purely vertical filter is
 * applied to the source shifted one pixel to the right.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t vert[8*8];
    uint8_t horiz[8*11];
    uint8_t both[8*8];

    wmv2_mspel8_v_lowpass(vert, src+1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(horiz, src-stride, 8, stride, 11);
    /* horiz+8 skips the extra top row so that it lines up with src */
    wmv2_mspel8_v_lowpass(both, horiz+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel 8x8, position (2,2): horizontal lowpass over 11 rows (starting one
 * row above src) into a scratch buffer, then vertical lowpass of that
 * buffer straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t horiz[8*11];

    wmv2_mspel8_h_lowpass(horiz, src-stride, 8, stride, 11);
    /* horiz+8 skips the extra top row so that it lines up with src */
    wmv2_mspel8_v_lowpass(dst, horiz+8, stride, 8, 8);
}
2665
2666 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2667     int x;
2668     const int strength= ff_h263_loop_filter_strength[qscale];
2669
2670     for(x=0; x<8; x++){
2671         int d1, d2, ad1;
2672         int p0= src[x-2*stride];
2673         int p1= src[x-1*stride];
2674         int p2= src[x+0*stride];
2675         int p3= src[x+1*stride];
2676         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2677
2678         if     (d<-2*strength) d1= 0;
2679         else if(d<-  strength) d1=-2*strength - d;
2680         else if(d<   strength) d1= d;
2681         else if(d< 2*strength) d1= 2*strength - d;
2682         else                   d1= 0;
2683
2684         p1 += d1;
2685         p2 -= d1;
2686         if(p1&256) p1= ~(p1>>31);
2687         if(p2&256) p2= ~(p2>>31);
2688
2689         src[x-1*stride] = p1;
2690         src[x+0*stride] = p2;
2691
2692         ad1= ABS(d1)>>1;
2693
2694         d2= clip((p0-p3)/4, -ad1, ad1);
2695
2696         src[x-2*stride] = p0 - d2;
2697         src[x+  stride] = p3 + d2;
2698     }
2699 }
2700
2701 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2702     int y;
2703     const int strength= ff_h263_loop_filter_strength[qscale];
2704
2705     for(y=0; y<8; y++){
2706         int d1, d2, ad1;
2707         int p0= src[y*stride-2];
2708         int p1= src[y*stride-1];
2709         int p2= src[y*stride+0];
2710         int p3= src[y*stride+1];
2711         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2712
2713         if     (d<-2*strength) d1= 0;
2714         else if(d<-  strength) d1=-2*strength - d;
2715         else if(d<   strength) d1= d;
2716         else if(d< 2*strength) d1= 2*strength - d;
2717         else                   d1= 0;
2718
2719         p1 += d1;
2720         p2 -= d1;
2721         if(p1&256) p1= ~(p1>>31);
2722         if(p2&256) p2= ~(p2>>31);
2723
2724         src[y*stride-1] = p1;
2725         src[y*stride+0] = p2;
2726
2727         ad1= ABS(d1)>>1;
2728
2729         d2= clip((p0-p3)/4, -ad1, ad1);
2730
2731         src[y*stride-2] = p0 - d2;
2732         src[y*stride+1] = p3 + d2;
2733     }
2734 }
2735
/**
 * H.261 loop filter: separable (1, 2, 1)/4 smoothing of one 8x8 block,
 * done in place.
 *
 * The vertical pass leaves rows 0 and 7 unfiltered (stored scaled by 4 so
 * that all intermediates share one scale); the horizontal pass likewise
 * leaves columns 0 and 7 unfiltered.  Interior pixels are therefore divided
 * by 16 in total, border pixels by 4, both with rounding.
 *
 * @param src    top-left pixel of the 8x8 block
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8][8];   /* result of the vertical pass, scaled by 4 */
    int row, col;

    /* vertical pass: border rows are only rescaled */
    for(col=0; col<8; col++){
        vert[0][col]= 4*src[col];
        vert[7][col]= 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        const uint8_t *line= src + row*stride;
        for(col=0; col<8; col++)
            vert[row][col]= line[col - stride] + 2*line[col] + line[col + stride];
    }

    /* horizontal pass: border columns only undo the scale of 4 */
    for(row=0; row<8; row++){
        uint8_t   *out= src + row*stride;
        const int *t  = vert[row];

        out[0]= (t[0] + 2)>>2;
        out[7]= (t[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col]= (t[col-1] + 2*t[col] + t[col+1] + 8)>>4;
    }
}
2762
/**
 * H.264 luma deblocking of one edge made of 4 groups of 4 lines.
 *
 * p0..p2 are the pixels on one side of the edge (at -1, -2, -3 times
 * xstride), q0..q2 those on the other side (at 0, 1, 2 times xstride).
 * A line is only touched when |p0-q0| < alpha, |p1-p0| < beta and
 * |q1-q0| < beta.
 *
 * @param pix     first pixel on the q side of the edge
 * @param xstride step across the edge
 * @param ystride step along the edge, from one line to the next
 * @param alpha   threshold on the step across the edge
 * @param beta    threshold on the gradients on either side
 * @param tc0     four clipping values, one per group of 4 lines; a negative
 *                value skips the whole group
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            /* group disabled: just step over its 4 lines */
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                /* the clipping range for p0/q0 widens by one for each
                 * side on which the second pixel is filtered as well */
                int tc = tc0[i];
                int i_delta;

                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* multiply rather than shift: q0 - p0 may be negative, and
                 * left-shifting a negative int is undefined behaviour */
                i_delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Luma filter with the across-edge step set to stride and the along-edge step set to 1. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/** Luma filter with the across-edge step set to 1 and the along-edge step set to stride. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2811
/**
 * H.264 chroma deblocking of one edge made of 4 groups of 2 lines.
 *
 * Only p0 and q0 (the pixels at -xstride and 0) are modified; p1 and q1
 * are read for the decision and the delta.  A line is only touched when
 * |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta.
 *
 * @param pix     first pixel on the q side of the edge
 * @param xstride step across the edge
 * @param ystride step along the edge, from one line to the next
 * @param alpha   threshold on the step across the edge
 * @param beta    threshold on the gradients on either side
 * @param tc0     four clipping values, one per group of 2 lines; a value
 *                <= 0 skips the whole group
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            /* group disabled: just step over its 2 lines */
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                /* multiply rather than shift: q0 - p0 may be negative, and
                 * left-shifting a negative int is undefined behaviour */
                int delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Chroma filter with the across-edge step set to stride and the along-edge step set to 1. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/** Chroma filter with the across-edge step set to 1 and the along-edge step set to stride. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2848
/**
 * H.264 chroma deblocking of one 8-line edge without a tc0 clipping table.
 *
 * For each line where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta,
 * the two pixels next to the edge are replaced by fixed weighted means:
 *   p0' = (2*p1 + p0 + q1 + 2) >> 2
 *   q0' = (2*q1 + q0 + p1 + 2) >> 2
 *
 * @param pix     first pixel on the q side of the edge
 * @param xstride step across the edge
 * @param ystride step along the edge, from one line to the next
 * @param alpha   threshold on the step across the edge
 * @param beta    threshold on the gradients on either side
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;
    for( line = 0; line < 8; line++, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        /* leave the line untouched unless all three thresholds hold */
        if( ABS( p0 - q0 ) >= alpha )
            continue;
        if( ABS( p1 - p0 ) >= beta )
            continue;
        if( ABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/** Chroma intra filter with the across-edge step set to stride and the along-edge step set to 1. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/** Chroma intra filter with the across-edge step set to 1 and the along-edge step set to stride. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2876
/**
 * Sum of absolute differences between two blocks, 16 pixels wide and
 * h rows high.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, used for both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over all 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2904
/**
 * Sum of absolute differences, 16 pixels wide and h rows high, between pix1
 * and pix2 combined with its right-hand neighbour through avg2()
 * (defined elsewhere; presumably the rounded mean of its two arguments).
 *
 * Reads pix2[0] .. pix2[16] of every row.
 *
 * @param v         unused
 * @param pix1      block to compare against
 * @param pix2      reference block
 * @param line_size distance between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2932
/**
 * Sum of absolute differences, 16 pixels wide and h rows high, between pix1
 * and pix2 combined with the row below it through avg2()
 * (defined elsewhere; presumably the rounded mean of its two arguments).
 *
 * Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block to compare against
 * @param pix2      reference block
 * @param line_size distance between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2962
/**
 * Sum of absolute differences, 16 pixels wide and h rows high, between pix1
 * and each 2x2 neighbourhood of pix2 combined through avg4()
 * (defined elsewhere; presumably the rounded mean of its four arguments).
 *
 * Reads pix2[0] .. pix2[16] of h+1 rows.
 *
 * @param v         unused
 * @param pix1      block to compare against
 * @param pix2      reference block
 * @param line_size distance between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2992
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size offset between consecutive rows, applied to both pointers
 * @param h         number of rows to accumulate
 * @return sum of abs(pix1 - pix2) over 8 columns and h rows
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3012
/**
 * Sum of absolute differences over an 8-pixel-wide block, where each
 * reference sample is avg2() of a pix2 pixel and its right-hand neighbour.
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference; 9 pixels per row are read
 * @param line_size offset between consecutive rows, applied to both pointers
 * @param h         number of rows to accumulate
 * @return the accumulated absolute differences
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3032
/**
 * Sum of absolute differences over an 8-pixel-wide block, where each
 * reference sample is avg2() of a pix2 pixel and the pixel one row below it.
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference; h+1 rows are read
 * @param line_size offset between consecutive rows, applied to both pointers
 * @param h         number of rows to accumulate
 * @return the accumulated absolute differences
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size; /* row underneath the current pix2 row */
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3054
/**
 * Sum of absolute differences over an 8-pixel-wide block, where each
 * reference sample is avg4() of a 2x2 neighbourhood of pix2.
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference; 9 pixels per row and h+1 rows are read
 * @param line_size offset between consecutive rows, applied to both pointers
 * @param h         number of rows to accumulate
 * @return the accumulated absolute differences
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size; /* row underneath the current pix2 row */
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3076
3077 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3078     MpegEncContext *c = v;
3079     int score1=0;
3080     int score2=0;
3081     int x,y;
3082
3083     for(y=0; y<h; y++){
3084         for(x=0; x<16; x++){
3085             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3086         }
3087         if(y+1<h){
3088             for(x=0; x<15; x++){
3089                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3090                              - s1[x+1] + s1[x+1+stride])
3091                         -ABS(  s2[x  ] - s2[x  +stride]
3092                              - s2[x+1] + s2[x+1+stride]);
3093             }
3094         }
3095         s1+= stride;
3096         s2+= stride;
3097     }
3098
3099     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3100     else  return score1 + ABS(score2)*8;
3101 }
3102
3103 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3104     MpegEncContext *c = v;
3105     int score1=0;
3106     int score2=0;
3107     int x,y;
3108
3109     for(y=0; y<h; y++){
3110         for(x=0; x<8; x++){
3111             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3112         }
3113         if(y+1<h){
3114             for(x=0; x<7; x++){
3115                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3116                              - s1[x+1] + s1[x+1+stride])
3117                         -ABS(  s2[x  ] - s2[x  +stride]
3118                              - s2[x+1] + s2[x+1+stride]);
3119             }
3120         }
3121         s1+= stride;
3122         s2+= stride;
3123     }
3124
3125     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3126     else  return score1 + ABS(score2)*8;
3127 }
3128
3129 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3130     int i;
3131     unsigned int sum=0;
3132
3133     for(i=0; i<8*8; i++){
3134         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3135         int w= weight[i];
3136         b>>= RECON_SHIFT;
3137         assert(-512<b && b<512);
3138
3139         sum += (w*b)*(w*b)>>4;
3140     }
3141     return sum>>2;
3142 }
3143
3144 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3145     int i;
3146
3147     for(i=0; i<8*8; i++){
3148         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3149     }
3150 }
3151
3152 /**
3153  * permutes an 8x8 block.
3154  * @param block the block which will be permuted according to the given permutation vector
3155  * @param permutation the permutation vector
3156  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3157  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3158  *                  (inverse) permutated to scantable order!
3159  */
3160 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3161 {
3162     int i;
3163     DCTELEM temp[64];
3164
3165     if(last<=0) return;
3166     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3167
3168     for(i=0; i<=last; i++){
3169         const int j= scantable[i];
3170         temp[j]= block[j];
3171         block[j]=0;
3172     }
3173
3174     for(i=0; i<=last; i++){
3175         const int j= scantable[i];
3176         const int perm_j= permutation[j];
3177         block[perm_j]= temp[j];
3178     }
3179 }
3180
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
3184
3185 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3186     int i;
3187
3188     memset(cmp, 0, sizeof(void*)*5);
3189
3190     for(i=0; i<5; i++){
3191         switch(type&0xFF){
3192         case FF_CMP_SAD:
3193             cmp[i]= c->sad[i];
3194             break;
3195         case FF_CMP_SATD:
3196             cmp[i]= c->hadamard8_diff[i];
3197             break;
3198         case FF_CMP_SSE:
3199             cmp[i]= c->sse[i];
3200             break;
3201         case FF_CMP_DCT:
3202             cmp[i]= c->dct_sad[i];
3203             break;
3204         case FF_CMP_DCT264:
3205             cmp[i]= c->dct264_sad[i];
3206             break;
3207         case FF_CMP_DCTMAX:
3208             cmp[i]= c->dct_max[i];
3209             break;
3210         case FF_CMP_PSNR:
3211             cmp[i]= c->quant_psnr[i];
3212             break;
3213         case FF_CMP_BIT:
3214             cmp[i]= c->bit[i];
3215             break;
3216         case FF_CMP_RD:
3217             cmp[i]= c->rd[i];
3218             break;
3219         case FF_CMP_VSAD:
3220             cmp[i]= c->vsad[i];
3221             break;
3222         case FF_CMP_VSSE:
3223             cmp[i]= c->vsse[i];
3224             break;
3225         case FF_CMP_ZERO:
3226             cmp[i]= zero_cmp;
3227             break;
3228         case FF_CMP_NSSE:
3229             cmp[i]= c->nsse[i];
3230             break;
3231 #ifdef CONFIG_SNOW_ENCODER
3232         case FF_CMP_W53:
3233             cmp[i]= c->w53[i];
3234             break;
3235         case FF_CMP_W97:
3236             cmp[i]= c->w97[i];
3237             break;
3238 #endif
3239         default:
3240             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3241         }
3242     }
3243 }
3244
3245 /**
3246  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3247  */
3248 static void clear_blocks_c(DCTELEM *blocks)
3249 {
3250     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3251 }
3252
/**
 * Add src to dst byte by byte; each sum wraps modulo 256.
 * @param dst destination, updated in place
 * @param src bytes to add
 * @param w   number of bytes to process
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
{
    int k;

    for (k = 0; k < w; k++)
        dst[k] += src[k];
}
3268
/**
 * Store the byte-wise difference src1 - src2 into dst; each result wraps
 * modulo 256.
 * @param dst  destination buffer
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
{
    int k;

    for (k = 0; k < w; k++)
        dst[k] = src1[k] - src2[k];
}
3284
/**
 * Subtract a median prediction from a line of bytes.
 *
 * For every position the predictor is mid_pred() of the previous src2 byte,
 * the src1 byte at the same position, and (previous src2 + src1 - previous
 * src1) masked to 8 bits. dst receives src2[i] minus that predictor.
 *
 * @param dst      output residuals
 * @param src1     line used as the "top" input of the predictor
 * @param src2     line being predicted
 * @param w        number of bytes to process
 * @param left     in: initial previous-src2 value; out: last src2 byte processed
 * @param left_top in: initial previous-src1 value; out: last src1 byte processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top)
{
    uint8_t prev      = *left;
    uint8_t prev_top  = *left_top;
    int k;

    for (k = 0; k < w; k++) {
        const int top  = src1[k];
        const int pred = mid_pred(prev, top, (prev + top - prev_top) & 0xFF);

        prev_top = top;
        prev     = src2[k];
        dst[k]   = prev - pred;
    }

    *left     = prev;
    *left_top = prev_top;
}
3302
/* Two-output butterfly: o1 receives i1+i2 and o2 receives i1-i2.
 * Expands to two separate statements, so it must not be used as the
 * unbraced body of an if/for/while. */
3303 #define BUTTERFLY2(o1,o2,i1,i2) \
3304 o1= (i1)+(i2);\
3305 o2= (i1)-(i2);
3306
/* In-place butterfly: x becomes x+y and y becomes x-y (using the old values).
 * x and y are copied into block-local ints named a and b first, so arguments
 * that are themselves called a or b would be shadowed. */
3307 #define BUTTERFLY1(x,y) \
3308 {\
3309     int a,b;\
3310     a= x;\
3311     b= y;\
3312     x= a+b;\
3313     y= a-b;\
3314 }
3315
/* Expression form of the last butterfly stage: |x+y| + |x-y|.
 * Each argument is evaluated more than once. */
3316 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3317
/**
 * Sum of absolute values of an 8x8 butterfly (Hadamard-style) transform of
 * the difference src - dst.
 *
 * Three butterfly stages are applied along every row of the difference,
 * then two along every column; the third column stage is folded into the
 * absolute-value accumulation through BUTTERFLYA.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block the difference is taken from
 * @param stride offset between consecutive rows of both blocks
 * @param h      must be 8
 * @return the accumulated absolute transform values
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int coef[64];
    int total = 0;
    int n;

    assert(h==8);

    /* horizontal pass over the pixel differences */
    for (n = 0; n < 8; n++) {
        int *row = coef + 8*n;
        const uint8_t *ps = src + stride*n;
        const uint8_t *pd = dst + stride*n;

        BUTTERFLY2(row[0], row[1], ps[0]-pd[0], ps[1]-pd[1]);
        BUTTERFLY2(row[2], row[3], ps[2]-pd[2], ps[3]-pd[3]);
        BUTTERFLY2(row[4], row[5], ps[4]-pd[4], ps[5]-pd[5]);
        BUTTERFLY2(row[6], row[7], ps[6]-pd[6], ps[7]-pd[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass; the final stage is merged with the summation */
    for (n = 0; n < 8; n++) {
        int *col = coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
               + BUTTERFLYA(col[8*1], col[8*5])
               + BUTTERFLYA(col[8*2], col[8*6])
               + BUTTERFLYA(col[8*3], col[8*7]);
    }

    return total;
}
3369
/**
 * Sum of absolute values of an 8x8 butterfly (Hadamard-style) transform of
 * the pixels of src, with the first coefficient's contribution removed.
 *
 * Three butterfly stages are applied along every row, then two along every
 * column; the third column stage is folded into the accumulation through
 * BUTTERFLYA. Finally ABS(temp[0] + temp[32]) is subtracted from the sum.
 *
 * @param s      unused
 * @param src    source block
 * @param dummy  unused
 * @param stride offset between consecutive rows
 * @param h      must be 8
 * @return the accumulated absolute transform values minus the first term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h)
{
    int coef[64];
    int total = 0;
    int n;

    assert(h==8);

    /* horizontal pass over the pixels */
    for (n = 0; n < 8; n++) {
        int *row = coef + 8*n;
        const uint8_t *px = src + stride*n;

        BUTTERFLY2(row[0], row[1], px[0], px[1]);
        BUTTERFLY2(row[2], row[3], px[2], px[3]);
        BUTTERFLY2(row[4], row[5], px[4], px[5]);
        BUTTERFLY2(row[6], row[7], px[6], px[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass; the final stage is merged with the summation */
    for (n = 0; n < 8; n++) {
        int *col = coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
               + BUTTERFLYA(col[8*1], col[8*5])
               + BUTTERFLYA(col[8*2], col[8*6])
               + BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= ABS(coef[8*0] + coef[8*4]); // -mean

    return total;
}
3417
3418 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3419     MpegEncContext * const s= (MpegEncContext *)c;
3420     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3421     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3422     int sum=0, i;
3423
3424     assert(h==8);
3425
3426     s->dsp.diff_pixels(temp, src1, src2, stride);
3427     s->dsp.fdct(temp);
3428
3429     for(i=0; i<64; i++)
3430         sum+= ABS(temp[i]);
3431
3432     return sum;
3433 }
3434
#ifdef CONFIG_GPL
/* One 8-point integer transform pass. The caller defines SRC(x) to read
 * input sample x and DST(x,v) to consume output value v for index x; the
 * macro expands to a braced block so it can be the body of a for loop. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute coefficients of the DCT8_1D transform applied to the
 * difference between two 8x8 blocks, first along rows and then along columns.
 * @param c      MpegEncContext whose dsp.diff_pixels is used
 * @param src1   first block
 * @param src2   second block
 * @param stride offset between consecutive rows
 * @param h      unused
 * @return sum of ABS() over the 64 second-pass outputs
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    /* pass a pointer to the first element, matching the DCTELEM* buffers the
     * sibling functions hand to diff_pixels, instead of a pointer-to-row */
    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform every row in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* second pass: transform every column, accumulating |output| directly */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += ABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3487
3488 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3489     MpegEncContext * const s= (MpegEncContext *)c;
3490     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3491     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3492     int sum=0, i;
3493
3494     assert(h==8);
3495
3496     s->dsp.diff_pixels(temp, src1, src2, stride);
3497     s->dsp.fdct(temp);
3498
3499     for(i=0; i<64; i++)
3500         sum= FFMAX(sum, ABS(temp[i]));
3501
3502     return sum;
3503 }
3504
3505 void simple_idct(DCTELEM *block); //FIXME
3506
3507 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3508     MpegEncContext * const s= (MpegEncContext *)c;
3509     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3510     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3511     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3512     int sum=0, i;
3513
3514     assert(h==8);
3515     s->mb_intra=0;
3516
3517     s->dsp.diff_pixels(temp, src1, src2, stride);
3518
3519     memcpy(bak, temp, 64*sizeof(DCTELEM));
3520
3521     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3522     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3523     simple_idct(temp); //FIXME
3524
3525     for(i=0; i<64; i++)
3526         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3527
3528     return sum;
3529 }
3530
3531 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3532     MpegEncContext * const s= (MpegEncContext *)c;
3533     const uint8_t *scantable= s->intra_scantable.permutated;
3534     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3535     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3536     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3537     uint8_t * const bak= (uint8_t*)aligned_bak;
3538     int i, last, run, bits, level, distoration, start_i;
3539     const int esc_length= s->ac_esc_length;
3540     uint8_t * length;
3541     uint8_t * last_length;
3542
3543     assert(h==8);
3544
3545     for(i=0; i<8; i++){
3546         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3547         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3548     }
3549
3550     s->dsp.diff_pixels(temp, src1, src2, stride);
3551
3552     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3553
3554     bits=0;
3555
3556     if (s->mb_intra) {
3557         start_i = 1;
3558         length     = s->intra_ac_vlc_length;
3559         last_length= s->intra_ac_vlc_last_length;
3560         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3561     } else {
3562         start_i = 0;
3563         length     = s->inter_ac_vlc_length;
3564         last_length= s->inter_ac_vlc_last_length;
3565     }
3566
3567     if(last>=start_i){
3568         run=0;
3569         for(i=start_i; i<last; i++){
3570             int j= scantable[i];
3571             level= temp[j];
3572
3573             if(level){
3574                 level+=64;
3575                 if((level&(~127)) == 0){
3576                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3577                 }else
3578                     bits+= esc_length;
3579                 run=0;
3580             }else
3581                 run++;
3582         }
3583         i= scantable[last];
3584
3585         level= temp[i] + 64;
3586
3587         assert(level - 64);
3588
3589         if((level&(~127)) == 0){
3590             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3591         }else
3592             bits+= esc_length;
3593
3594     }
3595
3596     if(last>=0){
3597         if(s->mb_intra)
3598             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3599         else
3600             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3601     }
3602
3603     s->dsp.idct_add(bak, stride, temp);
3604
3605     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3606
3607     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3608 }
3609
3610 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3611     MpegEncContext * const s= (MpegEncContext *)c;
3612     const uint8_t *scantable= s->intra_scantable.permutated;
3613     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3614     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3615     int i, last, run, bits, level, start_i;
3616     const int esc_length= s->ac_esc_length;
3617     uint8_t * length;
3618     uint8_t * last_length;
3619
3620     assert(h==8);
3621
3622     s->dsp.diff_pixels(temp, src1, src2, stride);
3623
3624     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3625
3626     bits=0;
3627
3628     if (s->mb_intra) {
3629         start_i = 1;
3630         length     = s->intra_ac_vlc_length;
3631         last_length= s->intra_ac_vlc_last_length;
3632         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3633     } else {
3634         start_i = 0;
3635         length     = s->inter_ac_vlc_length;
3636         last_length= s->inter_ac_vlc_last_length;
3637     }
3638
3639     if(last>=start_i){
3640         run=0;
3641         for(i=start_i; i<last; i++){
3642             int j= scantable[i];
3643             level= temp[j];
3644
3645             if(level){
3646                 level+=64;
3647                 if((level&(~127)) == 0){
3648                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3649                 }else
3650                     bits+= esc_length;
3651                 run=0;
3652             }else
3653                 run++;
3654         }
3655         i= scantable[last];
3656
3657         level= temp[i] + 64;
3658
3659         assert(level - 64);
3660
3661         if((level&(~127)) == 0){
3662             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3663         }else
3664             bits+= esc_length;
3665     }
3666
3667     return bits;
3668 }
3669
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride offset between consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated absolute vertical differences
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s += stride) {
        const uint8_t *next = s + stride;

        for (col = 0; col < 16; col++)
            total += abs(s[col] - next[col]);
    }

    return total;
}
3684
/**
 * Sum of absolute vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows of both blocks
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated absolute differences
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += abs(delta);
        }
    }

    return total;
}
3699
/* Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride offset between consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated squared vertical differences
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s += stride) {
        const uint8_t *next = s + stride;

        for (col = 0; col < 16; col++)
            total += SQ(s[col] - next[col]);
    }

    return total;
}
3715
/**
 * Sum of squared vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows of both blocks
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated squared differences
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int delta = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += delta * delta;
        }
    }

    return total;
}
3730
/* Instantiate the *16_c comparison functions from the 8x8 ones defined above.
 * NOTE(review): WARPER8_16_SQ is defined outside this section; judging by the
 * argument names it presumably emits a function named by the second argument
 * that applies the 8x8 function (first argument) over a 16-pixel-wide area --
 * confirm against the macro definition. The dct264 variant exists only when
 * CONFIG_GPL is defined, matching the guard around dct264_sad8x8_c. */
3731 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3732 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3733 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3734 #ifdef CONFIG_GPL
3735 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3736 #endif
3737 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3738 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3739 WARPER8_16_SQ(rd8x8_c, rd16_c)
3740 WARPER8_16_SQ(bit8x8_c, bit16_c)
3741
3742 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3743  converted */
3744 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3745 {
3746     j_rev_dct (block);
3747     put_pixels_clamped_c(block, dest, line_size);
3748 }
3749 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3750 {
3751     j_rev_dct (block);
3752     add_pixels_clamped_c(block, dest, line_size);
3753 }
3754
3755 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3756 {
3757     j_rev_dct4 (block);
3758     put_pixels_clamped4_c(block, dest, line_size);
3759 }
3760 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3761 {
3762     j_rev_dct4 (block);
3763     add_pixels_clamped4_c(block, dest, line_size);
3764 }
3765
3766 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3767 {
3768     j_rev_dct2 (block);
3769     put_pixels_clamped2_c(block, dest, line_size);
3770 }
3771 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3772 {
3773     j_rev_dct2 (block);
3774     add_pixels_clamped2_c(block, dest, line_size);
3775 }
3776
3777 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3778 {
3779     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3780
3781     dest[0] = cm[(block[0] + 4)>>3];
3782 }
3783 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3784 {
3785     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3786
3787     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3788 }
3789
/* Does nothing. The parameter list is deliberately left unprototyped. */
static void just_return()
{
}
3791
3792 /* init static data */
3793 void dsputil_static_init(void)
3794 {
3795     int i;
3796
3797     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3798     for(i=0;i<MAX_NEG_CROP;i++) {
3799         cropTbl[i] = 0;
3800         cropTbl[i + MAX_NEG_CROP + 256] = 255;
3801     }
3802
3803     for(i=0;i<512;i++) {
3804         squareTbl[i] = (i - 256) * (i - 256);
3805     }
3806
3807     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3808 }
3809
3810
3811 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3812 {
3813     int i;
3814
3815 #ifdef CONFIG_ENCODERS
3816     if(avctx->dct_algo==FF_DCT_FASTINT) {
3817         c->fdct = fdct_ifast;
3818         c->fdct248 = fdct_ifast248;
3819     }
3820     else if(avctx->dct_algo==FF_DCT_FAAN) {
3821         c->fdct = ff_faandct;
3822         c->fdct248 = ff_faandct248;
3823     }
3824     else {
3825         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3826         c->fdct248 = ff_fdct248_islow;
3827     }
3828 #endif //CONFIG_ENCODERS
3829
3830     if(avctx->lowres==1){
3831         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3832             c->idct_put= ff_jref_idct4_put;
3833             c->idct_add= ff_jref_idct4_add;
3834         }else{
3835             c->idct_put= ff_h264_lowres_idct_put_c;
3836             c->idct_add= ff_h264_lowres_idct_add_c;
3837         }
3838         c->idct    = j_rev_dct4;
3839         c->idct_permutation_type= FF_NO_IDCT_PERM;
3840     }else if(avctx->lowres==2){
3841         c->idct_put= ff_jref_idct2_put;
3842         c->idct_add= ff_jref_idct2_add;
3843         c->idct    = j_rev_dct2;
3844         c->idct_permutation_type= FF_NO_IDCT_PERM;
3845     }else if(avctx->lowres==3){
3846         c->idct_put= ff_jref_idct1_put;
3847         c->idct_add= ff_jref_idct1_add;
3848         c->idct    = j_rev_dct1;
3849         c->idct_permutation_type= FF_NO_IDCT_PERM;
3850     }else{
3851         if(avctx->idct_algo==FF_IDCT_INT){
3852             c->idct_put= ff_jref_idct_put;
3853             c->idct_add= ff_jref_idct_add;
3854             c->idct    = j_rev_dct;
3855             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3856         }else if(avctx->idct_algo==FF_IDCT_VP3){
3857             c->idct_put= ff_vp3_idct_put_c;
3858             c->idct_add= ff_vp3_idct_add_c;
3859             c->idct    = ff_vp3_idct_c;
3860             c->idct_permutation_type= FF_NO_IDCT_PERM;
3861         }else{ //accurate/default
3862             c->idct_put= simple_idct_put;
3863             c->idct_add= simple_idct_add;
3864             c->idct    = simple_idct;
3865             c->idct_permutation_type= FF_NO_IDCT_PERM;
3866         }
3867     }
3868
3869     c->h264_idct_add= ff_h264_idct_add_c;
3870     c->h264_idct8_add= ff_h264_idct8_add_c;
3871     c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3872     c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3873
3874     c->get_pixels = get_pixels_c;
3875     c->diff_pixels = diff_pixels_c;
3876     c->put_pixels_clamped = put_pixels_clamped_c;
3877     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3878     c->add_pixels_clamped = add_pixels_clamped_c;
3879     c->add_pixels8 = add_pixels8_c;
3880     c->add_pixels4 = add_pixels4_c;
3881     c->gmc1 = gmc1_c;
3882     c->gmc = ff_gmc_c;
3883     c->clear_blocks = clear_blocks_c;
3884     c->pix_sum = pix_sum_c;
3885     c->pix_norm1 = pix_norm1_c;
3886
3887     /* TODO [0] 16  [1] 8 */
3888     c->pix_abs[0][0] = pix_abs16_c;
3889     c->pix_abs[0][1] = pix_abs16_x2_c;
3890     c->pix_abs[0][2] = pix_abs16_y2_c;
3891     c->pix_abs[0][3] = pix_abs16_xy2_c;
3892     c->pix_abs[1][0] = pix_abs8_c;
3893     c->pix_abs[1][1] = pix_abs8_x2_c;
3894     c->pix_abs[1][2] = pix_abs8_y2_c;
3895     c->pix_abs[1][3] = pix_abs8_xy2_c;
3896
3897 #define dspfunc(PFX, IDX, NUM) \
3898     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3899     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3900     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3901     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3902
3903     dspfunc(put, 0, 16);
3904     dspfunc(put_no_rnd, 0, 16);
3905     dspfunc(put, 1, 8);
3906     dspfunc(put_no_rnd, 1, 8);
3907     dspfunc(put, 2, 4);
3908     dspfunc(put, 3, 2);
3909
3910     dspfunc(avg, 0, 16);
3911     dspfunc(avg_no_rnd, 0, 16);
3912     dspfunc(avg, 1, 8);
3913     dspfunc(avg_no_rnd, 1, 8);
3914     dspfunc(avg, 2, 4);
3915     dspfunc(avg, 3, 2);
3916 #undef dspfunc
3917
3918     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3919     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3920
3921     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3922     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3923     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3924     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3925     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3926     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3927     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3928     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3929     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3930
3931     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3932     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3933     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3934     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3935     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3936     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3937     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3938     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3939     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3940
3941 #define dspfunc(PFX, IDX, NUM) \
3942     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3943     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3944     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3945     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3946     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3947     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3948     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3949     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3950     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3951     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3952     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3953     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3954     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3955     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3956     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3957     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3958
3959     dspfunc(put_qpel, 0, 16);
3960     dspfunc(put_no_rnd_qpel, 0, 16);
3961
3962     dspfunc(avg_qpel, 0, 16);
3963     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3964
3965     dspfunc(put_qpel, 1, 8);
3966     dspfunc(put_no_rnd_qpel, 1, 8);
3967
3968     dspfunc(avg_qpel, 1, 8);
3969     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3970
3971     dspfunc(put_h264_qpel, 0, 16);
3972     dspfunc(put_h264_qpel, 1, 8);
3973     dspfunc(put_h264_qpel, 2, 4);
3974     dspfunc(put_h264_qpel, 3, 2);
3975     dspfunc(avg_h264_qpel, 0, 16);
3976     dspfunc(avg_h264_qpel, 1, 8);
3977     dspfunc(avg_h264_qpel, 2, 4);
3978
3979 #undef dspfunc
3980     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3981     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3982     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3983     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3984     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3985     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3986
3987     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3988     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3989     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3990     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3991     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3992     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3993     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3994     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3995     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3996     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3997     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3998     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3999     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4000     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4001     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4002     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4003     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4004     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4005     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4006     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4007
4008     ff_cavsdsp_init(c,avctx);
4009
4010     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4011     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4012     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4013     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4014     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4015     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4016     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4017     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4018
4019 #define SET_CMP_FUNC(name) \
4020     c->name[0]= name ## 16_c;\
4021     c->name[1]= name ## 8x8_c;
4022
4023     SET_CMP_FUNC(hadamard8_diff)
4024     c->hadamard8_diff[4]= hadamard8_intra16_c;
4025     SET_CMP_FUNC(dct_sad)
4026     SET_CMP_FUNC(dct_max)
4027 #ifdef CONFIG_GPL
4028     SET_CMP_FUNC(dct264_sad)
4029 #endif
4030     c->sad[0]= pix_abs16_c;
4031     c->sad[1]= pix_abs8_c;
4032     c->sse[0]= sse16_c;
4033     c->sse[1]= sse8_c;
4034     c->sse[2]= sse4_c;
4035     SET_CMP_FUNC(quant_psnr)
4036     SET_CMP_FUNC(rd)
4037     SET_CMP_FUNC(bit)
4038     c->vsad[0]= vsad16_c;
4039     c->vsad[4]= vsad_intra16_c;
4040     c->vsse[0]= vsse16_c;
4041     c->vsse[4]= vsse_intra16_c;
4042     c->nsse[0]= nsse16_c;
4043     c->nsse[1]= nsse8_c;
4044 #ifdef CONFIG_SNOW_ENCODER
4045     c->w53[0]= w53_16_c;
4046     c->w53[1]= w53_8_c;
4047     c->w97[0]= w97_16_c;
4048     c->w97[1]= w97_8_c;
4049 #endif
4050
4051     c->add_bytes= add_bytes_c;
4052     c->diff_bytes= diff_bytes_c;
4053     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4054     c->bswap_buf= bswap_buf;
4055
4056     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4057     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4058     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4059     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4060     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4061     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4062
4063     c->h263_h_loop_filter= h263_h_loop_filter_c;
4064     c->h263_v_loop_filter= h263_v_loop_filter_c;
4065
4066     c->h261_loop_filter= h261_loop_filter_c;
4067
4068     c->try_8x8basis= try_8x8basis_c;
4069     c->add_8x8basis= add_8x8basis_c;
4070
4071 #ifdef CONFIG_SNOW_ENCODER
4072     c->vertical_compose97i = ff_snow_vertical_compose97i;
4073     c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4074     c->inner_add_yblock = ff_snow_inner_add_yblock;
4075 #endif
4076
4077     c->shrink[0]= ff_img_copy_plane;
4078     c->shrink[1]= ff_shrink22;
4079     c->shrink[2]= ff_shrink44;
4080     c->shrink[3]= ff_shrink88;
4081
4082     c->prefetch= just_return;
4083
4084 #ifdef HAVE_MMX
4085     dsputil_init_mmx(c, avctx);
4086 #endif
4087 #ifdef ARCH_ARMV4L
4088     dsputil_init_armv4l(c, avctx);
4089 #endif
4090 #ifdef HAVE_MLIB
4091     dsputil_init_mlib(c, avctx);
4092 #endif
4093 #ifdef ARCH_SPARC
4094    dsputil_init_vis(c,avctx);
4095 #endif
4096 #ifdef ARCH_ALPHA
4097     dsputil_init_alpha(c, avctx);
4098 #endif
4099 #ifdef ARCH_POWERPC
4100     dsputil_init_ppc(c, avctx);
4101 #endif
4102 #ifdef HAVE_MMI
4103     dsputil_init_mmi(c, avctx);
4104 #endif
4105 #ifdef ARCH_SH4
4106     dsputil_init_sh4(c,avctx);
4107 #endif
4108
4109     switch(c->idct_permutation_type){
4110     case FF_NO_IDCT_PERM:
4111         for(i=0; i<64; i++)
4112             c->idct_permutation[i]= i;
4113         break;
4114     case FF_LIBMPEG2_IDCT_PERM:
4115         for(i=0; i<64; i++)
4116             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4117         break;
4118     case FF_SIMPLE_IDCT_PERM:
4119         for(i=0; i<64; i++)
4120             c->idct_permutation[i]= simple_mmx_permutation[i];
4121         break;
4122     case FF_TRANSPOSE_IDCT_PERM:
4123         for(i=0; i<64; i++)
4124             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4125         break;
4126     case FF_PARTTRANS_IDCT_PERM:
4127         for(i=0; i<64; i++)
4128             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4129         break;
4130     default:
4131         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4132     }
4133 }
4134