git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18  *
  19  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  20  */
  21 #include "avcodec.h"
  22 #include "dsputil.h"
  23 #include "simple_idct.h"
  24
/*
 * Function-pointer hooks for the DSP primitives.  They are only declared
 * here; the code that assigns them is not in this part of the file.
 * The *_c functions defined further down (get_pixels_c, diff_pixels_c,
 * put_pixels_clamped_c, add_pixels_clamped_c, pix_sum_c, pix_norm1_c)
 * have signatures matching the corresponding hooks.
 */
void (*ff_idct)(DCTELEM *block);
void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
void (*clear_blocks)(DCTELEM *blocks);
int (*pix_sum)(UINT8 * pix, int line_size);
int (*pix_norm1)(UINT8 * pix, int line_size);

/*
 * Hooks of type op_pixels_abs_func (declared in a header), one per block
 * size (16x16 / 8x8) and per plain, _x2, _y2, _xy2 flavour.
 * NOTE(review): presumably absolute-difference metrics with half-pel
 * interpolation in x / y / both -- confirm against dsputil.h.
 */
op_pixels_abs_func pix_abs16x16;
op_pixels_abs_func pix_abs16x16_x2;
op_pixels_abs_func pix_abs16x16_y2;
op_pixels_abs_func pix_abs16x16_xy2;

op_pixels_abs_func pix_abs8x8;
op_pixels_abs_func pix_abs8x8_x2;
op_pixels_abs_func pix_abs8x8_y2;
op_pixels_abs_func pix_abs8x8_xy2;

/*
 * Lookup tables; their contents are filled in outside this part of the file.
 * cropTbl is always accessed through (cropTbl + MAX_NEG_CROP), which makes
 * indices from -MAX_NEG_CROP up to 255 + MAX_NEG_CROP valid.
 * squareTbl is accessed through (squareTbl + 256) by pix_norm1_c.
 */
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
UINT32 squareTbl[512];
  49
  50 extern INT16 ff_mpeg1_default_intra_matrix[64];
  51 extern INT16 ff_mpeg1_default_non_intra_matrix[64];
  52 extern INT16 ff_mpeg4_default_intra_matrix[64];
  53 extern INT16 ff_mpeg4_default_non_intra_matrix[64];
  54
/*
 * Zigzag scan table: entry i is the raster index (row * 8 + column) of the
 * i-th position visited when walking an 8x8 block along its anti-diagonals
 * (0, 1, 8, 16, 9, 2, ...).
 */
UINT8 zigzag_direct[64] = {
    0, 1, 8, 16, 9, 2, 3, 10,
    17, 24, 32, 25, 18, 11, 4, 5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13, 6, 7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  65
/* inverse of the non-permuted zigzag_direct, plus 1, for the MMX quantizer
 * (no initialiser here; filled in outside this part of the file) */
UINT16 __align8 inv_zigzag_direct16[64];

/* non-permuted zigzag_direct for the MMX quantizer
 * (no initialiser here; filled in outside this part of the file) */
UINT8 zigzag_direct_noperm[64];
  71
/*
 * 64-entry scan table of raster indices (row * 8 + column) into an 8x8
 * block; every index 0..63 appears exactly once.
 * NOTE(review): going by the name this is the "alternate horizontal" scan
 * order; its consumer is not in this part of the file.
 */
UINT8 ff_alternate_horizontal_scan[64] = {
    0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  82
/*
 * 64-entry scan table of raster indices (row * 8 + column) into an 8x8
 * block; the walk starts down the first column (0, 8, 16, 24, ...).
 * NOTE(review): going by the name this is the "alternate vertical" scan
 * order; its consumer is not in this part of the file.
 */
UINT8 ff_alternate_vertical_scan[64] = {
    0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
  93
#ifdef SIMPLE_IDCT

/*
 * Input permutation for the simple_idct_mmx.
 * 64 entries, each a coefficient index in the range 0x00..0x3F.  The table
 * is only compiled when SIMPLE_IDCT is defined; nothing in this part of the
 * file reads it.
 */
static UINT8 simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
#endif
 108
/*
 * Reciprocal table for replacing a division by a multiplication:
 *     a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * The product does not fit in 32 bits, so it has to be formed in a 64 bit
 * type before shifting.  Entries for b >= 2 appear to be 2^32 / b rounded up
 * (e.g. inverse[2] == 2^31, inverse[3] == 1431655766); inverse[0] is 0 and
 * inverse[1] is 2^32 - 1, both outside the range for which the identity
 * above is stated.
 */
UINT32 inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
 144
/* used to skip zeros at the end; filled in by build_zigzag_end() below:
 * zigzag_end[i] is 1 + the largest zigzag_direct[] value among scan
 * positions 0..i */
UINT8 zigzag_end[64];

/* NOTE(review): no initialiser and no use in this part of the file --
 * presumably the coefficient permutation required by the selected IDCT;
 * confirm where it is set up */
UINT8 permutation[64];
//UINT8 invPermutation[64];
 150
 151 static void build_zigzag_end(void)
 152 {
 153     int lastIndex;
 154     int lastIndexAfterPerm=0;
 155     for(lastIndex=0; lastIndex<64; lastIndex++)
 156     {
 157         if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
 158             lastIndexAfterPerm= zigzag_direct[lastIndex];
 159         zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
 160     }
 161 }
 162
 163 int pix_sum_c(UINT8 * pix, int line_size)
 164 {
 165     int s, i, j;
 166
 167     s = 0;
 168     for (i = 0; i < 16; i++) {
 169         for (j = 0; j < 16; j += 8) {
 170             s += pix[0];
 171             s += pix[1];
 172             s += pix[2];
 173             s += pix[3];
 174             s += pix[4];
 175             s += pix[5];
 176             s += pix[6];
 177             s += pix[7];
 178             pix += 8;
 179         }
 180         pix += line_size - 16;
 181     }
 182     return s;
 183 }
 184
 185 int pix_norm1_c(UINT8 * pix, int line_size)
 186 {
 187     int s, i, j;
 188     UINT32 *sq = squareTbl + 256;
 189
 190     s = 0;
 191     for (i = 0; i < 16; i++) {
 192         for (j = 0; j < 16; j += 8) {
 193             s += sq[pix[0]];
 194             s += sq[pix[1]];
 195             s += sq[pix[2]];
 196             s += sq[pix[3]];
 197             s += sq[pix[4]];
 198             s += sq[pix[5]];
 199             s += sq[pix[6]];
 200             s += sq[pix[7]];
 201             pix += 8;
 202         }
 203         pix += line_size - 16;
 204     }
 205     return s;
 206 }
 207
 208
 209 void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
 210 {
 211     int i;
 212
 213     /* read the pixels */
 214     for(i=0;i<8;i++) {
 215         block[0] = pixels[0];
 216         block[1] = pixels[1];
 217         block[2] = pixels[2];
 218         block[3] = pixels[3];
 219         block[4] = pixels[4];
 220         block[5] = pixels[5];
 221         block[6] = pixels[6];
 222         block[7] = pixels[7];
 223         pixels += line_size;
 224         block += 8;
 225     }
 226 }
 227
 228 void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
 229                    int stride){
 230     int i;
 231
 232     /* read the pixels */
 233     for(i=0;i<8;i++) {
 234         block[0] = s1[0] - s2[0];
 235         block[1] = s1[1] - s2[1];
 236         block[2] = s1[2] - s2[2];
 237         block[3] = s1[3] - s2[3];
 238         block[4] = s1[4] - s2[4];
 239         block[5] = s1[5] - s2[5];
 240         block[6] = s1[6] - s2[6];
 241         block[7] = s1[7] - s2[7];
 242         s1 += stride;
 243         s2 += stride;
 244         block += 8;
 245     }
 246 }
 247
 248
 249 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
 250                           int line_size)
 251 {
 252     int i;
 253     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 254
 255     /* read the pixels */
 256     for(i=0;i<8;i++) {
 257         pixels[0] = cm[block[0]];
 258         pixels[1] = cm[block[1]];
 259         pixels[2] = cm[block[2]];
 260         pixels[3] = cm[block[3]];
 261         pixels[4] = cm[block[4]];
 262         pixels[5] = cm[block[5]];
 263         pixels[6] = cm[block[6]];
 264         pixels[7] = cm[block[7]];
 265
 266         pixels += line_size;
 267         block += 8;
 268     }
 269 }
 270
 271 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
 272                           int line_size)
 273 {
 274     int i;
 275     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 276
 277     /* read the pixels */
 278     for(i=0;i<8;i++) {
 279         pixels[0] = cm[pixels[0] + block[0]];
 280         pixels[1] = cm[pixels[1] + block[1]];
 281         pixels[2] = cm[pixels[2] + block[2]];
 282         pixels[3] = cm[pixels[3] + block[3]];
 283         pixels[4] = cm[pixels[4] + block[4]];
 284         pixels[5] = cm[pixels[5] + block[5]];
 285         pixels[6] = cm[pixels[6] + block[6]];
 286         pixels[7] = cm[pixels[7] + block[7]];
 287         pixels += line_size;
 288         block += 8;
 289     }
 290 }
 291 #if 0
 292
/*
 * PIXOP2(OPNAME, OP), 64 bit variant.  This copy sits inside "#if 0" and is
 * therefore not compiled.
 *
 * Expands to a family of functions that each process h rows of 8 bytes (one
 * uint64_t per row), reading from `pixels` and storing to `block` through
 * OP(dst, value); both pointers advance by line_size per row:
 *   OPNAME_pixels               the source row itself
 *   OPNAME_pixels_x2            average of pixels and pixels+1, rounded up
 *   OPNAME_pixels_y2            average of pixels and pixels+line_size,
 *                               rounded up
 *   OPNAME_pixels_xy2           average of the four bytes at pixels,
 *                               pixels+1 and the same two positions one row
 *                               below, computed as (sum + 2) >> 2
 *   OPNAME_no_rnd_pixels_x2/_y2 same as above but rounded down
 *   OPNAME_no_rnd_pixels_xy2    same as above but computed as (sum + 1) >> 2
 * and to the tables OPNAME_pixels_tab[4] and OPNAME_no_rnd_pixels_tab[4],
 * indexed 0..3 = plain, x2, y2, xy2 (the no_rnd table reuses OPNAME_pixels
 * for index 0).
 *
 * All averages work on the 8 bytes of a word at once:
 *   (a|b) - (((a^b)&0xFE..FE)>>1)   is (a + b + 1) >> 1 in every byte
 *   (a&b) + (((a^b)&0xFE..FE)>>1)   is (a + b) >> 1 in every byte
 * The xy2 forms split every byte into its low 2 bits (l0/l1, which also
 * carry the rounding constant) and its high 6 bits pre-shifted by 2 (h0/h1),
 * so four bytes can be summed without a carry into the neighbouring byte.
 * Each xy2 loop iteration stores two rows, so an odd h stores h+1 rows.
 *
 * LD64 is defined outside this file section (presumably an 8 byte load).
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};

/* OP for the "avg" family, 64 bit flavour: replace a by the byte-wise
 * average of a and b, rounded up.  Both arguments are expanded twice. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 32 bit variant (the 64 bit variant above is disabled by the #if 0)
 440
/*
 * PIXOP2(OPNAME, OP), 32 bit variant.  This copy is in the #else branch of
 * "#if 0", i.e. it is the one that gets compiled.
 *
 * Expands to a family of functions that each process h rows of 8 bytes, as
 * two uint32_t words per row, reading from `pixels` and storing to `block`
 * through OP(dst, value); both pointers advance by line_size per row:
 *   OPNAME_pixels               the source row itself
 *   OPNAME_pixels_x2            average of pixels and pixels+1, rounded up
 *   OPNAME_pixels_y2            average of pixels and pixels+line_size,
 *                               rounded up
 *   OPNAME_pixels_xy2           average of the four bytes at pixels,
 *                               pixels+1 and the same two positions one row
 *                               below, computed as (sum + 2) >> 2
 *   OPNAME_no_rnd_pixels_x2/_y2 same as above but rounded down
 *   OPNAME_no_rnd_pixels_xy2    same as above but computed as (sum + 1) >> 2
 * and to the tables OPNAME_pixels_tab[4] and OPNAME_no_rnd_pixels_tab[4],
 * indexed 0..3 = plain, x2, y2, xy2 (the no_rnd table reuses OPNAME_pixels
 * for index 0).
 *
 * All averages work on the 4 bytes of a word at once:
 *   (a|b) - (((a^b)&0xFEFEFEFE)>>1)   is (a + b + 1) >> 1 in every byte
 *   (a&b) + (((a^b)&0xFEFEFEFE)>>1)   is (a + b) >> 1 in every byte
 * The xy2 forms split every byte into its low 2 bits (l0/l1, which also
 * carry the rounding constant) and its high 6 bits pre-shifted by 2 (h0/h1),
 * so four bytes can be summed without a carry into the neighbouring byte.
 *
 * In x2/y2 the inner j loop handles the two words of a row and the pointers
 * are then moved on by line_size-8.  In xy2 the outer j loop handles the
 * left and right 4 byte column in turn: the inner loop stores two rows per
 * iteration and the pointers are rewound by h rows (h+1 for `pixels`, which
 * runs one row ahead) plus 4 bytes to the right afterwards, so that rewind
 * is only exact for even h.
 *
 * LD32 is defined outside this file section (presumably a 4 byte load).
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};
/* OP for the "avg" family, 32 bit flavour: replace a by the byte-wise
 * average of a and b, rounded up.  Both arguments are expanded twice. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 617 #endif
/* OP for the "put" family: plain store of b into a */
#define op_put(a, b) a = b

/* Instantiate the avg_* and put_* function families and their
 * *_pixels_tab / *_no_rnd_pixels_tab tables from PIXOP2, then drop the
 * helper macros so they do not leak into the rest of the file. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
 624
 625 #if 0
/* FIXME this stuff could be removed as it's not really used anymore */
 627 #define PIXOP(BTYPE, OPNAME, OP, INCR)                                                   \
 628                                                                                          \
 629 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
 630 {                                                                                        \
 631     BTYPE *p;                                                                            \
 632     const UINT8 *pix;                                                                    \
 633                                                                                          \
 634     p = block;                                                                           \
 635     pix = pixels;                                                                        \
 636     do {                                                                                 \
 637         OP(p[0], pix[0]);                                                                  \
 638         OP(p[1], pix[1]);                                                                  \
 639         OP(p[2], pix[2]);                                                                  \
 640         OP(p[3], pix[3]);                                                                  \
 641         OP(p[4], pix[4]);                                                                  \
 642         OP(p[5], pix[5]);                                                                  \
 643         OP(p[6], pix[6]);                                                                  \
 644         OP(p[7], pix[7]);                                                                  \
 645         pix += line_size;                                                                \
 646         p += INCR;                                                                       \
 647     } while (--h);;                                                                       \
 648 }                                                                                        \
 649                                                                                          \
 650 static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
 651 {                                                                                        \
 652     BTYPE *p;                                                                          \
 653     const UINT8 *pix;                                                                    \
 654                                                                                          \
 655     p = block;                                                                           \
 656     pix = pixels;                                                                        \
 657     do {                                                                   \
 658         OP(p[0], avg2(pix[0], pix[1]));                                                    \
 659         OP(p[1], avg2(pix[1], pix[2]));                                                    \
 660         OP(p[2], avg2(pix[2], pix[3]));                                                    \
 661         OP(p[3], avg2(pix[3], pix[4]));                                                    \
 662         OP(p[4], avg2(pix[4], pix[5]));                                                    \
 663         OP(p[5], avg2(pix[5], pix[6]));                                                    \
 664         OP(p[6], avg2(pix[6], pix[7]));                                                    \
 665         OP(p[7], avg2(pix[7], pix[8]));                                                    \
 666         pix += line_size;                                                                \
 667         p += INCR;                                                                       \
 668     } while (--h);                                                                        \
 669 }                                                                                        \
 670                                                                                          \
 671 static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
 672 {                                                                                        \
 673     BTYPE *p;                                                                          \
 674     const UINT8 *pix;                                                                    \
 675     const UINT8 *pix1;                                                                   \
 676                                                                                          \
 677     p = block;                                                                           \
 678     pix = pixels;                                                                        \
 679     pix1 = pixels + line_size;                                                           \
 680     do {                                                                                 \
 681         OP(p[0], avg2(pix[0], pix1[0]));                                                   \
 682         OP(p[1], avg2(pix[1], pix1[1]));                                                   \
 683         OP(p[2], avg2(pix[2], pix1[2]));                                                   \
 684         OP(p[3], avg2(pix[3], pix1[3]));                                                   \
 685         OP(p[4], avg2(pix[4], pix1[4]));                                                   \
 686         OP(p[5], avg2(pix[5], pix1[5]));                                                   \
 687         OP(p[6], avg2(pix[6], pix1[6]));                                                   \
 688         OP(p[7], avg2(pix[7], pix1[7]));                                                   \
 689         pix += line_size;                                                                \
 690         pix1 += line_size;                                                               \
 691         p += INCR;                                                                       \
 692     } while(--h);                                                                         \
 693 }                                                                                        \
 694                                                                                          \
 695 static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
 696 {                                                                                        \
 697     BTYPE *p;                                                                          \
 698     const UINT8 *pix;                                                                    \
 699     const UINT8 *pix1;                                                                   \
 700                                                                                          \
 701     p = block;                                                                           \
 702     pix = pixels;                                                                        \
 703     pix1 = pixels + line_size;                                                           \
 704     do {                                                                   \
 705         OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1]));                                  \
 706         OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2]));                                  \
 707         OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3]));                                  \
 708         OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4]));                                  \
 709         OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5]));                                  \
 710         OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6]));                                  \
 711         OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7]));                                  \
 712         OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8]));                                  \
 713         pix += line_size;                                                                \
 714         pix1 += line_size;                                                               \
 715         p += INCR;                                                                       \
 716     } while(--h);                                                                         \
 717 }                                                                                        \
 718                                                                                          \
 719 void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
 720     OPNAME ## _pixels,                                                                   \
 721     OPNAME ## _pixels_x2,                                                                \
 722     OPNAME ## _pixels_y2,                                                                \
 723     OPNAME ## _pixels_xy2,                                                               \
 724 };
 725
 726 /* rounding primitives */
 727 #define avg2(a,b) ((a+b+1)>>1)
 728 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 729
 730 #define op_avg(a, b) a = avg2(a, b)
 731 #define op_sub(a, b) a -= b
 732 #define op_put(a, b) a = b
 733
 734 PIXOP(DCTELEM, sub, op_sub, 8)
 735 PIXOP(uint8_t, avg, op_avg, line_size)
 736 PIXOP(uint8_t, put, op_put, line_size)
 737
 738 /* not rounding primitives */
 739 #undef avg2
 740 #undef avg4
 741 #define avg2(a,b) ((a+b)>>1)
 742 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
 743
 744 PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
 745 PIXOP(uint8_t, put_no_rnd, op_put, line_size)
 746 /* motion estimation */
 747
 748 #undef avg2
 749 #undef avg4
 750 #endif
 751
 752 #define avg2(a,b) ((a+b+1)>>1)
 753 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 754
 755 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
 756 {
 757     const int A=(16-x16)*(16-y16);
 758     const int B=(   x16)*(16-y16);
 759     const int C=(16-x16)*(   y16);
 760     const int D=(   x16)*(   y16);
 761     int i;
 762     rounder= 128 - rounder;
 763
 764     for(i=0; i<h; i++)
 765     {
 766         dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
 767         dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
 768         dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
 769         dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
 770         dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
 771         dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
 772         dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
 773         dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
 774         dst+= srcStride;
 775         src+= srcStride;
 776     }
 777 }
 778
 779 static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
 780 {
 781     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 782     int i;
 783     for(i=0; i<h; i++)
 784     {
 785         dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
 786         dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
 787         dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
 788         dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
 789         dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
 790         dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
 791         dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
 792         dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
 793         dst+=dstStride;
 794         src+=srcStride;
 795     }
 796 }
 797
 798 static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
 799 {
 800     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 801     int i;
 802     for(i=0; i<w; i++)
 803     {
 804         const int src0= src[0*srcStride];
 805         const int src1= src[1*srcStride];
 806         const int src2= src[2*srcStride];
 807         const int src3= src[3*srcStride];
 808         const int src4= src[4*srcStride];
 809         const int src5= src[5*srcStride];
 810         const int src6= src[6*srcStride];
 811         const int src7= src[7*srcStride];
 812         const int src8= src[8*srcStride];
 813         dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
 814         dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
 815         dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
 816         dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
 817         dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
 818         dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
 819         dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
 820         dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
 821         dst++;
 822         src++;
 823     }
 824 }
 825
 826 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
 827 {
 828     int i;
 829     for(i=0; i<8; i++)
 830     {
 831         dst[0]= src[0];
 832         dst[1]= src[1];
 833         dst[2]= src[2];
 834         dst[3]= src[3];
 835         dst[4]= src[4];
 836         dst[5]= src[5];
 837         dst[6]= src[6];
 838         dst[7]= src[7];
 839         dst+=dstStride;
 840         src+=srcStride;
 841     }
 842 }
 843
 844 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
 845 {
 846     int i;
 847     for(i=0; i<8; i++)
 848     {
 849         dst[0]= (src1[0] + src2[0] + r)>>1;
 850         dst[1]= (src1[1] + src2[1] + r)>>1;
 851         dst[2]= (src1[2] + src2[2] + r)>>1;
 852         dst[3]= (src1[3] + src2[3] + r)>>1;
 853         dst[4]= (src1[4] + src2[4] + r)>>1;
 854         dst[5]= (src1[5] + src2[5] + r)>>1;
 855         dst[6]= (src1[6] + src2[6] + r)>>1;
 856         dst[7]= (src1[7] + src2[7] + r)>>1;
 857         dst+=dstStride;
 858         src1+=srcStride;
 859         src2+=8;
 860     }
 861 }
 862
 863 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
 864 {
 865     int i;
 866     for(i=0; i<8; i++)
 867     {
 868         dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
 869         dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
 870         dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
 871         dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
 872         dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
 873         dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
 874         dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
 875         dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
 876         dst+=dstStride;
 877         src1+=srcStride;
 878         src2+=8;
 879         src3+=8;
 880         src4+=8;
 881     }
 882 }
 883
 884 #define QPEL_MC(r, name) \
 885 static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 886 {\
 887     put_block(dst, src, dstStride, srcStride);\
 888 }\
 889 \
 890 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 891 {\
 892     UINT8 half[64];\
 893     qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
 894     avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
 895 }\
 896 \
 897 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 898 {\
 899     qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
 900 }\
 901 \
 902 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 903 {\
 904     UINT8 half[64];\
 905     qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
 906     avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
 907 }\
 908 \
 909 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 910 {\
 911     UINT8 half[64];\
 912     qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
 913     avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
 914 }\
 915 \
 916 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 917 {\
 918     qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
 919 }\
 920 \
 921 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 922 {\
 923     UINT8 half[64];\
 924     qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
 925     avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
 926 }\
 927 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 928 {\
 929     UINT8 halfH[72];\
 930     UINT8 halfV[64];\
 931     UINT8 halfHV[64];\
 932     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 933     qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
 934     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 935     avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
 936 }\
 937 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 938 {\
 939     UINT8 halfH[72];\
 940     UINT8 halfV[64];\
 941     UINT8 halfHV[64];\
 942     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 943     qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
 944     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 945     avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
 946 }\
 947 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 948 {\
 949     UINT8 halfH[72];\
 950     UINT8 halfV[64];\
 951     UINT8 halfHV[64];\
 952     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 953     qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
 954     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 955     avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
 956 }\
 957 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 958 {\
 959     UINT8 halfH[72];\
 960     UINT8 halfV[64];\
 961     UINT8 halfHV[64];\
 962     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 963     qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
 964     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 965     avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
 966 }\
 967 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 968 {\
 969     UINT8 halfH[72];\
 970     UINT8 halfHV[64];\
 971     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 972     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 973     avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
 974 }\
 975 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 976 {\
 977     UINT8 halfH[72];\
 978     UINT8 halfHV[64];\
 979     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 980     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 981     avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
 982 }\
 983 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 984 {\
 985     UINT8 halfH[72];\
 986     UINT8 halfV[64];\
 987     UINT8 halfHV[64];\
 988     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 989     qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
 990     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 991     avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
 992 }\
 993 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 994 {\
 995     UINT8 halfH[72];\
 996     UINT8 halfV[64];\
 997     UINT8 halfHV[64];\
 998     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 999     qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
1000     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
1001     avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
1002 }\
1003 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
1004 {\
1005     UINT8 halfH[72];\
1006     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
1007     qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
1008 }\
1009 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
1010     qpel_mc00_c ## name,                                                                   \
1011     qpel_mc10_c ## name,                                                                   \
1012     qpel_mc20_c ## name,                                                                   \
1013     qpel_mc30_c ## name,                                                                   \
1014     qpel_mc01_c ## name,                                                                   \
1015     qpel_mc11_c ## name,                                                                   \
1016     qpel_mc21_c ## name,                                                                   \
1017     qpel_mc31_c ## name,                                                                   \
1018     qpel_mc02_c ## name,                                                                   \
1019     qpel_mc12_c ## name,                                                                   \
1020     qpel_mc22_c ## name,                                                                   \
1021     qpel_mc32_c ## name,                                                                   \
1022     qpel_mc03_c ## name,                                                                   \
1023     qpel_mc13_c ## name,                                                                   \
1024     qpel_mc23_c ## name,                                                                   \
1025     qpel_mc33_c ## name,                                                                   \
1026 };
1027
1028 QPEL_MC(0, _rnd)
1029 QPEL_MC(1, _no_rnd)
1030
/* Sum of absolute differences between two 16x16 pixel blocks that share
   the same row pitch (line_size). */
int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        const UINT8 *cur = pix1 + row * line_size;
        const UINT8 *ref = pix2 + row * line_size;

        for (col = 0; col < 16; col++)
            sad += abs(cur[col] - ref[col]);
    }
    return sad;
}
1058
1059 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1060 {
1061     int s, i;
1062
1063     s = 0;
1064     for(i=0;i<16;i++) {
1065         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1066         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1067         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1068         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1069         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1070         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1071         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1072         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1073         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1074         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1075         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1076         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1077         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1078         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1079         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1080         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1081         pix1 += line_size;
1082         pix2 += line_size;
1083     }
1084     return s;
1085 }
1086
1087 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1088 {
1089     int s, i;
1090     UINT8 *pix3 = pix2 + line_size;
1091
1092     s = 0;
1093     for(i=0;i<16;i++) {
1094         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1095         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1096         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1097         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1098         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1099         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1100         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1101         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1102         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1103         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1104         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1105         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1106         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1107         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1108         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1109         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1110         pix1 += line_size;
1111         pix2 += line_size;
1112         pix3 += line_size;
1113     }
1114     return s;
1115 }
1116
1117 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1118 {
1119     int s, i;
1120     UINT8 *pix3 = pix2 + line_size;
1121
1122     s = 0;
1123     for(i=0;i<16;i++) {
1124         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1125         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1126         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1127         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1128         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1129         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1130         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1131         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1132         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1133         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1134         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1135         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1136         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1137         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1138         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1139         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1140         pix1 += line_size;
1141         pix2 += line_size;
1142         pix3 += line_size;
1143     }
1144     return s;
1145 }
1146
/* Sum of absolute differences between two 8x8 pixel blocks that share
   the same row pitch (line_size). */
int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        const UINT8 *cur = pix1 + row * line_size;
        const UINT8 *ref = pix2 + row * line_size;

        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - ref[col]);
    }
    return sad;
}
1166
1167 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1168 {
1169     int s, i;
1170
1171     s = 0;
1172     for(i=0;i<8;i++) {
1173         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1174         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1175         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1176         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1177         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1178         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1179         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1180         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1181         pix1 += line_size;
1182         pix2 += line_size;
1183     }
1184     return s;
1185 }
1186
1187 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1188 {
1189     int s, i;
1190     UINT8 *pix3 = pix2 + line_size;
1191
1192     s = 0;
1193     for(i=0;i<8;i++) {
1194         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1195         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1196         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1197         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1198         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1199         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1200         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1201         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1202         pix1 += line_size;
1203         pix2 += line_size;
1204         pix3 += line_size;
1205     }
1206     return s;
1207 }
1208
1209 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1210 {
1211     int s, i;
1212     UINT8 *pix3 = pix2 + line_size;
1213
1214     s = 0;
1215     for(i=0;i<8;i++) {
1216         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1217         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1218         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1219         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1220         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1221         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1222         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1223         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1224         pix1 += line_size;
1225         pix2 += line_size;
1226         pix3 += line_size;
1227     }
1228     return s;
1229 }
1230
/* permute the block so that its coefficient order corresponds to the
   order expected by the MMX idct */
1233 #ifdef SIMPLE_IDCT
1234  /* general permutation, but perhaps slightly slower */
1235 void block_permute(INT16 *block)
1236 {
1237         int i;
1238         INT16 temp[64];
1239
1240         for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1241
1242         for(i=0; i<64; i++) block[i] = temp[i];
1243 }
1244 #else
1245
/* Fixed permutation applied to each of the 8 rows of a 64-entry block:
   columns 1..6 receive the old columns 2, 4, 6, 1, 3, 5 respectively;
   columns 0 and 7 are left untouched. */
void block_permute(INT16 *block)
{
    /* origin[k] = old column that ends up in column k */
    static const unsigned char origin[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
    int row, k;

    for (row = 0; row < 8; row++) {
        INT16 *line = block + 8 * row;
        INT16 saved[8];

        for (k = 1; k < 7; k++)
            saved[k] = line[k];
        for (k = 1; k < 7; k++)
            line[k] = saved[origin[k]];
    }
}
1267 #endif
1268
/* Zero six consecutive blocks of 64 coefficients each, starting at blocks. */
void clear_blocks_c(DCTELEM *blocks)
{
    const size_t coeff_count = 6 * 64;

    memset(blocks, 0, coeff_count * sizeof(*blocks));
}
1273
1274 /* XXX: those functions should be suppressed ASAP when all IDCTs are
1275    converted */
1276 void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
1277 {
1278     ff_idct (block);
1279     put_pixels_clamped(block, dest, line_size);
1280 }
1281
1282 void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
1283 {
1284     ff_idct (block);
1285     add_pixels_clamped(block, dest, line_size);
1286 }
1287
/*
 * One-time initialisation of the DSP helper state:
 *  - fills the clamping table (cropTbl) and the squares table (squareTbl),
 *  - points the global function pointers at the plain C implementations,
 *  - calls the platform init hooks selected at compile time,
 *  - chooses the idct put/add entry points,
 *  - builds the coefficient permutation and, when a permuted idct is in
 *    use, permutes the scan tables and default quantisation matrices
 *    in place.
 *
 * NOTE(review): the scan tables and matrices are modified in place and
 * nothing in this function guards against a second call - presumably
 * callers ensure it runs only once; confirm.
 */
void dsputil_init(void)
{
    int i, j;
    int use_permuted_idct;

    /* cropTbl[MAX_NEG_CROP + v] yields v clamped to 0..255 for v in
       [-MAX_NEG_CROP, 255 + MAX_NEG_CROP] */
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        cropTbl[i] = 0;
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    /* squareTbl[256 + d] = d*d for d in -256..255 */
    for(i=0;i<512;i++) {
        squareTbl[i] = (i - 256) * (i - 256);
    }

    /* default idct; with SIMPLE_IDCT it starts as NULL and the check
       after the platform hooks decides what to use */
#ifdef SIMPLE_IDCT
    ff_idct = NULL;
#else
    ff_idct = j_rev_dct;
#endif
    /* C reference implementations */
    get_pixels = get_pixels_c;
    diff_pixels = diff_pixels_c;
    put_pixels_clamped = put_pixels_clamped_c;
    add_pixels_clamped = add_pixels_clamped_c;
    gmc1= gmc1_c;
    clear_blocks= clear_blocks_c;
    pix_sum= pix_sum_c;
    pix_norm1= pix_norm1_c;

    pix_abs16x16     = pix_abs16x16_c;
    pix_abs16x16_x2  = pix_abs16x16_x2_c;
    pix_abs16x16_y2  = pix_abs16x16_y2_c;
    pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
    pix_abs8x8     = pix_abs8x8_c;
    pix_abs8x8_x2  = pix_abs8x8_x2_c;
    pix_abs8x8_y2  = pix_abs8x8_y2_c;
    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;

    use_permuted_idct = 1;

    /* platform hooks - presumably these replace some of the pointers set
       above with optimised versions (their bodies are not in this file
       section); the mlib and alpha builds also switch the permuted idct
       off */
#ifdef HAVE_MMX
    dsputil_init_mmx();
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l();
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib();
    use_permuted_idct = 0;
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha();
    use_permuted_idct = 0;
#endif
#ifdef ARCH_POWERPC
#ifdef CONFIG_DARWIN
    dsputil_init_altivec();
#endif
#endif

    /* ff_idct still NULL: use the simple idct's own put/add routines,
       which work on unpermuted coefficients */
#ifdef SIMPLE_IDCT
    if (ff_idct == NULL) {
        ff_idct_put = simple_idct_put;
        ff_idct_add = simple_idct_add;
        use_permuted_idct=0;
    }
#endif
    /* otherwise wrap the selected idct with the generic put/add helpers */
    if(ff_idct != NULL) {
        ff_idct_put = gen_idct_put;
        ff_idct_add = gen_idct_add;
    }

    /* coefficient permutation: table-driven or bit-shuffled when a
       permuted idct is used, identity otherwise */
    if(use_permuted_idct)
#ifdef SIMPLE_IDCT
        for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
#else
        for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
#endif
    else
        for(i=0; i<64; i++) permutation[i]=i;

    /* both tables are derived from the zigzag scan before it is permuted
       below: the inverse table stores scan position + 1, the noperm
       table is a plain copy */
    for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
    for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];

    if (use_permuted_idct) {
        /* permute for IDCT */
        for(i=0;i<64;i++) {
            j = zigzag_direct[i];
            zigzag_direct[i] = block_permute_op(j);
            j = ff_alternate_horizontal_scan[i];
            ff_alternate_horizontal_scan[i] = block_permute_op(j);
            j = ff_alternate_vertical_scan[i];
            ff_alternate_vertical_scan[i] = block_permute_op(j);
        }
        /* the default quantisation matrices are permuted in place too */
        block_permute(ff_mpeg1_default_intra_matrix);
        block_permute(ff_mpeg1_default_non_intra_matrix);
        block_permute(ff_mpeg4_default_intra_matrix);
        block_permute(ff_mpeg4_default_non_intra_matrix);
    }

    build_zigzag_end();
}
1390
/* Remove any non bit-exact operation (for testing purposes).
   Only the MMX build does anything here: it forwards to
   dsputil_set_bit_exact_mmx(); in every other build this is a no-op. */
void avcodec_set_bit_exact(void)
{
#ifdef HAVE_MMX
    dsputil_set_bit_exact_mmx();
#endif
}
1398
1399 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1400               int orig_linesize[3], int coded_linesize,
1401               AVCodecContext *avctx)
1402 {
1403     int quad, diff, x, y;
1404     UINT8 *orig, *coded;
1405     UINT32 *sq = squareTbl + 256;
1406
1407     quad = 0;
1408     diff = 0;
1409
1410     /* Luminance */
1411     orig = orig_image[0];
1412     coded = coded_image[0];
1413
1414     for (y=0;y<avctx->height;y++) {
1415         for (x=0;x<avctx->width;x++) {
1416             diff = *(orig + x) - *(coded + x);
1417             quad += sq[diff];
1418         }
1419         orig += orig_linesize[0];
1420         coded += coded_linesize;
1421     }
1422
1423     avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1424
1425     if (avctx->psnr_y) {
1426         avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1427         avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1428     } else
1429         avctx->psnr_y = 99.99;
1430 }
1431