git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18  *
  19  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  20  */
  21 #include "avcodec.h"
  22 #include "dsputil.h"
  23 #include "simple_idct.h"
  24
/*
 * Run-time selectable DSP entry points.
 * get_pixels, diff_pixels, put_pixels_clamped, add_pixels_clamped and gmc1
 * have the same signatures as the C reference routines in this file
 * (get_pixels_c, diff_pixels_c, put_pixels_clamped_c, add_pixels_clamped_c,
 * gmc1_c). The code that assigns these pointers is not in this part of the
 * file.
 */
void (*ff_idct)(DCTELEM *block);
void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
void (*av_fdct)(DCTELEM *block);
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
void (*clear_blocks)(DCTELEM *blocks);

/*
 * NOTE(review): judging by the names only, block comparison functions for
 * 16x16 blocks at the full-pel position and the x / y / xy half-pel
 * positions -- confirm against the op_pixels_abs_func typedef and the code
 * that assigns them.
 */
op_pixels_abs_func pix_abs16x16;
op_pixels_abs_func pix_abs16x16_x2;
op_pixels_abs_func pix_abs16x16_y2;
op_pixels_abs_func pix_abs16x16_xy2;

/* Same set of entry points for 8x8 blocks (same caveat as for 16x16). */
op_pixels_abs_func pix_abs8x8;
op_pixels_abs_func pix_abs8x8_x2;
op_pixels_abs_func pix_abs8x8_y2;
op_pixels_abs_func pix_abs8x8_xy2;

/*
 * Clamping lookup table. The clamped-store routines index it as
 * (cropTbl + MAX_NEG_CROP)[v], so v may be negative or larger than 255 by up
 * to MAX_NEG_CROP. The contents are filled in elsewhere.
 */
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
/* NOTE(review): presumably a table of squares; it is filled in elsewhere. */
UINT32 squareTbl[512];

/* Matrices defined in other translation units. */
extern INT16 default_intra_matrix[64];
extern INT16 default_non_intra_matrix[64];
extern INT16 ff_mpeg4_default_intra_matrix[64];
extern INT16 ff_mpeg4_default_non_intra_matrix[64];
  53
/*
 * Zigzag scan table for an 8x8 block: entry i is the position inside the
 * block (row * 8 + column) of the i-th coefficient in scan order.
 * The table is not const. NOTE(review): presumably it is rewritten in place
 * when an IDCT permutation is applied -- confirm against the init code.
 */
UINT8 zigzag_direct[64] = {
    0, 1, 8, 16, 9, 2, 3, 10,
    17, 24, 32, 25, 18, 11, 4, 5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13, 6, 7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  64
/* Inverse of the unpermuted zigzag_direct, plus 1, for the MMX quantizer.
 * 8-byte aligned (__align8); filled in elsewhere. */
UINT16 __align8 inv_zigzag_direct16[64];

/* Unpermuted copy of zigzag_direct for the MMX quantizer; filled in
 * elsewhere. */
UINT8 zigzag_direct_noperm[64];
  70
/*
 * Alternate scan table; each value 0..63 appears exactly once. As with
 * zigzag_direct, entry i is a position (row * 8 + column) inside an 8x8
 * block.
 * NOTE(review): presumably the "alternate horizontal" scan of the MPEG-4
 * specification -- confirm against the users of this table.
 */
UINT8 ff_alternate_horizontal_scan[64] = {
    0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  81
/*
 * Alternate scan table with 64 entries; entry i is a position
 * (row * 8 + column) inside an 8x8 block. The scan starts by walking down
 * the first column (0, 8, 16, 24).
 * NOTE(review): presumably the "alternate vertical" scan of the MPEG-2 /
 * MPEG-4 specifications -- confirm against the users of this table.
 */
UINT8 ff_alternate_vertical_scan[64] = {
    0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
  92
#ifdef SIMPLE_IDCT

/*
 * Input permutation for the simple_idct_mmx: a 64-entry table of 8x8 block
 * positions. Only compiled when SIMPLE_IDCT is defined.
 */
static UINT8 simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
#endif
 107
/*
 * Reciprocal table for division by multiplication:
 * a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 *
 * The product must be computed in 64 bits; a * inverse[b] does not fit in
 * 32 bits for most inputs. Entries for b >= 2 are 2^32/b rounded up.
 * Entries 0 and 1 are outside the guaranteed range: inverse[0] is 0 and
 * inverse[1] is 2^32-1, which yields a-1 rather than a for a >= 1.
 */
UINT32 inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
 143
/* used to skip zeros at the end; filled by build_zigzag_end(): entry i is
 * one more than the largest zigzag_direct[] value among entries 0..i */
UINT8 zigzag_end[64];

/* NOTE(review): presumably the coefficient permutation required by the
 * selected IDCT; it is filled in elsewhere -- confirm. */
UINT8 permutation[64];
//UINT8 invPermutation[64];
 149
 150 static void build_zigzag_end(void)
 151 {
 152     int lastIndex;
 153     int lastIndexAfterPerm=0;
 154     for(lastIndex=0; lastIndex<64; lastIndex++)
 155     {
 156         if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
 157             lastIndexAfterPerm= zigzag_direct[lastIndex];
 158         zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
 159     }
 160 }
 161
 162 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
 163 {
 164     DCTELEM *p;
 165     const UINT8 *pix;
 166     int i;
 167
 168     /* read the pixels */
 169     p = block;
 170     pix = pixels;
 171     for(i=0;i<8;i++) {
 172         p[0] = pix[0];
 173         p[1] = pix[1];
 174         p[2] = pix[2];
 175         p[3] = pix[3];
 176         p[4] = pix[4];
 177         p[5] = pix[5];
 178         p[6] = pix[6];
 179         p[7] = pix[7];
 180         pix += line_size;
 181         p += 8;
 182     }
 183 }
 184
 185 void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
 186     DCTELEM *p;
 187     int i;
 188
 189     /* read the pixels */
 190     p = block;
 191     for(i=0;i<8;i++) {
 192         p[0] = s1[0] - s2[0];
 193         p[1] = s1[1] - s2[1];
 194         p[2] = s1[2] - s2[2];
 195         p[3] = s1[3] - s2[3];
 196         p[4] = s1[4] - s2[4];
 197         p[5] = s1[5] - s2[5];
 198         p[6] = s1[6] - s2[6];
 199         p[7] = s1[7] - s2[7];
 200         s1 += stride;
 201         s2 += stride;
 202         p += 8;
 203     }
 204 }
 205
 206
 207 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
 208 {
 209     const DCTELEM *p;
 210     UINT8 *pix;
 211     int i;
 212     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 213
 214     /* read the pixels */
 215     p = block;
 216     pix = pixels;
 217     for(i=0;i<8;i++) {
 218         pix[0] = cm[p[0]];
 219         pix[1] = cm[p[1]];
 220         pix[2] = cm[p[2]];
 221         pix[3] = cm[p[3]];
 222         pix[4] = cm[p[4]];
 223         pix[5] = cm[p[5]];
 224         pix[6] = cm[p[6]];
 225         pix[7] = cm[p[7]];
 226         pix += line_size;
 227         p += 8;
 228     }
 229 }
 230
 231 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
 232 {
 233     const DCTELEM *p;
 234     UINT8 *pix;
 235     int i;
 236     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 237
 238     /* read the pixels */
 239     p = block;
 240     pix = pixels;
 241     for(i=0;i<8;i++) {
 242         pix[0] = cm[pix[0] + p[0]];
 243         pix[1] = cm[pix[1] + p[1]];
 244         pix[2] = cm[pix[2] + p[2]];
 245         pix[3] = cm[pix[3] + p[3]];
 246         pix[4] = cm[pix[4] + p[4]];
 247         pix[5] = cm[pix[5] + p[5]];
 248         pix[6] = cm[pix[6] + p[6]];
 249         pix[7] = cm[pix[7] + p[7]];
 250         pix += line_size;
 251         p += 8;
 252     }
 253 }
 254
 255 #if 0
 256
/*
 * 64 bit variant of PIXOP2, currently disabled by the enclosing #if 0.
 *
 * PIXOP2(OPNAME, OP) defines a family of routines that each process an area
 * 8 bytes wide and h rows high, one 64-bit word per row, plus two dispatch
 * tables:
 *   OPNAME_pixels             straight copy through OP
 *   OPNAME_[no_rnd_]pixels_x2  average of each pixel and its right neighbour
 *   OPNAME_[no_rnd_]pixels_y2  average of each pixel and the pixel one row below
 *   OPNAME_[no_rnd_]pixels_xy2 average of each 2x2 neighbourhood
 *   OPNAME_pixels_tab[4], OPNAME_no_rnd_pixels_tab[4]
 *                             { copy, x2, y2, xy2 }; entry 0 of both is
 *                             OPNAME_pixels
 *
 * OP(dst, value) performs the store (see op_put / op_avg). block and pixels
 * both advance by line_size per row.
 *
 * The averages work on all eight bytes of a word at once:
 *   (a|b) - (((a^b)&0xFE..)>>1) is (a+b+1)>>1 per byte (rounding up),
 *   (a&b) + (((a^b)&0xFE..)>>1) is (a+b)>>1   per byte (rounding down).
 * The xy2 routines add the low 2 bits and the high 6 bits of every byte
 * separately so that no carry crosses a byte boundary; the per-byte rounding
 * constant is 2 for the rounding version and 1 for the no_rnd version.
 *
 * The xy2 loops emit two rows per iteration, so they write h+1 rows when h
 * is odd; callers must pass an even h.
 * The x2 routines read 9 bytes per row, the y2 routines read h+1 rows and
 * the xy2 routines read both.
 * NOTE(review): LD64 is defined elsewhere; the stores cast block to
 * uint64_t*, so block presumably has to be suitably aligned -- confirm.
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};

/* OP for the "avg" family: a becomes the bytewise rounded-up average of a
 * and b, computed on eight packed bytes at once. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 32 bit variant (the 64 bit variant above is disabled by #if 0)
 404
/*
 * 32 bit variant of PIXOP2; this is the branch that is compiled.
 *
 * PIXOP2(OPNAME, OP) defines a family of routines that each process an area
 * 8 bytes wide and h rows high as two 32-bit words per row, plus two
 * dispatch tables:
 *   OPNAME_pixels             straight copy through OP
 *   OPNAME_[no_rnd_]pixels_x2  average of each pixel and its right neighbour
 *   OPNAME_[no_rnd_]pixels_y2  average of each pixel and the pixel one row below
 *   OPNAME_[no_rnd_]pixels_xy2 average of each 2x2 neighbourhood
 *   OPNAME_pixels_tab[4], OPNAME_no_rnd_pixels_tab[4]
 *                             { copy, x2, y2, xy2 }; entry 0 of both is
 *                             OPNAME_pixels
 *
 * OP(dst, value) performs the store (see op_put / op_avg). block and pixels
 * both advance by line_size per row.
 *
 * The averages work on all four bytes of a word at once:
 *   (a|b) - (((a^b)&0xFEFEFEFE)>>1) is (a+b+1)>>1 per byte (rounding up),
 *   (a&b) + (((a^b)&0xFEFEFEFE)>>1) is (a+b)>>1   per byte (rounding down).
 * The xy2 routines add the low 2 bits and the high 6 bits of every byte
 * separately so that no carry crosses a byte boundary; the per-byte rounding
 * constant is 2 for the rounding version and 1 for the no_rnd version.
 *
 * The xy2 routines handle the left and the right 4-byte column one after the
 * other and emit two rows per loop iteration. The pointer rewind between the
 * two columns assumes exactly h rows were processed, so callers must pass an
 * even h.
 * The x2 routines read 9 bytes per row, the y2 routines read h+1 rows and
 * the xy2 routines read both.
 * NOTE(review): LD32 is defined elsewhere; the stores cast block to
 * uint32_t*, so block presumably has to be suitably aligned -- confirm.
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* back to the top row, 4 bytes to the right, for the second column */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* back to the top row, 4 bytes to the right, for the second column */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};
/* OP for the "avg" family: a becomes the bytewise rounded-up average of a
 * and b, computed on four packed bytes at once. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 581 #endif
 582
/* OP for the "put" family: plain store of b into a. */
#define op_put(a, b) a = b

/* Instantiate the avg_* and put_* routines together with avg_pixels_tab,
 * avg_no_rnd_pixels_tab, put_pixels_tab and put_no_rnd_pixels_tab. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
/* The OP helpers are only needed for the two expansions above. */
#undef op_avg
#undef op_put
 589
#if 0
/* FIXME this stuff could be removed as it's not really used anymore */
/*
 * Everything up to the matching #endif is compiled out.
 *
 * PIXOP(BTYPE, OPNAME, OP, INCR) defines scalar, one-pixel-at-a-time
 * routines OPNAME_pixels, OPNAME_pixels_x2, OPNAME_pixels_y2 and
 * OPNAME_pixels_xy2 plus the table OPNAME_pixels_tab[4]. Each handles rows of
 * 8 pixels; the source advances by line_size per row and the destination by
 * INCR elements of BTYPE. The do/while(--h) loops require h >= 1.
 * avg2/avg4 must be defined before the macro is expanded.
 */
#define PIXOP(BTYPE, OPNAME, OP, INCR)                                                   \
                                                                                         \
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
{                                                                                        \
    BTYPE *p;                                                                            \
    const UINT8 *pix;                                                                    \
                                                                                         \
    p = block;                                                                           \
    pix = pixels;                                                                        \
    do {                                                                                 \
        OP(p[0], pix[0]);                                                                  \
        OP(p[1], pix[1]);                                                                  \
        OP(p[2], pix[2]);                                                                  \
        OP(p[3], pix[3]);                                                                  \
        OP(p[4], pix[4]);                                                                  \
        OP(p[5], pix[5]);                                                                  \
        OP(p[6], pix[6]);                                                                  \
        OP(p[7], pix[7]);                                                                  \
        pix += line_size;                                                                \
        p += INCR;                                                                       \
    } while (--h);;                                                                       \
}                                                                                        \
                                                                                         \
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
{                                                                                        \
    BTYPE *p;                                                                          \
    const UINT8 *pix;                                                                    \
                                                                                         \
    p = block;                                                                           \
    pix = pixels;                                                                        \
    do {                                                                   \
        OP(p[0], avg2(pix[0], pix[1]));                                                    \
        OP(p[1], avg2(pix[1], pix[2]));                                                    \
        OP(p[2], avg2(pix[2], pix[3]));                                                    \
        OP(p[3], avg2(pix[3], pix[4]));                                                    \
        OP(p[4], avg2(pix[4], pix[5]));                                                    \
        OP(p[5], avg2(pix[5], pix[6]));                                                    \
        OP(p[6], avg2(pix[6], pix[7]));                                                    \
        OP(p[7], avg2(pix[7], pix[8]));                                                    \
        pix += line_size;                                                                \
        p += INCR;                                                                       \
    } while (--h);                                                                        \
}                                                                                        \
                                                                                         \
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
{                                                                                        \
    BTYPE *p;                                                                          \
    const UINT8 *pix;                                                                    \
    const UINT8 *pix1;                                                                   \
                                                                                         \
    p = block;                                                                           \
    pix = pixels;                                                                        \
    pix1 = pixels + line_size;                                                           \
    do {                                                                                 \
        OP(p[0], avg2(pix[0], pix1[0]));                                                   \
        OP(p[1], avg2(pix[1], pix1[1]));                                                   \
        OP(p[2], avg2(pix[2], pix1[2]));                                                   \
        OP(p[3], avg2(pix[3], pix1[3]));                                                   \
        OP(p[4], avg2(pix[4], pix1[4]));                                                   \
        OP(p[5], avg2(pix[5], pix1[5]));                                                   \
        OP(p[6], avg2(pix[6], pix1[6]));                                                   \
        OP(p[7], avg2(pix[7], pix1[7]));                                                   \
        pix += line_size;                                                                \
        pix1 += line_size;                                                               \
        p += INCR;                                                                       \
    } while(--h);                                                                         \
}                                                                                        \
                                                                                         \
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
{                                                                                        \
    BTYPE *p;                                                                          \
    const UINT8 *pix;                                                                    \
    const UINT8 *pix1;                                                                   \
                                                                                         \
    p = block;                                                                           \
    pix = pixels;                                                                        \
    pix1 = pixels + line_size;                                                           \
    do {                                                                   \
        OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1]));                                  \
        OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2]));                                  \
        OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3]));                                  \
        OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4]));                                  \
        OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5]));                                  \
        OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6]));                                  \
        OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7]));                                  \
        OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8]));                                  \
        pix += line_size;                                                                \
        pix1 += line_size;                                                               \
        p += INCR;                                                                       \
    } while(--h);                                                                         \
}                                                                                        \
                                                                                         \
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
    OPNAME ## _pixels,                                                                   \
    OPNAME ## _pixels_x2,                                                                \
    OPNAME ## _pixels_y2,                                                                \
    OPNAME ## _pixels_xy2,                                                               \
};

/* rounding primitives */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

#define op_avg(a, b) a = avg2(a, b)
#define op_sub(a, b) a -= b

/* the only expansion: sub_pixels* operating on DCTELEM blocks, 8 per row */
PIXOP(DCTELEM, sub, op_sub, 8)

/* not rounding primitives */
#undef avg2
#undef avg4
#define avg2(a,b) ((a+b)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)

/* motion estimation */

#undef avg2
#undef avg4
#endif
 711
 712 #define avg2(a,b) ((a+b+1)>>1)
 713 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 714
 715 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
 716 {
 717     const int A=(16-x16)*(16-y16);
 718     const int B=(   x16)*(16-y16);
 719     const int C=(16-x16)*(   y16);
 720     const int D=(   x16)*(   y16);
 721     int i;
 722     rounder= 128 - rounder;
 723
 724     for(i=0; i<h; i++)
 725     {
 726         dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
 727         dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
 728         dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
 729         dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
 730         dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
 731         dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
 732         dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
 733         dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
 734         dst+= srcStride;
 735         src+= srcStride;
 736     }
 737 }
 738
 739 static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
 740 {
 741     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 742     int i;
 743     for(i=0; i<h; i++)
 744     {
 745         dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
 746         dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
 747         dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
 748         dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
 749         dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
 750         dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
 751         dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
 752         dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
 753         dst+=dstStride;
 754         src+=srcStride;
 755     }
 756 }
 757
 758 static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
 759 {
 760     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 761     int i;
 762     for(i=0; i<w; i++)
 763     {
 764         const int src0= src[0*srcStride];
 765         const int src1= src[1*srcStride];
 766         const int src2= src[2*srcStride];
 767         const int src3= src[3*srcStride];
 768         const int src4= src[4*srcStride];
 769         const int src5= src[5*srcStride];
 770         const int src6= src[6*srcStride];
 771         const int src7= src[7*srcStride];
 772         const int src8= src[8*srcStride];
 773         dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
 774         dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
 775         dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
 776         dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
 777         dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
 778         dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
 779         dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
 780         dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
 781         dst++;
 782         src++;
 783     }
 784 }
 785
 786 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
 787 {
 788     int i;
 789     for(i=0; i<8; i++)
 790     {
 791         dst[0]= src[0];
 792         dst[1]= src[1];
 793         dst[2]= src[2];
 794         dst[3]= src[3];
 795         dst[4]= src[4];
 796         dst[5]= src[5];
 797         dst[6]= src[6];
 798         dst[7]= src[7];
 799         dst+=dstStride;
 800         src+=srcStride;
 801     }
 802 }
 803
 804 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
 805 {
 806     int i;
 807     for(i=0; i<8; i++)
 808     {
 809         dst[0]= (src1[0] + src2[0] + r)>>1;
 810         dst[1]= (src1[1] + src2[1] + r)>>1;
 811         dst[2]= (src1[2] + src2[2] + r)>>1;
 812         dst[3]= (src1[3] + src2[3] + r)>>1;
 813         dst[4]= (src1[4] + src2[4] + r)>>1;
 814         dst[5]= (src1[5] + src2[5] + r)>>1;
 815         dst[6]= (src1[6] + src2[6] + r)>>1;
 816         dst[7]= (src1[7] + src2[7] + r)>>1;
 817         dst+=dstStride;
 818         src1+=srcStride;
 819         src2+=8;
 820     }
 821 }
 822
 823 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
 824 {
 825     int i;
 826     for(i=0; i<8; i++)
 827     {
 828         dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
 829         dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
 830         dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
 831         dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
 832         dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
 833         dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
 834         dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
 835         dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
 836         dst+=dstStride;
 837         src1+=srcStride;
 838         src2+=8;
 839         src3+=8;
 840         src4+=8;
 841     }
 842 }
 843
 844 #define QPEL_MC(r, name) \
 845 static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 846 {\
 847     put_block(dst, src, dstStride, srcStride);\
 848 }\
 849 \
 850 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 851 {\
 852     UINT8 half[64];\
 853     qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
 854     avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
 855 }\
 856 \
 857 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 858 {\
 859     qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
 860 }\
 861 \
 862 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 863 {\
 864     UINT8 half[64];\
 865     qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
 866     avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
 867 }\
 868 \
 869 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 870 {\
 871     UINT8 half[64];\
 872     qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
 873     avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
 874 }\
 875 \
 876 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 877 {\
 878     qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
 879 }\
 880 \
 881 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 882 {\
 883     UINT8 half[64];\
 884     qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
 885     avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
 886 }\
 887 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 888 {\
 889     UINT8 halfH[72];\
 890     UINT8 halfV[64];\
 891     UINT8 halfHV[64];\
 892     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 893     qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
 894     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 895     avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
 896 }\
 897 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 898 {\
 899     UINT8 halfH[72];\
 900     UINT8 halfV[64];\
 901     UINT8 halfHV[64];\
 902     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 903     qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
 904     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 905     avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
 906 }\
 907 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 908 {\
 909     UINT8 halfH[72];\
 910     UINT8 halfV[64];\
 911     UINT8 halfHV[64];\
 912     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 913     qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
 914     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 915     avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
 916 }\
 917 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 918 {\
 919     UINT8 halfH[72];\
 920     UINT8 halfV[64];\
 921     UINT8 halfHV[64];\
 922     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 923     qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
 924     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 925     avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
 926 }\
 927 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 928 {\
 929     UINT8 halfH[72];\
 930     UINT8 halfHV[64];\
 931     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 932     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 933     avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
 934 }\
 935 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 936 {\
 937     UINT8 halfH[72];\
 938     UINT8 halfHV[64];\
 939     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 940     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 941     avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
 942 }\
 943 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 944 {\
 945     UINT8 halfH[72];\
 946     UINT8 halfV[64];\
 947     UINT8 halfHV[64];\
 948     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 949     qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
 950     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 951     avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
 952 }\
 953 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 954 {\
 955     UINT8 halfH[72];\
 956     UINT8 halfV[64];\
 957     UINT8 halfHV[64];\
 958     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 959     qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
 960     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 961     avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
 962 }\
 963 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 964 {\
 965     UINT8 halfH[72];\
 966     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 967     qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
 968 }\
 969 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
 970     qpel_mc00_c ## name,                                                                   \
 971     qpel_mc10_c ## name,                                                                   \
 972     qpel_mc20_c ## name,                                                                   \
 973     qpel_mc30_c ## name,                                                                   \
 974     qpel_mc01_c ## name,                                                                   \
 975     qpel_mc11_c ## name,                                                                   \
 976     qpel_mc21_c ## name,                                                                   \
 977     qpel_mc31_c ## name,                                                                   \
 978     qpel_mc02_c ## name,                                                                   \
 979     qpel_mc12_c ## name,                                                                   \
 980     qpel_mc22_c ## name,                                                                   \
 981     qpel_mc32_c ## name,                                                                   \
 982     qpel_mc03_c ## name,                                                                   \
 983     qpel_mc13_c ## name,                                                                   \
 984     qpel_mc23_c ## name,                                                                   \
 985     qpel_mc33_c ## name,                                                                   \
 986 };
 987
 988 QPEL_MC(0, _rnd)
 989 QPEL_MC(1, _no_rnd)
 990
 991 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 992 {
 993     int s, i;
 994
 995     s = 0;
 996     for(i=0;i<16;i++) {
 997         s += abs(pix1[0] - pix2[0]);
 998         s += abs(pix1[1] - pix2[1]);
 999         s += abs(pix1[2] - pix2[2]);
1000         s += abs(pix1[3] - pix2[3]);
1001         s += abs(pix1[4] - pix2[4]);
1002         s += abs(pix1[5] - pix2[5]);
1003         s += abs(pix1[6] - pix2[6]);
1004         s += abs(pix1[7] - pix2[7]);
1005         s += abs(pix1[8] - pix2[8]);
1006         s += abs(pix1[9] - pix2[9]);
1007         s += abs(pix1[10] - pix2[10]);
1008         s += abs(pix1[11] - pix2[11]);
1009         s += abs(pix1[12] - pix2[12]);
1010         s += abs(pix1[13] - pix2[13]);
1011         s += abs(pix1[14] - pix2[14]);
1012         s += abs(pix1[15] - pix2[15]);
1013         pix1 += line_size;
1014         pix2 += line_size;
1015     }
1016     return s;
1017 }
1018
1019 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1020 {
1021     int s, i;
1022
1023     s = 0;
1024     for(i=0;i<16;i++) {
1025         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1026         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1027         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1028         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1029         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1030         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1031         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1032         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1033         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1034         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1035         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1036         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1037         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1038         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1039         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1040         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1041         pix1 += line_size;
1042         pix2 += line_size;
1043     }
1044     return s;
1045 }
1046
1047 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1048 {
1049     int s, i;
1050     UINT8 *pix3 = pix2 + line_size;
1051
1052     s = 0;
1053     for(i=0;i<16;i++) {
1054         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1055         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1056         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1057         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1058         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1059         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1060         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1061         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1062         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1063         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1064         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1065         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1066         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1067         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1068         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1069         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1070         pix1 += line_size;
1071         pix2 += line_size;
1072         pix3 += line_size;
1073     }
1074     return s;
1075 }
1076
1077 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1078 {
1079     int s, i;
1080     UINT8 *pix3 = pix2 + line_size;
1081
1082     s = 0;
1083     for(i=0;i<16;i++) {
1084         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1085         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1086         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1087         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1088         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1089         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1090         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1091         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1092         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1093         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1094         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1095         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1096         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1097         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1098         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1099         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1100         pix1 += line_size;
1101         pix2 += line_size;
1102         pix3 += line_size;
1103     }
1104     return s;
1105 }
1106
1107 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1108 {
1109     int s, i;
1110
1111     s = 0;
1112     for(i=0;i<8;i++) {
1113         s += abs(pix1[0] - pix2[0]);
1114         s += abs(pix1[1] - pix2[1]);
1115         s += abs(pix1[2] - pix2[2]);
1116         s += abs(pix1[3] - pix2[3]);
1117         s += abs(pix1[4] - pix2[4]);
1118         s += abs(pix1[5] - pix2[5]);
1119         s += abs(pix1[6] - pix2[6]);
1120         s += abs(pix1[7] - pix2[7]);
1121         pix1 += line_size;
1122         pix2 += line_size;
1123     }
1124     return s;
1125 }
1126
1127 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1128 {
1129     int s, i;
1130
1131     s = 0;
1132     for(i=0;i<8;i++) {
1133         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1134         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1135         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1136         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1137         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1138         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1139         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1140         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1141         pix1 += line_size;
1142         pix2 += line_size;
1143     }
1144     return s;
1145 }
1146
1147 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1148 {
1149     int s, i;
1150     UINT8 *pix3 = pix2 + line_size;
1151
1152     s = 0;
1153     for(i=0;i<8;i++) {
1154         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1155         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1156         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1157         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1158         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1159         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1160         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1161         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1162         pix1 += line_size;
1163         pix2 += line_size;
1164         pix3 += line_size;
1165     }
1166     return s;
1167 }
1168
1169 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1170 {
1171     int s, i;
1172     UINT8 *pix3 = pix2 + line_size;
1173
1174     s = 0;
1175     for(i=0;i<8;i++) {
1176         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1177         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1178         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1179         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1180         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1181         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1182         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1183         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1184         pix1 += line_size;
1185         pix2 += line_size;
1186         pix3 += line_size;
1187     }
1188     return s;
1189 }
1190
1191 /* permute block according so that it corresponds to the MMX idct
1192    order */
1193 #ifdef SIMPLE_IDCT
1194  /* general permutation, but perhaps slightly slower */
1195 void block_permute(INT16 *block)
1196 {
1197         int i;
1198         INT16 temp[64];
1199
1200         for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1201
1202         for(i=0; i<64; i++) block[i] = temp[i];
1203 }
1204 #else
1205
1206 void block_permute(INT16 *block)
1207 {
1208     int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1209     int i;
1210
1211     for(i=0;i<8;i++) {
1212         tmp1 = block[1];
1213         tmp2 = block[2];
1214         tmp3 = block[3];
1215         tmp4 = block[4];
1216         tmp5 = block[5];
1217         tmp6 = block[6];
1218         block[1] = tmp2;
1219         block[2] = tmp4;
1220         block[3] = tmp6;
1221         block[4] = tmp1;
1222         block[5] = tmp3;
1223         block[6] = tmp5;
1224         block += 8;
1225     }
1226 }
1227 #endif
1228
1229 void clear_blocks_c(DCTELEM *blocks)
1230 {
1231     memset(blocks, 0, sizeof(DCTELEM)*6*64);
1232 }
1233
1234 /* XXX: those functions should be suppressed ASAP when all IDCTs are
1235    converted */
1236 void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
1237 {
1238     ff_idct (block);
1239     put_pixels_clamped(block, dest, line_size);
1240 }
1241
1242 void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
1243 {
1244     ff_idct (block);
1245     add_pixels_clamped(block, dest, line_size);
1246 }
1247
1248 void dsputil_init(void)
1249 {
1250     int i, j;
1251     int use_permuted_idct;
1252
1253     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1254     for(i=0;i<MAX_NEG_CROP;i++) {
1255         cropTbl[i] = 0;
1256         cropTbl[i + MAX_NEG_CROP + 256] = 255;
1257     }
1258
1259     for(i=0;i<512;i++) {
1260         squareTbl[i] = (i - 256) * (i - 256);
1261     }
1262
1263 #ifdef SIMPLE_IDCT
1264     ff_idct = NULL;
1265 #else
1266     ff_idct = j_rev_dct;
1267 #endif
1268     get_pixels = get_pixels_c;
1269     diff_pixels = diff_pixels_c;
1270     put_pixels_clamped = put_pixels_clamped_c;
1271     add_pixels_clamped = add_pixels_clamped_c;
1272     gmc1= gmc1_c;
1273     clear_blocks= clear_blocks_c;
1274
1275     pix_abs16x16     = pix_abs16x16_c;
1276     pix_abs16x16_x2  = pix_abs16x16_x2_c;
1277     pix_abs16x16_y2  = pix_abs16x16_y2_c;
1278     pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1279     pix_abs8x8     = pix_abs8x8_c;
1280     pix_abs8x8_x2  = pix_abs8x8_x2_c;
1281     pix_abs8x8_y2  = pix_abs8x8_y2_c;
1282     pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1283     av_fdct = fdct_ifast;
1284
1285     use_permuted_idct = 1;
1286
1287 #ifdef HAVE_MMX
1288     dsputil_init_mmx();
1289 #endif
1290 #ifdef ARCH_ARMV4L
1291     dsputil_init_armv4l();
1292 #endif
1293 #ifdef HAVE_MLIB
1294     dsputil_init_mlib();
1295     use_permuted_idct = 0;
1296 #endif
1297 #ifdef ARCH_ALPHA
1298     dsputil_init_alpha();
1299     use_permuted_idct = 0;
1300 #endif
1301
1302 #ifdef SIMPLE_IDCT
1303     if (ff_idct == NULL) {
1304         ff_idct_put = simple_idct_put;
1305         ff_idct_add = simple_idct_add;
1306         use_permuted_idct=0;
1307     } else {
1308         ff_idct_put = gen_idct_put;
1309         ff_idct_add = gen_idct_add;
1310     }
1311 #endif
1312
1313     if(use_permuted_idct)
1314 #ifdef SIMPLE_IDCT
1315         for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1316 #else
1317         for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1318 #endif
1319     else
1320         for(i=0; i<64; i++) permutation[i]=i;
1321
1322     for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1323     for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1324
1325     if (use_permuted_idct) {
1326         /* permute for IDCT */
1327         for(i=0;i<64;i++) {
1328             j = zigzag_direct[i];
1329             zigzag_direct[i] = block_permute_op(j);
1330             j = ff_alternate_horizontal_scan[i];
1331             ff_alternate_horizontal_scan[i] = block_permute_op(j);
1332             j = ff_alternate_vertical_scan[i];
1333             ff_alternate_vertical_scan[i] = block_permute_op(j);
1334         }
1335         block_permute(default_intra_matrix);
1336         block_permute(default_non_intra_matrix);
1337         block_permute(ff_mpeg4_default_intra_matrix);
1338         block_permute(ff_mpeg4_default_non_intra_matrix);
1339     }
1340
1341     build_zigzag_end();
1342 }
1343
/* Disable every operation that is not bit exact (for testing purposes).
   Only the MMX build has anything to switch off; otherwise this is a no-op. */
void avcodec_set_bit_exact(void)
{
#ifdef HAVE_MMX
    dsputil_set_bit_exact_mmx();
#endif
}
1351
1352 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1353               int orig_linesize[3], int coded_linesize,
1354               AVCodecContext *avctx)
1355 {
1356     int quad, diff, x, y;
1357     UINT8 *orig, *coded;
1358     UINT32 *sq = squareTbl + 256;
1359
1360     quad = 0;
1361     diff = 0;
1362
1363     /* Luminance */
1364     orig = orig_image[0];
1365     coded = coded_image[0];
1366
1367     for (y=0;y<avctx->height;y++) {
1368         for (x=0;x<avctx->width;x++) {
1369             diff = *(orig + x) - *(coded + x);
1370             quad += sq[diff];
1371         }
1372         orig += orig_linesize[0];
1373         coded += coded_linesize;
1374     }
1375
1376     avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1377
1378     if (avctx->psnr_y) {
1379         avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1380         avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1381     } else
1382         avctx->psnr_y = 99.99;
1383 }
1384