git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18  *
  19  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  20  */
  21 #include "avcodec.h"
  22 #include "dsputil.h"
  23 #include "simple_idct.h"
  24
  /*
   * Globally visible function-pointer hooks for the block-level DSP routines.
   * This file contains plain-C functions whose signatures match several of
   * them (get_pixels_c, diff_pixels_c, put_pixels_clamped_c,
   * add_pixels_clamped_c, gmc1_c).
   * NOTE(review): the code that assigns these pointers is not visible in this
   * part of the file -- presumably an init routine picks the C or an optimised
   * implementation; confirm against the rest of the file.
   */
  void (*ff_idct)(DCTELEM *block);  /* operates in place on one coefficient block; presumably the inverse DCT */
  void (*av_fdct)(DCTELEM *block);  /* operates in place on one coefficient block; presumably the forward DCT */
  /* load an 8x8 block of 8-bit pixels into 64 DCTELEMs (see get_pixels_c) */
  void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
  /* store the per-pixel difference s1 - s2 of two 8x8 blocks (see diff_pixels_c) */
  void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
  /* write an 8x8 coefficient block to pixels via the crop table (see put_pixels_clamped_c) */
  void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
  /* add an 8x8 coefficient block onto existing pixels via the crop table (see add_pixels_clamped_c) */
  void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
  /* bilinear interpolation with 1/16 weights x16, y16 (see gmc1_c) */
  void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
  /* NOTE(review): no C implementation is visible here; presumably zeroes the coefficient blocks */
  void (*clear_blocks)(DCTELEM *blocks);
  33
  /*
   * Hooks of type op_pixels_abs_func (declared outside this file section),
   * one group for 16x16 blocks and one for 8x8 blocks.
   * NOTE(review): by analogy with the _x2/_y2/_xy2 suffixes of the PIXOP2
   * functions below, the suffixed variants presumably compare against a
   * horizontally / vertically / diagonally half-pel interpolated reference;
   * confirm against the implementations.
   */
  op_pixels_abs_func pix_abs16x16;
  op_pixels_abs_func pix_abs16x16_x2;
  op_pixels_abs_func pix_abs16x16_y2;
  op_pixels_abs_func pix_abs16x16_xy2;

  op_pixels_abs_func pix_abs8x8;
  op_pixels_abs_func pix_abs8x8_x2;
  op_pixels_abs_func pix_abs8x8_y2;
  op_pixels_abs_func pix_abs8x8_xy2;
  43
  /*
   * Clamping lookup table.  The *_clamped_c functions below index it through
   * a pointer offset by MAX_NEG_CROP, so indices from -MAX_NEG_CROP up to
   * 255 + MAX_NEG_CROP stay inside the array.
   * NOTE(review): the table is only declared here; it is filled elsewhere.
   */
  UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
  /* NOTE(review): not used in the visible part of the file; presumably a table of squares -- confirm. */
  UINT32 squareTbl[512];

  /* Quantisation matrices defined in other translation units. */
  extern INT16 default_intra_matrix[64];
  extern INT16 default_non_intra_matrix[64];
  extern INT16 ff_mpeg4_default_intra_matrix[64];
  extern INT16 ff_mpeg4_default_non_intra_matrix[64];
  51
  /*
   * Zigzag scan order of an 8x8 block: entry i holds the raster index
   * (row * 8 + column) of the i-th coefficient along the zigzag path.
   * build_zigzag_end() derives zigzag_end[] from this table.
   */
  UINT8 zigzag_direct[64] = {
      0, 1, 8, 16, 9, 2, 3, 10,
      17, 24, 32, 25, 18, 11, 4, 5,
      12, 19, 26, 33, 40, 48, 41, 34,
      27, 20, 13, 6, 7, 14, 21, 28,
      35, 42, 49, 56, 57, 50, 43, 36,
      29, 22, 15, 23, 30, 37, 44, 51,
      58, 59, 52, 45, 38, 31, 39, 46,
      53, 60, 61, 54, 47, 55, 62, 63
  };
  62
  /*
   * Inverse of the unpermuted zigzag_direct, plus 1, for the MMX quantizer.
   * NOTE(review): only declared here; the table is filled elsewhere.
   */
  UINT16 __align8 inv_zigzag_direct16[64];

  /*
   * Unpermuted copy of zigzag_direct for the MMX quantizer.
   * NOTE(review): only declared here; the table is filled elsewhere.
   */
  UINT8 zigzag_direct_noperm[64];
  68
  /*
   * Alternate scan order that favours horizontal runs: the first entries walk
   * along the top row (0, 1, 2, 3) before moving down.  As with zigzag_direct,
   * the entries are raster indices (row * 8 + column) into an 8x8 block.
   */
  UINT8 ff_alternate_horizontal_scan[64] = {
      0,  1,  2,  3,  8,  9, 16, 17,
      10, 11,  4,  5,  6,  7, 15, 14,
      13, 12, 19, 18, 24, 25, 32, 33,
      26, 27, 20, 21, 22, 23, 28, 29,
      30, 31, 34, 35, 40, 41, 48, 49,
      42, 43, 36, 37, 38, 39, 44, 45,
      46, 47, 50, 51, 56, 57, 58, 59,
      52, 53, 54, 55, 60, 61, 62, 63,
  };
  79
  /*
   * Alternate scan order that favours vertical runs: the first entries walk
   * down the left column (0, 8, 16, 24) before moving right.  As with
   * zigzag_direct, the entries are raster indices (row * 8 + column) into an
   * 8x8 block.
   */
  UINT8 ff_alternate_vertical_scan[64] = {
      0,  8, 16, 24,  1,  9,  2, 10,
      17, 25, 32, 40, 48, 56, 57, 49,
      41, 33, 26, 18,  3, 11,  4, 12,
      19, 27, 34, 42, 50, 58, 35, 43,
      51, 59, 20, 28,  5, 13,  6, 14,
      21, 29, 36, 44, 52, 60, 37, 45,
      53, 61, 22, 30,  7, 15, 23, 31,
      38, 46, 54, 62, 39, 47, 55, 63,
  };
  90
  #ifdef SIMPLE_IDCT

  /*
   * Input permutation for the simple_idct_mmx; only compiled when
   * SIMPLE_IDCT is defined.  64 entries, written in hex, each in 0x00..0x3F.
   * NOTE(review): the direction of the mapping (source index -> permuted
   * index or the reverse) is not visible in this part of the file.
   */
  static UINT8 simple_mmx_permutation[64]={
          0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
          0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
          0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
          0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
          0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
          0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
          0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
          0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
  };
  #endif
 105
 /*
  * Reciprocal table that replaces a division by a multiplication and a shift:
  * a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
  * The product does not fit in 32 bits, so it has to be evaluated in a 64-bit
  * type before shifting.  The entries for b >= 2 appear to be 2^32/b rounded
  * up; inverse[0] is a placeholder and inverse[1] is 2^32-1, both outside the
  * range for which the identity above is stated.
  */
 UINT32 inverse[256]={
          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 };
 141
 /*
  * Used to skip zeros at the end.  Filled by build_zigzag_end(): entry i is
  * one more than the largest zigzag_direct[] value among scan positions 0..i.
  */
 UINT8 zigzag_end[64];

 /* NOTE(review): not referenced in the visible part of the file; presumably
  * the coefficient permutation required by the selected IDCT -- confirm. */
 UINT8 permutation[64];
 //UINT8 invPermutation[64];
 147
 148 static void build_zigzag_end(void)
 149 {
 150     int lastIndex;
 151     int lastIndexAfterPerm=0;
 152     for(lastIndex=0; lastIndex<64; lastIndex++)
 153     {
 154         if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
 155             lastIndexAfterPerm= zigzag_direct[lastIndex];
 156         zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
 157     }
 158 }
 159
 160 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
 161 {
 162     DCTELEM *p;
 163     const UINT8 *pix;
 164     int i;
 165
 166     /* read the pixels */
 167     p = block;
 168     pix = pixels;
 169     for(i=0;i<8;i++) {
 170         p[0] = pix[0];
 171         p[1] = pix[1];
 172         p[2] = pix[2];
 173         p[3] = pix[3];
 174         p[4] = pix[4];
 175         p[5] = pix[5];
 176         p[6] = pix[6];
 177         p[7] = pix[7];
 178         pix += line_size;
 179         p += 8;
 180     }
 181 }
 182
 183 void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
 184     DCTELEM *p;
 185     int i;
 186
 187     /* read the pixels */
 188     p = block;
 189     for(i=0;i<8;i++) {
 190         p[0] = s1[0] - s2[0];
 191         p[1] = s1[1] - s2[1];
 192         p[2] = s1[2] - s2[2];
 193         p[3] = s1[3] - s2[3];
 194         p[4] = s1[4] - s2[4];
 195         p[5] = s1[5] - s2[5];
 196         p[6] = s1[6] - s2[6];
 197         p[7] = s1[7] - s2[7];
 198         s1 += stride;
 199         s2 += stride;
 200         p += 8;
 201     }
 202 }
 203
 204
 205 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
 206 {
 207     const DCTELEM *p;
 208     UINT8 *pix;
 209     int i;
 210     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 211
 212     /* read the pixels */
 213     p = block;
 214     pix = pixels;
 215     for(i=0;i<8;i++) {
 216         pix[0] = cm[p[0]];
 217         pix[1] = cm[p[1]];
 218         pix[2] = cm[p[2]];
 219         pix[3] = cm[p[3]];
 220         pix[4] = cm[p[4]];
 221         pix[5] = cm[p[5]];
 222         pix[6] = cm[p[6]];
 223         pix[7] = cm[p[7]];
 224         pix += line_size;
 225         p += 8;
 226     }
 227 }
 228
 229 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
 230 {
 231     const DCTELEM *p;
 232     UINT8 *pix;
 233     int i;
 234     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 235
 236     /* read the pixels */
 237     p = block;
 238     pix = pixels;
 239     for(i=0;i<8;i++) {
 240         pix[0] = cm[pix[0] + p[0]];
 241         pix[1] = cm[pix[1] + p[1]];
 242         pix[2] = cm[pix[2] + p[2]];
 243         pix[3] = cm[pix[3] + p[3]];
 244         pix[4] = cm[pix[4] + p[4]];
 245         pix[5] = cm[pix[5] + p[5]];
 246         pix[6] = cm[pix[6] + p[6]];
 247         pix[7] = cm[pix[7] + p[7]];
 248         pix += line_size;
 249         p += 8;
 250     }
 251 }
 252
 /*
  * LD32(a) / LD64(a): read a 32-bit / 64-bit value from address a, which may
  * point into a byte buffer at any offset (the PIXOP2 functions below load
  * from pixels+1).  Both macros yield an rvalue and never write through a.
  *
  * With GCC the load goes through a packed one-member struct, which tells
  * the compiler the member may be unaligned.  Other compilers fall back to a
  * direct pointer cast, which is only correct on targets that tolerate
  * unaligned loads.  The fallback keeps the const qualifier of the source
  * pointer, matching the GCC branch.
  */
 #ifdef __GNUC__

 struct unaligned_64 { uint64_t l; } __attribute__((packed));
 struct unaligned_32 { uint32_t l; } __attribute__((packed));

 #define LD32(a) (((const struct unaligned_32 *) (a))->l)
 #define LD64(a) (((const struct unaligned_64 *) (a))->l)

 #else /* __GNUC__ */

 #define LD32(a) (*((const uint32_t*)(a)))
 #define LD64(a) (*((const uint64_t*)(a)))

 #endif /* !__GNUC__ */
 267
 268 #if 0
 269
 270 #define PIXOP2(OPNAME, OP) \
 271 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 272 {\
 273     int i;\
 274     for(i=0; i<h; i++){\
 275         OP(*((uint64_t*)block), LD64(pixels));\
 276         pixels+=line_size;\
 277         block +=line_size;\
 278     }\
 279 }\
 280 \
 281 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 282 {\
 283     int i;\
 284     for(i=0; i<h; i++){\
 285         const uint64_t a= LD64(pixels  );\
 286         const uint64_t b= LD64(pixels+1);\
 287         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 288         pixels+=line_size;\
 289         block +=line_size;\
 290     }\
 291 }\
 292 \
 293 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 294 {\
 295     int i;\
 296     for(i=0; i<h; i++){\
 297         const uint64_t a= LD64(pixels  );\
 298         const uint64_t b= LD64(pixels+1);\
 299         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 300         pixels+=line_size;\
 301         block +=line_size;\
 302     }\
 303 }\
 304 \
 305 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 306 {\
 307     int i;\
 308     for(i=0; i<h; i++){\
 309         const uint64_t a= LD64(pixels          );\
 310         const uint64_t b= LD64(pixels+line_size);\
 311         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 312         pixels+=line_size;\
 313         block +=line_size;\
 314     }\
 315 }\
 316 \
 317 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 318 {\
 319     int i;\
 320     for(i=0; i<h; i++){\
 321         const uint64_t a= LD64(pixels          );\
 322         const uint64_t b= LD64(pixels+line_size);\
 323         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 324         pixels+=line_size;\
 325         block +=line_size;\
 326     }\
 327 }\
 328 \
 329 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 330 {\
 331         int i;\
 332         const uint64_t a= LD64(pixels  );\
 333         const uint64_t b= LD64(pixels+1);\
 334         uint64_t l0=  (a&0x0303030303030303ULL)\
 335                     + (b&0x0303030303030303ULL)\
 336                     + 0x0202020202020202ULL;\
 337         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 338                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 339         uint64_t l1,h1;\
 340 \
 341         pixels+=line_size;\
 342         for(i=0; i<h; i+=2){\
 343             uint64_t a= LD64(pixels  );\
 344             uint64_t b= LD64(pixels+1);\
 345             l1=  (a&0x0303030303030303ULL)\
 346                + (b&0x0303030303030303ULL);\
 347             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 348               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 349             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 350             pixels+=line_size;\
 351             block +=line_size;\
 352             a= LD64(pixels  );\
 353             b= LD64(pixels+1);\
 354             l0=  (a&0x0303030303030303ULL)\
 355                + (b&0x0303030303030303ULL)\
 356                + 0x0202020202020202ULL;\
 357             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 358               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 359             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 360             pixels+=line_size;\
 361             block +=line_size;\
 362         }\
 363 }\
 364 \
 365 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 366 {\
 367         int i;\
 368         const uint64_t a= LD64(pixels  );\
 369         const uint64_t b= LD64(pixels+1);\
 370         uint64_t l0=  (a&0x0303030303030303ULL)\
 371                     + (b&0x0303030303030303ULL)\
 372                     + 0x0101010101010101ULL;\
 373         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 374                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 375         uint64_t l1,h1;\
 376 \
 377         pixels+=line_size;\
 378         for(i=0; i<h; i+=2){\
 379             uint64_t a= LD64(pixels  );\
 380             uint64_t b= LD64(pixels+1);\
 381             l1=  (a&0x0303030303030303ULL)\
 382                + (b&0x0303030303030303ULL);\
 383             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 384               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 385             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 386             pixels+=line_size;\
 387             block +=line_size;\
 388             a= LD64(pixels  );\
 389             b= LD64(pixels+1);\
 390             l0=  (a&0x0303030303030303ULL)\
 391                + (b&0x0303030303030303ULL)\
 392                + 0x0101010101010101ULL;\
 393             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 394               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 395             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 396             pixels+=line_size;\
 397             block +=line_size;\
 398         }\
 399 }\
 400 \
 401 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
 402     OPNAME ## _pixels,\
 403     OPNAME ## _pixels_x2,\
 404     OPNAME ## _pixels_y2,\
 405     OPNAME ## _pixels_xy2,\
 406 };\
 407 \
 408 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
 409     OPNAME ## _pixels,\
 410     OPNAME ## _no_rnd_pixels_x2,\
 411     OPNAME ## _no_rnd_pixels_y2,\
 412     OPNAME ## _no_rnd_pixels_xy2,\
 413 };
 414
 415 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 #else /* 32-bit variant (the 64-bit variant above is compiled out by "#if 0") */

 /*
  * PIXOP2(OPNAME, OP) generates a family of functions that process a block
  * 8 pixels wide and h rows high, four pixels at a time packed in a 32-bit
  * word, plus two dispatch tables.
  *
  *   OP(dst, val)  combines the computed word val into the destination word
  *                 (op_put stores it, op_avg averages it with dst).
  *   block         destination; accessed through uint32_t pointers, so it is
  *                 presumably expected to be 32-bit aligned -- confirm.
  *   pixels        source; read with LD32, so any offset is allowed.
  *   line_size     row stride in bytes, shared by block and pixels.
  *
  * Per-byte arithmetic (the masks keep bits from leaking between bytes):
  *   (a&b) + (((a^b)&0xFE)>>1)  ==  (a+b)>>1      average, rounding down
  *   (a|b) - (((a^b)&0xFE)>>1)  ==  (a+b+1)>>1    average, rounding up
  *   xy2: each byte is split into its low 2 bits (l) and its high 6 bits
  *        pre-shifted right by 2 (h); the rounding constant (2, or 1 for the
  *        no_rnd version) is folded into l0, so
  *        h0+h1+(((l0+l1)>>2)&0x0F)  ==  (a+b+c+d+rounder)>>2
  *
  * Source footprint: _x2 reads 9 bytes per row, _y2 reads h+1 rows, _xy2
  * reads 9 bytes of more than h rows.  The _xy2 loops handle two rows per
  * iteration, so h is expected to be even.
  */
 #define PIXOP2(OPNAME, OP) \
 /* no interpolation: two 32-bit words per row */\
 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int i;\
     for(i=0; i<h; i++){\
         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
         pixels+=line_size;\
         block +=line_size;\
     }\
 }\
 \
 /* horizontal neighbour average, rounding down */\
 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int i;\
     for(i=0; i<h; i++){\
         int j;\
         for(j=0; j<2; j++){\
             const uint32_t a= LD32(pixels  );\
             const uint32_t b= LD32(pixels+1);\
             OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
             pixels+=4;\
             block +=4;\
         }\
         pixels+=line_size-8;\
         block +=line_size-8;\
     }\
 }\
 \
 /* horizontal neighbour average, rounding up */\
 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int i;\
     for(i=0; i<h; i++){\
         int j;\
         for(j=0; j<2; j++){\
             const uint32_t a= LD32(pixels  );\
             const uint32_t b= LD32(pixels+1);\
             OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
             pixels+=4;\
             block +=4;\
         }\
         pixels+=line_size-8;\
         block +=line_size-8;\
     }\
 }\
 \
 /* vertical neighbour average (row and row+1), rounding down */\
 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int i;\
     for(i=0; i<h; i++){\
         int j;\
         for(j=0; j<2; j++){\
             const uint32_t a= LD32(pixels          );\
             const uint32_t b= LD32(pixels+line_size);\
             OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
             pixels+=4;\
             block +=4;\
         }\
         pixels+=line_size-8;\
         block +=line_size-8;\
     }\
 }\
 \
 /* vertical neighbour average (row and row+1), rounding up */\
 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int i;\
     for(i=0; i<h; i++){\
         int j;\
         for(j=0; j<2; j++){\
             const uint32_t a= LD32(pixels          );\
             const uint32_t b= LD32(pixels+line_size);\
             OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
             pixels+=4;\
             block +=4;\
         }\
         pixels+=line_size-8;\
         block +=line_size-8;\
     }\
 }\
 \
 /* 2x2 neighbourhood average with rounder 2; the left 4 columns are done for all rows, then the right 4 */\
 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int j;\
     for(j=0; j<2; j++){\
         int i;\
         const uint32_t a= LD32(pixels  );\
         const uint32_t b= LD32(pixels+1);\
         uint32_t l0=  (a&0x03030303UL)\
                     + (b&0x03030303UL)\
                     + 0x02020202UL;\
         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                    + ((b&0xFCFCFCFCUL)>>2);\
         uint32_t l1,h1;\
 \
         pixels+=line_size;\
         for(i=0; i<h; i+=2){\
             uint32_t a= LD32(pixels  );\
             uint32_t b= LD32(pixels+1);\
             l1=  (a&0x03030303UL)\
                + (b&0x03030303UL);\
             h1= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
             pixels+=line_size;\
             block +=line_size;\
             a= LD32(pixels  );\
             b= LD32(pixels+1);\
             l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
             h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
             pixels+=line_size;\
             block +=line_size;\
         }\
         /* rewind to the top row and step 4 columns to the right */\
         pixels+=4-line_size*(h+1);\
         block +=4-line_size*h;\
     }\
 }\
 \
 /* 2x2 neighbourhood average with rounder 1; same traversal as _pixels_xy2 */\
 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 {\
     int j;\
     for(j=0; j<2; j++){\
         int i;\
         const uint32_t a= LD32(pixels  );\
         const uint32_t b= LD32(pixels+1);\
         uint32_t l0=  (a&0x03030303UL)\
                     + (b&0x03030303UL)\
                     + 0x01010101UL;\
         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                    + ((b&0xFCFCFCFCUL)>>2);\
         uint32_t l1,h1;\
 \
         pixels+=line_size;\
         for(i=0; i<h; i+=2){\
             uint32_t a= LD32(pixels  );\
             uint32_t b= LD32(pixels+1);\
             l1=  (a&0x03030303UL)\
                + (b&0x03030303UL);\
             h1= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
             pixels+=line_size;\
             block +=line_size;\
             a= LD32(pixels  );\
             b= LD32(pixels+1);\
             l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x01010101UL;\
             h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
             pixels+=line_size;\
             block +=line_size;\
         }\
         /* rewind to the top row and step 4 columns to the right */\
         pixels+=4-line_size*(h+1);\
         block +=4-line_size*h;\
     }\
 }\
 \
 /* dispatch table, index order: plain, x2, y2, xy2 */\
 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
     OPNAME ## _pixels,\
     OPNAME ## _pixels_x2,\
     OPNAME ## _pixels_y2,\
     OPNAME ## _pixels_xy2,\
 };\
 \
 /* same index order; entry 0 reuses the plain function, which does no rounding */\
 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
     OPNAME ## _pixels,\
     OPNAME ## _no_rnd_pixels_x2,\
     OPNAME ## _no_rnd_pixels_y2,\
     OPNAME ## _no_rnd_pixels_xy2,\
 };
 /* OP used for the avg_* family: per-byte average of a and b, rounding up, stored into a */
 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 #endif
 595
 /* OP used for the put_* family: plain store of the computed word */
 #define op_put(a, b) a = b

 /*
  * Instantiate the PIXOP2 function families and their dispatch tables:
  * avg_* combines the result with the destination through op_avg,
  * put_* overwrites the destination through op_put.
  */
 PIXOP2(avg, op_avg)
 PIXOP2(put, op_put)
 /* the OP helpers are only needed by the two instantiations above */
 #undef op_avg
 #undef op_put
 602
 603 #if 0
 604 /* FIXME: this stuff could be removed as it's not really used anymore */
 605 #define PIXOP(BTYPE, OPNAME, OP, INCR)                                                   \
 606                                                                                          \
 607 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
 608 {                                                                                        \
 609     BTYPE *p;                                                                            \
 610     const UINT8 *pix;                                                                    \
 611                                                                                          \
 612     p = block;                                                                           \
 613     pix = pixels;                                                                        \
 614     do {                                                                                 \
 615         OP(p[0], pix[0]);                                                                  \
 616         OP(p[1], pix[1]);                                                                  \
 617         OP(p[2], pix[2]);                                                                  \
 618         OP(p[3], pix[3]);                                                                  \
 619         OP(p[4], pix[4]);                                                                  \
 620         OP(p[5], pix[5]);                                                                  \
 621         OP(p[6], pix[6]);                                                                  \
 622         OP(p[7], pix[7]);                                                                  \
 623         pix += line_size;                                                                \
 624         p += INCR;                                                                       \
 625     } while (--h);;                                                                       \
 626 }                                                                                        \
 627                                                                                          \
 628 static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
 629 {                                                                                        \
 630     BTYPE *p;                                                                          \
 631     const UINT8 *pix;                                                                    \
 632                                                                                          \
 633     p = block;                                                                           \
 634     pix = pixels;                                                                        \
 635     do {                                                                   \
 636         OP(p[0], avg2(pix[0], pix[1]));                                                    \
 637         OP(p[1], avg2(pix[1], pix[2]));                                                    \
 638         OP(p[2], avg2(pix[2], pix[3]));                                                    \
 639         OP(p[3], avg2(pix[3], pix[4]));                                                    \
 640         OP(p[4], avg2(pix[4], pix[5]));                                                    \
 641         OP(p[5], avg2(pix[5], pix[6]));                                                    \
 642         OP(p[6], avg2(pix[6], pix[7]));                                                    \
 643         OP(p[7], avg2(pix[7], pix[8]));                                                    \
 644         pix += line_size;                                                                \
 645         p += INCR;                                                                       \
 646     } while (--h);                                                                        \
 647 }                                                                                        \
 648                                                                                          \
 649 static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
 650 {                                                                                        \
 651     BTYPE *p;                                                                          \
 652     const UINT8 *pix;                                                                    \
 653     const UINT8 *pix1;                                                                   \
 654                                                                                          \
 655     p = block;                                                                           \
 656     pix = pixels;                                                                        \
 657     pix1 = pixels + line_size;                                                           \
 658     do {                                                                                 \
 659         OP(p[0], avg2(pix[0], pix1[0]));                                                   \
 660         OP(p[1], avg2(pix[1], pix1[1]));                                                   \
 661         OP(p[2], avg2(pix[2], pix1[2]));                                                   \
 662         OP(p[3], avg2(pix[3], pix1[3]));                                                   \
 663         OP(p[4], avg2(pix[4], pix1[4]));                                                   \
 664         OP(p[5], avg2(pix[5], pix1[5]));                                                   \
 665         OP(p[6], avg2(pix[6], pix1[6]));                                                   \
 666         OP(p[7], avg2(pix[7], pix1[7]));                                                   \
 667         pix += line_size;                                                                \
 668         pix1 += line_size;                                                               \
 669         p += INCR;                                                                       \
 670     } while(--h);                                                                         \
 671 }                                                                                        \
 672                                                                                          \
 673 static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
 674 {                                                                                        \
 675     BTYPE *p;                                                                          \
 676     const UINT8 *pix;                                                                    \
 677     const UINT8 *pix1;                                                                   \
 678                                                                                          \
 679     p = block;                                                                           \
 680     pix = pixels;                                                                        \
 681     pix1 = pixels + line_size;                                                           \
 682     do {                                                                   \
 683         OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1]));                                  \
 684         OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2]));                                  \
 685         OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3]));                                  \
 686         OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4]));                                  \
 687         OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5]));                                  \
 688         OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6]));                                  \
 689         OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7]));                                  \
 690         OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8]));                                  \
 691         pix += line_size;                                                                \
 692         pix1 += line_size;                                                               \
 693         p += INCR;                                                                       \
 694     } while(--h);                                                                         \
 695 }                                                                                        \
 696                                                                                          \
 697 void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
 698     OPNAME ## _pixels,                                                                   \
 699     OPNAME ## _pixels_x2,                                                                \
 700     OPNAME ## _pixels_y2,                                                                \
 701     OPNAME ## _pixels_xy2,                                                               \
 702 };
 703
 704 /* rounding primitives */
 705 #define avg2(a,b) ((a+b+1)>>1)
 706 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 707
 708 #define op_avg(a, b) a = avg2(a, b)
 709 #define op_sub(a, b) a -= b
 710
 711 PIXOP(DCTELEM, sub, op_sub, 8)
 712
 713 /* not rounding primitives */
 714 #undef avg2
 715 #undef avg4
 716 #define avg2(a,b) ((a+b)>>1)
 717 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
 718
 719 /* motion estimation */
 720
 721 #undef avg2
 722 #undef avg4
 723 #endif
 724
 725 #define avg2(a,b) ((a+b+1)>>1)
 726 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 727
 728 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
 729 {
 730     const int A=(16-x16)*(16-y16);
 731     const int B=(   x16)*(16-y16);
 732     const int C=(16-x16)*(   y16);
 733     const int D=(   x16)*(   y16);
 734     int i;
 735     rounder= 128 - rounder;
 736
 737     for(i=0; i<h; i++)
 738     {
 739         dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
 740         dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
 741         dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
 742         dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
 743         dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
 744         dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
 745         dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
 746         dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
 747         dst+= srcStride;
 748         src+= srcStride;
 749     }
 750 }
 751
 752 static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
 753 {
 754     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 755     int i;
 756     for(i=0; i<h; i++)
 757     {
 758         dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
 759         dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
 760         dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
 761         dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
 762         dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
 763         dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
 764         dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
 765         dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
 766         dst+=dstStride;
 767         src+=srcStride;
 768     }
 769 }
 770
 771 static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
 772 {
 773     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 774     int i;
 775     for(i=0; i<w; i++)
 776     {
 777         const int src0= src[0*srcStride];
 778         const int src1= src[1*srcStride];
 779         const int src2= src[2*srcStride];
 780         const int src3= src[3*srcStride];
 781         const int src4= src[4*srcStride];
 782         const int src5= src[5*srcStride];
 783         const int src6= src[6*srcStride];
 784         const int src7= src[7*srcStride];
 785         const int src8= src[8*srcStride];
 786         dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
 787         dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
 788         dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
 789         dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
 790         dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
 791         dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
 792         dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
 793         dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
 794         dst++;
 795         src++;
 796     }
 797 }
 798
 799 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
 800 {
 801     int i;
 802     for(i=0; i<8; i++)
 803     {
 804         dst[0]= src[0];
 805         dst[1]= src[1];
 806         dst[2]= src[2];
 807         dst[3]= src[3];
 808         dst[4]= src[4];
 809         dst[5]= src[5];
 810         dst[6]= src[6];
 811         dst[7]= src[7];
 812         dst+=dstStride;
 813         src+=srcStride;
 814     }
 815 }
 816
 817 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
 818 {
 819     int i;
 820     for(i=0; i<8; i++)
 821     {
 822         dst[0]= (src1[0] + src2[0] + r)>>1;
 823         dst[1]= (src1[1] + src2[1] + r)>>1;
 824         dst[2]= (src1[2] + src2[2] + r)>>1;
 825         dst[3]= (src1[3] + src2[3] + r)>>1;
 826         dst[4]= (src1[4] + src2[4] + r)>>1;
 827         dst[5]= (src1[5] + src2[5] + r)>>1;
 828         dst[6]= (src1[6] + src2[6] + r)>>1;
 829         dst[7]= (src1[7] + src2[7] + r)>>1;
 830         dst+=dstStride;
 831         src1+=srcStride;
 832         src2+=8;
 833     }
 834 }
 835
 836 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
 837 {
 838     int i;
 839     for(i=0; i<8; i++)
 840     {
 841         dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
 842         dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
 843         dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
 844         dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
 845         dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
 846         dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
 847         dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
 848         dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
 849         dst+=dstStride;
 850         src1+=srcStride;
 851         src2+=8;
 852         src3+=8;
 853         src4+=8;
 854     }
 855 }
 856
 857 #define QPEL_MC(r, name) \
 858 static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 859 {\
 860     put_block(dst, src, dstStride, srcStride);\
 861 }\
 862 \
 863 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 864 {\
 865     UINT8 half[64];\
 866     qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
 867     avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
 868 }\
 869 \
 870 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 871 {\
 872     qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
 873 }\
 874 \
 875 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 876 {\
 877     UINT8 half[64];\
 878     qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
 879     avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
 880 }\
 881 \
 882 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 883 {\
 884     UINT8 half[64];\
 885     qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
 886     avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
 887 }\
 888 \
 889 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 890 {\
 891     qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
 892 }\
 893 \
 894 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 895 {\
 896     UINT8 half[64];\
 897     qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
 898     avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
 899 }\
 900 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 901 {\
 902     UINT8 halfH[72];\
 903     UINT8 halfV[64];\
 904     UINT8 halfHV[64];\
 905     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 906     qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
 907     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 908     avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
 909 }\
 910 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 911 {\
 912     UINT8 halfH[72];\
 913     UINT8 halfV[64];\
 914     UINT8 halfHV[64];\
 915     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 916     qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
 917     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 918     avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
 919 }\
 920 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 921 {\
 922     UINT8 halfH[72];\
 923     UINT8 halfV[64];\
 924     UINT8 halfHV[64];\
 925     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 926     qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
 927     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 928     avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
 929 }\
 930 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 931 {\
 932     UINT8 halfH[72];\
 933     UINT8 halfV[64];\
 934     UINT8 halfHV[64];\
 935     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 936     qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
 937     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 938     avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
 939 }\
 940 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 941 {\
 942     UINT8 halfH[72];\
 943     UINT8 halfHV[64];\
 944     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 945     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 946     avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
 947 }\
 948 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 949 {\
 950     UINT8 halfH[72];\
 951     UINT8 halfHV[64];\
 952     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 953     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 954     avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
 955 }\
 956 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 957 {\
 958     UINT8 halfH[72];\
 959     UINT8 halfV[64];\
 960     UINT8 halfHV[64];\
 961     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 962     qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
 963     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 964     avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
 965 }\
 966 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 967 {\
 968     UINT8 halfH[72];\
 969     UINT8 halfV[64];\
 970     UINT8 halfHV[64];\
 971     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 972     qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
 973     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 974     avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
 975 }\
 976 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 977 {\
 978     UINT8 halfH[72];\
 979     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 980     qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
 981 }\
 982 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
 983     qpel_mc00_c ## name,                                                                   \
 984     qpel_mc10_c ## name,                                                                   \
 985     qpel_mc20_c ## name,                                                                   \
 986     qpel_mc30_c ## name,                                                                   \
 987     qpel_mc01_c ## name,                                                                   \
 988     qpel_mc11_c ## name,                                                                   \
 989     qpel_mc21_c ## name,                                                                   \
 990     qpel_mc31_c ## name,                                                                   \
 991     qpel_mc02_c ## name,                                                                   \
 992     qpel_mc12_c ## name,                                                                   \
 993     qpel_mc22_c ## name,                                                                   \
 994     qpel_mc32_c ## name,                                                                   \
 995     qpel_mc03_c ## name,                                                                   \
 996     qpel_mc13_c ## name,                                                                   \
 997     qpel_mc23_c ## name,                                                                   \
 998     qpel_mc33_c ## name,                                                                   \
 999 };
1000
1001 QPEL_MC(0, _rnd)
1002 QPEL_MC(1, _no_rnd)
1003
1004 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1005 {
1006     int s, i;
1007
1008     s = 0;
1009     for(i=0;i<16;i++) {
1010         s += abs(pix1[0] - pix2[0]);
1011         s += abs(pix1[1] - pix2[1]);
1012         s += abs(pix1[2] - pix2[2]);
1013         s += abs(pix1[3] - pix2[3]);
1014         s += abs(pix1[4] - pix2[4]);
1015         s += abs(pix1[5] - pix2[5]);
1016         s += abs(pix1[6] - pix2[6]);
1017         s += abs(pix1[7] - pix2[7]);
1018         s += abs(pix1[8] - pix2[8]);
1019         s += abs(pix1[9] - pix2[9]);
1020         s += abs(pix1[10] - pix2[10]);
1021         s += abs(pix1[11] - pix2[11]);
1022         s += abs(pix1[12] - pix2[12]);
1023         s += abs(pix1[13] - pix2[13]);
1024         s += abs(pix1[14] - pix2[14]);
1025         s += abs(pix1[15] - pix2[15]);
1026         pix1 += line_size;
1027         pix2 += line_size;
1028     }
1029     return s;
1030 }
1031
1032 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1033 {
1034     int s, i;
1035
1036     s = 0;
1037     for(i=0;i<16;i++) {
1038         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1039         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1040         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1041         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1042         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1043         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1044         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1045         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1046         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1047         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1048         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1049         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1050         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1051         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1052         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1053         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1054         pix1 += line_size;
1055         pix2 += line_size;
1056     }
1057     return s;
1058 }
1059
1060 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1061 {
1062     int s, i;
1063     UINT8 *pix3 = pix2 + line_size;
1064
1065     s = 0;
1066     for(i=0;i<16;i++) {
1067         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1068         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1069         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1070         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1071         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1072         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1073         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1074         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1075         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1076         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1077         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1078         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1079         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1080         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1081         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1082         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1083         pix1 += line_size;
1084         pix2 += line_size;
1085         pix3 += line_size;
1086     }
1087     return s;
1088 }
1089
1090 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1091 {
1092     int s, i;
1093     UINT8 *pix3 = pix2 + line_size;
1094
1095     s = 0;
1096     for(i=0;i<16;i++) {
1097         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1098         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1099         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1100         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1101         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1102         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1103         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1104         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1105         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1106         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1107         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1108         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1109         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1110         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1111         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1112         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1113         pix1 += line_size;
1114         pix2 += line_size;
1115         pix3 += line_size;
1116     }
1117     return s;
1118 }
1119
1120 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1121 {
1122     int s, i;
1123
1124     s = 0;
1125     for(i=0;i<8;i++) {
1126         s += abs(pix1[0] - pix2[0]);
1127         s += abs(pix1[1] - pix2[1]);
1128         s += abs(pix1[2] - pix2[2]);
1129         s += abs(pix1[3] - pix2[3]);
1130         s += abs(pix1[4] - pix2[4]);
1131         s += abs(pix1[5] - pix2[5]);
1132         s += abs(pix1[6] - pix2[6]);
1133         s += abs(pix1[7] - pix2[7]);
1134         pix1 += line_size;
1135         pix2 += line_size;
1136     }
1137     return s;
1138 }
1139
1140 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1141 {
1142     int s, i;
1143
1144     s = 0;
1145     for(i=0;i<8;i++) {
1146         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1147         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1148         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1149         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1150         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1151         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1152         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1153         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1154         pix1 += line_size;
1155         pix2 += line_size;
1156     }
1157     return s;
1158 }
1159
1160 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1161 {
1162     int s, i;
1163     UINT8 *pix3 = pix2 + line_size;
1164
1165     s = 0;
1166     for(i=0;i<8;i++) {
1167         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1168         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1169         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1170         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1171         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1172         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1173         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1174         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1175         pix1 += line_size;
1176         pix2 += line_size;
1177         pix3 += line_size;
1178     }
1179     return s;
1180 }
1181
1182 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1183 {
1184     int s, i;
1185     UINT8 *pix3 = pix2 + line_size;
1186
1187     s = 0;
1188     for(i=0;i<8;i++) {
1189         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1190         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1191         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1192         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1193         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1194         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1195         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1196         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1197         pix1 += line_size;
1198         pix2 += line_size;
1199         pix3 += line_size;
1200     }
1201     return s;
1202 }
1203
1204 /* permute block according so that it corresponds to the MMX idct
1205    order */
1206 #ifdef SIMPLE_IDCT
1207  /* general permutation, but perhaps slightly slower */
1208 void block_permute(INT16 *block)
1209 {
1210         int i;
1211         INT16 temp[64];
1212
1213         for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1214
1215         for(i=0; i<64; i++) block[i] = temp[i];
1216 }
1217 #else
1218
1219 void block_permute(INT16 *block)
1220 {
1221     int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1222     int i;
1223
1224     for(i=0;i<8;i++) {
1225         tmp1 = block[1];
1226         tmp2 = block[2];
1227         tmp3 = block[3];
1228         tmp4 = block[4];
1229         tmp5 = block[5];
1230         tmp6 = block[6];
1231         block[1] = tmp2;
1232         block[2] = tmp4;
1233         block[3] = tmp6;
1234         block[4] = tmp1;
1235         block[5] = tmp3;
1236         block[6] = tmp5;
1237         block += 8;
1238     }
1239 }
1240 #endif
1241
1242 void clear_blocks_c(DCTELEM *blocks)
1243 {
1244     memset(blocks, 0, sizeof(DCTELEM)*6*64);
1245 }
1246
1247 void dsputil_init(void)
1248 {
1249     int i, j;
1250     int use_permuted_idct;
1251
1252     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1253     for(i=0;i<MAX_NEG_CROP;i++) {
1254         cropTbl[i] = 0;
1255         cropTbl[i + MAX_NEG_CROP + 256] = 255;
1256     }
1257
1258     for(i=0;i<512;i++) {
1259         squareTbl[i] = (i - 256) * (i - 256);
1260     }
1261
1262 #ifdef SIMPLE_IDCT
1263     ff_idct = simple_idct;
1264 #else
1265     ff_idct = j_rev_dct;
1266 #endif
1267     get_pixels = get_pixels_c;
1268     diff_pixels = diff_pixels_c;
1269     put_pixels_clamped = put_pixels_clamped_c;
1270     add_pixels_clamped = add_pixels_clamped_c;
1271     gmc1= gmc1_c;
1272     clear_blocks= clear_blocks_c;
1273
1274     pix_abs16x16     = pix_abs16x16_c;
1275     pix_abs16x16_x2  = pix_abs16x16_x2_c;
1276     pix_abs16x16_y2  = pix_abs16x16_y2_c;
1277     pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1278     pix_abs8x8     = pix_abs8x8_c;
1279     pix_abs8x8_x2  = pix_abs8x8_x2_c;
1280     pix_abs8x8_y2  = pix_abs8x8_y2_c;
1281     pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1282     av_fdct = fdct_ifast;
1283
1284     use_permuted_idct = 1;
1285
1286 #ifdef HAVE_MMX
1287     dsputil_init_mmx();
1288 #endif
1289 #ifdef ARCH_ARMV4L
1290     dsputil_init_armv4l();
1291 #endif
1292 #ifdef HAVE_MLIB
1293     dsputil_init_mlib();
1294     use_permuted_idct = 0;
1295 #endif
1296 #ifdef ARCH_ALPHA
1297     dsputil_init_alpha();
1298     use_permuted_idct = 0;
1299 #endif
1300
1301 #ifdef SIMPLE_IDCT
1302     if(ff_idct == simple_idct) use_permuted_idct=0;
1303 #endif
1304
1305     if(use_permuted_idct)
1306 #ifdef SIMPLE_IDCT
1307         for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1308 #else
1309         for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1310 #endif
1311     else
1312         for(i=0; i<64; i++) permutation[i]=i;
1313
1314     for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1315     for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1316
1317     if (use_permuted_idct) {
1318         /* permute for IDCT */
1319         for(i=0;i<64;i++) {
1320             j = zigzag_direct[i];
1321             zigzag_direct[i] = block_permute_op(j);
1322             j = ff_alternate_horizontal_scan[i];
1323             ff_alternate_horizontal_scan[i] = block_permute_op(j);
1324             j = ff_alternate_vertical_scan[i];
1325             ff_alternate_vertical_scan[i] = block_permute_op(j);
1326         }
1327         block_permute(default_intra_matrix);
1328         block_permute(default_non_intra_matrix);
1329         block_permute(ff_mpeg4_default_intra_matrix);
1330         block_permute(ff_mpeg4_default_non_intra_matrix);
1331     }
1332
1333     build_zigzag_end();
1334 }
1335
/* Disable every operation that is not bit exact (meant for testing).
   This function only forwards to the MMX code, so it does nothing when
   HAVE_MMX is not defined. */
void avcodec_set_bit_exact(void)
{
#if defined(HAVE_MMX)
    dsputil_set_bit_exact_mmx();
#endif
}
1343
1344 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1345               int orig_linesize[3], int coded_linesize,
1346               AVCodecContext *avctx)
1347 {
1348     int quad, diff, x, y;
1349     UINT8 *orig, *coded;
1350     UINT32 *sq = squareTbl + 256;
1351
1352     quad = 0;
1353     diff = 0;
1354
1355     /* Luminance */
1356     orig = orig_image[0];
1357     coded = coded_image[0];
1358
1359     for (y=0;y<avctx->height;y++) {
1360         for (x=0;x<avctx->width;x++) {
1361             diff = *(orig + x) - *(coded + x);
1362             quad += sq[diff];
1363         }
1364         orig += orig_linesize[0];
1365         coded += coded_linesize;
1366     }
1367
1368     avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1369
1370     if (avctx->psnr_y) {
1371         avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1372         avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1373     } else
1374         avctx->psnr_y = 99.99;
1375 }
1376