git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Gerard Lantau.
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18  *
  19  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  20  */
  21 #include <stdlib.h>
  22 #include <stdio.h>
  23 #include <math.h>
  24 #include "avcodec.h"
  25 #include "dsputil.h"
  26 #include "simple_idct.h"
  27
/*
 * Global DSP function pointers.
 *
 * None of these is assigned in the part of the file shown here; they are
 * presumably filled in by an init routine (TODO confirm against the rest of
 * the file).  Plain-C implementations with matching signatures
 * (get_pixels_c, diff_pixels_c, put_pixels_clamped_c, add_pixels_clamped_c,
 * gmc1_c) are defined further down in this file.
 */
void (*ff_idct)(DCTELEM *block);
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
void (*clear_blocks)(DCTELEM *blocks);

/* 16x16 block comparison functions; going by the names: plain, and with
 * half-pel interpolation in x, y and both (x2 / y2 / xy2) -- TODO confirm. */
op_pixels_abs_func pix_abs16x16;
op_pixels_abs_func pix_abs16x16_x2;
op_pixels_abs_func pix_abs16x16_y2;
op_pixels_abs_func pix_abs16x16_xy2;

/* Same set of function pointers for 8x8 blocks. */
op_pixels_abs_func pix_abs8x8;
op_pixels_abs_func pix_abs8x8_x2;
op_pixels_abs_func pix_abs8x8_y2;
op_pixels_abs_func pix_abs8x8_xy2;
  45
/*
 * Lookup table used for clamping.  The clamped-pixel routines below access it
 * as (cropTbl + MAX_NEG_CROP)[v], so v may range over
 * [-MAX_NEG_CROP, 255 + MAX_NEG_CROP].  The contents are filled in elsewhere
 * (not visible in this part of the file).
 */
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
/* 512-entry table; presumably squares of -256..255 -- TODO confirm where it
 * is initialised. */
UINT32 squareTbl[512];

/* 64-entry matrices defined in other translation units; going by the names
 * these are the default MPEG-1/2 and MPEG-4 quantisation matrices. */
extern UINT16 default_intra_matrix[64];
extern UINT16 default_non_intra_matrix[64];
extern UINT16 ff_mpeg4_default_intra_matrix[64];
extern UINT16 ff_mpeg4_default_non_intra_matrix[64];
  53
/*
 * Zig-zag scan order of an 8x8 block: entry i is the raster index
 * (row * 8 + column) of the i-th coefficient in scan order.  The values are
 * a permutation of 0..63.
 * NOTE(review): the array is not const and the comments further down
 * distinguish "not permuted" copies, so it may be rewritten in place at
 * init time -- confirm.
 */
UINT8 zigzag_direct[64] = {
    0, 1, 8, 16, 9, 2, 3, 10,
    17, 24, 32, 25, 18, 11, 4, 5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13, 6, 7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  64
/* Inverse of the unpermuted zigzag_direct[], plus 1, for the MMX quantizer.
 * Not filled in within the visible part of this file. */
UINT16 __align8 inv_zigzag_direct16[64];

/* Unpermuted copy of zigzag_direct[] for the MMX quantizer.
 * Not filled in within the visible part of this file. */
UINT8 zigzag_direct_noperm[64];
  70
/*
 * Alternate 8x8 scan order that runs along rows first (0, 1, 2, 3, then
 * 8, 9, ...).  Entry i is the raster index of the i-th coefficient in scan
 * order.
 */
UINT8 ff_alternate_horizontal_scan[64] = {
    0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  81
/*
 * Alternate 8x8 scan order that runs down columns first (0, 8, 16, 24, then
 * 1, 9, ...).  Entry i is the raster index of the i-th coefficient in scan
 * order.
 */
UINT8 ff_alternate_vertical_scan[64] = {
    0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
  92
#ifdef SIMPLE_IDCT

/*
 * Input permutation for the simple_idct_mmx.
 * Only compiled when SIMPLE_IDCT is defined.  64 entries, each an index in
 * the range 0x00..0x3F.  The table is file-local and is not referenced in
 * the part of the file shown here.
 */
static UINT8 simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
#endif
 107
/*
 * Fixed-point reciprocal table for division by multiplication:
 *
 *     a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 *
 * The product does not fit in 32 bits, so it has to be formed in a 64-bit
 * type before shifting.  For b >= 2 the entries are 2^32/b rounded up
 * (spot-checked: inverse[2] == 2^31, inverse[3] == 1431655766).
 * inverse[0] is 0 and inverse[1] is 2^32-1, which is why b == 1 is excluded
 * from the guarantee above.
 */
UINT32 inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
 143
/*
 * Used to skip zeros at the end of a block.
 * Filled by build_zigzag_end(): zigzag_end[i] is one more than the largest
 * zigzag_direct[] value among scan positions 0..i.
 */
UINT8 zigzag_end[64];

/* 64-entry table that is not written in the visible part of this file;
 * presumably the coefficient permutation required by the selected IDCT --
 * TODO confirm. */
UINT8 permutation[64];
//UINT8 invPermutation[64];
 149
 150 static void build_zigzag_end()
 151 {
 152     int lastIndex;
 153     int lastIndexAfterPerm=0;
 154     for(lastIndex=0; lastIndex<64; lastIndex++)
 155     {
 156         if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
 157             lastIndexAfterPerm= zigzag_direct[lastIndex];
 158         zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
 159     }
 160 }
 161
 162 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
 163 {
 164     DCTELEM *p;
 165     const UINT8 *pix;
 166     int i;
 167
 168     /* read the pixels */
 169     p = block;
 170     pix = pixels;
 171     for(i=0;i<8;i++) {
 172         p[0] = pix[0];
 173         p[1] = pix[1];
 174         p[2] = pix[2];
 175         p[3] = pix[3];
 176         p[4] = pix[4];
 177         p[5] = pix[5];
 178         p[6] = pix[6];
 179         p[7] = pix[7];
 180         pix += line_size;
 181         p += 8;
 182     }
 183 }
 184
 185 void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
 186     DCTELEM *p;
 187     int i;
 188
 189     /* read the pixels */
 190     p = block;
 191     for(i=0;i<8;i++) {
 192         p[0] = s1[0] - s2[0];
 193         p[1] = s1[1] - s2[1];
 194         p[2] = s1[2] - s2[2];
 195         p[3] = s1[3] - s2[3];
 196         p[4] = s1[4] - s2[4];
 197         p[5] = s1[5] - s2[5];
 198         p[6] = s1[6] - s2[6];
 199         p[7] = s1[7] - s2[7];
 200         s1 += stride;
 201         s2 += stride;
 202         p += 8;
 203     }
 204 }
 205
 206
 207 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
 208 {
 209     const DCTELEM *p;
 210     UINT8 *pix;
 211     int i;
 212     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 213
 214     /* read the pixels */
 215     p = block;
 216     pix = pixels;
 217     for(i=0;i<8;i++) {
 218         pix[0] = cm[p[0]];
 219         pix[1] = cm[p[1]];
 220         pix[2] = cm[p[2]];
 221         pix[3] = cm[p[3]];
 222         pix[4] = cm[p[4]];
 223         pix[5] = cm[p[5]];
 224         pix[6] = cm[p[6]];
 225         pix[7] = cm[p[7]];
 226         pix += line_size;
 227         p += 8;
 228     }
 229 }
 230
 231 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
 232 {
 233     const DCTELEM *p;
 234     UINT8 *pix;
 235     int i;
 236     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 237
 238     /* read the pixels */
 239     p = block;
 240     pix = pixels;
 241     for(i=0;i<8;i++) {
 242         pix[0] = cm[pix[0] + p[0]];
 243         pix[1] = cm[pix[1] + p[1]];
 244         pix[2] = cm[pix[2] + p[2]];
 245         pix[3] = cm[pix[3] + p[3]];
 246         pix[4] = cm[pix[4] + p[4]];
 247         pix[5] = cm[pix[5] + p[5]];
 248         pix[6] = cm[pix[6] + p[6]];
 249         pix[7] = cm[pix[7] + p[7]];
 250         pix += line_size;
 251         p += 8;
 252     }
 253 }
 254
//FIXME someone with an alignment-picky CPU should change these
// LD32 / LD64 read 32 / 64 bits from address a by dereferencing a casted
// pointer.  No alignment handling is done, and the cast drops const when a
// is a pointer to const (as in the PIXOP2 functions below).

#define LD32(a) (*((uint32_t*)(a)))
#define LD64(a) (*((uint64_t*)(a)))
 259
 260 #if 0
 261
/*
 * PIXOP2(OPNAME, OP), 64-bit variant (this branch is currently disabled by
 * the enclosing "#if 0").
 *
 * Expands to seven functions and two 4-entry function tables.  Every function
 * handles an 8-byte-wide column of h rows, one 64-bit word per row, and
 * stores each result word through OP(destination, value):
 *
 *   OPNAME_pixels              copy of the source
 *   OPNAME_[no_rnd_]pixels_x2  average with the right neighbour (pixels+1)
 *   OPNAME_[no_rnd_]pixels_y2  average with the row below (pixels+line_size)
 *   OPNAME_[no_rnd_]pixels_xy2 average of the 2x2 neighbourhood
 *
 * Per-byte averaging without carries between bytes:
 *   (a|b) - (((a^b)&0xFE..)>>1)  equals (a+b+1)>>1  (rounding up)
 *   (a&b) + (((a^b)&0xFE..)>>1)  equals (a+b)>>1    (no_rnd)
 *
 * The xy2 functions split every byte into its low 2 bits (l0/l1, with the
 * rounding constant 2, or 1 for no_rnd, added once per output row) and its
 * high 6 bits already shifted right by 2 (h0/h1), so the four-way sum cannot
 * overflow a byte lane.  Their loops emit two rows per iteration, so an odd
 * h results in one extra row being written.
 *
 * OPNAME_pixels_tab and OPNAME_no_rnd_pixels_tab are indexed
 * 0: copy, 1: x2, 2: y2, 3: xy2; entry 0 is the same function in both.
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};

/* Store operator for the "avg" family, 64-bit lanes: per-byte rounded-up
 * average of the current destination word a and the new value b. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 408 #else // 64 bit variant
 409
/*
 * PIXOP2(OPNAME, OP), 32-bit variant (the branch that is actually compiled).
 * The "64 bit variant" remark on the #else line directly above describes the
 * disabled branch, not this one.
 *
 * Expands to seven functions and two 4-entry function tables.  Every function
 * handles an 8-byte-wide column of h rows as two 32-bit words per row and
 * stores each result word through OP(destination, value):
 *
 *   OPNAME_pixels              copy of the source
 *   OPNAME_[no_rnd_]pixels_x2  average with the right neighbour (pixels+1)
 *   OPNAME_[no_rnd_]pixels_y2  average with the row below (pixels+line_size)
 *   OPNAME_[no_rnd_]pixels_xy2 average of the 2x2 neighbourhood
 *
 * Per-byte averaging without carries between bytes:
 *   (a|b) - (((a^b)&0xFEFEFEFE)>>1)  equals (a+b+1)>>1  (rounding up)
 *   (a&b) + (((a^b)&0xFEFEFEFE)>>1)  equals (a+b)>>1    (no_rnd)
 *
 * The x2/y2 functions walk the two 4-byte halves of a row in the inner j
 * loop and then step to the next row with line_size-8.  The xy2 functions
 * instead process the left 4-byte column for all rows, rewind pixels/block
 * to the top, and repeat for the right column.  They split every byte into
 * its low 2 bits (l0/l1, with the rounding constant 2, or 1 for no_rnd,
 * added once per output row) and its high 6 bits already shifted right by 2
 * (h0/h1), so the four-way sum cannot overflow a byte lane.  Their row loop
 * emits two rows per iteration and the rewind assumes exactly h rows were
 * written, so h is expected to be even.
 *
 * OPNAME_pixels_tab and OPNAME_no_rnd_pixels_tab are indexed
 * 0: copy, 1: x2, 2: y2, 3: xy2; entry 0 is the same function in both.
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};
/* Store operator for the "avg" family, 32-bit lanes: per-byte rounded-up
 * average of the current destination word a and the new value b. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 586 #endif
 587
/* Store operator for the "put" family: plain assignment of b to a. */
#define op_put(a, b) a = b

/* Instantiate the avg_* and put_* function families together with their
 * tables (avg_pixels_tab, avg_no_rnd_pixels_tab, put_pixels_tab,
 * put_no_rnd_pixels_tab).  The two store operators are undefined again
 * afterwards. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
 594
 595 /* FIXME this stuff could be removed as its ot really used anymore */
 596 #define PIXOP(BTYPE, OPNAME, OP, INCR)                                                   \
 597                                                                                          \
 598 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
 599 {                                                                                        \
 600     BTYPE *p;                                                                            \
 601     const UINT8 *pix;                                                                    \
 602                                                                                          \
 603     p = block;                                                                           \
 604     pix = pixels;                                                                        \
 605     do {                                                                                 \
 606         OP(p[0], pix[0]);                                                                  \
 607         OP(p[1], pix[1]);                                                                  \
 608         OP(p[2], pix[2]);                                                                  \
 609         OP(p[3], pix[3]);                                                                  \
 610         OP(p[4], pix[4]);                                                                  \
 611         OP(p[5], pix[5]);                                                                  \
 612         OP(p[6], pix[6]);                                                                  \
 613         OP(p[7], pix[7]);                                                                  \
 614         pix += line_size;                                                                \
 615         p += INCR;                                                                       \
 616     } while (--h);;                                                                       \
 617 }                                                                                        \
 618                                                                                          \
 619 static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
 620 {                                                                                        \
 621     BTYPE *p;                                                                          \
 622     const UINT8 *pix;                                                                    \
 623                                                                                          \
 624     p = block;                                                                           \
 625     pix = pixels;                                                                        \
 626     do {                                                                   \
 627         OP(p[0], avg2(pix[0], pix[1]));                                                    \
 628         OP(p[1], avg2(pix[1], pix[2]));                                                    \
 629         OP(p[2], avg2(pix[2], pix[3]));                                                    \
 630         OP(p[3], avg2(pix[3], pix[4]));                                                    \
 631         OP(p[4], avg2(pix[4], pix[5]));                                                    \
 632         OP(p[5], avg2(pix[5], pix[6]));                                                    \
 633         OP(p[6], avg2(pix[6], pix[7]));                                                    \
 634         OP(p[7], avg2(pix[7], pix[8]));                                                    \
 635         pix += line_size;                                                                \
 636         p += INCR;                                                                       \
 637     } while (--h);                                                                        \
 638 }                                                                                        \
 639                                                                                          \
 640 static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
 641 {                                                                                        \
 642     BTYPE *p;                                                                          \
 643     const UINT8 *pix;                                                                    \
 644     const UINT8 *pix1;                                                                   \
 645                                                                                          \
 646     p = block;                                                                           \
 647     pix = pixels;                                                                        \
 648     pix1 = pixels + line_size;                                                           \
 649     do {                                                                                 \
 650         OP(p[0], avg2(pix[0], pix1[0]));                                                   \
 651         OP(p[1], avg2(pix[1], pix1[1]));                                                   \
 652         OP(p[2], avg2(pix[2], pix1[2]));                                                   \
 653         OP(p[3], avg2(pix[3], pix1[3]));                                                   \
 654         OP(p[4], avg2(pix[4], pix1[4]));                                                   \
 655         OP(p[5], avg2(pix[5], pix1[5]));                                                   \
 656         OP(p[6], avg2(pix[6], pix1[6]));                                                   \
 657         OP(p[7], avg2(pix[7], pix1[7]));                                                   \
 658         pix += line_size;                                                                \
 659         pix1 += line_size;                                                               \
 660         p += INCR;                                                                       \
 661     } while(--h);                                                                         \
 662 }                                                                                        \
 663                                                                                          \
 664 static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
 665 {                                                                                        \
 666     BTYPE *p;                                                                          \
 667     const UINT8 *pix;                                                                    \
 668     const UINT8 *pix1;                                                                   \
 669                                                                                          \
 670     p = block;                                                                           \
 671     pix = pixels;                                                                        \
 672     pix1 = pixels + line_size;                                                           \
 673     do {                                                                   \
 674         OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1]));                                  \
 675         OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2]));                                  \
 676         OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3]));                                  \
 677         OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4]));                                  \
 678         OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5]));                                  \
 679         OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6]));                                  \
 680         OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7]));                                  \
 681         OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8]));                                  \
 682         pix += line_size;                                                                \
 683         pix1 += line_size;                                                               \
 684         p += INCR;                                                                       \
 685     } while(--h);                                                                         \
 686 }                                                                                        \
 687                                                                                          \
 688 void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
 689     OPNAME ## _pixels,                                                                   \
 690     OPNAME ## _pixels_x2,                                                                \
 691     OPNAME ## _pixels_y2,                                                                \
 692     OPNAME ## _pixels_xy2,                                                               \
 693 };
 694
 695
 696 /* rounding primitives */
 697 #define avg2(a,b) ((a+b+1)>>1)
 698 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 699
 700 #define op_avg(a, b) a = avg2(a, b)
 701 #define op_sub(a, b) a -= b
 702
 703 PIXOP(DCTELEM, sub, op_sub, 8)
 704
 705 /* not rounding primitives */
 706 #undef avg2
 707 #undef avg4
 708 #define avg2(a,b) ((a+b)>>1)
 709 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
 710
 711 /* motion estimation */
 712
 713 #undef avg2
 714 #undef avg4
 715 #define avg2(a,b) ((a+b+1)>>1)
 716 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 717
 718 /* end of removeale stuff */
 719
 720 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
 721 {
 722     const int A=(16-x16)*(16-y16);
 723     const int B=(   x16)*(16-y16);
 724     const int C=(16-x16)*(   y16);
 725     const int D=(   x16)*(   y16);
 726     int i;
 727     rounder= 128 - rounder;
 728
 729     for(i=0; i<h; i++)
 730     {
 731         dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
 732         dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
 733         dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
 734         dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
 735         dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
 736         dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
 737         dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
 738         dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
 739         dst+= srcStride;
 740         src+= srcStride;
 741     }
 742 }
 743
 744 static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
 745 {
 746     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 747     int i;
 748     for(i=0; i<h; i++)
 749     {
 750         dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
 751         dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
 752         dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
 753         dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
 754         dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
 755         dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
 756         dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
 757         dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
 758         dst+=dstStride;
 759         src+=srcStride;
 760     }
 761 }
 762
 763 static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
 764 {
 765     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 766     int i;
 767     for(i=0; i<w; i++)
 768     {
 769         const int src0= src[0*srcStride];
 770         const int src1= src[1*srcStride];
 771         const int src2= src[2*srcStride];
 772         const int src3= src[3*srcStride];
 773         const int src4= src[4*srcStride];
 774         const int src5= src[5*srcStride];
 775         const int src6= src[6*srcStride];
 776         const int src7= src[7*srcStride];
 777         const int src8= src[8*srcStride];
 778         dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
 779         dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
 780         dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
 781         dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
 782         dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
 783         dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
 784         dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
 785         dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
 786         dst++;
 787         src++;
 788     }
 789 }
 790
 791 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
 792 {
 793     int i;
 794     for(i=0; i<8; i++)
 795     {
 796         dst[0]= src[0];
 797         dst[1]= src[1];
 798         dst[2]= src[2];
 799         dst[3]= src[3];
 800         dst[4]= src[4];
 801         dst[5]= src[5];
 802         dst[6]= src[6];
 803         dst[7]= src[7];
 804         dst+=dstStride;
 805         src+=srcStride;
 806     }
 807 }
 808
 809 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
 810 {
 811     int i;
 812     for(i=0; i<8; i++)
 813     {
 814         dst[0]= (src1[0] + src2[0] + r)>>1;
 815         dst[1]= (src1[1] + src2[1] + r)>>1;
 816         dst[2]= (src1[2] + src2[2] + r)>>1;
 817         dst[3]= (src1[3] + src2[3] + r)>>1;
 818         dst[4]= (src1[4] + src2[4] + r)>>1;
 819         dst[5]= (src1[5] + src2[5] + r)>>1;
 820         dst[6]= (src1[6] + src2[6] + r)>>1;
 821         dst[7]= (src1[7] + src2[7] + r)>>1;
 822         dst+=dstStride;
 823         src1+=srcStride;
 824         src2+=8;
 825     }
 826 }
 827
 828 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
 829 {
 830     int i;
 831     for(i=0; i<8; i++)
 832     {
 833         dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
 834         dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
 835         dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
 836         dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
 837         dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
 838         dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
 839         dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
 840         dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
 841         dst+=dstStride;
 842         src1+=srcStride;
 843         src2+=8;
 844         src3+=8;
 845         src4+=8;
 846     }
 847 }
 848
 849 #define QPEL_MC(r, name) \
 850 static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 851 {\
 852     put_block(dst, src, dstStride, srcStride);\
 853 }\
 854 \
 855 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 856 {\
 857     UINT8 half[64];\
 858     qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
 859     avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
 860 }\
 861 \
 862 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 863 {\
 864     qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
 865 }\
 866 \
 867 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 868 {\
 869     UINT8 half[64];\
 870     qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
 871     avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
 872 }\
 873 \
 874 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 875 {\
 876     UINT8 half[64];\
 877     qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
 878     avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
 879 }\
 880 \
 881 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 882 {\
 883     qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
 884 }\
 885 \
 886 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 887 {\
 888     UINT8 half[64];\
 889     qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
 890     avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
 891 }\
 892 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 893 {\
 894     UINT8 halfH[72];\
 895     UINT8 halfV[64];\
 896     UINT8 halfHV[64];\
 897     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 898     qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
 899     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 900     avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
 901 }\
 902 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 903 {\
 904     UINT8 halfH[72];\
 905     UINT8 halfV[64];\
 906     UINT8 halfHV[64];\
 907     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 908     qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
 909     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 910     avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
 911 }\
 912 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 913 {\
 914     UINT8 halfH[72];\
 915     UINT8 halfV[64];\
 916     UINT8 halfHV[64];\
 917     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 918     qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
 919     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 920     avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
 921 }\
 922 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 923 {\
 924     UINT8 halfH[72];\
 925     UINT8 halfV[64];\
 926     UINT8 halfHV[64];\
 927     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 928     qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
 929     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 930     avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
 931 }\
 932 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 933 {\
 934     UINT8 halfH[72];\
 935     UINT8 halfHV[64];\
 936     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 937     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 938     avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
 939 }\
 940 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 941 {\
 942     UINT8 halfH[72];\
 943     UINT8 halfHV[64];\
 944     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 945     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 946     avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
 947 }\
 948 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 949 {\
 950     UINT8 halfH[72];\
 951     UINT8 halfV[64];\
 952     UINT8 halfHV[64];\
 953     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 954     qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
 955     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 956     avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
 957 }\
 958 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 959 {\
 960     UINT8 halfH[72];\
 961     UINT8 halfV[64];\
 962     UINT8 halfHV[64];\
 963     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 964     qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
 965     qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
 966     avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
 967 }\
 968 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
 969 {\
 970     UINT8 halfH[72];\
 971     qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
 972     qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
 973 }\
 974 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
 975     qpel_mc00_c ## name,                                                                   \
 976     qpel_mc10_c ## name,                                                                   \
 977     qpel_mc20_c ## name,                                                                   \
 978     qpel_mc30_c ## name,                                                                   \
 979     qpel_mc01_c ## name,                                                                   \
 980     qpel_mc11_c ## name,                                                                   \
 981     qpel_mc21_c ## name,                                                                   \
 982     qpel_mc31_c ## name,                                                                   \
 983     qpel_mc02_c ## name,                                                                   \
 984     qpel_mc12_c ## name,                                                                   \
 985     qpel_mc22_c ## name,                                                                   \
 986     qpel_mc32_c ## name,                                                                   \
 987     qpel_mc03_c ## name,                                                                   \
 988     qpel_mc13_c ## name,                                                                   \
 989     qpel_mc23_c ## name,                                                                   \
 990     qpel_mc33_c ## name,                                                                   \
 991 };
 992
 993 QPEL_MC(0, _rnd)
 994 QPEL_MC(1, _no_rnd)
 995
 996 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 997 {
 998     int s, i;
 999
1000     s = 0;
1001     for(i=0;i<16;i++) {
1002         s += abs(pix1[0] - pix2[0]);
1003         s += abs(pix1[1] - pix2[1]);
1004         s += abs(pix1[2] - pix2[2]);
1005         s += abs(pix1[3] - pix2[3]);
1006         s += abs(pix1[4] - pix2[4]);
1007         s += abs(pix1[5] - pix2[5]);
1008         s += abs(pix1[6] - pix2[6]);
1009         s += abs(pix1[7] - pix2[7]);
1010         s += abs(pix1[8] - pix2[8]);
1011         s += abs(pix1[9] - pix2[9]);
1012         s += abs(pix1[10] - pix2[10]);
1013         s += abs(pix1[11] - pix2[11]);
1014         s += abs(pix1[12] - pix2[12]);
1015         s += abs(pix1[13] - pix2[13]);
1016         s += abs(pix1[14] - pix2[14]);
1017         s += abs(pix1[15] - pix2[15]);
1018         pix1 += line_size;
1019         pix2 += line_size;
1020     }
1021     return s;
1022 }
1023
/* Sum of absolute differences between the 16x16 block at pix1 and the block
   at pix2 averaged (avg2) with its right-hand neighbour; 17 columns of pix2
   are read per row. */
int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1051
/* Sum of absolute differences between the 16x16 block at pix1 and the block
   at pix2 averaged (avg2) with the row below it; 17 rows of pix2 are read. */
int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1081
/* Sum of absolute differences between the 16x16 block at pix1 and the block
   at pix2 averaged (avg4) over each pixel, its right-hand neighbour and the
   two pixels below them; 17 columns and 17 rows of pix2 are read. */
int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1111
/* Sum of absolute differences between the 8x8 blocks at pix1 and pix2; both
   blocks use line_size as their row pitch. */
int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1131
/* Sum of absolute differences between the 8x8 block at pix1 and the block at
   pix2 averaged (avg2) with its right-hand neighbour; 9 columns of pix2 are
   read per row. */
int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1151
/* Sum of absolute differences between the 8x8 block at pix1 and the block at
   pix2 averaged (avg2) with the row below it; 9 rows of pix2 are read. */
int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1173
/* Sum of absolute differences between the 8x8 block at pix1 and the block at
   pix2 averaged (avg4) over each pixel, its right-hand neighbour and the two
   pixels below them; 9 columns and 9 rows of pix2 are read. */
int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1195
/* Reorder the 64 coefficients of a block so that they are in the order the
   MMX idct expects. */
#ifdef SIMPLE_IDCT
 /* general permutation, but perhaps slightly slower */
void block_permute(INT16 *block)
{
        INT16 scratch[64];
        int pos;

        /* scatter every coefficient to its permuted position ... */
        for (pos = 0; pos < 64; pos++)
                scratch[ block_permute_op(pos) ] = block[pos];

        /* ... then copy the reordered block back in place */
        for (pos = 0; pos < 64; pos++)
                block[pos] = scratch[pos];
}
#else

void block_permute(INT16 *block)
{
    /* from[k] is the old column that ends up in column k of each row;
       columns 0 and 7 stay where they are */
    static const int from[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
    INT16 old_row[8];
    int row, k;

    for (row = 0; row < 8; row++, block += 8) {
        for (k = 0; k < 8; k++)
            old_row[k] = block[k];
        for (k = 1; k < 7; k++)
            block[k] = old_row[from[k]];
    }
}
#endif
1233
/* Zero the first 6*64 DCTELEMs at blocks (presumably six blocks of 64
   coefficients each). */
void clear_blocks_c(DCTELEM *blocks)
{
    enum { NB_BLOCKS = 6, BLOCK_COEFFS = 64 };

    memset(blocks, 0, sizeof(*blocks) * NB_BLOCKS * BLOCK_COEFFS);
}
1238
1239 void dsputil_init(void)
1240 {
1241     int i, j;
1242     int use_permuted_idct;
1243
1244     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1245     for(i=0;i<MAX_NEG_CROP;i++) {
1246         cropTbl[i] = 0;
1247         cropTbl[i + MAX_NEG_CROP + 256] = 255;
1248     }
1249
1250     for(i=0;i<512;i++) {
1251         squareTbl[i] = (i - 256) * (i - 256);
1252     }
1253
1254 #ifdef SIMPLE_IDCT
1255     ff_idct = simple_idct;
1256 #else
1257     ff_idct = j_rev_dct;
1258 #endif
1259     get_pixels = get_pixels_c;
1260     diff_pixels = diff_pixels_c;
1261     put_pixels_clamped = put_pixels_clamped_c;
1262     add_pixels_clamped = add_pixels_clamped_c;
1263     gmc1= gmc1_c;
1264     clear_blocks= clear_blocks_c;
1265
1266     pix_abs16x16     = pix_abs16x16_c;
1267     pix_abs16x16_x2  = pix_abs16x16_x2_c;
1268     pix_abs16x16_y2  = pix_abs16x16_y2_c;
1269     pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1270     pix_abs8x8     = pix_abs8x8_c;
1271     pix_abs8x8_x2  = pix_abs8x8_x2_c;
1272     pix_abs8x8_y2  = pix_abs8x8_y2_c;
1273     pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1274     av_fdct = jpeg_fdct_ifast;
1275
1276     use_permuted_idct = 1;
1277
1278 #ifdef HAVE_MMX
1279     dsputil_init_mmx();
1280 #endif
1281 #ifdef ARCH_ARMV4L
1282     dsputil_init_armv4l();
1283 #endif
1284 #ifdef HAVE_MLIB
1285     dsputil_init_mlib();
1286     use_permuted_idct = 0;
1287 #endif
1288 #ifdef ARCH_ALPHA
1289     dsputil_init_alpha();
1290     use_permuted_idct = 0;
1291 #endif
1292
1293 #ifdef SIMPLE_IDCT
1294     if(ff_idct == simple_idct) use_permuted_idct=0;
1295 #endif
1296
1297     if(use_permuted_idct)
1298 #ifdef SIMPLE_IDCT
1299         for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1300 #else
1301         for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1302 #endif
1303     else
1304         for(i=0; i<64; i++) permutation[i]=i;
1305
1306     for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1307     for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1308
1309     if (use_permuted_idct) {
1310         /* permute for IDCT */
1311         for(i=0;i<64;i++) {
1312             j = zigzag_direct[i];
1313             zigzag_direct[i] = block_permute_op(j);
1314             j = ff_alternate_horizontal_scan[i];
1315             ff_alternate_horizontal_scan[i] = block_permute_op(j);
1316             j = ff_alternate_vertical_scan[i];
1317             ff_alternate_vertical_scan[i] = block_permute_op(j);
1318         }
1319         block_permute(default_intra_matrix);
1320         block_permute(default_non_intra_matrix);
1321         block_permute(ff_mpeg4_default_intra_matrix);
1322         block_permute(ff_mpeg4_default_non_intra_matrix);
1323     }
1324
1325     build_zigzag_end();
1326 }
1327
1328 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1329               int orig_linesize[3], int coded_linesize,
1330               AVCodecContext *avctx)
1331 {
1332     int quad, diff, x, y;
1333     UINT8 *orig, *coded;
1334     UINT32 *sq = squareTbl + 256;
1335
1336     quad = 0;
1337     diff = 0;
1338
1339     /* Luminance */
1340     orig = orig_image[0];
1341     coded = coded_image[0];
1342
1343     for (y=0;y<avctx->height;y++) {
1344         for (x=0;x<avctx->width;x++) {
1345             diff = *(orig + x) - *(coded + x);
1346             quad += sq[diff];
1347         }
1348         orig += orig_linesize[0];
1349         coded += coded_linesize;
1350     }
1351
1352     avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1353
1354     if (avctx->psnr_y) {
1355         avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1356         avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1357     } else
1358         avctx->psnr_y = 99.99;
1359 }
1360