3 * Copyright (c) 2000, 2001 Fabrice Bellard.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
#include "simple_idct.h"

/* DSP primitive function pointers.  They are set to the portable C
 * implementations at init time and may be overwritten by CPU-specific
 * (e.g. MMX) init code with optimized versions. */
void (*ff_idct)(DCTELEM *block);
void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
void (*clear_blocks)(DCTELEM *blocks);
int (*pix_sum)(UINT8 * pix, int line_size);
int (*pix_norm1)(UINT8 * pix, int line_size);

/* Sum-of-absolute-differences comparators for the motion estimator:
 * full-pel and the three half-pel (x2 / y2 / xy2) variants,
 * for 16x16 and 8x8 blocks. */
op_pixels_abs_func pix_abs16x16;
op_pixels_abs_func pix_abs16x16_x2;
op_pixels_abs_func pix_abs16x16_y2;
op_pixels_abs_func pix_abs16x16_xy2;
op_pixels_abs_func pix_abs8x8;
op_pixels_abs_func pix_abs8x8_x2;
op_pixels_abs_func pix_abs8x8_y2;
op_pixels_abs_func pix_abs8x8_xy2;

/* Lookup tables filled in elsewhere: cropTbl clamps values to 0..255
 * (indexed with a MAX_NEG_CROP offset); squareTbl holds x*x for
 * x in [-256, 255]. */
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
UINT32 squareTbl[512];

/* Default quantization matrices defined in the codec-specific files. */
extern INT16 ff_mpeg1_default_intra_matrix[64];
extern INT16 ff_mpeg1_default_non_intra_matrix[64];
extern INT16 ff_mpeg4_default_intra_matrix[64];
extern INT16 ff_mpeg4_default_non_intra_matrix[64];
57 UINT8 zigzag_direct[64] = {
58 0, 1, 8, 16, 9, 2, 3, 10,
59 17, 24, 32, 25, 18, 11, 4, 5,
60 12, 19, 26, 33, 40, 48, 41, 34,
61 27, 20, 13, 6, 7, 14, 21, 28,
62 35, 42, 49, 56, 57, 50, 43, 36,
63 29, 22, 15, 23, 30, 37, 44, 51,
64 58, 59, 52, 45, 38, 31, 39, 46,
65 53, 60, 61, 54, 47, 55, 62, 63
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* (filled at init time; the +1 lets 0 mean "not in scan") */
UINT16 __align8 inv_zigzag_direct16[64];
/* not permutated zigzag_direct for MMX quantizer */
/* (filled at init time) */
UINT8 zigzag_direct_noperm[64];
74 UINT8 ff_alternate_horizontal_scan[64] = {
75 0, 1, 2, 3, 8, 9, 16, 17,
76 10, 11, 4, 5, 6, 7, 15, 14,
77 13, 12, 19, 18, 24, 25, 32, 33,
78 26, 27, 20, 21, 22, 23, 28, 29,
79 30, 31, 34, 35, 40, 41, 48, 49,
80 42, 43, 36, 37, 38, 39, 44, 45,
81 46, 47, 50, 51, 56, 57, 58, 59,
82 52, 53, 54, 55, 60, 61, 62, 63,
85 UINT8 ff_alternate_vertical_scan[64] = {
86 0, 8, 16, 24, 1, 9, 2, 10,
87 17, 25, 32, 40, 48, 56, 57, 49,
88 41, 33, 26, 18, 3, 11, 4, 12,
89 19, 27, 34, 42, 50, 58, 35, 43,
90 51, 59, 20, 28, 5, 13, 6, 14,
91 21, 29, 36, 44, 52, 60, 37, 45,
92 53, 61, 22, 30, 7, 15, 23, 31,
93 38, 46, 54, 62, 39, 47, 55, 63,
98 /* Input permutation for the simple_idct_mmx */
99 static UINT8 simple_mmx_permutation[64]={
100 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
101 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
102 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
103 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
104 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
105 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
106 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
107 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
111 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
112 UINT32 inverse[256]={
113 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
114 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
115 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
116 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
117 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
118 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
119 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
120 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
121 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
122 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
123 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
124 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
125 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
126 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
127 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
128 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
129 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
130 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
131 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
132 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
133 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
134 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
135 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
136 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
137 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
138 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
139 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
140 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
141 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
142 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
143 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
144 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
/* used to skip zeros at the end */
/* zigzag_end[i] = 1 + highest block index reached by scan positions 0..i
   (built by build_zigzag_end() below). */
UINT8 zigzag_end[64];

/* coefficient permutation table (identity unless a permuting IDCT is used) */
UINT8 permutation[64];
//UINT8 invPermutation[64];
153 static void build_zigzag_end(void)
156 int lastIndexAfterPerm=0;
157 for(lastIndex=0; lastIndex<64; lastIndex++)
159 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
160 lastIndexAfterPerm= zigzag_direct[lastIndex];
161 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
165 int pix_sum_c(UINT8 * pix, int line_size)
170 for (i = 0; i < 16; i++) {
171 for (j = 0; j < 16; j += 8) {
182 pix += line_size - 16;
187 int pix_norm1_c(UINT8 * pix, int line_size)
190 UINT32 *sq = squareTbl + 256;
193 for (i = 0; i < 16; i++) {
194 for (j = 0; j < 16; j += 8) {
205 pix += line_size - 16;
211 void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
215 /* read the pixels */
217 block[0] = pixels[0];
218 block[1] = pixels[1];
219 block[2] = pixels[2];
220 block[3] = pixels[3];
221 block[4] = pixels[4];
222 block[5] = pixels[5];
223 block[6] = pixels[6];
224 block[7] = pixels[7];
230 void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
234 /* read the pixels */
236 block[0] = s1[0] - s2[0];
237 block[1] = s1[1] - s2[1];
238 block[2] = s1[2] - s2[2];
239 block[3] = s1[3] - s2[3];
240 block[4] = s1[4] - s2[4];
241 block[5] = s1[5] - s2[5];
242 block[6] = s1[6] - s2[6];
243 block[7] = s1[7] - s2[7];
251 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
255 UINT8 *cm = cropTbl + MAX_NEG_CROP;
257 /* read the pixels */
259 pixels[0] = cm[block[0]];
260 pixels[1] = cm[block[1]];
261 pixels[2] = cm[block[2]];
262 pixels[3] = cm[block[3]];
263 pixels[4] = cm[block[4]];
264 pixels[5] = cm[block[5]];
265 pixels[6] = cm[block[6]];
266 pixels[7] = cm[block[7]];
273 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
277 UINT8 *cm = cropTbl + MAX_NEG_CROP;
279 /* read the pixels */
281 pixels[0] = cm[pixels[0] + block[0]];
282 pixels[1] = cm[pixels[1] + block[1]];
283 pixels[2] = cm[pixels[2] + block[2]];
284 pixels[3] = cm[pixels[3] + block[3]];
285 pixels[4] = cm[pixels[4] + block[4]];
286 pixels[5] = cm[pixels[5] + block[5]];
287 pixels[6] = cm[pixels[6] + block[6]];
288 pixels[7] = cm[pixels[7] + block[7]];
/* 64-bit variant of the PIXOP2 macro family: generates put/avg pixel-copy
 * and half-pel interpolation primitives operating on 8 pixels per uint64_t
 * load (LD64).  The bit tricks compute a per-byte average without unpacking:
 * (a|b) - (((a^b)&0xFE..)>>1) rounds up, (a&b) + (((a^b)&0xFE..)>>1) rounds
 * down ("no_rnd").  The xy2 variants average 4 neighbours using 2-bit
 * low/high field splits (0x03.. / 0xFC.. masks).
 * NOTE(review): this listing fragment is missing interior lines (function
 * braces, loop headers, pointer advances); code kept verbatim. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
OP(*((uint64_t*)block), LD64(pixels));\
static void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+line_size);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+line_size);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0202020202020202ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
for(i=0; i<h; i+=2){\
uint64_t a= LD64(pixels );\
uint64_t b= LD64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0202020202020202ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
static void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0101010101010101ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
for(i=0; i<h; i+=2){\
uint64_t a= LD64(pixels );\
uint64_t b= LD64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0101010101010101ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
CALL_2X_PIXELS(OPNAME ## _pixels16 , OPNAME ## _pixels , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels_xy2, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels_xy2, 8)\
void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
OPNAME ## _pixels_x2,\
OPNAME ## _pixels_y2,\
OPNAME ## _pixels_xy2},\
OPNAME ## _pixels16,\
OPNAME ## _pixels16_x2,\
OPNAME ## _pixels16_y2,\
OPNAME ## _pixels16_xy2}\
void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
OPNAME ## _no_rnd_pixels_x2,\
OPNAME ## _no_rnd_pixels_y2,\
OPNAME ## _no_rnd_pixels_xy2},\
OPNAME ## _pixels16,\
OPNAME ## _no_rnd_pixels16_x2,\
OPNAME ## _no_rnd_pixels16_y2,\
OPNAME ## _no_rnd_pixels16_xy2}\

/* 64-bit per-byte average with round-up */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
/* 32-bit variant of PIXOP2: same primitives as the 64-bit branch above but
 * built on uint32_t loads (LD32), processing 4 pixels per operation and two
 * operations per 8-pixel row.  The _l2 helpers average two source rows; the
 * _l4 helpers average four sources using the 2-bit low/high split trick
 * (0x03030303 / 0xFCFCFCFC masks).  "no_rnd" variants round down.
 * NOTE(review): this listing fragment is missing interior lines (loop
 * headers, braces, pointer advances); code kept verbatim. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint32_t*)(block )), LD32(pixels ));\
OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
static inline void OPNAME ## _no_rnd_pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8(block, pixels, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
uint32_t a, b, c, d, l0, l1, h0, h1;\
a= LD32(&src1[i*src_stride1]);\
b= LD32(&src2[i*src_stride2]);\
c= LD32(&src3[i*src_stride3]);\
d= LD32(&src4[i*src_stride4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
c= LD32(&src3[i*src_stride3+4]);\
d= LD32(&src4[i*src_stride4+4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
uint32_t a, b, c, d, l0, l1, h0, h1;\
a= LD32(&src1[i*src_stride1]);\
b= LD32(&src2[i*src_stride2]);\
c= LD32(&src3[i*src_stride3]);\
d= LD32(&src4[i*src_stride4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
c= LD32(&src3[i*src_stride3+4]);\
d= LD32(&src4[i*src_stride4+4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
pixels+=4-line_size*(h+1);\
block +=4-line_size*h;\
static inline void OPNAME ## _no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
pixels+=4-line_size*(h+1);\
block +=4-line_size*h;\
CALL_2X_PIXELS(OPNAME ## _pixels16 , OPNAME ## _pixels8 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels8_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels8_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels8_xy2, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16 , OPNAME ## _pixels8 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels8_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels8_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels8_xy2, 8)\
void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
OPNAME ## _pixels16,\
OPNAME ## _pixels16_x2,\
OPNAME ## _pixels16_y2,\
OPNAME ## _pixels16_xy2},\
OPNAME ## _pixels8_x2,\
OPNAME ## _pixels8_y2,\
OPNAME ## _pixels8_xy2},\
void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
OPNAME ## _pixels16,\
OPNAME ## _no_rnd_pixels16_x2,\
OPNAME ## _no_rnd_pixels16_y2,\
OPNAME ## _no_rnd_pixels16_xy2},\
OPNAME ## _no_rnd_pixels8_x2,\
OPNAME ## _no_rnd_pixels8_y2,\
OPNAME ## _no_rnd_pixels8_xy2},\

/* 32-bit per-byte average with round-up, and plain store */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
#define op_put(a, b) a = b
/* FIXME this stuff could be removed as it's not really used anymore */
/* Legacy scalar PIXOP macro: generates put/avg/sub pixel loops one byte at
 * a time, with avg2/avg4 redefined between instantiations to switch between
 * rounding and non-rounding behaviour.
 * NOTE(review): this listing fragment is missing interior lines (loop
 * headers, braces, pointer advances, #undefs); code kept verbatim. */
#define PIXOP(BTYPE, OPNAME, OP, INCR) \
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
OP(p[0], avg2(pix[0], pix[1])); \
OP(p[1], avg2(pix[1], pix[2])); \
OP(p[2], avg2(pix[2], pix[3])); \
OP(p[3], avg2(pix[3], pix[4])); \
OP(p[4], avg2(pix[4], pix[5])); \
OP(p[5], avg2(pix[5], pix[6])); \
OP(p[6], avg2(pix[6], pix[7])); \
OP(p[7], avg2(pix[7], pix[8])); \
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
pix1 = pixels + line_size; \
OP(p[0], avg2(pix[0], pix1[0])); \
OP(p[1], avg2(pix[1], pix1[1])); \
OP(p[2], avg2(pix[2], pix1[2])); \
OP(p[3], avg2(pix[3], pix1[3])); \
OP(p[4], avg2(pix[4], pix1[4])); \
OP(p[5], avg2(pix[5], pix1[5])); \
OP(p[6], avg2(pix[6], pix1[6])); \
OP(p[7], avg2(pix[7], pix1[7])); \
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
pix1 = pixels + line_size; \
OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
OPNAME ## _pixels_x2, \
OPNAME ## _pixels_y2, \
OPNAME ## _pixels_xy2, \

/* rounding primitives */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
#define op_avg(a, b) a = avg2(a, b)
#define op_sub(a, b) a -= b
#define op_put(a, b) a = b
PIXOP(DCTELEM, sub, op_sub, 8)
PIXOP(uint8_t, avg, op_avg, line_size)
PIXOP(uint8_t, put, op_put, line_size)
/* not rounding primitives */
#define avg2(a,b) ((a+b)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
PIXOP(uint8_t, put_no_rnd, op_put, line_size)
/* motion estimation */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
870 static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
872 const int A=(16-x16)*(16-y16);
873 const int B=( x16)*(16-y16);
874 const int C=(16-x16)*( y16);
875 const int D=( x16)*( y16);
877 rounder= 128 - rounder;
881 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
882 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
883 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
884 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
885 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
886 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
887 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
888 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
894 static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
899 ST32(dst , LD32(src ));
900 ST32(dst+4 , LD32(src+4 ));
901 ST32(dst+8 , LD32(src+8 ));
902 ST32(dst+12, LD32(src+12));
909 static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
914 ST32(dst , LD32(src ));
915 ST32(dst+4 , LD32(src+4 ));
922 #define QPEL_MC(r, OPNAME, RND, OP) \
923 static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
924 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
928 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
929 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
930 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
931 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
932 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
933 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
934 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
935 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
941 static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
942 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
946 const int src0= src[0*srcStride];\
947 const int src1= src[1*srcStride];\
948 const int src2= src[2*srcStride];\
949 const int src3= src[3*srcStride];\
950 const int src4= src[4*srcStride];\
951 const int src5= src[5*srcStride];\
952 const int src6= src[6*srcStride];\
953 const int src7= src[7*srcStride];\
954 const int src8= src[8*srcStride];\
955 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
956 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
957 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
958 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
959 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
960 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
961 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
962 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
968 static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
969 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
973 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
974 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
975 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
976 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
977 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
978 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
979 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
980 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
981 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
982 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
983 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
984 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
985 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
986 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
987 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
988 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
994 static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
995 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
999 const int src0= src[0*srcStride];\
1000 const int src1= src[1*srcStride];\
1001 const int src2= src[2*srcStride];\
1002 const int src3= src[3*srcStride];\
1003 const int src4= src[4*srcStride];\
1004 const int src5= src[5*srcStride];\
1005 const int src6= src[6*srcStride];\
1006 const int src7= src[7*srcStride];\
1007 const int src8= src[8*srcStride];\
1008 const int src9= src[9*srcStride];\
1009 const int src10= src[10*srcStride];\
1010 const int src11= src[11*srcStride];\
1011 const int src12= src[12*srcStride];\
1012 const int src13= src[13*srcStride];\
1013 const int src14= src[14*srcStride];\
1014 const int src15= src[15*srcStride];\
1015 const int src16= src[16*srcStride];\
1016 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1017 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1018 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1019 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1020 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1021 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1022 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1023 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1024 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1025 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1026 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1027 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1028 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1029 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1030 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1031 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1037 static void OPNAME ## qpel8_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
1038 OPNAME ## pixels8(dst, src, stride, 8);\
1041 static void OPNAME ## qpel8_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
1043 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1044 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1047 static void OPNAME ## qpel8_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
1048 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1051 static void OPNAME ## qpel8_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
1053 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1054 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1057 static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
1060 copy_block9(full, src, 16, stride, 9);\
1061 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
1062 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1065 static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
1067 copy_block9(full, src, 16, stride, 9);\
1068 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16, 8);\
1071 static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
1074 copy_block9(full, src, 16, stride, 9);\
1075 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
1076 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1078 static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
1083 copy_block9(full, src, 16, stride, 9);\
1084 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1085 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
1086 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1087 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1089 static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
1094 copy_block9(full, src, 16, stride, 9);\
1095 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1096 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
1097 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1098 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1100 static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
1105 copy_block9(full, src, 16, stride, 9);\
1106 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1107 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
1108 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1109 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1111 static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
1116 copy_block9(full, src, 16, stride, 9);\
1117 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1118 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
1119 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1120 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1122 static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
1125 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1126 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1127 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1129 static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
1132 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1133 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1134 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1136 static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
1141 copy_block9(full, src, 16, stride, 9);\
1142 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1143 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
1144 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1145 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1147 static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
1152 copy_block9(full, src, 16, stride, 9);\
1153 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1154 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
1155 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1156 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1158 static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
1160 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1161 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8, 8);\
1163 static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
1164 OPNAME ## pixels16(dst, src, stride, 16);\
1167 static void OPNAME ## qpel16_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
1169 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1170 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1173 static void OPNAME ## qpel16_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
1174 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1177 static void OPNAME ## qpel16_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
1179 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1180 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1183 static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
1186 copy_block17(full, src, 24, stride, 17);\
1187 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
1188 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1191 static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
1193 copy_block17(full, src, 24, stride, 17);\
1194 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
1197 static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
1200 copy_block17(full, src, 24, stride, 17);\
1201 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
1202 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1204 static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
1209 copy_block17(full, src, 24, stride, 17);\
1210 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1211 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1212 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1213 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1215 static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
1220 copy_block17(full, src, 24, stride, 17);\
1221 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1222 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1223 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1224 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1226 static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
1231 copy_block17(full, src, 24, stride, 17);\
1232 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1233 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1234 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1235 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1237 static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
1242 copy_block17(full, src, 24, stride, 17);\
1243 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1244 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1245 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1246 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1248 static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
1251 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1252 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1253 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1255 static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
1258 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1259 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1260 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1262 static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
1267 copy_block17(full, src, 24, stride, 17);\
1268 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1269 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1270 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1271 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1273 static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
1278 copy_block17(full, src, 24, stride, 17);\
1279 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1280 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1281 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1282 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1284 static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
1286 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1287 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
1289 qpel_mc_func OPNAME ## qpel_pixels_tab[2][16]={ \
1291 OPNAME ## qpel16_mc00_c, \
1292 OPNAME ## qpel16_mc10_c, \
1293 OPNAME ## qpel16_mc20_c, \
1294 OPNAME ## qpel16_mc30_c, \
1295 OPNAME ## qpel16_mc01_c, \
1296 OPNAME ## qpel16_mc11_c, \
1297 OPNAME ## qpel16_mc21_c, \
1298 OPNAME ## qpel16_mc31_c, \
1299 OPNAME ## qpel16_mc02_c, \
1300 OPNAME ## qpel16_mc12_c, \
1301 OPNAME ## qpel16_mc22_c, \
1302 OPNAME ## qpel16_mc32_c, \
1303 OPNAME ## qpel16_mc03_c, \
1304 OPNAME ## qpel16_mc13_c, \
1305 OPNAME ## qpel16_mc23_c, \
1306 OPNAME ## qpel16_mc33_c, \
1308 OPNAME ## qpel8_mc00_c, \
1309 OPNAME ## qpel8_mc10_c, \
1310 OPNAME ## qpel8_mc20_c, \
1311 OPNAME ## qpel8_mc30_c, \
1312 OPNAME ## qpel8_mc01_c, \
1313 OPNAME ## qpel8_mc11_c, \
1314 OPNAME ## qpel8_mc21_c, \
1315 OPNAME ## qpel8_mc31_c, \
1316 OPNAME ## qpel8_mc02_c, \
1317 OPNAME ## qpel8_mc12_c, \
1318 OPNAME ## qpel8_mc22_c, \
1319 OPNAME ## qpel8_mc32_c, \
1320 OPNAME ## qpel8_mc03_c, \
1321 OPNAME ## qpel8_mc13_c, \
1322 OPNAME ## qpel8_mc23_c, \
1323 OPNAME ## qpel8_mc33_c, \
/*
 * Output macros plugged into QPEL_MC as its OP argument.
 * (b) is a lowpass filter sum scaled by 32; ">>5" renormalizes it and
 * cm[] (cropTbl + MAX_NEG_CROP, see the macro bodies above) clamps the
 * result to 0..255.  "+16" rounds to nearest; "+15" is the no-rounding
 * variant.  op_avg additionally averages with the existing dst pixel
 * (rounding up via the final "+1").
 */
1327 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1328 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1329 #define op_put(a, b) a = cm[((b) + 16)>>5]
1330 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the quarter-pel MC function families (qpel8/qpel16 mcXY,
 * plus the put_/avg_ qpel_pixels_tab tables) for each rounding mode.
 * NOTE(review): the avg_no_rnd variant is deliberately commented out. */
1332 QPEL_MC(0, put_ , _ , op_put)
1333 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1334 QPEL_MC(0, avg_ , _ , op_avg)
1335 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* The op_* helpers are only needed by the QPEL_MC expansions above. */
1337 #undef op_avg_no_rnd
1339 #undef op_put_no_rnd
/*
 * Sum of absolute differences (SAD) of a 16x16 block: compares pix1
 * against pix2, both advanced by line_size per row, and returns the sum.
 * NOTE(review): the accumulator declaration, the row loop, the pointer
 * advances and the final "return s;" are elided from this excerpt —
 * confirm against the full file (presumably 16 rows).
 */
1341 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
    /* one row, fully unrolled over the 16 columns */
1347 s += abs(pix1[0] - pix2[0]);
1348 s += abs(pix1[1] - pix2[1]);
1349 s += abs(pix1[2] - pix2[2]);
1350 s += abs(pix1[3] - pix2[3]);
1351 s += abs(pix1[4] - pix2[4]);
1352 s += abs(pix1[5] - pix2[5]);
1353 s += abs(pix1[6] - pix2[6]);
1354 s += abs(pix1[7] - pix2[7]);
1355 s += abs(pix1[8] - pix2[8]);
1356 s += abs(pix1[9] - pix2[9]);
1357 s += abs(pix1[10] - pix2[10]);
1358 s += abs(pix1[11] - pix2[11]);
1359 s += abs(pix1[12] - pix2[12]);
1360 s += abs(pix1[13] - pix2[13]);
1361 s += abs(pix1[14] - pix2[14]);
1362 s += abs(pix1[15] - pix2[15]);
/*
 * 16x16 SAD against a horizontally half-pel interpolated reference:
 * each reference sample is avg2() of two horizontal neighbors of pix2
 * (note the read of pix2[16], one past the 16th column).
 * NOTE(review): avg2 is defined elsewhere in the file — presumably a
 * rounding average ((a+b+1)>>1); loop scaffolding and the return are
 * elided from this excerpt.
 */
1369 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1375 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1376 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1377 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1378 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1379 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1380 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1381 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1382 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1383 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1384 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1385 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1386 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1387 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1388 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1389 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1390 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/*
 * 16x16 SAD against a vertically half-pel interpolated reference:
 * each reference sample is avg2() of vertically adjacent rows
 * (pix3 = pix2 + line_size is the next reference row).
 * NOTE(review): loop scaffolding, pointer advances and the return are
 * elided from this excerpt.
 */
1397 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1400 UINT8 *pix3 = pix2 + line_size;
1404 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1405 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1406 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1407 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1408 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1409 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1410 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1411 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1412 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1413 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1414 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1415 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1416 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1417 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1418 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1419 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/*
 * 16x16 SAD against a diagonally (both axes) half-pel interpolated
 * reference: each reference sample is avg4() of a 2x2 neighborhood
 * spanning rows pix2/pix3 and columns x/x+1 (reads pix2[16]/pix3[16]).
 * NOTE(review): avg4 is defined elsewhere — presumably a rounding
 * 4-sample average; loop scaffolding and the return are elided here.
 */
1427 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1430 UINT8 *pix3 = pix2 + line_size;
1434 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1435 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1436 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1437 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1438 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1439 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1440 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1441 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1442 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1443 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1444 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1445 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1446 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1447 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1448 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1449 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/*
 * 8x8 SAD: same as pix_abs16x16_c but over 8 columns.
 * NOTE(review): loop scaffolding and "return s;" are elided from this
 * excerpt — presumably 8 rows; confirm against the full file.
 */
1457 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
    /* one row, unrolled over the 8 columns */
1463 s += abs(pix1[0] - pix2[0]);
1464 s += abs(pix1[1] - pix2[1]);
1465 s += abs(pix1[2] - pix2[2]);
1466 s += abs(pix1[3] - pix2[3]);
1467 s += abs(pix1[4] - pix2[4]);
1468 s += abs(pix1[5] - pix2[5]);
1469 s += abs(pix1[6] - pix2[6]);
1470 s += abs(pix1[7] - pix2[7]);
/*
 * 8x8 SAD against a horizontally half-pel interpolated reference
 * (avg2 of horizontal neighbors; reads pix2[8], one past column 7).
 * NOTE(review): loop scaffolding and the return are elided here.
 */
1477 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1483 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1484 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1485 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1486 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1487 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1488 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1489 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1490 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/*
 * 8x8 SAD against a vertically half-pel interpolated reference
 * (avg2 of rows pix2 and pix3 = pix2 + line_size).
 * NOTE(review): loop scaffolding and the return are elided here.
 */
1497 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1500 UINT8 *pix3 = pix2 + line_size;
1504 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1505 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1506 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1507 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1508 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1509 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1510 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1511 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/*
 * 8x8 SAD against a diagonally half-pel interpolated reference
 * (avg4 of a 2x2 neighborhood; reads pix2[8]/pix3[8]).
 * NOTE(review): loop scaffolding and the return are elided here.
 */
1519 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1522 UINT8 *pix3 = pix2 + line_size;
1526 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1527 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1528 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1529 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1530 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1531 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1532 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1533 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1541 /* permute the block so that it corresponds to the MMX idct
1544 /* general permutation, but perhaps slightly slower */
/*
 * Generic in-place permutation of a 64-entry (8x8) coefficient block:
 * scatter through a temporary so that block[] ends up ordered per
 * block_permute_op().
 * NOTE(review): the declarations of i and temp[64] plus the function
 * braces are elided from this excerpt.
 */
1545 void block_permute(INT16 *block)
1550 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1552 for(i=0; i<64; i++) block[i] = temp[i];
/* Alternate hand-unrolled block_permute — presumably the other branch
 * of a preprocessor conditional (the #if/#else guards and the swap body
 * using tmp1..tmp6 are not visible in this excerpt; confirm in the full
 * file). */
1556 void block_permute(INT16 *block)
1558 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
/* Portable C implementation behind the clear_blocks pointer: zeroes the
 * six 8x8 DCT coefficient blocks of a macroblock (6*64 DCTELEMs). */
1579 void clear_blocks_c(DCTELEM *blocks)
1581 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1584 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* Generic IDCT + put wrapper: transforms block, then writes the clamped
 * samples to dest with stride line_size.
 * NOTE(review): the preceding ff_idct(block) call line is elided from
 * this excerpt — confirm against the full file. */
1586 void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
1589 put_pixels_clamped(block, dest, line_size);
/* Generic IDCT + add wrapper: transforms block, then adds the clamped
 * samples into dest with stride line_size.
 * NOTE(review): the preceding ff_idct(block) call line is elided from
 * this excerpt — confirm against the full file. */
1592 void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
1595 add_pixels_clamped(block, dest, line_size);
/*
 * One-time initialization of the DSP lookup tables and function
 * pointers: fills cropTbl and squareTbl, installs the portable C
 * implementations, lets architecture-specific initializers override
 * them, selects the IDCT wrappers, and builds the coefficient
 * permutation and scan tables.
 * NOTE(review): many lines (declarations, #ifdef guards, loop braces)
 * are elided from this excerpt — confirm structure in the full file.
 */
1598 void dsputil_init(void)
1601 int use_permuted_idct;
    /* cropTbl[MAX_NEG_CROP + x] behaves as clamp(x, 0, 255):
       identity for 0..255, saturating to 255 above (and, per the elided
       line, presumably 0 below) */
1603 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1604 for(i=0;i<MAX_NEG_CROP;i++) {
1606 cropTbl[i + MAX_NEG_CROP + 256] = 255;
    /* squareTbl[256 + d] == d*d for d in -256..255 (used e.g. by get_psnr) */
1609 for(i=0;i<512;i++) {
1610 squareTbl[i] = (i - 256) * (i - 256);
    /* default (portable C) implementations for every function pointer */
1616 ff_idct = j_rev_dct;
1618 get_pixels = get_pixels_c;
1619 diff_pixels = diff_pixels_c;
1620 put_pixels_clamped = put_pixels_clamped_c;
1621 add_pixels_clamped = add_pixels_clamped_c;
1623 clear_blocks= clear_blocks_c;
1625 pix_norm1= pix_norm1_c;
1627 pix_abs16x16 = pix_abs16x16_c;
1628 pix_abs16x16_x2 = pix_abs16x16_x2_c;
1629 pix_abs16x16_y2 = pix_abs16x16_y2_c;
1630 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1631 pix_abs8x8 = pix_abs8x8_c;
1632 pix_abs8x8_x2 = pix_abs8x8_x2_c;
1633 pix_abs8x8_y2 = pix_abs8x8_y2_c;
1634 pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1636 use_permuted_idct = 1;
    /* architecture-specific overrides (the #ifdef guards are elided
       here); the mlib and alpha paths disable the permuted idct */
1642 dsputil_init_armv4l();
1645 dsputil_init_mlib();
1646 use_permuted_idct = 0;
1649 dsputil_init_alpha();
1650 use_permuted_idct = 0;
    /* no arch-specific idct installed: fall back to simple_idct, which
       needs no coefficient permutation */
1657 if (ff_idct == NULL) {
1658 ff_idct_put = simple_idct_put;
1659 ff_idct_add = simple_idct_add;
1660 use_permuted_idct=0;
1663 if(ff_idct != NULL) {
1664 ff_idct_put = gen_idct_put;
1665 ff_idct_add = gen_idct_add;
    /* build the 8x8 coefficient permutation matching the selected idct */
1668 if(use_permuted_idct)
1670 for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1672 for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
    /* identity permutation when no reordering is needed */
1675 for(i=0; i<64; i++) permutation[i]=i;
    /* inverse zigzag (1-based) and an unpermuted copy of the scan order */
1677 for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1678 for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1680 if (use_permuted_idct) {
1681 /* permute for IDCT */
    /* rewrite the scan tables and default quant matrices in the idct's
       permuted coefficient order */
1683 j = zigzag_direct[i];
1684 zigzag_direct[i] = block_permute_op(j);
1685 j = ff_alternate_horizontal_scan[i];
1686 ff_alternate_horizontal_scan[i] = block_permute_op(j);
1687 j = ff_alternate_vertical_scan[i];
1688 ff_alternate_vertical_scan[i] = block_permute_op(j);
1690 block_permute(ff_mpeg1_default_intra_matrix);
1691 block_permute(ff_mpeg1_default_non_intra_matrix);
1692 block_permute(ff_mpeg4_default_intra_matrix);
1693 block_permute(ff_mpeg4_default_non_intra_matrix);
1699 /* remove any non bit exact operation (testing purpose) */
/* Replace approximate (non-bit-exact) optimized routines with exact
 * ones, for regression testing.
 * NOTE(review): the MMX guard (#ifdef) around this call is elided from
 * this excerpt. */
1700 void avcodec_set_bit_exact(void)
1704 dsputil_set_bit_exact_mmx();
1708 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1709 int orig_linesize[3], int coded_linesize,
1710 AVCodecContext *avctx)
1712 int quad, diff, x, y;
1713 UINT8 *orig, *coded;
1714 UINT32 *sq = squareTbl + 256;
1720 orig = orig_image[0];
1721 coded = coded_image[0];
1723 for (y=0;y<avctx->height;y++) {
1724 for (x=0;x<avctx->width;x++) {
1725 diff = *(orig + x) - *(coded + x);
1728 orig += orig_linesize[0];
1729 coded += coded_linesize;
1732 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1734 if (avctx->psnr_y) {
1735 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1736 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1738 avctx->psnr_y = 99.99;