3 * Copyright (c) 2000, 2001 Fabrice Bellard.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
23 #include "simple_idct.h"
/* Function pointers for the DSP primitives.  dsputil_init() points each
   one at either the C reference implementation in this file or an
   arch-optimized variant selected at runtime. */
25 void (*ff_idct)(DCTELEM *block);
26 void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
27 void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
28 void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
29 void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
30 void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
31 void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
32 void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
33 void (*clear_blocks)(DCTELEM *blocks);
34 int (*pix_sum)(UINT8 * pix, int line_size);
35 int (*pix_norm1)(UINT8 * pix, int line_size);
/* Sum-of-absolute-differences hooks for motion estimation: full-pel and
   the three half-pel variants (x, y, xy), for 16x16 and 8x8 blocks. */
37 op_pixels_abs_func pix_abs16x16;
38 op_pixels_abs_func pix_abs16x16_x2;
39 op_pixels_abs_func pix_abs16x16_y2;
40 op_pixels_abs_func pix_abs16x16_xy2;
42 op_pixels_abs_func pix_abs8x8;
43 op_pixels_abs_func pix_abs8x8_x2;
44 op_pixels_abs_func pix_abs8x8_y2;
45 op_pixels_abs_func pix_abs8x8_xy2;
/* cropTbl: clamp-to-[0,255] lookup table, indexed with a MAX_NEG_CROP
   offset so negative inputs are representable.
   squareTbl: squareTbl[i] == (i-256)*(i-256), filled in dsputil_init(). */
47 UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
48 UINT32 squareTbl[512];
/* Default quantization matrices; defined in the codec-specific files. */
50 extern INT16 ff_mpeg1_default_intra_matrix[64];
51 extern INT16 ff_mpeg1_default_non_intra_matrix[64];
52 extern INT16 ff_mpeg4_default_intra_matrix[64];
53 extern INT16 ff_mpeg4_default_non_intra_matrix[64];
/* Standard zig-zag scan order for 8x8 DCT coefficients: entry i is the
   raster (row*8+col) index of the i-th coefficient in scan order.
   NOTE(review): the closing brace line is elided in this view. */
55 UINT8 zigzag_direct[64] = {
56 0, 1, 8, 16, 9, 2, 3, 10,
57 17, 24, 32, 25, 18, 11, 4, 5,
58 12, 19, 26, 33, 40, 48, 41, 34,
59 27, 20, 13, 6, 7, 14, 21, 28,
60 35, 42, 49, 56, 57, 50, 43, 36,
61 29, 22, 15, 23, 30, 37, 44, 51,
62 58, 59, 52, 45, 38, 31, 39, 46,
63 53, 60, 61, 54, 47, 55, 62, 63
/* Scan tables in the layout expected by the MMX quantizer; both are
   filled at init time from zigzag_direct. */
66 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
67 UINT16 __align8 inv_zigzag_direct16[64];
69 /* not permutated zigzag_direct for MMX quantizer */
70 UINT8 zigzag_direct_noperm[64];
/* Alternate horizontal scan order for 8x8 coefficients (same encoding as
   zigzag_direct: entry i is the raster index of scan position i).
   NOTE(review): the closing brace line is elided in this view. */
72 UINT8 ff_alternate_horizontal_scan[64] = {
73 0, 1, 2, 3, 8, 9, 16, 17,
74 10, 11, 4, 5, 6, 7, 15, 14,
75 13, 12, 19, 18, 24, 25, 32, 33,
76 26, 27, 20, 21, 22, 23, 28, 29,
77 30, 31, 34, 35, 40, 41, 48, 49,
78 42, 43, 36, 37, 38, 39, 44, 45,
79 46, 47, 50, 51, 56, 57, 58, 59,
80 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order for 8x8 coefficients (transposed-style
   counterpart of the horizontal scan above).
   NOTE(review): the closing brace line is elided in this view. */
83 UINT8 ff_alternate_vertical_scan[64] = {
84 0, 8, 16, 24, 1, 9, 2, 10,
85 17, 25, 32, 40, 48, 56, 57, 49,
86 41, 33, 26, 18, 3, 11, 4, 12,
87 19, 27, 34, 42, 50, 58, 35, 43,
88 51, 59, 20, 28, 5, 13, 6, 14,
89 21, 29, 36, 44, 52, 60, 37, 45,
90 53, 61, 22, 30, 7, 15, 23, 31,
91 38, 46, 54, 62, 39, 47, 55, 63,
/* Coefficient reordering applied to blocks fed to simple_idct_mmx; each
   entry maps an input position to the position the MMX IDCT expects.
   NOTE(review): the closing brace line is elided in this view. */
96 /* Input permutation for the simple_idct_mmx */
97 static UINT8 simple_mmx_permutation[64]={
98 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
99 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
100 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
101 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
102 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
103 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
104 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
105 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Fixed-point reciprocal table: inverse[b] ~= 2^32 / b, so division can
   be replaced by (a * inverse[b]) >> 32 within the stated range.  Entry 0
   is unused (0) and entry 1 is saturated to 2^32-1.
   NOTE(review): the closing brace line is elided in this view. */
109 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
110 UINT32 inverse[256]={
111 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
112 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
113 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
114 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
115 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
116 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
117 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
118 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
119 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
120 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
121 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
122 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
123 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
124 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
125 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
126 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
127 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
128 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
129 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
130 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
131 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
132 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
133 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
134 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
135 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
136 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
137 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
138 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
139 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
140 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
141 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
142 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
/* zigzag_end[i]: number of coefficient slots needed to hold the first
   i+1 zig-zag coefficients (see build_zigzag_end() below).
   permutation[]: per-IDCT coefficient permutation table. */
145 /* used to skip zeros at the end */
146 UINT8 zigzag_end[64];
148 UINT8 permutation[64];
149 //UINT8 invPermutation[64];
/* Fill zigzag_end[]: for each scan position, the running maximum raster
   index seen so far plus one — i.e. how many coefficients must be kept
   to cover the first lastIndex+1 scan positions.
   NOTE(review): braces and the declaration of lastIndex are elided in
   this view; code kept verbatim. */
151 static void build_zigzag_end(void)
154 int lastIndexAfterPerm=0;
155 for(lastIndex=0; lastIndex<64; lastIndex++)
157 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
158 lastIndexAfterPerm= zigzag_direct[lastIndex];
159 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
/* Sum of all pixel values of a 16x16 block (C reference for pix_sum).
   NOTE(review): the accumulation statements and return are elided in
   this view; code kept verbatim. */
163 int pix_sum_c(UINT8 * pix, int line_size)
168 for (i = 0; i < 16; i++) {
169 for (j = 0; j < 16; j += 8) {
180 pix += line_size - 16;
/* Sum of squared pixel values of a 16x16 block, using the precomputed
   squareTbl (sq[v] == v*v for v in [-256,255]).
   NOTE(review): the accumulation statements and return are elided in
   this view; code kept verbatim. */
185 int pix_norm1_c(UINT8 * pix, int line_size)
188 UINT32 *sq = squareTbl + 256;
191 for (i = 0; i < 16; i++) {
192 for (j = 0; j < 16; j += 8) {
203 pix += line_size - 16;
/* Copy an 8-pixel row of unsigned bytes into a DCTELEM array (widening
   UINT8 -> DCTELEM); presumably repeated for 8 rows by a loop elided in
   this view. */
209 void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
213 /* read the pixels */
215 block[0] = pixels[0];
216 block[1] = pixels[1];
217 block[2] = pixels[2];
218 block[3] = pixels[3];
219 block[4] = pixels[4];
220 block[5] = pixels[5];
221 block[6] = pixels[6];
222 block[7] = pixels[7];
/* Per-pixel difference of two 8-pixel rows (s1 - s2) into a DCTELEM
   array; presumably repeated for 8 rows by a loop elided in this view. */
228 void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
232 /* read the pixels */
234 block[0] = s1[0] - s2[0];
235 block[1] = s1[1] - s2[1];
236 block[2] = s1[2] - s2[2];
237 block[3] = s1[3] - s2[3];
238 block[4] = s1[4] - s2[4];
239 block[5] = s1[5] - s2[5];
240 block[6] = s1[6] - s2[6];
241 block[7] = s1[7] - s2[7];
/* Store an 8-pixel row of DCT output, clamped to [0,255] via the cropTbl
   lookup; presumably repeated for 8 rows by a loop elided in this view. */
249 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
253 UINT8 *cm = cropTbl + MAX_NEG_CROP;
255 /* read the pixels */
257 pixels[0] = cm[block[0]];
258 pixels[1] = cm[block[1]];
259 pixels[2] = cm[block[2]];
260 pixels[3] = cm[block[3]];
261 pixels[4] = cm[block[4]];
262 pixels[5] = cm[block[5]];
263 pixels[6] = cm[block[6]];
264 pixels[7] = cm[block[7]];
/* Add an 8-pixel row of DCT output to existing pixels, clamping the sum
   to [0,255] via cropTbl; presumably repeated for 8 rows by a loop
   elided in this view. */
271 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
275 UINT8 *cm = cropTbl + MAX_NEG_CROP;
277 /* read the pixels */
279 pixels[0] = cm[pixels[0] + block[0]];
280 pixels[1] = cm[pixels[1] + block[1]];
281 pixels[2] = cm[pixels[2] + block[2]];
282 pixels[3] = cm[pixels[3] + block[3]];
283 pixels[4] = cm[pixels[4] + block[4]];
284 pixels[5] = cm[pixels[5] + block[5]];
285 pixels[6] = cm[pixels[6] + block[6]];
286 pixels[7] = cm[pixels[7] + block[7]];
/* PIXOP2(OPNAME, OP): generates put/avg copy and half-pel interpolation
   primitives plus their dispatch tables.  This is the 64-bit variant,
   processing 8 pixels per uint64_t with bit-parallel averaging:
     rounded   avg: (a|b) - (((a^b) & 0xFE..FE) >> 1)
     truncated avg: (a&b) + (((a^b) & 0xFE..FE) >> 1)
   The xy2 variants average 4 neighbors via split low-2-bit / high-6-bit
   accumulators (l*/h*) with a rounding bias of 2 (rounded) or 1
   (no_rnd) per pixel.
   NOTE(review): several continuation lines of this macro (braces, loop
   headers, pointer advances) are elided in this view; kept verbatim. */
293 #define PIXOP2(OPNAME, OP) \
294 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
298 OP(*((uint64_t*)block), LD64(pixels));\
304 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
308 const uint64_t a= LD64(pixels );\
309 const uint64_t b= LD64(pixels+1);\
310 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
316 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
320 const uint64_t a= LD64(pixels );\
321 const uint64_t b= LD64(pixels+1);\
322 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
328 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
332 const uint64_t a= LD64(pixels );\
333 const uint64_t b= LD64(pixels+line_size);\
334 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
340 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
344 const uint64_t a= LD64(pixels );\
345 const uint64_t b= LD64(pixels+line_size);\
346 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
352 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
355 const uint64_t a= LD64(pixels );\
356 const uint64_t b= LD64(pixels+1);\
357 uint64_t l0= (a&0x0303030303030303ULL)\
358 + (b&0x0303030303030303ULL)\
359 + 0x0202020202020202ULL;\
360 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
361 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
365 for(i=0; i<h; i+=2){\
366 uint64_t a= LD64(pixels );\
367 uint64_t b= LD64(pixels+1);\
368 l1= (a&0x0303030303030303ULL)\
369 + (b&0x0303030303030303ULL);\
370 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
371 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
372 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
377 l0= (a&0x0303030303030303ULL)\
378 + (b&0x0303030303030303ULL)\
379 + 0x0202020202020202ULL;\
380 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
381 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
382 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
388 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
391 const uint64_t a= LD64(pixels );\
392 const uint64_t b= LD64(pixels+1);\
393 uint64_t l0= (a&0x0303030303030303ULL)\
394 + (b&0x0303030303030303ULL)\
395 + 0x0101010101010101ULL;\
396 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
397 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
401 for(i=0; i<h; i+=2){\
402 uint64_t a= LD64(pixels );\
403 uint64_t b= LD64(pixels+1);\
404 l1= (a&0x0303030303030303ULL)\
405 + (b&0x0303030303030303ULL);\
406 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
407 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
408 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
413 l0= (a&0x0303030303030303ULL)\
414 + (b&0x0303030303030303ULL)\
415 + 0x0101010101010101ULL;\
416 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
417 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
418 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
424 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
426 OPNAME ## _pixels_x2,\
427 OPNAME ## _pixels_y2,\
428 OPNAME ## _pixels_xy2,\
431 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
433 OPNAME ## _no_rnd_pixels_x2,\
434 OPNAME ## _no_rnd_pixels_y2,\
435 OPNAME ## _no_rnd_pixels_xy2,\
438 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
439 #else // 64 bit variant
/* 32-bit variant of PIXOP2: same primitives as above but processing
   4 pixels per uint32_t (two loads per 8-pixel row in _pixels), with
   the same bit-parallel rounded/truncated averaging identities.
   NOTE(review): several continuation lines of this macro are elided in
   this view; kept verbatim. */
441 #define PIXOP2(OPNAME, OP) \
442 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
446 OP(*((uint32_t*)(block )), LD32(pixels ));\
447 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
453 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
459 const uint32_t a= LD32(pixels );\
460 const uint32_t b= LD32(pixels+1);\
461 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
465 pixels+=line_size-8;\
466 block +=line_size-8;\
470 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
476 const uint32_t a= LD32(pixels );\
477 const uint32_t b= LD32(pixels+1);\
478 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
482 pixels+=line_size-8;\
483 block +=line_size-8;\
487 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
493 const uint32_t a= LD32(pixels );\
494 const uint32_t b= LD32(pixels+line_size);\
495 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
499 pixels+=line_size-8;\
500 block +=line_size-8;\
504 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
510 const uint32_t a= LD32(pixels );\
511 const uint32_t b= LD32(pixels+line_size);\
512 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
516 pixels+=line_size-8;\
517 block +=line_size-8;\
521 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
526 const uint32_t a= LD32(pixels );\
527 const uint32_t b= LD32(pixels+1);\
528 uint32_t l0= (a&0x03030303UL)\
531 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
532 + ((b&0xFCFCFCFCUL)>>2);\
536 for(i=0; i<h; i+=2){\
537 uint32_t a= LD32(pixels );\
538 uint32_t b= LD32(pixels+1);\
539 l1= (a&0x03030303UL)\
541 h1= ((a&0xFCFCFCFCUL)>>2)\
542 + ((b&0xFCFCFCFCUL)>>2);\
543 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
548 l0= (a&0x03030303UL)\
551 h0= ((a&0xFCFCFCFCUL)>>2)\
552 + ((b&0xFCFCFCFCUL)>>2);\
553 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
557 pixels+=4-line_size*(h+1);\
558 block +=4-line_size*h;\
562 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
567 const uint32_t a= LD32(pixels );\
568 const uint32_t b= LD32(pixels+1);\
569 uint32_t l0= (a&0x03030303UL)\
572 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
573 + ((b&0xFCFCFCFCUL)>>2);\
577 for(i=0; i<h; i+=2){\
578 uint32_t a= LD32(pixels );\
579 uint32_t b= LD32(pixels+1);\
580 l1= (a&0x03030303UL)\
582 h1= ((a&0xFCFCFCFCUL)>>2)\
583 + ((b&0xFCFCFCFCUL)>>2);\
584 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
589 l0= (a&0x03030303UL)\
592 h0= ((a&0xFCFCFCFCUL)>>2)\
593 + ((b&0xFCFCFCFCUL)>>2);\
594 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
598 pixels+=4-line_size*(h+1);\
599 block +=4-line_size*h;\
603 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
605 OPNAME ## _pixels_x2,\
606 OPNAME ## _pixels_y2,\
607 OPNAME ## _pixels_xy2,\
610 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
612 OPNAME ## _no_rnd_pixels_x2,\
613 OPNAME ## _no_rnd_pixels_y2,\
614 OPNAME ## _no_rnd_pixels_xy2,\
616 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
618 #define op_put(a, b) a = b
/* Legacy per-pixel PIXOP generator: scalar (non-word-parallel) versions
   of the pixel copy/average primitives, instantiated for sub/avg/put
   and the no-rounding variants by redefining avg2/avg4 between the
   PIXOP() expansions below.
   NOTE(review): several lines (loop scaffolding, #undef lines) are
   elided in this view; kept verbatim. */
626 /* FIXME this stuff could be removed as it is not really used anymore */
627 #define PIXOP(BTYPE, OPNAME, OP, INCR) \
629 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
650 static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
658 OP(p[0], avg2(pix[0], pix[1])); \
659 OP(p[1], avg2(pix[1], pix[2])); \
660 OP(p[2], avg2(pix[2], pix[3])); \
661 OP(p[3], avg2(pix[3], pix[4])); \
662 OP(p[4], avg2(pix[4], pix[5])); \
663 OP(p[5], avg2(pix[5], pix[6])); \
664 OP(p[6], avg2(pix[6], pix[7])); \
665 OP(p[7], avg2(pix[7], pix[8])); \
671 static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
679 pix1 = pixels + line_size; \
681 OP(p[0], avg2(pix[0], pix1[0])); \
682 OP(p[1], avg2(pix[1], pix1[1])); \
683 OP(p[2], avg2(pix[2], pix1[2])); \
684 OP(p[3], avg2(pix[3], pix1[3])); \
685 OP(p[4], avg2(pix[4], pix1[4])); \
686 OP(p[5], avg2(pix[5], pix1[5])); \
687 OP(p[6], avg2(pix[6], pix1[6])); \
688 OP(p[7], avg2(pix[7], pix1[7])); \
695 static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
703 pix1 = pixels + line_size; \
705 OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
706 OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
707 OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
708 OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
709 OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
710 OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
711 OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
712 OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
719 void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
721 OPNAME ## _pixels_x2, \
722 OPNAME ## _pixels_y2, \
723 OPNAME ## _pixels_xy2, \
726 /* rounding primitives */
727 #define avg2(a,b) ((a+b+1)>>1)
728 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
730 #define op_avg(a, b) a = avg2(a, b)
731 #define op_sub(a, b) a -= b
732 #define op_put(a, b) a = b
734 PIXOP(DCTELEM, sub, op_sub, 8)
735 PIXOP(uint8_t, avg, op_avg, line_size)
736 PIXOP(uint8_t, put, op_put, line_size)
738 /* not rounding primitives */
741 #define avg2(a,b) ((a+b)>>1)
742 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
744 PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
745 PIXOP(uint8_t, put_no_rnd, op_put, line_size)
746 /* motion estimation */
752 #define avg2(a,b) ((a+b+1)>>1)
753 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* One-point global motion compensation (GMC): bilinear interpolation at
   1/16-pel precision.  Weights A..D sum to 256 (hence >>8); 'rounder'
   adjusts the rounding bias (128 - rounder is added before the shift).
   NOTE(review): the row loop and pointer advances are elided in this
   view; only one 8-pixel row of the body is shown. */
755 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
757 const int A=(16-x16)*(16-y16);
758 const int B=( x16)*(16-y16);
759 const int C=(16-x16)*( y16);
760 const int D=( x16)*( y16);
762 rounder= 128 - rounder;
766 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
767 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
768 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
769 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
770 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
771 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
772 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
773 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
/* Horizontal quarter-pel lowpass filter: symmetric taps (20,-6,3,-1),
   coefficient sum 32 (hence >>5), rounding constant r, result clamped
   via cropTbl.  Taps that would read past src[8] are mirrored back into
   range (dst[5..7]).
   NOTE(review): the row loop over h and pointer advances are elided in
   this view. */
779 static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
781 UINT8 *cm = cropTbl + MAX_NEG_CROP;
785 dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
786 dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
787 dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
788 dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
789 dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
790 dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
791 dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
792 dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
/* Vertical counterpart of qpel_h_lowpass: the same (20,-6,3,-1) filter
   applied down a column of 9 samples, with mirrored edge handling and
   cropTbl clamping.
   NOTE(review): the column loop over w and pointer advances are elided
   in this view. */
798 static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
800 UINT8 *cm = cropTbl + MAX_NEG_CROP;
804 const int src0= src[0*srcStride];
805 const int src1= src[1*srcStride];
806 const int src2= src[2*srcStride];
807 const int src3= src[3*srcStride];
808 const int src4= src[4*srcStride];
809 const int src5= src[5*srcStride];
810 const int src6= src[6*srcStride];
811 const int src7= src[7*srcStride];
812 const int src8= src[8*srcStride];
813 dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
814 dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
815 dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
816 dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
817 dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
818 dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
819 dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
820 dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
/* Straight 8x8 block copy from src to dst.
   NOTE(review): the entire body is elided in this view. */
826 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
/* Per-pixel average of two 8-wide blocks with rounding term r (r==1 for
   rounded, r==0 for truncated averaging).
   NOTE(review): the row loop and pointer advances are elided in this
   view. */
844 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
849 dst[0]= (src1[0] + src2[0] + r)>>1;
850 dst[1]= (src1[1] + src2[1] + r)>>1;
851 dst[2]= (src1[2] + src2[2] + r)>>1;
852 dst[3]= (src1[3] + src2[3] + r)>>1;
853 dst[4]= (src1[4] + src2[4] + r)>>1;
854 dst[5]= (src1[5] + src2[5] + r)>>1;
855 dst[6]= (src1[6] + src2[6] + r)>>1;
856 dst[7]= (src1[7] + src2[7] + r)>>1;
/* Per-pixel average of four 8-wide blocks with rounding term r (r==2
   for rounded, smaller for truncated averaging).
   NOTE(review): the row loop and pointer advances are elided in this
   view. */
863 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
868 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
869 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
870 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
871 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
872 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
873 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
874 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
875 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
/* QPEL_MC(r, name): generates the 16 quarter-pel motion-compensation
   functions qpel_mcXY (X = horizontal, Y = vertical quarter-pel phase)
   by combining the h/v lowpass filters with 2- or 4-way averaging, plus
   the dispatch table qpel_mc##name##_tab (entries appear in the order
   index = (my<<2)|mx — presumably; verify against callers).  The macro
   parameter r selects rounded (r==0) vs no-rounding (r==1) behavior via
   the 16-r filter bias and 1-r / 2-r averaging biases.
   NOTE(review): local buffer declarations (half/halfH/halfV/halfHV) and
   braces are elided in this view; kept verbatim. */
884 #define QPEL_MC(r, name) \
885 static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
887 put_block(dst, src, dstStride, srcStride);\
890 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
893 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
894 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
897 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
899 qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
902 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
905 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
906 avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
909 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
912 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
913 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
916 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
918 qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
921 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
924 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
925 avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
927 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
932 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
933 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
934 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
935 avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
937 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
942 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
943 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
944 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
945 avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
947 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
952 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
953 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
954 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
955 avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
957 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
962 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
963 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
964 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
965 avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
967 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
971 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
972 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
973 avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
975 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
979 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
980 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
981 avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
983 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
988 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
989 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
990 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
991 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
993 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
998 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
999 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
1000 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
1001 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
1003 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
1006 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
1007 qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
1009 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
1010 qpel_mc00_c ## name, \
1011 qpel_mc10_c ## name, \
1012 qpel_mc20_c ## name, \
1013 qpel_mc30_c ## name, \
1014 qpel_mc01_c ## name, \
1015 qpel_mc11_c ## name, \
1016 qpel_mc21_c ## name, \
1017 qpel_mc31_c ## name, \
1018 qpel_mc02_c ## name, \
1019 qpel_mc12_c ## name, \
1020 qpel_mc22_c ## name, \
1021 qpel_mc32_c ## name, \
1022 qpel_mc03_c ## name, \
1023 qpel_mc13_c ## name, \
1024 qpel_mc23_c ## name, \
1025 qpel_mc33_c ## name, \
/* Sum of absolute differences (SAD) of a 16-pixel row at full-pel
   alignment; the loop over 16 rows and the return are elided in this
   view. */
1031 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1037 s += abs(pix1[0] - pix2[0]);
1038 s += abs(pix1[1] - pix2[1]);
1039 s += abs(pix1[2] - pix2[2]);
1040 s += abs(pix1[3] - pix2[3]);
1041 s += abs(pix1[4] - pix2[4]);
1042 s += abs(pix1[5] - pix2[5]);
1043 s += abs(pix1[6] - pix2[6]);
1044 s += abs(pix1[7] - pix2[7]);
1045 s += abs(pix1[8] - pix2[8]);
1046 s += abs(pix1[9] - pix2[9]);
1047 s += abs(pix1[10] - pix2[10]);
1048 s += abs(pix1[11] - pix2[11]);
1049 s += abs(pix1[12] - pix2[12]);
1050 s += abs(pix1[13] - pix2[13]);
1051 s += abs(pix1[14] - pix2[14]);
1052 s += abs(pix1[15] - pix2[15]);
/* SAD of a 16-pixel row against the horizontally half-pel interpolated
   reference (avg2 of adjacent pixels); loop over rows and return are
   elided in this view. */
1059 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1065 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1066 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1067 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1068 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1069 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1070 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1071 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1072 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1073 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1074 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1075 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1076 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1077 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1078 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1079 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1080 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of a 16-pixel row against the vertically half-pel interpolated
   reference (avg2 of the row and the one below it); loop over rows and
   return are elided in this view. */
1087 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1090 UINT8 *pix3 = pix2 + line_size;
1094 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1095 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1096 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1097 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1098 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1099 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1100 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1101 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1102 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1103 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1104 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1105 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1106 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1107 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1108 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1109 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of a 16-pixel row against the diagonally half-pel interpolated
   reference (avg4 of the 2x2 neighborhood); loop over rows and return
   are elided in this view. */
1117 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1120 UINT8 *pix3 = pix2 + line_size;
1124 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1125 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1126 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1127 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1128 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1129 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1130 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1131 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1132 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1133 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1134 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1135 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1136 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1137 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1138 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1139 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8x8 full-pel SAD; loop over 8 rows and the return are elided in this
   view. */
1147 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1153 s += abs(pix1[0] - pix2[0]);
1154 s += abs(pix1[1] - pix2[1]);
1155 s += abs(pix1[2] - pix2[2]);
1156 s += abs(pix1[3] - pix2[3]);
1157 s += abs(pix1[4] - pix2[4]);
1158 s += abs(pix1[5] - pix2[5]);
1159 s += abs(pix1[6] - pix2[6]);
1160 s += abs(pix1[7] - pix2[7]);
/* 8x8 SAD against the horizontally half-pel interpolated reference;
   loop over rows and return are elided in this view. */
1167 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1173 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1174 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1175 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1176 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1177 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1178 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1179 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1180 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8x8 SAD against the vertically half-pel interpolated reference; loop
   over rows and return are elided in this view. */
1187 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1190 UINT8 *pix3 = pix2 + line_size;
1194 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1195 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1196 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1197 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1198 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1199 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1200 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1201 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1209 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1212 UINT8 *pix3 = pix2 + line_size;
1216 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1217 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1218 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1219 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1220 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1221 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1222 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1223 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1231 /* permute the block so that it corresponds to the MMX idct
1234 /* general permutation, but perhaps slightly slower */
1235 void block_permute(INT16 *block)
1240 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1242 for(i=0; i<64; i++) block[i] = temp[i];
/* NOTE(review): hand-written variant of block_permute (presumably selected
 * against the generic version above via conditional compilation — the guard
 * lines are not visible in this listing). Almost its entire body (original
 * lines 1249-1267) is missing here, so it is left untouched. */
1246 void block_permute(INT16 *block)
1248     int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1269 void clear_blocks_c(DCTELEM *blocks)
1271 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1274 /* XXX: those functions should be suppressed ASAP when all IDCTs are
1276 void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
1279 put_pixels_clamped(block, dest, line_size);
1282 void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
1285 add_pixels_clamped(block, dest, line_size);
/*
 * Initialize every dsputil function pointer to its C reference
 * implementation, build the clamp (cropTbl) and square (squareTbl) lookup
 * tables, let platform-specific init routines override the pointers, and
 * set up the coefficient-scan permutation tables.
 *
 * NOTE(review): this listing is gutted — loop variable declarations,
 * most #ifdef guards and closing braces between the numbered lines are
 * missing. Comments below state only what the visible lines establish;
 * do not edit this function without the complete file.
 */
1288 void dsputil_init(void)
1291     int use_permuted_idct;
/* cropTbl: identity over [0,255] shifted by MAX_NEG_CROP, clamped to 255
   above; the corresponding 0-clamp for the low side is presumably on a
   line missing from this listing. */
1293     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1294     for(i=0;i<MAX_NEG_CROP;i++) {
1296         cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* squareTbl[i] = (i-256)^2; consumers index it through a +256-biased
   pointer (see get_psnr below). */
1299     for(i=0;i<512;i++) {
1300         squareTbl[i] = (i - 256) * (i - 256);
/* Default scalar implementations; the per-architecture inits further down
   may replace any of these pointers. */
1306     ff_idct = j_rev_dct;
1308     get_pixels = get_pixels_c;
1309     diff_pixels = diff_pixels_c;
1310     put_pixels_clamped = put_pixels_clamped_c;
1311     add_pixels_clamped = add_pixels_clamped_c;
1313     clear_blocks= clear_blocks_c;
1315     pix_norm1= pix_norm1_c;
1317     pix_abs16x16 = pix_abs16x16_c;
1318     pix_abs16x16_x2 = pix_abs16x16_x2_c;
1319     pix_abs16x16_y2 = pix_abs16x16_y2_c;
1320     pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1321     pix_abs8x8 = pix_abs8x8_c;
1322     pix_abs8x8_x2 = pix_abs8x8_x2_c;
1323     pix_abs8x8_y2 = pix_abs8x8_y2_c;
1324     pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1326     use_permuted_idct = 1;
/* Per-architecture overrides. The #ifdef guards around these calls are
   not visible in this listing (only the CONFIG_DARWIN guard at line 1343
   survives); mlib and alpha visibly disable the permuted IDCT. */
1332     dsputil_init_armv4l();
1335     dsputil_init_mlib();
1336     use_permuted_idct = 0;
1339     dsputil_init_alpha();
1340     use_permuted_idct = 0;
1343 #ifdef CONFIG_DARWIN
1344     dsputil_init_altivec();
/* No IDCT chosen by the platform init: fall back to simple_idct, which
   needs no coefficient permutation. */
1349     if (ff_idct == NULL) {
1350         ff_idct_put = simple_idct_put;
1351         ff_idct_add = simple_idct_add;
1352         use_permuted_idct=0;
/* Otherwise wrap the chosen raw IDCT with the generic put/add glue. */
1355     if(ff_idct != NULL) {
1356         ff_idct_put = gen_idct_put;
1357         ff_idct_add = gen_idct_add;
/* Build the 8x8 coefficient permutation table: an MMX-style layout when
   the permuted IDCT is in use, identity otherwise. A conditional between
   lines 1360-1367 selecting simple_mmx_permutation vs the computed form
   is presumably missing from this listing. */
1360     if(use_permuted_idct)
1362         for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1364         for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1367         for(i=0; i<64; i++) permutation[i]=i;
/* Inverse zigzag table (1-based values) and an unpermuted copy of the
   zigzag scan, taken before any permutation below. */
1369     for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1370     for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1372     if (use_permuted_idct) {
1373         /* permute for IDCT */
1375             j = zigzag_direct[i];
1376             zigzag_direct[i] = block_permute_op(j);
1377             j = ff_alternate_horizontal_scan[i];
1378             ff_alternate_horizontal_scan[i] = block_permute_op(j);
1379             j = ff_alternate_vertical_scan[i];
1380             ff_alternate_vertical_scan[i] = block_permute_op(j);
/* Pre-permute the default quantization matrices so they line up with the
   permuted scan order. */
1382         block_permute(ff_mpeg1_default_intra_matrix);
1383         block_permute(ff_mpeg1_default_non_intra_matrix);
1384         block_permute(ff_mpeg4_default_intra_matrix);
1385         block_permute(ff_mpeg4_default_non_intra_matrix);
1391 /* remove any non bit exact operation (testing purpose) */
/*
 * Disable platform-optimized routines whose output is not bit-identical
 * to the C reference, so results are reproducible for regression testing.
 * Only the MMX hook is visible here; the #ifdef guard around it is
 * presumably on a line missing from this listing.
 */
1392 void avcodec_set_bit_exact(void)
1395     dsputil_set_bit_exact_mmx();
1399 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1400 int orig_linesize[3], int coded_linesize,
1401 AVCodecContext *avctx)
1403 int quad, diff, x, y;
1404 UINT8 *orig, *coded;
1405 UINT32 *sq = squareTbl + 256;
1411 orig = orig_image[0];
1412 coded = coded_image[0];
1414 for (y=0;y<avctx->height;y++) {
1415 for (x=0;x<avctx->width;x++) {
1416 diff = *(orig + x) - *(coded + x);
1419 orig += orig_linesize[0];
1420 coded += coded_linesize;
1423 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1425 if (avctx->psnr_y) {
1426 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1427 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1429 avctx->psnr_y = 99.99;