3 * Copyright (c) 2000, 2001 Fabrice Bellard.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
23 #include "simple_idct.h"
/* Function pointers for the DSP primitives. dsputil_init() points each at
 * either the C reference implementation in this file or an arch-specific
 * (e.g. MMX) version. */
25 void (*ff_idct)(DCTELEM *block);
26 void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
27 void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
28 void (*av_fdct)(DCTELEM *block);
29 void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
30 void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
31 void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
32 void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
33 void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
34 void (*clear_blocks)(DCTELEM *blocks);
35 int (*pix_sum)(UINT8 * pix, int line_size);
36 int (*pix_norm1)(UINT8 * pix, int line_size);
/* Sum-of-absolute-differences comparators used by motion estimation, for
 * full-pel and the three half-pel interpolation positions. */
38 op_pixels_abs_func pix_abs16x16;
39 op_pixels_abs_func pix_abs16x16_x2;
40 op_pixels_abs_func pix_abs16x16_y2;
41 op_pixels_abs_func pix_abs16x16_xy2;
43 op_pixels_abs_func pix_abs8x8;
44 op_pixels_abs_func pix_abs8x8_x2;
45 op_pixels_abs_func pix_abs8x8_y2;
46 op_pixels_abs_func pix_abs8x8_xy2;
/* cropTbl: clamp-to-[0,255] lookup table, padded by MAX_NEG_CROP on both
 * sides; squareTbl: i*i table indexed with an offset of 256 (both filled
 * in dsputil_init). */
48 UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
49 UINT32 squareTbl[512];
/* Default quantization matrices; defined in the codec-specific files. */
51 extern INT16 ff_mpeg1_default_intra_matrix[64];
52 extern INT16 ff_mpeg1_default_non_intra_matrix[64];
53 extern INT16 ff_mpeg4_default_intra_matrix[64];
54 extern INT16 ff_mpeg4_default_non_intra_matrix[64];
56 UINT8 zigzag_direct[64] = {
57 0, 1, 8, 16, 9, 2, 3, 10,
58 17, 24, 32, 25, 18, 11, 4, 5,
59 12, 19, 26, 33, 40, 48, 41, 34,
60 27, 20, 13, 6, 7, 14, 21, 28,
61 35, 42, 49, 56, 57, 50, 43, 36,
62 29, 22, 15, 23, 30, 37, 44, 51,
63 58, 59, 52, 45, 38, 31, 39, 46,
64 53, 60, 61, 54, 47, 55, 62, 63
67 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
68 UINT16 __align8 inv_zigzag_direct16[64];
70 /* not permutated zigzag_direct for MMX quantizer */
71 UINT8 zigzag_direct_noperm[64];
/* NOTE(review): both tables are presumably filled during dsputil_init();
 * the initialization code is not visible in this chunk — confirm. */
73 UINT8 ff_alternate_horizontal_scan[64] = {
74 0, 1, 2, 3, 8, 9, 16, 17,
75 10, 11, 4, 5, 6, 7, 15, 14,
76 13, 12, 19, 18, 24, 25, 32, 33,
77 26, 27, 20, 21, 22, 23, 28, 29,
78 30, 31, 34, 35, 40, 41, 48, 49,
79 42, 43, 36, 37, 38, 39, 44, 45,
80 46, 47, 50, 51, 56, 57, 58, 59,
81 52, 53, 54, 55, 60, 61, 62, 63,
84 UINT8 ff_alternate_vertical_scan[64] = {
85 0, 8, 16, 24, 1, 9, 2, 10,
86 17, 25, 32, 40, 48, 56, 57, 49,
87 41, 33, 26, 18, 3, 11, 4, 12,
88 19, 27, 34, 42, 50, 58, 35, 43,
89 51, 59, 20, 28, 5, 13, 6, 14,
90 21, 29, 36, 44, 52, 60, 37, 45,
91 53, 61, 22, 30, 7, 15, 23, 31,
92 38, 46, 54, 62, 39, 47, 55, 63,
97 /* Input permutation for the simple_idct_mmx */
98 static UINT8 simple_mmx_permutation[64]={
99 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
100 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
101 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
102 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
103 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
104 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
105 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
106 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
110 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
111 UINT32 inverse[256]={
112 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
113 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
114 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
115 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
116 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
117 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
118 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
119 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
120 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
121 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
122 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
123 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
124 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
125 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
126 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
127 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
128 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
129 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
130 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
131 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
132 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
133 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
134 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
135 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
136 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
137 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
138 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
139 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
140 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
141 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
142 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
143 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
146 /* used to skip zeros at the end */
/* zigzag_end[i] = highest raster index reachable within scan positions
 * 0..i, plus one; filled by build_zigzag_end() below. */
147 UINT8 zigzag_end[64];
/* NOTE(review): coefficient permutation table — its initialization is not
 * visible in this chunk; confirm against dsputil_init(). */
149 UINT8 permutation[64];
150 //UINT8 invPermutation[64];
152 static void build_zigzag_end(void)
155 int lastIndexAfterPerm=0;
156 for(lastIndex=0; lastIndex<64; lastIndex++)
158 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
159 lastIndexAfterPerm= zigzag_direct[lastIndex];
160 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
164 int pix_sum_c(UINT8 * pix, int line_size)
169 for (i = 0; i < 16; i++) {
170 for (j = 0; j < 16; j += 8) {
181 pix += line_size - 16;
186 int pix_norm1_c(UINT8 * pix, int line_size)
189 UINT32 *sq = squareTbl + 256;
192 for (i = 0; i < 16; i++) {
193 for (j = 0; j < 16; j += 8) {
204 pix += line_size - 16;
210 void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
214 /* read the pixels */
216 block[0] = pixels[0];
217 block[1] = pixels[1];
218 block[2] = pixels[2];
219 block[3] = pixels[3];
220 block[4] = pixels[4];
221 block[5] = pixels[5];
222 block[6] = pixels[6];
223 block[7] = pixels[7];
229 void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
233 /* read the pixels */
235 block[0] = s1[0] - s2[0];
236 block[1] = s1[1] - s2[1];
237 block[2] = s1[2] - s2[2];
238 block[3] = s1[3] - s2[3];
239 block[4] = s1[4] - s2[4];
240 block[5] = s1[5] - s2[5];
241 block[6] = s1[6] - s2[6];
242 block[7] = s1[7] - s2[7];
250 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
254 UINT8 *cm = cropTbl + MAX_NEG_CROP;
256 /* read the pixels */
258 pixels[0] = cm[block[0]];
259 pixels[1] = cm[block[1]];
260 pixels[2] = cm[block[2]];
261 pixels[3] = cm[block[3]];
262 pixels[4] = cm[block[4]];
263 pixels[5] = cm[block[5]];
264 pixels[6] = cm[block[6]];
265 pixels[7] = cm[block[7]];
272 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
276 UINT8 *cm = cropTbl + MAX_NEG_CROP;
278 /* read the pixels */
280 pixels[0] = cm[pixels[0] + block[0]];
281 pixels[1] = cm[pixels[1] + block[1]];
282 pixels[2] = cm[pixels[2] + block[2]];
283 pixels[3] = cm[pixels[3] + block[3]];
284 pixels[4] = cm[pixels[4] + block[4]];
285 pixels[5] = cm[pixels[5] + block[5]];
286 pixels[6] = cm[pixels[6] + block[6]];
287 pixels[7] = cm[pixels[7] + block[7]];
/* 64-bit variant of the PIXOP2 macro family: generates put/avg pixel
 * copy, half-pel x, y, and xy interpolation routines that process 8
 * pixels per iteration with 64-bit word-parallel arithmetic. The _x2/_y2
 * forms average two loads via the (a&b)+(((a^b)&~LSB)>>1) (no-round) or
 * (a|b)-(((a^b)&~LSB)>>1) (rounding) identities; the _xy2 forms split
 * each byte into low 2 bits (l0/l1) and high 6 bits (h0/h1) to average
 * four neighbours without inter-byte carries.
 * NOTE(review): this listing is truncated — braces, loop headers and
 * pointer-advance lines of the macro are missing; kept byte-identical,
 * only this comment added. Restore from upstream before compiling. */
294 #define PIXOP2(OPNAME, OP) \
295 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
299 OP(*((uint64_t*)block), LD64(pixels));\
305 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
309 const uint64_t a= LD64(pixels  );\
310 const uint64_t b= LD64(pixels+1);\
311 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
317 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
321 const uint64_t a= LD64(pixels  );\
322 const uint64_t b= LD64(pixels+1);\
323 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
329 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
333 const uint64_t a= LD64(pixels          );\
334 const uint64_t b= LD64(pixels+line_size);\
335 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
341 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
345 const uint64_t a= LD64(pixels          );\
346 const uint64_t b= LD64(pixels+line_size);\
347 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
353 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
356 const uint64_t a= LD64(pixels  );\
357 const uint64_t b= LD64(pixels+1);\
358 uint64_t l0=  (a&0x0303030303030303ULL)\
359 + (b&0x0303030303030303ULL)\
360 + 0x0202020202020202ULL;\
361 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
362 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
366 for(i=0; i<h; i+=2){\
367 uint64_t a= LD64(pixels  );\
368 uint64_t b= LD64(pixels+1);\
369 l1=  (a&0x0303030303030303ULL)\
370 + (b&0x0303030303030303ULL);\
371 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
372 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
373 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
378 l0=  (a&0x0303030303030303ULL)\
379 + (b&0x0303030303030303ULL)\
380 + 0x0202020202020202ULL;\
381 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
382 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
383 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
389 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
392 const uint64_t a= LD64(pixels  );\
393 const uint64_t b= LD64(pixels+1);\
394 uint64_t l0=  (a&0x0303030303030303ULL)\
395 + (b&0x0303030303030303ULL)\
396 + 0x0101010101010101ULL;\
397 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
398 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
402 for(i=0; i<h; i+=2){\
403 uint64_t a= LD64(pixels  );\
404 uint64_t b= LD64(pixels+1);\
405 l1=  (a&0x0303030303030303ULL)\
406 + (b&0x0303030303030303ULL);\
407 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
408 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
409 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
414 l0=  (a&0x0303030303030303ULL)\
415 + (b&0x0303030303030303ULL)\
416 + 0x0101010101010101ULL;\
417 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
418 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
419 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
425 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
427 OPNAME ## _pixels_x2,\
428 OPNAME ## _pixels_y2,\
429 OPNAME ## _pixels_xy2,\
432 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
434 OPNAME ## _no_rnd_pixels_x2,\
435 OPNAME ## _no_rnd_pixels_y2,\
436 OPNAME ## _no_rnd_pixels_xy2,\
439 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
440 #else // 64 bit variant
/* 32-bit variant of the PIXOP2 macro family: same word-parallel
 * averaging identities as the 64-bit version above, but processing 4
 * pixels per 32-bit load (two loads per 8-pixel row in the plain copy).
 * NOTE(review): this listing is truncated — braces, loop headers and
 * several continuation lines of the macro are missing; kept
 * byte-identical, only this comment added. Restore from upstream before
 * compiling. */
442 #define PIXOP2(OPNAME, OP) \
443 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
447 OP(*((uint32_t*)(block  )), LD32(pixels  ));\
448 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
454 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
460 const uint32_t a= LD32(pixels  );\
461 const uint32_t b= LD32(pixels+1);\
462 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
466 pixels+=line_size-8;\
467 block +=line_size-8;\
471 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
477 const uint32_t a= LD32(pixels  );\
478 const uint32_t b= LD32(pixels+1);\
479 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
483 pixels+=line_size-8;\
484 block +=line_size-8;\
488 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
494 const uint32_t a= LD32(pixels          );\
495 const uint32_t b= LD32(pixels+line_size);\
496 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
500 pixels+=line_size-8;\
501 block +=line_size-8;\
505 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
511 const uint32_t a= LD32(pixels          );\
512 const uint32_t b= LD32(pixels+line_size);\
513 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
517 pixels+=line_size-8;\
518 block +=line_size-8;\
522 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
527 const uint32_t a= LD32(pixels  );\
528 const uint32_t b= LD32(pixels+1);\
529 uint32_t l0=  (a&0x03030303UL)\
532 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
533 + ((b&0xFCFCFCFCUL)>>2);\
537 for(i=0; i<h; i+=2){\
538 uint32_t a= LD32(pixels  );\
539 uint32_t b= LD32(pixels+1);\
540 l1=  (a&0x03030303UL)\
542 h1= ((a&0xFCFCFCFCUL)>>2)\
543 + ((b&0xFCFCFCFCUL)>>2);\
544 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
549 l0=  (a&0x03030303UL)\
552 h0= ((a&0xFCFCFCFCUL)>>2)\
553 + ((b&0xFCFCFCFCUL)>>2);\
554 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
558 pixels+=4-line_size*(h+1);\
559 block +=4-line_size*h;\
563 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
568 const uint32_t a= LD32(pixels  );\
569 const uint32_t b= LD32(pixels+1);\
570 uint32_t l0=  (a&0x03030303UL)\
573 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
574 + ((b&0xFCFCFCFCUL)>>2);\
578 for(i=0; i<h; i+=2){\
579 uint32_t a= LD32(pixels  );\
580 uint32_t b= LD32(pixels+1);\
581 l1=  (a&0x03030303UL)\
583 h1= ((a&0xFCFCFCFCUL)>>2)\
584 + ((b&0xFCFCFCFCUL)>>2);\
585 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
590 l0=  (a&0x03030303UL)\
593 h0= ((a&0xFCFCFCFCUL)>>2)\
594 + ((b&0xFCFCFCFCUL)>>2);\
595 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
599 pixels+=4-line_size*(h+1);\
600 block +=4-line_size*h;\
604 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
606 OPNAME ## _pixels_x2,\
607 OPNAME ## _pixels_y2,\
608 OPNAME ## _pixels_xy2,\
611 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
613 OPNAME ## _no_rnd_pixels_x2,\
614 OPNAME ## _no_rnd_pixels_y2,\
615 OPNAME ## _no_rnd_pixels_xy2,\
617 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
619 #define op_put(a, b) a = b
/* Legacy byte-at-a-time PIXOP macro: generates _pixels, _pixels_x2,
 * _pixels_y2 and _pixels_xy2 for a given element type (BTYPE) and
 * per-element operation (OP), plus a 4-entry dispatch table. Instantiated
 * below for sub/avg/put and their no-round variants by redefining
 * avg2/avg4 between instantiations.
 * NOTE(review): this listing is truncated — braces, loop headers and
 * pointer advances of the macro are missing; kept byte-identical, only
 * comments added. Restore from upstream before compiling. */
627 /* FIXME this stuff could be removed as its ot really used anymore */
628 #define PIXOP(BTYPE, OPNAME, OP, INCR) \
630 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
651 static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
659 OP(p[0], avg2(pix[0], pix[1])); \
660 OP(p[1], avg2(pix[1], pix[2])); \
661 OP(p[2], avg2(pix[2], pix[3])); \
662 OP(p[3], avg2(pix[3], pix[4])); \
663 OP(p[4], avg2(pix[4], pix[5])); \
664 OP(p[5], avg2(pix[5], pix[6])); \
665 OP(p[6], avg2(pix[6], pix[7])); \
666 OP(p[7], avg2(pix[7], pix[8])); \
672 static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
680 pix1 = pixels + line_size; \
682 OP(p[0], avg2(pix[0], pix1[0])); \
683 OP(p[1], avg2(pix[1], pix1[1])); \
684 OP(p[2], avg2(pix[2], pix1[2])); \
685 OP(p[3], avg2(pix[3], pix1[3])); \
686 OP(p[4], avg2(pix[4], pix1[4])); \
687 OP(p[5], avg2(pix[5], pix1[5])); \
688 OP(p[6], avg2(pix[6], pix1[6])); \
689 OP(p[7], avg2(pix[7], pix1[7])); \
696 static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
704 pix1 = pixels + line_size; \
706 OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
707 OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
708 OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
709 OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
710 OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
711 OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
712 OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
713 OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
720 void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
722 OPNAME ## _pixels_x2, \
723 OPNAME ## _pixels_y2, \
724 OPNAME ## _pixels_xy2, \
727 /* rounding primitives */
728 #define avg2(a,b) ((a+b+1)>>1)
729 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
731 #define op_avg(a, b) a = avg2(a, b)
732 #define op_sub(a, b) a -= b
733 #define op_put(a, b) a = b
735 PIXOP(DCTELEM, sub, op_sub, 8)
736 PIXOP(uint8_t, avg, op_avg, line_size)
737 PIXOP(uint8_t, put, op_put, line_size)
739 /* not rounding primitives */
742 #define avg2(a,b) ((a+b)>>1)
743 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
745 PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
746 PIXOP(uint8_t, put_no_rnd, op_put, line_size)
747 /* motion estimation */
753 #define avg2(a,b) ((a+b+1)>>1)
754 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
756 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
758 const int A=(16-x16)*(16-y16);
759 const int B=( x16)*(16-y16);
760 const int C=(16-x16)*( y16);
761 const int D=( x16)*( y16);
763 rounder= 128 - rounder;
767 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
768 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
769 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
770 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
771 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
772 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
773 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
774 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
780 static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
782 UINT8 *cm = cropTbl + MAX_NEG_CROP;
786 dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
787 dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
788 dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
789 dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
790 dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
791 dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
792 dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
793 dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
799 static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
801 UINT8 *cm = cropTbl + MAX_NEG_CROP;
805 const int src0= src[0*srcStride];
806 const int src1= src[1*srcStride];
807 const int src2= src[2*srcStride];
808 const int src3= src[3*srcStride];
809 const int src4= src[4*srcStride];
810 const int src5= src[5*srcStride];
811 const int src6= src[6*srcStride];
812 const int src7= src[7*srcStride];
813 const int src8= src[8*srcStride];
814 dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
815 dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
816 dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
817 dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
818 dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
819 dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
820 dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
821 dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
827 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
845 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
850 dst[0]= (src1[0] + src2[0] + r)>>1;
851 dst[1]= (src1[1] + src2[1] + r)>>1;
852 dst[2]= (src1[2] + src2[2] + r)>>1;
853 dst[3]= (src1[3] + src2[3] + r)>>1;
854 dst[4]= (src1[4] + src2[4] + r)>>1;
855 dst[5]= (src1[5] + src2[5] + r)>>1;
856 dst[6]= (src1[6] + src2[6] + r)>>1;
857 dst[7]= (src1[7] + src2[7] + r)>>1;
864 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
869 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
870 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
871 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
872 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
873 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
874 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
875 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
876 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
/* QPEL_MC(r, name): generates the 16 quarter-pel motion-compensation
 * functions (one per mcXY sub-pixel position, X/Y in 0..3) plus their
 * dispatch table. Horizontal and vertical half-pel planes are produced
 * by qpel_h_lowpass/qpel_v_lowpass into 8-wide temporaries and combined
 * with avg2_block/avg4_block; r selects rounding vs no-rounding variants.
 * NOTE(review): this listing is truncated — braces and the local
 * UINT8 half[]/halfH[]/halfV[]/halfHV[] declarations are missing; kept
 * byte-identical, only this comment added. Restore from upstream before
 * compiling. */
885 #define QPEL_MC(r, name) \
886 static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
888 put_block(dst, src, dstStride, srcStride);\
891 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
894 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
895 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
898 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
900 qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
903 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
906 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
907 avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
910 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
913 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
914 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
917 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
919 qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
922 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
925 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
926 avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
928 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
933 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
934 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
935 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
936 avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
938 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
943 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
944 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
945 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
946 avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
948 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
953 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
954 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
955 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
956 avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
958 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
963 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
964 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
965 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
966 avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
968 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
972 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
973 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
974 avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
976 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
980 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
981 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
982 avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
984 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
989 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
990 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
991 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
992 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
994 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
999 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
1000 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
1001 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
1002 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
1004 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
1007 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
1008 qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
1010 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
1011 qpel_mc00_c ## name, \
1012 qpel_mc10_c ## name, \
1013 qpel_mc20_c ## name, \
1014 qpel_mc30_c ## name, \
1015 qpel_mc01_c ## name, \
1016 qpel_mc11_c ## name, \
1017 qpel_mc21_c ## name, \
1018 qpel_mc31_c ## name, \
1019 qpel_mc02_c ## name, \
1020 qpel_mc12_c ## name, \
1021 qpel_mc22_c ## name, \
1022 qpel_mc32_c ## name, \
1023 qpel_mc03_c ## name, \
1024 qpel_mc13_c ## name, \
1025 qpel_mc23_c ## name, \
1026 qpel_mc33_c ## name, \
1032 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1038 s += abs(pix1[0] - pix2[0]);
1039 s += abs(pix1[1] - pix2[1]);
1040 s += abs(pix1[2] - pix2[2]);
1041 s += abs(pix1[3] - pix2[3]);
1042 s += abs(pix1[4] - pix2[4]);
1043 s += abs(pix1[5] - pix2[5]);
1044 s += abs(pix1[6] - pix2[6]);
1045 s += abs(pix1[7] - pix2[7]);
1046 s += abs(pix1[8] - pix2[8]);
1047 s += abs(pix1[9] - pix2[9]);
1048 s += abs(pix1[10] - pix2[10]);
1049 s += abs(pix1[11] - pix2[11]);
1050 s += abs(pix1[12] - pix2[12]);
1051 s += abs(pix1[13] - pix2[13]);
1052 s += abs(pix1[14] - pix2[14]);
1053 s += abs(pix1[15] - pix2[15]);
1060 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1066 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1067 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1068 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1069 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1070 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1071 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1072 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1073 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1074 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1075 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1076 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1077 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1078 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1079 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1080 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1081 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1088 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1091 UINT8 *pix3 = pix2 + line_size;
1095 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1096 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1097 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1098 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1099 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1100 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1101 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1102 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1103 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1104 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1105 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1106 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1107 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1108 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1109 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1110 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1118 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1121 UINT8 *pix3 = pix2 + line_size;
1125 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1126 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1127 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1128 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1129 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1130 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1131 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1132 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1133 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1134 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1135 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1136 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1137 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1138 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1139 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1140 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1148 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1154 s += abs(pix1[0] - pix2[0]);
1155 s += abs(pix1[1] - pix2[1]);
1156 s += abs(pix1[2] - pix2[2]);
1157 s += abs(pix1[3] - pix2[3]);
1158 s += abs(pix1[4] - pix2[4]);
1159 s += abs(pix1[5] - pix2[5]);
1160 s += abs(pix1[6] - pix2[6]);
1161 s += abs(pix1[7] - pix2[7]);
1168 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1174 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1175 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1176 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1177 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1178 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1179 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1180 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1181 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1188 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1191 UINT8 *pix3 = pix2 + line_size;
1195 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1196 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1197 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1198 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1199 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1200 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1201 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1202 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1210 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1213 UINT8 *pix3 = pix2 + line_size;
1217 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1218 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1219 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1220 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1221 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1222 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1223 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1224 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1232 /* permute the block so that it corresponds to the MMX idct
1235 /* general permutation, but perhaps slightly slower */
1236 void block_permute(INT16 *block)
1241 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1243 for(i=0; i<64; i++) block[i] = temp[i];
/*
 * NOTE(review): second block_permute variant — presumably the
 * alternate branch of an #ifdef selecting between this and the general
 * version above. Its swap body (orig. lines 1250-1268) and the guard
 * itself are missing from this listing; left byte-identical rather
 * than guessed at.
 */
1247 void block_permute(INT16 *block)
1249 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1270 void clear_blocks_c(DCTELEM *blocks)
1272 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1275 /* XXX: those functions should be suppressed ASAP when all IDCTs are
1277 void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
1280 put_pixels_clamped(block, dest, line_size);
1283 void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
1286 add_pixels_clamped(block, dest, line_size);
/*
 * Install the portable C implementations into the function pointers
 * declared at the top of this file, let platform-specific init hooks
 * (armv4l/mlib/alpha/altivec, each presumably inside its own #ifdef —
 * elided here) override them, then build the scan/permutation tables
 * used by the possibly-permuted IDCT.
 *
 * NOTE(review): this listing is elided (gaps in the embedded original
 * line numbers; braces, loop headers and #ifdef guards are missing).
 * Code lines are kept byte-identical; only comments were added.
 */
1289 void dsputil_init(void)
1292 int use_permuted_idct;
/* cropTbl: identity for 0..255, clamped to 0/255 for MAX_NEG_CROP
   under- and overshoot on either side */
1294 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1295 for(i=0;i<MAX_NEG_CROP;i++) {
1297 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* squareTbl[i] = (i-256)^2; consumers index it via squareTbl + 256
   (see get_psnr) so negative differences work directly */
1300 for(i=0;i<512;i++) {
1301 squareTbl[i] = (i - 256) * (i - 256);
/* default (portable C) implementations */
1307 ff_idct = j_rev_dct;
1309 get_pixels = get_pixels_c;
1310 diff_pixels = diff_pixels_c;
1311 put_pixels_clamped = put_pixels_clamped_c;
1312 add_pixels_clamped = add_pixels_clamped_c;
1314 clear_blocks= clear_blocks_c;
1316 pix_norm1= pix_norm1_c;
1318 pix_abs16x16 = pix_abs16x16_c;
1319 pix_abs16x16_x2 = pix_abs16x16_x2_c;
1320 pix_abs16x16_y2 = pix_abs16x16_y2_c;
1321 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1322 pix_abs8x8 = pix_abs8x8_c;
1323 pix_abs8x8_x2 = pix_abs8x8_x2_c;
1324 pix_abs8x8_y2 = pix_abs8x8_y2_c;
1325 pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1326 av_fdct = fdct_ifast;
1328 use_permuted_idct = 1;
/* platform-specific overrides; mlib and alpha disable the permuted
   IDCT path after installing their own routines */
1334 dsputil_init_armv4l();
1337 dsputil_init_mlib();
1338 use_permuted_idct = 0;
1341 dsputil_init_alpha();
1342 use_permuted_idct = 0;
1345 dsputil_init_altivec();
/* no IDCT installed at all -> fall back to the simple IDCT, which
   needs no coefficient permutation */
1349 if (ff_idct == NULL) {
1350 ff_idct_put = simple_idct_put;
1351 ff_idct_add = simple_idct_add;
1352 use_permuted_idct=0;
/* wrap a raw IDCT in the generic put/add adapters */
1355 if(ff_idct != NULL) {
1356 ff_idct_put = gen_idct_put;
1357 ff_idct_add = gen_idct_add;
/* select the coefficient permutation matching the chosen IDCT */
1360 if(use_permuted_idct)
1362 for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
/* keep the row bits (3..5) and rotate the low three (column) bits:
   bits 2,1 -> 1,0 and bit 0 -> 2 */
1364 for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* identity permutation otherwise */
1367 for(i=0; i<64; i++) permutation[i]=i;
/* stored as i+1, presumably so 0 can mean "no coefficient" — TODO
   confirm against the consumers of inv_zigzag_direct16 */
1369 for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1370 for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
/* rewrite the scan tables and default quant matrices in permuted
   order so they match the permuted IDCT's coefficient layout */
1372 if (use_permuted_idct) {
1373 /* permute for IDCT */
1375 j = zigzag_direct[i];
1376 zigzag_direct[i] = block_permute_op(j);
1377 j = ff_alternate_horizontal_scan[i];
1378 ff_alternate_horizontal_scan[i] = block_permute_op(j);
1379 j = ff_alternate_vertical_scan[i];
1380 ff_alternate_vertical_scan[i] = block_permute_op(j);
1382 block_permute(ff_mpeg1_default_intra_matrix);
1383 block_permute(ff_mpeg1_default_non_intra_matrix);
1384 block_permute(ff_mpeg4_default_intra_matrix);
1385 block_permute(ff_mpeg4_default_non_intra_matrix);
1391 /* remove any non bit exact operation (testing purpose) */
/*
 * NOTE(review): listing elided — the MMX call below is presumably
 * wrapped in #ifdef HAVE_MMX (orig. lines 1393-1394 missing); on other
 * platforms this function would then be a no-op.
 */
1392 void avcodec_set_bit_exact(void)
1395 dsputil_set_bit_exact_mmx();
1399 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1400 int orig_linesize[3], int coded_linesize,
1401 AVCodecContext *avctx)
1403 int quad, diff, x, y;
1404 UINT8 *orig, *coded;
1405 UINT32 *sq = squareTbl + 256;
1411 orig = orig_image[0];
1412 coded = coded_image[0];
1414 for (y=0;y<avctx->height;y++) {
1415 for (x=0;x<avctx->width;x++) {
1416 diff = *(orig + x) - *(coded + x);
1419 orig += orig_linesize[0];
1420 coded += coded_linesize;
1423 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1425 if (avctx->psnr_y) {
1426 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1427 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1429 avctx->psnr_y = 99.99;