3 * Copyright (c) 2000, 2001 Fabrice Bellard.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
23 #include "simple_idct.h"
/* Platform-dispatched DSP function pointers; dsputil_init() points these at
   the C reference implementations in this file or (via the arch-specific
   init calls) at optimized versions. */
25 void (*ff_idct)(DCTELEM *block);
26 void (*av_fdct)(DCTELEM *block);
27 void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
28 void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
29 void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
30 void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
31 void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
32 void (*clear_blocks)(DCTELEM *blocks);
/* Sum-of-absolute-differences comparators for motion estimation; the
   _x2/_y2/_xy2 variants compare against half-pel interpolated references. */
34 op_pixels_abs_func pix_abs16x16;
35 op_pixels_abs_func pix_abs16x16_x2;
36 op_pixels_abs_func pix_abs16x16_y2;
37 op_pixels_abs_func pix_abs16x16_xy2;
39 op_pixels_abs_func pix_abs8x8;
40 op_pixels_abs_func pix_abs8x8_x2;
41 op_pixels_abs_func pix_abs8x8_y2;
42 op_pixels_abs_func pix_abs8x8_xy2;
/* cropTbl, indexed through (cropTbl + MAX_NEG_CROP), clamps values in
   [-MAX_NEG_CROP, 255 + MAX_NEG_CROP] to 0..255; squareTbl, indexed through
   (squareTbl + 256), gives (i)^2 for i in [-256, 255].  Both are filled in
   by dsputil_init(). */
44 UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
45 UINT32 squareTbl[512];
/* Default quantization matrices (defined elsewhere); permuted in place by
   dsputil_init() when a permuted IDCT is selected. */
47 extern INT16 default_intra_matrix[64];
48 extern INT16 default_non_intra_matrix[64];
49 extern INT16 ff_mpeg4_default_intra_matrix[64];
50 extern INT16 ff_mpeg4_default_non_intra_matrix[64];
/* Classic MPEG zig-zag scan: entry i is the raster-order coefficient index
   read at scan position i.  NOTE: dsputil_init() may permute this table in
   place for a permuted (MMX) IDCT. */
52 UINT8 zigzag_direct[64] = {
53 0, 1, 8, 16, 9, 2, 3, 10,
54 17, 24, 32, 25, 18, 11, 4, 5,
55 12, 19, 26, 33, 40, 48, 41, 34,
56 27, 20, 13, 6, 7, 14, 21, 28,
57 35, 42, 49, 56, 57, 50, 43, 36,
58 29, 22, 15, 23, 30, 37, 44, 51,
59 58, 59, 52, 45, 38, 31, 39, 46,
60 53, 60, 61, 54, 47, 55, 62, 63
63 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
64 UINT16 __align8 inv_zigzag_direct16[64];
66 /* not permutated zigzag_direct for MMX quantizer */
67 UINT8 zigzag_direct_noperm[64];
/* Alternate horizontal scan order (used for interlaced/alternate-scan
   coding); same index-per-scan-position layout as zigzag_direct, and also
   subject to in-place permutation by dsputil_init(). */
69 UINT8 ff_alternate_horizontal_scan[64] = {
70 0, 1, 2, 3, 8, 9, 16, 17,
71 10, 11, 4, 5, 6, 7, 15, 14,
72 13, 12, 19, 18, 24, 25, 32, 33,
73 26, 27, 20, 21, 22, 23, 28, 29,
74 30, 31, 34, 35, 40, 41, 48, 49,
75 42, 43, 36, 37, 38, 39, 44, 45,
76 46, 47, 50, 51, 56, 57, 58, 59,
77 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order (the column-first counterpart of the table
   above); also permuted in place by dsputil_init() when needed. */
80 UINT8 ff_alternate_vertical_scan[64] = {
81 0, 8, 16, 24, 1, 9, 2, 10,
82 17, 25, 32, 40, 48, 56, 57, 49,
83 41, 33, 26, 18, 3, 11, 4, 12,
84 19, 27, 34, 42, 50, 58, 35, 43,
85 51, 59, 20, 28, 5, 13, 6, 14,
86 21, 29, 36, 44, 52, 60, 37, 45,
87 53, 61, 22, 30, 7, 15, 23, 31,
88 38, 46, 54, 62, 39, 47, 55, 63,
93 /* Input permutation for the simple_idct_mmx */
/* Values are raster indices in hex (0x00..0x3F); copied verbatim into
   permutation[] by dsputil_init() when the permuted simple MMX IDCT
   ordering is in use. */
94 static UINT8 simple_mmx_permutation[64]={
95 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
96 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
97 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
98 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
99 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
100 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
101 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
102 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
106 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Fixed-point reciprocal table: inverse[b] ~= ceil(2^32 / b), letting a
   division be replaced by a multiply and a 32-bit right shift.  Entry 0 is
   unused (0) and entry 1 saturates at 2^32-1. */
107 UINT32 inverse[256]={
108 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
109 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
110 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
111 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
112 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
113 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
114 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
115 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
116 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
117 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
118 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
119 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
120 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
121 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
122 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
123 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
124 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
125 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
126 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
127 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
128 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
129 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
130 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
131 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
132 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
133 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
134 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
135 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
136 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
137 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
138 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
139 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
142 /* used to skip zeros at the end */
143 UINT8 zigzag_end[64];
145 UINT8 permutation[64];
146 //UINT8 invPermutation[64];
/* For each scan position i, zigzag_end[i] = 1 + the highest raster
   coefficient index seen among scan positions 0..i; callers can use it to
   bound how many coefficients of a block need processing.
   NOTE(review): the loop braces are not visible in this chunk. */
148 static void build_zigzag_end(void)
151 int lastIndexAfterPerm=0;
152 for(lastIndex=0; lastIndex<64; lastIndex++)
154 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
155 lastIndexAfterPerm= zigzag_direct[lastIndex];
156 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
/* Reference C implementation: copy a block of pixels into DCTELEMs.
   NOTE(review): the function body is not visible in this chunk. */
160 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
166 /* read the pixels */
/* Reference C implementation: per-pixel difference block = s1 - s2.
   One unrolled 8-pixel row is visible; the enclosing row loop and pointer
   advance by `stride` are not visible in this chunk. */
183 void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
187 /* read the pixels */
190 p[0] = s1[0] - s2[0];
191 p[1] = s1[1] - s2[1];
192 p[2] = s1[2] - s2[2];
193 p[3] = s1[3] - s2[3];
194 p[4] = s1[4] - s2[4];
195 p[5] = s1[5] - s2[5];
196 p[6] = s1[6] - s2[6];
197 p[7] = s1[7] - s2[7];
/* Reference C implementation: store DCTELEMs to pixels, clamped to 0..255
   via the crop table.  NOTE(review): body not visible in this chunk. */
205 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
210 UINT8 *cm = cropTbl + MAX_NEG_CROP;
212 /* read the pixels */
/* Reference C implementation: pixels += block, with the sum clamped to
   0..255 through cm (cropTbl offset by MAX_NEG_CROP so negative sums index
   the low margin).  One unrolled 8-pixel row is visible; the enclosing row
   loop is not visible in this chunk. */
229 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
234 UINT8 *cm = cropTbl + MAX_NEG_CROP;
236 /* read the pixels */
240 pix[0] = cm[pix[0] + p[0]];
241 pix[1] = cm[pix[1] + p[1]];
242 pix[2] = cm[pix[2] + p[2]];
243 pix[3] = cm[pix[3] + p[3]];
244 pix[4] = cm[pix[4] + p[4]];
245 pix[5] = cm[pix[5] + p[5]];
246 pix[6] = cm[pix[6] + p[6]];
247 pix[7] = cm[pix[7] + p[7]];
/* Unaligned 32/64-bit loads.  Under GCC the read goes through a packed
   struct so the compiler emits a safe unaligned access; the fallback is a
   plain dereference (alignment-sensitive on strict-alignment targets).
   NOTE(review): the #if/#else around the two variants is not visible in
   this chunk. */
255 struct unaligned_64 { uint64_t l; } __attribute__((packed));
256 struct unaligned_32 { uint32_t l; } __attribute__((packed));
258 #define LD32(a) (((const struct unaligned_32 *) (a))->l)
259 #define LD64(a) (((const struct unaligned_64 *) (a))->l)
263 #define LD32(a) (*((uint32_t*)(a)))
264 #define LD64(a) (*((uint64_t*)(a)))
266 #endif /* !__GNUC__ */
/* 64-bit-wide pixel put/avg primitives: PIXOP2(OPNAME, OP) expands the
   whole family OPNAME_pixels{,_x2,_y2,_xy2} plus _no_rnd variants and the
   two dispatch tables, handling 8 pixels per iteration in one uint64_t.
   Per-byte SIMD-in-a-register averaging:
     rounded:    (a|b) - (((a^b) & 0xFE..FE) >> 1)
     truncating: (a&b) + (((a^b) & 0xFE..FE) >> 1)
   The _xy2 (half-pel both axes) variants keep the low 2 bits (l0/l1) and
   high 6 bits (h0/h1) of each byte separate so the 4-sample sum cannot
   overflow a byte; the rounding constant is 0x02.. (rounded) or 0x01..
   (no_rnd) per byte.  Tables appear ordered [none, x2, y2, xy2].
   NOTE(review): several continuation lines of this macro are not visible
   in this chunk. */
270 #define PIXOP2(OPNAME, OP) \
271 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
275 OP(*((uint64_t*)block), LD64(pixels));\
281 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
285 const uint64_t a= LD64(pixels );\
286 const uint64_t b= LD64(pixels+1);\
287 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
293 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
297 const uint64_t a= LD64(pixels );\
298 const uint64_t b= LD64(pixels+1);\
299 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
305 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
309 const uint64_t a= LD64(pixels );\
310 const uint64_t b= LD64(pixels+line_size);\
311 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
317 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
321 const uint64_t a= LD64(pixels );\
322 const uint64_t b= LD64(pixels+line_size);\
323 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
329 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
332 const uint64_t a= LD64(pixels );\
333 const uint64_t b= LD64(pixels+1);\
334 uint64_t l0= (a&0x0303030303030303ULL)\
335 + (b&0x0303030303030303ULL)\
336 + 0x0202020202020202ULL;\
337 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
338 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
342 for(i=0; i<h; i+=2){\
343 uint64_t a= LD64(pixels );\
344 uint64_t b= LD64(pixels+1);\
345 l1= (a&0x0303030303030303ULL)\
346 + (b&0x0303030303030303ULL);\
347 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
348 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
349 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
354 l0= (a&0x0303030303030303ULL)\
355 + (b&0x0303030303030303ULL)\
356 + 0x0202020202020202ULL;\
357 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
358 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
359 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
365 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
368 const uint64_t a= LD64(pixels );\
369 const uint64_t b= LD64(pixels+1);\
370 uint64_t l0= (a&0x0303030303030303ULL)\
371 + (b&0x0303030303030303ULL)\
372 + 0x0101010101010101ULL;\
373 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
374 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
378 for(i=0; i<h; i+=2){\
379 uint64_t a= LD64(pixels );\
380 uint64_t b= LD64(pixels+1);\
381 l1= (a&0x0303030303030303ULL)\
382 + (b&0x0303030303030303ULL);\
383 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
384 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
385 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
390 l0= (a&0x0303030303030303ULL)\
391 + (b&0x0303030303030303ULL)\
392 + 0x0101010101010101ULL;\
393 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
394 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
395 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
401 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
403 OPNAME ## _pixels_x2,\
404 OPNAME ## _pixels_y2,\
405 OPNAME ## _pixels_xy2,\
408 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
410 OPNAME ## _no_rnd_pixels_x2,\
411 OPNAME ## _no_rnd_pixels_y2,\
412 OPNAME ## _no_rnd_pixels_xy2,\
415 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
416 #else // 64 bit variant
/* 32-bit variant of PIXOP2 (selected by the preprocessor branch above):
   same per-byte averaging tricks as the 64-bit version, but operating on
   4 pixels per uint32_t (the plain _pixels case does two 4-byte stores to
   cover 8 pixels).  The pointer adjustments `+= line_size - 8` / `+=
   4 - line_size*h` suggest an interleaved column/row walk whose loop
   headers are not visible in this chunk.
   NOTE(review): several continuation lines of this macro are missing. */
418 #define PIXOP2(OPNAME, OP) \
419 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
423 OP(*((uint32_t*)(block )), LD32(pixels ));\
424 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
430 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
436 const uint32_t a= LD32(pixels );\
437 const uint32_t b= LD32(pixels+1);\
438 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
442 pixels+=line_size-8;\
443 block +=line_size-8;\
447 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
453 const uint32_t a= LD32(pixels );\
454 const uint32_t b= LD32(pixels+1);\
455 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
459 pixels+=line_size-8;\
460 block +=line_size-8;\
464 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
470 const uint32_t a= LD32(pixels );\
471 const uint32_t b= LD32(pixels+line_size);\
472 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
476 pixels+=line_size-8;\
477 block +=line_size-8;\
481 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
487 const uint32_t a= LD32(pixels );\
488 const uint32_t b= LD32(pixels+line_size);\
489 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
493 pixels+=line_size-8;\
494 block +=line_size-8;\
498 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
503 const uint32_t a= LD32(pixels );\
504 const uint32_t b= LD32(pixels+1);\
505 uint32_t l0= (a&0x03030303UL)\
508 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
509 + ((b&0xFCFCFCFCUL)>>2);\
513 for(i=0; i<h; i+=2){\
514 uint32_t a= LD32(pixels );\
515 uint32_t b= LD32(pixels+1);\
516 l1= (a&0x03030303UL)\
518 h1= ((a&0xFCFCFCFCUL)>>2)\
519 + ((b&0xFCFCFCFCUL)>>2);\
520 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
525 l0= (a&0x03030303UL)\
528 h0= ((a&0xFCFCFCFCUL)>>2)\
529 + ((b&0xFCFCFCFCUL)>>2);\
530 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
534 pixels+=4-line_size*(h+1);\
535 block +=4-line_size*h;\
539 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
544 const uint32_t a= LD32(pixels );\
545 const uint32_t b= LD32(pixels+1);\
546 uint32_t l0= (a&0x03030303UL)\
549 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
550 + ((b&0xFCFCFCFCUL)>>2);\
554 for(i=0; i<h; i+=2){\
555 uint32_t a= LD32(pixels );\
556 uint32_t b= LD32(pixels+1);\
557 l1= (a&0x03030303UL)\
559 h1= ((a&0xFCFCFCFCUL)>>2)\
560 + ((b&0xFCFCFCFCUL)>>2);\
561 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
566 l0= (a&0x03030303UL)\
569 h0= ((a&0xFCFCFCFCUL)>>2)\
570 + ((b&0xFCFCFCFCUL)>>2);\
571 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
575 pixels+=4-line_size*(h+1);\
576 block +=4-line_size*h;\
580 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
582 OPNAME ## _pixels_x2,\
583 OPNAME ## _pixels_y2,\
584 OPNAME ## _pixels_xy2,\
587 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
589 OPNAME ## _no_rnd_pixels_x2,\
590 OPNAME ## _no_rnd_pixels_y2,\
591 OPNAME ## _no_rnd_pixels_xy2,\
593 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
596 #define op_put(a, b) a = b
604 /* FIXME this stuff could be removed as its ot really used anymore */
/* Legacy byte-at-a-time PIXOP family: expands OPNAME_pixels{,_x2,_y2,_xy2}
   over BTYPE destinations, one unrolled 8-pixel row per iteration, plus a
   dispatch table.  avg2/avg4 are #defined differently before each
   expansion to select rounded vs truncating averaging (see below).
   NOTE(review): loop headers and several continuation lines are not
   visible in this chunk. */
605 #define PIXOP(BTYPE, OPNAME, OP, INCR) \
607 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
628 static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
636 OP(p[0], avg2(pix[0], pix[1])); \
637 OP(p[1], avg2(pix[1], pix[2])); \
638 OP(p[2], avg2(pix[2], pix[3])); \
639 OP(p[3], avg2(pix[3], pix[4])); \
640 OP(p[4], avg2(pix[4], pix[5])); \
641 OP(p[5], avg2(pix[5], pix[6])); \
642 OP(p[6], avg2(pix[6], pix[7])); \
643 OP(p[7], avg2(pix[7], pix[8])); \
649 static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
657 pix1 = pixels + line_size; \
659 OP(p[0], avg2(pix[0], pix1[0])); \
660 OP(p[1], avg2(pix[1], pix1[1])); \
661 OP(p[2], avg2(pix[2], pix1[2])); \
662 OP(p[3], avg2(pix[3], pix1[3])); \
663 OP(p[4], avg2(pix[4], pix1[4])); \
664 OP(p[5], avg2(pix[5], pix1[5])); \
665 OP(p[6], avg2(pix[6], pix1[6])); \
666 OP(p[7], avg2(pix[7], pix1[7])); \
673 static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
681 pix1 = pixels + line_size; \
683 OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
684 OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
685 OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
686 OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
687 OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
688 OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
689 OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
690 OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
697 void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
699 OPNAME ## _pixels_x2, \
700 OPNAME ## _pixels_y2, \
701 OPNAME ## _pixels_xy2, \
704 /* rounding primitives */
705 #define avg2(a,b) ((a+b+1)>>1)
706 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
708 #define op_avg(a, b) a = avg2(a, b)
709 #define op_sub(a, b) a -= b
711 PIXOP(DCTELEM, sub, op_sub, 8)
713 /* not rounding primitives */
716 #define avg2(a,b) ((a+b)>>1)
717 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
719 /* motion estimation */
725 #define avg2(a,b) ((a+b+1)>>1)
726 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* gmc1_c: global motion compensation with one motion vector at 1/16-pel
   accuracy.  A..D are bilinear weights (they sum to 16*16 = 256) for the
   2x2 source neighborhood; each output pixel is the weighted sum plus a
   programmable rounding constant, shifted down by 8.  One unrolled 8-pixel
   row is visible; the enclosing h-row loop and the stride advances are not
   visible in this chunk. */
728 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
730 const int A=(16-x16)*(16-y16);
731 const int B=( x16)*(16-y16);
732 const int C=(16-x16)*( y16);
733 const int D=( x16)*( y16);
/* caller passes the raw rounder; it is flipped here so larger input means
   less rounding — NOTE(review): confirm intent against callers. */
735 rounder= 128 - rounder;
739 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
740 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
741 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
742 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
743 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
744 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
745 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
746 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
/* Horizontal lowpass filter for quarter-pel MC: taps (20, -6, 3, -1)/32
   applied symmetrically around each half-sample position, result clamped
   through the crop table; r is the rounding constant added before >>5.
   Near the right edge the outer taps reuse in-range samples (src[8] and
   mirrored indices) instead of reading past the row.  The enclosing h-row
   loop is not visible in this chunk. */
752 static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
754 UINT8 *cm = cropTbl + MAX_NEG_CROP;
758 dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
759 dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
760 dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
761 dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
762 dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
763 dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
764 dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
765 dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
/* Vertical counterpart of qpel_h_lowpass: same (20, -6, 3, -1)/32 taps and
   edge handling, applied down one column of 9 source samples per output
   column; w columns total (the enclosing column loop is not visible in
   this chunk). */
771 static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
773 UINT8 *cm = cropTbl + MAX_NEG_CROP;
777 const int src0= src[0*srcStride];
778 const int src1= src[1*srcStride];
779 const int src2= src[2*srcStride];
780 const int src3= src[3*srcStride];
781 const int src4= src[4*srcStride];
782 const int src5= src[5*srcStride];
783 const int src6= src[6*srcStride];
784 const int src7= src[7*srcStride];
785 const int src8= src[8*srcStride];
786 dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
787 dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
788 dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
789 dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
790 dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
791 dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
792 dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
793 dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
/* put_block: plain block copy from src to dst.
   NOTE(review): the body is not visible in this chunk. */
799 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
/* avg2_block: dst = per-pixel average of two source blocks; r=1 rounds up,
   r=0 truncates.  One unrolled 8-pixel row is visible; the enclosing row
   loop is not visible in this chunk. */
817 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
822 dst[0]= (src1[0] + src2[0] + r)>>1;
823 dst[1]= (src1[1] + src2[1] + r)>>1;
824 dst[2]= (src1[2] + src2[2] + r)>>1;
825 dst[3]= (src1[3] + src2[3] + r)>>1;
826 dst[4]= (src1[4] + src2[4] + r)>>1;
827 dst[5]= (src1[5] + src2[5] + r)>>1;
828 dst[6]= (src1[6] + src2[6] + r)>>1;
829 dst[7]= (src1[7] + src2[7] + r)>>1;
/* avg4_block: dst = per-pixel average of four source blocks with rounding
   constant r (callers pass 2-r).  Same visible-row-only caveat as above. */
836 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
841 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
842 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
843 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
844 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
845 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
846 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
847 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
848 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
/* QPEL_MC(r, name): generates the 16 quarter-pel motion compensation
   functions qpel_mcXY_c##name, where X/Y are the horizontal/vertical
   quarter-pel phases (0..3), plus the dispatch table qpel_mc##name##_tab.
   r selects rounding: the lowpass filters get constant 16-r, 2-way
   averages 1-r, 4-way averages 2-r, so r=0 gives the rounding variant and
   r=1 the no-rounding one.  Half-pel intermediates are built into local
   halfH/halfV/halfHV buffers (declarations not visible in this chunk) and
   combined with avg2_block/avg4_block.  The table appears ordered so that
   index = my*4 + mx — NOTE(review): confirm against callers. */
857 #define QPEL_MC(r, name) \
858 static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
860 put_block(dst, src, dstStride, srcStride);\
863 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
866 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
867 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
870 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
872 qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
875 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
878 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
879 avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
882 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
885 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
886 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
889 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
891 qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
894 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
897 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
898 avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
900 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
905 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
906 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
907 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
908 avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
910 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
915 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
916 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
917 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
918 avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
920 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
925 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
926 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
927 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
928 avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
930 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
935 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
936 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
937 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
938 avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
940 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
944 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
945 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
946 avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
948 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
952 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
953 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
954 avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
956 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
961 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
962 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
963 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
964 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
966 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
971 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
972 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
973 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
974 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
976 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
979 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
980 qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
982 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
983 qpel_mc00_c ## name, \
984 qpel_mc10_c ## name, \
985 qpel_mc20_c ## name, \
986 qpel_mc30_c ## name, \
987 qpel_mc01_c ## name, \
988 qpel_mc11_c ## name, \
989 qpel_mc21_c ## name, \
990 qpel_mc31_c ## name, \
991 qpel_mc02_c ## name, \
992 qpel_mc12_c ## name, \
993 qpel_mc22_c ## name, \
994 qpel_mc32_c ## name, \
995 qpel_mc03_c ## name, \
996 qpel_mc13_c ## name, \
997 qpel_mc23_c ## name, \
998 qpel_mc33_c ## name, \
/* SAD (sum of absolute differences) between two 16-wide blocks at full-pel
   alignment.  One unrolled 16-pixel row is visible; the 16-row loop,
   pointer advance by line_size, and `return s` are not visible in this
   chunk. */
1004 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1010 s += abs(pix1[0] - pix2[0]);
1011 s += abs(pix1[1] - pix2[1]);
1012 s += abs(pix1[2] - pix2[2]);
1013 s += abs(pix1[3] - pix2[3]);
1014 s += abs(pix1[4] - pix2[4]);
1015 s += abs(pix1[5] - pix2[5]);
1016 s += abs(pix1[6] - pix2[6]);
1017 s += abs(pix1[7] - pix2[7]);
1018 s += abs(pix1[8] - pix2[8]);
1019 s += abs(pix1[9] - pix2[9]);
1020 s += abs(pix1[10] - pix2[10]);
1021 s += abs(pix1[11] - pix2[11]);
1022 s += abs(pix1[12] - pix2[12]);
1023 s += abs(pix1[13] - pix2[13]);
1024 s += abs(pix1[14] - pix2[14]);
1025 s += abs(pix1[15] - pix2[15]);
/* SAD against the horizontally half-pel interpolated reference: pix2 is
   averaged with its right neighbor via avg2 before differencing. */
1032 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1038 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1039 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1040 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1041 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1042 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1043 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1044 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1045 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1046 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1047 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1048 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1049 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1050 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1051 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1052 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1053 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD against the vertically half-pel interpolated reference: each pix2
   sample is averaged with the sample one line below (pix3).  Row loop and
   return are not visible in this chunk. */
1060 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1063 UINT8 *pix3 = pix2 + line_size;
1067 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1068 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1069 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1070 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1071 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1072 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1073 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1074 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1075 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1076 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1077 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1078 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1079 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1080 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1081 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1082 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD against the half-pel-in-both-axes reference: 4-sample avg4 of the
   2x2 neighborhood (current row and the row below, current column and the
   column to the right). */
1090 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1093 UINT8 *pix3 = pix2 + line_size;
1097 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1098 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1099 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1100 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1101 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1102 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1103 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1104 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1105 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1106 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1107 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1108 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1109 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1110 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1111 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1112 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD variants, mirroring the 16x16 family above (full-pel, x half-
   pel, y half-pel, xy half-pel).  In each case one unrolled 8-pixel row is
   visible; the row loops and returns are not visible in this chunk. */
1120 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1126 s += abs(pix1[0] - pix2[0]);
1127 s += abs(pix1[1] - pix2[1]);
1128 s += abs(pix1[2] - pix2[2]);
1129 s += abs(pix1[3] - pix2[3]);
1130 s += abs(pix1[4] - pix2[4]);
1131 s += abs(pix1[5] - pix2[5]);
1132 s += abs(pix1[6] - pix2[6]);
1133 s += abs(pix1[7] - pix2[7]);
/* horizontal half-pel reference */
1140 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1146 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1147 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1148 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1149 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1150 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1151 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1152 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1153 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* vertical half-pel reference (pix3 = next line) */
1160 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1163 UINT8 *pix3 = pix2 + line_size;
1167 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1168 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1169 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1170 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1171 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1172 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1173 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1174 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* half-pel in both axes: 2x2 neighborhood average */
1182 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1185 UINT8 *pix3 = pix2 + line_size;
1189 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1190 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1191 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1192 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1193 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1194 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1195 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1196 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1204 /* permute block according so that it corresponds to the MMX idct
/* Two block_permute variants (selected by a preprocessor conditional not
   visible here): a general table-driven reorder through a temporary, and a
   specialized in-place swap version.  Both reorder the 64 coefficients via
   block_permute_op. */
1207 /* general permutation, but perhaps slightly slower */
1208 void block_permute(INT16 *block)
1213 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1215 for(i=0; i<64; i++) block[i] = temp[i];
1219 void block_permute(INT16 *block)
1221 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
/* Zero all six 64-coefficient blocks of a macroblock in one call. */
1242 void clear_blocks_c(DCTELEM *blocks)
1244 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* One-time DSP initialisation: build the clamp/square lookup tables, point
   every dispatch pointer at the C reference implementation (arch-specific
   init calls may override them), and set up the coefficient permutation
   matching the selected IDCT. */
1247 void dsputil_init(void)
1250 int use_permuted_idct;
/* cropTbl: identity for 0..255 in the middle, clamped to 0 below and 255
   above in the MAX_NEG_CROP-wide margins. */
1252 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1253 for(i=0;i<MAX_NEG_CROP;i++) {
1255 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* squareTbl[i] = (i-256)^2, i.e. squares for offsets -256..255 */
1258 for(i=0;i<512;i++) {
1259 squareTbl[i] = (i - 256) * (i - 256);
/* IDCT choice: simple_idct or j_rev_dct (the guarding #if is not visible
   in this chunk). */
1263 ff_idct = simple_idct;
1265 ff_idct = j_rev_dct;
/* default to the portable C implementations */
1267 get_pixels = get_pixels_c;
1268 diff_pixels = diff_pixels_c;
1269 put_pixels_clamped = put_pixels_clamped_c;
1270 add_pixels_clamped = add_pixels_clamped_c;
1272 clear_blocks= clear_blocks_c;
1274 pix_abs16x16 = pix_abs16x16_c;
1275 pix_abs16x16_x2 = pix_abs16x16_x2_c;
1276 pix_abs16x16_y2 = pix_abs16x16_y2_c;
1277 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1278 pix_abs8x8 = pix_abs8x8_c;
1279 pix_abs8x8_x2 = pix_abs8x8_x2_c;
1280 pix_abs8x8_y2 = pix_abs8x8_y2_c;
1281 pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1282 av_fdct = fdct_ifast;
/* arch-specific overrides (each call is behind an #ifdef not visible
   here); mlib and alpha backends use the natural coefficient order */
1284 use_permuted_idct = 1;
1290 dsputil_init_armv4l();
1293 dsputil_init_mlib();
1294 use_permuted_idct = 0;
1297 dsputil_init_alpha();
1298 use_permuted_idct = 0;
1302 if(ff_idct == simple_idct) use_permuted_idct=0;
/* build permutation[]: MMX-simple-idct table, the bit-shuffle variant for
   the other permuted IDCT, or identity when unpermuted */
1305 if(use_permuted_idct)
1307 for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1309 for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1312 for(i=0; i<64; i++) permutation[i]=i;
/* snapshot the unpermuted scan tables for the MMX quantizer before the
   in-place permutation below */
1314 for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1315 for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1317 if (use_permuted_idct) {
1318 /* permute for IDCT */
1320 j = zigzag_direct[i];
1321 zigzag_direct[i] = block_permute_op(j);
1322 j = ff_alternate_horizontal_scan[i];
1323 ff_alternate_horizontal_scan[i] = block_permute_op(j);
1324 j = ff_alternate_vertical_scan[i];
1325 ff_alternate_vertical_scan[i] = block_permute_op(j);
/* keep the default quant matrices in the same permuted order */
1327 block_permute(default_intra_matrix);
1328 block_permute(default_non_intra_matrix);
1329 block_permute(ff_mpeg4_default_intra_matrix);
1330 block_permute(ff_mpeg4_default_non_intra_matrix);
1336 /* remove any non bit exact operation (testing purpose) */
/* Forces bit-exact output for regression testing; currently only the MMX
   backend has anything to disable (the call is behind an #ifdef not
   visible in this chunk). */
1337 void avcodec_set_bit_exact(void)
1340 dsputil_set_bit_exact_mmx();
1344 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1345 int orig_linesize[3], int coded_linesize,
1346 AVCodecContext *avctx)
1348 int quad, diff, x, y;
1349 UINT8 *orig, *coded;
1350 UINT32 *sq = squareTbl + 256;
1356 orig = orig_image[0];
1357 coded = coded_image[0];
1359 for (y=0;y<avctx->height;y++) {
1360 for (x=0;x<avctx->width;x++) {
1361 diff = *(orig + x) - *(coded + x);
1364 orig += orig_linesize[0];
1365 coded += coded_linesize;
1368 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1370 if (avctx->psnr_y) {
1371 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1372 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1374 avctx->psnr_y = 99.99;