3 * Copyright (c) 2000, 2001 Fabrice Bellard.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
23 #include "simple_idct.h"
/* Global function pointers for the core DSP primitives.  dsputil_init()
 * points them at the C reference implementations below; architecture-
 * specific init functions (MMX/ARM/mlib/alpha) may override them.
 * NOTE(review): this excerpt is missing lines; assignments for gmc1 are
 * not visible here — confirm in the full file. */
25 void (*ff_idct)(DCTELEM *block);
26 void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
27 void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
28 void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
29 void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
30 void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
31 void (*clear_blocks)(DCTELEM *blocks);
/* Sum-of-absolute-differences comparators for motion estimation,
 * at 16x16 and 8x8 block sizes; _x2/_y2/_xy2 are the half-pel variants. */
33 op_pixels_abs_func pix_abs16x16;
34 op_pixels_abs_func pix_abs16x16_x2;
35 op_pixels_abs_func pix_abs16x16_y2;
36 op_pixels_abs_func pix_abs16x16_xy2;
38 op_pixels_abs_func pix_abs8x8;
39 op_pixels_abs_func pix_abs8x8_x2;
40 op_pixels_abs_func pix_abs8x8_y2;
41 op_pixels_abs_func pix_abs8x8_xy2;
/* cropTbl: clamp lookup table, filled by dsputil_init() so that
 * cropTbl[MAX_NEG_CROP + v] == clip(v, 0, 255).
 * squareTbl: squareTbl[256 + d] == d*d for d in [-256, 255] (see get_psnr). */
43 UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
44 UINT32 squareTbl[512];
/* Default quantization matrices defined elsewhere; permuted in place by
 * dsputil_init() when a permuted IDCT is in use. */
46 extern UINT16 default_intra_matrix[64];
47 extern UINT16 default_non_intra_matrix[64];
48 extern UINT16 ff_mpeg4_default_intra_matrix[64];
49 extern UINT16 ff_mpeg4_default_non_intra_matrix[64];
/* Standard 8x8 zig-zag scan order (coefficient index for each scan
 * position).  NOTE: dsputil_init() permutes this table in place via
 * block_permute_op() when a permuted (MMX) IDCT is selected. */
51 UINT8 zigzag_direct[64] = {
52 0, 1, 8, 16, 9, 2, 3, 10,
53 17, 24, 32, 25, 18, 11, 4, 5,
54 12, 19, 26, 33, 40, 48, 41, 34,
55 27, 20, 13, 6, 7, 14, 21, 28,
56 35, 42, 49, 56, 57, 50, 43, 36,
57 29, 22, 15, 23, 30, 37, 44, 51,
58 58, 59, 52, 45, 38, 31, 39, 46,
59 53, 60, 61, 54, 47, 55, 62, 63
62 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
63 UINT16 __align8 inv_zigzag_direct16[64];
65 /* not permutated zigzag_direct for MMX quantizer */
66 UINT8 zigzag_direct_noperm[64];
/* Alternate horizontal scan pattern (used instead of the zig-zag scan
 * for some coding modes).  Permuted in place by dsputil_init() when a
 * permuted IDCT is in use. */
68 UINT8 ff_alternate_horizontal_scan[64] = {
69 0, 1, 2, 3, 8, 9, 16, 17,
70 10, 11, 4, 5, 6, 7, 15, 14,
71 13, 12, 19, 18, 24, 25, 32, 33,
72 26, 27, 20, 21, 22, 23, 28, 29,
73 30, 31, 34, 35, 40, 41, 48, 49,
74 42, 43, 36, 37, 38, 39, 44, 45,
75 46, 47, 50, 51, 56, 57, 58, 59,
76 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan pattern; also permuted by dsputil_init()
 * when a permuted IDCT is in use. */
79 UINT8 ff_alternate_vertical_scan[64] = {
80 0, 8, 16, 24, 1, 9, 2, 10,
81 17, 25, 32, 40, 48, 56, 57, 49,
82 41, 33, 26, 18, 3, 11, 4, 12,
83 19, 27, 34, 42, 50, 58, 35, 43,
84 51, 59, 20, 28, 5, 13, 6, 14,
85 21, 29, 36, 44, 52, 60, 37, 45,
86 53, 61, 22, 30, 7, 15, 23, 31,
87 38, 46, 54, 62, 39, 47, 55, 63,
92 /* Input permutation for the simple_idct_mmx */
/* Copied into permutation[] by dsputil_init() when the permuted IDCT
 * path is selected with the simple IDCT. */
93 static UINT8 simple_mmx_permutation[64]={
94 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
95 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
96 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
97 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
98 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
99 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
100 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
101 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
105 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Fixed-point reciprocal table: inverse[b] ~= 2^32 / b, used to replace
 * integer division by a multiply and shift. */
106 UINT32 inverse[256]={
107 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
108 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
109 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
110 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
111 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
112 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
113 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
114 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
115 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
116 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
117 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
118 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
119 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
120 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
121 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
122 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
123 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
124 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
125 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
126 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
127 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
128 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
129 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
130 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
131 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
132 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
133 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
134 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
135 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
136 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
137 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
138 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
141 /* used to skip zeros at the end */
142 UINT8 zigzag_end[64];
/* permutation[]: maps coefficient index i to its permuted position,
 * filled by dsputil_init() (identity when no permuted IDCT is used). */
144 UINT8 permutation[64];
145 //UINT8 invPermutation[64];
/* Fills zigzag_end[]: for each scan position, the running maximum of
 * zigzag_direct[0..pos] plus one — i.e. how many coefficients must be
 * processed once the last nonzero coefficient's scan position is known.
 * NOTE(review): this excerpt is missing braces/lines of the body. */
147 static void build_zigzag_end()
150 int lastIndexAfterPerm=0;
151 for(lastIndex=0; lastIndex<64; lastIndex++)
153 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
154 lastIndexAfterPerm= zigzag_direct[lastIndex];
155 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
/* C reference: read an 8x8 block of pixels into a DCTELEM block.
 * NOTE(review): body lines are missing from this excerpt. */
159 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
165 /* read the pixels */
/* C reference: per-pixel difference s1 - s2 of an 8x8 block into a
 * DCTELEM block (used for inter-coded residuals).
 * NOTE(review): the surrounding loop and pointer updates are missing
 * from this excerpt. */
182 void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
186 /* read the pixels */
189 p[0] = s1[0] - s2[0];
190 p[1] = s1[1] - s2[1];
191 p[2] = s1[2] - s2[2];
192 p[3] = s1[3] - s2[3];
193 p[4] = s1[4] - s2[4];
194 p[5] = s1[5] - s2[5];
195 p[6] = s1[6] - s2[6];
196 p[7] = s1[7] - s2[7];
/* C reference: write an 8x8 DCTELEM block to pixels, clamping each
 * value to [0,255] via the cropTbl lookup.
 * NOTE(review): body lines are missing from this excerpt. */
204 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
209 UINT8 *cm = cropTbl + MAX_NEG_CROP;
211 /* read the pixels */
/* C reference: add an 8x8 DCTELEM block to existing pixels with
 * clamping (IDCT output + prediction). */
228 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
233 UINT8 *cm = cropTbl + MAX_NEG_CROP;
235 /* read the pixels */
239 pix[0] = cm[pix[0] + p[0]];
240 pix[1] = cm[pix[1] + p[1]];
241 pix[2] = cm[pix[2] + p[2]];
242 pix[3] = cm[pix[3] + p[3]];
243 pix[4] = cm[pix[4] + p[4]];
244 pix[5] = cm[pix[5] + p[5]];
245 pix[6] = cm[pix[6] + p[6]];
246 pix[7] = cm[pix[7] + p[7]];
/* Unaligned 32/64-bit loads.  With GCC, a packed single-member struct
 * forces the compiler to emit code that is safe for unaligned
 * addresses; the fallback plain-dereference versions assume the
 * platform tolerates unaligned access. */
254 struct unaligned_64 { uint64_t l; } __attribute__((packed));
255 struct unaligned_32 { uint32_t l; } __attribute__((packed));
257 #define LD32(a) (((const struct unaligned_32 *) (a))->l)
258 #define LD64(a) (((const struct unaligned_64 *) (a))->l)
262 #define LD32(a) (*((uint32_t*)(a)))
263 #define LD64(a) (*((uint64_t*)(a)))
265 #endif /* !__GNUC__ */
/* PIXOP2, 64-bit variant: generates the put/avg pixel-copy and half-pel
 * interpolation primitives (_x2, _y2, _xy2, and their _no_rnd variants)
 * 8 pixels (one uint64_t) at a time.  The bit tricks used:
 *   rounding average:     (a|b) - (((a^b)&0xFE..FE)>>1)
 *   non-rounding average: (a&b) + (((a^b)&0xFE..FE)>>1)
 * both computing a per-byte average without carry between lanes.
 * The _xy2 variants split each byte into low 2 bits (l0/l1) and high
 * 6 bits (h0/h1) to average four neighbours; the 0x0202.. / 0x0101..
 * constants select rounding vs. no-rounding.
 * NOTE(review): several macro continuation lines (loop headers, braces,
 * pointer advances, table first entries) are missing from this excerpt. */
269 #define PIXOP2(OPNAME, OP) \
270 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
274 OP(*((uint64_t*)block), LD64(pixels));\
280 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
284 const uint64_t a= LD64(pixels );\
285 const uint64_t b= LD64(pixels+1);\
286 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
292 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
296 const uint64_t a= LD64(pixels );\
297 const uint64_t b= LD64(pixels+1);\
298 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
304 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
308 const uint64_t a= LD64(pixels );\
309 const uint64_t b= LD64(pixels+line_size);\
310 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
316 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
320 const uint64_t a= LD64(pixels );\
321 const uint64_t b= LD64(pixels+line_size);\
322 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
328 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
331 const uint64_t a= LD64(pixels );\
332 const uint64_t b= LD64(pixels+1);\
333 uint64_t l0= (a&0x0303030303030303ULL)\
334 + (b&0x0303030303030303ULL)\
335 + 0x0202020202020202ULL;\
336 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
337 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
341 for(i=0; i<h; i+=2){\
342 uint64_t a= LD64(pixels );\
343 uint64_t b= LD64(pixels+1);\
344 l1= (a&0x0303030303030303ULL)\
345 + (b&0x0303030303030303ULL);\
346 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
347 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
348 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
353 l0= (a&0x0303030303030303ULL)\
354 + (b&0x0303030303030303ULL)\
355 + 0x0202020202020202ULL;\
356 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
357 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
358 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
364 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
367 const uint64_t a= LD64(pixels );\
368 const uint64_t b= LD64(pixels+1);\
369 uint64_t l0= (a&0x0303030303030303ULL)\
370 + (b&0x0303030303030303ULL)\
371 + 0x0101010101010101ULL;\
372 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
373 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
377 for(i=0; i<h; i+=2){\
378 uint64_t a= LD64(pixels );\
379 uint64_t b= LD64(pixels+1);\
380 l1= (a&0x0303030303030303ULL)\
381 + (b&0x0303030303030303ULL);\
382 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
383 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
384 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
389 l0= (a&0x0303030303030303ULL)\
390 + (b&0x0303030303030303ULL)\
391 + 0x0101010101010101ULL;\
392 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
393 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
394 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
400 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
402 OPNAME ## _pixels_x2,\
403 OPNAME ## _pixels_y2,\
404 OPNAME ## _pixels_xy2,\
407 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
409 OPNAME ## _no_rnd_pixels_x2,\
410 OPNAME ## _no_rnd_pixels_y2,\
411 OPNAME ## _no_rnd_pixels_xy2,\
414 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
415 #else // 64 bit variant
/* PIXOP2, 32-bit variant: same primitives as the 64-bit version above
 * but processing 4 pixels (one uint32_t) per operation, so each row of
 * an 8-wide block takes two loads/stores (block / block+4), with the
 * pointer-advance arithmetic (line_size-8, 4-line_size*h) compensating
 * for the two-pass traversal.  Same carry-free byte-average bit tricks
 * with 32-bit constants (0xFEFEFEFE, 0x03030303, 0xFCFCFCFC...).
 * NOTE(review): several macro continuation lines are missing from this
 * excerpt (loop headers, braces, some '+ (b&...)' terms). */
417 #define PIXOP2(OPNAME, OP) \
418 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
422 OP(*((uint32_t*)(block )), LD32(pixels ));\
423 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
429 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
435 const uint32_t a= LD32(pixels );\
436 const uint32_t b= LD32(pixels+1);\
437 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
441 pixels+=line_size-8;\
442 block +=line_size-8;\
446 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
452 const uint32_t a= LD32(pixels );\
453 const uint32_t b= LD32(pixels+1);\
454 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
458 pixels+=line_size-8;\
459 block +=line_size-8;\
463 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
469 const uint32_t a= LD32(pixels );\
470 const uint32_t b= LD32(pixels+line_size);\
471 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
475 pixels+=line_size-8;\
476 block +=line_size-8;\
480 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
486 const uint32_t a= LD32(pixels );\
487 const uint32_t b= LD32(pixels+line_size);\
488 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
492 pixels+=line_size-8;\
493 block +=line_size-8;\
497 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
502 const uint32_t a= LD32(pixels );\
503 const uint32_t b= LD32(pixels+1);\
504 uint32_t l0= (a&0x03030303UL)\
507 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
508 + ((b&0xFCFCFCFCUL)>>2);\
512 for(i=0; i<h; i+=2){\
513 uint32_t a= LD32(pixels );\
514 uint32_t b= LD32(pixels+1);\
515 l1= (a&0x03030303UL)\
517 h1= ((a&0xFCFCFCFCUL)>>2)\
518 + ((b&0xFCFCFCFCUL)>>2);\
519 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
524 l0= (a&0x03030303UL)\
527 h0= ((a&0xFCFCFCFCUL)>>2)\
528 + ((b&0xFCFCFCFCUL)>>2);\
529 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
533 pixels+=4-line_size*(h+1);\
534 block +=4-line_size*h;\
538 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
543 const uint32_t a= LD32(pixels );\
544 const uint32_t b= LD32(pixels+1);\
545 uint32_t l0= (a&0x03030303UL)\
548 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
549 + ((b&0xFCFCFCFCUL)>>2);\
553 for(i=0; i<h; i+=2){\
554 uint32_t a= LD32(pixels );\
555 uint32_t b= LD32(pixels+1);\
556 l1= (a&0x03030303UL)\
558 h1= ((a&0xFCFCFCFCUL)>>2)\
559 + ((b&0xFCFCFCFCUL)>>2);\
560 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
565 l0= (a&0x03030303UL)\
568 h0= ((a&0xFCFCFCFCUL)>>2)\
569 + ((b&0xFCFCFCFCUL)>>2);\
570 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
574 pixels+=4-line_size*(h+1);\
575 block +=4-line_size*h;\
579 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
581 OPNAME ## _pixels_x2,\
582 OPNAME ## _pixels_y2,\
583 OPNAME ## _pixels_xy2,\
586 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
588 OPNAME ## _no_rnd_pixels_x2,\
589 OPNAME ## _no_rnd_pixels_y2,\
590 OPNAME ## _no_rnd_pixels_xy2,\
592 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
/* op_put: plain store; used to instantiate the "put" flavour of PIXOP2. */
595 #define op_put(a, b) a = b
603 /* FIXME: this stuff could be removed as it's not really used anymore */
/* Legacy scalar PIXOP macro: generates per-pixel (non-SIMD-style)
 * put/avg/sub primitives and the corresponding dispatch table.
 * avg2/avg4 below are redefined between instantiations to switch
 * between rounding and truncating averages.
 * NOTE(review): macro continuation lines (loop headers, pointer
 * advances, braces, table first entries) are missing from this
 * excerpt. */
604 #define PIXOP(BTYPE, OPNAME, OP, INCR) \
606 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
627 static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
635 OP(p[0], avg2(pix[0], pix[1])); \
636 OP(p[1], avg2(pix[1], pix[2])); \
637 OP(p[2], avg2(pix[2], pix[3])); \
638 OP(p[3], avg2(pix[3], pix[4])); \
639 OP(p[4], avg2(pix[4], pix[5])); \
640 OP(p[5], avg2(pix[5], pix[6])); \
641 OP(p[6], avg2(pix[6], pix[7])); \
642 OP(p[7], avg2(pix[7], pix[8])); \
648 static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
656 pix1 = pixels + line_size; \
658 OP(p[0], avg2(pix[0], pix1[0])); \
659 OP(p[1], avg2(pix[1], pix1[1])); \
660 OP(p[2], avg2(pix[2], pix1[2])); \
661 OP(p[3], avg2(pix[3], pix1[3])); \
662 OP(p[4], avg2(pix[4], pix1[4])); \
663 OP(p[5], avg2(pix[5], pix1[5])); \
664 OP(p[6], avg2(pix[6], pix1[6])); \
665 OP(p[7], avg2(pix[7], pix1[7])); \
672 static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
680 pix1 = pixels + line_size; \
682 OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
683 OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
684 OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
685 OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
686 OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
687 OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
688 OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
689 OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
696 void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
698 OPNAME ## _pixels_x2, \
699 OPNAME ## _pixels_y2, \
700 OPNAME ## _pixels_xy2, \
703 /* rounding primitives */
704 #define avg2(a,b) ((a+b+1)>>1)
705 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
707 #define op_avg(a, b) a = avg2(a, b)
708 #define op_sub(a, b) a -= b
/* Instantiate the DCTELEM "sub" flavour (used for residual computation). */
710 PIXOP(DCTELEM, sub, op_sub, 8)
712 /* not rounding primitives */
715 #define avg2(a,b) ((a+b)>>1)
716 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
718 /* motion estimation */
/* Rounding averages used by the pix_abs* SAD functions below. */
724 #define avg2(a,b) ((a+b+1)>>1)
725 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* C reference GMC (global motion compensation) with 1/16-pel accuracy:
 * bilinear interpolation of an 8-wide block with weights A..D derived
 * from the fractional offsets x16, y16 (each in 1/16-pel units, so the
 * four weights sum to 256 and the result is >>8).  'rounder' selects
 * the rounding bias (flipped here: 128 - rounder).
 * NOTE(review): the row loop and pointer advances are missing from this
 * excerpt; reading src[8]/src[srcStride+8] implies the caller must
 * provide one extra column — confirm against callers. */
727 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
729 const int A=(16-x16)*(16-y16);
730 const int B=( x16)*(16-y16);
731 const int C=(16-x16)*( y16);
732 const int D=( x16)*( y16);
734 rounder= 128 - rounder;
738 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
739 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
740 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
741 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
742 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
743 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
744 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
745 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
/* Horizontal quarter-pel low-pass filter over an 8-wide row: each
 * output is a weighted sum with coefficients (20, -6, 3, -1) applied
 * symmetrically, rounded by r and >>5, clamped via cropTbl.  Near the
 * block edges the taps are folded back (mirrored) rather than reading
 * outside src[0..8].  NOTE(review): the row loop and pointer advances
 * are missing from this excerpt. */
751 static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
753 UINT8 *cm = cropTbl + MAX_NEG_CROP;
757 dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
758 dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
759 dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
760 dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
761 dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
762 dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
763 dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
764 dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
/* Vertical counterpart of qpel_h_lowpass: same (20, -6, 3, -1) filter
 * applied down a column of 9 samples (src[0..8*srcStride]), with the
 * same edge mirroring.  NOTE(review): the column loop is missing from
 * this excerpt. */
770 static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
772 UINT8 *cm = cropTbl + MAX_NEG_CROP;
776 const int src0= src[0*srcStride];
777 const int src1= src[1*srcStride];
778 const int src2= src[2*srcStride];
779 const int src3= src[3*srcStride];
780 const int src4= src[4*srcStride];
781 const int src5= src[5*srcStride];
782 const int src6= src[6*srcStride];
783 const int src7= src[7*srcStride];
784 const int src8= src[8*srcStride];
785 dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
786 dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
787 dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
788 dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
789 dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
790 dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
791 dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
792 dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
/* Copy an 8-wide block from src to dst.
 * NOTE(review): body lines are missing from this excerpt. */
798 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
/* Per-pixel average of two 8-wide source rows with rounding bias r
 * ((x+y+r)>>1).  NOTE(review): the row loop is missing from this
 * excerpt. */
816 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
821 dst[0]= (src1[0] + src2[0] + r)>>1;
822 dst[1]= (src1[1] + src2[1] + r)>>1;
823 dst[2]= (src1[2] + src2[2] + r)>>1;
824 dst[3]= (src1[3] + src2[3] + r)>>1;
825 dst[4]= (src1[4] + src2[4] + r)>>1;
826 dst[5]= (src1[5] + src2[5] + r)>>1;
827 dst[6]= (src1[6] + src2[6] + r)>>1;
828 dst[7]= (src1[7] + src2[7] + r)>>1;
/* Per-pixel average of four 8-wide source rows with rounding bias r
 * ((a+b+c+d+r)>>2), used to combine the intermediate qpel planes. */
835 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
840 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
841 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
842 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
843 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
844 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
845 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
846 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
847 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
/* QPEL_MC(r, name): generates the 16 quarter-pel motion-compensation
 * functions qpel_mcXY for fractional positions (X, Y) in quarter-pel
 * units, plus the dispatch table indexed as (my<<2)|mx.  'r' selects
 * rounding (the 16-r / 1-r / 2-r biases flip between the rounding and
 * no-rounding instantiations).  Positions are built by combining the
 * h/v low-pass half-pel planes (halfH, halfV, halfHV) with avg2/avg4.
 * NOTE(review): local buffer declarations (halfH/halfV/halfHV arrays)
 * and some braces are missing from this excerpt. */
856 #define QPEL_MC(r, name) \
857 static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
859 put_block(dst, src, dstStride, srcStride);\
862 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
865 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
866 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
869 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
871 qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
874 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
877 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
878 avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
881 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
884 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
885 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
888 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
890 qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
893 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
896 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
897 avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
899 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
904 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
905 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
906 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
907 avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
909 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
914 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
915 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
916 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
917 avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
919 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
924 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
925 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
926 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
927 avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
929 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
934 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
935 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
936 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
937 avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
939 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
943 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
944 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
945 avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
947 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
951 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
952 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
953 avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
955 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
960 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
961 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
962 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
963 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
965 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
970 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
971 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
972 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
973 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
975 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
978 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
979 qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
981 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
982 qpel_mc00_c ## name, \
983 qpel_mc10_c ## name, \
984 qpel_mc20_c ## name, \
985 qpel_mc30_c ## name, \
986 qpel_mc01_c ## name, \
987 qpel_mc11_c ## name, \
988 qpel_mc21_c ## name, \
989 qpel_mc31_c ## name, \
990 qpel_mc02_c ## name, \
991 qpel_mc12_c ## name, \
992 qpel_mc22_c ## name, \
993 qpel_mc32_c ## name, \
994 qpel_mc03_c ## name, \
995 qpel_mc13_c ## name, \
996 qpel_mc23_c ## name, \
997 qpel_mc33_c ## name, \
/* SAD of a 16x16 block: sum of |pix1[i] - pix2[i]| over the block.
 * NOTE(review): the 16-row loop and pointer advances are missing from
 * this excerpt in all four functions below. */
1003 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1009 s += abs(pix1[0] - pix2[0]);
1010 s += abs(pix1[1] - pix2[1]);
1011 s += abs(pix1[2] - pix2[2]);
1012 s += abs(pix1[3] - pix2[3]);
1013 s += abs(pix1[4] - pix2[4]);
1014 s += abs(pix1[5] - pix2[5]);
1015 s += abs(pix1[6] - pix2[6]);
1016 s += abs(pix1[7] - pix2[7]);
1017 s += abs(pix1[8] - pix2[8]);
1018 s += abs(pix1[9] - pix2[9]);
1019 s += abs(pix1[10] - pix2[10]);
1020 s += abs(pix1[11] - pix2[11]);
1021 s += abs(pix1[12] - pix2[12]);
1022 s += abs(pix1[13] - pix2[13]);
1023 s += abs(pix1[14] - pix2[14]);
1024 s += abs(pix1[15] - pix2[15]);
/* 16x16 SAD against the horizontal half-pel reference
 * (rounding average of pix2[x] and pix2[x+1]). */
1031 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1037 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1038 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1039 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1040 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1041 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1042 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1043 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1044 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1045 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1046 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1047 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1048 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1049 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1050 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1051 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1052 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* 16x16 SAD against the vertical half-pel reference
 * (average of the current row pix2 and the next row pix3). */
1059 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1062 UINT8 *pix3 = pix2 + line_size;
1066 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1067 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1068 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1069 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1070 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1071 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1072 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1073 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1074 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1075 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1076 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1077 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1078 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1079 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1080 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1081 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* 16x16 SAD against the diagonal half-pel reference
 * (four-point average of the 2x2 neighbourhood). */
1089 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1092 UINT8 *pix3 = pix2 + line_size;
1096 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1097 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1098 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1099 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1100 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1101 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1102 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1103 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1104 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1105 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1106 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1107 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1108 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1109 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1110 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1111 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8x8 versions of the SAD comparators above; same structure, half the
 * width.  NOTE(review): the 8-row loops and pointer advances are
 * missing from this excerpt in all four functions. */
1119 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1125 s += abs(pix1[0] - pix2[0]);
1126 s += abs(pix1[1] - pix2[1]);
1127 s += abs(pix1[2] - pix2[2]);
1128 s += abs(pix1[3] - pix2[3]);
1129 s += abs(pix1[4] - pix2[4]);
1130 s += abs(pix1[5] - pix2[5]);
1131 s += abs(pix1[6] - pix2[6]);
1132 s += abs(pix1[7] - pix2[7]);
/* 8x8 SAD vs. horizontal half-pel reference. */
1139 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1145 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1146 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1147 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1148 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1149 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1150 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1151 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1152 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8x8 SAD vs. vertical half-pel reference. */
1159 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1162 UINT8 *pix3 = pix2 + line_size;
1166 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1167 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1168 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1169 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1170 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1171 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1172 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1173 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8x8 SAD vs. diagonal half-pel reference. */
1181 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1184 UINT8 *pix3 = pix2 + line_size;
1188 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1189 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1190 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1191 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1192 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1193 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1194 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1195 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1203 /* permute block according so that it corresponds to the MMX idct
1206 /* general permutation, but perhaps slightly slower */
/* Reorder a 64-entry coefficient block according to block_permute_op(),
 * via a temporary copy.  NOTE(review): braces/temp declaration are
 * missing from this excerpt; a faster specialized variant follows. */
1207 void block_permute(INT16 *block)
1212 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1214 for(i=0; i<64; i++) block[i] = temp[i];
/* Alternate (specialized) block_permute implementation; body not
 * visible in this excerpt. */
1218 void block_permute(INT16 *block)
1220 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
/* C reference: zero all 6 blocks of a macroblock (6 * 64 DCTELEMs). */
1241 void clear_blocks_c(DCTELEM *blocks)
1243 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* One-time initialization of the DSP layer: builds the clamp and square
 * lookup tables, selects the IDCT, installs the C reference function
 * pointers, lets architecture-specific init override them, and applies
 * the coefficient permutation to the scan tables and default
 * quantization matrices when a permuted IDCT is in use.
 * NOTE(review): several lines (declarations, #ifdef guards, braces) are
 * missing from this excerpt. */
1246 void dsputil_init(void)
1249 int use_permuted_idct;
/* cropTbl: identity in [0,255], clamped to 0/255 outside. */
1251 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1252 for(i=0;i<MAX_NEG_CROP;i++) {
1254 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* squareTbl[256+d] = d*d for d in [-256,255]. */
1257 for(i=0;i<512;i++) {
1258 squareTbl[i] = (i - 256) * (i - 256);
1262 ff_idct = simple_idct;
1264 ff_idct = j_rev_dct;
/* Install the C reference implementations. */
1266 get_pixels = get_pixels_c;
1267 diff_pixels = diff_pixels_c;
1268 put_pixels_clamped = put_pixels_clamped_c;
1269 add_pixels_clamped = add_pixels_clamped_c;
1271 clear_blocks= clear_blocks_c;
1273 pix_abs16x16 = pix_abs16x16_c;
1274 pix_abs16x16_x2 = pix_abs16x16_x2_c;
1275 pix_abs16x16_y2 = pix_abs16x16_y2_c;
1276 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1277 pix_abs8x8 = pix_abs8x8_c;
1278 pix_abs8x8_x2 = pix_abs8x8_x2_c;
1279 pix_abs8x8_y2 = pix_abs8x8_y2_c;
1280 pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1281 av_fdct = jpeg_fdct_ifast;
1283 use_permuted_idct = 1;
/* Architecture-specific overrides (guards not visible in excerpt);
 * mlib and alpha paths disable the permuted-IDCT layout. */
1289 dsputil_init_armv4l();
1292 dsputil_init_mlib();
1293 use_permuted_idct = 0;
1296 dsputil_init_alpha();
1297 use_permuted_idct = 0;
1301 if(ff_idct == simple_idct) use_permuted_idct=0;
/* Choose the coefficient permutation (identity when unpermuted). */
1304 if(use_permuted_idct)
1306 for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1308 for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1311 for(i=0; i<64; i++) permutation[i]=i;
/* Unpermuted helper tables for the MMX quantizer. */
1313 for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1314 for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1316 if (use_permuted_idct) {
1317 /* permute for IDCT */
1319 j = zigzag_direct[i];
1320 zigzag_direct[i] = block_permute_op(j);
1321 j = ff_alternate_horizontal_scan[i];
1322 ff_alternate_horizontal_scan[i] = block_permute_op(j);
1323 j = ff_alternate_vertical_scan[i];
1324 ff_alternate_vertical_scan[i] = block_permute_op(j);
1326 block_permute(default_intra_matrix);
1327 block_permute(default_non_intra_matrix);
1328 block_permute(ff_mpeg4_default_intra_matrix);
1329 block_permute(ff_mpeg4_default_non_intra_matrix);
1335 /* remove any non bit exact operation (testing purpose) */
/* Replaces architecture-optimized routines with bit-exact ones for
 * regression testing.  NOTE(review): only the MMX call is visible in
 * this excerpt; guards and braces are missing. */
1336 void avcodec_set_bit_exact(void)
1339 dsputil_set_bit_exact_mmx();
1343 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1344 int orig_linesize[3], int coded_linesize,
1345 AVCodecContext *avctx)
1347 int quad, diff, x, y;
1348 UINT8 *orig, *coded;
1349 UINT32 *sq = squareTbl + 256;
1355 orig = orig_image[0];
1356 coded = coded_image[0];
1358 for (y=0;y<avctx->height;y++) {
1359 for (x=0;x<avctx->width;x++) {
1360 diff = *(orig + x) - *(coded + x);
1363 orig += orig_linesize[0];
1364 coded += coded_linesize;
1367 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1369 if (avctx->psnr_y) {
1370 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1371 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1373 avctx->psnr_y = 99.99;