3 * Copyright (c) 2000, 2001 Fabrice Bellard.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
23 #include "simple_idct.h"
/* Function pointers filled in by dsputil_init(); they default to the C
 * implementations below and may be overridden by platform-specific
 * (MMX/mlib/ARMv4l/Alpha) versions. */
void (*ff_idct)(DCTELEM *block);
void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
void (*av_fdct)(DCTELEM *block);
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
void (*clear_blocks)(DCTELEM *blocks);

/* Sum-of-absolute-differences (SAD) hooks for motion estimation:
 * full-pel, plus horizontal (x2), vertical (y2) and diagonal (xy2)
 * half-pel averaged comparisons, for 16x16 and 8x8 blocks. */
op_pixels_abs_func pix_abs16x16;
op_pixels_abs_func pix_abs16x16_x2;
op_pixels_abs_func pix_abs16x16_y2;
op_pixels_abs_func pix_abs16x16_xy2;

op_pixels_abs_func pix_abs8x8;
op_pixels_abs_func pix_abs8x8_x2;
op_pixels_abs_func pix_abs8x8_y2;
op_pixels_abs_func pix_abs8x8_xy2;

/* Clipping table: cropTbl + MAX_NEG_CROP maps values in
 * [-MAX_NEG_CROP, 255 + MAX_NEG_CROP] to [0, 255]; initialized in
 * dsputil_init(). */
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
/* squareTbl[i + 256] = i*i for i in [-256, 255]; used for SSE/PSNR. */
UINT32 squareTbl[512];

/* Default quantization matrices, defined elsewhere in the codec;
 * permuted in place by dsputil_init() when a permuted IDCT is used. */
extern INT16 default_intra_matrix[64];
extern INT16 default_non_intra_matrix[64];
extern INT16 ff_mpeg4_default_intra_matrix[64];
extern INT16 ff_mpeg4_default_non_intra_matrix[64];
54 UINT8 zigzag_direct[64] = {
55 0, 1, 8, 16, 9, 2, 3, 10,
56 17, 24, 32, 25, 18, 11, 4, 5,
57 12, 19, 26, 33, 40, 48, 41, 34,
58 27, 20, 13, 6, 7, 14, 21, 28,
59 35, 42, 49, 56, 57, 50, 43, 36,
60 29, 22, 15, 23, 30, 37, 44, 51,
61 58, 59, 52, 45, 38, 31, 39, 46,
62 53, 60, 61, 54, 47, 55, 62, 63
/* not permutated inverse zigzag_direct + 1 for MMX quantizer
 * (inv_zigzag_direct16[coef] = scan position + 1; built in dsputil_init) */
UINT16 __align8 inv_zigzag_direct16[64];

/* not permutated zigzag_direct for MMX quantizer
 * (copy of zigzag_direct taken before any IDCT permutation is applied) */
UINT8 zigzag_direct_noperm[64];
71 UINT8 ff_alternate_horizontal_scan[64] = {
72 0, 1, 2, 3, 8, 9, 16, 17,
73 10, 11, 4, 5, 6, 7, 15, 14,
74 13, 12, 19, 18, 24, 25, 32, 33,
75 26, 27, 20, 21, 22, 23, 28, 29,
76 30, 31, 34, 35, 40, 41, 48, 49,
77 42, 43, 36, 37, 38, 39, 44, 45,
78 46, 47, 50, 51, 56, 57, 58, 59,
79 52, 53, 54, 55, 60, 61, 62, 63,
82 UINT8 ff_alternate_vertical_scan[64] = {
83 0, 8, 16, 24, 1, 9, 2, 10,
84 17, 25, 32, 40, 48, 56, 57, 49,
85 41, 33, 26, 18, 3, 11, 4, 12,
86 19, 27, 34, 42, 50, 58, 35, 43,
87 51, 59, 20, 28, 5, 13, 6, 14,
88 21, 29, 36, 44, 52, 60, 37, 45,
89 53, 61, 22, 30, 7, 15, 23, 31,
90 38, 46, 54, 62, 39, 47, 55, 63,
95 /* Input permutation for the simple_idct_mmx */
96 static UINT8 simple_mmx_permutation[64]={
97 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
98 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
99 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
100 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
101 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
102 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
103 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
104 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
108 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
109 UINT32 inverse[256]={
110 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
111 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
112 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
113 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
114 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
115 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
116 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
117 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
118 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
119 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
120 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
121 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
122 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
123 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
124 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
125 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
126 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
127 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
128 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
129 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
130 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
131 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
132 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
133 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
134 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
135 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
136 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
137 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
138 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
139 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
140 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
141 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
/* used to skip zeros at the end:
 * zigzag_end[i] = 1 + highest coefficient index touched by scan
 * positions 0..i; filled in by build_zigzag_end() */
UINT8 zigzag_end[64];

/* coefficient permutation applied for the selected IDCT
 * (identity when no permuted IDCT is in use); set up in dsputil_init() */
UINT8 permutation[64];
//UINT8 invPermutation[64];
150 static void build_zigzag_end(void)
153 int lastIndexAfterPerm=0;
154 for(lastIndex=0; lastIndex<64; lastIndex++)
156 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
157 lastIndexAfterPerm= zigzag_direct[lastIndex];
158 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
162 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
168 /* read the pixels */
185 void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
189 /* read the pixels */
192 p[0] = s1[0] - s2[0];
193 p[1] = s1[1] - s2[1];
194 p[2] = s1[2] - s2[2];
195 p[3] = s1[3] - s2[3];
196 p[4] = s1[4] - s2[4];
197 p[5] = s1[5] - s2[5];
198 p[6] = s1[6] - s2[6];
199 p[7] = s1[7] - s2[7];
207 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
212 UINT8 *cm = cropTbl + MAX_NEG_CROP;
214 /* read the pixels */
231 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
236 UINT8 *cm = cropTbl + MAX_NEG_CROP;
238 /* read the pixels */
242 pix[0] = cm[pix[0] + p[0]];
243 pix[1] = cm[pix[1] + p[1]];
244 pix[2] = cm[pix[2] + p[2]];
245 pix[3] = cm[pix[3] + p[3]];
246 pix[4] = cm[pix[4] + p[4]];
247 pix[5] = cm[pix[5] + p[5]];
248 pix[6] = cm[pix[6] + p[6]];
249 pix[7] = cm[pix[7] + p[7]];
/* 64-bit register variant of the C pixel put/avg primitives: each OP()
 * processes 8 packed pixels through one uint64_t load (LD64) and store.
 * The x2/y2/xy2 variants average with the pixel to the right / below /
 * diagonally; "no_rnd" variants round down ((a&b) + carry form) instead
 * of to nearest ((a|b) - borrow form).
 * NOTE(review): loop headers, braces and pointer advances of this macro
 * are not visible in this chunk — body left byte-identical. */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        OP(*((uint64_t*)block), LD64(pixels));\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
/* rounding average of two 8-pixel words, done without unpacking */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

/* 32-bit register variant of the same primitives: each OP() processes
 * 4 packed pixels via a uint32_t load (LD32), so rows take two stores.
 * NOTE(review): loop headers, braces and some pointer advances of this
 * macro are not visible in this chunk — body left byte-identical. */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        pixels+=line_size-8;\
        block +=line_size-8;\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        pixels+=line_size-8;\
        block +=line_size-8;\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        pixels+=line_size-8;\
        block +=line_size-8;\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        pixels+=line_size-8;\
        block +=line_size-8;\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            l0=  (a&0x03030303UL)\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            l0=  (a&0x03030303UL)\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
/* rounding average of two 4-pixel words */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
/* plain store */
#define op_put(a, b) a = b
/* FIXME this stuff could be removed as it's not really used anymore */
/* Legacy scalar pixel-op generator: emits put/avg/sub variants working
 * one pixel at a time; avg2/avg4 are redefined below to switch between
 * rounding, non-rounding and motion-estimation flavours.
 * NOTE(review): several interior lines (loop headers, braces, pointer
 * setup) of this macro are not visible in this chunk. */
#define PIXOP(BTYPE, OPNAME, OP, INCR) \
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
        OP(p[0], avg2(pix[0], pix[1]));     \
        OP(p[1], avg2(pix[1], pix[2]));     \
        OP(p[2], avg2(pix[2], pix[3]));     \
        OP(p[3], avg2(pix[3], pix[4]));     \
        OP(p[4], avg2(pix[4], pix[5]));     \
        OP(p[5], avg2(pix[5], pix[6]));     \
        OP(p[6], avg2(pix[6], pix[7]));     \
        OP(p[7], avg2(pix[7], pix[8]));     \
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
    pix1 = pixels + line_size;              \
        OP(p[0], avg2(pix[0], pix1[0]));    \
        OP(p[1], avg2(pix[1], pix1[1]));    \
        OP(p[2], avg2(pix[2], pix1[2]));    \
        OP(p[3], avg2(pix[3], pix1[3]));    \
        OP(p[4], avg2(pix[4], pix1[4]));    \
        OP(p[5], avg2(pix[5], pix1[5]));    \
        OP(p[6], avg2(pix[6], pix1[6]));    \
        OP(p[7], avg2(pix[7], pix1[7]));    \
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
    pix1 = pixels + line_size;              \
        OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1]));   \
        OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2]));   \
        OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3]));   \
        OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4]));   \
        OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5]));   \
        OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6]));   \
        OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7]));   \
        OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8]));   \
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
    OPNAME ## _pixels_x2,   \
    OPNAME ## _pixels_y2,   \
    OPNAME ## _pixels_xy2,  \

/* rounding primitives */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

#define op_avg(a, b) a = avg2(a, b)
#define op_sub(a, b) a -= b

PIXOP(DCTELEM, sub, op_sub, 8)

/* not rounding primitives */
#define avg2(a,b) ((a+b)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)

/* motion estimation */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* One-point global motion compensation: bilinear interpolation of an
 * 8-wide block at 1/16-pel position (x16, y16); A..D are the four
 * bilinear weights (they sum to 256, hence the >>8).
 * NOTE(review): the row loop and per-row pointer advances are not
 * visible in this chunk — body left byte-identical. */
static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    /* convert the rounding flag into the additive rounding constant */
    rounder= 128 - rounder;
        dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
/* Horizontal 8-tap lowpass filter for quarter-pel interpolation
 * (taps 20,-6,3,-1 mirrored at the right edge — hence the repeated
 * src[8] terms in the last rows); results are clipped via cropTbl.
 * r is the rounding constant before the >>5.
 * NOTE(review): the row loop over h and the per-row dst/src advances
 * are not visible in this chunk — body left byte-identical. */
static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
        dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
        dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
        dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
        dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
        dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
        dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
        dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
        dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
/* Vertical counterpart of qpel_h_lowpass: same 8-tap filter applied
 * down a column (with mirrored bottom edge), one column per loop
 * iteration over the block width w.
 * NOTE(review): the column loop over w and the per-column dst/src
 * advances are not visible in this chunk — body left byte-identical. */
static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
        const int src0= src[0*srcStride];
        const int src1= src[1*srcStride];
        const int src2= src[2*srcStride];
        const int src3= src[3*srcStride];
        const int src4= src[4*srcStride];
        const int src5= src[5*srcStride];
        const int src6= src[6*srcStride];
        const int src7= src[7*srcStride];
        const int src8= src[8*srcStride];
        dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
        dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
        dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
        dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
        dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
        dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
        dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
        dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
786 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
804 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
809 dst[0]= (src1[0] + src2[0] + r)>>1;
810 dst[1]= (src1[1] + src2[1] + r)>>1;
811 dst[2]= (src1[2] + src2[2] + r)>>1;
812 dst[3]= (src1[3] + src2[3] + r)>>1;
813 dst[4]= (src1[4] + src2[4] + r)>>1;
814 dst[5]= (src1[5] + src2[5] + r)>>1;
815 dst[6]= (src1[6] + src2[6] + r)>>1;
816 dst[7]= (src1[7] + src2[7] + r)>>1;
823 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
828 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
829 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
830 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
831 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
832 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
833 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
834 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
835 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
/* Quarter-pel motion compensation generator: one function per (mx,my)
 * quarter-pel position, built from the h/v lowpass filters plus
 * avg2/avg4 blending; r selects rounding (1) vs no-rounding (0) and is
 * folded into the filter rounding constants (16-r, 1-r, 2-r).
 * NOTE(review): the temporary-buffer declarations (e.g. halfH/halfV/
 * halfHV arrays) and braces are not visible in this chunk — body left
 * byte-identical. */
#define QPEL_MC(r, name) \
static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    put_block(dst, src, dstStride, srcStride);\
static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
/* dispatch table indexed by (my<<2)|mx quarter-pel position */ \
qpel_mc_func qpel_mc ## name ## _tab[16]={ \
    qpel_mc00_c ## name, \
    qpel_mc10_c ## name, \
    qpel_mc20_c ## name, \
    qpel_mc30_c ## name, \
    qpel_mc01_c ## name, \
    qpel_mc11_c ## name, \
    qpel_mc21_c ## name, \
    qpel_mc31_c ## name, \
    qpel_mc02_c ## name, \
    qpel_mc12_c ## name, \
    qpel_mc22_c ## name, \
    qpel_mc32_c ## name, \
    qpel_mc03_c ## name, \
    qpel_mc13_c ## name, \
    qpel_mc23_c ## name, \
    qpel_mc33_c ## name, \
991 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
997 s += abs(pix1[0] - pix2[0]);
998 s += abs(pix1[1] - pix2[1]);
999 s += abs(pix1[2] - pix2[2]);
1000 s += abs(pix1[3] - pix2[3]);
1001 s += abs(pix1[4] - pix2[4]);
1002 s += abs(pix1[5] - pix2[5]);
1003 s += abs(pix1[6] - pix2[6]);
1004 s += abs(pix1[7] - pix2[7]);
1005 s += abs(pix1[8] - pix2[8]);
1006 s += abs(pix1[9] - pix2[9]);
1007 s += abs(pix1[10] - pix2[10]);
1008 s += abs(pix1[11] - pix2[11]);
1009 s += abs(pix1[12] - pix2[12]);
1010 s += abs(pix1[13] - pix2[13]);
1011 s += abs(pix1[14] - pix2[14]);
1012 s += abs(pix1[15] - pix2[15]);
1019 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1025 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1026 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1027 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1028 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1029 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1030 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1031 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1032 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1033 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1034 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1035 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1036 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1037 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1038 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1039 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1040 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1047 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1050 UINT8 *pix3 = pix2 + line_size;
1054 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1055 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1056 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1057 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1058 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1059 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1060 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1061 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1062 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1063 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1064 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1065 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1066 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1067 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1068 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1069 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1077 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1080 UINT8 *pix3 = pix2 + line_size;
1084 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1085 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1086 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1087 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1088 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1089 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1090 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1091 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1092 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1093 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1094 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1095 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1096 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1097 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1098 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1099 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1107 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1113 s += abs(pix1[0] - pix2[0]);
1114 s += abs(pix1[1] - pix2[1]);
1115 s += abs(pix1[2] - pix2[2]);
1116 s += abs(pix1[3] - pix2[3]);
1117 s += abs(pix1[4] - pix2[4]);
1118 s += abs(pix1[5] - pix2[5]);
1119 s += abs(pix1[6] - pix2[6]);
1120 s += abs(pix1[7] - pix2[7]);
1127 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1133 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1134 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1135 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1136 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1137 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1138 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1139 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1140 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1147 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1150 UINT8 *pix3 = pix2 + line_size;
1154 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1155 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1156 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1157 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1158 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1159 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1160 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1161 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1169 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1172 UINT8 *pix3 = pix2 + line_size;
1176 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1177 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1178 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1179 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1180 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1181 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1182 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1183 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1191 /* permute block according so that it corresponds to the MMX idct
1194 /* general permutation, but perhaps slightly slower */
1195 void block_permute(INT16 *block)
1200 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1202 for(i=0; i<64; i++) block[i] = temp[i];
/* Specialized in-place variant of block_permute (alternate preprocessor
 * branch; the surrounding #if/#else and the body's swap logic are not
 * visible in this chunk — left byte-identical). */
void block_permute(INT16 *block)
    int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1229 void clear_blocks_c(DCTELEM *blocks)
1231 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1234 /* XXX: those functions should be suppressed ASAP when all IDCTs are
1236 void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
1239 put_pixels_clamped(block, dest, line_size);
1242 void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
1245 add_pixels_clamped(block, dest, line_size);
/* One-time initialization: build the clip/square lookup tables, install
 * the C implementations of all function pointers, let platform-specific
 * init routines override them, then set up the IDCT coefficient
 * permutation and (if needed) permute the scan tables and default
 * quantization matrices in place.
 * NOTE(review): the #ifdef platform branches, braces and some loop
 * bodies are not visible in this chunk — code left byte-identical. */
void dsputil_init(void)
    int use_permuted_idct;
    /* cropTbl: identity in [0,255], clamped to 0/255 outside */
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
    /* squareTbl[i+256] = i*i for i in [-256,255] */
    for(i=0;i<512;i++) {
        squareTbl[i] = (i - 256) * (i - 256);
    ff_idct = j_rev_dct;
    get_pixels = get_pixels_c;
    diff_pixels = diff_pixels_c;
    put_pixels_clamped = put_pixels_clamped_c;
    add_pixels_clamped = add_pixels_clamped_c;
    clear_blocks= clear_blocks_c;
    pix_abs16x16     = pix_abs16x16_c;
    pix_abs16x16_x2  = pix_abs16x16_x2_c;
    pix_abs16x16_y2  = pix_abs16x16_y2_c;
    pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
    pix_abs8x8     = pix_abs8x8_c;
    pix_abs8x8_x2  = pix_abs8x8_x2_c;
    pix_abs8x8_y2  = pix_abs8x8_y2_c;
    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
    av_fdct = fdct_ifast;
    use_permuted_idct = 1;
    /* platform-specific overrides (guards elided in this chunk) */
    dsputil_init_armv4l();
    dsputil_init_mlib();
    use_permuted_idct = 0;
    dsputil_init_alpha();
    use_permuted_idct = 0;
    if (ff_idct == NULL) {
        /* no arch-specific IDCT installed: use the simple C IDCT,
           which needs no input permutation */
        ff_idct_put = simple_idct_put;
        ff_idct_add = simple_idct_add;
        use_permuted_idct=0;
        ff_idct_put = gen_idct_put;
        ff_idct_add = gen_idct_add;
    if(use_permuted_idct)
        /* permutation branches: simple_idct_mmx table vs. the generic
           MMX coefficient shuffle vs. identity (guards elided) */
        for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
        for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        for(i=0; i<64; i++) permutation[i]=i;
    /* quantizer helper tables, built from the unpermuted scan order */
    for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
    for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
    if (use_permuted_idct) {
        /* permute for IDCT */
        j = zigzag_direct[i];
        zigzag_direct[i] = block_permute_op(j);
        j = ff_alternate_horizontal_scan[i];
        ff_alternate_horizontal_scan[i] = block_permute_op(j);
        j = ff_alternate_vertical_scan[i];
        ff_alternate_vertical_scan[i] = block_permute_op(j);
        block_permute(default_intra_matrix);
        block_permute(default_non_intra_matrix);
        block_permute(ff_mpeg4_default_intra_matrix);
        block_permute(ff_mpeg4_default_non_intra_matrix);
/* remove any non bit exact operation (testing purpose) */
/* NOTE(review): braces and the platform guard (presumably
 * #ifdef HAVE_MMX) around the MMX call are not visible in this chunk —
 * confirm against the full file. */
void avcodec_set_bit_exact(void)
    dsputil_set_bit_exact_mmx();
1352 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1353 int orig_linesize[3], int coded_linesize,
1354 AVCodecContext *avctx)
1356 int quad, diff, x, y;
1357 UINT8 *orig, *coded;
1358 UINT32 *sq = squareTbl + 256;
1364 orig = orig_image[0];
1365 coded = coded_image[0];
1367 for (y=0;y<avctx->height;y++) {
1368 for (x=0;x<avctx->width;x++) {
1369 diff = *(orig + x) - *(coded + x);
1372 orig += orig_linesize[0];
1373 coded += coded_linesize;
1376 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1378 if (avctx->psnr_y) {
1379 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1380 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1382 avctx->psnr_y = 99.99;