3 * Copyright (c) 2000, 2001 Gerard Lantau.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
26 #include "simple_idct.h"
/* Dispatch pointers for the core DSP primitives. dsputil_init() points them
 * at the portable C implementations below; platform-specific init code
 * (MMX/armv4l/mlib/alpha — see dsputil_init) may override them. */
28 void (*ff_idct)(DCTELEM *block);
29 void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
30 void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
31 void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
32 void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
33 void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
34 void (*clear_blocks)(DCTELEM *blocks);
/* Sum-of-absolute-differences dispatch pointers for motion estimation,
 * full-pel and the three half-pel phases (x2/y2/xy2), 16x16 and 8x8. */
36 op_pixels_abs_func pix_abs16x16;
37 op_pixels_abs_func pix_abs16x16_x2;
38 op_pixels_abs_func pix_abs16x16_y2;
39 op_pixels_abs_func pix_abs16x16_xy2;
41 op_pixels_abs_func pix_abs8x8;
42 op_pixels_abs_func pix_abs8x8_x2;
43 op_pixels_abs_func pix_abs8x8_y2;
44 op_pixels_abs_func pix_abs8x8_xy2;
/* cropTbl: clamp-to-[0,255] lookup table with MAX_NEG_CROP guard bands on
 * both sides; squareTbl: i -> (i-256)^2, indexed with a +256 bias.
 * Both are filled in by dsputil_init(). */
46 UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
47 UINT32 squareTbl[512];
/* Default quantization matrices defined elsewhere; permuted in place by
 * dsputil_init() when a permuting IDCT is selected. */
49 extern UINT16 default_intra_matrix[64];
50 extern UINT16 default_non_intra_matrix[64];
51 extern UINT16 ff_mpeg4_default_intra_matrix[64];
52 extern UINT16 ff_mpeg4_default_non_intra_matrix[64];
/* Standard zig-zag coefficient scan order (may be permuted in place by
 * dsputil_init() when a permuting IDCT is in use). */
54 UINT8 zigzag_direct[64] = {
55 0, 1, 8, 16, 9, 2, 3, 10,
56 17, 24, 32, 25, 18, 11, 4, 5,
57 12, 19, 26, 33, 40, 48, 41, 34,
58 27, 20, 13, 6, 7, 14, 21, 28,
59 35, 42, 49, 56, 57, 50, 43, 36,
60 29, 22, 15, 23, 30, 37, 44, 51,
61 58, 59, 52, 45, 38, 31, 39, 46,
62 53, 60, 61, 54, 47, 55, 62, 63
65 /* not permuted inverse zigzag_direct + 1 for MMX quantizer */
66 UINT16 __align8 inv_zigzag_direct16[64];
68 /* not permuted zigzag_direct for MMX quantizer */
69 UINT8 zigzag_direct_noperm[64];
/* Alternate (interlaced) scan orders; like zigzag_direct these may be
 * permuted in place by dsputil_init(). */
71 UINT8 ff_alternate_horizontal_scan[64] = {
72 0, 1, 2, 3, 8, 9, 16, 17,
73 10, 11, 4, 5, 6, 7, 15, 14,
74 13, 12, 19, 18, 24, 25, 32, 33,
75 26, 27, 20, 21, 22, 23, 28, 29,
76 30, 31, 34, 35, 40, 41, 48, 49,
77 42, 43, 36, 37, 38, 39, 44, 45,
78 46, 47, 50, 51, 56, 57, 58, 59,
79 52, 53, 54, 55, 60, 61, 62, 63,
82 UINT8 ff_alternate_vertical_scan[64] = {
83 0, 8, 16, 24, 1, 9, 2, 10,
84 17, 25, 32, 40, 48, 56, 57, 49,
85 41, 33, 26, 18, 3, 11, 4, 12,
86 19, 27, 34, 42, 50, 58, 35, 43,
87 51, 59, 20, 28, 5, 13, 6, 14,
88 21, 29, 36, 44, 52, 60, 37, 45,
89 53, 61, 22, 30, 7, 15, 23, 31,
90 38, 46, 54, 62, 39, 47, 55, 63,
95 /* Input permutation for the simple_idct_mmx */
96 static UINT8 simple_mmx_permutation[64]={
97 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
98 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
99 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
100 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
101 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
102 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
103 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
104 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Reciprocal table for division by multiplication:
 * inverse[b] == round-up of 2^32/b, so (a*inverse[b])>>32 == a/b
 * within the range documented below. Entry 0 is unused (division by 0),
 * entry 1 is clamped to UINT32_MAX. */
108 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
109 UINT32 inverse[256]={
110 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
111 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
112 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
113 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
114 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
115 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
116 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
117 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
118 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
119 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
120 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
121 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
122 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
123 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
124 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
125 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
126 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
127 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
128 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
129 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
130 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
131 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
132 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
133 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
134 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
135 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
136 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
137 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
138 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
139 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
140 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
141 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
144 /* used to skip zeros at the end */
145 UINT8 zigzag_end[64];
/* Coefficient index permutation selected by dsputil_init() (identity,
 * simple_mmx_permutation, or the libmpeg2-style transpose). */
147 UINT8 permutation[64];
148 //UINT8 invPermutation[64];
/* Fills zigzag_end[]: for each scan position, one past the highest
 * (raster-order) coefficient index reached so far along zigzag_direct,
 * i.e. a running maximum + 1. Used to skip trailing zeros. */
150 static void build_zigzag_end()
153 int lastIndexAfterPerm=0;
154 for(lastIndex=0; lastIndex<64; lastIndex++)
156 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
157 lastIndexAfterPerm= zigzag_direct[lastIndex];
158 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
/* C reference for get_pixels: copy an 8x8 block of unsigned pixels into
 * a DCTELEM block (widening to 16 bits). Body elided in this excerpt. */
162 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
168 /* read the pixels */
/* C reference for diff_pixels: per-pixel difference s1 - s2 of an 8-wide
 * row into the DCTELEM block; the surrounding per-row loop is elided in
 * this excerpt. */
185 void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
189 /* read the pixels */
192 p[0] = s1[0] - s2[0];
193 p[1] = s1[1] - s2[1];
194 p[2] = s1[2] - s2[2];
195 p[3] = s1[3] - s2[3];
196 p[4] = s1[4] - s2[4];
197 p[5] = s1[5] - s2[5];
198 p[6] = s1[6] - s2[6];
199 p[7] = s1[7] - s2[7];
/* C reference for put_pixels_clamped: store DCTELEM block values as
 * pixels, clamped to [0,255] via the biased cropTbl lookup. Body elided
 * in this excerpt. */
207 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
212 UINT8 *cm = cropTbl + MAX_NEG_CROP;
214 /* read the pixels */
/* C reference for add_pixels_clamped: add DCTELEM residuals to the
 * existing pixels, clamping through cropTbl (biased by MAX_NEG_CROP so
 * negative sums index the low guard band). Row loop elided here. */
231 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
236 UINT8 *cm = cropTbl + MAX_NEG_CROP;
238 /* read the pixels */
242 pix[0] = cm[pix[0] + p[0]];
243 pix[1] = cm[pix[1] + p[1]];
244 pix[2] = cm[pix[2] + p[2]];
245 pix[3] = cm[pix[3] + p[3]];
246 pix[4] = cm[pix[4] + p[4]];
247 pix[5] = cm[pix[5] + p[5]];
248 pix[6] = cm[pix[6] + p[6]];
249 pix[7] = cm[pix[7] + p[7]];
/* Unaligned-capable word loads via plain pointer casts. NOTE(review):
 * this is UB on strict-alignment targets and violates strict aliasing —
 * hence the FIXME below. */
255 //FIXME someone with an alignment picky cpu should change these
257 #define LD32(a) (*((uint32_t*)(a)))
258 #define LD64(a) (*((uint64_t*)(a)))
/* 64-bit variant of the half-pel pixel-op generator. For a given OPNAME
 * and OP (put or avg), expands to copy, x-half, y-half and xy-half pixel
 * functions working on 8 pixels per 64-bit load, plus rounding and
 * no-rounding ("no_rnd") versions and 4-entry dispatch tables indexed by
 * the half-pel phase. The bit tricks:
 *   rounding avg:    (a|b) - (((a^b)&0xFEFE..)>>1)  == (a+b+1)>>1 per byte
 *   no-round avg:    (a&b) + (((a^b)&0xFEFE..)>>1)  == (a+b)>>1   per byte
 * and the xy2 filters split each byte into low 2 bits (l0/l1) and high
 * 6 bits (h0/h1) to average four neighbours without inter-byte carries.
 * (Several continuation lines of this macro are elided in this excerpt;
 * no comments are added inside the macro body to keep the `\`
 * continuations intact.) */
262 #define PIXOP2(OPNAME, OP) \
263 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
267 OP(*((uint64_t*)block), LD64(pixels));\
273 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
277 const uint64_t a= LD64(pixels );\
278 const uint64_t b= LD64(pixels+1);\
279 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
285 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
289 const uint64_t a= LD64(pixels );\
290 const uint64_t b= LD64(pixels+1);\
291 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
297 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
301 const uint64_t a= LD64(pixels );\
302 const uint64_t b= LD64(pixels+line_size);\
303 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
309 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
313 const uint64_t a= LD64(pixels );\
314 const uint64_t b= LD64(pixels+line_size);\
315 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
321 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
324 const uint64_t a= LD64(pixels );\
325 const uint64_t b= LD64(pixels+1);\
326 uint64_t l0= (a&0x0303030303030303ULL)\
327 + (b&0x0303030303030303ULL)\
328 + 0x0202020202020202ULL;\
329 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
330 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
334 for(i=0; i<h; i+=2){\
335 uint64_t a= LD64(pixels );\
336 uint64_t b= LD64(pixels+1);\
337 l1= (a&0x0303030303030303ULL)\
338 + (b&0x0303030303030303ULL);\
339 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
340 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
341 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
346 l0= (a&0x0303030303030303ULL)\
347 + (b&0x0303030303030303ULL)\
348 + 0x0202020202020202ULL;\
349 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
350 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
351 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
357 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
360 const uint64_t a= LD64(pixels );\
361 const uint64_t b= LD64(pixels+1);\
362 uint64_t l0= (a&0x0303030303030303ULL)\
363 + (b&0x0303030303030303ULL)\
364 + 0x0101010101010101ULL;\
365 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
366 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
370 for(i=0; i<h; i+=2){\
371 uint64_t a= LD64(pixels );\
372 uint64_t b= LD64(pixels+1);\
373 l1= (a&0x0303030303030303ULL)\
374 + (b&0x0303030303030303ULL);\
375 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
376 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
377 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
382 l0= (a&0x0303030303030303ULL)\
383 + (b&0x0303030303030303ULL)\
384 + 0x0101010101010101ULL;\
385 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
386 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
387 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
393 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
395 OPNAME ## _pixels_x2,\
396 OPNAME ## _pixels_y2,\
397 OPNAME ## _pixels_xy2,\
400 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
402 OPNAME ## _no_rnd_pixels_x2,\
403 OPNAME ## _no_rnd_pixels_y2,\
404 OPNAME ## _no_rnd_pixels_xy2,\
/* per-byte rounding average of a and b, done in one 64-bit register */
407 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
408 #else // 64 bit variant
/* 32-bit variant of the same generator: identical per-byte averaging
 * tricks, but processing 4 pixels per 32-bit load (the plain copy does
 * two 32-bit stores per row). See the 64-bit variant above for the
 * explanation of the rounding / no-rounding and xy2 bit arithmetic.
 * (Several continuation lines are elided in this excerpt; no comments
 * are added inside the macro body to preserve the `\` continuations.) */
410 #define PIXOP2(OPNAME, OP) \
411 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
415 OP(*((uint32_t*)(block )), LD32(pixels ));\
416 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
422 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
428 const uint32_t a= LD32(pixels );\
429 const uint32_t b= LD32(pixels+1);\
430 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
434 pixels+=line_size-8;\
435 block +=line_size-8;\
439 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
445 const uint32_t a= LD32(pixels );\
446 const uint32_t b= LD32(pixels+1);\
447 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
451 pixels+=line_size-8;\
452 block +=line_size-8;\
456 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
462 const uint32_t a= LD32(pixels );\
463 const uint32_t b= LD32(pixels+line_size);\
464 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
468 pixels+=line_size-8;\
469 block +=line_size-8;\
473 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
479 const uint32_t a= LD32(pixels );\
480 const uint32_t b= LD32(pixels+line_size);\
481 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
485 pixels+=line_size-8;\
486 block +=line_size-8;\
490 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
495 const uint32_t a= LD32(pixels );\
496 const uint32_t b= LD32(pixels+1);\
497 uint32_t l0= (a&0x03030303UL)\
500 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
501 + ((b&0xFCFCFCFCUL)>>2);\
505 for(i=0; i<h; i+=2){\
506 uint32_t a= LD32(pixels );\
507 uint32_t b= LD32(pixels+1);\
508 l1= (a&0x03030303UL)\
510 h1= ((a&0xFCFCFCFCUL)>>2)\
511 + ((b&0xFCFCFCFCUL)>>2);\
512 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
517 l0= (a&0x03030303UL)\
520 h0= ((a&0xFCFCFCFCUL)>>2)\
521 + ((b&0xFCFCFCFCUL)>>2);\
522 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
526 pixels+=4-line_size*(h+1);\
527 block +=4-line_size*h;\
531 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
536 const uint32_t a= LD32(pixels );\
537 const uint32_t b= LD32(pixels+1);\
538 uint32_t l0= (a&0x03030303UL)\
541 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
542 + ((b&0xFCFCFCFCUL)>>2);\
546 for(i=0; i<h; i+=2){\
547 uint32_t a= LD32(pixels );\
548 uint32_t b= LD32(pixels+1);\
549 l1= (a&0x03030303UL)\
551 h1= ((a&0xFCFCFCFCUL)>>2)\
552 + ((b&0xFCFCFCFCUL)>>2);\
553 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
558 l0= (a&0x03030303UL)\
561 h0= ((a&0xFCFCFCFCUL)>>2)\
562 + ((b&0xFCFCFCFCUL)>>2);\
563 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
567 pixels+=4-line_size*(h+1);\
568 block +=4-line_size*h;\
572 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
574 OPNAME ## _pixels_x2,\
575 OPNAME ## _pixels_y2,\
576 OPNAME ## _pixels_xy2,\
579 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
581 OPNAME ## _no_rnd_pixels_x2,\
582 OPNAME ## _no_rnd_pixels_y2,\
583 OPNAME ## _no_rnd_pixels_xy2,\
/* per-byte rounding average (32-bit register width) */
585 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
/* plain store, used to instantiate the "put" family */
588 #define op_put(a, b) a = b
595 /* FIXME this stuff could be removed as it's not really used anymore */
/* Legacy scalar pixel-op generator: per-pixel OP over 8-wide rows for
 * full-pel, x-half (avg2), y-half (avg2 with next row) and xy-half
 * (avg4 of the 2x2 neighbourhood), plus a 4-entry dispatch table.
 * Instantiated below with rounding and non-rounding avg2/avg4 macros.
 * (Loop/brace lines are elided in this excerpt; no comments are added
 * inside the macro body to preserve the `\` continuations.) */
596 #define PIXOP(BTYPE, OPNAME, OP, INCR) \
598 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
619 static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
627 OP(p[0], avg2(pix[0], pix[1])); \
628 OP(p[1], avg2(pix[1], pix[2])); \
629 OP(p[2], avg2(pix[2], pix[3])); \
630 OP(p[3], avg2(pix[3], pix[4])); \
631 OP(p[4], avg2(pix[4], pix[5])); \
632 OP(p[5], avg2(pix[5], pix[6])); \
633 OP(p[6], avg2(pix[6], pix[7])); \
634 OP(p[7], avg2(pix[7], pix[8])); \
640 static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
648 pix1 = pixels + line_size; \
650 OP(p[0], avg2(pix[0], pix1[0])); \
651 OP(p[1], avg2(pix[1], pix1[1])); \
652 OP(p[2], avg2(pix[2], pix1[2])); \
653 OP(p[3], avg2(pix[3], pix1[3])); \
654 OP(p[4], avg2(pix[4], pix1[4])); \
655 OP(p[5], avg2(pix[5], pix1[5])); \
656 OP(p[6], avg2(pix[6], pix1[6])); \
657 OP(p[7], avg2(pix[7], pix1[7])); \
664 static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
672 pix1 = pixels + line_size; \
674 OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
675 OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
676 OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
677 OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
678 OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
679 OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
680 OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
681 OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
688 void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
690 OPNAME ## _pixels_x2, \
691 OPNAME ## _pixels_y2, \
692 OPNAME ## _pixels_xy2, \
696 /* rounding primitives */
697 #define avg2(a,b) ((a+b+1)>>1)
698 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
700 #define op_avg(a, b) a = avg2(a, b)
701 #define op_sub(a, b) a -= b
703 PIXOP(DCTELEM, sub, op_sub, 8)
705 /* not rounding primitives */
708 #define avg2(a,b) ((a+b)>>1)
709 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
711 /* motion estimation */
715 #define avg2(a,b) ((a+b+1)>>1)
716 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
718 /* end of removable stuff */
/* Global motion compensation, 1 warp point: bilinear interpolation at
 * 1/16-pel precision. A..D are the four bilinear weights (they sum to
 * 256, hence the >>8); `rounder` selects the rounding bias. The per-row
 * loop and pointer advance are elided in this excerpt. */
720 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
722 const int A=(16-x16)*(16-y16);
723 const int B=( x16)*(16-y16);
724 const int C=(16-x16)*( y16);
725 const int D=( x16)*( y16);
727 rounder= 128 - rounder;
731 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
732 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
733 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
734 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
735 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
736 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
737 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
738 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
/* Horizontal 8-tap lowpass for quarter-pel MC: per output pixel a
 * (20,-6,3,-1) filter on symmetric neighbour pairs, rounded by r and
 * clamped via cropTbl. Taps near the row edges reuse in-range samples
 * instead of reading past src[8] (edge mirroring). Per-row loop elided
 * in this excerpt. */
744 static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
746 UINT8 *cm = cropTbl + MAX_NEG_CROP;
750 dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
751 dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
752 dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
753 dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
754 dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
755 dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
756 dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
757 dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
/* Vertical counterpart of qpel_h_lowpass: same (20,-6,3,-1) filter
 * applied down a column (stride srcStride), with the same edge-sample
 * reuse at the bottom. Per-column loop elided in this excerpt. */
763 static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
765 UINT8 *cm = cropTbl + MAX_NEG_CROP;
769 const int src0= src[0*srcStride];
770 const int src1= src[1*srcStride];
771 const int src2= src[2*srcStride];
772 const int src3= src[3*srcStride];
773 const int src4= src[4*srcStride];
774 const int src5= src[5*srcStride];
775 const int src6= src[6*srcStride];
776 const int src7= src[7*srcStride];
777 const int src8= src[8*srcStride];
778 dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
779 dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
780 dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
781 dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
782 dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
783 dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
784 dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
785 dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
/* Straight 8x8 block copy; body elided in this excerpt. */
791 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
/* Per-pixel average of two 8-wide rows with rounding bias r ((x+y+r)>>1);
 * used to blend full-pel and half-pel planes in the QPEL_MC functions.
 * Row loop elided in this excerpt. */
809 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
814 dst[0]= (src1[0] + src2[0] + r)>>1;
815 dst[1]= (src1[1] + src2[1] + r)>>1;
816 dst[2]= (src1[2] + src2[2] + r)>>1;
817 dst[3]= (src1[3] + src2[3] + r)>>1;
818 dst[4]= (src1[4] + src2[4] + r)>>1;
819 dst[5]= (src1[5] + src2[5] + r)>>1;
820 dst[6]= (src1[6] + src2[6] + r)>>1;
821 dst[7]= (src1[7] + src2[7] + r)>>1;
/* Per-pixel average of four 8-wide rows with rounding bias r
 * ((a+b+c+d+r)>>2); row loop elided in this excerpt. */
828 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
833 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
834 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
835 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
836 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
837 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
838 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
839 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
840 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
/* Generator for the 16 quarter-pel motion-compensation functions
 * qpel_mcXY (X = horizontal phase 0..3, Y = vertical phase 0..3):
 * each builds the needed half-pel planes with qpel_h_lowpass /
 * qpel_v_lowpass into 8x8 (or 8x9) temporaries and blends them with
 * avg2_block/avg4_block. `r` selects rounding (16-r / 1-r / 2-r biases),
 * so the macro is instantiated once per rounding mode; the phase-indexed
 * dispatch table qpel_mc<name>_tab is emitted at the end. Local
 * temporary declarations are elided in this excerpt; no comments are
 * added inside the macro body to preserve the `\` continuations. */
850 #define QPEL_MC(r, name) \
851 static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
852 put_block(dst, src, dstStride, srcStride);\
855 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
858 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
859 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
862 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
864 qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
867 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
870 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
871 avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
874 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
877 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
878 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
881 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
883 qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
886 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
889 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
890 avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
892 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
897 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
898 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
899 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
900 avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
902 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
907 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
908 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
909 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
910 avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
912 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
917 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
918 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
919 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
920 avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
922 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
927 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
928 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
929 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
930 avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
932 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
936 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
937 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
938 avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
940 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
944 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
945 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
946 avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
948 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
953 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
954 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
955 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
956 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
958 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
963 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
964 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
965 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
966 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
968 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
971 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
972 qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
974 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
975 qpel_mc00_c ## name, \
976 qpel_mc10_c ## name, \
977 qpel_mc20_c ## name, \
978 qpel_mc30_c ## name, \
979 qpel_mc01_c ## name, \
980 qpel_mc11_c ## name, \
981 qpel_mc21_c ## name, \
982 qpel_mc31_c ## name, \
983 qpel_mc02_c ## name, \
984 qpel_mc12_c ## name, \
985 qpel_mc22_c ## name, \
986 qpel_mc32_c ## name, \
987 qpel_mc03_c ## name, \
988 qpel_mc13_c ## name, \
989 qpel_mc23_c ## name, \
990 qpel_mc33_c ## name, \
/* SAD of a 16-wide row against the reference, full-pel; the 16-row loop
 * and pointer advance are elided in this excerpt. Accumulates into s. */
996 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1002 s += abs(pix1[0] - pix2[0]);
1003 s += abs(pix1[1] - pix2[1]);
1004 s += abs(pix1[2] - pix2[2]);
1005 s += abs(pix1[3] - pix2[3]);
1006 s += abs(pix1[4] - pix2[4]);
1007 s += abs(pix1[5] - pix2[5]);
1008 s += abs(pix1[6] - pix2[6]);
1009 s += abs(pix1[7] - pix2[7]);
1010 s += abs(pix1[8] - pix2[8]);
1011 s += abs(pix1[9] - pix2[9]);
1012 s += abs(pix1[10] - pix2[10]);
1013 s += abs(pix1[11] - pix2[11]);
1014 s += abs(pix1[12] - pix2[12]);
1015 s += abs(pix1[13] - pix2[13]);
1016 s += abs(pix1[14] - pix2[14]);
1017 s += abs(pix1[15] - pix2[15]);
/* SAD against the horizontally half-pel interpolated reference
 * (avg2 of each pixel with its right neighbour, reading pix2[16]).
 * Row loop elided in this excerpt. */
1024 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1030 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1031 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1032 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1033 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1034 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1035 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1036 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1037 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1038 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1039 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1040 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1041 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1042 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1043 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1044 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1045 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD against the vertically half-pel interpolated reference
 * (avg2 of each pixel with the one directly below, via pix3).
 * Row loop elided in this excerpt. */
1052 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1055 UINT8 *pix3 = pix2 + line_size;
1059 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1060 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1061 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1062 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1063 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1064 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1065 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1066 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1067 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1068 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1069 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1070 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1071 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1072 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1073 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1074 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD against the diagonally half-pel interpolated reference
 * (avg4 of the 2x2 neighbourhood spanning this row and the next).
 * Row loop elided in this excerpt. */
1082 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1085 UINT8 *pix3 = pix2 + line_size;
1089 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1090 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1091 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1092 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1093 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1094 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1095 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1096 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1097 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1098 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1099 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1100 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1101 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1102 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1103 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1104 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8x8 full-pel SAD; same structure as pix_abs16x16_c but 8 wide.
 * Row loop elided in this excerpt. */
1112 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1118 s += abs(pix1[0] - pix2[0]);
1119 s += abs(pix1[1] - pix2[1]);
1120 s += abs(pix1[2] - pix2[2]);
1121 s += abs(pix1[3] - pix2[3]);
1122 s += abs(pix1[4] - pix2[4]);
1123 s += abs(pix1[5] - pix2[5]);
1124 s += abs(pix1[6] - pix2[6]);
1125 s += abs(pix1[7] - pix2[7]);
/* 8x8 SAD against the horizontally half-pel reference (avg2 with right
 * neighbour, reading pix2[8]). Row loop elided in this excerpt. */
1132 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1138 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1139 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1140 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1141 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1142 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1143 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1144 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1145 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8x8 SAD against the vertically half-pel reference (avg2 with the
 * pixel below, via pix3). Row loop elided in this excerpt. */
1152 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1155 UINT8 *pix3 = pix2 + line_size;
1159 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1160 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1161 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1162 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1163 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1164 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1165 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1166 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8x8 SAD against the diagonally half-pel reference (avg4 of the 2x2
 * neighbourhood). Row loop elided in this excerpt. */
1174 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1177 UINT8 *pix3 = pix2 + line_size;
1181 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1182 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1183 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1184 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1185 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1186 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1187 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1188 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1196 /* permute block according so that it corresponds to the MMX idct
/* Two conditionally-compiled variants: a general table-driven
 * permutation through a temporary, and a hand-unrolled swap version.
 * Both bodies are partially elided in this excerpt. */
1199 /* general permutation, but perhaps slightly slower */
1200 void block_permute(INT16 *block)
1205 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1207 for(i=0; i<64; i++) block[i] = temp[i];
1211 void block_permute(INT16 *block)
1213 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
/* Zero all six 64-coefficient blocks of a macroblock in one memset. */
1234 void clear_blocks_c(DCTELEM *blocks)
1236 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* One-time initialisation: builds cropTbl/squareTbl, installs the C
 * implementations in the dispatch pointers, lets platform-specific init
 * override them, then selects and applies the coefficient permutation
 * to the scan tables and default quant matrices. Several lines
 * (conditional compilation, loop braces) are elided in this excerpt. */
1239 void dsputil_init(void)
1242 int use_permuted_idct;
/* clamp table: identity in the middle, 0 / 255 in the guard bands */
1244 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1245 for(i=0;i<MAX_NEG_CROP;i++) {
1247 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* squareTbl[i+256] = i*i for i in [-256,255] */
1250 for(i=0;i<512;i++) {
1251 squareTbl[i] = (i - 256) * (i - 256);
1255 ff_idct = simple_idct;
1257 ff_idct = j_rev_dct;
/* default C implementations; platform init below may replace them */
1259 get_pixels = get_pixels_c;
1260 diff_pixels = diff_pixels_c;
1261 put_pixels_clamped = put_pixels_clamped_c;
1262 add_pixels_clamped = add_pixels_clamped_c;
1264 clear_blocks= clear_blocks_c;
1266 pix_abs16x16 = pix_abs16x16_c;
1267 pix_abs16x16_x2 = pix_abs16x16_x2_c;
1268 pix_abs16x16_y2 = pix_abs16x16_y2_c;
1269 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1270 pix_abs8x8 = pix_abs8x8_c;
1271 pix_abs8x8_x2 = pix_abs8x8_x2_c;
1272 pix_abs8x8_y2 = pix_abs8x8_y2_c;
1273 pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1274 av_fdct = jpeg_fdct_ifast;
1276 use_permuted_idct = 1;
/* platform-specific overrides (conditional compilation elided here) */
1282 dsputil_init_armv4l();
1285 dsputil_init_mlib();
1286 use_permuted_idct = 0;
1289 dsputil_init_alpha();
1290 use_permuted_idct = 0;
1294 if(ff_idct == simple_idct) use_permuted_idct=0;
/* choose the coefficient permutation matching the selected IDCT */
1297 if(use_permuted_idct)
1299 for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1301 for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1304 for(i=0; i<64; i++) permutation[i]=i;
/* MMX-quantizer tables are built from the unpermuted scan */
1306 for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1307 for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1309 if (use_permuted_idct) {
1310 /* permute for IDCT */
1312 j = zigzag_direct[i];
1313 zigzag_direct[i] = block_permute_op(j);
1314 j = ff_alternate_horizontal_scan[i];
1315 ff_alternate_horizontal_scan[i] = block_permute_op(j);
1316 j = ff_alternate_vertical_scan[i];
1317 ff_alternate_vertical_scan[i] = block_permute_op(j);
1319 block_permute(default_intra_matrix);
1320 block_permute(default_non_intra_matrix);
1321 block_permute(ff_mpeg4_default_intra_matrix);
1322 block_permute(ff_mpeg4_default_non_intra_matrix);
1328 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1329 int orig_linesize[3], int coded_linesize,
1330 AVCodecContext *avctx)
1332 int quad, diff, x, y;
1333 UINT8 *orig, *coded;
1334 UINT32 *sq = squareTbl + 256;
1340 orig = orig_image[0];
1341 coded = coded_image[0];
1343 for (y=0;y<avctx->height;y++) {
1344 for (x=0;x<avctx->width;x++) {
1345 diff = *(orig + x) - *(coded + x);
1348 orig += orig_linesize[0];
1349 coded += coded_linesize;
1352 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1354 if (avctx->psnr_y) {
1355 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1356 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1358 avctx->psnr_y = 99.99;