3 * Copyright (c) 2000, 2001 Gerard Lantau.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 * gmc & q-pel support by Michael Niedermayer <michaelni@gmx.at>
26 #include "simple_idct.h"
/*
 * Function pointers installed by dsputil_init(); they default to the
 * portable C implementations in this file and may be overridden by the
 * CPU-specific init hooks (dsputil_init_armv4l()/dsputil_init_alpha(),
 * called from dsputil_init()).
 */
void (*ff_idct)(DCTELEM *block);
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
void (*clear_blocks)(DCTELEM *blocks);

/* sum-of-absolute-differences comparators used by motion estimation:
 * full-pel and horizontal/vertical/diagonal half-pel variants,
 * for 16x16 and 8x8 blocks */
op_pixels_abs_func pix_abs16x16;
op_pixels_abs_func pix_abs16x16_x2;
op_pixels_abs_func pix_abs16x16_y2;
op_pixels_abs_func pix_abs16x16_xy2;

op_pixels_abs_func pix_abs8x8;
op_pixels_abs_func pix_abs8x8_x2;
op_pixels_abs_func pix_abs8x8_y2;
op_pixels_abs_func pix_abs8x8_xy2;

/* clamp table: entries MAX_NEG_CROP..MAX_NEG_CROP+255 are the identity,
 * the upper margin is set to 255 in dsputil_init() (lower margin is
 * presumably 0 — that line is not visible in this chunk; confirm) */
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
/* squareTbl[i] = (i-256)^2, so (squareTbl+256)[d] = d*d for -256<=d<256;
 * filled in dsputil_init(), used e.g. by get_psnr() */
UINT32 squareTbl[512];

/* default quantization matrices (defined elsewhere); permuted in place by
 * dsputil_init() when a permuting IDCT is selected */
extern UINT16 default_intra_matrix[64];
extern UINT16 default_non_intra_matrix[64];
extern UINT16 ff_mpeg4_default_intra_matrix[64];
extern UINT16 ff_mpeg4_default_non_intra_matrix[64];
54 UINT8 zigzag_direct[64] = {
55 0, 1, 8, 16, 9, 2, 3, 10,
56 17, 24, 32, 25, 18, 11, 4, 5,
57 12, 19, 26, 33, 40, 48, 41, 34,
58 27, 20, 13, 6, 7, 14, 21, 28,
59 35, 42, 49, 56, 57, 50, 43, 36,
60 29, 22, 15, 23, 30, 37, 44, 51,
61 58, 59, 52, 45, 38, 31, 39, 46,
62 53, 60, 61, 54, 47, 55, 62, 63
/* non-permuted inverse of zigzag_direct, +1, for the MMX quantizer;
   filled in dsputil_init() */
UINT16 __align8 inv_zigzag_direct16[64];

/* non-permuted copy of zigzag_direct for the MMX quantizer; saved in
   dsputil_init() before zigzag_direct itself may be permuted */
UINT8 zigzag_direct_noperm[64];
71 UINT8 ff_alternate_horizontal_scan[64] = {
72 0, 1, 2, 3, 8, 9, 16, 17,
73 10, 11, 4, 5, 6, 7, 15, 14,
74 13, 12, 19, 18, 24, 25, 32, 33,
75 26, 27, 20, 21, 22, 23, 28, 29,
76 30, 31, 34, 35, 40, 41, 48, 49,
77 42, 43, 36, 37, 38, 39, 44, 45,
78 46, 47, 50, 51, 56, 57, 58, 59,
79 52, 53, 54, 55, 60, 61, 62, 63,
82 UINT8 ff_alternate_vertical_scan[64] = {
83 0, 8, 16, 24, 1, 9, 2, 10,
84 17, 25, 32, 40, 48, 56, 57, 49,
85 41, 33, 26, 18, 3, 11, 4, 12,
86 19, 27, 34, 42, 50, 58, 35, 43,
87 51, 59, 20, 28, 5, 13, 6, 14,
88 21, 29, 36, 44, 52, 60, 37, 45,
89 53, 61, 22, 30, 7, 15, 23, 31,
90 38, 46, 54, 62, 39, 47, 55, 63,
95 /* Input permutation for the simple_idct_mmx */
96 static UINT8 simple_mmx_permutation[64]={
97 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
98 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
99 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
100 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
101 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
102 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
103 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
104 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
108 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
109 UINT32 inverse[256]={
110 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
111 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
112 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
113 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
114 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
115 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
116 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
117 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
118 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
119 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
120 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
121 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
122 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
123 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
124 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
125 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
126 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
127 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
128 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
129 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
130 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
131 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
132 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
133 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
134 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
135 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
136 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
137 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
138 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
139 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
140 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
141 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
/* zigzag_end[i] = number of coefficients covered after scan position i;
   used to skip zeros at the end (built by build_zigzag_end()) */
UINT8 zigzag_end[64];

/* active coefficient permutation; filled in dsputil_init() (identity when
   no permuting IDCT is selected) */
UINT8 permutation[64];
//UINT8 invPermutation[64];
150 static void build_zigzag_end()
153 int lastIndexAfterPerm=0;
154 for(lastIndex=0; lastIndex<64; lastIndex++)
156 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
157 lastIndexAfterPerm= zigzag_direct[lastIndex];
158 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
162 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
168 /* read the pixels */
185 void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
189 /* read the pixels */
192 p[0] = s1[0] - s2[0];
193 p[1] = s1[1] - s2[1];
194 p[2] = s1[2] - s2[2];
195 p[3] = s1[3] - s2[3];
196 p[4] = s1[4] - s2[4];
197 p[5] = s1[5] - s2[5];
198 p[6] = s1[6] - s2[6];
199 p[7] = s1[7] - s2[7];
207 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
212 UINT8 *cm = cropTbl + MAX_NEG_CROP;
214 /* read the pixels */
231 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
236 UINT8 *cm = cropTbl + MAX_NEG_CROP;
238 /* read the pixels */
242 pix[0] = cm[pix[0] + p[0]];
243 pix[1] = cm[pix[1] + p[1]];
244 pix[2] = cm[pix[2] + p[2]];
245 pix[3] = cm[pix[3] + p[3]];
246 pix[4] = cm[pix[4] + p[4]];
247 pix[5] = cm[pix[5] + p[5]];
248 pix[6] = cm[pix[6] + p[6]];
249 pix[7] = cm[pix[7] + p[7]];
/*
 * PIXOP: generates a family of four 8-pixel-wide row primitives for a
 * given element type (BTYPE), name prefix (OPNAME), per-pixel operation
 * (OP: put/avg/sub) and destination row increment (INCR):
 *   _pixels     : full-pel copy/op
 *   _pixels_x2  : horizontal half-pel (average with right neighbour)
 *   _pixels_y2  : vertical half-pel (average with next row)
 *   _pixels_xy2 : diagonal half-pel (4-tap average)
 * plus a dispatch table OPNAME_pixels_tab[4] indexed by the half-pel case.
 * NOTE(review): this chunk is missing lines of the macro (loop headers,
 * braces, the table's opening entry and terminator) — the code below is
 * kept byte-identical to the truncated original; confirm against the
 * full file before relying on it.
 */
#define PIXOP(BTYPE, OPNAME, OP, INCR) \
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
OP(p[0], avg2(pix[0], pix[1])); \
OP(p[1], avg2(pix[1], pix[2])); \
OP(p[2], avg2(pix[2], pix[3])); \
OP(p[3], avg2(pix[3], pix[4])); \
OP(p[4], avg2(pix[4], pix[5])); \
OP(p[5], avg2(pix[5], pix[6])); \
OP(p[6], avg2(pix[6], pix[7])); \
OP(p[7], avg2(pix[7], pix[8])); \
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
pix1 = pixels + line_size; \
OP(p[0], avg2(pix[0], pix1[0])); \
OP(p[1], avg2(pix[1], pix1[1])); \
OP(p[2], avg2(pix[2], pix1[2])); \
OP(p[3], avg2(pix[3], pix1[3])); \
OP(p[4], avg2(pix[4], pix1[4])); \
OP(p[5], avg2(pix[5], pix1[5])); \
OP(p[6], avg2(pix[6], pix1[6])); \
OP(p[7], avg2(pix[7], pix1[7])); \
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
pix1 = pixels + line_size; \
OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
OPNAME ## _pixels_x2, \
OPNAME ## _pixels_y2, \
OPNAME ## _pixels_xy2, \
/* rounding primitives: +1/+2 bias gives round-to-nearest averages */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

#define op_put(a, b) a = b
#define op_avg(a, b) a = avg2(a, b)
#define op_sub(a, b) a -= b

/* instantiate the rounding copy/average/subtract primitive families */
PIXOP(UINT8, put, op_put, line_size)
PIXOP(UINT8, avg, op_avg, line_size)

PIXOP(DCTELEM, sub, op_sub, 8)

/* not rounding primitives (truncating averages, for no_rnd prediction) */
/* NOTE(review): avg2/avg4 are redefined here and again below — the full
   file presumably #undefs them in between; those lines are not visible
   in this chunk. */
#define avg2(a,b) ((a+b)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)

PIXOP(UINT8, put_no_rnd, op_put, line_size)
PIXOP(UINT8, avg_no_rnd, op_avg, line_size)

/* motion estimation: back to rounding averages */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
384 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
386 const int A=(16-x16)*(16-y16);
387 const int B=( x16)*(16-y16);
388 const int C=(16-x16)*( y16);
389 const int D=( x16)*( y16);
391 rounder= 128 - rounder;
395 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
396 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
397 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
398 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
399 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
400 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
401 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
402 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
408 static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
410 UINT8 *cm = cropTbl + MAX_NEG_CROP;
414 dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
415 dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
416 dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
417 dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
418 dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
419 dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
420 dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
421 dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
427 static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
429 UINT8 *cm = cropTbl + MAX_NEG_CROP;
433 const int src0= src[0*srcStride];
434 const int src1= src[1*srcStride];
435 const int src2= src[2*srcStride];
436 const int src3= src[3*srcStride];
437 const int src4= src[4*srcStride];
438 const int src5= src[5*srcStride];
439 const int src6= src[6*srcStride];
440 const int src7= src[7*srcStride];
441 const int src8= src[8*srcStride];
442 dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
443 dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
444 dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
445 dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
446 dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
447 dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
448 dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
449 dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
455 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
473 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
478 dst[0]= (src1[0] + src2[0] + r)>>1;
479 dst[1]= (src1[1] + src2[1] + r)>>1;
480 dst[2]= (src1[2] + src2[2] + r)>>1;
481 dst[3]= (src1[3] + src2[3] + r)>>1;
482 dst[4]= (src1[4] + src2[4] + r)>>1;
483 dst[5]= (src1[5] + src2[5] + r)>>1;
484 dst[6]= (src1[6] + src2[6] + r)>>1;
485 dst[7]= (src1[7] + src2[7] + r)>>1;
492 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
497 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
498 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
499 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
500 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
501 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
502 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
503 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
504 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
/*
 * QPEL_MC: generates the 16 quarter-pel motion-compensation functions
 * qpel_mcXY_c<name> (X = horizontal quarter-pel phase 0..3, Y = vertical
 * phase 0..3) plus the dispatch table qpel_mc<name>_tab[16], for a given
 * rounding mode r (0 = rounding, 1 = no-rounding).  Each function
 * combines qpel_h_lowpass()/qpel_v_lowpass() half-pel intermediates with
 * avg2_block()/avg4_block() to synthesize the quarter-pel positions.
 * NOTE(review): this chunk is missing lines of the macro (function-body
 * braces, the local half/halfH/halfV/halfHV temporary-buffer
 * declarations, and the table terminator) — the code below is kept
 * byte-identical to the truncated original; confirm against the full
 * file before relying on it.
 */
#define QPEL_MC(r, name) \
static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    put_block(dst, src, dstStride, srcStride);\
static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
/* dispatch table indexed by (my<<2)|mx quarter-pel phase */ \
qpel_mc_func qpel_mc ## name ## _tab[16]={ \
    qpel_mc00_c ## name, \
    qpel_mc10_c ## name, \
    qpel_mc20_c ## name, \
    qpel_mc30_c ## name, \
    qpel_mc01_c ## name, \
    qpel_mc11_c ## name, \
    qpel_mc21_c ## name, \
    qpel_mc31_c ## name, \
    qpel_mc02_c ## name, \
    qpel_mc12_c ## name, \
    qpel_mc22_c ## name, \
    qpel_mc32_c ## name, \
    qpel_mc03_c ## name, \
    qpel_mc13_c ## name, \
    qpel_mc23_c ## name, \
    qpel_mc33_c ## name, \
660 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
666 s += abs(pix1[0] - pix2[0]);
667 s += abs(pix1[1] - pix2[1]);
668 s += abs(pix1[2] - pix2[2]);
669 s += abs(pix1[3] - pix2[3]);
670 s += abs(pix1[4] - pix2[4]);
671 s += abs(pix1[5] - pix2[5]);
672 s += abs(pix1[6] - pix2[6]);
673 s += abs(pix1[7] - pix2[7]);
674 s += abs(pix1[8] - pix2[8]);
675 s += abs(pix1[9] - pix2[9]);
676 s += abs(pix1[10] - pix2[10]);
677 s += abs(pix1[11] - pix2[11]);
678 s += abs(pix1[12] - pix2[12]);
679 s += abs(pix1[13] - pix2[13]);
680 s += abs(pix1[14] - pix2[14]);
681 s += abs(pix1[15] - pix2[15]);
688 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
694 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
695 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
696 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
697 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
698 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
699 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
700 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
701 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
702 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
703 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
704 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
705 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
706 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
707 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
708 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
709 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
716 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
719 UINT8 *pix3 = pix2 + line_size;
723 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
724 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
725 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
726 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
727 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
728 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
729 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
730 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
731 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
732 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
733 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
734 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
735 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
736 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
737 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
738 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
746 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
749 UINT8 *pix3 = pix2 + line_size;
753 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
754 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
755 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
756 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
757 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
758 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
759 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
760 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
761 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
762 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
763 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
764 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
765 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
766 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
767 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
768 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
776 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
782 s += abs(pix1[0] - pix2[0]);
783 s += abs(pix1[1] - pix2[1]);
784 s += abs(pix1[2] - pix2[2]);
785 s += abs(pix1[3] - pix2[3]);
786 s += abs(pix1[4] - pix2[4]);
787 s += abs(pix1[5] - pix2[5]);
788 s += abs(pix1[6] - pix2[6]);
789 s += abs(pix1[7] - pix2[7]);
796 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
802 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
803 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
804 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
805 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
806 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
807 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
808 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
809 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
816 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
819 UINT8 *pix3 = pix2 + line_size;
823 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
824 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
825 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
826 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
827 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
828 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
829 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
830 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
838 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
841 UINT8 *pix3 = pix2 + line_size;
845 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
846 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
847 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
848 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
849 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
850 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
851 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
852 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
860 /* permute block according so that it corresponds to the MMX idct
863 /* general permutation, but perhaps slightly slower */
864 void block_permute(INT16 *block)
869 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
871 for(i=0; i<64; i++) block[i] = temp[i];
/* NOTE(review): second block_permute() definition (a hand-unrolled
   variant using tmp1..tmp6); in the full file this is presumably inside
   an alternative/disabled #if branch, and its body is truncated in this
   chunk — confirm against the full file before editing. */
void block_permute(INT16 *block)
    int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
898 void clear_blocks_c(DCTELEM *blocks)
900 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/*
 * Initialize the DSP utility layer: build the crop/square lookup
 * tables, install the portable C implementations in the function
 * pointers, let CPU-specific hooks override them, then set up the
 * coefficient permutation and (if a permuting IDCT is active) permute
 * the scan tables and default quantization matrices in place.
 * NOTE(review): this chunk is missing lines of this function (the
 * opening brace, `int i, j;`, loop braces and the #ifdef guards around
 * the ARM/Alpha hooks and the permutation choices) — code kept
 * byte-identical to the truncated original; confirm against the full
 * file.
 */
void dsputil_init(void)
    int use_permuted_idct;
    /* identity section of the clamp table */
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        /* upper margin clamps to 255 (lower-margin line not visible here) */
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
        /* squareTbl[i] = (i-256)^2 so (squareTbl+256)[d] = d*d */
        squareTbl[i] = (i - 256) * (i - 256);
    /* default (portable C) implementations */
    ff_idct = simple_idct;
    get_pixels = get_pixels_c;
    diff_pixels = diff_pixels_c;
    put_pixels_clamped = put_pixels_clamped_c;
    add_pixels_clamped = add_pixels_clamped_c;
    clear_blocks= clear_blocks_c;
    pix_abs16x16 = pix_abs16x16_c;
    pix_abs16x16_x2 = pix_abs16x16_x2_c;
    pix_abs16x16_y2 = pix_abs16x16_y2_c;
    pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
    pix_abs8x8 = pix_abs8x8_c;
    pix_abs8x8_x2 = pix_abs8x8_x2_c;
    pix_abs8x8_y2 = pix_abs8x8_y2_c;
    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
    av_fdct = jpeg_fdct_ifast;
    use_permuted_idct = 1;
    /* CPU-specific overrides (presumably inside #ifdef guards not
       visible in this chunk) */
    dsputil_init_armv4l();
    use_permuted_idct = 0;
    dsputil_init_alpha();
    use_permuted_idct = 0;
    /* the simple C IDCT needs no input permutation */
    if(ff_idct == simple_idct) use_permuted_idct=0;
    if(use_permuted_idct)
        /* simple_idct_mmx input order */
        for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
        /* classic MMX IDCT order: swap low column/row bits */
        for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        /* identity permutation */
        for(i=0; i<64; i++) permutation[i]=i;
    /* non-permuted helpers for the MMX quantizer — built before the
       scan tables may be permuted below */
    for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
    for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
    if (use_permuted_idct) {
        /* permute for IDCT */
        j = zigzag_direct[i];
        zigzag_direct[i] = block_permute_op(j);
        j = ff_alternate_horizontal_scan[i];
        ff_alternate_horizontal_scan[i] = block_permute_op(j);
        j = ff_alternate_vertical_scan[i];
        ff_alternate_vertical_scan[i] = block_permute_op(j);
        /* permute the default quantization matrices to match */
        block_permute(default_intra_matrix);
        block_permute(default_non_intra_matrix);
        block_permute(ff_mpeg4_default_intra_matrix);
        block_permute(ff_mpeg4_default_non_intra_matrix);
992 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
993 int orig_linesize[3], int coded_linesize,
994 AVCodecContext *avctx)
996 int quad, diff, x, y;
998 UINT32 *sq = squareTbl + 256;
1004 orig = orig_image[0];
1005 coded = coded_image[0];
1007 for (y=0;y<avctx->height;y++) {
1008 for (x=0;x<avctx->width;x++) {
1009 diff = *(orig + x) - *(coded + x);
1012 orig += orig_linesize[0];
1013 coded += coded_linesize;
1016 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1018 if (avctx->psnr_y) {
1019 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1020 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1022 avctx->psnr_y = 99.99;