3 * Copyright (c) 2000, 2001 Gerard Lantau.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 #include "simple_idct.h"
/* IDCT and pixel-transfer hooks. dsputil_init() assigns them
   (simple_idct, get_pixels_c, put_pixels_clamped_c, add_pixels_clamped_c). */
25 void (*ff_idct)(DCTELEM *block);
26 void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
27 void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
28 void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
/* Motion-estimation hooks. dsputil_init() assigns the matching
   pix_abs16x16 C routines defined later in this file. */
30 op_pixels_abs_func pix_abs16x16;
31 op_pixels_abs_func pix_abs16x16_x2;
32 op_pixels_abs_func pix_abs16x16_y2;
33 op_pixels_abs_func pix_abs16x16_xy2;
/* Clamp lookup table. dsputil_init() stores i at cropTbl[i + MAX_NEG_CROP]
   for 0 <= i < 256 and 255 in the MAX_NEG_CROP entries above that range.
   Users index it through a pointer offset by MAX_NEG_CROP. */
35 UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
/* dsputil_init() fills this with squareTbl[i] = (i - 256) * (i - 256). */
36 UINT32 squareTbl[512];
/* Defined elsewhere; dsputil_init() passes both to block_permute()
   when a permuted IDCT is in use. */
38 extern UINT16 default_intra_matrix[64];
39 extern UINT16 default_non_intra_matrix[64];
/* Scan-order table: entry k is the block index visited k-th
   (values look like row * 8 + column of an 8x8 block - presumably the
   classic zigzag order; confirm).
   dsputil_init() copies it to zigzag_direct_noperm, derives
   inv_zigzag_direct16 from it, and then overwrites it in place with
   block_permute_op() values when a permuted IDCT is in use. */
41 UINT8 zigzag_direct[64] = {
42 0, 1, 8, 16, 9, 2, 3, 10,
43 17, 24, 32, 25, 18, 11, 4, 5,
44 12, 19, 26, 33, 40, 48, 41, 34,
45 27, 20, 13, 6, 7, 14, 21, 28,
46 35, 42, 49, 56, 57, 50, 43, 36,
47 29, 22, 15, 23, 30, 37, 44, 51,
48 58, 59, 52, 45, 38, 31, 39, 46,
49 53, 60, 61, 54, 47, 55, 62, 63
52 /* non-permuted inverse of zigzag_direct, plus 1, for the MMX quantizer;
   dsputil_init() sets inv_zigzag_direct16[zigzag_direct[i]] = i + 1
   before zigzag_direct is permuted */
53 UINT16 __align8 inv_zigzag_direct16[64];
55 /* non-permuted copy of zigzag_direct for the MMX quantizer;
   filled by dsputil_init() before zigzag_direct is permuted */
56 UINT8 zigzag_direct_noperm[64];
/* Alternate scan-order table with 64 entries, each a block index.
   dsputil_init() rewrites every entry in place through block_permute_op()
   when a permuted IDCT is in use. */
58 UINT8 ff_alternate_horizontal_scan[64] = {
59 0, 1, 2, 3, 8, 9, 16, 17,
60 10, 11, 4, 5, 6, 7, 15, 14,
61 13, 12, 19, 18, 24, 25, 32, 33,
62 26, 27, 20, 21, 22, 23, 28, 29,
63 30, 31, 34, 35, 40, 41, 48, 49,
64 42, 43, 36, 37, 38, 39, 44, 45,
65 46, 47, 50, 51, 56, 57, 58, 59,
66 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate scan-order table with 64 entries, each a block index.
   dsputil_init() rewrites every entry in place through block_permute_op()
   when a permuted IDCT is in use. */
69 UINT8 ff_alternate_vertical_scan[64] = {
70 0, 8, 16, 24, 1, 9, 2, 10,
71 17, 25, 32, 40, 48, 56, 57, 49,
72 41, 33, 26, 18, 3, 11, 4, 12,
73 19, 27, 34, 42, 50, 58, 35, 43,
74 51, 59, 20, 28, 5, 13, 6, 14,
75 21, 29, 36, 44, 52, 60, 37, 45,
76 53, 61, 22, 30, 7, 15, 23, 31,
77 38, 46, 54, 62, 39, 47, 55, 63,
80 /* Input permutation for the simple_idct_mmx */
/* File-local table of 64 block indices. One of the fills of permutation[]
   visible in dsputil_init() copies it entry by entry. */
81 static UINT8 simple_mmx_permutation[64]={
82 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
83 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
84 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
85 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
86 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
87 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
88 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
89 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
92 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* 256 reciprocal constants indexed by divisor b. For b >= 2 the entries
   equal 2^32 / b rounded up (e.g. 1431655766 for b = 3, 16843010 for
   b = 255). Entry 0 is 0 and entry 1 is 4294967295.
   NOTE(review): the declaration line of this table is not visible in this
   listing; the name "inverse" is taken from the comment above, and the
   product must be formed in more than 32 bits for the shift to work. */
94 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
95 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
96 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
97 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
98 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
99 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
100 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
101 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
102 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
103 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
104 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
105 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
106 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
107 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
108 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
109 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
110 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
111 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
112 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
113 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
114 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
115 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
116 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
117 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
118 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
119 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
120 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
121 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
122 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
123 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
124 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
125 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
128 /* used to skip zeros at the end */
/* Filled by build_zigzag_end(): entry k is 1 + the largest
   zigzag_direct[] value seen at scan positions 0..k. */
129 UINT8 zigzag_end[64];
/* Index permutation filled by dsputil_init(): either a permuted layout
   or the identity (permutation[i] = i). */
131 UINT8 permutation[64];
132 //UINT8 invPermutation[64];
/* Builds zigzag_end[] from the current contents of zigzag_direct[].
   Walks the 64 scan positions while tracking the running maximum of
   zigzag_direct[], and stores that maximum plus one at each position.
   NOTE(review): the braces and the declaration of lastIndex are not
   visible in this listing; the description assumes both statements
   below the loop header belong to the loop body - confirm. */
134 static void build_zigzag_end()
137 int lastIndexAfterPerm=0;
138 for(lastIndex=0; lastIndex<64; lastIndex++)
/* keep the largest block index reached so far in scan order */
140 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
141 lastIndexAfterPerm= zigzag_direct[lastIndex];
142 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
/* C implementation installed into the get_pixels hook by dsputil_init().
   NOTE(review): the body is not visible in this listing. Judging by the
   signature and the comment below, it presumably reads UINT8 samples from
   pixels (rows line_size apart) into block - confirm against the full file. */
146 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
152 /* read the pixels */
/* C implementation installed into the put_pixels_clamped hook by
   dsputil_init().
   NOTE(review): the body is not visible in this listing. Presumably it
   stores block values into pixels after clamping them through cm - confirm. */
169 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
/* cm points MAX_NEG_CROP entries into cropTbl, so it can be indexed with
   values on either side of the 0..255 range */
174 UINT8 *cm = cropTbl + MAX_NEG_CROP;
176 /* read the pixels */
/* C implementation installed into the add_pixels_clamped hook by
   dsputil_init(). For eight samples per visible row body it adds p[k] to
   the existing pix[k] and writes back the value looked up in the clamp
   table.
   NOTE(review): the declarations of p and pix, the row loop and the use of
   line_size are not visible in this listing - presumably p walks block and
   pix walks pixels; confirm. */
193 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
/* cm points MAX_NEG_CROP entries into cropTbl, so sums below 0 or above
   255 still index inside the table */
198 UINT8 *cm = cropTbl + MAX_NEG_CROP;
200 /* add each value to the stored pixel and clamp through cm */
204 pix[0] = cm[pix[0] + p[0]];
205 pix[1] = cm[pix[1] + p[1]];
206 pix[2] = cm[pix[2] + p[2]];
207 pix[3] = cm[pix[3] + p[3]];
208 pix[4] = cm[pix[4] + p[4]];
209 pix[5] = cm[pix[5] + p[5]];
210 pix[6] = cm[pix[6] + p[6]];
211 pix[7] = cm[pix[7] + p[7]];
/* PIXOP(BTYPE, OPNAME, OP, INCR) expands to four static routines named
   OPNAME_pixels, OPNAME_pixels_x2, OPNAME_pixels_y2 and OPNAME_pixels_xy2,
   plus the table OPNAME_pixels_tab[4] of pointers to such routines.
   Each visible row body applies OP to eight destination samples p[0..7]:
     _x2  uses avg2 of horizontally adjacent source samples (reads pix[8]);
     _y2  uses avg2 of pix and pix1, where pix1 = pixels + line_size;
     _xy2 uses avg4 of the 2x2 neighbourhood (reads pix[8] and pix1[8]).
   avg2 and avg4 are whichever macros are defined where PIXOP is expanded.
   NOTE(review): the loop over h, the body of OPNAME_pixels, the first table
   entry and every use of INCR are not visible in this listing - presumably
   INCR is the destination stride; confirm against the full file. */
217 #define PIXOP(BTYPE, OPNAME, OP, INCR) \
219 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
240 static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
248 OP(p[0], avg2(pix[0], pix[1])); \
249 OP(p[1], avg2(pix[1], pix[2])); \
250 OP(p[2], avg2(pix[2], pix[3])); \
251 OP(p[3], avg2(pix[3], pix[4])); \
252 OP(p[4], avg2(pix[4], pix[5])); \
253 OP(p[5], avg2(pix[5], pix[6])); \
254 OP(p[6], avg2(pix[6], pix[7])); \
255 OP(p[7], avg2(pix[7], pix[8])); \
261 static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
269 pix1 = pixels + line_size; \
271 OP(p[0], avg2(pix[0], pix1[0])); \
272 OP(p[1], avg2(pix[1], pix1[1])); \
273 OP(p[2], avg2(pix[2], pix1[2])); \
274 OP(p[3], avg2(pix[3], pix1[3])); \
275 OP(p[4], avg2(pix[4], pix1[4])); \
276 OP(p[5], avg2(pix[5], pix1[5])); \
277 OP(p[6], avg2(pix[6], pix1[6])); \
278 OP(p[7], avg2(pix[7], pix1[7])); \
285 static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
293 pix1 = pixels + line_size; \
295 OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
296 OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
297 OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
298 OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
299 OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
300 OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
301 OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
302 OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
309 void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
311 OPNAME ## _pixels_x2, \
312 OPNAME ## _pixels_y2, \
313 OPNAME ## _pixels_xy2, \
317 /* rounding primitives */
/* avg2 and avg4 round to nearest, with halves rounding up, through the
   +1 and +2 biases. The arguments are not parenthesised in the expansion,
   so callers pass simple operands. */
318 #define avg2(a,b) ((a+b+1)>>1)
319 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* OP arguments for PIXOP: store b, average b with the destination,
   subtract b from the destination. */
321 #define op_put(a, b) a = b
322 #define op_avg(a, b) a = avg2(a, b)
323 #define op_sub(a, b) a -= b
/* Each instantiation produces the OPNAME_pixels routines and
   OPNAME_pixels_tab for OPNAME = put, avg and sub. */
325 PIXOP(UINT8, put, op_put, line_size)
326 PIXOP(UINT8, avg, op_avg, line_size)
328 PIXOP(DCTELEM, sub, op_sub, 8)
330 /* not rounding primitives */
/* Same expressions with smaller biases: avg2 truncates and avg4 adds 1
   instead of 2. Because op_avg expands avg2 at its point of use, the
   avg_no_rnd routines also average with the destination without rounding.
   NOTE(review): the undef lines that must precede these redefinitions are
   not visible in this listing. */
333 #define avg2(a,b) ((a+b)>>1)
334 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
336 PIXOP(UINT8, put_no_rnd, op_put, line_size)
337 PIXOP(UINT8, avg_no_rnd, op_avg, line_size)
339 /* motion estimation */
/* The rounding variants are restored here, so the pix_abs16x16 routines
   that follow use the rounding avg2 and avg4. */
343 #define avg2(a,b) ((a+b+1)>>1)
344 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Accumulates into s the absolute differences between pix1 and pix2 for
   columns 0..15. dsputil_init() installs this routine in the pix_abs16x16
   hook.
   NOTE(review): the loop over the h rows, the per-row pointer advance by
   line_size and the return statement are not visible in this listing -
   presumably the function returns s; confirm. */
346 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
352 s += abs(pix1[0] - pix2[0]);
353 s += abs(pix1[1] - pix2[1]);
354 s += abs(pix1[2] - pix2[2]);
355 s += abs(pix1[3] - pix2[3]);
356 s += abs(pix1[4] - pix2[4]);
357 s += abs(pix1[5] - pix2[5]);
358 s += abs(pix1[6] - pix2[6]);
359 s += abs(pix1[7] - pix2[7]);
360 s += abs(pix1[8] - pix2[8]);
361 s += abs(pix1[9] - pix2[9]);
362 s += abs(pix1[10] - pix2[10]);
363 s += abs(pix1[11] - pix2[11]);
364 s += abs(pix1[12] - pix2[12]);
365 s += abs(pix1[13] - pix2[13]);
366 s += abs(pix1[14] - pix2[14]);
367 s += abs(pix1[15] - pix2[15]);
/* Accumulates into s the absolute differences between pix1 and the
   rounded average of each pix2 sample with its right-hand neighbour, for
   columns 0..15. It reads pix2[0..16], one sample beyond the 16 compared.
   dsputil_init() installs this routine in the pix_abs16x16_x2 hook.
   NOTE(review): the loop over the h rows, the per-row pointer advance by
   line_size and the return statement are not visible in this listing -
   presumably the function returns s; confirm. */
374 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
380 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
381 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
382 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
383 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
384 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
385 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
386 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
387 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
388 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
389 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
390 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
391 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
392 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
393 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
394 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
395 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* Accumulates into s the absolute differences between pix1 and the
   rounded average of pix2 with the sample one line below it, for columns
   0..15. dsputil_init() installs this routine in the pix_abs16x16_y2 hook.
   NOTE(review): the loop over the h rows, the per-row pointer advance and
   the return statement are not visible in this listing - presumably the
   function returns s; confirm. */
402 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
/* pix3 addresses the row directly below pix2 */
405 UINT8 *pix3 = pix2 + line_size;
409 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
410 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
411 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
412 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
413 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
414 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
415 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
416 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
417 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
418 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
419 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
420 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
421 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
422 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
423 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
424 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* Accumulates into s the absolute differences between pix1 and the
   rounded average of the 2x2 neighbourhood of pix2 (the sample, its
   right-hand neighbour, and the two samples below them), for columns
   0..15. It reads pix2[0..16] and pix3[0..16]. dsputil_init() installs
   this routine in the pix_abs16x16_xy2 hook.
   NOTE(review): the loop over the h rows, the per-row pointer advance and
   the return statement are not visible in this listing - presumably the
   function returns s; confirm. */
432 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
/* pix3 addresses the row directly below pix2 */
435 UINT8 *pix3 = pix2 + line_size;
439 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
440 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
441 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
442 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
443 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
444 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
445 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
446 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
447 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
448 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
449 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
450 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
451 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
452 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
453 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
454 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
462 /* permute the block so that its layout matches what the MMX idct expects */
465 /* general permutation, but perhaps slightly slower */
/* Reorders the 64 entries of block in place.
   The first definition scatters block[i] to temp[block_permute_op(i)] and
   then copies temp back over block.
   NOTE(review): two definitions with the same signature are visible;
   presumably they are alternatives selected by a preprocessor conditional
   that is not visible in this listing. Of the second definition only its
   temporaries are visible. */
466 void block_permute(INT16 *block)
471 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
473 for(i=0; i<64; i++) block[i] = temp[i];
477 void block_permute(INT16 *block)
479 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
/* Initialises the lookup tables and function-pointer hooks of this file
   and, when a permuted IDCT is selected, permutes the scan tables and the
   default quantisation matrices to match.
   NOTE(review): preprocessor conditionals, braces, some loop headers and
   the end of the function are not visible in this listing, so which of the
   statements below are mutually exclusive cannot be told from here. */
500 void dsputil_init(void)
503 int use_permuted_idct;
/* clamp table: identity for 0..255, then 255 for the MAX_NEG_CROP entries
   above that range */
505 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
506 for(i=0;i<MAX_NEG_CROP;i++) {
508 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* table of squares, biased so that index 256 holds 0 */
512 squareTbl[i] = (i - 256) * (i - 256);
516 ff_idct = simple_idct;
/* install the C implementations defined in this file */
520 get_pixels = get_pixels_c;
521 put_pixels_clamped = put_pixels_clamped_c;
522 add_pixels_clamped = add_pixels_clamped_c;
524 pix_abs16x16 = pix_abs16x16_c;
525 pix_abs16x16_x2 = pix_abs16x16_x2_c;
526 pix_abs16x16_y2 = pix_abs16x16_y2_c;
527 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
528 av_fdct = jpeg_fdct_ifast;
530 use_permuted_idct = 1;
/* architecture-specific initialisers; presumably each is guarded by a
   conditional that is not visible here */
536 dsputil_init_armv4l();
540 use_permuted_idct = 0;
543 dsputil_init_alpha();
544 use_permuted_idct = 0;
/* no permutation when simple_idct is the installed IDCT */
548 if(ff_idct == simple_idct) use_permuted_idct=0;
/* fill permutation[]: a permuted layout (two candidate fills are visible,
   the choice between them is not) or the identity */
551 if(use_permuted_idct)
553 for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
555 for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
558 for(i=0; i<64; i++) permutation[i]=i;
/* capture the inverse and the plain copy of zigzag_direct before it is
   permuted below */
560 for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
561 for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
563 if (use_permuted_idct) {
564 /* permute for IDCT */
/* each scan table entry is replaced by its permuted block index */
566 j = zigzag_direct[i];
567 zigzag_direct[i] = block_permute_op(j);
568 j = ff_alternate_horizontal_scan[i];
569 ff_alternate_horizontal_scan[i] = block_permute_op(j);
570 j = ff_alternate_vertical_scan[i];
571 ff_alternate_vertical_scan[i] = block_permute_op(j);
/* the default matrices are reordered the same way */
573 block_permute(default_intra_matrix);
574 block_permute(default_non_intra_matrix);