4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 based upon some outcommented c code from mpeg2dec (idct_mmx.c
22 written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
26 #include "simple_idct.h"
31 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
32 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
33 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
34 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
35 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
36 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
37 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
41 #define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42 #define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
43 #define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44 #define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45 #define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46 #define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47 #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
49 #define COL_SHIFT 20 // 6
56 #if defined(ARCH_POWERPC_405)
58 /* signed 16x16 -> 32 multiply add accumulate */
59 #define MAC16(rt, ra, rb) \
60 asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
62 /* signed 16x16 -> 32 multiply */
63 #define MUL16(rt, ra, rb) \
64 asm ("mullhw %0, %1, %2" : "=r" (rt) : "r" (ra), "r" (rb));
68 /* signed 16x16 -> 32 multiply add accumulate */
69 #define MAC16(rt, ra, rb) rt += (ra) * (rb)
71 /* signed 16x16 -> 32 multiply */
72 #define MUL16(rt, ra, rb) rt = (ra) * (rb)
77 /* 0: all entries 0, 1: only first entry nonzero, 2: otherwise */
78 static inline int idctRowCondDC(int16_t *row)
80 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
81 uint64_t *lrow = (uint64_t *) row;
86 if ((lrow[0] & ~0xffffULL) == 0) {
88 #if 1 //is ok if |a0| < 1024 than theres an +-1 error (for the *W4 case for W4=16383 !!!)
92 a0 += 1 << (ROW_SHIFT - 1);
105 a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
164 row[0] = (a0 + b0) >> ROW_SHIFT;
165 row[1] = (a1 + b1) >> ROW_SHIFT;
166 row[2] = (a2 + b2) >> ROW_SHIFT;
167 row[3] = (a3 + b3) >> ROW_SHIFT;
168 row[4] = (a3 - b3) >> ROW_SHIFT;
169 row[5] = (a2 - b2) >> ROW_SHIFT;
170 row[6] = (a1 - b1) >> ROW_SHIFT;
171 row[7] = (a0 - b0) >> ROW_SHIFT;
176 inline static void idctSparseCol2(int16_t *col)
178 int a0, a1, a2, a3, b0, b1, b2, b3;
180 col[0] += (1 << (COL_SHIFT - 1)) / W4;
182 a0 = W4 * col[8 * 0];
183 a1 = W4 * col[8 * 0];
184 a2 = W4 * col[8 * 0];
185 a3 = W4 * col[8 * 0];
188 a0 += W2 * col[8 * 2];
189 a1 += W6 * col[8 * 2];
190 a2 -= W6 * col[8 * 2];
191 a3 -= W2 * col[8 * 2];
195 a0 += W4 * col[8 * 4];
196 a1 -= W4 * col[8 * 4];
197 a2 -= W4 * col[8 * 4];
198 a3 += W4 * col[8 * 4];
202 a0 += W6 * col[8 * 6];
203 a1 -= W2 * col[8 * 6];
204 a2 += W2 * col[8 * 6];
205 a3 -= W6 * col[8 * 6];
209 b0 = W1 * col[8 * 1];
210 b1 = W3 * col[8 * 1];
211 b2 = W5 * col[8 * 1];
212 b3 = W7 * col[8 * 1];
214 b0 = b1 = b2 = b3 = 0;
218 b0 += W3 * col[8 * 3];
219 b1 -= W7 * col[8 * 3];
220 b2 -= W1 * col[8 * 3];
221 b3 -= W5 * col[8 * 3];
225 b0 += W5 * col[8 * 5];
226 b1 -= W1 * col[8 * 5];
227 b2 += W7 * col[8 * 5];
228 b3 += W3 * col[8 * 5];
232 b0 += W7 * col[8 * 7];
233 b1 -= W5 * col[8 * 7];
234 b2 += W3 * col[8 * 7];
235 b3 -= W1 * col[8 * 7];
238 col[8 * 0] = (a0 + b0) >> COL_SHIFT;
239 col[8 * 7] = (a0 - b0) >> COL_SHIFT;
240 col[8 * 1] = (a1 + b1) >> COL_SHIFT;
241 col[8 * 6] = (a1 - b1) >> COL_SHIFT;
242 col[8 * 2] = (a2 + b2) >> COL_SHIFT;
243 col[8 * 5] = (a2 - b2) >> COL_SHIFT;
244 col[8 * 3] = (a3 + b3) >> COL_SHIFT;
245 col[8 * 4] = (a3 - b3) >> COL_SHIFT;
248 #else /* not ARCH_ALPHA */
250 static inline void idctRowCondDC (int16_t * row)
252 int a0, a1, a2, a3, b0, b1, b2, b3;
260 #ifdef WORDS_BIGENDIAN
261 #define ROW0_MASK 0xffff000000000000LL
263 #define ROW0_MASK 0xffffLL
265 if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) |
266 ((uint64_t *)row)[1]) == 0) {
267 temp = (row[0] << 3) & 0xffff;
270 ((uint64_t *)row)[0] = temp;
271 ((uint64_t *)row)[1] = temp;
275 if (!(((uint32_t*)row)[1] |
276 ((uint32_t*)row)[2] |
277 ((uint32_t*)row)[3] |
279 temp = (row[0] << 3) & 0xffff;
281 ((uint32_t*)row)[0]=((uint32_t*)row)[1] =
282 ((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp;
287 a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
292 /* no need to optimize : gcc does it */
298 MUL16(b0, W1, row[1]);
299 MAC16(b0, W3, row[3]);
300 MUL16(b1, W3, row[1]);
301 MAC16(b1, -W7, row[3]);
302 MUL16(b2, W5, row[1]);
303 MAC16(b2, -W1, row[3]);
304 MUL16(b3, W7, row[1]);
305 MAC16(b3, -W5, row[3]);
308 temp = ((uint64_t*)row)[1];
310 temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
313 a0 += W4*row[4] + W6*row[6];
314 a1 += - W4*row[4] - W2*row[6];
315 a2 += - W4*row[4] + W2*row[6];
316 a3 += W4*row[4] - W6*row[6];
318 MAC16(b0, W5, row[5]);
319 MAC16(b0, W7, row[7]);
321 MAC16(b1, -W1, row[5]);
322 MAC16(b1, -W5, row[7]);
324 MAC16(b2, W7, row[5]);
325 MAC16(b2, W3, row[7]);
327 MAC16(b3, W3, row[5]);
328 MAC16(b3, -W1, row[7]);
331 row[0] = (a0 + b0) >> ROW_SHIFT;
332 row[7] = (a0 - b0) >> ROW_SHIFT;
333 row[1] = (a1 + b1) >> ROW_SHIFT;
334 row[6] = (a1 - b1) >> ROW_SHIFT;
335 row[2] = (a2 + b2) >> ROW_SHIFT;
336 row[5] = (a2 - b2) >> ROW_SHIFT;
337 row[3] = (a3 + b3) >> ROW_SHIFT;
338 row[4] = (a3 - b3) >> ROW_SHIFT;
340 #endif /* not ARCH_ALPHA */
342 static inline void idctSparseColPut (UINT8 *dest, int line_size,
345 int a0, a1, a2, a3, b0, b1, b2, b3;
346 UINT8 *cm = cropTbl + MAX_NEG_CROP;
348 /* XXX: I did that only to give same values as previous code */
349 a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
359 MUL16(b0, W1, col[8*1]);
360 MUL16(b1, W3, col[8*1]);
361 MUL16(b2, W5, col[8*1]);
362 MUL16(b3, W7, col[8*1]);
364 MAC16(b0, + W3, col[8*3]);
365 MAC16(b1, - W7, col[8*3]);
366 MAC16(b2, - W1, col[8*3]);
367 MAC16(b3, - W5, col[8*3]);
377 MAC16(b0, + W5, col[8*5]);
378 MAC16(b1, - W1, col[8*5]);
379 MAC16(b2, + W7, col[8*5]);
380 MAC16(b3, + W3, col[8*5]);
391 MAC16(b0, + W7, col[8*7]);
392 MAC16(b1, - W5, col[8*7]);
393 MAC16(b2, + W3, col[8*7]);
394 MAC16(b3, - W1, col[8*7]);
397 dest[0] = cm[(a0 + b0) >> COL_SHIFT];
399 dest[0] = cm[(a1 + b1) >> COL_SHIFT];
401 dest[0] = cm[(a2 + b2) >> COL_SHIFT];
403 dest[0] = cm[(a3 + b3) >> COL_SHIFT];
405 dest[0] = cm[(a3 - b3) >> COL_SHIFT];
407 dest[0] = cm[(a2 - b2) >> COL_SHIFT];
409 dest[0] = cm[(a1 - b1) >> COL_SHIFT];
411 dest[0] = cm[(a0 - b0) >> COL_SHIFT];
414 static inline void idctSparseColAdd (UINT8 *dest, int line_size,
417 int a0, a1, a2, a3, b0, b1, b2, b3;
418 UINT8 *cm = cropTbl + MAX_NEG_CROP;
420 /* XXX: I did that only to give same values as previous code */
421 a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
431 MUL16(b0, W1, col[8*1]);
432 MUL16(b1, W3, col[8*1]);
433 MUL16(b2, W5, col[8*1]);
434 MUL16(b3, W7, col[8*1]);
436 MAC16(b0, + W3, col[8*3]);
437 MAC16(b1, - W7, col[8*3]);
438 MAC16(b2, - W1, col[8*3]);
439 MAC16(b3, - W5, col[8*3]);
449 MAC16(b0, + W5, col[8*5]);
450 MAC16(b1, - W1, col[8*5]);
451 MAC16(b2, + W7, col[8*5]);
452 MAC16(b3, + W3, col[8*5]);
463 MAC16(b0, + W7, col[8*7]);
464 MAC16(b1, - W5, col[8*7]);
465 MAC16(b2, + W3, col[8*7]);
466 MAC16(b3, - W1, col[8*7]);
469 dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)];
471 dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)];
473 dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)];
475 dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)];
477 dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)];
479 dest[0] = cm[dest[0] + ((a2 - b2) >> COL_SHIFT)];
481 dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)];
483 dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)];
486 static inline void idctSparseCol (int16_t * col)
488 int a0, a1, a2, a3, b0, b1, b2, b3;
490 /* XXX: I did that only to give same values as previous code */
491 a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
501 MUL16(b0, W1, col[8*1]);
502 MUL16(b1, W3, col[8*1]);
503 MUL16(b2, W5, col[8*1]);
504 MUL16(b3, W7, col[8*1]);
506 MAC16(b0, + W3, col[8*3]);
507 MAC16(b1, - W7, col[8*3]);
508 MAC16(b2, - W1, col[8*3]);
509 MAC16(b3, - W5, col[8*3]);
519 MAC16(b0, + W5, col[8*5]);
520 MAC16(b1, - W1, col[8*5]);
521 MAC16(b2, + W7, col[8*5]);
522 MAC16(b3, + W3, col[8*5]);
533 MAC16(b0, + W7, col[8*7]);
534 MAC16(b1, - W5, col[8*7]);
535 MAC16(b2, + W3, col[8*7]);
536 MAC16(b3, - W1, col[8*7]);
539 col[0 ] = ((a0 + b0) >> COL_SHIFT);
540 col[8 ] = ((a1 + b1) >> COL_SHIFT);
541 col[16] = ((a2 + b2) >> COL_SHIFT);
542 col[24] = ((a3 + b3) >> COL_SHIFT);
543 col[32] = ((a3 - b3) >> COL_SHIFT);
544 col[40] = ((a2 - b2) >> COL_SHIFT);
545 col[48] = ((a1 - b1) >> COL_SHIFT);
546 col[56] = ((a0 - b0) >> COL_SHIFT);
551 /* If all rows but the first one are zero after row transformation,
552 all rows will be identical after column transformation. */
553 static inline void idctCol2(int16_t *col)
557 uint64_t *lcol = (uint64_t *) col;
559 for (i = 0; i < 8; ++i) {
560 int a0 = col[0] + (1 << (COL_SHIFT - 1)) / W4;
563 col[0] = a0 >> COL_SHIFT;
569 lcol[ 2] = l; lcol[ 3] = r;
570 lcol[ 4] = l; lcol[ 5] = r;
571 lcol[ 6] = l; lcol[ 7] = r;
572 lcol[ 8] = l; lcol[ 9] = r;
573 lcol[10] = l; lcol[11] = r;
574 lcol[12] = l; lcol[13] = r;
575 lcol[14] = l; lcol[15] = r;
578 void simple_idct (short *block)
582 int rowsZero = 1; /* all rows except row 0 zero */
583 int rowsConstant = 1; /* all rows consist of a constant value */
585 for (i = 0; i < 8; i++) {
586 int sparseness = idctRowCondDC(block + 8 * i);
588 if (i > 0 && sparseness > 0)
596 } else if (rowsConstant) {
597 uint64_t *lblock = (uint64_t *) block;
599 idctSparseCol2(block);
600 for (i = 0; i < 8; i++) {
601 uint64_t v = (uint16_t) block[i * 8];
610 for (i = 0; i < 8; i++)
611 idctSparseCol2(block + i);
615 /* XXX: suppress this mess */
616 void simple_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
619 put_pixels_clamped(block, dest, line_size);
622 void simple_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
625 add_pixels_clamped(block, dest, line_size);
630 void simple_idct_put(UINT8 *dest, int line_size, INT16 *block)
634 idctRowCondDC(block + i*8);
637 idctSparseColPut(dest + i, line_size, block + i);
640 void simple_idct_add(UINT8 *dest, int line_size, INT16 *block)
644 idctRowCondDC(block + i*8);
647 idctSparseColAdd(dest + i, line_size, block + i);
650 void simple_idct(INT16 *block)
654 idctRowCondDC(block + i*8);
657 idctSparseCol(block + i);
665 #define C_FIX(x) ((int)((x) * (1 << CN_SHIFT) + 0.5))
666 #define C1 C_FIX(0.6532814824)
667 #define C2 C_FIX(0.2705980501)
669 /* row idct is multiple by 16 * sqrt(2.0), col idct4 is normalized,
670 and the butterfly must be multiplied by 0.5 * sqrt(2.0) */
671 #define C_SHIFT (4+1+12)
673 static inline void idct4col(UINT8 *dest, int line_size, const INT16 *col)
675 int c0, c1, c2, c3, a0, a1, a2, a3;
676 const UINT8 *cm = cropTbl + MAX_NEG_CROP;
682 c0 = ((a0 + a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
683 c2 = ((a0 - a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
684 c1 = a1 * C1 + a3 * C2;
685 c3 = a1 * C2 - a3 * C1;
686 dest[0] = cm[(c0 + c1) >> C_SHIFT];
688 dest[0] = cm[(c2 + c3) >> C_SHIFT];
690 dest[0] = cm[(c2 - c3) >> C_SHIFT];
692 dest[0] = cm[(c0 - c1) >> C_SHIFT];
701 ptr[8 + k] = a0 - a1;\
704 /* only used by DV codec. The input must be interlaced. 128 is added
705 to the pixels before clamping to avoid systematic error
706 (1024*sqrt(2)) offset would be needed otherwise. */
707 /* XXX: I think a 1.0/sqrt(2) normalization should be needed to
708 compensate the extra butterfly stage - I don't have the full DV
710 void simple_idct248_put(UINT8 *dest, int line_size, INT16 *block)
729 /* IDCT8 on each line */
731 idctRowCondDC(block + i*8);
734 /* IDCT4 and store */
736 idct4col(dest + i, 2 * line_size, block + i);
737 idct4col(dest + line_size + i, 2 * line_size, block + 8 + i);