4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 based upon some outcommented c code from mpeg2dec (idct_mmx.c
22 written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
26 #include "simple_idct.h"
31 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
32 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
33 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
34 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
35 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
36 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
37 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
41 #define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42 #define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
43 #define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44 #define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45 #define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46 #define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47 #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
49 #define COL_SHIFT 20 // 6
56 #if defined(ARCH_POWERPC_405)
58 /* signed 16x16 -> 32 multiply add accumulate */
59 #define MAC16(rt, ra, rb) \
60 asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
62 /* signed 16x16 -> 32 multiply */
63 #define MUL16(rt, ra, rb) \
64 asm ("mullhw %0, %1, %2" : "=r" (rt) : "r" (ra), "r" (rb));
68 /* signed 16x16 -> 32 multiply add accumulate */
69 #define MAC16(rt, ra, rb) rt += (ra) * (rb)
71 /* signed 16x16 -> 32 multiply */
72 #define MUL16(rt, ra, rb) rt = (ra) * (rb)
77 /* 0: all entries 0, 1: only first entry nonzero, 2: otherwise */
78 static inline int idctRowCondDC(int16_t *row)
80 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
81 uint64_t *lrow = (uint64_t *) row;
86 if ((lrow[0] & ~0xffffULL) == 0) {
88 #if 1 //is ok if |a0| < 1024 than theres an +-1 error (for the *W4 case for W4=16383 !!!)
92 a0 += 1 << (ROW_SHIFT - 1);
105 a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
164 row[0] = (a0 + b0) >> ROW_SHIFT;
165 row[1] = (a1 + b1) >> ROW_SHIFT;
166 row[2] = (a2 + b2) >> ROW_SHIFT;
167 row[3] = (a3 + b3) >> ROW_SHIFT;
168 row[4] = (a3 - b3) >> ROW_SHIFT;
169 row[5] = (a2 - b2) >> ROW_SHIFT;
170 row[6] = (a1 - b1) >> ROW_SHIFT;
171 row[7] = (a0 - b0) >> ROW_SHIFT;
176 inline static void idctSparseCol(int16_t *col)
178 int a0, a1, a2, a3, b0, b1, b2, b3;
180 col[0] += (1 << (COL_SHIFT - 1)) / W4;
182 a0 = W4 * col[8 * 0];
183 a1 = W4 * col[8 * 0];
184 a2 = W4 * col[8 * 0];
185 a3 = W4 * col[8 * 0];
188 a0 += W2 * col[8 * 2];
189 a1 += W6 * col[8 * 2];
190 a2 -= W6 * col[8 * 2];
191 a3 -= W2 * col[8 * 2];
195 a0 += W4 * col[8 * 4];
196 a1 -= W4 * col[8 * 4];
197 a2 -= W4 * col[8 * 4];
198 a3 += W4 * col[8 * 4];
202 a0 += W6 * col[8 * 6];
203 a1 -= W2 * col[8 * 6];
204 a2 += W2 * col[8 * 6];
205 a3 -= W6 * col[8 * 6];
209 b0 = W1 * col[8 * 1];
210 b1 = W3 * col[8 * 1];
211 b2 = W5 * col[8 * 1];
212 b3 = W7 * col[8 * 1];
214 b0 = b1 = b2 = b3 = 0;
218 b0 += W3 * col[8 * 3];
219 b1 -= W7 * col[8 * 3];
220 b2 -= W1 * col[8 * 3];
221 b3 -= W5 * col[8 * 3];
225 b0 += W5 * col[8 * 5];
226 b1 -= W1 * col[8 * 5];
227 b2 += W7 * col[8 * 5];
228 b3 += W3 * col[8 * 5];
232 b0 += W7 * col[8 * 7];
233 b1 -= W5 * col[8 * 7];
234 b2 += W3 * col[8 * 7];
235 b3 -= W1 * col[8 * 7];
238 col[8 * 0] = (a0 + b0) >> COL_SHIFT;
239 col[8 * 7] = (a0 - b0) >> COL_SHIFT;
240 col[8 * 1] = (a1 + b1) >> COL_SHIFT;
241 col[8 * 6] = (a1 - b1) >> COL_SHIFT;
242 col[8 * 2] = (a2 + b2) >> COL_SHIFT;
243 col[8 * 5] = (a2 - b2) >> COL_SHIFT;
244 col[8 * 3] = (a3 + b3) >> COL_SHIFT;
245 col[8 * 4] = (a3 - b3) >> COL_SHIFT;
248 #else /* not ARCH_ALPHA */
250 static inline void idctRowCondDC (int16_t * row)
252 int a0, a1, a2, a3, b0, b1, b2, b3;
260 #ifdef WORDS_BIGENDIAN
261 #define ROW0_MASK 0xffff000000000000LL
263 #define ROW0_MASK 0xffffLL
265 if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) |
266 ((uint64_t *)row)[1]) == 0) {
267 temp = (row[0] << 3) & 0xffff;
270 ((uint64_t *)row)[0] = temp;
271 ((uint64_t *)row)[1] = temp;
275 if (!(((uint32_t*)row)[1] |
276 ((uint32_t*)row)[2] |
277 ((uint32_t*)row)[3] |
279 temp = (row[0] << 3) & 0xffff;
281 ((uint32_t*)row)[0]=((uint32_t*)row)[1] =
282 ((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp;
287 a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
292 /* no need to optimize : gcc does it */
298 MUL16(b0, W1, row[1]);
299 MAC16(b0, W3, row[3]);
300 MUL16(b1, W3, row[1]);
301 MAC16(b1, -W7, row[3]);
302 MUL16(b2, W5, row[1]);
303 MAC16(b2, -W1, row[3]);
304 MUL16(b3, W7, row[1]);
305 MAC16(b3, -W5, row[3]);
308 temp = ((uint64_t*)row)[1];
310 temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
313 a0 += W4*row[4] + W6*row[6];
314 a1 += - W4*row[4] - W2*row[6];
315 a2 += - W4*row[4] + W2*row[6];
316 a3 += W4*row[4] - W6*row[6];
318 MAC16(b0, W5, row[5]);
319 MAC16(b0, W7, row[7]);
321 MAC16(b1, -W1, row[5]);
322 MAC16(b1, -W5, row[7]);
324 MAC16(b2, W7, row[5]);
325 MAC16(b2, W3, row[7]);
327 MAC16(b3, W3, row[5]);
328 MAC16(b3, -W1, row[7]);
331 row[0] = (a0 + b0) >> ROW_SHIFT;
332 row[7] = (a0 - b0) >> ROW_SHIFT;
333 row[1] = (a1 + b1) >> ROW_SHIFT;
334 row[6] = (a1 - b1) >> ROW_SHIFT;
335 row[2] = (a2 + b2) >> ROW_SHIFT;
336 row[5] = (a2 - b2) >> ROW_SHIFT;
337 row[3] = (a3 + b3) >> ROW_SHIFT;
338 row[4] = (a3 - b3) >> ROW_SHIFT;
340 #endif /* not ARCH_ALPHA */
342 static inline void idctSparseColPut (UINT8 *dest, int line_size,
345 int a0, a1, a2, a3, b0, b1, b2, b3;
346 UINT8 *cm = cropTbl + MAX_NEG_CROP;
348 /* XXX: I did that only to give same values as previous code */
349 a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
359 MUL16(b0, W1, col[8*1]);
360 MUL16(b1, W3, col[8*1]);
361 MUL16(b2, W5, col[8*1]);
362 MUL16(b3, W7, col[8*1]);
364 MAC16(b0, + W3, col[8*3]);
365 MAC16(b1, - W7, col[8*3]);
366 MAC16(b2, - W1, col[8*3]);
367 MAC16(b3, - W5, col[8*3]);
377 MAC16(b0, + W5, col[8*5]);
378 MAC16(b1, - W1, col[8*5]);
379 MAC16(b2, + W7, col[8*5]);
380 MAC16(b3, + W3, col[8*5]);
391 MAC16(b0, + W7, col[8*7]);
392 MAC16(b1, - W5, col[8*7]);
393 MAC16(b2, + W3, col[8*7]);
394 MAC16(b3, - W1, col[8*7]);
397 dest[0] = cm[(a0 + b0) >> COL_SHIFT];
399 dest[0] = cm[(a1 + b1) >> COL_SHIFT];
401 dest[0] = cm[(a2 + b2) >> COL_SHIFT];
403 dest[0] = cm[(a3 + b3) >> COL_SHIFT];
405 dest[0] = cm[(a3 - b3) >> COL_SHIFT];
407 dest[0] = cm[(a2 - b2) >> COL_SHIFT];
409 dest[0] = cm[(a1 - b1) >> COL_SHIFT];
411 dest[0] = cm[(a0 - b0) >> COL_SHIFT];
414 static inline void idctSparseColAdd (UINT8 *dest, int line_size,
417 int a0, a1, a2, a3, b0, b1, b2, b3;
418 UINT8 *cm = cropTbl + MAX_NEG_CROP;
420 /* XXX: I did that only to give same values as previous code */
421 a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
431 MUL16(b0, W1, col[8*1]);
432 MUL16(b1, W3, col[8*1]);
433 MUL16(b2, W5, col[8*1]);
434 MUL16(b3, W7, col[8*1]);
436 MAC16(b0, + W3, col[8*3]);
437 MAC16(b1, - W7, col[8*3]);
438 MAC16(b2, - W1, col[8*3]);
439 MAC16(b3, - W5, col[8*3]);
449 MAC16(b0, + W5, col[8*5]);
450 MAC16(b1, - W1, col[8*5]);
451 MAC16(b2, + W7, col[8*5]);
452 MAC16(b3, + W3, col[8*5]);
463 MAC16(b0, + W7, col[8*7]);
464 MAC16(b1, - W5, col[8*7]);
465 MAC16(b2, + W3, col[8*7]);
466 MAC16(b3, - W1, col[8*7]);
469 dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)];
471 dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)];
473 dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)];
475 dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)];
477 dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)];
479 dest[0] = cm[dest[0] + ((a2 - b2) >> COL_SHIFT)];
481 dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)];
483 dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)];
486 static inline void idctSparseCol (int16_t * col)
488 int a0, a1, a2, a3, b0, b1, b2, b3;
489 UINT8 *cm = cropTbl + MAX_NEG_CROP;
491 /* XXX: I did that only to give same values as previous code */
492 a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
502 MUL16(b0, W1, col[8*1]);
503 MUL16(b1, W3, col[8*1]);
504 MUL16(b2, W5, col[8*1]);
505 MUL16(b3, W7, col[8*1]);
507 MAC16(b0, + W3, col[8*3]);
508 MAC16(b1, - W7, col[8*3]);
509 MAC16(b2, - W1, col[8*3]);
510 MAC16(b3, - W5, col[8*3]);
520 MAC16(b0, + W5, col[8*5]);
521 MAC16(b1, - W1, col[8*5]);
522 MAC16(b2, + W7, col[8*5]);
523 MAC16(b3, + W3, col[8*5]);
534 MAC16(b0, + W7, col[8*7]);
535 MAC16(b1, - W5, col[8*7]);
536 MAC16(b2, + W3, col[8*7]);
537 MAC16(b3, - W1, col[8*7]);
540 col[0 ] = ((a0 + b0) >> COL_SHIFT);
541 col[8 ] = ((a1 + b1) >> COL_SHIFT);
542 col[16] = ((a2 + b2) >> COL_SHIFT);
543 col[24] = ((a3 + b3) >> COL_SHIFT);
544 col[32] = ((a3 - b3) >> COL_SHIFT);
545 col[40] = ((a2 - b2) >> COL_SHIFT);
546 col[48] = ((a1 - b1) >> COL_SHIFT);
547 col[56] = ((a0 - b0) >> COL_SHIFT);
552 /* If all rows but the first one are zero after row transformation,
553 all rows will be identical after column transformation. */
554 static inline void idctCol2(int16_t *col)
558 uint64_t *lcol = (uint64_t *) col;
560 for (i = 0; i < 8; ++i) {
561 int a0 = col[0] + (1 << (COL_SHIFT - 1)) / W4;
564 col[0] = a0 >> COL_SHIFT;
570 lcol[ 2] = l; lcol[ 3] = r;
571 lcol[ 4] = l; lcol[ 5] = r;
572 lcol[ 6] = l; lcol[ 7] = r;
573 lcol[ 8] = l; lcol[ 9] = r;
574 lcol[10] = l; lcol[11] = r;
575 lcol[12] = l; lcol[13] = r;
576 lcol[14] = l; lcol[15] = r;
579 void simple_idct (short *block)
583 int rowsZero = 1; /* all rows except row 0 zero */
584 int rowsConstant = 1; /* all rows consist of a constant value */
586 for (i = 0; i < 8; i++) {
587 int sparseness = idctRowCondDC(block + 8 * i);
589 if (i > 0 && sparseness > 0)
597 } else if (rowsConstant) {
598 uint64_t *lblock = (uint64_t *) block;
600 idctSparseCol(block);
601 for (i = 0; i < 8; i++) {
602 uint64_t v = (uint16_t) block[i * 8];
611 for (i = 0; i < 8; i++)
612 idctSparseCol(block + i);
616 /* XXX: suppress this mess */
617 void simple_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
620 put_pixels_clamped(block, dest, line_size);
623 void simple_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
626 add_pixels_clamped(block, dest, line_size);
631 void simple_idct_put(UINT8 *dest, int line_size, INT16 *block)
635 idctRowCondDC(block + i*8);
638 idctSparseColPut(dest + i, line_size, block + i);
641 void simple_idct_add(UINT8 *dest, int line_size, INT16 *block)
645 idctRowCondDC(block + i*8);
648 idctSparseColAdd(dest + i, line_size, block + i);
651 void simple_idct(INT16 *block)
655 idctRowCondDC(block + i*8);
658 idctSparseCol(block + i);