1 /*****************************************************************************
2 * vdec_idct.c : IDCT functions
4 *****************************************************************************/
6 /*****************************************************************************
8 *****************************************************************************/
20 #include "vlc_thread.h"
23 #include "debug.h" /* XXX?? temporaire, requis par netlist.h */
26 #include "input_netlist.h"
27 #include "decoder_fifo.h"
29 #include "video_output.h"
31 #include "vdec_idct.h"
32 #include "video_decoder.h"
33 #include "vdec_motion.h"
35 #include "vpar_blocks.h"
36 #include "vpar_headers.h"
37 #include "vpar_synchro.h"
38 #include "video_parser.h"
39 #include "video_fifo.h"
45 /* Our current implementation is a fast DCT, we might move to a fast DFT or
46 * an MMX DCT in the future. */
48 /*****************************************************************************
49 * vdec_InitIDCT : initialize datas for vdec_SparceIDCT
50 * vdec_SparseIDCT : IDCT function for sparse matrices
51 *****************************************************************************/
53 void vdec_InitIDCT (vdec_thread_t * p_vdec)
57 dctelem_t * p_pre = p_vdec->p_pre_idct;
58 memset( p_pre, 0, 64*64*sizeof(dctelem_t) );
60 for( i=0 ; i < 64 ; i++ )
62 p_pre[i*64+i] = 1 << SPARSE_SCALE_FACTOR;
63 vdec_IDCT( p_vdec, &p_pre[i*64], 0) ;
68 void vdec_SparseIDCT (vdec_thread_t * p_vdec, dctelem_t * p_block,
78 /* If DC Coefficient. */
79 if ( i_sparse_pos == 0 )
82 val=RIGHT_SHIFT((*p_block + 4), 3);
83 /* Compute int to assign. This speeds things up a bit */
84 v = ((val & 0xffff) | (val << 16));
85 dp[0] = v; dp[1] = v; dp[2] = v; dp[3] = v;
86 dp[4] = v; dp[5] = v; dp[6] = v; dp[7] = v;
87 dp[8] = v; dp[9] = v; dp[10] = v; dp[11] = v;
88 dp[12] = v; dp[13] = v; dp[14] = v; dp[15] = v;
89 dp[16] = v; dp[17] = v; dp[18] = v; dp[19] = v;
90 dp[20] = v; dp[21] = v; dp[22] = v; dp[23] = v;
91 dp[24] = v; dp[25] = v; dp[26] = v; dp[27] = v;
92 dp[28] = v; dp[29] = v; dp[30] = v; dp[31] = v;
95 /* Some other coefficient. */
96 p_dest = (s16*)p_block;
97 p_source = (s16*)&p_vdec->p_pre_idct[i_sparse_pos];
98 coeff = (int)p_dest[i_sparse_pos];
99 for( rr=0 ; rr < 4 ; rr++ )
101 p_dest[0] = (p_source[0] * coeff) >> SPARSE_SCALE_FACTOR;
102 p_dest[1] = (p_source[1] * coeff) >> SPARSE_SCALE_FACTOR;
103 p_dest[2] = (p_source[2] * coeff) >> SPARSE_SCALE_FACTOR;
104 p_dest[3] = (p_source[3] * coeff) >> SPARSE_SCALE_FACTOR;
105 p_dest[4] = (p_source[4] * coeff) >> SPARSE_SCALE_FACTOR;
106 p_dest[5] = (p_source[5] * coeff) >> SPARSE_SCALE_FACTOR;
107 p_dest[6] = (p_source[6] * coeff) >> SPARSE_SCALE_FACTOR;
108 p_dest[7] = (p_source[7] * coeff) >> SPARSE_SCALE_FACTOR;
109 p_dest[8] = (p_source[8] * coeff) >> SPARSE_SCALE_FACTOR;
110 p_dest[9] = (p_source[9] * coeff) >> SPARSE_SCALE_FACTOR;
111 p_dest[10] = (p_source[10] * coeff) >> SPARSE_SCALE_FACTOR;
112 p_dest[11] = (p_source[11] * coeff) >> SPARSE_SCALE_FACTOR;
113 p_dest[12] = (p_source[12] * coeff) >> SPARSE_SCALE_FACTOR;
114 p_dest[13] = (p_source[13] * coeff) >> SPARSE_SCALE_FACTOR;
115 p_dest[14] = (p_source[14] * coeff) >> SPARSE_SCALE_FACTOR;
116 p_dest[15] = (p_source[15] * coeff) >> SPARSE_SCALE_FACTOR;
124 /*****************************************************************************
125 * vdec_IDCT : IDCT function for normal matrices
126 *****************************************************************************/
129 void vdec_IDCT( vdec_thread_t * p_vdec, dctelem_t * p_block, int i_idontcare )
132 /* dct classique: pour tester la meilleure entre la classique et la */
134 s32 tmp0, tmp1, tmp2, tmp3;
135 s32 tmp10, tmp11, tmp12, tmp13;
136 s32 z1, z2, z3, z4, z5;
141 /* Pass 1: process rows. */
142 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
143 /* furthermore, we scale the results by 2**PASS1_BITS. */
146 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
148 /* Due to quantization, we will usually find that many of the input
149 * coefficients are zero, especially the AC terms. We can exploit this
150 * by short-circuiting the IDCT calculation for any row in which all
151 * the AC terms are zero. In that case each output is equal to the
152 * DC coefficient (with scale factor as needed).
153 * With typical images and quantization tables, half or more of the
154 * row DCT calculations can be simplified this way.
157 if ((dataptr[1] | dataptr[2] | dataptr[3] | dataptr[4] |
158 dataptr[5] | dataptr[6] | dataptr[7]) == 0)
160 /* AC terms all zero */
161 dctelem_t dcval = (dctelem_t) (dataptr[0] << PASS1_BITS);
172 dataptr += DCTSIZE; /* advance pointer to next row */
176 /* Even part: reverse the even part of the forward DCT. */
177 /* The rotator is sqrt(2)*c(-6). */
179 z2 = (s32) dataptr[2];
180 z3 = (s32) dataptr[6];
182 z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
183 tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
184 tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
186 tmp0 = ((s32) dataptr[0] + (s32) dataptr[4]) << CONST_BITS;
187 tmp1 = ((s32) dataptr[0] - (s32) dataptr[4]) << CONST_BITS;
194 /* Odd part per figure 8; the matrix is unitary and hence its
195 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
198 tmp0 = (s32) dataptr[7];
199 tmp1 = (s32) dataptr[5];
200 tmp2 = (s32) dataptr[3];
201 tmp3 = (s32) dataptr[1];
207 z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
209 tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
210 tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
211 tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
212 tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
213 z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
214 z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
215 z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
216 z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
226 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
228 dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
229 dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
230 dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
231 dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
232 dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
233 dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
234 dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
235 dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
237 dataptr += DCTSIZE; /* advance pointer to next row */
240 /* Pass 2: process columns. */
241 /* Note that we must descale the results by a factor of 8 == 2**3, */
242 /* and also undo the PASS1_BITS scaling. */
245 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
247 /* Columns of zeroes can be exploited in the same way as we did with rows.
248 * However, the row calculation has created many nonzero AC terms, so the
249 * simplification applies less often (typically 5% to 10% of the time).
250 * On machines with very fast multiplication, it's possible that the
251 * test takes more time than it's worth. In that case this section
252 * may be commented out.
255 #ifndef NO_ZERO_COLUMN_TEST /*ajoute un test mais evite des calculs */
256 if ((dataptr[DCTSIZE*1] | dataptr[DCTSIZE*2] | dataptr[DCTSIZE*3] |
257 dataptr[DCTSIZE*4] | dataptr[DCTSIZE*5] | dataptr[DCTSIZE*6] |
258 dataptr[DCTSIZE*7]) == 0)
260 /* AC terms all zero */
261 dctelem_t dcval = (dctelem_t) DESCALE((s32) dataptr[0], PASS1_BITS+3);
263 dataptr[DCTSIZE*0] = dcval;
264 dataptr[DCTSIZE*1] = dcval;
265 dataptr[DCTSIZE*2] = dcval;
266 dataptr[DCTSIZE*3] = dcval;
267 dataptr[DCTSIZE*4] = dcval;
268 dataptr[DCTSIZE*5] = dcval;
269 dataptr[DCTSIZE*6] = dcval;
270 dataptr[DCTSIZE*7] = dcval;
272 dataptr++; /* advance pointer to next column */
277 /* Even part: reverse the even part of the forward DCT. */
278 /* The rotator is sqrt(2)*c(-6). */
280 z2 = (s32) dataptr[DCTSIZE*2];
281 z3 = (s32) dataptr[DCTSIZE*6];
283 z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
284 tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
285 tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
287 tmp0 = ((s32) dataptr[DCTSIZE*0] + (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
288 tmp1 = ((s32) dataptr[DCTSIZE*0] - (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
295 /* Odd part per figure 8; the matrix is unitary and hence its
296 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
299 tmp0 = (s32) dataptr[DCTSIZE*7];
300 tmp1 = (s32) dataptr[DCTSIZE*5];
301 tmp2 = (s32) dataptr[DCTSIZE*3];
302 tmp3 = (s32) dataptr[DCTSIZE*1];
308 z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
310 tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
311 tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
312 tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
313 tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
314 z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
315 z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
316 z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
317 z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
327 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
329 dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
330 CONST_BITS+PASS1_BITS+3);
331 dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
332 CONST_BITS+PASS1_BITS+3);
333 dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
334 CONST_BITS+PASS1_BITS+3);
335 dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
336 CONST_BITS+PASS1_BITS+3);
337 dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
338 CONST_BITS+PASS1_BITS+3);
339 dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
340 CONST_BITS+PASS1_BITS+3);
341 dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
342 CONST_BITS+PASS1_BITS+3);
343 dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
344 CONST_BITS+PASS1_BITS+3);
346 dataptr++; /* advance pointer to next column */
350 #if 1 /*dct avec non classique*/
352 s32 tmp0, tmp1, tmp2, tmp3;
353 s32 tmp10, tmp11, tmp12, tmp13;
354 s32 z1, z2, z3, z4, z5;
355 s32 d0, d1, d2, d3, d4, d5, d6, d7;
361 /* Pass 1: process rows. */
362 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
363 /* furthermore, we scale the results by 2**PASS1_BITS. */
367 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
369 /* Due to quantization, we will usually find that many of the input
370 * coefficients are zero, especially the AC terms. We can exploit this
371 * by short-circuiting the IDCT calculation for any row in which all
372 * the AC terms are zero. In that case each output is equal to the
373 * DC coefficient (with scale factor as needed).
374 * With typical images and quantization tables, half or more of the
375 * row DCT calculations can be simplified this way.
378 register int * idataptr = (int*)dataptr;
381 if ( (d1 == 0) && ((idataptr[1] | idataptr[2] | idataptr[3]) == 0) )
383 /* AC terms all zero */
386 /* Compute a 32 bit value to assign. */
387 dctelem_t dcval = (dctelem_t) (d0 << PASS1_BITS);
388 register int v = (dcval & 0xffff) | (dcval << 16);
396 dataptr += DCTSIZE; /* advance pointer to next row */
406 /* Even part: reverse the even part of the forward DCT. */
407 /* The rotator is sqrt(2)*c(-6). */
416 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
417 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
418 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
419 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
421 tmp0 = (d0 + d4) << CONST_BITS;
422 tmp1 = (d0 - d4) << CONST_BITS;
431 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
432 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
433 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
434 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
436 tmp0 = d4 << CONST_BITS;
441 tmp12 = -(tmp0 + tmp2);
448 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
449 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
450 tmp3 = MULTIPLY(d6, FIX(0.541196100));
452 tmp0 = (d0 + d4) << CONST_BITS;
453 tmp1 = (d0 - d4) << CONST_BITS;
462 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
463 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
464 tmp3 = MULTIPLY(d6, FIX(0.541196100));
466 tmp0 = d4 << CONST_BITS;
471 tmp12 = -(tmp0 + tmp2);
481 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
482 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
483 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
484 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
486 tmp0 = d0 << CONST_BITS;
495 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
496 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
497 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
498 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
510 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
511 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
512 tmp3 = MULTIPLY(d6, FIX(0.541196100));
514 tmp0 = d0 << CONST_BITS;
523 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
524 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
525 tmp3 = MULTIPLY(d6, FIX(0.541196100));
543 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
544 tmp2 = MULTIPLY(d2, FIX(0.541196100));
545 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
547 tmp0 = (d0 + d4) << CONST_BITS;
548 tmp1 = (d0 - d4) << CONST_BITS;
557 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
558 tmp2 = MULTIPLY(d2, FIX(0.541196100));
559 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
561 tmp0 = d4 << CONST_BITS;
566 tmp12 = -(tmp0 + tmp2);
573 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
574 tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
575 tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
579 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
580 tmp10 = tmp13 = d4 << CONST_BITS;
581 tmp11 = tmp12 = -tmp10;
591 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
592 tmp2 = MULTIPLY(d2, FIX(0.541196100));
593 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
595 tmp0 = d0 << CONST_BITS;
604 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
605 tmp2 = MULTIPLY(d2, FIX(0.541196100));
606 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
618 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
619 tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
623 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
624 tmp10 = tmp13 = tmp11 = tmp12 = 0;
631 /* Odd part per figure 8; the matrix is unitary and hence its
632 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
643 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
648 z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
650 tmp0 = MULTIPLY(d7, FIX(0.298631336));
651 tmp1 = MULTIPLY(d5, FIX(2.053119869));
652 tmp2 = MULTIPLY(d3, FIX(3.072711026));
653 tmp3 = MULTIPLY(d1, FIX(1.501321110));
654 z1 = MULTIPLY(z1, - FIX(0.899976223));
655 z2 = MULTIPLY(z2, - FIX(2.562915447));
656 z3 = MULTIPLY(z3, - FIX(1.961570560));
657 z4 = MULTIPLY(z4, - FIX(0.390180644));
669 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
672 z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
674 tmp0 = MULTIPLY(d7, FIX(0.298631336));
675 tmp1 = MULTIPLY(d5, FIX(2.053119869));
676 tmp2 = MULTIPLY(d3, FIX(3.072711026));
677 z1 = MULTIPLY(d7, - FIX(0.899976223));
678 z2 = MULTIPLY(z2, - FIX(2.562915447));
679 z3 = MULTIPLY(z3, - FIX(1.961570560));
680 z4 = MULTIPLY(d5, - FIX(0.390180644));
695 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
698 z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
700 tmp0 = MULTIPLY(d7, FIX(0.298631336));
701 tmp1 = MULTIPLY(d5, FIX(2.053119869));
702 tmp3 = MULTIPLY(d1, FIX(1.501321110));
703 z1 = MULTIPLY(z1, - FIX(0.899976223));
704 z2 = MULTIPLY(d5, - FIX(2.562915447));
705 z3 = MULTIPLY(d7, - FIX(1.961570560));
706 z4 = MULTIPLY(z4, - FIX(0.390180644));
718 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
719 z5 = MULTIPLY(d7 + d5, FIX(1.175875602));
721 tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
722 tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
723 z1 = MULTIPLY(d7, - FIX(0.899976223));
724 z3 = MULTIPLY(d7, - FIX(1.961570560));
725 z2 = MULTIPLY(d5, - FIX(2.562915447));
726 z4 = MULTIPLY(d5, - FIX(0.390180644));
744 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
747 z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
749 tmp0 = MULTIPLY(d7, FIX(0.298631336));
750 tmp2 = MULTIPLY(d3, FIX(3.072711026));
751 tmp3 = MULTIPLY(d1, FIX(1.501321110));
752 z1 = MULTIPLY(z1, - FIX(0.899976223));
753 z2 = MULTIPLY(d3, - FIX(2.562915447));
754 z3 = MULTIPLY(z3, - FIX(1.961570560));
755 z4 = MULTIPLY(d1, - FIX(0.390180644));
767 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
769 z5 = MULTIPLY(z3, FIX(1.175875602));
771 tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
772 tmp2 = MULTIPLY(d3, FIX(0.509795579));
773 z1 = MULTIPLY(d7, - FIX(0.899976223));
774 z2 = MULTIPLY(d3, - FIX(2.562915447));
775 z3 = MULTIPLY(z3, - FIX2(0.785694958));
787 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
789 z5 = MULTIPLY(z1, FIX(1.175875602));
791 tmp0 = MULTIPLY(d7, - FIX2(1.662939224));
792 tmp3 = MULTIPLY(d1, FIX2(1.111140466));
793 z1 = MULTIPLY(z1, FIX2(0.275899379));
794 z3 = MULTIPLY(d7, - FIX(1.961570560));
795 z4 = MULTIPLY(d1, - FIX(0.390180644));
804 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
805 tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
806 tmp1 = MULTIPLY(d7, FIX(1.175875602));
807 tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
808 tmp3 = MULTIPLY(d7, FIX2(0.275899379));
821 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
824 z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
826 tmp1 = MULTIPLY(d5, FIX(2.053119869));
827 tmp2 = MULTIPLY(d3, FIX(3.072711026));
828 tmp3 = MULTIPLY(d1, FIX(1.501321110));
829 z1 = MULTIPLY(d1, - FIX(0.899976223));
830 z2 = MULTIPLY(z2, - FIX(2.562915447));
831 z3 = MULTIPLY(d3, - FIX(1.961570560));
832 z4 = MULTIPLY(z4, - FIX(0.390180644));
844 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
846 z5 = MULTIPLY(z2, FIX(1.175875602));
848 tmp1 = MULTIPLY(d5, FIX2(1.662939225));
849 tmp2 = MULTIPLY(d3, FIX2(1.111140466));
850 z2 = MULTIPLY(z2, - FIX2(1.387039845));
851 z3 = MULTIPLY(d3, - FIX(1.961570560));
852 z4 = MULTIPLY(d5, - FIX(0.390180644));
864 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
866 z5 = MULTIPLY(z4, FIX(1.175875602));
868 tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
869 tmp3 = MULTIPLY(d1, FIX2(0.601344887));
870 z1 = MULTIPLY(d1, - FIX(0.899976223));
871 z2 = MULTIPLY(d5, - FIX(2.562915447));
872 z4 = MULTIPLY(z4, FIX2(0.785694958));
881 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
882 tmp0 = MULTIPLY(d5, FIX(1.175875602));
883 tmp1 = MULTIPLY(d5, FIX2(0.275899380));
884 tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
885 tmp3 = MULTIPLY(d5, FIX2(0.785694958));
895 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
898 tmp2 = MULTIPLY(d3, - FIX(1.451774981));
899 tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
900 z1 = MULTIPLY(d1, FIX(1.061594337));
901 z2 = MULTIPLY(d3, - FIX(2.172734803));
902 z4 = MULTIPLY(z5, FIX(0.785694958));
903 z5 = MULTIPLY(z5, FIX(1.175875602));
912 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
913 tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
914 tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
915 tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
916 tmp3 = MULTIPLY(d3, FIX(1.175875602));
923 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
924 tmp0 = MULTIPLY(d1, FIX2(0.275899379));
925 tmp1 = MULTIPLY(d1, FIX2(0.785694958));
926 tmp2 = MULTIPLY(d1, FIX(1.175875602));
927 tmp3 = MULTIPLY(d1, FIX2(1.387039845));
931 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
932 tmp0 = tmp1 = tmp2 = tmp3 = 0;
938 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
940 dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
941 dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
942 dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
943 dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
944 dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
945 dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
946 dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
947 dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
949 dataptr += DCTSIZE; /* advance pointer to next row */
952 /* Pass 2: process columns. */
953 /* Note that we must descale the results by a factor of 8 == 2**3, */
954 /* and also undo the PASS1_BITS scaling. */
957 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
959 /* Columns of zeroes can be exploited in the same way as we did with rows.
960 * However, the row calculation has created many nonzero AC terms, so the
961 * simplification applies less often (typically 5% to 10% of the time).
962 * On machines with very fast multiplication, it's possible that the
963 * test takes more time than it's worth. In that case this section
964 * may be commented out.
967 d0 = dataptr[DCTSIZE*0];
968 d1 = dataptr[DCTSIZE*1];
969 d2 = dataptr[DCTSIZE*2];
970 d3 = dataptr[DCTSIZE*3];
971 d4 = dataptr[DCTSIZE*4];
972 d5 = dataptr[DCTSIZE*5];
973 d6 = dataptr[DCTSIZE*6];
974 d7 = dataptr[DCTSIZE*7];
976 /* Even part: reverse the even part of the forward DCT. */
977 /* The rotator is sqrt(2)*c(-6). */
986 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
987 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
988 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
989 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
991 tmp0 = (d0 + d4) << CONST_BITS;
992 tmp1 = (d0 - d4) << CONST_BITS;
1001 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
1002 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1003 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1004 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1006 tmp0 = d4 << CONST_BITS;
1008 tmp10 = tmp0 + tmp3;
1009 tmp13 = tmp0 - tmp3;
1010 tmp11 = tmp2 - tmp0;
1011 tmp12 = -(tmp0 + tmp2);
1018 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1019 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1020 tmp3 = MULTIPLY(d6, FIX(0.541196100));
1022 tmp0 = (d0 + d4) << CONST_BITS;
1023 tmp1 = (d0 - d4) << CONST_BITS;
1025 tmp10 = tmp0 + tmp3;
1026 tmp13 = tmp0 - tmp3;
1027 tmp11 = tmp1 + tmp2;
1028 tmp12 = tmp1 - tmp2;
1032 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
1033 tmp2 = MULTIPLY(d6, -FIX2(1.306562965));
1034 tmp3 = MULTIPLY(d6, FIX(0.541196100));
1036 tmp0 = d4 << CONST_BITS;
1038 tmp10 = tmp0 + tmp3;
1039 tmp13 = tmp0 - tmp3;
1040 tmp11 = tmp2 - tmp0;
1041 tmp12 = -(tmp0 + tmp2);
1051 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
1052 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1053 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1054 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1056 tmp0 = d0 << CONST_BITS;
1058 tmp10 = tmp0 + tmp3;
1059 tmp13 = tmp0 - tmp3;
1060 tmp11 = tmp0 + tmp2;
1061 tmp12 = tmp0 - tmp2;
1065 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
1066 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1067 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1068 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1080 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
1081 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1082 tmp3 = MULTIPLY(d6, FIX(0.541196100));
1084 tmp0 = d0 << CONST_BITS;
1086 tmp10 = tmp0 + tmp3;
1087 tmp13 = tmp0 - tmp3;
1088 tmp11 = tmp0 + tmp2;
1089 tmp12 = tmp0 - tmp2;
1093 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
1094 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1095 tmp3 = MULTIPLY(d6, FIX(0.541196100));
1112 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1113 tmp2 = MULTIPLY(d2, FIX(0.541196100));
1114 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1116 tmp0 = (d0 + d4) << CONST_BITS;
1117 tmp1 = (d0 - d4) << CONST_BITS;
1119 tmp10 = tmp0 + tmp3;
1120 tmp13 = tmp0 - tmp3;
1121 tmp11 = tmp1 + tmp2;
1122 tmp12 = tmp1 - tmp2;
1126 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
1127 tmp2 = MULTIPLY(d2, FIX(0.541196100));
1128 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1130 tmp0 = d4 << CONST_BITS;
1132 tmp10 = tmp0 + tmp3;
1133 tmp13 = tmp0 - tmp3;
1134 tmp11 = tmp2 - tmp0;
1135 tmp12 = -(tmp0 + tmp2);
1142 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1143 tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1144 tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1148 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
1149 tmp10 = tmp13 = d4 << CONST_BITS;
1150 tmp11 = tmp12 = -tmp10;
1160 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
1161 tmp2 = MULTIPLY(d2, FIX(0.541196100));
1162 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1164 tmp0 = d0 << CONST_BITS;
1166 tmp10 = tmp0 + tmp3;
1167 tmp13 = tmp0 - tmp3;
1168 tmp11 = tmp0 + tmp2;
1169 tmp12 = tmp0 - tmp2;
1173 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
1174 tmp2 = MULTIPLY(d2, FIX(0.541196100));
1175 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1187 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
1188 tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
1192 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
1193 tmp10 = tmp13 = tmp11 = tmp12 = 0;
1199 /* Odd part per figure 8; the matrix is unitary and hence its
1200 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
1210 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
1215 z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
1217 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1218 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1219 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1220 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1221 z1 = MULTIPLY(z1, - FIX(0.899976223));
1222 z2 = MULTIPLY(z2, - FIX(2.562915447));
1223 z3 = MULTIPLY(z3, - FIX(1.961570560));
1224 z4 = MULTIPLY(z4, - FIX(0.390180644));
1236 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
1239 z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
1241 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1242 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1243 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1244 z1 = MULTIPLY(d7, - FIX(0.899976223));
1245 z2 = MULTIPLY(z2, - FIX(2.562915447));
1246 z3 = MULTIPLY(z3, - FIX(1.961570560));
1247 z4 = MULTIPLY(d5, - FIX(0.390180644));
1262 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
1265 z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
1267 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1268 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1269 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1270 z1 = MULTIPLY(z1, - FIX(0.899976223));
1271 z2 = MULTIPLY(d5, - FIX(2.562915447));
1272 z3 = MULTIPLY(d7, - FIX(1.961570560));
1273 z4 = MULTIPLY(z4, - FIX(0.390180644));
1285 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
1286 z5 = MULTIPLY(d5 + d7, FIX(1.175875602));
1288 tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
1289 tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1290 z1 = MULTIPLY(d7, - FIX(0.899976223));
1291 z3 = MULTIPLY(d7, - FIX(1.961570560));
1292 z2 = MULTIPLY(d5, - FIX(2.562915447));
1293 z4 = MULTIPLY(d5, - FIX(0.390180644));
1311 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
1314 z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
1316 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1317 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1318 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1319 z1 = MULTIPLY(z1, - FIX(0.899976223));
1320 z2 = MULTIPLY(d3, - FIX(2.562915447));
1321 z3 = MULTIPLY(z3, - FIX(1.961570560));
1322 z4 = MULTIPLY(d1, - FIX(0.390180644));
1334 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
1336 z5 = MULTIPLY(z3, FIX(1.175875602));
1338 tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
1339 z1 = MULTIPLY(d7, - FIX(0.899976223));
1340 tmp2 = MULTIPLY(d3, FIX(0.509795579));
1341 z2 = MULTIPLY(d3, - FIX(2.562915447));
1342 z3 = MULTIPLY(z3, - FIX2(0.785694958));
1354 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
1356 z5 = MULTIPLY(z1, FIX(1.175875602));
1358 tmp0 = MULTIPLY(d7, - FIX2(1.662939224));
1359 tmp3 = MULTIPLY(d1, FIX2(1.111140466));
1360 z1 = MULTIPLY(z1, FIX2(0.275899379));
1361 z3 = MULTIPLY(d7, - FIX(1.961570560));
1362 z4 = MULTIPLY(d1, - FIX(0.390180644));
1371 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
1372 tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
1373 tmp1 = MULTIPLY(d7, FIX(1.175875602));
1374 tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
1375 tmp3 = MULTIPLY(d7, FIX2(0.275899379));
1388 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
1391 z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
1393 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1394 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1395 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1396 z1 = MULTIPLY(d1, - FIX(0.899976223));
1397 z2 = MULTIPLY(z2, - FIX(2.562915447));
1398 z3 = MULTIPLY(d3, - FIX(1.961570560));
1399 z4 = MULTIPLY(z4, - FIX(0.390180644));
1411 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
1413 z5 = MULTIPLY(z2, FIX(1.175875602));
1415 tmp1 = MULTIPLY(d5, FIX2(1.662939225));
1416 tmp2 = MULTIPLY(d3, FIX2(1.111140466));
1417 z2 = MULTIPLY(z2, - FIX2(1.387039845));
1418 z3 = MULTIPLY(d3, - FIX(1.961570560));
1419 z4 = MULTIPLY(d5, - FIX(0.390180644));
1431 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
1433 z5 = MULTIPLY(z4, FIX(1.175875602));
1435 tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1436 tmp3 = MULTIPLY(d1, FIX2(0.601344887));
1437 z1 = MULTIPLY(d1, - FIX(0.899976223));
1438 z2 = MULTIPLY(d5, - FIX(2.562915447));
1439 z4 = MULTIPLY(z4, FIX2(0.785694958));
1448 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
1449 tmp0 = MULTIPLY(d5, FIX(1.175875602));
1450 tmp1 = MULTIPLY(d5, FIX2(0.275899380));
1451 tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
1452 tmp3 = MULTIPLY(d5, FIX2(0.785694958));
1462 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
1465 tmp2 = MULTIPLY(d3, - FIX(1.451774981));
1466 tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
1467 z1 = MULTIPLY(d1, FIX(1.061594337));
1468 z2 = MULTIPLY(d3, - FIX(2.172734803));
1469 z4 = MULTIPLY(z5, FIX(0.785694958));
1470 z5 = MULTIPLY(z5, FIX(1.175875602));
1479 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
1480 tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
1481 tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
1482 tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
1483 tmp3 = MULTIPLY(d3, FIX(1.175875602));
1490 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
1491 tmp0 = MULTIPLY(d1, FIX2(0.275899379));
1492 tmp1 = MULTIPLY(d1, FIX2(0.785694958));
1493 tmp2 = MULTIPLY(d1, FIX(1.175875602));
1494 tmp3 = MULTIPLY(d1, FIX2(1.387039845));
1498 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
1499 tmp0 = tmp1 = tmp2 = tmp3 = 0;
1505 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1507 dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
1508 CONST_BITS+PASS1_BITS+3);
1509 dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
1510 CONST_BITS+PASS1_BITS+3);
1511 dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
1512 CONST_BITS+PASS1_BITS+3);
1513 dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
1514 CONST_BITS+PASS1_BITS+3);
1515 dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
1516 CONST_BITS+PASS1_BITS+3);
1517 dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
1518 CONST_BITS+PASS1_BITS+3);
1519 dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
1520 CONST_BITS+PASS1_BITS+3);
1521 dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
1522 CONST_BITS+PASS1_BITS+3);
1524 dataptr++; /* advance pointer to next column */