1 /*****************************************************************************
2 * vdec_idct.c : IDCT functions
4 *****************************************************************************/
6 /*****************************************************************************
8 *****************************************************************************/
20 #include "vlc_thread.h"
23 #include "debug.h" /* ?? temporaire, requis par netlist.h */
26 #include "input_netlist.h"
27 #include "decoder_fifo.h"
29 #include "video_output.h"
31 #include "vdec_idct.h"
32 #include "video_decoder.h"
33 #include "vdec_motion.h"
35 #include "vpar_blocks.h"
36 #include "vpar_headers.h"
37 #include "vpar_synchro.h"
38 #include "video_parser.h"
39 #include "video_fifo.h"
45 /* Our current implementation is a fast DCT, we might move to a fast DFT or
46 * an MMX DCT in the future. */
48 /*****************************************************************************
49 * vdec_DummyIDCT : dummy function that does nothing
50 *****************************************************************************/
51 void vdec_DummyIDCT( vdec_thread_t * p_vdec, dctelem_t * p_block,
56 /*****************************************************************************
57 * init_SparseIDCT : initialize datas for vdec_SparceIDCT
58 * vdec_SparseIDCT : IDCT function for sparse matrices
59 *****************************************************************************/
61 void vdec_InitIDCT (vdec_thread_t * p_vdec)
65 dctelem_t * p_pre = p_vdec->p_pre_idct;
66 memset( p_pre, 0, 64*64*sizeof(dctelem_t) );
68 for( i=0 ; i < 64 ; i++ )
70 p_pre[i*64+i] = 1 << SPARSE_SCALE_FACTOR;
71 vdec_IDCT( p_vdec, &p_pre[i*64], 0) ;
76 void vdec_SparseIDCT (vdec_thread_t * p_vdec, dctelem_t * p_block,
86 /* If DC Coefficient. */
87 if ( i_sparse_pos == 0 )
90 val=RIGHT_SHIFT((*p_block + 4), 3);
91 /* Compute int to assign. This speeds things up a bit */
92 v = ((val & 0xffff) | (val << 16));
93 dp[0] = v; dp[1] = v; dp[2] = v; dp[3] = v;
94 dp[4] = v; dp[5] = v; dp[6] = v; dp[7] = v;
95 dp[8] = v; dp[9] = v; dp[10] = v; dp[11] = v;
96 dp[12] = v; dp[13] = v; dp[14] = v; dp[15] = v;
97 dp[16] = v; dp[17] = v; dp[18] = v; dp[19] = v;
98 dp[20] = v; dp[21] = v; dp[22] = v; dp[23] = v;
99 dp[24] = v; dp[25] = v; dp[26] = v; dp[27] = v;
100 dp[28] = v; dp[29] = v; dp[30] = v; dp[31] = v;
103 /* Some other coefficient. */
104 p_dest = (s16*)p_block;
105 p_source = (s16*)&p_vdec->p_pre_idct[i_sparse_pos];
106 coeff = (int)p_dest[i_sparse_pos];
107 for( rr=0 ; rr < 4 ; rr++ )
109 p_dest[0] = (p_source[0] * coeff) >> SPARSE_SCALE_FACTOR;
110 p_dest[1] = (p_source[1] * coeff) >> SPARSE_SCALE_FACTOR;
111 p_dest[2] = (p_source[2] * coeff) >> SPARSE_SCALE_FACTOR;
112 p_dest[3] = (p_source[3] * coeff) >> SPARSE_SCALE_FACTOR;
113 p_dest[4] = (p_source[4] * coeff) >> SPARSE_SCALE_FACTOR;
114 p_dest[5] = (p_source[5] * coeff) >> SPARSE_SCALE_FACTOR;
115 p_dest[6] = (p_source[6] * coeff) >> SPARSE_SCALE_FACTOR;
116 p_dest[7] = (p_source[7] * coeff) >> SPARSE_SCALE_FACTOR;
117 p_dest[8] = (p_source[8] * coeff) >> SPARSE_SCALE_FACTOR;
118 p_dest[9] = (p_source[9] * coeff) >> SPARSE_SCALE_FACTOR;
119 p_dest[10] = (p_source[10] * coeff) >> SPARSE_SCALE_FACTOR;
120 p_dest[11] = (p_source[11] * coeff) >> SPARSE_SCALE_FACTOR;
121 p_dest[12] = (p_source[12] * coeff) >> SPARSE_SCALE_FACTOR;
122 p_dest[13] = (p_source[13] * coeff) >> SPARSE_SCALE_FACTOR;
123 p_dest[14] = (p_source[14] * coeff) >> SPARSE_SCALE_FACTOR;
124 p_dest[15] = (p_source[15] * coeff) >> SPARSE_SCALE_FACTOR;
132 /*****************************************************************************
133 * vdec_IDCT : IDCT function for normal matrices
134 *****************************************************************************/
137 void vdec_IDCT( vdec_thread_t * p_vdec, dctelem_t * p_block, int i_idontcare )
140 /* dct classique: pour tester la meilleure entre la classique et la */
142 s32 tmp0, tmp1, tmp2, tmp3;
143 s32 tmp10, tmp11, tmp12, tmp13;
144 s32 z1, z2, z3, z4, z5;
149 /* Pass 1: process rows. */
150 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
151 /* furthermore, we scale the results by 2**PASS1_BITS. */
154 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
156 /* Due to quantization, we will usually find that many of the input
157 * coefficients are zero, especially the AC terms. We can exploit this
158 * by short-circuiting the IDCT calculation for any row in which all
159 * the AC terms are zero. In that case each output is equal to the
160 * DC coefficient (with scale factor as needed).
161 * With typical images and quantization tables, half or more of the
162 * row DCT calculations can be simplified this way.
165 if ((dataptr[1] | dataptr[2] | dataptr[3] | dataptr[4] |
166 dataptr[5] | dataptr[6] | dataptr[7]) == 0)
168 /* AC terms all zero */
169 dctelem_t dcval = (dctelem_t) (dataptr[0] << PASS1_BITS);
180 dataptr += DCTSIZE; /* advance pointer to next row */
184 /* Even part: reverse the even part of the forward DCT. */
185 /* The rotator is sqrt(2)*c(-6). */
187 z2 = (s32) dataptr[2];
188 z3 = (s32) dataptr[6];
190 z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
191 tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
192 tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
194 tmp0 = ((s32) dataptr[0] + (s32) dataptr[4]) << CONST_BITS;
195 tmp1 = ((s32) dataptr[0] - (s32) dataptr[4]) << CONST_BITS;
202 /* Odd part per figure 8; the matrix is unitary and hence its
203 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
206 tmp0 = (s32) dataptr[7];
207 tmp1 = (s32) dataptr[5];
208 tmp2 = (s32) dataptr[3];
209 tmp3 = (s32) dataptr[1];
215 z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
217 tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
218 tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
219 tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
220 tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
221 z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
222 z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
223 z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
224 z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
234 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
236 dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
237 dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
238 dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
239 dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
240 dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
241 dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
242 dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
243 dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
245 dataptr += DCTSIZE; /* advance pointer to next row */
248 /* Pass 2: process columns. */
249 /* Note that we must descale the results by a factor of 8 == 2**3, */
250 /* and also undo the PASS1_BITS scaling. */
253 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
255 /* Columns of zeroes can be exploited in the same way as we did with rows.
256 * However, the row calculation has created many nonzero AC terms, so the
257 * simplification applies less often (typically 5% to 10% of the time).
258 * On machines with very fast multiplication, it's possible that the
259 * test takes more time than it's worth. In that case this section
260 * may be commented out.
263 #ifndef NO_ZERO_COLUMN_TEST /*ajoute un test mais evite des calculs */
264 if ((dataptr[DCTSIZE*1] | dataptr[DCTSIZE*2] | dataptr[DCTSIZE*3] |
265 dataptr[DCTSIZE*4] | dataptr[DCTSIZE*5] | dataptr[DCTSIZE*6] |
266 dataptr[DCTSIZE*7]) == 0)
268 /* AC terms all zero */
269 dctelem_t dcval = (dctelem_t) DESCALE((s32) dataptr[0], PASS1_BITS+3);
271 dataptr[DCTSIZE*0] = dcval;
272 dataptr[DCTSIZE*1] = dcval;
273 dataptr[DCTSIZE*2] = dcval;
274 dataptr[DCTSIZE*3] = dcval;
275 dataptr[DCTSIZE*4] = dcval;
276 dataptr[DCTSIZE*5] = dcval;
277 dataptr[DCTSIZE*6] = dcval;
278 dataptr[DCTSIZE*7] = dcval;
280 dataptr++; /* advance pointer to next column */
285 /* Even part: reverse the even part of the forward DCT. */
286 /* The rotator is sqrt(2)*c(-6). */
288 z2 = (s32) dataptr[DCTSIZE*2];
289 z3 = (s32) dataptr[DCTSIZE*6];
291 z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
292 tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
293 tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
295 tmp0 = ((s32) dataptr[DCTSIZE*0] + (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
296 tmp1 = ((s32) dataptr[DCTSIZE*0] - (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
303 /* Odd part per figure 8; the matrix is unitary and hence its
304 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
307 tmp0 = (s32) dataptr[DCTSIZE*7];
308 tmp1 = (s32) dataptr[DCTSIZE*5];
309 tmp2 = (s32) dataptr[DCTSIZE*3];
310 tmp3 = (s32) dataptr[DCTSIZE*1];
316 z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
318 tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
319 tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
320 tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
321 tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
322 z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
323 z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
324 z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
325 z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
335 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
337 dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
338 CONST_BITS+PASS1_BITS+3);
339 dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
340 CONST_BITS+PASS1_BITS+3);
341 dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
342 CONST_BITS+PASS1_BITS+3);
343 dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
344 CONST_BITS+PASS1_BITS+3);
345 dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
346 CONST_BITS+PASS1_BITS+3);
347 dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
348 CONST_BITS+PASS1_BITS+3);
349 dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
350 CONST_BITS+PASS1_BITS+3);
351 dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
352 CONST_BITS+PASS1_BITS+3);
354 dataptr++; /* advance pointer to next column */
358 #if 1 /*dct avec non classique*/
360 s32 tmp0, tmp1, tmp2, tmp3;
361 s32 tmp10, tmp11, tmp12, tmp13;
362 s32 z1, z2, z3, z4, z5;
363 s32 d0, d1, d2, d3, d4, d5, d6, d7;
369 /* Pass 1: process rows. */
370 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
371 /* furthermore, we scale the results by 2**PASS1_BITS. */
375 fprintf( stderr, "normal dct" );
376 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
378 /* Due to quantization, we will usually find that many of the input
379 * coefficients are zero, especially the AC terms. We can exploit this
380 * by short-circuiting the IDCT calculation for any row in which all
381 * the AC terms are zero. In that case each output is equal to the
382 * DC coefficient (with scale factor as needed).
383 * With typical images and quantization tables, half or more of the
384 * row DCT calculations can be simplified this way.
387 register int * idataptr = (int*)dataptr;
390 if ( (d1 == 0) && ((idataptr[1] | idataptr[2] | idataptr[3]) == 0) )
392 /* AC terms all zero */
395 /* Compute a 32 bit value to assign. */
396 dctelem_t dcval = (dctelem_t) (d0 << PASS1_BITS);
397 register int v = (dcval & 0xffff) | (dcval << 16);
405 dataptr += DCTSIZE; /* advance pointer to next row */
415 /* Even part: reverse the even part of the forward DCT. */
416 /* The rotator is sqrt(2)*c(-6). */
425 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
426 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
427 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
428 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
430 tmp0 = (d0 + d4) << CONST_BITS;
431 tmp1 = (d0 - d4) << CONST_BITS;
440 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
441 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
442 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
443 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
445 tmp0 = d4 << CONST_BITS;
450 tmp12 = -(tmp0 + tmp2);
457 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
458 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
459 tmp3 = MULTIPLY(d6, FIX(0.541196100));
461 tmp0 = (d0 + d4) << CONST_BITS;
462 tmp1 = (d0 - d4) << CONST_BITS;
471 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
472 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
473 tmp3 = MULTIPLY(d6, FIX(0.541196100));
475 tmp0 = d4 << CONST_BITS;
480 tmp12 = -(tmp0 + tmp2);
490 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
491 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
492 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
493 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
495 tmp0 = d0 << CONST_BITS;
504 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
505 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
506 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
507 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
519 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
520 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
521 tmp3 = MULTIPLY(d6, FIX(0.541196100));
523 tmp0 = d0 << CONST_BITS;
532 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
533 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
534 tmp3 = MULTIPLY(d6, FIX(0.541196100));
552 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
553 tmp2 = MULTIPLY(d2, FIX(0.541196100));
554 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
556 tmp0 = (d0 + d4) << CONST_BITS;
557 tmp1 = (d0 - d4) << CONST_BITS;
566 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
567 tmp2 = MULTIPLY(d2, FIX(0.541196100));
568 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
570 tmp0 = d4 << CONST_BITS;
575 tmp12 = -(tmp0 + tmp2);
582 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
583 tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
584 tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
588 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
589 tmp10 = tmp13 = d4 << CONST_BITS;
590 tmp11 = tmp12 = -tmp10;
600 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
601 tmp2 = MULTIPLY(d2, FIX(0.541196100));
602 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
604 tmp0 = d0 << CONST_BITS;
613 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
614 tmp2 = MULTIPLY(d2, FIX(0.541196100));
615 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
627 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
628 tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
632 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
633 tmp10 = tmp13 = tmp11 = tmp12 = 0;
640 /* Odd part per figure 8; the matrix is unitary and hence its
641 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
652 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
657 z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
659 tmp0 = MULTIPLY(d7, FIX(0.298631336));
660 tmp1 = MULTIPLY(d5, FIX(2.053119869));
661 tmp2 = MULTIPLY(d3, FIX(3.072711026));
662 tmp3 = MULTIPLY(d1, FIX(1.501321110));
663 z1 = MULTIPLY(z1, - FIX(0.899976223));
664 z2 = MULTIPLY(z2, - FIX(2.562915447));
665 z3 = MULTIPLY(z3, - FIX(1.961570560));
666 z4 = MULTIPLY(z4, - FIX(0.390180644));
678 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
681 z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
683 tmp0 = MULTIPLY(d7, FIX(0.298631336));
684 tmp1 = MULTIPLY(d5, FIX(2.053119869));
685 tmp2 = MULTIPLY(d3, FIX(3.072711026));
686 z1 = MULTIPLY(d7, - FIX(0.899976223));
687 z2 = MULTIPLY(z2, - FIX(2.562915447));
688 z3 = MULTIPLY(z3, - FIX(1.961570560));
689 z4 = MULTIPLY(d5, - FIX(0.390180644));
704 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
707 z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
709 tmp0 = MULTIPLY(d7, FIX(0.298631336));
710 tmp1 = MULTIPLY(d5, FIX(2.053119869));
711 tmp3 = MULTIPLY(d1, FIX(1.501321110));
712 z1 = MULTIPLY(z1, - FIX(0.899976223));
713 z2 = MULTIPLY(d5, - FIX(2.562915447));
714 z3 = MULTIPLY(d7, - FIX(1.961570560));
715 z4 = MULTIPLY(z4, - FIX(0.390180644));
727 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
728 z5 = MULTIPLY(d7 + d5, FIX(1.175875602));
730 tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
731 tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
732 z1 = MULTIPLY(d7, - FIX(0.899976223));
733 z3 = MULTIPLY(d7, - FIX(1.961570560));
734 z2 = MULTIPLY(d5, - FIX(2.562915447));
735 z4 = MULTIPLY(d5, - FIX(0.390180644));
753 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
756 z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
758 tmp0 = MULTIPLY(d7, FIX(0.298631336));
759 tmp2 = MULTIPLY(d3, FIX(3.072711026));
760 tmp3 = MULTIPLY(d1, FIX(1.501321110));
761 z1 = MULTIPLY(z1, - FIX(0.899976223));
762 z2 = MULTIPLY(d3, - FIX(2.562915447));
763 z3 = MULTIPLY(z3, - FIX(1.961570560));
764 z4 = MULTIPLY(d1, - FIX(0.390180644));
776 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
778 z5 = MULTIPLY(z3, FIX(1.175875602));
780 tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
781 tmp2 = MULTIPLY(d3, FIX(0.509795579));
782 z1 = MULTIPLY(d7, - FIX(0.899976223));
783 z2 = MULTIPLY(d3, - FIX(2.562915447));
784 z3 = MULTIPLY(z3, - FIX2(0.785694958));
796 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
798 z5 = MULTIPLY(z1, FIX(1.175875602));
800 tmp0 = MULTIPLY(d7, - FIX2(1.662939224));
801 tmp3 = MULTIPLY(d1, FIX2(1.111140466));
802 z1 = MULTIPLY(z1, FIX2(0.275899379));
803 z3 = MULTIPLY(d7, - FIX(1.961570560));
804 z4 = MULTIPLY(d1, - FIX(0.390180644));
813 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
814 tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
815 tmp1 = MULTIPLY(d7, FIX(1.175875602));
816 tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
817 tmp3 = MULTIPLY(d7, FIX2(0.275899379));
830 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
833 z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
835 tmp1 = MULTIPLY(d5, FIX(2.053119869));
836 tmp2 = MULTIPLY(d3, FIX(3.072711026));
837 tmp3 = MULTIPLY(d1, FIX(1.501321110));
838 z1 = MULTIPLY(d1, - FIX(0.899976223));
839 z2 = MULTIPLY(z2, - FIX(2.562915447));
840 z3 = MULTIPLY(d3, - FIX(1.961570560));
841 z4 = MULTIPLY(z4, - FIX(0.390180644));
853 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
855 z5 = MULTIPLY(z2, FIX(1.175875602));
857 tmp1 = MULTIPLY(d5, FIX2(1.662939225));
858 tmp2 = MULTIPLY(d3, FIX2(1.111140466));
859 z2 = MULTIPLY(z2, - FIX2(1.387039845));
860 z3 = MULTIPLY(d3, - FIX(1.961570560));
861 z4 = MULTIPLY(d5, - FIX(0.390180644));
873 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
875 z5 = MULTIPLY(z4, FIX(1.175875602));
877 tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
878 tmp3 = MULTIPLY(d1, FIX2(0.601344887));
879 z1 = MULTIPLY(d1, - FIX(0.899976223));
880 z2 = MULTIPLY(d5, - FIX(2.562915447));
881 z4 = MULTIPLY(z4, FIX2(0.785694958));
890 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
891 tmp0 = MULTIPLY(d5, FIX(1.175875602));
892 tmp1 = MULTIPLY(d5, FIX2(0.275899380));
893 tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
894 tmp3 = MULTIPLY(d5, FIX2(0.785694958));
904 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
907 tmp2 = MULTIPLY(d3, - FIX(1.451774981));
908 tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
909 z1 = MULTIPLY(d1, FIX(1.061594337));
910 z2 = MULTIPLY(d3, - FIX(2.172734803));
911 z4 = MULTIPLY(z5, FIX(0.785694958));
912 z5 = MULTIPLY(z5, FIX(1.175875602));
921 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
922 tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
923 tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
924 tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
925 tmp3 = MULTIPLY(d3, FIX(1.175875602));
932 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
933 tmp0 = MULTIPLY(d1, FIX2(0.275899379));
934 tmp1 = MULTIPLY(d1, FIX2(0.785694958));
935 tmp2 = MULTIPLY(d1, FIX(1.175875602));
936 tmp3 = MULTIPLY(d1, FIX2(1.387039845));
940 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
941 tmp0 = tmp1 = tmp2 = tmp3 = 0;
947 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
949 dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
950 dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
951 dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
952 dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
953 dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
954 dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
955 dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
956 dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
958 dataptr += DCTSIZE; /* advance pointer to next row */
961 /* Pass 2: process columns. */
962 /* Note that we must descale the results by a factor of 8 == 2**3, */
963 /* and also undo the PASS1_BITS scaling. */
966 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
968 /* Columns of zeroes can be exploited in the same way as we did with rows.
969 * However, the row calculation has created many nonzero AC terms, so the
970 * simplification applies less often (typically 5% to 10% of the time).
971 * On machines with very fast multiplication, it's possible that the
972 * test takes more time than it's worth. In that case this section
973 * may be commented out.
976 d0 = dataptr[DCTSIZE*0];
977 d1 = dataptr[DCTSIZE*1];
978 d2 = dataptr[DCTSIZE*2];
979 d3 = dataptr[DCTSIZE*3];
980 d4 = dataptr[DCTSIZE*4];
981 d5 = dataptr[DCTSIZE*5];
982 d6 = dataptr[DCTSIZE*6];
983 d7 = dataptr[DCTSIZE*7];
985 /* Even part: reverse the even part of the forward DCT. */
986 /* The rotator is sqrt(2)*c(-6). */
995 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
996 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
997 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
998 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1000 tmp0 = (d0 + d4) << CONST_BITS;
1001 tmp1 = (d0 - d4) << CONST_BITS;
1003 tmp10 = tmp0 + tmp3;
1004 tmp13 = tmp0 - tmp3;
1005 tmp11 = tmp1 + tmp2;
1006 tmp12 = tmp1 - tmp2;
1010 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
1011 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1012 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1013 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1015 tmp0 = d4 << CONST_BITS;
1017 tmp10 = tmp0 + tmp3;
1018 tmp13 = tmp0 - tmp3;
1019 tmp11 = tmp2 - tmp0;
1020 tmp12 = -(tmp0 + tmp2);
1027 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1028 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1029 tmp3 = MULTIPLY(d6, FIX(0.541196100));
1031 tmp0 = (d0 + d4) << CONST_BITS;
1032 tmp1 = (d0 - d4) << CONST_BITS;
1034 tmp10 = tmp0 + tmp3;
1035 tmp13 = tmp0 - tmp3;
1036 tmp11 = tmp1 + tmp2;
1037 tmp12 = tmp1 - tmp2;
1041 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
1042 tmp2 = MULTIPLY(d6, -FIX2(1.306562965));
1043 tmp3 = MULTIPLY(d6, FIX(0.541196100));
1045 tmp0 = d4 << CONST_BITS;
1047 tmp10 = tmp0 + tmp3;
1048 tmp13 = tmp0 - tmp3;
1049 tmp11 = tmp2 - tmp0;
1050 tmp12 = -(tmp0 + tmp2);
1060 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
1061 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1062 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1063 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1065 tmp0 = d0 << CONST_BITS;
1067 tmp10 = tmp0 + tmp3;
1068 tmp13 = tmp0 - tmp3;
1069 tmp11 = tmp0 + tmp2;
1070 tmp12 = tmp0 - tmp2;
1074 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
1075 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1076 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1077 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1089 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
1090 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1091 tmp3 = MULTIPLY(d6, FIX(0.541196100));
1093 tmp0 = d0 << CONST_BITS;
1095 tmp10 = tmp0 + tmp3;
1096 tmp13 = tmp0 - tmp3;
1097 tmp11 = tmp0 + tmp2;
1098 tmp12 = tmp0 - tmp2;
1102 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
1103 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1104 tmp3 = MULTIPLY(d6, FIX(0.541196100));
1121 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1122 tmp2 = MULTIPLY(d2, FIX(0.541196100));
1123 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1125 tmp0 = (d0 + d4) << CONST_BITS;
1126 tmp1 = (d0 - d4) << CONST_BITS;
1128 tmp10 = tmp0 + tmp3;
1129 tmp13 = tmp0 - tmp3;
1130 tmp11 = tmp1 + tmp2;
1131 tmp12 = tmp1 - tmp2;
1135 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
1136 tmp2 = MULTIPLY(d2, FIX(0.541196100));
1137 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1139 tmp0 = d4 << CONST_BITS;
1141 tmp10 = tmp0 + tmp3;
1142 tmp13 = tmp0 - tmp3;
1143 tmp11 = tmp2 - tmp0;
1144 tmp12 = -(tmp0 + tmp2);
1151 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1152 tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1153 tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1157 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
1158 tmp10 = tmp13 = d4 << CONST_BITS;
1159 tmp11 = tmp12 = -tmp10;
1169 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
1170 tmp2 = MULTIPLY(d2, FIX(0.541196100));
1171 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1173 tmp0 = d0 << CONST_BITS;
1175 tmp10 = tmp0 + tmp3;
1176 tmp13 = tmp0 - tmp3;
1177 tmp11 = tmp0 + tmp2;
1178 tmp12 = tmp0 - tmp2;
1182 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
1183 tmp2 = MULTIPLY(d2, FIX(0.541196100));
1184 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1196 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
1197 tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
1201 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
1202 tmp10 = tmp13 = tmp11 = tmp12 = 0;
1208 /* Odd part per figure 8; the matrix is unitary and hence its
1209 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
1219 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
1224 z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
1226 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1227 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1228 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1229 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1230 z1 = MULTIPLY(z1, - FIX(0.899976223));
1231 z2 = MULTIPLY(z2, - FIX(2.562915447));
1232 z3 = MULTIPLY(z3, - FIX(1.961570560));
1233 z4 = MULTIPLY(z4, - FIX(0.390180644));
1245 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
1248 z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
1250 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1251 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1252 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1253 z1 = MULTIPLY(d7, - FIX(0.899976223));
1254 z2 = MULTIPLY(z2, - FIX(2.562915447));
1255 z3 = MULTIPLY(z3, - FIX(1.961570560));
1256 z4 = MULTIPLY(d5, - FIX(0.390180644));
1271 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
1274 z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
1276 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1277 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1278 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1279 z1 = MULTIPLY(z1, - FIX(0.899976223));
1280 z2 = MULTIPLY(d5, - FIX(2.562915447));
1281 z3 = MULTIPLY(d7, - FIX(1.961570560));
1282 z4 = MULTIPLY(z4, - FIX(0.390180644));
1294 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
1295 z5 = MULTIPLY(d5 + d7, FIX(1.175875602));
1297 tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
1298 tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1299 z1 = MULTIPLY(d7, - FIX(0.899976223));
1300 z3 = MULTIPLY(d7, - FIX(1.961570560));
1301 z2 = MULTIPLY(d5, - FIX(2.562915447));
1302 z4 = MULTIPLY(d5, - FIX(0.390180644));
1320 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
1323 z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
1325 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1326 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1327 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1328 z1 = MULTIPLY(z1, - FIX(0.899976223));
1329 z2 = MULTIPLY(d3, - FIX(2.562915447));
1330 z3 = MULTIPLY(z3, - FIX(1.961570560));
1331 z4 = MULTIPLY(d1, - FIX(0.390180644));
1343 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
1345 z5 = MULTIPLY(z3, FIX(1.175875602));
1347 tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
1348 z1 = MULTIPLY(d7, - FIX(0.899976223));
1349 tmp2 = MULTIPLY(d3, FIX(0.509795579));
1350 z2 = MULTIPLY(d3, - FIX(2.562915447));
1351 z3 = MULTIPLY(z3, - FIX2(0.785694958));
1363 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
1365 z5 = MULTIPLY(z1, FIX(1.175875602));
1367 tmp0 = MULTIPLY(d7, - FIX2(1.662939224));
1368 tmp3 = MULTIPLY(d1, FIX2(1.111140466));
1369 z1 = MULTIPLY(z1, FIX2(0.275899379));
1370 z3 = MULTIPLY(d7, - FIX(1.961570560));
1371 z4 = MULTIPLY(d1, - FIX(0.390180644));
1380 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
1381 tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
1382 tmp1 = MULTIPLY(d7, FIX(1.175875602));
1383 tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
1384 tmp3 = MULTIPLY(d7, FIX2(0.275899379));
1397 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
1400 z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
1402 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1403 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1404 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1405 z1 = MULTIPLY(d1, - FIX(0.899976223));
1406 z2 = MULTIPLY(z2, - FIX(2.562915447));
1407 z3 = MULTIPLY(d3, - FIX(1.961570560));
1408 z4 = MULTIPLY(z4, - FIX(0.390180644));
1420 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
1422 z5 = MULTIPLY(z2, FIX(1.175875602));
1424 tmp1 = MULTIPLY(d5, FIX2(1.662939225));
1425 tmp2 = MULTIPLY(d3, FIX2(1.111140466));
1426 z2 = MULTIPLY(z2, - FIX2(1.387039845));
1427 z3 = MULTIPLY(d3, - FIX(1.961570560));
1428 z4 = MULTIPLY(d5, - FIX(0.390180644));
1440 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
1442 z5 = MULTIPLY(z4, FIX(1.175875602));
1444 tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1445 tmp3 = MULTIPLY(d1, FIX2(0.601344887));
1446 z1 = MULTIPLY(d1, - FIX(0.899976223));
1447 z2 = MULTIPLY(d5, - FIX(2.562915447));
1448 z4 = MULTIPLY(z4, FIX2(0.785694958));
1457 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
1458 tmp0 = MULTIPLY(d5, FIX(1.175875602));
1459 tmp1 = MULTIPLY(d5, FIX2(0.275899380));
1460 tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
1461 tmp3 = MULTIPLY(d5, FIX2(0.785694958));
1471 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
1474 tmp2 = MULTIPLY(d3, - FIX(1.451774981));
1475 tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
1476 z1 = MULTIPLY(d1, FIX(1.061594337));
1477 z2 = MULTIPLY(d3, - FIX(2.172734803));
1478 z4 = MULTIPLY(z5, FIX(0.785694958));
1479 z5 = MULTIPLY(z5, FIX(1.175875602));
1488 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
1489 tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
1490 tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
1491 tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
1492 tmp3 = MULTIPLY(d3, FIX(1.175875602));
1499 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
1500 tmp0 = MULTIPLY(d1, FIX2(0.275899379));
1501 tmp1 = MULTIPLY(d1, FIX2(0.785694958));
1502 tmp2 = MULTIPLY(d1, FIX(1.175875602));
1503 tmp3 = MULTIPLY(d1, FIX2(1.387039845));
1507 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
1508 tmp0 = tmp1 = tmp2 = tmp3 = 0;
1514 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1516 dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
1517 CONST_BITS+PASS1_BITS+3);
1518 dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
1519 CONST_BITS+PASS1_BITS+3);
1520 dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
1521 CONST_BITS+PASS1_BITS+3);
1522 dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
1523 CONST_BITS+PASS1_BITS+3);
1524 dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
1525 CONST_BITS+PASS1_BITS+3);
1526 dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
1527 CONST_BITS+PASS1_BITS+3);
1528 dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
1529 CONST_BITS+PASS1_BITS+3);
1530 dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
1531 CONST_BITS+PASS1_BITS+3);
1533 dataptr++; /* advance pointer to next column */