1 /*****************************************************************************
2 * vdec_idct.c : IDCT functions
3 *****************************************************************************
4 * Copyright (C) 1999, 2000 VideoLAN
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 02111-1307, USA.
22 *****************************************************************************/
24 /*****************************************************************************
26 *****************************************************************************/
29 #include <sys/types.h> /* on BSD, uio.h needs types.h */
30 #include <sys/uio.h> /* for input.h */
41 #include "decoder_fifo.h"
43 #include "video_output.h"
45 #include "vdec_idct.h"
46 #include "video_decoder.h"
47 #include "vdec_motion.h"
49 #include "vpar_blocks.h"
50 #include "vpar_headers.h"
51 #include "vpar_synchro.h"
52 #include "video_parser.h"
53 #include "video_fifo.h"
59 /* Our current implementation is a fast DCT, we might move to a fast DFT or
60 * an MMX DCT in the future. */
62 /*****************************************************************************
63 * vdec_InitIDCT : initialize datas for vdec_SparceIDCT
64 * vdec_SparseIDCT : IDCT function for sparse matrices
65 *****************************************************************************/
67 void vdec_InitIDCT (vdec_thread_t * p_vdec)
71 dctelem_t * p_pre = p_vdec->p_pre_idct;
72 memset( p_pre, 0, 64*64*sizeof(dctelem_t) );
74 for( i=0 ; i < 64 ; i++ )
76 p_pre[i*64+i] = 1 << SPARSE_SCALE_FACTOR;
77 vdec_IDCT( p_vdec, &p_pre[i*64], 0) ;
82 void vdec_SparseIDCT (vdec_thread_t * p_vdec, dctelem_t * p_block,
92 /* If DC Coefficient. */
93 if ( i_sparse_pos == 0 )
96 val=RIGHT_SHIFT((*p_block + 4), 3);
97 /* Compute int to assign. This speeds things up a bit */
98 v = ((val & 0xffff) | (val << 16));
99 dp[0] = v; dp[1] = v; dp[2] = v; dp[3] = v;
100 dp[4] = v; dp[5] = v; dp[6] = v; dp[7] = v;
101 dp[8] = v; dp[9] = v; dp[10] = v; dp[11] = v;
102 dp[12] = v; dp[13] = v; dp[14] = v; dp[15] = v;
103 dp[16] = v; dp[17] = v; dp[18] = v; dp[19] = v;
104 dp[20] = v; dp[21] = v; dp[22] = v; dp[23] = v;
105 dp[24] = v; dp[25] = v; dp[26] = v; dp[27] = v;
106 dp[28] = v; dp[29] = v; dp[30] = v; dp[31] = v;
109 /* Some other coefficient. */
110 p_dest = (s16*)p_block;
111 p_source = (s16*)&p_vdec->p_pre_idct[i_sparse_pos];
112 coeff = (int)p_dest[i_sparse_pos];
113 for( rr=0 ; rr < 4 ; rr++ )
115 p_dest[0] = (p_source[0] * coeff) >> SPARSE_SCALE_FACTOR;
116 p_dest[1] = (p_source[1] * coeff) >> SPARSE_SCALE_FACTOR;
117 p_dest[2] = (p_source[2] * coeff) >> SPARSE_SCALE_FACTOR;
118 p_dest[3] = (p_source[3] * coeff) >> SPARSE_SCALE_FACTOR;
119 p_dest[4] = (p_source[4] * coeff) >> SPARSE_SCALE_FACTOR;
120 p_dest[5] = (p_source[5] * coeff) >> SPARSE_SCALE_FACTOR;
121 p_dest[6] = (p_source[6] * coeff) >> SPARSE_SCALE_FACTOR;
122 p_dest[7] = (p_source[7] * coeff) >> SPARSE_SCALE_FACTOR;
123 p_dest[8] = (p_source[8] * coeff) >> SPARSE_SCALE_FACTOR;
124 p_dest[9] = (p_source[9] * coeff) >> SPARSE_SCALE_FACTOR;
125 p_dest[10] = (p_source[10] * coeff) >> SPARSE_SCALE_FACTOR;
126 p_dest[11] = (p_source[11] * coeff) >> SPARSE_SCALE_FACTOR;
127 p_dest[12] = (p_source[12] * coeff) >> SPARSE_SCALE_FACTOR;
128 p_dest[13] = (p_source[13] * coeff) >> SPARSE_SCALE_FACTOR;
129 p_dest[14] = (p_source[14] * coeff) >> SPARSE_SCALE_FACTOR;
130 p_dest[15] = (p_source[15] * coeff) >> SPARSE_SCALE_FACTOR;
138 /*****************************************************************************
139 * vdec_IDCT : IDCT function for normal matrices
140 *****************************************************************************/
143 void vdec_IDCT( vdec_thread_t * p_vdec, dctelem_t * p_block, int i_idontcare )
146 /* dct classique: pour tester la meilleure entre la classique et la */
148 s32 tmp0, tmp1, tmp2, tmp3;
149 s32 tmp10, tmp11, tmp12, tmp13;
150 s32 z1, z2, z3, z4, z5;
155 /* Pass 1: process rows. */
156 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
157 /* furthermore, we scale the results by 2**PASS1_BITS. */
160 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
162 /* Due to quantization, we will usually find that many of the input
163 * coefficients are zero, especially the AC terms. We can exploit this
164 * by short-circuiting the IDCT calculation for any row in which all
165 * the AC terms are zero. In that case each output is equal to the
166 * DC coefficient (with scale factor as needed).
167 * With typical images and quantization tables, half or more of the
168 * row DCT calculations can be simplified this way.
171 if ((dataptr[1] | dataptr[2] | dataptr[3] | dataptr[4] |
172 dataptr[5] | dataptr[6] | dataptr[7]) == 0)
174 /* AC terms all zero */
175 dctelem_t dcval = (dctelem_t) (dataptr[0] << PASS1_BITS);
186 dataptr += DCTSIZE; /* advance pointer to next row */
190 /* Even part: reverse the even part of the forward DCT. */
191 /* The rotator is sqrt(2)*c(-6). */
193 z2 = (s32) dataptr[2];
194 z3 = (s32) dataptr[6];
196 z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
197 tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
198 tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
200 tmp0 = ((s32) dataptr[0] + (s32) dataptr[4]) << CONST_BITS;
201 tmp1 = ((s32) dataptr[0] - (s32) dataptr[4]) << CONST_BITS;
208 /* Odd part per figure 8; the matrix is unitary and hence its
209 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
212 tmp0 = (s32) dataptr[7];
213 tmp1 = (s32) dataptr[5];
214 tmp2 = (s32) dataptr[3];
215 tmp3 = (s32) dataptr[1];
221 z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
223 tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
224 tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
225 tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
226 tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
227 z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
228 z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
229 z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
230 z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
240 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
242 dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
243 dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
244 dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
245 dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
246 dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
247 dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
248 dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
249 dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
251 dataptr += DCTSIZE; /* advance pointer to next row */
254 /* Pass 2: process columns. */
255 /* Note that we must descale the results by a factor of 8 == 2**3, */
256 /* and also undo the PASS1_BITS scaling. */
259 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
261 /* Columns of zeroes can be exploited in the same way as we did with rows.
262 * However, the row calculation has created many nonzero AC terms, so the
263 * simplification applies less often (typically 5% to 10% of the time).
264 * On machines with very fast multiplication, it's possible that the
265 * test takes more time than it's worth. In that case this section
266 * may be commented out.
269 #ifndef NO_ZERO_COLUMN_TEST /*ajoute un test mais evite des calculs */
270 if ((dataptr[DCTSIZE*1] | dataptr[DCTSIZE*2] | dataptr[DCTSIZE*3] |
271 dataptr[DCTSIZE*4] | dataptr[DCTSIZE*5] | dataptr[DCTSIZE*6] |
272 dataptr[DCTSIZE*7]) == 0)
274 /* AC terms all zero */
275 dctelem_t dcval = (dctelem_t) DESCALE((s32) dataptr[0], PASS1_BITS+3);
277 dataptr[DCTSIZE*0] = dcval;
278 dataptr[DCTSIZE*1] = dcval;
279 dataptr[DCTSIZE*2] = dcval;
280 dataptr[DCTSIZE*3] = dcval;
281 dataptr[DCTSIZE*4] = dcval;
282 dataptr[DCTSIZE*5] = dcval;
283 dataptr[DCTSIZE*6] = dcval;
284 dataptr[DCTSIZE*7] = dcval;
286 dataptr++; /* advance pointer to next column */
291 /* Even part: reverse the even part of the forward DCT. */
292 /* The rotator is sqrt(2)*c(-6). */
294 z2 = (s32) dataptr[DCTSIZE*2];
295 z3 = (s32) dataptr[DCTSIZE*6];
297 z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
298 tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
299 tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
301 tmp0 = ((s32) dataptr[DCTSIZE*0] + (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
302 tmp1 = ((s32) dataptr[DCTSIZE*0] - (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
309 /* Odd part per figure 8; the matrix is unitary and hence its
310 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
313 tmp0 = (s32) dataptr[DCTSIZE*7];
314 tmp1 = (s32) dataptr[DCTSIZE*5];
315 tmp2 = (s32) dataptr[DCTSIZE*3];
316 tmp3 = (s32) dataptr[DCTSIZE*1];
322 z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
324 tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
325 tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
326 tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
327 tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
328 z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
329 z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
330 z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
331 z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
341 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
343 dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
344 CONST_BITS+PASS1_BITS+3);
345 dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
346 CONST_BITS+PASS1_BITS+3);
347 dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
348 CONST_BITS+PASS1_BITS+3);
349 dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
350 CONST_BITS+PASS1_BITS+3);
351 dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
352 CONST_BITS+PASS1_BITS+3);
353 dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
354 CONST_BITS+PASS1_BITS+3);
355 dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
356 CONST_BITS+PASS1_BITS+3);
357 dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
358 CONST_BITS+PASS1_BITS+3);
360 dataptr++; /* advance pointer to next column */
364 #if 1 /*dct avec non classique*/
366 s32 tmp0, tmp1, tmp2, tmp3;
367 s32 tmp10, tmp11, tmp12, tmp13;
368 s32 z1, z2, z3, z4, z5;
369 s32 d0, d1, d2, d3, d4, d5, d6, d7;
375 /* Pass 1: process rows. */
376 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
377 /* furthermore, we scale the results by 2**PASS1_BITS. */
381 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
383 /* Due to quantization, we will usually find that many of the input
384 * coefficients are zero, especially the AC terms. We can exploit this
385 * by short-circuiting the IDCT calculation for any row in which all
386 * the AC terms are zero. In that case each output is equal to the
387 * DC coefficient (with scale factor as needed).
388 * With typical images and quantization tables, half or more of the
389 * row DCT calculations can be simplified this way.
392 register int * idataptr = (int*)dataptr;
395 if ( (d1 == 0) && ((idataptr[1] | idataptr[2] | idataptr[3]) == 0) )
397 /* AC terms all zero */
400 /* Compute a 32 bit value to assign. */
401 dctelem_t dcval = (dctelem_t) (d0 << PASS1_BITS);
402 register int v = (dcval & 0xffff) | (dcval << 16);
410 dataptr += DCTSIZE; /* advance pointer to next row */
420 /* Even part: reverse the even part of the forward DCT. */
421 /* The rotator is sqrt(2)*c(-6). */
430 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
431 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
432 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
433 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
435 tmp0 = (d0 + d4) << CONST_BITS;
436 tmp1 = (d0 - d4) << CONST_BITS;
445 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
446 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
447 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
448 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
450 tmp0 = d4 << CONST_BITS;
455 tmp12 = -(tmp0 + tmp2);
462 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
463 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
464 tmp3 = MULTIPLY(d6, FIX(0.541196100));
466 tmp0 = (d0 + d4) << CONST_BITS;
467 tmp1 = (d0 - d4) << CONST_BITS;
476 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
477 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
478 tmp3 = MULTIPLY(d6, FIX(0.541196100));
480 tmp0 = d4 << CONST_BITS;
485 tmp12 = -(tmp0 + tmp2);
495 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
496 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
497 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
498 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
500 tmp0 = d0 << CONST_BITS;
509 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
510 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
511 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
512 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
524 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
525 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
526 tmp3 = MULTIPLY(d6, FIX(0.541196100));
528 tmp0 = d0 << CONST_BITS;
537 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
538 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
539 tmp3 = MULTIPLY(d6, FIX(0.541196100));
557 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
558 tmp2 = MULTIPLY(d2, FIX(0.541196100));
559 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
561 tmp0 = (d0 + d4) << CONST_BITS;
562 tmp1 = (d0 - d4) << CONST_BITS;
571 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
572 tmp2 = MULTIPLY(d2, FIX(0.541196100));
573 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
575 tmp0 = d4 << CONST_BITS;
580 tmp12 = -(tmp0 + tmp2);
587 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
588 tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
589 tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
593 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
594 tmp10 = tmp13 = d4 << CONST_BITS;
595 tmp11 = tmp12 = -tmp10;
605 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
606 tmp2 = MULTIPLY(d2, FIX(0.541196100));
607 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
609 tmp0 = d0 << CONST_BITS;
618 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
619 tmp2 = MULTIPLY(d2, FIX(0.541196100));
620 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
632 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
633 tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
637 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
638 tmp10 = tmp13 = tmp11 = tmp12 = 0;
645 /* Odd part per figure 8; the matrix is unitary and hence its
646 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
657 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
662 z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
664 tmp0 = MULTIPLY(d7, FIX(0.298631336));
665 tmp1 = MULTIPLY(d5, FIX(2.053119869));
666 tmp2 = MULTIPLY(d3, FIX(3.072711026));
667 tmp3 = MULTIPLY(d1, FIX(1.501321110));
668 z1 = MULTIPLY(z1, - FIX(0.899976223));
669 z2 = MULTIPLY(z2, - FIX(2.562915447));
670 z3 = MULTIPLY(z3, - FIX(1.961570560));
671 z4 = MULTIPLY(z4, - FIX(0.390180644));
683 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
686 z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
688 tmp0 = MULTIPLY(d7, FIX(0.298631336));
689 tmp1 = MULTIPLY(d5, FIX(2.053119869));
690 tmp2 = MULTIPLY(d3, FIX(3.072711026));
691 z1 = MULTIPLY(d7, - FIX(0.899976223));
692 z2 = MULTIPLY(z2, - FIX(2.562915447));
693 z3 = MULTIPLY(z3, - FIX(1.961570560));
694 z4 = MULTIPLY(d5, - FIX(0.390180644));
709 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
712 z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
714 tmp0 = MULTIPLY(d7, FIX(0.298631336));
715 tmp1 = MULTIPLY(d5, FIX(2.053119869));
716 tmp3 = MULTIPLY(d1, FIX(1.501321110));
717 z1 = MULTIPLY(z1, - FIX(0.899976223));
718 z2 = MULTIPLY(d5, - FIX(2.562915447));
719 z3 = MULTIPLY(d7, - FIX(1.961570560));
720 z4 = MULTIPLY(z4, - FIX(0.390180644));
732 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
733 z5 = MULTIPLY(d7 + d5, FIX(1.175875602));
735 tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
736 tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
737 z1 = MULTIPLY(d7, - FIX(0.899976223));
738 z3 = MULTIPLY(d7, - FIX(1.961570560));
739 z2 = MULTIPLY(d5, - FIX(2.562915447));
740 z4 = MULTIPLY(d5, - FIX(0.390180644));
758 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
761 z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
763 tmp0 = MULTIPLY(d7, FIX(0.298631336));
764 tmp2 = MULTIPLY(d3, FIX(3.072711026));
765 tmp3 = MULTIPLY(d1, FIX(1.501321110));
766 z1 = MULTIPLY(z1, - FIX(0.899976223));
767 z2 = MULTIPLY(d3, - FIX(2.562915447));
768 z3 = MULTIPLY(z3, - FIX(1.961570560));
769 z4 = MULTIPLY(d1, - FIX(0.390180644));
781 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
783 z5 = MULTIPLY(z3, FIX(1.175875602));
785 tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
786 tmp2 = MULTIPLY(d3, FIX(0.509795579));
787 z1 = MULTIPLY(d7, - FIX(0.899976223));
788 z2 = MULTIPLY(d3, - FIX(2.562915447));
789 z3 = MULTIPLY(z3, - FIX2(0.785694958));
801 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
803 z5 = MULTIPLY(z1, FIX(1.175875602));
805 tmp0 = MULTIPLY(d7, - FIX2(1.662939224));
806 tmp3 = MULTIPLY(d1, FIX2(1.111140466));
807 z1 = MULTIPLY(z1, FIX2(0.275899379));
808 z3 = MULTIPLY(d7, - FIX(1.961570560));
809 z4 = MULTIPLY(d1, - FIX(0.390180644));
818 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
819 tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
820 tmp1 = MULTIPLY(d7, FIX(1.175875602));
821 tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
822 tmp3 = MULTIPLY(d7, FIX2(0.275899379));
835 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
838 z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
840 tmp1 = MULTIPLY(d5, FIX(2.053119869));
841 tmp2 = MULTIPLY(d3, FIX(3.072711026));
842 tmp3 = MULTIPLY(d1, FIX(1.501321110));
843 z1 = MULTIPLY(d1, - FIX(0.899976223));
844 z2 = MULTIPLY(z2, - FIX(2.562915447));
845 z3 = MULTIPLY(d3, - FIX(1.961570560));
846 z4 = MULTIPLY(z4, - FIX(0.390180644));
858 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
860 z5 = MULTIPLY(z2, FIX(1.175875602));
862 tmp1 = MULTIPLY(d5, FIX2(1.662939225));
863 tmp2 = MULTIPLY(d3, FIX2(1.111140466));
864 z2 = MULTIPLY(z2, - FIX2(1.387039845));
865 z3 = MULTIPLY(d3, - FIX(1.961570560));
866 z4 = MULTIPLY(d5, - FIX(0.390180644));
878 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
880 z5 = MULTIPLY(z4, FIX(1.175875602));
882 tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
883 tmp3 = MULTIPLY(d1, FIX2(0.601344887));
884 z1 = MULTIPLY(d1, - FIX(0.899976223));
885 z2 = MULTIPLY(d5, - FIX(2.562915447));
886 z4 = MULTIPLY(z4, FIX2(0.785694958));
895 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
896 tmp0 = MULTIPLY(d5, FIX(1.175875602));
897 tmp1 = MULTIPLY(d5, FIX2(0.275899380));
898 tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
899 tmp3 = MULTIPLY(d5, FIX2(0.785694958));
909 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
912 tmp2 = MULTIPLY(d3, - FIX(1.451774981));
913 tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
914 z1 = MULTIPLY(d1, FIX(1.061594337));
915 z2 = MULTIPLY(d3, - FIX(2.172734803));
916 z4 = MULTIPLY(z5, FIX(0.785694958));
917 z5 = MULTIPLY(z5, FIX(1.175875602));
926 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
927 tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
928 tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
929 tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
930 tmp3 = MULTIPLY(d3, FIX(1.175875602));
937 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
938 tmp0 = MULTIPLY(d1, FIX2(0.275899379));
939 tmp1 = MULTIPLY(d1, FIX2(0.785694958));
940 tmp2 = MULTIPLY(d1, FIX(1.175875602));
941 tmp3 = MULTIPLY(d1, FIX2(1.387039845));
945 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
946 tmp0 = tmp1 = tmp2 = tmp3 = 0;
952 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
954 dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
955 dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
956 dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
957 dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
958 dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
959 dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
960 dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
961 dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
963 dataptr += DCTSIZE; /* advance pointer to next row */
966 /* Pass 2: process columns. */
967 /* Note that we must descale the results by a factor of 8 == 2**3, */
968 /* and also undo the PASS1_BITS scaling. */
971 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
973 /* Columns of zeroes can be exploited in the same way as we did with rows.
974 * However, the row calculation has created many nonzero AC terms, so the
975 * simplification applies less often (typically 5% to 10% of the time).
976 * On machines with very fast multiplication, it's possible that the
977 * test takes more time than it's worth. In that case this section
978 * may be commented out.
981 d0 = dataptr[DCTSIZE*0];
982 d1 = dataptr[DCTSIZE*1];
983 d2 = dataptr[DCTSIZE*2];
984 d3 = dataptr[DCTSIZE*3];
985 d4 = dataptr[DCTSIZE*4];
986 d5 = dataptr[DCTSIZE*5];
987 d6 = dataptr[DCTSIZE*6];
988 d7 = dataptr[DCTSIZE*7];
990 /* Even part: reverse the even part of the forward DCT. */
991 /* The rotator is sqrt(2)*c(-6). */
1000 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1001 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1002 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1003 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1005 tmp0 = (d0 + d4) << CONST_BITS;
1006 tmp1 = (d0 - d4) << CONST_BITS;
1008 tmp10 = tmp0 + tmp3;
1009 tmp13 = tmp0 - tmp3;
1010 tmp11 = tmp1 + tmp2;
1011 tmp12 = tmp1 - tmp2;
1015 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
1016 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1017 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1018 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1020 tmp0 = d4 << CONST_BITS;
1022 tmp10 = tmp0 + tmp3;
1023 tmp13 = tmp0 - tmp3;
1024 tmp11 = tmp2 - tmp0;
1025 tmp12 = -(tmp0 + tmp2);
1032 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1033 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1034 tmp3 = MULTIPLY(d6, FIX(0.541196100));
1036 tmp0 = (d0 + d4) << CONST_BITS;
1037 tmp1 = (d0 - d4) << CONST_BITS;
1039 tmp10 = tmp0 + tmp3;
1040 tmp13 = tmp0 - tmp3;
1041 tmp11 = tmp1 + tmp2;
1042 tmp12 = tmp1 - tmp2;
1046 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
1047 tmp2 = MULTIPLY(d6, -FIX2(1.306562965));
1048 tmp3 = MULTIPLY(d6, FIX(0.541196100));
1050 tmp0 = d4 << CONST_BITS;
1052 tmp10 = tmp0 + tmp3;
1053 tmp13 = tmp0 - tmp3;
1054 tmp11 = tmp2 - tmp0;
1055 tmp12 = -(tmp0 + tmp2);
1065 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
1066 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1067 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1068 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1070 tmp0 = d0 << CONST_BITS;
1072 tmp10 = tmp0 + tmp3;
1073 tmp13 = tmp0 - tmp3;
1074 tmp11 = tmp0 + tmp2;
1075 tmp12 = tmp0 - tmp2;
1079 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
1080 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1081 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1082 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1094 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
1095 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1096 tmp3 = MULTIPLY(d6, FIX(0.541196100));
1098 tmp0 = d0 << CONST_BITS;
1100 tmp10 = tmp0 + tmp3;
1101 tmp13 = tmp0 - tmp3;
1102 tmp11 = tmp0 + tmp2;
1103 tmp12 = tmp0 - tmp2;
1107 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
1108 tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1109 tmp3 = MULTIPLY(d6, FIX(0.541196100));
1126 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1127 tmp2 = MULTIPLY(d2, FIX(0.541196100));
1128 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1130 tmp0 = (d0 + d4) << CONST_BITS;
1131 tmp1 = (d0 - d4) << CONST_BITS;
1133 tmp10 = tmp0 + tmp3;
1134 tmp13 = tmp0 - tmp3;
1135 tmp11 = tmp1 + tmp2;
1136 tmp12 = tmp1 - tmp2;
1140 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
1141 tmp2 = MULTIPLY(d2, FIX(0.541196100));
1142 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1144 tmp0 = d4 << CONST_BITS;
1146 tmp10 = tmp0 + tmp3;
1147 tmp13 = tmp0 - tmp3;
1148 tmp11 = tmp2 - tmp0;
1149 tmp12 = -(tmp0 + tmp2);
1156 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1157 tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1158 tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1162 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
1163 tmp10 = tmp13 = d4 << CONST_BITS;
1164 tmp11 = tmp12 = -tmp10;
1174 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
1175 tmp2 = MULTIPLY(d2, FIX(0.541196100));
1176 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1178 tmp0 = d0 << CONST_BITS;
1180 tmp10 = tmp0 + tmp3;
1181 tmp13 = tmp0 - tmp3;
1182 tmp11 = tmp0 + tmp2;
1183 tmp12 = tmp0 - tmp2;
1187 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
1188 tmp2 = MULTIPLY(d2, FIX(0.541196100));
1189 tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1201 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
1202 tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
1206 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
1207 tmp10 = tmp13 = tmp11 = tmp12 = 0;
1213 /* Odd part per figure 8; the matrix is unitary and hence its
1214 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
1224 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
1229 z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
1231 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1232 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1233 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1234 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1235 z1 = MULTIPLY(z1, - FIX(0.899976223));
1236 z2 = MULTIPLY(z2, - FIX(2.562915447));
1237 z3 = MULTIPLY(z3, - FIX(1.961570560));
1238 z4 = MULTIPLY(z4, - FIX(0.390180644));
1250 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
1253 z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
1255 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1256 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1257 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1258 z1 = MULTIPLY(d7, - FIX(0.899976223));
1259 z2 = MULTIPLY(z2, - FIX(2.562915447));
1260 z3 = MULTIPLY(z3, - FIX(1.961570560));
1261 z4 = MULTIPLY(d5, - FIX(0.390180644));
1276 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
1279 z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
1281 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1282 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1283 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1284 z1 = MULTIPLY(z1, - FIX(0.899976223));
1285 z2 = MULTIPLY(d5, - FIX(2.562915447));
1286 z3 = MULTIPLY(d7, - FIX(1.961570560));
1287 z4 = MULTIPLY(z4, - FIX(0.390180644));
1299 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
1300 z5 = MULTIPLY(d5 + d7, FIX(1.175875602));
1302 tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
1303 tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1304 z1 = MULTIPLY(d7, - FIX(0.899976223));
1305 z3 = MULTIPLY(d7, - FIX(1.961570560));
1306 z2 = MULTIPLY(d5, - FIX(2.562915447));
1307 z4 = MULTIPLY(d5, - FIX(0.390180644));
1325 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
1328 z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
1330 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1331 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1332 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1333 z1 = MULTIPLY(z1, - FIX(0.899976223));
1334 z2 = MULTIPLY(d3, - FIX(2.562915447));
1335 z3 = MULTIPLY(z3, - FIX(1.961570560));
1336 z4 = MULTIPLY(d1, - FIX(0.390180644));
1348 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
1350 z5 = MULTIPLY(z3, FIX(1.175875602));
1352 tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
1353 z1 = MULTIPLY(d7, - FIX(0.899976223));
1354 tmp2 = MULTIPLY(d3, FIX(0.509795579));
1355 z2 = MULTIPLY(d3, - FIX(2.562915447));
1356 z3 = MULTIPLY(z3, - FIX2(0.785694958));
1368 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
1370 z5 = MULTIPLY(z1, FIX(1.175875602));
1372 tmp0 = MULTIPLY(d7, - FIX2(1.662939224));
1373 tmp3 = MULTIPLY(d1, FIX2(1.111140466));
1374 z1 = MULTIPLY(z1, FIX2(0.275899379));
1375 z3 = MULTIPLY(d7, - FIX(1.961570560));
1376 z4 = MULTIPLY(d1, - FIX(0.390180644));
1385 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
1386 tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
1387 tmp1 = MULTIPLY(d7, FIX(1.175875602));
1388 tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
1389 tmp3 = MULTIPLY(d7, FIX2(0.275899379));
1402 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
1405 z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
1407 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1408 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1409 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1410 z1 = MULTIPLY(d1, - FIX(0.899976223));
1411 z2 = MULTIPLY(z2, - FIX(2.562915447));
1412 z3 = MULTIPLY(d3, - FIX(1.961570560));
1413 z4 = MULTIPLY(z4, - FIX(0.390180644));
1425 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
1427 z5 = MULTIPLY(z2, FIX(1.175875602));
1429 tmp1 = MULTIPLY(d5, FIX2(1.662939225));
1430 tmp2 = MULTIPLY(d3, FIX2(1.111140466));
1431 z2 = MULTIPLY(z2, - FIX2(1.387039845));
1432 z3 = MULTIPLY(d3, - FIX(1.961570560));
1433 z4 = MULTIPLY(d5, - FIX(0.390180644));
1445 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
1447 z5 = MULTIPLY(z4, FIX(1.175875602));
1449 tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1450 tmp3 = MULTIPLY(d1, FIX2(0.601344887));
1451 z1 = MULTIPLY(d1, - FIX(0.899976223));
1452 z2 = MULTIPLY(d5, - FIX(2.562915447));
1453 z4 = MULTIPLY(z4, FIX2(0.785694958));
1462 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
1463 tmp0 = MULTIPLY(d5, FIX(1.175875602));
1464 tmp1 = MULTIPLY(d5, FIX2(0.275899380));
1465 tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
1466 tmp3 = MULTIPLY(d5, FIX2(0.785694958));
1476 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
1479 tmp2 = MULTIPLY(d3, - FIX(1.451774981));
1480 tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
1481 z1 = MULTIPLY(d1, FIX(1.061594337));
1482 z2 = MULTIPLY(d3, - FIX(2.172734803));
1483 z4 = MULTIPLY(z5, FIX(0.785694958));
1484 z5 = MULTIPLY(z5, FIX(1.175875602));
1493 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
1494 tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
1495 tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
1496 tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
1497 tmp3 = MULTIPLY(d3, FIX(1.175875602));
1504 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
1505 tmp0 = MULTIPLY(d1, FIX2(0.275899379));
1506 tmp1 = MULTIPLY(d1, FIX2(0.785694958));
1507 tmp2 = MULTIPLY(d1, FIX(1.175875602));
1508 tmp3 = MULTIPLY(d1, FIX2(1.387039845));
1512 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
1513 tmp0 = tmp1 = tmp2 = tmp3 = 0;
1519 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1521 dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
1522 CONST_BITS+PASS1_BITS+3);
1523 dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
1524 CONST_BITS+PASS1_BITS+3);
1525 dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
1526 CONST_BITS+PASS1_BITS+3);
1527 dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
1528 CONST_BITS+PASS1_BITS+3);
1529 dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
1530 CONST_BITS+PASS1_BITS+3);
1531 dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
1532 CONST_BITS+PASS1_BITS+3);
1533 dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
1534 CONST_BITS+PASS1_BITS+3);
1535 dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
1536 CONST_BITS+PASS1_BITS+3);
1538 dataptr++; /* advance pointer to next column */