4 * Copyright (C) 1991, 1992, Thomas G. Lane.
5 * This file is part of the Independent JPEG Group's software.
6 * For conditions of distribution and use, see the accompanying README file.
8 * This file contains the basic inverse-DCT transformation subroutine.
10 * This implementation is based on an algorithm described in
11 * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
12 * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
13 * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
14 * The primary algorithm described there uses 11 multiplies and 29 adds.
15 * We use their alternate method with 12 multiplies and 32 adds.
16 * The advantage of this method is that no data path contains more than one
17 * multiplication; this allows a very simple and accurate implementation in
18 * scaled fixed-point arithmetic, with a minimal number of shifts.
20 * I've made lots of modifications to attempt to take advantage of the
21 * sparse nature of the DCT matrices we're getting. Although the logic
22 * is cumbersome, it's straightforward and the resulting code is much
25 * A better way to do this would be to pass in the DCT block as a sparse
26 * matrix, perhaps with the different cases encoded.
31 #define EIGHT_BIT_SAMPLES
38 #define RIGHT_SHIFT(x, n) ((x) >> (n))
40 typedef DCTELEM DCTBLOCK[DCTSIZE2];
45 * This routine is specialized to the case DCTSIZE = 8.
49 Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
54 * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
55 * on each column. Direct algorithms are also available, but they are
56 * much more complex and seem not to be any faster when reduced to code.
58 * The poop on this scaling stuff is as follows:
60 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
61 * larger than the true IDCT outputs. The final outputs are therefore
62 * a factor of N larger than desired; since N=8 this can be cured by
63 * a simple right shift at the end of the algorithm. The advantage of
64 * this arrangement is that we save two multiplications per 1-D IDCT,
65 * because the y0 and y4 inputs need not be divided by sqrt(N).
67 * We have to do addition and subtraction of the integer inputs, which
68 * is no problem, and multiplication by fractional constants, which is
69 * a problem to do in integer arithmetic. We multiply all the constants
70 * by CONST_SCALE and convert them to integer constants (thus retaining
71 * CONST_BITS bits of precision in the constants). After doing a
72 * multiplication we have to divide the product by CONST_SCALE, with proper
73 * rounding, to produce the correct output. This division can be done
74 * cheaply as a right shift of CONST_BITS bits. We postpone shifting
75 * as long as possible so that partial sums can be added together with
76 * full fractional precision.
78 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
79 * they are represented to better-than-integral precision. These outputs
80 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
81 * with the recommended scaling. (To scale up 12-bit sample data further, an
82 * intermediate int32 array would be needed.)
84 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
85 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
86 * shows that the values given below are the most effective.
89 #ifdef EIGHT_BIT_SAMPLES
92 #define PASS1_BITS 1 /* lose a little precision to avoid overflow */
95 #define ONE ((INT32) 1)
97 #define CONST_SCALE (ONE << CONST_BITS)
99 /* Convert a positive real constant to an integer scaled by CONST_SCALE.
100 * IMPORTANT: if your compiler doesn't do this arithmetic at compile time,
101 * you will pay a significant penalty in run time. In that case, figure
102 * the correct integer constant values and insert them by hand.
105 /* Actually FIX is no longer used; all the constants are precomputed. */
106 #define FIX(x) ((INT32) ((x) * CONST_SCALE + 0.5))
108 /* Descale and correctly round an INT32 value that's scaled by N bits.
109 * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
110 * the fudge factor is correct for either sign of X.
113 #define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
115 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
116 * For 8-bit samples with the recommended scaling, all the variable
117 * and constant values involved are no more than 16 bits wide, so a
118 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply;
119 * this provides a useful speedup on many machines.
120 * There is no way to specify a 16x16->32 multiply in portable C, but
121 * some C compilers will do the right thing if you provide the correct
122 * combination of casts.
123 * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
126 #ifdef EIGHT_BIT_SAMPLES
127 #ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */
128 #define MULTIPLY(var,const) (((INT16) (var)) * ((INT16) (const)))
130 #ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */
131 #define MULTIPLY(var,const) (((INT16) (var)) * ((INT32) (const)))
135 #ifndef MULTIPLY /* default definition */
136 #define MULTIPLY(var,const) ((var) * (const))
141 Unlike our decoder, where we approximate the FIX constants, we need to use
142 exact ones here; otherwise successive P-frames will drift too much with reference frame coding.
144 #define FIX_0_211164243 1730
145 #define FIX_0_275899380 2260
146 #define FIX_0_298631336 2446
147 #define FIX_0_390180644 3196
148 #define FIX_0_509795579 4176
149 #define FIX_0_541196100 4433
150 #define FIX_0_601344887 4926
151 #define FIX_0_765366865 6270
152 #define FIX_0_785694958 6436
153 #define FIX_0_899976223 7373
154 #define FIX_1_061594337 8697
155 #define FIX_1_111140466 9102
156 #define FIX_1_175875602 9633
157 #define FIX_1_306562965 10703
158 #define FIX_1_387039845 11363
159 #define FIX_1_451774981 11893
160 #define FIX_1_501321110 12299
161 #define FIX_1_662939225 13623
162 #define FIX_1_847759065 15137
163 #define FIX_1_961570560 16069
164 #define FIX_2_053119869 16819
165 #define FIX_2_172734803 17799
166 #define FIX_2_562915447 20995
167 #define FIX_3_072711026 25172
170 * Perform the inverse DCT on one block of coefficients.
173 void j_rev_dct(DCTBLOCK data)
175 INT32 tmp0, tmp1, tmp2, tmp3;
176 INT32 tmp10, tmp11, tmp12, tmp13;
177 INT32 z1, z2, z3, z4, z5;
178 INT32 d0, d1, d2, d3, d4, d5, d6, d7;
179 register DCTELEM *dataptr;
182 /* Pass 1: process rows. */
183 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
184 /* furthermore, we scale the results by 2**PASS1_BITS. */
188 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
189 /* Due to quantization, we will usually find that many of the input
190 * coefficients are zero, especially the AC terms. We can exploit this
191 * by short-circuiting the IDCT calculation for any row in which all
192 * the AC terms are zero. In that case each output is equal to the
193 * DC coefficient (with scale factor as needed).
194 * With typical images and quantization tables, half or more of the
195 * row DCT calculations can be simplified this way.
198 register int *idataptr = (int*)dataptr;
200 /* WARNING: we do the same permutation as MMX idct to simplify the
211 if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) {
212 /* AC terms all zero */
214 /* Compute a 32 bit value to assign. */
215 DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS);
216 register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
224 dataptr += DCTSIZE; /* advance pointer to next row */
228 /* Even part: reverse the even part of the forward DCT. */
229 /* The rotator is sqrt(2)*c(-6). */
235 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
236 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
237 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
238 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
240 tmp0 = (d0 + d4) << CONST_BITS;
241 tmp1 = (d0 - d4) << CONST_BITS;
248 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
249 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
250 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
251 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
253 tmp0 = d4 << CONST_BITS;
258 tmp12 = -(tmp0 + tmp2);
262 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
263 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
264 tmp3 = MULTIPLY(d6, FIX_0_541196100);
266 tmp0 = (d0 + d4) << CONST_BITS;
267 tmp1 = (d0 - d4) << CONST_BITS;
274 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
275 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
276 tmp3 = MULTIPLY(d6, FIX_0_541196100);
278 tmp0 = d4 << CONST_BITS;
283 tmp12 = -(tmp0 + tmp2);
289 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
290 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
291 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
292 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
294 tmp0 = d0 << CONST_BITS;
301 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
302 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
303 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
304 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
313 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
314 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
315 tmp3 = MULTIPLY(d6, FIX_0_541196100);
317 tmp0 = d0 << CONST_BITS;
324 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
325 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
326 tmp3 = MULTIPLY(d6, FIX_0_541196100);
339 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
340 tmp2 = MULTIPLY(d2, FIX_0_541196100);
341 tmp3 = MULTIPLY(d2, FIX_1_306562965);
343 tmp0 = (d0 + d4) << CONST_BITS;
344 tmp1 = (d0 - d4) << CONST_BITS;
351 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
352 tmp2 = MULTIPLY(d2, FIX_0_541196100);
353 tmp3 = MULTIPLY(d2, FIX_1_306562965);
355 tmp0 = d4 << CONST_BITS;
360 tmp12 = -(tmp0 + tmp2);
364 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
365 tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
366 tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
368 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
369 tmp10 = tmp13 = d4 << CONST_BITS;
370 tmp11 = tmp12 = -tmp10;
376 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
377 tmp2 = MULTIPLY(d2, FIX_0_541196100);
378 tmp3 = MULTIPLY(d2, FIX_1_306562965);
380 tmp0 = d0 << CONST_BITS;
387 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
388 tmp2 = MULTIPLY(d2, FIX_0_541196100);
389 tmp3 = MULTIPLY(d2, FIX_1_306562965);
398 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
399 tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
401 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
402 tmp10 = tmp13 = tmp11 = tmp12 = 0;
408 /* Odd part per figure 8; the matrix is unitary and hence its
409 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
416 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
421 z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
423 tmp0 = MULTIPLY(d7, FIX_0_298631336);
424 tmp1 = MULTIPLY(d5, FIX_2_053119869);
425 tmp2 = MULTIPLY(d3, FIX_3_072711026);
426 tmp3 = MULTIPLY(d1, FIX_1_501321110);
427 z1 = MULTIPLY(-z1, FIX_0_899976223);
428 z2 = MULTIPLY(-z2, FIX_2_562915447);
429 z3 = MULTIPLY(-z3, FIX_1_961570560);
430 z4 = MULTIPLY(-z4, FIX_0_390180644);
440 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
443 z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
445 tmp0 = MULTIPLY(d7, FIX_0_298631336);
446 tmp1 = MULTIPLY(d5, FIX_2_053119869);
447 tmp2 = MULTIPLY(d3, FIX_3_072711026);
448 z1 = MULTIPLY(-d7, FIX_0_899976223);
449 z2 = MULTIPLY(-z2, FIX_2_562915447);
450 z3 = MULTIPLY(-z3, FIX_1_961570560);
451 z4 = MULTIPLY(-d5, FIX_0_390180644);
463 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
466 z5 = MULTIPLY(d7 + z4, FIX_1_175875602);
468 tmp0 = MULTIPLY(d7, FIX_0_298631336);
469 tmp1 = MULTIPLY(d5, FIX_2_053119869);
470 tmp3 = MULTIPLY(d1, FIX_1_501321110);
471 z1 = MULTIPLY(-z1, FIX_0_899976223);
472 z2 = MULTIPLY(-d5, FIX_2_562915447);
473 z3 = MULTIPLY(-d7, FIX_1_961570560);
474 z4 = MULTIPLY(-z4, FIX_0_390180644);
484 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
485 tmp0 = MULTIPLY(-d7, FIX_0_601344887);
486 z1 = MULTIPLY(-d7, FIX_0_899976223);
487 z3 = MULTIPLY(-d7, FIX_1_961570560);
488 tmp1 = MULTIPLY(-d5, FIX_0_509795579);
489 z2 = MULTIPLY(-d5, FIX_2_562915447);
490 z4 = MULTIPLY(-d5, FIX_0_390180644);
491 z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
505 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
508 z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
510 tmp0 = MULTIPLY(d7, FIX_0_298631336);
511 tmp2 = MULTIPLY(d3, FIX_3_072711026);
512 tmp3 = MULTIPLY(d1, FIX_1_501321110);
513 z1 = MULTIPLY(-z1, FIX_0_899976223);
514 z2 = MULTIPLY(-d3, FIX_2_562915447);
515 z3 = MULTIPLY(-z3, FIX_1_961570560);
516 z4 = MULTIPLY(-d1, FIX_0_390180644);
526 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
529 tmp0 = MULTIPLY(-d7, FIX_0_601344887);
530 z1 = MULTIPLY(-d7, FIX_0_899976223);
531 tmp2 = MULTIPLY(d3, FIX_0_509795579);
532 z2 = MULTIPLY(-d3, FIX_2_562915447);
533 z5 = MULTIPLY(z3, FIX_1_175875602);
534 z3 = MULTIPLY(-z3, FIX_0_785694958);
543 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
545 z5 = MULTIPLY(z1, FIX_1_175875602);
547 z1 = MULTIPLY(z1, FIX_0_275899380);
548 z3 = MULTIPLY(-d7, FIX_1_961570560);
549 tmp0 = MULTIPLY(-d7, FIX_1_662939225);
550 z4 = MULTIPLY(-d1, FIX_0_390180644);
551 tmp3 = MULTIPLY(d1, FIX_1_111140466);
558 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
559 tmp0 = MULTIPLY(-d7, FIX_1_387039845);
560 tmp1 = MULTIPLY(d7, FIX_1_175875602);
561 tmp2 = MULTIPLY(-d7, FIX_0_785694958);
562 tmp3 = MULTIPLY(d7, FIX_0_275899380);
570 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
573 z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
575 tmp1 = MULTIPLY(d5, FIX_2_053119869);
576 tmp2 = MULTIPLY(d3, FIX_3_072711026);
577 tmp3 = MULTIPLY(d1, FIX_1_501321110);
578 z1 = MULTIPLY(-d1, FIX_0_899976223);
579 z2 = MULTIPLY(-z2, FIX_2_562915447);
580 z3 = MULTIPLY(-d3, FIX_1_961570560);
581 z4 = MULTIPLY(-z4, FIX_0_390180644);
591 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
594 z5 = MULTIPLY(z2, FIX_1_175875602);
595 tmp1 = MULTIPLY(d5, FIX_1_662939225);
596 z4 = MULTIPLY(-d5, FIX_0_390180644);
597 z2 = MULTIPLY(-z2, FIX_1_387039845);
598 tmp2 = MULTIPLY(d3, FIX_1_111140466);
599 z3 = MULTIPLY(-d3, FIX_1_961570560);
608 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
611 z5 = MULTIPLY(z4, FIX_1_175875602);
612 z1 = MULTIPLY(-d1, FIX_0_899976223);
613 tmp3 = MULTIPLY(d1, FIX_0_601344887);
614 tmp1 = MULTIPLY(-d5, FIX_0_509795579);
615 z2 = MULTIPLY(-d5, FIX_2_562915447);
616 z4 = MULTIPLY(z4, FIX_0_785694958);
623 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
624 tmp0 = MULTIPLY(d5, FIX_1_175875602);
625 tmp1 = MULTIPLY(d5, FIX_0_275899380);
626 tmp2 = MULTIPLY(-d5, FIX_1_387039845);
627 tmp3 = MULTIPLY(d5, FIX_0_785694958);
633 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
635 tmp3 = MULTIPLY(d1, FIX_0_211164243);
636 tmp2 = MULTIPLY(-d3, FIX_1_451774981);
637 z1 = MULTIPLY(d1, FIX_1_061594337);
638 z2 = MULTIPLY(-d3, FIX_2_172734803);
639 z4 = MULTIPLY(z5, FIX_0_785694958);
640 z5 = MULTIPLY(z5, FIX_1_175875602);
647 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
648 tmp0 = MULTIPLY(-d3, FIX_0_785694958);
649 tmp1 = MULTIPLY(-d3, FIX_1_387039845);
650 tmp2 = MULTIPLY(-d3, FIX_0_275899380);
651 tmp3 = MULTIPLY(d3, FIX_1_175875602);
655 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
656 tmp0 = MULTIPLY(d1, FIX_0_275899380);
657 tmp1 = MULTIPLY(d1, FIX_0_785694958);
658 tmp2 = MULTIPLY(d1, FIX_1_175875602);
659 tmp3 = MULTIPLY(d1, FIX_1_387039845);
661 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
662 tmp0 = tmp1 = tmp2 = tmp3 = 0;
668 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
670 dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
671 dataptr[7] = (DCTELEM) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
672 dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
673 dataptr[6] = (DCTELEM) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
674 dataptr[2] = (DCTELEM) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
675 dataptr[5] = (DCTELEM) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
676 dataptr[3] = (DCTELEM) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
677 dataptr[4] = (DCTELEM) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
679 dataptr += DCTSIZE; /* advance pointer to next row */
682 /* Pass 2: process columns. */
683 /* Note that we must descale the results by a factor of 8 == 2**3, */
684 /* and also undo the PASS1_BITS scaling. */
687 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
688 /* Columns of zeroes can be exploited in the same way as we did with rows.
689 * However, the row calculation has created many nonzero AC terms, so the
690 * simplification applies less often (typically 5% to 10% of the time).
691 * On machines with very fast multiplication, it's possible that the
692 * test takes more time than it's worth. In that case this section
693 * may be commented out.
696 d0 = dataptr[DCTSIZE*0];
697 d1 = dataptr[DCTSIZE*1];
698 d2 = dataptr[DCTSIZE*2];
699 d3 = dataptr[DCTSIZE*3];
700 d4 = dataptr[DCTSIZE*4];
701 d5 = dataptr[DCTSIZE*5];
702 d6 = dataptr[DCTSIZE*6];
703 d7 = dataptr[DCTSIZE*7];
705 /* Even part: reverse the even part of the forward DCT. */
706 /* The rotator is sqrt(2)*c(-6). */
711 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
712 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
713 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
714 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
716 tmp0 = (d0 + d4) << CONST_BITS;
717 tmp1 = (d0 - d4) << CONST_BITS;
724 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
725 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
726 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
727 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
729 tmp0 = d4 << CONST_BITS;
734 tmp12 = -(tmp0 + tmp2);
738 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
739 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
740 tmp3 = MULTIPLY(d6, FIX_0_541196100);
742 tmp0 = (d0 + d4) << CONST_BITS;
743 tmp1 = (d0 - d4) << CONST_BITS;
750 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
751 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
752 tmp3 = MULTIPLY(d6, FIX_0_541196100);
754 tmp0 = d4 << CONST_BITS;
759 tmp12 = -(tmp0 + tmp2);
765 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
766 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
767 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
768 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
770 tmp0 = d0 << CONST_BITS;
777 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
778 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
779 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
780 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
789 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
790 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
791 tmp3 = MULTIPLY(d6, FIX_0_541196100);
793 tmp0 = d0 << CONST_BITS;
800 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
801 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
802 tmp3 = MULTIPLY(d6, FIX_0_541196100);
815 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
816 tmp2 = MULTIPLY(d2, FIX_0_541196100);
817 tmp3 = MULTIPLY(d2, FIX_1_306562965);
819 tmp0 = (d0 + d4) << CONST_BITS;
820 tmp1 = (d0 - d4) << CONST_BITS;
827 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
828 tmp2 = MULTIPLY(d2, FIX_0_541196100);
829 tmp3 = MULTIPLY(d2, FIX_1_306562965);
831 tmp0 = d4 << CONST_BITS;
836 tmp12 = -(tmp0 + tmp2);
840 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
841 tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
842 tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
844 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
845 tmp10 = tmp13 = d4 << CONST_BITS;
846 tmp11 = tmp12 = -tmp10;
852 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
853 tmp2 = MULTIPLY(d2, FIX_0_541196100);
854 tmp3 = MULTIPLY(d2, FIX_1_306562965);
856 tmp0 = d0 << CONST_BITS;
863 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
864 tmp2 = MULTIPLY(d2, FIX_0_541196100);
865 tmp3 = MULTIPLY(d2, FIX_1_306562965);
874 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
875 tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
877 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
878 tmp10 = tmp13 = tmp11 = tmp12 = 0;
884 /* Odd part per figure 8; the matrix is unitary and hence its
885 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
891 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
896 z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
898 tmp0 = MULTIPLY(d7, FIX_0_298631336);
899 tmp1 = MULTIPLY(d5, FIX_2_053119869);
900 tmp2 = MULTIPLY(d3, FIX_3_072711026);
901 tmp3 = MULTIPLY(d1, FIX_1_501321110);
902 z1 = MULTIPLY(-z1, FIX_0_899976223);
903 z2 = MULTIPLY(-z2, FIX_2_562915447);
904 z3 = MULTIPLY(-z3, FIX_1_961570560);
905 z4 = MULTIPLY(-z4, FIX_0_390180644);
915 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
919 z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
921 tmp0 = MULTIPLY(d7, FIX_0_298631336);
922 tmp1 = MULTIPLY(d5, FIX_2_053119869);
923 tmp2 = MULTIPLY(d3, FIX_3_072711026);
924 z1 = MULTIPLY(-d7, FIX_0_899976223);
925 z2 = MULTIPLY(-z2, FIX_2_562915447);
926 z3 = MULTIPLY(-z3, FIX_1_961570560);
927 z4 = MULTIPLY(-d5, FIX_0_390180644);
939 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
944 z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
946 tmp0 = MULTIPLY(d7, FIX_0_298631336);
947 tmp1 = MULTIPLY(d5, FIX_2_053119869);
948 tmp3 = MULTIPLY(d1, FIX_1_501321110);
949 z1 = MULTIPLY(-z1, FIX_0_899976223);
950 z2 = MULTIPLY(-d5, FIX_2_562915447);
951 z3 = MULTIPLY(-d7, FIX_1_961570560);
952 z4 = MULTIPLY(-z4, FIX_0_390180644);
962 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
963 tmp0 = MULTIPLY(-d7, FIX_0_601344887);
964 z1 = MULTIPLY(-d7, FIX_0_899976223);
965 z3 = MULTIPLY(-d7, FIX_1_961570560);
966 tmp1 = MULTIPLY(-d5, FIX_0_509795579);
967 z2 = MULTIPLY(-d5, FIX_2_562915447);
968 z4 = MULTIPLY(-d5, FIX_0_390180644);
969 z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
983 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
986 z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
988 tmp0 = MULTIPLY(d7, FIX_0_298631336);
989 tmp2 = MULTIPLY(d3, FIX_3_072711026);
990 tmp3 = MULTIPLY(d1, FIX_1_501321110);
991 z1 = MULTIPLY(-z1, FIX_0_899976223);
992 z2 = MULTIPLY(-d3, FIX_2_562915447);
993 z3 = MULTIPLY(-z3, FIX_1_961570560);
994 z4 = MULTIPLY(-d1, FIX_0_390180644);
1004 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
1007 tmp0 = MULTIPLY(-d7, FIX_0_601344887);
1008 z1 = MULTIPLY(-d7, FIX_0_899976223);
1009 tmp2 = MULTIPLY(d3, FIX_0_509795579);
1010 z2 = MULTIPLY(-d3, FIX_2_562915447);
1011 z5 = MULTIPLY(z3, FIX_1_175875602);
1012 z3 = MULTIPLY(-z3, FIX_0_785694958);
1021 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
1023 z5 = MULTIPLY(z1, FIX_1_175875602);
1025 z1 = MULTIPLY(z1, FIX_0_275899380);
1026 z3 = MULTIPLY(-d7, FIX_1_961570560);
1027 tmp0 = MULTIPLY(-d7, FIX_1_662939225);
1028 z4 = MULTIPLY(-d1, FIX_0_390180644);
1029 tmp3 = MULTIPLY(d1, FIX_1_111140466);
1036 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
1037 tmp0 = MULTIPLY(-d7, FIX_1_387039845);
1038 tmp1 = MULTIPLY(d7, FIX_1_175875602);
1039 tmp2 = MULTIPLY(-d7, FIX_0_785694958);
1040 tmp3 = MULTIPLY(d7, FIX_0_275899380);
1048 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
1051 z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
1053 tmp1 = MULTIPLY(d5, FIX_2_053119869);
1054 tmp2 = MULTIPLY(d3, FIX_3_072711026);
1055 tmp3 = MULTIPLY(d1, FIX_1_501321110);
1056 z1 = MULTIPLY(-d1, FIX_0_899976223);
1057 z2 = MULTIPLY(-z2, FIX_2_562915447);
1058 z3 = MULTIPLY(-d3, FIX_1_961570560);
1059 z4 = MULTIPLY(-z4, FIX_0_390180644);
1069 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
1072 z5 = MULTIPLY(z2, FIX_1_175875602);
1073 tmp1 = MULTIPLY(d5, FIX_1_662939225);
1074 z4 = MULTIPLY(-d5, FIX_0_390180644);
1075 z2 = MULTIPLY(-z2, FIX_1_387039845);
1076 tmp2 = MULTIPLY(d3, FIX_1_111140466);
1077 z3 = MULTIPLY(-d3, FIX_1_961570560);
1086 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
1089 z5 = MULTIPLY(z4, FIX_1_175875602);
1090 z1 = MULTIPLY(-d1, FIX_0_899976223);
1091 tmp3 = MULTIPLY(d1, FIX_0_601344887);
1092 tmp1 = MULTIPLY(-d5, FIX_0_509795579);
1093 z2 = MULTIPLY(-d5, FIX_2_562915447);
1094 z4 = MULTIPLY(z4, FIX_0_785694958);
1101 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
1102 tmp0 = MULTIPLY(d5, FIX_1_175875602);
1103 tmp1 = MULTIPLY(d5, FIX_0_275899380);
1104 tmp2 = MULTIPLY(-d5, FIX_1_387039845);
1105 tmp3 = MULTIPLY(d5, FIX_0_785694958);
1111 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
1113 tmp3 = MULTIPLY(d1, FIX_0_211164243);
1114 tmp2 = MULTIPLY(-d3, FIX_1_451774981);
1115 z1 = MULTIPLY(d1, FIX_1_061594337);
1116 z2 = MULTIPLY(-d3, FIX_2_172734803);
1117 z4 = MULTIPLY(z5, FIX_0_785694958);
1118 z5 = MULTIPLY(z5, FIX_1_175875602);
1125 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
1126 tmp0 = MULTIPLY(-d3, FIX_0_785694958);
1127 tmp1 = MULTIPLY(-d3, FIX_1_387039845);
1128 tmp2 = MULTIPLY(-d3, FIX_0_275899380);
1129 tmp3 = MULTIPLY(d3, FIX_1_175875602);
1133 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
1134 tmp0 = MULTIPLY(d1, FIX_0_275899380);
1135 tmp1 = MULTIPLY(d1, FIX_0_785694958);
1136 tmp2 = MULTIPLY(d1, FIX_1_175875602);
1137 tmp3 = MULTIPLY(d1, FIX_1_387039845);
1139 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
1140 tmp0 = tmp1 = tmp2 = tmp3 = 0;
1146 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1148 dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp3,
1149 CONST_BITS+PASS1_BITS+3);
1150 dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp10 - tmp3,
1151 CONST_BITS+PASS1_BITS+3);
1152 dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp11 + tmp2,
1153 CONST_BITS+PASS1_BITS+3);
1154 dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(tmp11 - tmp2,
1155 CONST_BITS+PASS1_BITS+3);
1156 dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp12 + tmp1,
1157 CONST_BITS+PASS1_BITS+3);
1158 dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12 - tmp1,
1159 CONST_BITS+PASS1_BITS+3);
1160 dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp13 + tmp0,
1161 CONST_BITS+PASS1_BITS+3);
1162 dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp13 - tmp0,
1163 CONST_BITS+PASS1_BITS+3);
1165 dataptr++; /* advance pointer to next column */