]> git.sesse.net Git - vlc/blob - src/video_decoder/vdec_idct.c
b0b1aaee0036b21f15395228ffa38bad1a560008
[vlc] / src / video_decoder / vdec_idct.c
1 /*****************************************************************************
2  * vdec_idct.c : IDCT functions
3  *****************************************************************************
4  * Copyright (C) 1999, 2000 VideoLAN
5  *
6  * Authors:
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public
19  * License along with this program; if not, write to the
20  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21  * Boston, MA 02111-1307, USA.
22  *****************************************************************************/
23
24 /*****************************************************************************
25  * Preamble
26  *****************************************************************************/
27 #include <sys/types.h>                        /* on BSD, uio.h needs types.h */
28 #include <sys/uio.h>                                          /* for input.h */
29
30 #include "config.h"
31 #include "common.h"
32 #include "mtime.h"
33 #include "threads.h"
34
35 #include "intf_msg.h"
36
37 #include "input.h"
38 #include "decoder_fifo.h"
39 #include "video.h"
40 #include "video_output.h"
41
42 #include "vdec_idct.h"
43 #include "video_decoder.h"
44 #include "vdec_motion.h"
45
46 #include "vpar_blocks.h"
47 #include "vpar_headers.h"
48 #include "vpar_synchro.h"
49 #include "video_parser.h"
50 #include "video_fifo.h"
51
52 /*
53  * Local prototypes
54  */
55
56 /* Our current implementation is a fast DCT, we might move to a fast DFT or
57  * an MMX DCT in the future. */
58
59 /*****************************************************************************
60  * vdec_InitIDCT : initialize datas for vdec_SparceIDCT
61  * vdec_SparseIDCT : IDCT function for sparse matrices
62  *****************************************************************************/
63
64 void vdec_InitIDCT (vdec_thread_t * p_vdec)
65 {
66     int i;
67
68     dctelem_t * p_pre = p_vdec->p_pre_idct;
69     memset( p_pre, 0, 64*64*sizeof(dctelem_t) );
70
71     for( i=0 ; i < 64 ; i++ )
72     {
73         p_pre[i*64+i] = 1 << SPARSE_SCALE_FACTOR;
74         vdec_IDCT( p_vdec, &p_pre[i*64], 0) ;
75     }
76     return;
77 }
78
79 void vdec_SparseIDCT (vdec_thread_t * p_vdec, dctelem_t * p_block,
80                       int i_sparse_pos)
81 {
82     short int val;
83     int * dp;
84     int v;
85     short int * p_dest;
86     short int * p_source;
87     int coeff, rr;
88
89     /* If DC Coefficient. */
90     if ( i_sparse_pos == 0 )
91     {
92         dp=(int *)p_block;
93         val=RIGHT_SHIFT((*p_block + 4), 3);
94         /* Compute int to assign.  This speeds things up a bit */
95         v = ((val & 0xffff) | (val << 16));
96         dp[0] = v;     dp[1] = v;     dp[2] = v;     dp[3] = v;
97         dp[4] = v;     dp[5] = v;     dp[6] = v;     dp[7] = v;
98         dp[8] = v;     dp[9] = v;     dp[10] = v;    dp[11] = v;
99         dp[12] = v;    dp[13] = v;    dp[14] = v;    dp[15] = v;
100         dp[16] = v;    dp[17] = v;    dp[18] = v;    dp[19] = v;
101         dp[20] = v;    dp[21] = v;    dp[22] = v;    dp[23] = v;
102         dp[24] = v;    dp[25] = v;    dp[26] = v;    dp[27] = v;
103         dp[28] = v;    dp[29] = v;    dp[30] = v;    dp[31] = v;
104         return;
105     }
106     /* Some other coefficient. */
107     p_dest = (s16*)p_block;
108     p_source = (s16*)&p_vdec->p_pre_idct[i_sparse_pos];
109     coeff = (int)p_dest[i_sparse_pos];
110     for( rr=0 ; rr < 4 ; rr++ )
111     {
112         p_dest[0] = (p_source[0] * coeff) >> SPARSE_SCALE_FACTOR;
113         p_dest[1] = (p_source[1] * coeff) >> SPARSE_SCALE_FACTOR;
114         p_dest[2] = (p_source[2] * coeff) >> SPARSE_SCALE_FACTOR;
115         p_dest[3] = (p_source[3] * coeff) >> SPARSE_SCALE_FACTOR;
116         p_dest[4] = (p_source[4] * coeff) >> SPARSE_SCALE_FACTOR;
117         p_dest[5] = (p_source[5] * coeff) >> SPARSE_SCALE_FACTOR;
118         p_dest[6] = (p_source[6] * coeff) >> SPARSE_SCALE_FACTOR;
119         p_dest[7] = (p_source[7] * coeff) >> SPARSE_SCALE_FACTOR;
120         p_dest[8] = (p_source[8] * coeff) >> SPARSE_SCALE_FACTOR;
121         p_dest[9] = (p_source[9] * coeff) >> SPARSE_SCALE_FACTOR;
122         p_dest[10] = (p_source[10] * coeff) >> SPARSE_SCALE_FACTOR;
123         p_dest[11] = (p_source[11] * coeff) >> SPARSE_SCALE_FACTOR;
124         p_dest[12] = (p_source[12] * coeff) >> SPARSE_SCALE_FACTOR;
125         p_dest[13] = (p_source[13] * coeff) >> SPARSE_SCALE_FACTOR;
126         p_dest[14] = (p_source[14] * coeff) >> SPARSE_SCALE_FACTOR;
127         p_dest[15] = (p_source[15] * coeff) >> SPARSE_SCALE_FACTOR;
128         p_dest += 16;
129         p_source += 16;
130     }
131     return;
132 }
133
134
135 /*****************************************************************************
136  * vdec_IDCT : IDCT function for normal matrices
137  *****************************************************************************/
138
139 #ifndef HAVE_MMX
140 void vdec_IDCT( vdec_thread_t * p_vdec, dctelem_t * p_block, int i_idontcare )
141 {
142 #if 0
143     /* dct classique: pour tester la meilleure entre la classique et la */
144     /* no classique */
145     s32 tmp0, tmp1, tmp2, tmp3;
146     s32 tmp10, tmp11, tmp12, tmp13;
147     s32 z1, z2, z3, z4, z5;
148     dctelem_t * dataptr;
149     int rowctr;
150     SHIFT_TEMPS
151
152   /* Pass 1: process rows. */
153   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
154   /* furthermore, we scale the results by 2**PASS1_BITS. */
155
156     dataptr = p_block;
157     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
158     {
159     /* Due to quantization, we will usually find that many of the input
160      * coefficients are zero, especially the AC terms.  We can exploit this
161      * by short-circuiting the IDCT calculation for any row in which all
162      * the AC terms are zero.  In that case each output is equal to the
163      * DC coefficient (with scale factor as needed).
164      * With typical images and quantization tables, half or more of the
165      * row DCT calculations can be simplified this way.
166      */
167
168         if ((dataptr[1] | dataptr[2] | dataptr[3] | dataptr[4] |
169                 dataptr[5] | dataptr[6] | dataptr[7]) == 0)
170         {
171       /* AC terms all zero */
172             dctelem_t dcval = (dctelem_t) (dataptr[0] << PASS1_BITS);
173
174             dataptr[0] = dcval;
175             dataptr[1] = dcval;
176             dataptr[2] = dcval;
177             dataptr[3] = dcval;
178             dataptr[4] = dcval;
179             dataptr[5] = dcval;
180             dataptr[6] = dcval;
181             dataptr[7] = dcval;
182
183             dataptr += DCTSIZE; /* advance pointer to next row */
184             continue;
185         }
186
187     /* Even part: reverse the even part of the forward DCT. */
188     /* The rotator is sqrt(2)*c(-6). */
189
190         z2 = (s32) dataptr[2];
191         z3 = (s32) dataptr[6];
192
193         z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
194         tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
195         tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
196
197         tmp0 = ((s32) dataptr[0] + (s32) dataptr[4]) << CONST_BITS;
198         tmp1 = ((s32) dataptr[0] - (s32) dataptr[4]) << CONST_BITS;
199
200         tmp10 = tmp0 + tmp3;
201         tmp13 = tmp0 - tmp3;
202         tmp11 = tmp1 + tmp2;
203         tmp12 = tmp1 - tmp2;
204
205     /* Odd part per figure 8; the matrix is unitary and hence its
206      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
207      */
208
209         tmp0 = (s32) dataptr[7];
210         tmp1 = (s32) dataptr[5];
211         tmp2 = (s32) dataptr[3];
212         tmp3 = (s32) dataptr[1];
213
214         z1 = tmp0 + tmp3;
215         z2 = tmp1 + tmp2;
216         z3 = tmp0 + tmp2;
217         z4 = tmp1 + tmp3;
218         z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
219
220         tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
221         tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
222         tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
223         tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
224         z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
225         z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
226         z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
227         z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
228
229         z3 += z5;
230         z4 += z5;
231
232         tmp0 += z1 + z3;
233         tmp1 += z2 + z4;
234         tmp2 += z2 + z3;
235         tmp3 += z1 + z4;
236
237     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
238
239         dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
240         dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
241         dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
242         dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
243         dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
244         dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
245         dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
246         dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
247
248         dataptr += DCTSIZE;             /* advance pointer to next row */
249     }
250
251   /* Pass 2: process columns. */
252   /* Note that we must descale the results by a factor of 8 == 2**3, */
253   /* and also undo the PASS1_BITS scaling. */
254
255     dataptr = p_block;
256     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
257     {
258     /* Columns of zeroes can be exploited in the same way as we did with rows.
259      * However, the row calculation has created many nonzero AC terms, so the
260      * simplification applies less often (typically 5% to 10% of the time).
261      * On machines with very fast multiplication, it's possible that the
262      * test takes more time than it's worth.  In that case this section
263      * may be commented out.
264      */
265
266 #ifndef NO_ZERO_COLUMN_TEST /*ajoute un test mais evite des calculs */
267         if ((dataptr[DCTSIZE*1] | dataptr[DCTSIZE*2] | dataptr[DCTSIZE*3] |
268             dataptr[DCTSIZE*4] | dataptr[DCTSIZE*5] | dataptr[DCTSIZE*6] |
269             dataptr[DCTSIZE*7]) == 0)
270         {
271       /* AC terms all zero */
272             dctelem_t dcval = (dctelem_t) DESCALE((s32) dataptr[0], PASS1_BITS+3);
273
274             dataptr[DCTSIZE*0] = dcval;
275             dataptr[DCTSIZE*1] = dcval;
276             dataptr[DCTSIZE*2] = dcval;
277             dataptr[DCTSIZE*3] = dcval;
278             dataptr[DCTSIZE*4] = dcval;
279             dataptr[DCTSIZE*5] = dcval;
280             dataptr[DCTSIZE*6] = dcval;
281             dataptr[DCTSIZE*7] = dcval;
282
283             dataptr++;          /* advance pointer to next column */
284             continue;
285         }
286 #endif
287
288     /* Even part: reverse the even part of the forward DCT. */
289     /* The rotator is sqrt(2)*c(-6). */
290
291         z2 = (s32) dataptr[DCTSIZE*2];
292         z3 = (s32) dataptr[DCTSIZE*6];
293
294         z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
295         tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
296         tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
297
298         tmp0 = ((s32) dataptr[DCTSIZE*0] + (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
299         tmp1 = ((s32) dataptr[DCTSIZE*0] - (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
300
301         tmp10 = tmp0 + tmp3;
302         tmp13 = tmp0 - tmp3;
303         tmp11 = tmp1 + tmp2;
304         tmp12 = tmp1 - tmp2;
305
306     /* Odd part per figure 8; the matrix is unitary and hence its
307      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
308      */
309
310         tmp0 = (s32) dataptr[DCTSIZE*7];
311         tmp1 = (s32) dataptr[DCTSIZE*5];
312         tmp2 = (s32) dataptr[DCTSIZE*3];
313         tmp3 = (s32) dataptr[DCTSIZE*1];
314
315         z1 = tmp0 + tmp3;
316         z2 = tmp1 + tmp2;
317         z3 = tmp0 + tmp2;
318         z4 = tmp1 + tmp3;
319         z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
320
321         tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
322         tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
323         tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
324         tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
325         z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
326         z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
327         z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
328         z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
329
330         z3 += z5;
331         z4 += z5;
332
333         tmp0 += z1 + z3;
334         tmp1 += z2 + z4;
335         tmp2 += z2 + z3;
336         tmp3 += z1 + z4;
337
338     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
339
340         dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
341                                            CONST_BITS+PASS1_BITS+3);
342         dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
343                                            CONST_BITS+PASS1_BITS+3);
344         dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
345                                            CONST_BITS+PASS1_BITS+3);
346         dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
347                                            CONST_BITS+PASS1_BITS+3);
348         dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
349                                            CONST_BITS+PASS1_BITS+3);
350         dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
351                                            CONST_BITS+PASS1_BITS+3);
352         dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
353                                            CONST_BITS+PASS1_BITS+3);
354         dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
355                                            CONST_BITS+PASS1_BITS+3);
356
357         dataptr++;                      /* advance pointer to next column */
358     }
359 #endif
360
361 #if 1  /*dct avec non classique*/
362
363     s32 tmp0, tmp1, tmp2, tmp3;
364     s32 tmp10, tmp11, tmp12, tmp13;
365     s32 z1, z2, z3, z4, z5;
366     s32 d0, d1, d2, d3, d4, d5, d6, d7;
367     dctelem_t * dataptr;
368     int rowctr;
369
370     SHIFT_TEMPS
371
372     /* Pass 1: process rows. */
373     /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
374     /* furthermore, we scale the results by 2**PASS1_BITS. */
375
376     dataptr = p_block;
377
378     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
379     {
380         /* Due to quantization, we will usually find that many of the input
381          * coefficients are zero, especially the AC terms.  We can exploit this
382          * by short-circuiting the IDCT calculation for any row in which all
383          * the AC terms are zero.  In that case each output is equal to the
384          * DC coefficient (with scale factor as needed).
385          * With typical images and quantization tables, half or more of the
386          * row DCT calculations can be simplified this way.
387          */
388
389         register int * idataptr = (int*)dataptr;
390         d0 = dataptr[0];
391         d1 = dataptr[1];
392         if ( (d1 == 0) && ((idataptr[1] | idataptr[2] | idataptr[3]) == 0) )
393         {
394       /* AC terms all zero */
395             if (d0)
396             {
397       /* Compute a 32 bit value to assign. */
398                 dctelem_t dcval = (dctelem_t) (d0 << PASS1_BITS);
399                 register int v = (dcval & 0xffff) | (dcval << 16);
400
401                 idataptr[0] = v;
402                 idataptr[1] = v;
403                 idataptr[2] = v;
404                 idataptr[3] = v;
405             }
406
407             dataptr += DCTSIZE; /* advance pointer to next row */
408             continue;
409         }
410         d2 = dataptr[2];
411         d3 = dataptr[3];
412         d4 = dataptr[4];
413         d5 = dataptr[5];
414         d6 = dataptr[6];
415         d7 = dataptr[7];
416
417     /* Even part: reverse the even part of the forward DCT. */
418     /* The rotator is sqrt(2)*c(-6). */
419         if (d6)
420         {
421             if (d4)
422             {
423                 if (d2)
424                 {
425                     if (d0)
426                     {
427             /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
428                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
429                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
430                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
431
432                         tmp0 = (d0 + d4) << CONST_BITS;
433                         tmp1 = (d0 - d4) << CONST_BITS;
434
435                         tmp10 = tmp0 + tmp3;
436                         tmp13 = tmp0 - tmp3;
437                         tmp11 = tmp1 + tmp2;
438                         tmp12 = tmp1 - tmp2;
439                     }
440                     else
441                     {
442                     /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
443                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
444                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
445                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
446
447                         tmp0 = d4 << CONST_BITS;
448
449                         tmp10 = tmp0 + tmp3;
450                         tmp13 = tmp0 - tmp3;
451                         tmp11 = tmp2 - tmp0;
452                         tmp12 = -(tmp0 + tmp2);
453                         }
454                 }
455                 else
456                 {
457                     if (d0)
458                     {
459             /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
460                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
461                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
462
463                         tmp0 = (d0 + d4) << CONST_BITS;
464                         tmp1 = (d0 - d4) << CONST_BITS;
465
466                         tmp10 = tmp0 + tmp3;
467                         tmp13 = tmp0 - tmp3;
468                         tmp11 = tmp1 + tmp2;
469                         tmp12 = tmp1 - tmp2;
470                         }
471                     else
472                     {
473                     /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
474                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
475                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
476
477                         tmp0 = d4 << CONST_BITS;
478
479                         tmp10 = tmp0 + tmp3;
480                         tmp13 = tmp0 - tmp3;
481                         tmp11 = tmp2 - tmp0;
482                         tmp12 = -(tmp0 + tmp2);
483                         }
484                 }
485             }
486             else
487             {
488                 if (d2)
489                 {
490                     if (d0)
491                     {
492             /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
493                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
494                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
495                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
496
497                         tmp0 = d0 << CONST_BITS;
498
499                         tmp10 = tmp0 + tmp3;
500                         tmp13 = tmp0 - tmp3;
501                         tmp11 = tmp0 + tmp2;
502                         tmp12 = tmp0 - tmp2;
503                     }
504                     else
505                     {
506                     /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
507                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
508                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
509                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
510
511                         tmp10 = tmp3;
512                         tmp13 = -tmp3;
513                         tmp11 = tmp2;
514                         tmp12 = -tmp2;
515                             }
516                 }
517                 else
518                 {
519                     if (d0)
520                     {
521             /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
522                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
523                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
524
525                         tmp0 = d0 << CONST_BITS;
526
527                         tmp10 = tmp0 + tmp3;
528                         tmp13 = tmp0 - tmp3;
529                         tmp11 = tmp0 + tmp2;
530                         tmp12 = tmp0 - tmp2;
531                     }
532                     else
533                     {
534             /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
535                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
536                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
537
538                         tmp10 = tmp3;
539                         tmp13 = -tmp3;
540                         tmp11 = tmp2;
541                         tmp12 = -tmp2;
542                     }
543                 }
544             }
545         }
546         else
547         {
548             if (d4)
549             {
550                 if (d2)
551                 {
552                     if (d0)
553                     {
554                     /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
555                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
556                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
557
558                         tmp0 = (d0 + d4) << CONST_BITS;
559                         tmp1 = (d0 - d4) << CONST_BITS;
560
561                         tmp10 = tmp0 + tmp3;
562                         tmp13 = tmp0 - tmp3;
563                         tmp11 = tmp1 + tmp2;
564                         tmp12 = tmp1 - tmp2;
565                     }
566                     else
567                     {
568             /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
569                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
570                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
571
572                         tmp0 = d4 << CONST_BITS;
573
574                         tmp10 = tmp0 + tmp3;
575                         tmp13 = tmp0 - tmp3;
576                         tmp11 = tmp2 - tmp0;
577                         tmp12 = -(tmp0 + tmp2);
578                     }
579                 }
580                 else
581                 {
582                     if (d0)
583                     {
584             /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
585                         tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
586                         tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
587                     }
588                     else
589                     {
590             /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
591                         tmp10 = tmp13 = d4 << CONST_BITS;
592                         tmp11 = tmp12 = -tmp10;
593                     }
594                 }
595             }
596             else
597             {
598                 if (d2)
599                 {
600                     if (d0)
601                     {
602             /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
603                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
604                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
605
606                         tmp0 = d0 << CONST_BITS;
607
608                         tmp10 = tmp0 + tmp3;
609                         tmp13 = tmp0 - tmp3;
610                         tmp11 = tmp0 + tmp2;
611                         tmp12 = tmp0 - tmp2;
612                     }
613                     else
614                     {
615             /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
616                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
617                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
618
619                         tmp10 = tmp3;
620                         tmp13 = -tmp3;
621                         tmp11 = tmp2;
622                         tmp12 = -tmp2;
623                     }
624                 }
625                 else
626                 {
627                     if (d0)
628                     {
629             /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
630                         tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
631                     }
632                     else
633                     {
634             /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
635                         tmp10 = tmp13 = tmp11 = tmp12 = 0;
636                     }
637                 }
638             }
639         }
640
641
642     /* Odd part per figure 8; the matrix is unitary and hence its
643      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
644      */
645
646         if (d7)
647             {
648             if (d5)
649             {
650                 if (d3)
651                 {
652                     if (d1)
653                     {
654             /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
655                         z1 = d7 + d1;
656                         z2 = d5 + d3;
657                         z3 = d7 + d3;
658                         z4 = d5 + d1;
659                         z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
660
661                         tmp0 = MULTIPLY(d7, FIX(0.298631336));
662                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
663                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
664                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
665                         z1 = MULTIPLY(z1, - FIX(0.899976223));
666                         z2 = MULTIPLY(z2, - FIX(2.562915447));
667                         z3 = MULTIPLY(z3, - FIX(1.961570560));
668                         z4 = MULTIPLY(z4, - FIX(0.390180644));
669
670                         z3 += z5;
671                         z4 += z5;
672
673                         tmp0 += z1 + z3;
674                         tmp1 += z2 + z4;
675                         tmp2 += z2 + z3;
676                         tmp3 += z1 + z4;
677                     }
678                     else
679                     {
680             /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
681                         z2 = d5 + d3;
682                         z3 = d7 + d3;
683                         z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
684
685                         tmp0 = MULTIPLY(d7, FIX(0.298631336));
686                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
687                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
688                         z1 = MULTIPLY(d7, - FIX(0.899976223));
689                         z2 = MULTIPLY(z2, - FIX(2.562915447));
690                         z3 = MULTIPLY(z3, - FIX(1.961570560));
691                         z4 = MULTIPLY(d5, - FIX(0.390180644));
692
693                         z3 += z5;
694                         z4 += z5;
695
696                         tmp0 += z1 + z3;
697                         tmp1 += z2 + z4;
698                         tmp2 += z2 + z3;
699                         tmp3 = z1 + z4;
700                         }
701                     }
702                 else
703                 {
704                     if (d1)
705                     {
706             /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
707                         z1 = d7 + d1;
708                         z4 = d5 + d1;
709                         z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
710
711                         tmp0 = MULTIPLY(d7, FIX(0.298631336));
712                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
713                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
714                         z1 = MULTIPLY(z1, - FIX(0.899976223));
715                         z2 = MULTIPLY(d5, - FIX(2.562915447));
716                         z3 = MULTIPLY(d7, - FIX(1.961570560));
717                         z4 = MULTIPLY(z4, - FIX(0.390180644));
718
719                         z3 += z5;
720                         z4 += z5;
721
722                         tmp0 += z1 + z3;
723                         tmp1 += z2 + z4;
724                         tmp2 = z2 + z3;
725                         tmp3 += z1 + z4;
726                     }
727                     else
728                     {
729             /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
730                         z5 = MULTIPLY(d7 + d5, FIX(1.175875602));
731
732                         tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
733                         tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
734                         z1 = MULTIPLY(d7, - FIX(0.899976223));
735                         z3 = MULTIPLY(d7, - FIX(1.961570560));
736                         z2 = MULTIPLY(d5, - FIX(2.562915447));
737                         z4 = MULTIPLY(d5, - FIX(0.390180644));
738
739                         z3 += z5;
740                         z4 += z5;
741
742                         tmp0 += z3;
743                         tmp1 += z4;
744                         tmp2 = z2 + z3;
745                         tmp3 = z1 + z4;
746                     }
747                 }
748             }
749             else
750             {
751                 if (d3)
752                 {
753                     if (d1)
754                     {
755             /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
756                         z1 = d7 + d1;
757                         z3 = d7 + d3;
758                         z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
759
760                         tmp0 = MULTIPLY(d7, FIX(0.298631336));
761                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
762                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
763                         z1 = MULTIPLY(z1, - FIX(0.899976223));
764                         z2 = MULTIPLY(d3, - FIX(2.562915447));
765                         z3 = MULTIPLY(z3, - FIX(1.961570560));
766                         z4 = MULTIPLY(d1, - FIX(0.390180644));
767
768                         z3 += z5;
769                         z4 += z5;
770
771                         tmp0 += z1 + z3;
772                         tmp1 = z2 + z4;
773                         tmp2 += z2 + z3;
774                         tmp3 += z1 + z4;
775                     }
776                     else
777                     {
778             /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
779                         z3 = d7 + d3;
780                         z5 = MULTIPLY(z3, FIX(1.175875602));
781
782                         tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
783                         tmp2 = MULTIPLY(d3, FIX(0.509795579));
784                         z1 = MULTIPLY(d7, - FIX(0.899976223));
785                         z2 = MULTIPLY(d3, - FIX(2.562915447));
786                         z3 = MULTIPLY(z3, - FIX2(0.785694958));
787
788                         tmp0 += z3;
789                         tmp1 = z2 + z5;
790                         tmp2 += z3;
791                         tmp3 = z1 + z5;
792                     }
793                 }
794                 else
795                 {
796                     if (d1)
797                     {
798             /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
799                         z1 = d7 + d1;
800                         z5 = MULTIPLY(z1, FIX(1.175875602));
801
802                         tmp0 = MULTIPLY(d7, - FIX2(1.662939224));
803                         tmp3 = MULTIPLY(d1, FIX2(1.111140466));
804                         z1 = MULTIPLY(z1, FIX2(0.275899379));
805                         z3 = MULTIPLY(d7, - FIX(1.961570560));
806                         z4 = MULTIPLY(d1, - FIX(0.390180644));
807
808                         tmp0 += z1;
809                         tmp1 = z4 + z5;
810                         tmp2 = z3 + z5;
811                         tmp3 += z1;
812                     }
813                 else
814                     {
815             /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
816                         tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
817                         tmp1 = MULTIPLY(d7, FIX(1.175875602));
818                         tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
819                         tmp3 = MULTIPLY(d7, FIX2(0.275899379));
820                     }
821                 }
822             }
823         }
824         else
825         {
826             if (d5)
827             {
828                 if (d3)
829                 {
830                     if (d1)
831                     {
832             /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
833                         z2 = d5 + d3;
834                         z4 = d5 + d1;
835                         z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
836
837                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
838                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
839                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
840                         z1 = MULTIPLY(d1, - FIX(0.899976223));
841                         z2 = MULTIPLY(z2, - FIX(2.562915447));
842                         z3 = MULTIPLY(d3, - FIX(1.961570560));
843                         z4 = MULTIPLY(z4, - FIX(0.390180644));
844
845                         z3 += z5;
846                         z4 += z5;
847
848                         tmp0 = z1 + z3;
849                         tmp1 += z2 + z4;
850                         tmp2 += z2 + z3;
851                         tmp3 += z1 + z4;
852                     }
853                     else
854                     {
855             /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
856                         z2 = d5 + d3;
857                         z5 = MULTIPLY(z2, FIX(1.175875602));
858
859                         tmp1 = MULTIPLY(d5, FIX2(1.662939225));
860                         tmp2 = MULTIPLY(d3, FIX2(1.111140466));
861                         z2 = MULTIPLY(z2, - FIX2(1.387039845));
862                         z3 = MULTIPLY(d3, - FIX(1.961570560));
863                         z4 = MULTIPLY(d5, - FIX(0.390180644));
864
865                         tmp0 = z3 + z5;
866                         tmp1 += z2;
867                         tmp2 += z2;
868                         tmp3 = z4 + z5;
869                     }
870                 }
871                 else
872                 {
873                     if (d1)
874                     {
875             /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
876                         z4 = d5 + d1;
877                         z5 = MULTIPLY(z4, FIX(1.175875602));
878
879                         tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
880                         tmp3 = MULTIPLY(d1, FIX2(0.601344887));
881                         z1 = MULTIPLY(d1, - FIX(0.899976223));
882                         z2 = MULTIPLY(d5, - FIX(2.562915447));
883                         z4 = MULTIPLY(z4, FIX2(0.785694958));
884
885                         tmp0 = z1 + z5;
886                         tmp1 += z4;
887                         tmp2 = z2 + z5;
888                         tmp3 += z4;
889                     }
890                     else
891                     {
892             /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
893                         tmp0 = MULTIPLY(d5, FIX(1.175875602));
894                         tmp1 = MULTIPLY(d5, FIX2(0.275899380));
895                         tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
896                         tmp3 = MULTIPLY(d5, FIX2(0.785694958));
897                     }
898                 }
899             }
900             else
901             {
902                 if (d3)
903                 {
904                     if (d1)
905                     {
906             /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
907                         z5 = d3 + d1;
908
909                         tmp2 = MULTIPLY(d3, - FIX(1.451774981));
910                         tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
911                         z1 = MULTIPLY(d1, FIX(1.061594337));
912                         z2 = MULTIPLY(d3, - FIX(2.172734803));
913                         z4 = MULTIPLY(z5, FIX(0.785694958));
914                         z5 = MULTIPLY(z5, FIX(1.175875602));
915
916                         tmp0 = z1 - z4;
917                         tmp1 = z2 + z4;
918                         tmp2 += z5;
919                         tmp3 += z5;
920                     }
921                     else
922                     {
923             /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
924                         tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
925                         tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
926                         tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
927                         tmp3 = MULTIPLY(d3, FIX(1.175875602));
928                     }
929                 }
930                 else
931                 {
932                     if (d1)
933                     {
934             /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
935                         tmp0 = MULTIPLY(d1, FIX2(0.275899379));
936                         tmp1 = MULTIPLY(d1, FIX2(0.785694958));
937                         tmp2 = MULTIPLY(d1, FIX(1.175875602));
938                         tmp3 = MULTIPLY(d1, FIX2(1.387039845));
939                     }
940                     else
941                     {
942             /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
943                         tmp0 = tmp1 = tmp2 = tmp3 = 0;
944                     }
945                 }
946             }
947         }
948
949     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
950
951         dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
952         dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
953         dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
954         dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
955         dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
956         dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
957         dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
958         dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
959
960         dataptr += DCTSIZE;              /* advance pointer to next row */
961     }
962
963   /* Pass 2: process columns. */
964   /* Note that we must descale the results by a factor of 8 == 2**3, */
965   /* and also undo the PASS1_BITS scaling. */
966
967     dataptr = p_block;
968     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
969     {
970     /* Columns of zeroes can be exploited in the same way as we did with rows.
971      * However, the row calculation has created many nonzero AC terms, so the
972      * simplification applies less often (typically 5% to 10% of the time).
973      * On machines with very fast multiplication, it's possible that the
974      * test takes more time than it's worth.  In that case this section
975      * may be commented out.
976      */
977
978         d0 = dataptr[DCTSIZE*0];
979         d1 = dataptr[DCTSIZE*1];
980         d2 = dataptr[DCTSIZE*2];
981         d3 = dataptr[DCTSIZE*3];
982         d4 = dataptr[DCTSIZE*4];
983         d5 = dataptr[DCTSIZE*5];
984         d6 = dataptr[DCTSIZE*6];
985         d7 = dataptr[DCTSIZE*7];
986
987     /* Even part: reverse the even part of the forward DCT. */
988     /* The rotator is sqrt(2)*c(-6). */
989         if (d6)
990         {
991             if (d4)
992             {
993                 if (d2)
994                 {
995                     if (d0)
996                     {
997             /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
998                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
999                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1000                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1001
1002                         tmp0 = (d0 + d4) << CONST_BITS;
1003                         tmp1 = (d0 - d4) << CONST_BITS;
1004
1005                         tmp10 = tmp0 + tmp3;
1006                         tmp13 = tmp0 - tmp3;
1007                         tmp11 = tmp1 + tmp2;
1008                         tmp12 = tmp1 - tmp2;
1009                     }
1010                     else
1011                     {
1012             /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
1013                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1014                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1015                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1016
1017                         tmp0 = d4 << CONST_BITS;
1018
1019                         tmp10 = tmp0 + tmp3;
1020                         tmp13 = tmp0 - tmp3;
1021                         tmp11 = tmp2 - tmp0;
1022                         tmp12 = -(tmp0 + tmp2);
1023                     }
1024                 }
1025                 else
1026                 {
1027                     if (d0)
1028                     {
1029             /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1030                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1031                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
1032
1033                         tmp0 = (d0 + d4) << CONST_BITS;
1034                         tmp1 = (d0 - d4) << CONST_BITS;
1035
1036                         tmp10 = tmp0 + tmp3;
1037                         tmp13 = tmp0 - tmp3;
1038                         tmp11 = tmp1 + tmp2;
1039                         tmp12 = tmp1 - tmp2;
1040                     }
1041                     else
1042                     {
1043             /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
1044                         tmp2 = MULTIPLY(d6, -FIX2(1.306562965));
1045                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
1046
1047                         tmp0 = d4 << CONST_BITS;
1048
1049                         tmp10 = tmp0 + tmp3;
1050                         tmp13 = tmp0 - tmp3;
1051                         tmp11 = tmp2 - tmp0;
1052                         tmp12 = -(tmp0 + tmp2);
1053                     }
1054                 }
1055             }
1056             else
1057             {
1058                 if (d2)
1059                 {
1060                     if (d0)
1061                     {
1062             /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
1063                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1064                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1065                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1066
1067                         tmp0 = d0 << CONST_BITS;
1068
1069                         tmp10 = tmp0 + tmp3;
1070                         tmp13 = tmp0 - tmp3;
1071                         tmp11 = tmp0 + tmp2;
1072                         tmp12 = tmp0 - tmp2;
1073                     }
1074                     else
1075                     {
1076             /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
1077                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1078                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1079                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1080
1081                         tmp10 = tmp3;
1082                         tmp13 = -tmp3;
1083                         tmp11 = tmp2;
1084                         tmp12 = -tmp2;
1085                     }
1086                 }
1087                 else
1088                 {
1089                     if (d0)
1090                     {
1091             /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
1092                     tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1093                     tmp3 = MULTIPLY(d6, FIX(0.541196100));
1094
1095                     tmp0 = d0 << CONST_BITS;
1096
1097                     tmp10 = tmp0 + tmp3;
1098                     tmp13 = tmp0 - tmp3;
1099                     tmp11 = tmp0 + tmp2;
1100                     tmp12 = tmp0 - tmp2;
1101                 }
1102                 else
1103                 {
1104             /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
1105                     tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1106                     tmp3 = MULTIPLY(d6, FIX(0.541196100));
1107                     tmp10 = tmp3;
1108                     tmp13 = -tmp3;
1109                     tmp11 = tmp2;
1110                     tmp12 = -tmp2;
1111                 }
1112             }
1113         }
1114     }
1115     else
1116     {
1117         if (d4)
1118         {
1119             if (d2)
1120             {
1121                 if (d0)
1122                 {
1123             /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1124                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1125                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1126
1127                     tmp0 = (d0 + d4) << CONST_BITS;
1128                     tmp1 = (d0 - d4) << CONST_BITS;
1129
1130                     tmp10 = tmp0 + tmp3;
1131                     tmp13 = tmp0 - tmp3;
1132                     tmp11 = tmp1 + tmp2;
1133                     tmp12 = tmp1 - tmp2;
1134                 }
1135                 else
1136                 {
1137             /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
1138                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1139                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1140
1141                     tmp0 = d4 << CONST_BITS;
1142
1143                     tmp10 = tmp0 + tmp3;
1144                     tmp13 = tmp0 - tmp3;
1145                     tmp11 = tmp2 - tmp0;
1146                     tmp12 = -(tmp0 + tmp2);
1147                 }
1148             }
1149             else
1150             {
1151                 if (d0)
1152                 {
1153             /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1154                     tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1155                     tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1156                 }
1157                 else
1158                 {
1159             /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
1160                     tmp10 = tmp13 = d4 << CONST_BITS;
1161                     tmp11 = tmp12 = -tmp10;
1162                 }
1163             }
1164         }
1165         else
1166         {
1167         if (d2)
1168         {
1169             if (d0)
1170             {
1171             /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
1172                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1173                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1174
1175                     tmp0 = d0 << CONST_BITS;
1176
1177                     tmp10 = tmp0 + tmp3;
1178                     tmp13 = tmp0 - tmp3;
1179                     tmp11 = tmp0 + tmp2;
1180                     tmp12 = tmp0 - tmp2;
1181             }
1182             else
1183             {
1184             /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
1185                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1186                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1187
1188                     tmp10 = tmp3;
1189                     tmp13 = -tmp3;
1190                     tmp11 = tmp2;
1191                     tmp12 = -tmp2;
1192             }
1193         }
1194         else
1195         {
1196             if (d0)
1197                 {
1198             /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
1199                     tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
1200                 }
1201                 else
1202                 {
1203             /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
1204                     tmp10 = tmp13 = tmp11 = tmp12 = 0;
1205                 }
1206             }
1207         }
1208     }
1209
1210     /* Odd part per figure 8; the matrix is unitary and hence its
1211      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
1212      */
1213     if (d7)
1214     {
1215         if (d5)
1216         {
1217             if (d3)
1218             {
1219                 if (d1)
1220                 {
1221             /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
1222                     z1 = d7 + d1;
1223                     z2 = d5 + d3;
1224                     z3 = d7 + d3;
1225                     z4 = d5 + d1;
1226                     z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
1227
1228                     tmp0 = MULTIPLY(d7, FIX(0.298631336));
1229                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1230                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1231                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1232                     z1 = MULTIPLY(z1, - FIX(0.899976223));
1233                     z2 = MULTIPLY(z2, - FIX(2.562915447));
1234                     z3 = MULTIPLY(z3, - FIX(1.961570560));
1235                     z4 = MULTIPLY(z4, - FIX(0.390180644));
1236
1237                     z3 += z5;
1238                     z4 += z5;
1239
1240                     tmp0 += z1 + z3;
1241                     tmp1 += z2 + z4;
1242                     tmp2 += z2 + z3;
1243                     tmp3 += z1 + z4;
1244                 }
1245                 else
1246                 {
1247             /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
1248                     z2 = d5 + d3;
1249                     z3 = d7 + d3;
1250                     z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
1251
1252                     tmp0 = MULTIPLY(d7, FIX(0.298631336));
1253                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1254                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1255                     z1 = MULTIPLY(d7, - FIX(0.899976223));
1256                     z2 = MULTIPLY(z2, - FIX(2.562915447));
1257                     z3 = MULTIPLY(z3, - FIX(1.961570560));
1258                     z4 = MULTIPLY(d5, - FIX(0.390180644));
1259
1260                     z3 += z5;
1261                     z4 += z5;
1262
1263                     tmp0 += z1 + z3;
1264                     tmp1 += z2 + z4;
1265                     tmp2 += z2 + z3;
1266                     tmp3 = z1 + z4;
1267                 }
1268             }
1269             else
1270             {
1271                 if (d1)
1272                 {
1273             /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
1274                     z1 = d7 + d1;
1275                     z4 = d5 + d1;
1276                     z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
1277
1278                     tmp0 = MULTIPLY(d7, FIX(0.298631336));
1279                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1280                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1281                     z1 = MULTIPLY(z1, - FIX(0.899976223));
1282                     z2 = MULTIPLY(d5, - FIX(2.562915447));
1283                     z3 = MULTIPLY(d7, - FIX(1.961570560));
1284                     z4 = MULTIPLY(z4, - FIX(0.390180644));
1285
1286                     z3 += z5;
1287                     z4 += z5;
1288
1289                     tmp0 += z1 + z3;
1290                     tmp1 += z2 + z4;
1291                     tmp2 = z2 + z3;
1292                     tmp3 += z1 + z4;
1293                 }
1294                 else
1295                 {
1296             /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
1297                     z5 = MULTIPLY(d5 + d7, FIX(1.175875602));
1298
1299                     tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
1300                     tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1301                     z1 = MULTIPLY(d7, - FIX(0.899976223));
1302                     z3 = MULTIPLY(d7, - FIX(1.961570560));
1303                     z2 = MULTIPLY(d5, - FIX(2.562915447));
1304                     z4 = MULTIPLY(d5, - FIX(0.390180644));
1305
1306                     z3 += z5;
1307                     z4 += z5;
1308
1309                     tmp0 += z3;
1310                     tmp1 += z4;
1311                     tmp2 = z2 + z3;
1312                     tmp3 = z1 + z4;
1313                 }
1314             }
1315         }
1316         else
1317         {
1318             if (d3)
1319             {
1320                 if (d1)
1321                 {
1322             /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
1323                     z1 = d7 + d1;
1324                     z3 = d7 + d3;
1325                     z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
1326
1327                     tmp0 = MULTIPLY(d7, FIX(0.298631336));
1328                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1329                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1330                     z1 = MULTIPLY(z1, - FIX(0.899976223));
1331                     z2 = MULTIPLY(d3, - FIX(2.562915447));
1332                     z3 = MULTIPLY(z3, - FIX(1.961570560));
1333                     z4 = MULTIPLY(d1, - FIX(0.390180644));
1334
1335                     z3 += z5;
1336                     z4 += z5;
1337
1338                     tmp0 += z1 + z3;
1339                     tmp1 = z2 + z4;
1340                     tmp2 += z2 + z3;
1341                     tmp3 += z1 + z4;
1342                 }
1343                 else
1344                 {
1345             /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
1346                     z3 = d7 + d3;
1347                     z5 = MULTIPLY(z3, FIX(1.175875602));
1348
1349                     tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
1350                     z1 = MULTIPLY(d7, - FIX(0.899976223));
1351                     tmp2 = MULTIPLY(d3, FIX(0.509795579));
1352                     z2 = MULTIPLY(d3, - FIX(2.562915447));
1353                     z3 = MULTIPLY(z3, - FIX2(0.785694958));
1354
1355                     tmp0 += z3;
1356                     tmp1 = z2 + z5;
1357                     tmp2 += z3;
1358                     tmp3 = z1 + z5;
1359                 }
1360             }
1361             else
1362             {
1363                 if (d1)
1364                 {
1365             /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
1366                     z1 = d7 + d1;
1367                     z5 = MULTIPLY(z1, FIX(1.175875602));
1368
1369                     tmp0 = MULTIPLY(d7, - FIX2(1.662939224));
1370                     tmp3 = MULTIPLY(d1, FIX2(1.111140466));
1371                     z1 = MULTIPLY(z1, FIX2(0.275899379));
1372                     z3 = MULTIPLY(d7, - FIX(1.961570560));
1373                     z4 = MULTIPLY(d1, - FIX(0.390180644));
1374
1375                     tmp0 += z1;
1376                     tmp1 = z4 + z5;
1377                     tmp2 = z3 + z5;
1378                     tmp3 += z1;
1379                 }
1380                 else
1381                 {
1382             /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
1383                     tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
1384                     tmp1 = MULTIPLY(d7, FIX(1.175875602));
1385                     tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
1386                     tmp3 = MULTIPLY(d7, FIX2(0.275899379));
1387                 }
1388             }
1389         }
1390     }
1391     else
1392     {
1393         if (d5)
1394         {
1395             if (d3)
1396             {
1397                 if (d1)
1398                 {
1399             /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
1400                     z2 = d5 + d3;
1401                     z4 = d5 + d1;
1402                     z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
1403
1404                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1405                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1406                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1407                     z1 = MULTIPLY(d1, - FIX(0.899976223));
1408                     z2 = MULTIPLY(z2, - FIX(2.562915447));
1409                     z3 = MULTIPLY(d3, - FIX(1.961570560));
1410                     z4 = MULTIPLY(z4, - FIX(0.390180644));
1411
1412                     z3 += z5;
1413                     z4 += z5;
1414
1415                     tmp0 = z1 + z3;
1416                     tmp1 += z2 + z4;
1417                     tmp2 += z2 + z3;
1418                     tmp3 += z1 + z4;
1419                 }
1420                 else
1421                 {
1422             /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
1423                     z2 = d5 + d3;
1424                     z5 = MULTIPLY(z2, FIX(1.175875602));
1425
1426                     tmp1 = MULTIPLY(d5, FIX2(1.662939225));
1427                     tmp2 = MULTIPLY(d3, FIX2(1.111140466));
1428                     z2 = MULTIPLY(z2, - FIX2(1.387039845));
1429                     z3 = MULTIPLY(d3, - FIX(1.961570560));
1430                     z4 = MULTIPLY(d5, - FIX(0.390180644));
1431
1432                     tmp0 = z3 + z5;
1433                     tmp1 += z2;
1434                     tmp2 += z2;
1435                     tmp3 = z4 + z5;
1436                 }
1437             }
1438             else
1439             {
1440                 if (d1)
1441                 {
1442             /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
1443                     z4 = d5 + d1;
1444                     z5 = MULTIPLY(z4, FIX(1.175875602));
1445
1446                     tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1447                     tmp3 = MULTIPLY(d1, FIX2(0.601344887));
1448                     z1 = MULTIPLY(d1, - FIX(0.899976223));
1449                     z2 = MULTIPLY(d5, - FIX(2.562915447));
1450                     z4 = MULTIPLY(z4, FIX2(0.785694958));
1451
1452                     tmp0 = z1 + z5;
1453                     tmp1 += z4;
1454                     tmp2 = z2 + z5;
1455                     tmp3 += z4;
1456                 }
1457                 else
1458                 {
1459             /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
1460                     tmp0 = MULTIPLY(d5, FIX(1.175875602));
1461                     tmp1 = MULTIPLY(d5, FIX2(0.275899380));
1462                     tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
1463                     tmp3 = MULTIPLY(d5, FIX2(0.785694958));
1464                 }
1465             }
1466         }
1467         else
1468         {
1469             if (d3)
1470             {
1471                 if (d1)
1472                 {
1473             /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
1474                     z5 = d3 + d1;
1475
1476                     tmp2 = MULTIPLY(d3, - FIX(1.451774981));
1477                     tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
1478                     z1 = MULTIPLY(d1, FIX(1.061594337));
1479                     z2 = MULTIPLY(d3, - FIX(2.172734803));
1480                     z4 = MULTIPLY(z5, FIX(0.785694958));
1481                     z5 = MULTIPLY(z5, FIX(1.175875602));
1482
1483                     tmp0 = z1 - z4;
1484                     tmp1 = z2 + z4;
1485                     tmp2 += z5;
1486                     tmp3 += z5;
1487                 }
1488                 else
1489                 {
1490             /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
1491                     tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
1492                     tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
1493                     tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
1494                     tmp3 = MULTIPLY(d3, FIX(1.175875602));
1495                 }
1496             }
1497             else
1498             {
1499                 if (d1)
1500                 {
1501             /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
1502                     tmp0 = MULTIPLY(d1, FIX2(0.275899379));
1503                     tmp1 = MULTIPLY(d1, FIX2(0.785694958));
1504                     tmp2 = MULTIPLY(d1, FIX(1.175875602));
1505                     tmp3 = MULTIPLY(d1, FIX2(1.387039845));
1506                 }
1507                 else
1508                 {
1509             /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
1510                     tmp0 = tmp1 = tmp2 = tmp3 = 0;
1511                 }
1512             }
1513         }
1514     }
1515
1516     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1517
1518     dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
1519                        CONST_BITS+PASS1_BITS+3);
1520     dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
1521                        CONST_BITS+PASS1_BITS+3);
1522     dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
1523                        CONST_BITS+PASS1_BITS+3);
1524     dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
1525                        CONST_BITS+PASS1_BITS+3);
1526     dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
1527                        CONST_BITS+PASS1_BITS+3);
1528     dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
1529                        CONST_BITS+PASS1_BITS+3);
1530     dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
1531                        CONST_BITS+PASS1_BITS+3);
1532     dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
1533                        CONST_BITS+PASS1_BITS+3);
1534
1535     dataptr++;             /* advance pointer to next column */
1536     }
1537 #endif
1538 }
1539 #endif