]> git.sesse.net Git - vlc/blob - src/video_decoder/vdec_idct.c
68d4c8b08edf880b7db1568afa126d91b9ba6eaf
[vlc] / src / video_decoder / vdec_idct.c
1 /*****************************************************************************
2  * vdec_idct.c : IDCT functions
3  * (c)1999 VideoLAN
4  *****************************************************************************/
5
6 /*****************************************************************************
7  * Preamble
8  *****************************************************************************/
9
10 #include <errno.h>
11 #include <stdlib.h>
12 #include <stdio.h>
13 #include <unistd.h>
14 #include <string.h>
15 #include <sys/uio.h>
16
17 #include "config.h"
18 #include "common.h"
19 #include "mtime.h"
20 #include "vlc_thread.h"
21
22 #include "intf_msg.h"
23 #include "debug.h"                    /* ?? temporaire, requis par netlist.h */
24
25 #include "input.h"
26 #include "input_netlist.h"
27 #include "decoder_fifo.h"
28 #include "video.h"
29 #include "video_output.h"
30
31 #include "vdec_idct.h"
32 #include "video_decoder.h"
33 #include "vdec_motion.h"
34
35 #include "vpar_blocks.h"
36 #include "vpar_headers.h"
37 #include "vpar_synchro.h"
38 #include "video_parser.h"
39 #include "video_fifo.h"
40
41 /*
42  * Local prototypes
43  */
44
45 /* Our current implementation is a fast DCT, we might move to a fast DFT or
46  * an MMX DCT in the future. */
47
48 /*****************************************************************************
49  * vdec_DummyIDCT : dummy function that does nothing
50  *****************************************************************************/
51 void vdec_DummyIDCT( vdec_thread_t * p_vdec, dctelem_t * p_block, 
52                      int i_idontcare )
53 {
54 }
55
56 /*****************************************************************************
57  * init_SparseIDCT : initialize datas for vdec_SparceIDCT
58  * vdec_SparseIDCT : IDCT function for sparse matrices
59  *****************************************************************************/
60
61 void vdec_InitIDCT (vdec_thread_t * p_vdec) 
62 {         
63     int i;
64     
65     dctelem_t * p_pre = p_vdec->p_pre_idct;
66     memset( p_pre, 0, 64*64*sizeof(dctelem_t) );
67     
68     for( i=0 ; i < 64 ; i++ ) 
69     {
70         p_pre[i*64+i] = 1 << SPARSE_SCALE_FACTOR;
71         vdec_IDCT( p_vdec, &p_pre[i*64], 0) ;
72     }
73     return;
74 }
75
76 void vdec_SparseIDCT (vdec_thread_t * p_vdec, dctelem_t * p_block, 
77                       int i_sparse_pos)
78 {
79     short int val;
80     int * dp;
81     int v;
82     short int * p_dest;
83     short int * p_source;
84     int coeff, rr;
85
86     /* If DC Coefficient. */
87     if ( i_sparse_pos == 0 ) 
88     {
89             dp=(int *)p_block;
90         val=RIGHT_SHIFT((*p_block + 4), 3);
91         /* Compute int to assign.  This speeds things up a bit */
92         v = ((val & 0xffff) | (val << 16));
93         dp[0] = v;     dp[1] = v;     dp[2] = v;     dp[3] = v;
94         dp[4] = v;     dp[5] = v;     dp[6] = v;     dp[7] = v;
95         dp[8] = v;     dp[9] = v;     dp[10] = v;    dp[11] = v;
96         dp[12] = v;    dp[13] = v;    dp[14] = v;    dp[15] = v;
97         dp[16] = v;    dp[17] = v;    dp[18] = v;    dp[19] = v;
98         dp[20] = v;    dp[21] = v;    dp[22] = v;    dp[23] = v;
99         dp[24] = v;    dp[25] = v;    dp[26] = v;    dp[27] = v;
100         dp[28] = v;    dp[29] = v;    dp[30] = v;    dp[31] = v;
101         return;
102     }
103     /* Some other coefficient. */
104     p_dest = (s16*)p_block;
105     p_source = (s16*)&p_vdec->p_pre_idct[i_sparse_pos];
106     coeff = (int)p_dest[i_sparse_pos];
107     for( rr=0 ; rr < 4 ; rr++ )
108     {
109         p_dest[0] = (p_source[0] * coeff) >> SPARSE_SCALE_FACTOR;
110         p_dest[1] = (p_source[1] * coeff) >> SPARSE_SCALE_FACTOR;
111         p_dest[2] = (p_source[2] * coeff) >> SPARSE_SCALE_FACTOR;
112         p_dest[3] = (p_source[3] * coeff) >> SPARSE_SCALE_FACTOR;
113         p_dest[4] = (p_source[4] * coeff) >> SPARSE_SCALE_FACTOR;
114         p_dest[5] = (p_source[5] * coeff) >> SPARSE_SCALE_FACTOR;
115         p_dest[6] = (p_source[6] * coeff) >> SPARSE_SCALE_FACTOR;
116         p_dest[7] = (p_source[7] * coeff) >> SPARSE_SCALE_FACTOR;
117         p_dest[8] = (p_source[8] * coeff) >> SPARSE_SCALE_FACTOR;
118         p_dest[9] = (p_source[9] * coeff) >> SPARSE_SCALE_FACTOR;
119         p_dest[10] = (p_source[10] * coeff) >> SPARSE_SCALE_FACTOR;
120         p_dest[11] = (p_source[11] * coeff) >> SPARSE_SCALE_FACTOR;
121         p_dest[12] = (p_source[12] * coeff) >> SPARSE_SCALE_FACTOR;
122         p_dest[13] = (p_source[13] * coeff) >> SPARSE_SCALE_FACTOR;
123         p_dest[14] = (p_source[14] * coeff) >> SPARSE_SCALE_FACTOR;
124         p_dest[15] = (p_source[15] * coeff) >> SPARSE_SCALE_FACTOR;
125         p_dest += 16;
126         p_source += 16;
127     }
128     return;
129 }
130       
131
132 /*****************************************************************************
133  * vdec_IDCT : IDCT function for normal matrices
134  *****************************************************************************/
135
136 #ifndef HAVE_MMX
137 void vdec_IDCT( vdec_thread_t * p_vdec, dctelem_t * p_block, int i_idontcare )
138 {
139 #if 0 
140     /* dct classique: pour tester la meilleure entre la classique et la */
141     /* no classique */
142     s32 tmp0, tmp1, tmp2, tmp3;
143     s32 tmp10, tmp11, tmp12, tmp13;
144     s32 z1, z2, z3, z4, z5;
145     dctelem_t * dataptr;
146     int rowctr;
147     SHIFT_TEMPS
148
149   /* Pass 1: process rows. */
150   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
151   /* furthermore, we scale the results by 2**PASS1_BITS. */
152
153     dataptr = p_block;
154     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) 
155     {
156     /* Due to quantization, we will usually find that many of the input
157      * coefficients are zero, especially the AC terms.  We can exploit this
158      * by short-circuiting the IDCT calculation for any row in which all
159      * the AC terms are zero.  In that case each output is equal to the
160      * DC coefficient (with scale factor as needed).
161      * With typical images and quantization tables, half or more of the
162      * row DCT calculations can be simplified this way.
163      */
164
165         if ((dataptr[1] | dataptr[2] | dataptr[3] | dataptr[4] |
166                 dataptr[5] | dataptr[6] | dataptr[7]) == 0) 
167         {
168       /* AC terms all zero */
169             dctelem_t dcval = (dctelem_t) (dataptr[0] << PASS1_BITS);
170       
171             dataptr[0] = dcval;
172             dataptr[1] = dcval;
173             dataptr[2] = dcval;
174             dataptr[3] = dcval;
175             dataptr[4] = dcval;
176             dataptr[5] = dcval;
177             dataptr[6] = dcval;
178             dataptr[7] = dcval;
179       
180             dataptr += DCTSIZE; /* advance pointer to next row */
181             continue;
182         }
183
184     /* Even part: reverse the even part of the forward DCT. */
185     /* The rotator is sqrt(2)*c(-6). */
186
187         z2 = (s32) dataptr[2];
188         z3 = (s32) dataptr[6];
189
190         z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
191         tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
192         tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
193
194         tmp0 = ((s32) dataptr[0] + (s32) dataptr[4]) << CONST_BITS;
195         tmp1 = ((s32) dataptr[0] - (s32) dataptr[4]) << CONST_BITS;
196
197         tmp10 = tmp0 + tmp3;
198         tmp13 = tmp0 - tmp3;
199         tmp11 = tmp1 + tmp2;
200         tmp12 = tmp1 - tmp2;
201     
202     /* Odd part per figure 8; the matrix is unitary and hence its
203      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
204      */
205
206         tmp0 = (s32) dataptr[7];
207         tmp1 = (s32) dataptr[5];
208         tmp2 = (s32) dataptr[3];
209         tmp3 = (s32) dataptr[1];
210
211         z1 = tmp0 + tmp3;
212         z2 = tmp1 + tmp2;
213         z3 = tmp0 + tmp2;
214         z4 = tmp1 + tmp3;
215         z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
216     
217         tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
218         tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
219         tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
220         tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
221         z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
222         z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
223         z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
224         z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
225     
226         z3 += z5;
227         z4 += z5;
228     
229         tmp0 += z1 + z3;
230         tmp1 += z2 + z4;
231         tmp2 += z2 + z3;
232         tmp3 += z1 + z4;
233
234     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
235
236         dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
237         dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
238         dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
239         dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
240         dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
241         dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
242         dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
243         dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
244
245         dataptr += DCTSIZE;             /* advance pointer to next row */
246     }
247
248   /* Pass 2: process columns. */
249   /* Note that we must descale the results by a factor of 8 == 2**3, */
250   /* and also undo the PASS1_BITS scaling. */
251
252     dataptr = p_block;
253     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) 
254     {
255     /* Columns of zeroes can be exploited in the same way as we did with rows.
256      * However, the row calculation has created many nonzero AC terms, so the
257      * simplification applies less often (typically 5% to 10% of the time).
258      * On machines with very fast multiplication, it's possible that the
259      * test takes more time than it's worth.  In that case this section
260      * may be commented out.
261      */
262
263 #ifndef NO_ZERO_COLUMN_TEST /*ajoute un test mais evite des calculs */
264         if ((dataptr[DCTSIZE*1] | dataptr[DCTSIZE*2] | dataptr[DCTSIZE*3] |
265             dataptr[DCTSIZE*4] | dataptr[DCTSIZE*5] | dataptr[DCTSIZE*6] |
266             dataptr[DCTSIZE*7]) == 0) 
267         {
268       /* AC terms all zero */
269             dctelem_t dcval = (dctelem_t) DESCALE((s32) dataptr[0], PASS1_BITS+3);
270       
271             dataptr[DCTSIZE*0] = dcval;
272             dataptr[DCTSIZE*1] = dcval;
273             dataptr[DCTSIZE*2] = dcval;
274             dataptr[DCTSIZE*3] = dcval;
275             dataptr[DCTSIZE*4] = dcval;
276             dataptr[DCTSIZE*5] = dcval;
277             dataptr[DCTSIZE*6] = dcval;
278             dataptr[DCTSIZE*7] = dcval;
279       
280             dataptr++;          /* advance pointer to next column */
281             continue;
282         }
283 #endif
284
285     /* Even part: reverse the even part of the forward DCT. */
286     /* The rotator is sqrt(2)*c(-6). */
287
288         z2 = (s32) dataptr[DCTSIZE*2];
289         z3 = (s32) dataptr[DCTSIZE*6];
290
291         z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
292         tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
293         tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
294
295         tmp0 = ((s32) dataptr[DCTSIZE*0] + (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
296         tmp1 = ((s32) dataptr[DCTSIZE*0] - (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
297
298         tmp10 = tmp0 + tmp3;
299         tmp13 = tmp0 - tmp3;
300         tmp11 = tmp1 + tmp2;
301         tmp12 = tmp1 - tmp2;
302     
303     /* Odd part per figure 8; the matrix is unitary and hence its
304      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
305      */
306
307         tmp0 = (s32) dataptr[DCTSIZE*7];
308         tmp1 = (s32) dataptr[DCTSIZE*5];
309         tmp2 = (s32) dataptr[DCTSIZE*3];
310         tmp3 = (s32) dataptr[DCTSIZE*1];
311
312         z1 = tmp0 + tmp3;
313         z2 = tmp1 + tmp2;
314         z3 = tmp0 + tmp2;
315         z4 = tmp1 + tmp3;
316         z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
317     
318         tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
319         tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
320         tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
321         tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
322         z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
323         z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
324         z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
325         z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
326     
327         z3 += z5;
328         z4 += z5;
329     
330         tmp0 += z1 + z3;
331         tmp1 += z2 + z4;
332         tmp2 += z2 + z3;
333         tmp3 += z1 + z4;
334
335     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
336
337         dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
338                                            CONST_BITS+PASS1_BITS+3);
339         dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
340                                            CONST_BITS+PASS1_BITS+3);
341         dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
342                                            CONST_BITS+PASS1_BITS+3);
343         dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
344                                            CONST_BITS+PASS1_BITS+3);
345         dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
346                                            CONST_BITS+PASS1_BITS+3);
347         dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
348                                            CONST_BITS+PASS1_BITS+3);
349         dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
350                                            CONST_BITS+PASS1_BITS+3);
351         dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
352                                            CONST_BITS+PASS1_BITS+3);
353     
354         dataptr++;                      /* advance pointer to next column */
355     }
356 #endif    
357
358 #if 1  /*dct avec non classique*/    
359     
360     s32 tmp0, tmp1, tmp2, tmp3;
361     s32 tmp10, tmp11, tmp12, tmp13;
362     s32 z1, z2, z3, z4, z5;
363     s32 d0, d1, d2, d3, d4, d5, d6, d7;
364     dctelem_t * dataptr;
365     int rowctr;
366     
367     SHIFT_TEMPS
368    
369     /* Pass 1: process rows. */
370     /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
371     /* furthermore, we scale the results by 2**PASS1_BITS. */
372
373     dataptr = p_block;
374
375     fprintf( stderr, "normal dct" );
376     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) 
377     {
378         /* Due to quantization, we will usually find that many of the input
379          * coefficients are zero, especially the AC terms.  We can exploit this
380          * by short-circuiting the IDCT calculation for any row in which all
381          * the AC terms are zero.  In that case each output is equal to the
382          * DC coefficient (with scale factor as needed).
383          * With typical images and quantization tables, half or more of the
384          * row DCT calculations can be simplified this way.
385          */
386
387         register int * idataptr = (int*)dataptr;
388         d0 = dataptr[0];
389         d1 = dataptr[1];
390         if ( (d1 == 0) && ((idataptr[1] | idataptr[2] | idataptr[3]) == 0) ) 
391         {
392       /* AC terms all zero */
393             if (d0) 
394             {
395       /* Compute a 32 bit value to assign. */
396                 dctelem_t dcval = (dctelem_t) (d0 << PASS1_BITS);
397                 register int v = (dcval & 0xffff) | (dcval << 16);
398   
399                 idataptr[0] = v;
400                 idataptr[1] = v;
401                 idataptr[2] = v;
402                 idataptr[3] = v;
403             }
404       
405             dataptr += DCTSIZE; /* advance pointer to next row */
406             continue;
407         }
408         d2 = dataptr[2];
409         d3 = dataptr[3];
410         d4 = dataptr[4];
411         d5 = dataptr[5];
412         d6 = dataptr[6];
413         d7 = dataptr[7];
414
415     /* Even part: reverse the even part of the forward DCT. */
416     /* The rotator is sqrt(2)*c(-6). */
417         if (d6) 
418         {
419             if (d4) 
420             {
421                 if (d2) 
422                 {
423                             if (d0) 
424                     {
425             /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
426                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
427                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
428                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
429
430                         tmp0 = (d0 + d4) << CONST_BITS;
431                         tmp1 = (d0 - d4) << CONST_BITS;
432
433                         tmp10 = tmp0 + tmp3;
434                         tmp13 = tmp0 - tmp3;
435                         tmp11 = tmp1 + tmp2;
436                         tmp12 = tmp1 - tmp2;
437                     } 
438                     else 
439                     {
440                     /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
441                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
442                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
443                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
444
445                         tmp0 = d4 << CONST_BITS;
446     
447                         tmp10 = tmp0 + tmp3;
448                         tmp13 = tmp0 - tmp3;
449                         tmp11 = tmp2 - tmp0;
450                         tmp12 = -(tmp0 + tmp2);
451                         }
452                 } 
453                 else 
454                 {
455                             if (d0) 
456                     {
457             /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
458                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
459                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
460     
461                         tmp0 = (d0 + d4) << CONST_BITS;
462                         tmp1 = (d0 - d4) << CONST_BITS;
463
464                         tmp10 = tmp0 + tmp3;
465                         tmp13 = tmp0 - tmp3;
466                         tmp11 = tmp1 + tmp2;
467                         tmp12 = tmp1 - tmp2;
468                         }
469                     else 
470                     {
471                     /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
472                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
473                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
474     
475                         tmp0 = d4 << CONST_BITS;
476  
477                         tmp10 = tmp0 + tmp3;
478                         tmp13 = tmp0 - tmp3;
479                         tmp11 = tmp2 - tmp0;
480                         tmp12 = -(tmp0 + tmp2);
481                         }
482                 }
483             } 
484             else 
485             {
486                 if (d2) 
487                 {
488                     if (d0) 
489                     {
490             /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
491                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
492                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
493                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
494
495                         tmp0 = d0 << CONST_BITS;
496     
497                         tmp10 = tmp0 + tmp3;
498                         tmp13 = tmp0 - tmp3;
499                         tmp11 = tmp0 + tmp2;
500                         tmp12 = tmp0 - tmp2;
501                     } 
502                     else 
503                     {
504                     /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
505                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
506                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
507                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
508
509                         tmp10 = tmp3;
510                         tmp13 = -tmp3;
511                         tmp11 = tmp2;
512                         tmp12 = -tmp2;
513                             }
514                 }
515                 else 
516                 {
517                     if (d0) 
518                     {
519             /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
520                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
521                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
522     
523                         tmp0 = d0 << CONST_BITS;
524     
525                         tmp10 = tmp0 + tmp3;
526                         tmp13 = tmp0 - tmp3;
527                         tmp11 = tmp0 + tmp2;
528                         tmp12 = tmp0 - tmp2;
529                     }
530                     else 
531                     {
532             /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
533                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
534                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
535
536                         tmp10 = tmp3;
537                         tmp13 = -tmp3;
538                         tmp11 = tmp2;
539                         tmp12 = -tmp2;
540                     }
541                 }
542             }
543         }
544         else
545         {
546             if (d4) 
547             {
548                 if (d2) 
549                 {
550                     if (d0) 
551                     {
552                     /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
553                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
554                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
555     
556                         tmp0 = (d0 + d4) << CONST_BITS;
557                         tmp1 = (d0 - d4) << CONST_BITS;
558     
559                         tmp10 = tmp0 + tmp3;
560                         tmp13 = tmp0 - tmp3;
561                         tmp11 = tmp1 + tmp2;
562                         tmp12 = tmp1 - tmp2;
563                     }
564                     else 
565                     {
566             /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
567                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
568                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
569     
570                         tmp0 = d4 << CONST_BITS;
571
572                         tmp10 = tmp0 + tmp3;
573                         tmp13 = tmp0 - tmp3;
574                         tmp11 = tmp2 - tmp0;
575                         tmp12 = -(tmp0 + tmp2);
576                     }
577                 }
578                 else 
579                 {
580                     if (d0)
581                     {
582             /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
583                         tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
584                         tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
585                     } 
586                     else 
587                     {
588             /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
589                         tmp10 = tmp13 = d4 << CONST_BITS;
590                         tmp11 = tmp12 = -tmp10;
591                     }
592                 }
593             }
594             else 
595             {
596                 if (d2) 
597                 {
598                     if (d0) 
599                     {
600             /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
601                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
602                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
603
604                         tmp0 = d0 << CONST_BITS;
605
606                         tmp10 = tmp0 + tmp3;
607                         tmp13 = tmp0 - tmp3;
608                         tmp11 = tmp0 + tmp2;
609                         tmp12 = tmp0 - tmp2;
610                     }
611                     else
612                     {
613             /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
614                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
615                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
616
617                         tmp10 = tmp3;
618                         tmp13 = -tmp3;
619                         tmp11 = tmp2;
620                         tmp12 = -tmp2;
621                     }
622                 }
623                 else
624                 {
625                     if (d0)
626                     {
627             /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
628                         tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
629                     } 
630                     else
631                     {
632             /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
633                         tmp10 = tmp13 = tmp11 = tmp12 = 0;
634                     }
635                 }    
636             }
637         }
638
639
640     /* Odd part per figure 8; the matrix is unitary and hence its
641      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
642      */
643
644         if (d7) 
645             {
646                 if (d5)
647             {
648                 if (d3) 
649                 {
650                     if (d1) 
651                     {
652             /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
653                         z1 = d7 + d1;
654                         z2 = d5 + d3;
655                         z3 = d7 + d3;
656                         z4 = d5 + d1;
657                         z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
658                     
659                         tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
660                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
661                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
662                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
663                         z1 = MULTIPLY(z1, - FIX(0.899976223));
664                         z2 = MULTIPLY(z2, - FIX(2.562915447));
665                         z3 = MULTIPLY(z3, - FIX(1.961570560));
666                         z4 = MULTIPLY(z4, - FIX(0.390180644));
667                     
668                         z3 += z5;
669                         z4 += z5;
670                     
671                         tmp0 += z1 + z3;
672                         tmp1 += z2 + z4;
673                         tmp2 += z2 + z3;
674                         tmp3 += z1 + z4;
675                     } 
676                     else 
677                     {
678             /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
679                         z2 = d5 + d3;
680                         z3 = d7 + d3;
681                         z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
682                     
683                         tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
684                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
685                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
686                         z1 = MULTIPLY(d7, - FIX(0.899976223));
687                         z2 = MULTIPLY(z2, - FIX(2.562915447));
688                         z3 = MULTIPLY(z3, - FIX(1.961570560));
689                         z4 = MULTIPLY(d5, - FIX(0.390180644));
690                     
691                         z3 += z5;
692                         z4 += z5;
693                     
694                         tmp0 += z1 + z3;
695                         tmp1 += z2 + z4;
696                         tmp2 += z2 + z3;
697                         tmp3 = z1 + z4;
698                         }
699                     }
700                 else 
701                 {
702                     if (d1)
703                     {
704             /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
705                         z1 = d7 + d1;
706                         z4 = d5 + d1;
707                         z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
708                     
709                         tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
710                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
711                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
712                         z1 = MULTIPLY(z1, - FIX(0.899976223));
713                         z2 = MULTIPLY(d5, - FIX(2.562915447));
714                         z3 = MULTIPLY(d7, - FIX(1.961570560));
715                         z4 = MULTIPLY(z4, - FIX(0.390180644));
716                     
717                         z3 += z5;
718                         z4 += z5;
719                     
720                         tmp0 += z1 + z3;
721                         tmp1 += z2 + z4;
722                         tmp2 = z2 + z3;
723                         tmp3 += z1 + z4;
724                     }
725                     else
726                     {
727             /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
728                         z5 = MULTIPLY(d7 + d5, FIX(1.175875602));
729
730                         tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
731                         tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
732                         z1 = MULTIPLY(d7, - FIX(0.899976223));
733                         z3 = MULTIPLY(d7, - FIX(1.961570560));
734                         z2 = MULTIPLY(d5, - FIX(2.562915447));
735                         z4 = MULTIPLY(d5, - FIX(0.390180644));
736                     
737                         z3 += z5;
738                         z4 += z5;
739                         
740                         tmp0 += z3;
741                         tmp1 += z4;
742                         tmp2 = z2 + z3;
743                         tmp3 = z1 + z4;
744                     }
745                 }
746             }
747             else 
748             {
749                 if (d3) 
750                 {
751                     if (d1) 
752                     {
753             /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
754                         z1 = d7 + d1;
755                         z3 = d7 + d3;
756                         z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
757                     
758                         tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
759                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
760                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
761                         z1 = MULTIPLY(z1, - FIX(0.899976223));
762                         z2 = MULTIPLY(d3, - FIX(2.562915447));
763                         z3 = MULTIPLY(z3, - FIX(1.961570560));
764                         z4 = MULTIPLY(d1, - FIX(0.390180644));
765                     
766                         z3 += z5;
767                         z4 += z5;
768                     
769                         tmp0 += z1 + z3;
770                         tmp1 = z2 + z4;
771                         tmp2 += z2 + z3;
772                         tmp3 += z1 + z4;
773                     }
774                     else
775                     {
776             /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
777                         z3 = d7 + d3;
778                         z5 = MULTIPLY(z3, FIX(1.175875602));
779                     
780                         tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
781                         tmp2 = MULTIPLY(d3, FIX(0.509795579));
782                         z1 = MULTIPLY(d7, - FIX(0.899976223));
783                         z2 = MULTIPLY(d3, - FIX(2.562915447));
784                         z3 = MULTIPLY(z3, - FIX2(0.785694958));
785     
786                         tmp0 += z3;
787                         tmp1 = z2 + z5;
788                         tmp2 += z3;
789                         tmp3 = z1 + z5;
790                     }
791                 } 
792                 else 
793                 {
794                     if (d1) 
795                     {
796             /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
797                         z1 = d7 + d1;
798                         z5 = MULTIPLY(z1, FIX(1.175875602));
799
800                         tmp0 = MULTIPLY(d7, - FIX2(1.662939224));
801                         tmp3 = MULTIPLY(d1, FIX2(1.111140466));
802                         z1 = MULTIPLY(z1, FIX2(0.275899379));
803                         z3 = MULTIPLY(d7, - FIX(1.961570560));
804                         z4 = MULTIPLY(d1, - FIX(0.390180644));
805     
806                         tmp0 += z1;
807                         tmp1 = z4 + z5;
808                         tmp2 = z3 + z5;
809                         tmp3 += z1;
810                     }   
811                 else 
812                     {
813             /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
814                         tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
815                         tmp1 = MULTIPLY(d7, FIX(1.175875602));
816                         tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
817                         tmp3 = MULTIPLY(d7, FIX2(0.275899379));
818                     }
819                 }
820             }
821         }
822         else
823         { 
824             if (d5)
825             {
826                 if (d3) 
827                 {
828                     if (d1)
829                     {
830             /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
831                         z2 = d5 + d3;
832                         z4 = d5 + d1;
833                         z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
834                     
835                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
836                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
837                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
838                         z1 = MULTIPLY(d1, - FIX(0.899976223));
839                         z2 = MULTIPLY(z2, - FIX(2.562915447));
840                         z3 = MULTIPLY(d3, - FIX(1.961570560));
841                         z4 = MULTIPLY(z4, - FIX(0.390180644));
842                     
843                         z3 += z5;
844                         z4 += z5;
845                     
846                         tmp0 = z1 + z3;
847                         tmp1 += z2 + z4;
848                         tmp2 += z2 + z3;
849                         tmp3 += z1 + z4;
850                     } 
851                     else 
852                     {
853             /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
854                         z2 = d5 + d3;
855                         z5 = MULTIPLY(z2, FIX(1.175875602));
856                     
857                         tmp1 = MULTIPLY(d5, FIX2(1.662939225));
858                         tmp2 = MULTIPLY(d3, FIX2(1.111140466));
859                         z2 = MULTIPLY(z2, - FIX2(1.387039845));
860                         z3 = MULTIPLY(d3, - FIX(1.961570560));
861                         z4 = MULTIPLY(d5, - FIX(0.390180644));
862                     
863                         tmp0 = z3 + z5;
864                         tmp1 += z2;
865                         tmp2 += z2;
866                         tmp3 = z4 + z5;
867                     }
868                 }
869                 else 
870                 {
871                     if (d1) 
872                     {
873             /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
874                         z4 = d5 + d1;
875                         z5 = MULTIPLY(z4, FIX(1.175875602));
876                     
877                         tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
878                         tmp3 = MULTIPLY(d1, FIX2(0.601344887));
879                         z1 = MULTIPLY(d1, - FIX(0.899976223));
880                         z2 = MULTIPLY(d5, - FIX(2.562915447));
881                         z4 = MULTIPLY(z4, FIX2(0.785694958));
882                     
883                         tmp0 = z1 + z5;
884                         tmp1 += z4;
885                         tmp2 = z2 + z5;
886                         tmp3 += z4;
887                     }
888                     else 
889                     {
890             /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
891                         tmp0 = MULTIPLY(d5, FIX(1.175875602));
892                         tmp1 = MULTIPLY(d5, FIX2(0.275899380));
893                         tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
894                         tmp3 = MULTIPLY(d5, FIX2(0.785694958));
895                     }
896                 }
897             }
898             else 
899             {
900                 if (d3)
901                 {
902                     if (d1) 
903                     {
904             /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
905                         z5 = d3 + d1;
906
907                         tmp2 = MULTIPLY(d3, - FIX(1.451774981));
908                         tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
909                         z1 = MULTIPLY(d1, FIX(1.061594337));
910                         z2 = MULTIPLY(d3, - FIX(2.172734803));
911                         z4 = MULTIPLY(z5, FIX(0.785694958));
912                         z5 = MULTIPLY(z5, FIX(1.175875602));
913                     
914                         tmp0 = z1 - z4;
915                         tmp1 = z2 + z4;
916                         tmp2 += z5;
917                         tmp3 += z5;
918                     }
919                     else 
920                     {
921             /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
922                         tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
923                         tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
924                         tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
925                         tmp3 = MULTIPLY(d3, FIX(1.175875602));
926                     }
927                 }
928                 else 
929                 {
930                     if (d1) 
931                     {
932             /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
933                         tmp0 = MULTIPLY(d1, FIX2(0.275899379));
934                         tmp1 = MULTIPLY(d1, FIX2(0.785694958));
935                         tmp2 = MULTIPLY(d1, FIX(1.175875602));
936                         tmp3 = MULTIPLY(d1, FIX2(1.387039845));
937                     }
938                     else 
939                     {
940             /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
941                         tmp0 = tmp1 = tmp2 = tmp3 = 0;
942                     }
943                 }
944             }
945         }
946
947     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
948
949         dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
950         dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
951         dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
952         dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
953         dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
954         dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
955         dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
956         dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
957
958         dataptr += DCTSIZE;             /* advance pointer to next row */
959     }
960
961   /* Pass 2: process columns. */
962   /* Note that we must descale the results by a factor of 8 == 2**3, */
963   /* and also undo the PASS1_BITS scaling. */
964
965     dataptr = p_block;
966     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) 
967     {
968     /* Columns of zeroes can be exploited in the same way as we did with rows.
969      * However, the row calculation has created many nonzero AC terms, so the
970      * simplification applies less often (typically 5% to 10% of the time).
971      * On machines with very fast multiplication, it's possible that the
972      * test takes more time than it's worth.  In that case this section
973      * may be commented out.
974      */
975
976         d0 = dataptr[DCTSIZE*0];
977         d1 = dataptr[DCTSIZE*1];
978         d2 = dataptr[DCTSIZE*2];
979         d3 = dataptr[DCTSIZE*3];
980         d4 = dataptr[DCTSIZE*4];
981         d5 = dataptr[DCTSIZE*5];
982         d6 = dataptr[DCTSIZE*6];
983         d7 = dataptr[DCTSIZE*7];
984
985     /* Even part: reverse the even part of the forward DCT. */
986     /* The rotator is sqrt(2)*c(-6). */
987         if (d6) 
988         {
989             if (d4) 
990             {
991                 if (d2)
992                 {
993                     if (d0)
994                     {
995             /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
996                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
997                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
998                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
999
1000                         tmp0 = (d0 + d4) << CONST_BITS;
1001                         tmp1 = (d0 - d4) << CONST_BITS;
1002
1003                         tmp10 = tmp0 + tmp3;
1004                         tmp13 = tmp0 - tmp3;
1005                         tmp11 = tmp1 + tmp2;
1006                         tmp12 = tmp1 - tmp2;
1007                     }
1008                     else 
1009                     {
1010             /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
1011                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1012                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1013                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1014
1015                         tmp0 = d4 << CONST_BITS;
1016
1017                         tmp10 = tmp0 + tmp3;
1018                         tmp13 = tmp0 - tmp3;
1019                         tmp11 = tmp2 - tmp0;
1020                         tmp12 = -(tmp0 + tmp2);
1021                     }
1022                 }
1023                 else 
1024                 {
1025                     if (d0)
1026                     {
1027             /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1028                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1029                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
1030
1031                         tmp0 = (d0 + d4) << CONST_BITS;
1032                         tmp1 = (d0 - d4) << CONST_BITS;
1033
1034                         tmp10 = tmp0 + tmp3;
1035                         tmp13 = tmp0 - tmp3;
1036                         tmp11 = tmp1 + tmp2;
1037                         tmp12 = tmp1 - tmp2;
1038                     }
1039                     else
1040                     {
1041             /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
1042                         tmp2 = MULTIPLY(d6, -FIX2(1.306562965));
1043                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
1044
1045                         tmp0 = d4 << CONST_BITS;
1046
1047                         tmp10 = tmp0 + tmp3;
1048                         tmp13 = tmp0 - tmp3;
1049                         tmp11 = tmp2 - tmp0;
1050                         tmp12 = -(tmp0 + tmp2);
1051                     }
1052                 }
1053             }
1054             else
1055             {
1056                 if (d2) 
1057                 {
1058                     if (d0) 
1059                     {
1060             /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
1061                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1062                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1063                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1064
1065                         tmp0 = d0 << CONST_BITS;
1066
1067                         tmp10 = tmp0 + tmp3;
1068                         tmp13 = tmp0 - tmp3;
1069                         tmp11 = tmp0 + tmp2;
1070                         tmp12 = tmp0 - tmp2;
1071                     }
1072                     else
1073                     {
1074             /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
1075                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1076                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1077                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1078
1079                         tmp10 = tmp3;
1080                         tmp13 = -tmp3;
1081                         tmp11 = tmp2;
1082                         tmp12 = -tmp2;
1083                     }
1084                 } 
1085                 else 
1086                 {
1087                     if (d0)
1088                     {
1089             /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
1090                     tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1091                     tmp3 = MULTIPLY(d6, FIX(0.541196100));
1092
1093                     tmp0 = d0 << CONST_BITS;
1094
1095                     tmp10 = tmp0 + tmp3;
1096                     tmp13 = tmp0 - tmp3;
1097                     tmp11 = tmp0 + tmp2;
1098                     tmp12 = tmp0 - tmp2;
1099                 } 
1100                 else 
1101                 {
1102             /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
1103                     tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1104                     tmp3 = MULTIPLY(d6, FIX(0.541196100));
1105                     tmp10 = tmp3;
1106                     tmp13 = -tmp3;
1107                     tmp11 = tmp2;
1108                     tmp12 = -tmp2;
1109                 }
1110             }
1111         }
1112     }
1113     else
1114     {
1115         if (d4) 
1116         {
1117             if (d2) 
1118             {
1119                     if (d0) 
1120                 {
1121             /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1122                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1123                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1124
1125                     tmp0 = (d0 + d4) << CONST_BITS;
1126                     tmp1 = (d0 - d4) << CONST_BITS;
1127
1128                     tmp10 = tmp0 + tmp3;
1129                     tmp13 = tmp0 - tmp3;
1130                     tmp11 = tmp1 + tmp2;
1131                     tmp12 = tmp1 - tmp2;
1132                 }
1133                 else 
1134                 {
1135             /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
1136                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1137                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1138
1139                     tmp0 = d4 << CONST_BITS;
1140
1141                     tmp10 = tmp0 + tmp3;
1142                     tmp13 = tmp0 - tmp3;
1143                     tmp11 = tmp2 - tmp0;
1144                     tmp12 = -(tmp0 + tmp2);
1145                 }
1146             }
1147             else 
1148             {
1149                 if (d0)
1150                 {
1151             /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1152                     tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1153                     tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1154                 }
1155                 else 
1156                 {
1157             /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
1158                     tmp10 = tmp13 = d4 << CONST_BITS;
1159                     tmp11 = tmp12 = -tmp10;
1160                 }
1161             }
1162         } 
1163         else 
1164         {
1165         if (d2) 
1166         {
1167             if (d0) 
1168             {
1169             /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
1170                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1171                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1172
1173                     tmp0 = d0 << CONST_BITS;
1174
1175                     tmp10 = tmp0 + tmp3;
1176                     tmp13 = tmp0 - tmp3;
1177                     tmp11 = tmp0 + tmp2;
1178                     tmp12 = tmp0 - tmp2;
1179             }
1180             else 
1181             {
1182             /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
1183                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1184                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1185
1186                     tmp10 = tmp3;
1187                     tmp13 = -tmp3;
1188                     tmp11 = tmp2;
1189                     tmp12 = -tmp2;
1190             }
1191         }
1192         else 
1193         {
1194             if (d0) 
1195                 {
1196             /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
1197                     tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
1198                 }
1199                 else 
1200                 {
1201             /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
1202                     tmp10 = tmp13 = tmp11 = tmp12 = 0;
1203                 }
1204             }
1205         }
1206     }
1207
1208     /* Odd part per figure 8; the matrix is unitary and hence its
1209      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
1210      */
1211     if (d7) 
1212     {
1213             if (d5) 
1214         {
1215             if (d3) 
1216             {
1217                 if (d1) 
1218                 {
1219             /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
1220                     z1 = d7 + d1;
1221                     z2 = d5 + d3;
1222                     z3 = d7 + d3;
1223                     z4 = d5 + d1;
1224                     z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
1225                     
1226                     tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
1227                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1228                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1229                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1230                     z1 = MULTIPLY(z1, - FIX(0.899976223));
1231                     z2 = MULTIPLY(z2, - FIX(2.562915447));
1232                     z3 = MULTIPLY(z3, - FIX(1.961570560));
1233                     z4 = MULTIPLY(z4, - FIX(0.390180644));
1234                     
1235                     z3 += z5;
1236                     z4 += z5;
1237                     
1238                     tmp0 += z1 + z3;
1239                     tmp1 += z2 + z4;
1240                     tmp2 += z2 + z3;
1241                     tmp3 += z1 + z4;
1242                 }
1243                 else 
1244                 {
1245             /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
1246                     z2 = d5 + d3;
1247                     z3 = d7 + d3;
1248                     z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
1249                     
1250                     tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
1251                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1252                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1253                     z1 = MULTIPLY(d7, - FIX(0.899976223));
1254                     z2 = MULTIPLY(z2, - FIX(2.562915447));
1255                     z3 = MULTIPLY(z3, - FIX(1.961570560));
1256                     z4 = MULTIPLY(d5, - FIX(0.390180644));
1257                     
1258                     z3 += z5;
1259                     z4 += z5;
1260                     
1261                     tmp0 += z1 + z3;
1262                     tmp1 += z2 + z4;
1263                     tmp2 += z2 + z3;
1264                     tmp3 = z1 + z4;
1265                 }
1266             } 
1267             else
1268             {
1269                 if (d1) 
1270                 {
1271             /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
1272                     z1 = d7 + d1;
1273                     z4 = d5 + d1;
1274                     z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
1275                     
1276                     tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
1277                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1278                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1279                     z1 = MULTIPLY(z1, - FIX(0.899976223));
1280                     z2 = MULTIPLY(d5, - FIX(2.562915447));
1281                     z3 = MULTIPLY(d7, - FIX(1.961570560));
1282                     z4 = MULTIPLY(z4, - FIX(0.390180644));
1283                     
1284                     z3 += z5;
1285                     z4 += z5;
1286                     
1287                     tmp0 += z1 + z3;
1288                     tmp1 += z2 + z4;
1289                     tmp2 = z2 + z3;
1290                     tmp3 += z1 + z4;
1291                 }
1292                 else 
1293                 {
1294             /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
1295                     z5 = MULTIPLY(d5 + d7, FIX(1.175875602));
1296
1297                     tmp0 = MULTIPLY(d7, - FIX2(0.601344887)); 
1298                     tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1299                     z1 = MULTIPLY(d7, - FIX(0.899976223));
1300                     z3 = MULTIPLY(d7, - FIX(1.961570560));
1301                     z2 = MULTIPLY(d5, - FIX(2.562915447));
1302                     z4 = MULTIPLY(d5, - FIX(0.390180644));
1303                     
1304                     z3 += z5;
1305                     z4 += z5;
1306                     
1307                     tmp0 += z3;
1308                     tmp1 += z4;
1309                     tmp2 = z2 + z3;
1310                     tmp3 = z1 + z4;
1311                 }
1312             }
1313         }
1314         else 
1315         {
1316             if (d3)
1317             {
1318                 if (d1) 
1319                 {
1320             /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
1321                     z1 = d7 + d1;
1322                     z3 = d7 + d3;
1323                     z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
1324                     
1325                     tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
1326                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1327                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1328                     z1 = MULTIPLY(z1, - FIX(0.899976223));
1329                     z2 = MULTIPLY(d3, - FIX(2.562915447));
1330                     z3 = MULTIPLY(z3, - FIX(1.961570560));
1331                     z4 = MULTIPLY(d1, - FIX(0.390180644));
1332                     
1333                     z3 += z5;
1334                     z4 += z5;
1335                     
1336                     tmp0 += z1 + z3;
1337                     tmp1 = z2 + z4;
1338                     tmp2 += z2 + z3;
1339                     tmp3 += z1 + z4;
1340                 } 
1341                 else 
1342                 {
1343             /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
1344                     z3 = d7 + d3;
1345                     z5 = MULTIPLY(z3, FIX(1.175875602));
1346                     
1347                     tmp0 = MULTIPLY(d7, - FIX2(0.601344887)); 
1348                     z1 = MULTIPLY(d7, - FIX(0.899976223));
1349                     tmp2 = MULTIPLY(d3, FIX(0.509795579));
1350                     z2 = MULTIPLY(d3, - FIX(2.562915447));
1351                     z3 = MULTIPLY(z3, - FIX2(0.785694958));
1352                     
1353                     tmp0 += z3;
1354                     tmp1 = z2 + z5;
1355                     tmp2 += z3;
1356                     tmp3 = z1 + z5;
1357                 }
1358             } 
1359             else
1360             {
1361                 if (d1) 
1362                 {
1363             /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
1364                     z1 = d7 + d1;
1365                     z5 = MULTIPLY(z1, FIX(1.175875602));
1366
1367                     tmp0 = MULTIPLY(d7, - FIX2(1.662939224)); 
1368                     tmp3 = MULTIPLY(d1, FIX2(1.111140466));
1369                     z1 = MULTIPLY(z1, FIX2(0.275899379));
1370                     z3 = MULTIPLY(d7, - FIX(1.961570560));
1371                     z4 = MULTIPLY(d1, - FIX(0.390180644));
1372
1373                     tmp0 += z1;
1374                     tmp1 = z4 + z5;
1375                     tmp2 = z3 + z5;
1376                     tmp3 += z1;
1377                 }
1378                 else 
1379                 {
1380             /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
1381                     tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
1382                     tmp1 = MULTIPLY(d7, FIX(1.175875602));
1383                     tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
1384                     tmp3 = MULTIPLY(d7, FIX2(0.275899379));
1385                 }
1386             }
1387         }
1388     } 
1389     else 
1390     {
1391         if (d5) 
1392         {
1393             if (d3) 
1394             {
1395                 if (d1) 
1396                 {
1397             /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
1398                     z2 = d5 + d3;
1399                     z4 = d5 + d1;
1400                     z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
1401                     
1402                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1403                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1404                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1405                     z1 = MULTIPLY(d1, - FIX(0.899976223));
1406                     z2 = MULTIPLY(z2, - FIX(2.562915447));
1407                     z3 = MULTIPLY(d3, - FIX(1.961570560));
1408                     z4 = MULTIPLY(z4, - FIX(0.390180644));
1409                     
1410                     z3 += z5;
1411                     z4 += z5;
1412                     
1413                     tmp0 = z1 + z3;
1414                     tmp1 += z2 + z4;
1415                     tmp2 += z2 + z3;
1416                     tmp3 += z1 + z4;
1417                 }
1418                 else 
1419                 {
1420             /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
1421                     z2 = d5 + d3;
1422                     z5 = MULTIPLY(z2, FIX(1.175875602));
1423
1424                     tmp1 = MULTIPLY(d5, FIX2(1.662939225));
1425                     tmp2 = MULTIPLY(d3, FIX2(1.111140466));
1426                     z2 = MULTIPLY(z2, - FIX2(1.387039845));
1427                     z3 = MULTIPLY(d3, - FIX(1.961570560));
1428                     z4 = MULTIPLY(d5, - FIX(0.390180644));
1429                     
1430                     tmp0 = z3 + z5;
1431                     tmp1 += z2;
1432                     tmp2 += z2;
1433                     tmp3 = z4 + z5;
1434                 }
1435             } 
1436             else 
1437             {
1438                 if (d1) 
1439                 {
1440             /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
1441                     z4 = d5 + d1;
1442                     z5 = MULTIPLY(z4, FIX(1.175875602));
1443                     
1444                     tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1445                     tmp3 = MULTIPLY(d1, FIX2(0.601344887));
1446                     z1 = MULTIPLY(d1, - FIX(0.899976223));
1447                     z2 = MULTIPLY(d5, - FIX(2.562915447));
1448                     z4 = MULTIPLY(z4, FIX2(0.785694958));
1449                     
1450                     tmp0 = z1 + z5;
1451                     tmp1 += z4;
1452                     tmp2 = z2 + z5;
1453                     tmp3 += z4;
1454                 }
1455                 else
1456                 {
1457             /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
1458                     tmp0 = MULTIPLY(d5, FIX(1.175875602));
1459                     tmp1 = MULTIPLY(d5, FIX2(0.275899380));
1460                     tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
1461                     tmp3 = MULTIPLY(d5, FIX2(0.785694958));
1462                 }
1463             }
1464         }
1465         else
1466         {
1467             if (d3)
1468             {
1469                 if (d1) 
1470                 {
1471             /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
1472                     z5 = d3 + d1;
1473
1474                     tmp2 = MULTIPLY(d3, - FIX(1.451774981));
1475                     tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
1476                     z1 = MULTIPLY(d1, FIX(1.061594337));
1477                     z2 = MULTIPLY(d3, - FIX(2.172734803));
1478                     z4 = MULTIPLY(z5, FIX(0.785694958));
1479                     z5 = MULTIPLY(z5, FIX(1.175875602));
1480                     
1481                     tmp0 = z1 - z4;
1482                     tmp1 = z2 + z4;
1483                     tmp2 += z5;
1484                     tmp3 += z5;
1485                 }
1486                 else
1487                 {
1488             /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
1489                     tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
1490                     tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
1491                     tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
1492                     tmp3 = MULTIPLY(d3, FIX(1.175875602));
1493                 }
1494             }
1495             else
1496             {
1497                 if (d1)
1498                 {
1499             /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
1500                     tmp0 = MULTIPLY(d1, FIX2(0.275899379));
1501                     tmp1 = MULTIPLY(d1, FIX2(0.785694958));
1502                     tmp2 = MULTIPLY(d1, FIX(1.175875602));
1503                     tmp3 = MULTIPLY(d1, FIX2(1.387039845));
1504                 }
1505                 else 
1506                 {
1507             /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
1508                     tmp0 = tmp1 = tmp2 = tmp3 = 0;
1509                 }
1510             }
1511         }
1512     }
1513
1514     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1515
1516     dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
1517                        CONST_BITS+PASS1_BITS+3);
1518     dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
1519                        CONST_BITS+PASS1_BITS+3);
1520     dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
1521                        CONST_BITS+PASS1_BITS+3);
1522     dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
1523                        CONST_BITS+PASS1_BITS+3);
1524     dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
1525                        CONST_BITS+PASS1_BITS+3);
1526     dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
1527                        CONST_BITS+PASS1_BITS+3);
1528     dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
1529                        CONST_BITS+PASS1_BITS+3);
1530     dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
1531                        CONST_BITS+PASS1_BITS+3);
1532     
1533     dataptr++;                  /* advance pointer to next column */
1534     }
1535 #endif
1536 }
1537 #endif