]> git.sesse.net Git - vlc/blob - src/video_decoder/vdec_idct.c
* Activation des Sparses idct dans le video parser;
[vlc] / src / video_decoder / vdec_idct.c
1 /*****************************************************************************
2  * vdec_idct.c : IDCT functions
3  * (c)1999 VideoLAN
4  *****************************************************************************/
5
6 /*****************************************************************************
7  * Preamble
8  *****************************************************************************/
9
10 #include <errno.h>
11 #include <stdlib.h>
12 #include <stdio.h>
13 #include <unistd.h>
14 #include <string.h>
15 #include <sys/uio.h>
16 #include <X11/Xlib.h>
17 #include <X11/extensions/XShm.h>
18
19 #include "config.h"
20 #include "common.h"
21 #include "mtime.h"
22 #include "vlc_thread.h"
23
24 #include "intf_msg.h"
25 #include "debug.h"                    /* ?? temporaire, requis par netlist.h */
26
27 #include "input.h"
28 #include "input_netlist.h"
29 #include "decoder_fifo.h"
30 #include "video.h"
31 #include "video_output.h"
32
33 #include "vdec_idct.h"
34 #include "video_decoder.h"
35 #include "vdec_motion.h"
36
37 #include "vpar_blocks.h"
38 #include "vpar_headers.h"
39 #include "video_fifo.h"
40 #include "vpar_synchro.h"
41 #include "video_parser.h"
42
43 /*
44  * Local prototypes
45  */
46
47 /* Our current implementation is a fast DCT, we might move to a fast DFT or
48  * an MMX DCT in the future. */
49
50 /*****************************************************************************
51  * vdec_DummyIDCT : dummy function that does nothing
52  *****************************************************************************/
53 void vdec_DummyIDCT( vdec_thread_t * p_vdec, dctelem_t * p_block, 
54                      int i_idontcare )
55 {
56 }
57
58 /*****************************************************************************
59  * init_SparseIDCT : initialize datas for vdec_SparceIDCT
60  * vdec_SparseIDCT : IDCT function for sparse matrices
61  *****************************************************************************/
62
63 void vdec_InitIDCT (vdec_thread_t * p_vdec) 
64 {         
65     int i;
66     
67     dctelem_t * p_pre = p_vdec->p_pre_idct;
68     memset( p_pre, 0, 64*64*sizeof(dctelem_t) );
69     
70     for( i=0 ; i < 64 ; i++ ) 
71     {
72         p_pre[i*64+i] = 1 << SPARSE_SCALE_FACTOR;
73         vdec_IDCT( p_vdec, &p_pre[i*64], 0) ;
74     }
75     return;
76 }
77
78 void vdec_SparseIDCT (vdec_thread_t * p_vdec, dctelem_t * p_block, 
79                       int i_sparse_pos)
80 {
81     short int val;
82     int * dp;
83     int v;
84     short int * p_dest;
85     short int * p_source;
86     int coeff, rr;
87
88     /* If DC Coefficient. */
89     if ( i_sparse_pos == 0 ) 
90     {
91             dp=(int *)p_block;
92         val=RIGHT_SHIFT((*p_block + 4), 3);
93         /* Compute int to assign.  This speeds things up a bit */
94         v = ((val & 0xffff) | (val << 16));
95         dp[0] = v;     dp[1] = v;     dp[2] = v;     dp[3] = v;
96         dp[4] = v;     dp[5] = v;     dp[6] = v;     dp[7] = v;
97         dp[8] = v;     dp[9] = v;     dp[10] = v;    dp[11] = v;
98         dp[12] = v;    dp[13] = v;    dp[14] = v;    dp[15] = v;
99         dp[16] = v;    dp[17] = v;    dp[18] = v;    dp[19] = v;
100         dp[20] = v;    dp[21] = v;    dp[22] = v;    dp[23] = v;
101         dp[24] = v;    dp[25] = v;    dp[26] = v;    dp[27] = v;
102         dp[28] = v;    dp[29] = v;    dp[30] = v;    dp[31] = v;
103         return;
104     }
105     /* Some other coefficient. */
106     p_dest = (s16*)p_block;
107     p_source = (s16*)&p_vdec->p_pre_idct[i_sparse_pos];
108     coeff = (int)p_dest[i_sparse_pos];
109     for( rr=0 ; rr < 4 ; rr++ )
110     {
111         p_dest[0] = (p_source[0] * coeff) >> SPARSE_SCALE_FACTOR;
112         p_dest[1] = (p_source[1] * coeff) >> SPARSE_SCALE_FACTOR;
113         p_dest[2] = (p_source[2] * coeff) >> SPARSE_SCALE_FACTOR;
114         p_dest[3] = (p_source[3] * coeff) >> SPARSE_SCALE_FACTOR;
115         p_dest[4] = (p_source[4] * coeff) >> SPARSE_SCALE_FACTOR;
116         p_dest[5] = (p_source[5] * coeff) >> SPARSE_SCALE_FACTOR;
117         p_dest[6] = (p_source[6] * coeff) >> SPARSE_SCALE_FACTOR;
118         p_dest[7] = (p_source[7] * coeff) >> SPARSE_SCALE_FACTOR;
119         p_dest[8] = (p_source[8] * coeff) >> SPARSE_SCALE_FACTOR;
120         p_dest[9] = (p_source[9] * coeff) >> SPARSE_SCALE_FACTOR;
121         p_dest[10] = (p_source[10] * coeff) >> SPARSE_SCALE_FACTOR;
122         p_dest[11] = (p_source[11] * coeff) >> SPARSE_SCALE_FACTOR;
123         p_dest[12] = (p_source[12] * coeff) >> SPARSE_SCALE_FACTOR;
124         p_dest[13] = (p_source[13] * coeff) >> SPARSE_SCALE_FACTOR;
125         p_dest[14] = (p_source[14] * coeff) >> SPARSE_SCALE_FACTOR;
126         p_dest[15] = (p_source[15] * coeff) >> SPARSE_SCALE_FACTOR;
127         p_dest += 16;
128         p_source += 16;
129     }
130     return;
131 }
132       
133
134 /*****************************************************************************
135  * vdec_IDCT : IDCT function for normal matrices
136  *****************************************************************************/
137
138 #ifndef HAVE_MMX
139 void vdec_IDCT( vdec_thread_t * p_vdec, dctelem_t * p_block, int i_idontcare )
140 {
141 #if 0 
142     /* dct classique: pour tester la meilleure entre la classique et la */
143     /* no classique */
144     s32 tmp0, tmp1, tmp2, tmp3;
145     s32 tmp10, tmp11, tmp12, tmp13;
146     s32 z1, z2, z3, z4, z5;
147     dctelem_t * dataptr;
148     int rowctr;
149     SHIFT_TEMPS
150
151   /* Pass 1: process rows. */
152   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
153   /* furthermore, we scale the results by 2**PASS1_BITS. */
154
155     dataptr = p_block;
156     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) 
157     {
158     /* Due to quantization, we will usually find that many of the input
159      * coefficients are zero, especially the AC terms.  We can exploit this
160      * by short-circuiting the IDCT calculation for any row in which all
161      * the AC terms are zero.  In that case each output is equal to the
162      * DC coefficient (with scale factor as needed).
163      * With typical images and quantization tables, half or more of the
164      * row DCT calculations can be simplified this way.
165      */
166
167         if ((dataptr[1] | dataptr[2] | dataptr[3] | dataptr[4] |
168                 dataptr[5] | dataptr[6] | dataptr[7]) == 0) 
169         {
170       /* AC terms all zero */
171             dctelem_t dcval = (dctelem_t) (dataptr[0] << PASS1_BITS);
172       
173             dataptr[0] = dcval;
174             dataptr[1] = dcval;
175             dataptr[2] = dcval;
176             dataptr[3] = dcval;
177             dataptr[4] = dcval;
178             dataptr[5] = dcval;
179             dataptr[6] = dcval;
180             dataptr[7] = dcval;
181       
182             dataptr += DCTSIZE; /* advance pointer to next row */
183             continue;
184         }
185
186     /* Even part: reverse the even part of the forward DCT. */
187     /* The rotator is sqrt(2)*c(-6). */
188
189         z2 = (s32) dataptr[2];
190         z3 = (s32) dataptr[6];
191
192         z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
193         tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
194         tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
195
196         tmp0 = ((s32) dataptr[0] + (s32) dataptr[4]) << CONST_BITS;
197         tmp1 = ((s32) dataptr[0] - (s32) dataptr[4]) << CONST_BITS;
198
199         tmp10 = tmp0 + tmp3;
200         tmp13 = tmp0 - tmp3;
201         tmp11 = tmp1 + tmp2;
202         tmp12 = tmp1 - tmp2;
203     
204     /* Odd part per figure 8; the matrix is unitary and hence its
205      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
206      */
207
208         tmp0 = (s32) dataptr[7];
209         tmp1 = (s32) dataptr[5];
210         tmp2 = (s32) dataptr[3];
211         tmp3 = (s32) dataptr[1];
212
213         z1 = tmp0 + tmp3;
214         z2 = tmp1 + tmp2;
215         z3 = tmp0 + tmp2;
216         z4 = tmp1 + tmp3;
217         z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
218     
219         tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
220         tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
221         tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
222         tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
223         z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
224         z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
225         z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
226         z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
227     
228         z3 += z5;
229         z4 += z5;
230     
231         tmp0 += z1 + z3;
232         tmp1 += z2 + z4;
233         tmp2 += z2 + z3;
234         tmp3 += z1 + z4;
235
236     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
237
238         dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
239         dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
240         dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
241         dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
242         dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
243         dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
244         dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
245         dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
246
247         dataptr += DCTSIZE;             /* advance pointer to next row */
248     }
249
250   /* Pass 2: process columns. */
251   /* Note that we must descale the results by a factor of 8 == 2**3, */
252   /* and also undo the PASS1_BITS scaling. */
253
254     dataptr = p_block;
255     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) 
256     {
257     /* Columns of zeroes can be exploited in the same way as we did with rows.
258      * However, the row calculation has created many nonzero AC terms, so the
259      * simplification applies less often (typically 5% to 10% of the time).
260      * On machines with very fast multiplication, it's possible that the
261      * test takes more time than it's worth.  In that case this section
262      * may be commented out.
263      */
264
265 #ifndef NO_ZERO_COLUMN_TEST /*ajoute un test mais evite des calculs */
266         if ((dataptr[DCTSIZE*1] | dataptr[DCTSIZE*2] | dataptr[DCTSIZE*3] |
267             dataptr[DCTSIZE*4] | dataptr[DCTSIZE*5] | dataptr[DCTSIZE*6] |
268             dataptr[DCTSIZE*7]) == 0) 
269         {
270       /* AC terms all zero */
271             dctelem_t dcval = (dctelem_t) DESCALE((s32) dataptr[0], PASS1_BITS+3);
272       
273             dataptr[DCTSIZE*0] = dcval;
274             dataptr[DCTSIZE*1] = dcval;
275             dataptr[DCTSIZE*2] = dcval;
276             dataptr[DCTSIZE*3] = dcval;
277             dataptr[DCTSIZE*4] = dcval;
278             dataptr[DCTSIZE*5] = dcval;
279             dataptr[DCTSIZE*6] = dcval;
280             dataptr[DCTSIZE*7] = dcval;
281       
282             dataptr++;          /* advance pointer to next column */
283             continue;
284         }
285 #endif
286
287     /* Even part: reverse the even part of the forward DCT. */
288     /* The rotator is sqrt(2)*c(-6). */
289
290         z2 = (s32) dataptr[DCTSIZE*2];
291         z3 = (s32) dataptr[DCTSIZE*6];
292
293         z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
294         tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
295         tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
296
297         tmp0 = ((s32) dataptr[DCTSIZE*0] + (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
298         tmp1 = ((s32) dataptr[DCTSIZE*0] - (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
299
300         tmp10 = tmp0 + tmp3;
301         tmp13 = tmp0 - tmp3;
302         tmp11 = tmp1 + tmp2;
303         tmp12 = tmp1 - tmp2;
304     
305     /* Odd part per figure 8; the matrix is unitary and hence its
306      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
307      */
308
309         tmp0 = (s32) dataptr[DCTSIZE*7];
310         tmp1 = (s32) dataptr[DCTSIZE*5];
311         tmp2 = (s32) dataptr[DCTSIZE*3];
312         tmp3 = (s32) dataptr[DCTSIZE*1];
313
314         z1 = tmp0 + tmp3;
315         z2 = tmp1 + tmp2;
316         z3 = tmp0 + tmp2;
317         z4 = tmp1 + tmp3;
318         z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
319     
320         tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
321         tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
322         tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
323         tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
324         z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
325         z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
326         z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
327         z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
328     
329         z3 += z5;
330         z4 += z5;
331     
332         tmp0 += z1 + z3;
333         tmp1 += z2 + z4;
334         tmp2 += z2 + z3;
335         tmp3 += z1 + z4;
336
337     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
338
339         dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
340                                            CONST_BITS+PASS1_BITS+3);
341         dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
342                                            CONST_BITS+PASS1_BITS+3);
343         dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
344                                            CONST_BITS+PASS1_BITS+3);
345         dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
346                                            CONST_BITS+PASS1_BITS+3);
347         dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
348                                            CONST_BITS+PASS1_BITS+3);
349         dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
350                                            CONST_BITS+PASS1_BITS+3);
351         dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
352                                            CONST_BITS+PASS1_BITS+3);
353         dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
354                                            CONST_BITS+PASS1_BITS+3);
355     
356         dataptr++;                      /* advance pointer to next column */
357     }
358 #endif    
359
360 #if 1  /*dct avec non classique*/    
361     
362     s32 tmp0, tmp1, tmp2, tmp3;
363     s32 tmp10, tmp11, tmp12, tmp13;
364     s32 z1, z2, z3, z4, z5;
365     s32 d0, d1, d2, d3, d4, d5, d6, d7;
366     dctelem_t * dataptr;
367     int rowctr;
368     
369     SHIFT_TEMPS
370    
371     /* Pass 1: process rows. */
372     /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
373     /* furthermore, we scale the results by 2**PASS1_BITS. */
374
375     dataptr = p_block;
376
377     fprintf( stderr, "normal dct" );
378     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) 
379     {
380         /* Due to quantization, we will usually find that many of the input
381          * coefficients are zero, especially the AC terms.  We can exploit this
382          * by short-circuiting the IDCT calculation for any row in which all
383          * the AC terms are zero.  In that case each output is equal to the
384          * DC coefficient (with scale factor as needed).
385          * With typical images and quantization tables, half or more of the
386          * row DCT calculations can be simplified this way.
387          */
388
389         register int * idataptr = (int*)dataptr;
390         d0 = dataptr[0];
391         d1 = dataptr[1];
392         if ( (d1 == 0) && ((idataptr[1] | idataptr[2] | idataptr[3]) == 0) ) 
393         {
394       /* AC terms all zero */
395             if (d0) 
396             {
397       /* Compute a 32 bit value to assign. */
398                 dctelem_t dcval = (dctelem_t) (d0 << PASS1_BITS);
399                 register int v = (dcval & 0xffff) | (dcval << 16);
400   
401                 idataptr[0] = v;
402                 idataptr[1] = v;
403                 idataptr[2] = v;
404                 idataptr[3] = v;
405             }
406       
407             dataptr += DCTSIZE; /* advance pointer to next row */
408             continue;
409         }
410         d2 = dataptr[2];
411         d3 = dataptr[3];
412         d4 = dataptr[4];
413         d5 = dataptr[5];
414         d6 = dataptr[6];
415         d7 = dataptr[7];
416
417     /* Even part: reverse the even part of the forward DCT. */
418     /* The rotator is sqrt(2)*c(-6). */
419         if (d6) 
420         {
421             if (d4) 
422             {
423                 if (d2) 
424                 {
425                             if (d0) 
426                     {
427             /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
428                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
429                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
430                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
431
432                         tmp0 = (d0 + d4) << CONST_BITS;
433                         tmp1 = (d0 - d4) << CONST_BITS;
434
435                         tmp10 = tmp0 + tmp3;
436                         tmp13 = tmp0 - tmp3;
437                         tmp11 = tmp1 + tmp2;
438                         tmp12 = tmp1 - tmp2;
439                     } 
440                     else 
441                     {
442                     /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
443                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
444                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
445                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
446
447                         tmp0 = d4 << CONST_BITS;
448     
449                         tmp10 = tmp0 + tmp3;
450                         tmp13 = tmp0 - tmp3;
451                         tmp11 = tmp2 - tmp0;
452                         tmp12 = -(tmp0 + tmp2);
453                         }
454                 } 
455                 else 
456                 {
457                             if (d0) 
458                     {
459             /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
460                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
461                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
462     
463                         tmp0 = (d0 + d4) << CONST_BITS;
464                         tmp1 = (d0 - d4) << CONST_BITS;
465
466                         tmp10 = tmp0 + tmp3;
467                         tmp13 = tmp0 - tmp3;
468                         tmp11 = tmp1 + tmp2;
469                         tmp12 = tmp1 - tmp2;
470                         }
471                     else 
472                     {
473                     /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
474                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
475                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
476     
477                         tmp0 = d4 << CONST_BITS;
478  
479                         tmp10 = tmp0 + tmp3;
480                         tmp13 = tmp0 - tmp3;
481                         tmp11 = tmp2 - tmp0;
482                         tmp12 = -(tmp0 + tmp2);
483                         }
484                 }
485             } 
486             else 
487             {
488                 if (d2) 
489                 {
490                     if (d0) 
491                     {
492             /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
493                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
494                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
495                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
496
497                         tmp0 = d0 << CONST_BITS;
498     
499                         tmp10 = tmp0 + tmp3;
500                         tmp13 = tmp0 - tmp3;
501                         tmp11 = tmp0 + tmp2;
502                         tmp12 = tmp0 - tmp2;
503                     } 
504                     else 
505                     {
506                     /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
507                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
508                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
509                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
510
511                         tmp10 = tmp3;
512                         tmp13 = -tmp3;
513                         tmp11 = tmp2;
514                         tmp12 = -tmp2;
515                             }
516                 }
517                 else 
518                 {
519                     if (d0) 
520                     {
521             /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
522                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
523                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
524     
525                         tmp0 = d0 << CONST_BITS;
526     
527                         tmp10 = tmp0 + tmp3;
528                         tmp13 = tmp0 - tmp3;
529                         tmp11 = tmp0 + tmp2;
530                         tmp12 = tmp0 - tmp2;
531                     }
532                     else 
533                     {
534             /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
535                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
536                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
537
538                         tmp10 = tmp3;
539                         tmp13 = -tmp3;
540                         tmp11 = tmp2;
541                         tmp12 = -tmp2;
542                     }
543                 }
544             }
545         }
546         else
547         {
548             if (d4) 
549             {
550                 if (d2) 
551                 {
552                     if (d0) 
553                     {
554                     /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
555                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
556                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
557     
558                         tmp0 = (d0 + d4) << CONST_BITS;
559                         tmp1 = (d0 - d4) << CONST_BITS;
560     
561                         tmp10 = tmp0 + tmp3;
562                         tmp13 = tmp0 - tmp3;
563                         tmp11 = tmp1 + tmp2;
564                         tmp12 = tmp1 - tmp2;
565                     }
566                     else 
567                     {
568             /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
569                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
570                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
571     
572                         tmp0 = d4 << CONST_BITS;
573
574                         tmp10 = tmp0 + tmp3;
575                         tmp13 = tmp0 - tmp3;
576                         tmp11 = tmp2 - tmp0;
577                         tmp12 = -(tmp0 + tmp2);
578                     }
579                 }
580                 else 
581                 {
582                     if (d0)
583                     {
584             /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
585                         tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
586                         tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
587                     } 
588                     else 
589                     {
590             /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
591                         tmp10 = tmp13 = d4 << CONST_BITS;
592                         tmp11 = tmp12 = -tmp10;
593                     }
594                 }
595             }
596             else 
597             {
598                 if (d2) 
599                 {
600                     if (d0) 
601                     {
602             /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
603                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
604                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
605
606                         tmp0 = d0 << CONST_BITS;
607
608                         tmp10 = tmp0 + tmp3;
609                         tmp13 = tmp0 - tmp3;
610                         tmp11 = tmp0 + tmp2;
611                         tmp12 = tmp0 - tmp2;
612                     }
613                     else
614                     {
615             /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
616                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
617                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
618
619                         tmp10 = tmp3;
620                         tmp13 = -tmp3;
621                         tmp11 = tmp2;
622                         tmp12 = -tmp2;
623                     }
624                 }
625                 else
626                 {
627                     if (d0)
628                     {
629             /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
630                         tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
631                     } 
632                     else
633                     {
634             /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
635                         tmp10 = tmp13 = tmp11 = tmp12 = 0;
636                     }
637                 }    
638             }
639         }
640
641
642     /* Odd part per figure 8; the matrix is unitary and hence its
643      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
644      */
645
646         if (d7) 
647             {
648                 if (d5)
649             {
650                 if (d3) 
651                 {
652                     if (d1) 
653                     {
654             /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
655                         z1 = d7 + d1;
656                         z2 = d5 + d3;
657                         z3 = d7 + d3;
658                         z4 = d5 + d1;
659                         z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
660                     
661                         tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
662                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
663                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
664                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
665                         z1 = MULTIPLY(z1, - FIX(0.899976223));
666                         z2 = MULTIPLY(z2, - FIX(2.562915447));
667                         z3 = MULTIPLY(z3, - FIX(1.961570560));
668                         z4 = MULTIPLY(z4, - FIX(0.390180644));
669                     
670                         z3 += z5;
671                         z4 += z5;
672                     
673                         tmp0 += z1 + z3;
674                         tmp1 += z2 + z4;
675                         tmp2 += z2 + z3;
676                         tmp3 += z1 + z4;
677                     } 
678                     else 
679                     {
680             /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
681                         z2 = d5 + d3;
682                         z3 = d7 + d3;
683                         z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
684                     
685                         tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
686                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
687                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
688                         z1 = MULTIPLY(d7, - FIX(0.899976223));
689                         z2 = MULTIPLY(z2, - FIX(2.562915447));
690                         z3 = MULTIPLY(z3, - FIX(1.961570560));
691                         z4 = MULTIPLY(d5, - FIX(0.390180644));
692                     
693                         z3 += z5;
694                         z4 += z5;
695                     
696                         tmp0 += z1 + z3;
697                         tmp1 += z2 + z4;
698                         tmp2 += z2 + z3;
699                         tmp3 = z1 + z4;
700                         }
701                     }
702                 else 
703                 {
704                     if (d1)
705                     {
706             /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
707                         z1 = d7 + d1;
708                         z4 = d5 + d1;
709                         z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
710                     
711                         tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
712                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
713                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
714                         z1 = MULTIPLY(z1, - FIX(0.899976223));
715                         z2 = MULTIPLY(d5, - FIX(2.562915447));
716                         z3 = MULTIPLY(d7, - FIX(1.961570560));
717                         z4 = MULTIPLY(z4, - FIX(0.390180644));
718                     
719                         z3 += z5;
720                         z4 += z5;
721                     
722                         tmp0 += z1 + z3;
723                         tmp1 += z2 + z4;
724                         tmp2 = z2 + z3;
725                         tmp3 += z1 + z4;
726                     }
727                     else
728                     {
729             /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
730                         z5 = MULTIPLY(d7 + d5, FIX(1.175875602));
731
732                         tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
733                         tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
734                         z1 = MULTIPLY(d7, - FIX(0.899976223));
735                         z3 = MULTIPLY(d7, - FIX(1.961570560));
736                         z2 = MULTIPLY(d5, - FIX(2.562915447));
737                         z4 = MULTIPLY(d5, - FIX(0.390180644));
738                     
739                         z3 += z5;
740                         z4 += z5;
741                         
742                         tmp0 += z3;
743                         tmp1 += z4;
744                         tmp2 = z2 + z3;
745                         tmp3 = z1 + z4;
746                     }
747                 }
748             }
749             else 
750             {
751                 if (d3) 
752                 {
753                     if (d1) 
754                     {
755             /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
756                         z1 = d7 + d1;
757                         z3 = d7 + d3;
758                         z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
759                     
760                         tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
761                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
762                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
763                         z1 = MULTIPLY(z1, - FIX(0.899976223));
764                         z2 = MULTIPLY(d3, - FIX(2.562915447));
765                         z3 = MULTIPLY(z3, - FIX(1.961570560));
766                         z4 = MULTIPLY(d1, - FIX(0.390180644));
767                     
768                         z3 += z5;
769                         z4 += z5;
770                     
771                         tmp0 += z1 + z3;
772                         tmp1 = z2 + z4;
773                         tmp2 += z2 + z3;
774                         tmp3 += z1 + z4;
775                     }
776                     else
777                     {
778             /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
779                         z3 = d7 + d3;
780                         z5 = MULTIPLY(z3, FIX(1.175875602));
781                     
782                         tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
783                         tmp2 = MULTIPLY(d3, FIX(0.509795579));
784                         z1 = MULTIPLY(d7, - FIX(0.899976223));
785                         z2 = MULTIPLY(d3, - FIX(2.562915447));
786                         z3 = MULTIPLY(z3, - FIX2(0.785694958));
787     
788                         tmp0 += z3;
789                         tmp1 = z2 + z5;
790                         tmp2 += z3;
791                         tmp3 = z1 + z5;
792                     }
793                 } 
794                 else 
795                 {
796                     if (d1) 
797                     {
798             /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
799                         z1 = d7 + d1;
800                         z5 = MULTIPLY(z1, FIX(1.175875602));
801
802                         tmp0 = MULTIPLY(d7, - FIX2(1.662939224));
803                         tmp3 = MULTIPLY(d1, FIX2(1.111140466));
804                         z1 = MULTIPLY(z1, FIX2(0.275899379));
805                         z3 = MULTIPLY(d7, - FIX(1.961570560));
806                         z4 = MULTIPLY(d1, - FIX(0.390180644));
807     
808                         tmp0 += z1;
809                         tmp1 = z4 + z5;
810                         tmp2 = z3 + z5;
811                         tmp3 += z1;
812                     }   
813                 else 
814                     {
815             /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
816                         tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
817                         tmp1 = MULTIPLY(d7, FIX(1.175875602));
818                         tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
819                         tmp3 = MULTIPLY(d7, FIX2(0.275899379));
820                     }
821                 }
822             }
823         }
824         else
825         { 
826             if (d5)
827             {
828                 if (d3) 
829                 {
830                     if (d1)
831                     {
832             /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
833                         z2 = d5 + d3;
834                         z4 = d5 + d1;
835                         z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
836                     
837                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
838                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
839                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
840                         z1 = MULTIPLY(d1, - FIX(0.899976223));
841                         z2 = MULTIPLY(z2, - FIX(2.562915447));
842                         z3 = MULTIPLY(d3, - FIX(1.961570560));
843                         z4 = MULTIPLY(z4, - FIX(0.390180644));
844                     
845                         z3 += z5;
846                         z4 += z5;
847                     
848                         tmp0 = z1 + z3;
849                         tmp1 += z2 + z4;
850                         tmp2 += z2 + z3;
851                         tmp3 += z1 + z4;
852                     } 
853                     else 
854                     {
855             /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
856                         z2 = d5 + d3;
857                         z5 = MULTIPLY(z2, FIX(1.175875602));
858                     
859                         tmp1 = MULTIPLY(d5, FIX2(1.662939225));
860                         tmp2 = MULTIPLY(d3, FIX2(1.111140466));
861                         z2 = MULTIPLY(z2, - FIX2(1.387039845));
862                         z3 = MULTIPLY(d3, - FIX(1.961570560));
863                         z4 = MULTIPLY(d5, - FIX(0.390180644));
864                     
865                         tmp0 = z3 + z5;
866                         tmp1 += z2;
867                         tmp2 += z2;
868                         tmp3 = z4 + z5;
869                     }
870                 }
871                 else 
872                 {
873                     if (d1) 
874                     {
875             /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
876                         z4 = d5 + d1;
877                         z5 = MULTIPLY(z4, FIX(1.175875602));
878                     
879                         tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
880                         tmp3 = MULTIPLY(d1, FIX2(0.601344887));
881                         z1 = MULTIPLY(d1, - FIX(0.899976223));
882                         z2 = MULTIPLY(d5, - FIX(2.562915447));
883                         z4 = MULTIPLY(z4, FIX2(0.785694958));
884                     
885                         tmp0 = z1 + z5;
886                         tmp1 += z4;
887                         tmp2 = z2 + z5;
888                         tmp3 += z4;
889                     }
890                     else 
891                     {
892             /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
893                         tmp0 = MULTIPLY(d5, FIX(1.175875602));
894                         tmp1 = MULTIPLY(d5, FIX2(0.275899380));
895                         tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
896                         tmp3 = MULTIPLY(d5, FIX2(0.785694958));
897                     }
898                 }
899             }
900             else 
901             {
902                 if (d3)
903                 {
904                     if (d1) 
905                     {
906             /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
907                         z5 = d3 + d1;
908
909                         tmp2 = MULTIPLY(d3, - FIX(1.451774981));
910                         tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
911                         z1 = MULTIPLY(d1, FIX(1.061594337));
912                         z2 = MULTIPLY(d3, - FIX(2.172734803));
913                         z4 = MULTIPLY(z5, FIX(0.785694958));
914                         z5 = MULTIPLY(z5, FIX(1.175875602));
915                     
916                         tmp0 = z1 - z4;
917                         tmp1 = z2 + z4;
918                         tmp2 += z5;
919                         tmp3 += z5;
920                     }
921                     else 
922                     {
923             /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
924                         tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
925                         tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
926                         tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
927                         tmp3 = MULTIPLY(d3, FIX(1.175875602));
928                     }
929                 }
930                 else 
931                 {
932                     if (d1) 
933                     {
934             /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
935                         tmp0 = MULTIPLY(d1, FIX2(0.275899379));
936                         tmp1 = MULTIPLY(d1, FIX2(0.785694958));
937                         tmp2 = MULTIPLY(d1, FIX(1.175875602));
938                         tmp3 = MULTIPLY(d1, FIX2(1.387039845));
939                     }
940                     else 
941                     {
942             /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
943                         tmp0 = tmp1 = tmp2 = tmp3 = 0;
944                     }
945                 }
946             }
947         }
948
949     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
950
951         dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
952         dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
953         dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
954         dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
955         dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
956         dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
957         dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
958         dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
959
960         dataptr += DCTSIZE;             /* advance pointer to next row */
961     }
962
963   /* Pass 2: process columns. */
964   /* Note that we must descale the results by a factor of 8 == 2**3, */
965   /* and also undo the PASS1_BITS scaling. */
966
967     dataptr = p_block;
968     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) 
969     {
970     /* Columns of zeroes can be exploited in the same way as we did with rows.
971      * However, the row calculation has created many nonzero AC terms, so the
972      * simplification applies less often (typically 5% to 10% of the time).
973      * On machines with very fast multiplication, it's possible that the
974      * test takes more time than it's worth.  In that case this section
975      * may be commented out.
976      */
977
978         d0 = dataptr[DCTSIZE*0];
979         d1 = dataptr[DCTSIZE*1];
980         d2 = dataptr[DCTSIZE*2];
981         d3 = dataptr[DCTSIZE*3];
982         d4 = dataptr[DCTSIZE*4];
983         d5 = dataptr[DCTSIZE*5];
984         d6 = dataptr[DCTSIZE*6];
985         d7 = dataptr[DCTSIZE*7];
986
987     /* Even part: reverse the even part of the forward DCT. */
988     /* The rotator is sqrt(2)*c(-6). */
989         if (d6) 
990         {
991             if (d4) 
992             {
993                 if (d2)
994                 {
995                     if (d0)
996                     {
997             /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
998                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
999                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1000                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1001
1002                         tmp0 = (d0 + d4) << CONST_BITS;
1003                         tmp1 = (d0 - d4) << CONST_BITS;
1004
1005                         tmp10 = tmp0 + tmp3;
1006                         tmp13 = tmp0 - tmp3;
1007                         tmp11 = tmp1 + tmp2;
1008                         tmp12 = tmp1 - tmp2;
1009                     }
1010                     else 
1011                     {
1012             /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
1013                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1014                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1015                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1016
1017                         tmp0 = d4 << CONST_BITS;
1018
1019                         tmp10 = tmp0 + tmp3;
1020                         tmp13 = tmp0 - tmp3;
1021                         tmp11 = tmp2 - tmp0;
1022                         tmp12 = -(tmp0 + tmp2);
1023                     }
1024                 }
1025                 else 
1026                 {
1027                     if (d0)
1028                     {
1029             /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1030                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1031                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
1032
1033                         tmp0 = (d0 + d4) << CONST_BITS;
1034                         tmp1 = (d0 - d4) << CONST_BITS;
1035
1036                         tmp10 = tmp0 + tmp3;
1037                         tmp13 = tmp0 - tmp3;
1038                         tmp11 = tmp1 + tmp2;
1039                         tmp12 = tmp1 - tmp2;
1040                     }
1041                     else
1042                     {
1043             /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
1044                         tmp2 = MULTIPLY(d6, -FIX2(1.306562965));
1045                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
1046
1047                         tmp0 = d4 << CONST_BITS;
1048
1049                         tmp10 = tmp0 + tmp3;
1050                         tmp13 = tmp0 - tmp3;
1051                         tmp11 = tmp2 - tmp0;
1052                         tmp12 = -(tmp0 + tmp2);
1053                     }
1054                 }
1055             }
1056             else
1057             {
1058                 if (d2) 
1059                 {
1060                     if (d0) 
1061                     {
1062             /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
1063                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1064                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1065                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1066
1067                         tmp0 = d0 << CONST_BITS;
1068
1069                         tmp10 = tmp0 + tmp3;
1070                         tmp13 = tmp0 - tmp3;
1071                         tmp11 = tmp0 + tmp2;
1072                         tmp12 = tmp0 - tmp2;
1073                     }
1074                     else
1075                     {
1076             /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
1077                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1078                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1079                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1080
1081                         tmp10 = tmp3;
1082                         tmp13 = -tmp3;
1083                         tmp11 = tmp2;
1084                         tmp12 = -tmp2;
1085                     }
1086                 } 
1087                 else 
1088                 {
1089                     if (d0)
1090                     {
1091             /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
1092                     tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1093                     tmp3 = MULTIPLY(d6, FIX(0.541196100));
1094
1095                     tmp0 = d0 << CONST_BITS;
1096
1097                     tmp10 = tmp0 + tmp3;
1098                     tmp13 = tmp0 - tmp3;
1099                     tmp11 = tmp0 + tmp2;
1100                     tmp12 = tmp0 - tmp2;
1101                 } 
1102                 else 
1103                 {
1104             /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
1105                     tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1106                     tmp3 = MULTIPLY(d6, FIX(0.541196100));
1107                     tmp10 = tmp3;
1108                     tmp13 = -tmp3;
1109                     tmp11 = tmp2;
1110                     tmp12 = -tmp2;
1111                 }
1112             }
1113         }
1114     }
1115     else
1116     {
1117         if (d4) 
1118         {
1119             if (d2) 
1120             {
1121                     if (d0) 
1122                 {
1123             /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1124                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1125                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1126
1127                     tmp0 = (d0 + d4) << CONST_BITS;
1128                     tmp1 = (d0 - d4) << CONST_BITS;
1129
1130                     tmp10 = tmp0 + tmp3;
1131                     tmp13 = tmp0 - tmp3;
1132                     tmp11 = tmp1 + tmp2;
1133                     tmp12 = tmp1 - tmp2;
1134                 }
1135                 else 
1136                 {
1137             /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
1138                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1139                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1140
1141                     tmp0 = d4 << CONST_BITS;
1142
1143                     tmp10 = tmp0 + tmp3;
1144                     tmp13 = tmp0 - tmp3;
1145                     tmp11 = tmp2 - tmp0;
1146                     tmp12 = -(tmp0 + tmp2);
1147                 }
1148             }
1149             else 
1150             {
1151                 if (d0)
1152                 {
1153             /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1154                     tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1155                     tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1156                 }
1157                 else 
1158                 {
1159             /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
1160                     tmp10 = tmp13 = d4 << CONST_BITS;
1161                     tmp11 = tmp12 = -tmp10;
1162                 }
1163             }
1164         } 
1165         else 
1166         {
1167         if (d2) 
1168         {
1169             if (d0) 
1170             {
1171             /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
1172                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1173                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1174
1175                     tmp0 = d0 << CONST_BITS;
1176
1177                     tmp10 = tmp0 + tmp3;
1178                     tmp13 = tmp0 - tmp3;
1179                     tmp11 = tmp0 + tmp2;
1180                     tmp12 = tmp0 - tmp2;
1181             }
1182             else 
1183             {
1184             /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
1185                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1186                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1187
1188                     tmp10 = tmp3;
1189                     tmp13 = -tmp3;
1190                     tmp11 = tmp2;
1191                     tmp12 = -tmp2;
1192             }
1193         }
1194         else 
1195         {
1196             if (d0) 
1197                 {
1198             /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
1199                     tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
1200                 }
1201                 else 
1202                 {
1203             /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
1204                     tmp10 = tmp13 = tmp11 = tmp12 = 0;
1205                 }
1206             }
1207         }
1208     }
1209
1210     /* Odd part per figure 8; the matrix is unitary and hence its
1211      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
1212      */
1213     if (d7) 
1214     {
1215             if (d5) 
1216         {
1217             if (d3) 
1218             {
1219                 if (d1) 
1220                 {
1221             /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
1222                     z1 = d7 + d1;
1223                     z2 = d5 + d3;
1224                     z3 = d7 + d3;
1225                     z4 = d5 + d1;
1226                     z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
1227                     
1228                     tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
1229                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1230                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1231                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1232                     z1 = MULTIPLY(z1, - FIX(0.899976223));
1233                     z2 = MULTIPLY(z2, - FIX(2.562915447));
1234                     z3 = MULTIPLY(z3, - FIX(1.961570560));
1235                     z4 = MULTIPLY(z4, - FIX(0.390180644));
1236                     
1237                     z3 += z5;
1238                     z4 += z5;
1239                     
1240                     tmp0 += z1 + z3;
1241                     tmp1 += z2 + z4;
1242                     tmp2 += z2 + z3;
1243                     tmp3 += z1 + z4;
1244                 }
1245                 else 
1246                 {
1247             /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
1248                     z2 = d5 + d3;
1249                     z3 = d7 + d3;
1250                     z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
1251                     
1252                     tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
1253                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1254                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1255                     z1 = MULTIPLY(d7, - FIX(0.899976223));
1256                     z2 = MULTIPLY(z2, - FIX(2.562915447));
1257                     z3 = MULTIPLY(z3, - FIX(1.961570560));
1258                     z4 = MULTIPLY(d5, - FIX(0.390180644));
1259                     
1260                     z3 += z5;
1261                     z4 += z5;
1262                     
1263                     tmp0 += z1 + z3;
1264                     tmp1 += z2 + z4;
1265                     tmp2 += z2 + z3;
1266                     tmp3 = z1 + z4;
1267                 }
1268             } 
1269             else
1270             {
1271                 if (d1) 
1272                 {
1273             /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
1274                     z1 = d7 + d1;
1275                     z4 = d5 + d1;
1276                     z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
1277                     
1278                     tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
1279                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1280                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1281                     z1 = MULTIPLY(z1, - FIX(0.899976223));
1282                     z2 = MULTIPLY(d5, - FIX(2.562915447));
1283                     z3 = MULTIPLY(d7, - FIX(1.961570560));
1284                     z4 = MULTIPLY(z4, - FIX(0.390180644));
1285                     
1286                     z3 += z5;
1287                     z4 += z5;
1288                     
1289                     tmp0 += z1 + z3;
1290                     tmp1 += z2 + z4;
1291                     tmp2 = z2 + z3;
1292                     tmp3 += z1 + z4;
1293                 }
1294                 else 
1295                 {
1296             /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
1297                     z5 = MULTIPLY(d5 + d7, FIX(1.175875602));
1298
1299                     tmp0 = MULTIPLY(d7, - FIX2(0.601344887)); 
1300                     tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1301                     z1 = MULTIPLY(d7, - FIX(0.899976223));
1302                     z3 = MULTIPLY(d7, - FIX(1.961570560));
1303                     z2 = MULTIPLY(d5, - FIX(2.562915447));
1304                     z4 = MULTIPLY(d5, - FIX(0.390180644));
1305                     
1306                     z3 += z5;
1307                     z4 += z5;
1308                     
1309                     tmp0 += z3;
1310                     tmp1 += z4;
1311                     tmp2 = z2 + z3;
1312                     tmp3 = z1 + z4;
1313                 }
1314             }
1315         }
1316         else 
1317         {
1318             if (d3)
1319             {
1320                 if (d1) 
1321                 {
1322             /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
1323                     z1 = d7 + d1;
1324                     z3 = d7 + d3;
1325                     z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
1326                     
1327                     tmp0 = MULTIPLY(d7, FIX(0.298631336)); 
1328                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1329                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1330                     z1 = MULTIPLY(z1, - FIX(0.899976223));
1331                     z2 = MULTIPLY(d3, - FIX(2.562915447));
1332                     z3 = MULTIPLY(z3, - FIX(1.961570560));
1333                     z4 = MULTIPLY(d1, - FIX(0.390180644));
1334                     
1335                     z3 += z5;
1336                     z4 += z5;
1337                     
1338                     tmp0 += z1 + z3;
1339                     tmp1 = z2 + z4;
1340                     tmp2 += z2 + z3;
1341                     tmp3 += z1 + z4;
1342                 } 
1343                 else 
1344                 {
1345             /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
1346                     z3 = d7 + d3;
1347                     z5 = MULTIPLY(z3, FIX(1.175875602));
1348                     
1349                     tmp0 = MULTIPLY(d7, - FIX2(0.601344887)); 
1350                     z1 = MULTIPLY(d7, - FIX(0.899976223));
1351                     tmp2 = MULTIPLY(d3, FIX(0.509795579));
1352                     z2 = MULTIPLY(d3, - FIX(2.562915447));
1353                     z3 = MULTIPLY(z3, - FIX2(0.785694958));
1354                     
1355                     tmp0 += z3;
1356                     tmp1 = z2 + z5;
1357                     tmp2 += z3;
1358                     tmp3 = z1 + z5;
1359                 }
1360             } 
1361             else
1362             {
1363                 if (d1) 
1364                 {
1365             /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
1366                     z1 = d7 + d1;
1367                     z5 = MULTIPLY(z1, FIX(1.175875602));
1368
1369                     tmp0 = MULTIPLY(d7, - FIX2(1.662939224)); 
1370                     tmp3 = MULTIPLY(d1, FIX2(1.111140466));
1371                     z1 = MULTIPLY(z1, FIX2(0.275899379));
1372                     z3 = MULTIPLY(d7, - FIX(1.961570560));
1373                     z4 = MULTIPLY(d1, - FIX(0.390180644));
1374
1375                     tmp0 += z1;
1376                     tmp1 = z4 + z5;
1377                     tmp2 = z3 + z5;
1378                     tmp3 += z1;
1379                 }
1380                 else 
1381                 {
1382             /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
1383                     tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
1384                     tmp1 = MULTIPLY(d7, FIX(1.175875602));
1385                     tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
1386                     tmp3 = MULTIPLY(d7, FIX2(0.275899379));
1387                 }
1388             }
1389         }
1390     } 
1391     else 
1392     {
1393         if (d5) 
1394         {
1395             if (d3) 
1396             {
1397                 if (d1) 
1398                 {
1399             /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
1400                     z2 = d5 + d3;
1401                     z4 = d5 + d1;
1402                     z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
1403                     
1404                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1405                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1406                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1407                     z1 = MULTIPLY(d1, - FIX(0.899976223));
1408                     z2 = MULTIPLY(z2, - FIX(2.562915447));
1409                     z3 = MULTIPLY(d3, - FIX(1.961570560));
1410                     z4 = MULTIPLY(z4, - FIX(0.390180644));
1411                     
1412                     z3 += z5;
1413                     z4 += z5;
1414                     
1415                     tmp0 = z1 + z3;
1416                     tmp1 += z2 + z4;
1417                     tmp2 += z2 + z3;
1418                     tmp3 += z1 + z4;
1419                 }
1420                 else 
1421                 {
1422             /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
1423                     z2 = d5 + d3;
1424                     z5 = MULTIPLY(z2, FIX(1.175875602));
1425
1426                     tmp1 = MULTIPLY(d5, FIX2(1.662939225));
1427                     tmp2 = MULTIPLY(d3, FIX2(1.111140466));
1428                     z2 = MULTIPLY(z2, - FIX2(1.387039845));
1429                     z3 = MULTIPLY(d3, - FIX(1.961570560));
1430                     z4 = MULTIPLY(d5, - FIX(0.390180644));
1431                     
1432                     tmp0 = z3 + z5;
1433                     tmp1 += z2;
1434                     tmp2 += z2;
1435                     tmp3 = z4 + z5;
1436                 }
1437             } 
1438             else 
1439             {
1440                 if (d1) 
1441                 {
1442             /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
1443                     z4 = d5 + d1;
1444                     z5 = MULTIPLY(z4, FIX(1.175875602));
1445                     
1446                     tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1447                     tmp3 = MULTIPLY(d1, FIX2(0.601344887));
1448                     z1 = MULTIPLY(d1, - FIX(0.899976223));
1449                     z2 = MULTIPLY(d5, - FIX(2.562915447));
1450                     z4 = MULTIPLY(z4, FIX2(0.785694958));
1451                     
1452                     tmp0 = z1 + z5;
1453                     tmp1 += z4;
1454                     tmp2 = z2 + z5;
1455                     tmp3 += z4;
1456                 }
1457                 else
1458                 {
1459             /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
1460                     tmp0 = MULTIPLY(d5, FIX(1.175875602));
1461                     tmp1 = MULTIPLY(d5, FIX2(0.275899380));
1462                     tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
1463                     tmp3 = MULTIPLY(d5, FIX2(0.785694958));
1464                 }
1465             }
1466         }
1467         else
1468         {
1469             if (d3)
1470             {
1471                 if (d1) 
1472                 {
1473             /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
1474                     z5 = d3 + d1;
1475
1476                     tmp2 = MULTIPLY(d3, - FIX(1.451774981));
1477                     tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
1478                     z1 = MULTIPLY(d1, FIX(1.061594337));
1479                     z2 = MULTIPLY(d3, - FIX(2.172734803));
1480                     z4 = MULTIPLY(z5, FIX(0.785694958));
1481                     z5 = MULTIPLY(z5, FIX(1.175875602));
1482                     
1483                     tmp0 = z1 - z4;
1484                     tmp1 = z2 + z4;
1485                     tmp2 += z5;
1486                     tmp3 += z5;
1487                 }
1488                 else
1489                 {
1490             /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
1491                     tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
1492                     tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
1493                     tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
1494                     tmp3 = MULTIPLY(d3, FIX(1.175875602));
1495                 }
1496             }
1497             else
1498             {
1499                 if (d1)
1500                 {
1501             /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
1502                     tmp0 = MULTIPLY(d1, FIX2(0.275899379));
1503                     tmp1 = MULTIPLY(d1, FIX2(0.785694958));
1504                     tmp2 = MULTIPLY(d1, FIX(1.175875602));
1505                     tmp3 = MULTIPLY(d1, FIX2(1.387039845));
1506                 }
1507                 else 
1508                 {
1509             /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
1510                     tmp0 = tmp1 = tmp2 = tmp3 = 0;
1511                 }
1512             }
1513         }
1514     }
1515
1516     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1517
1518     dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
1519                        CONST_BITS+PASS1_BITS+3);
1520     dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
1521                        CONST_BITS+PASS1_BITS+3);
1522     dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
1523                        CONST_BITS+PASS1_BITS+3);
1524     dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
1525                        CONST_BITS+PASS1_BITS+3);
1526     dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
1527                        CONST_BITS+PASS1_BITS+3);
1528     dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
1529                        CONST_BITS+PASS1_BITS+3);
1530     dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
1531                        CONST_BITS+PASS1_BITS+3);
1532     dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
1533                        CONST_BITS+PASS1_BITS+3);
1534     
1535     dataptr++;                  /* advance pointer to next column */
1536     }
1537 #endif
1538 }
1539 #endif