]> git.sesse.net Git - vlc/blob - src/video_decoder/vdec_idct.c
d9a9c3308239ebc162e557e97174ebdac198b1a9
[vlc] / src / video_decoder / vdec_idct.c
1 /*****************************************************************************
2  * vdec_idct.c : IDCT functions
3  * (c)1999 VideoLAN
4  *****************************************************************************/
5
6 /*****************************************************************************
7  * Preamble
8  *****************************************************************************/
9
10 #include <errno.h>
11 #include <stdlib.h>
12 #include <stdio.h>
13 #include <unistd.h>
14 #include <string.h>
15 #include <sys/uio.h>
16
17 #include "config.h"
18 #include "common.h"
19 #include "mtime.h"
20 #include "vlc_thread.h"
21
22 #include "intf_msg.h"
23 #include "debug.h"                 /* XXX?? temporaire, requis par netlist.h */
24
25 #include "input.h"
26 #include "input_netlist.h"
27 #include "decoder_fifo.h"
28 #include "video.h"
29 #include "video_output.h"
30
31 #include "vdec_idct.h"
32 #include "video_decoder.h"
33 #include "vdec_motion.h"
34
35 #include "vpar_blocks.h"
36 #include "vpar_headers.h"
37 #include "vpar_synchro.h"
38 #include "video_parser.h"
39 #include "video_fifo.h"
40
41 /*
42  * Local prototypes
43  */
44
45 /* Our current implementation is a fast DCT, we might move to a fast DFT or
46  * an MMX DCT in the future. */
47
48 /*****************************************************************************
49  * vdec_InitIDCT : initialize datas for vdec_SparceIDCT
50  * vdec_SparseIDCT : IDCT function for sparse matrices
51  *****************************************************************************/
52
53 void vdec_InitIDCT (vdec_thread_t * p_vdec)
54 {
55     int i;
56
57     dctelem_t * p_pre = p_vdec->p_pre_idct;
58     memset( p_pre, 0, 64*64*sizeof(dctelem_t) );
59
60     for( i=0 ; i < 64 ; i++ )
61     {
62         p_pre[i*64+i] = 1 << SPARSE_SCALE_FACTOR;
63         vdec_IDCT( p_vdec, &p_pre[i*64], 0) ;
64     }
65     return;
66 }
67
68 void vdec_SparseIDCT (vdec_thread_t * p_vdec, dctelem_t * p_block,
69                       int i_sparse_pos)
70 {
71     short int val;
72     int * dp;
73     int v;
74     short int * p_dest;
75     short int * p_source;
76     int coeff, rr;
77
78     /* If DC Coefficient. */
79     if ( i_sparse_pos == 0 )
80     {
81         dp=(int *)p_block;
82         val=RIGHT_SHIFT((*p_block + 4), 3);
83         /* Compute int to assign.  This speeds things up a bit */
84         v = ((val & 0xffff) | (val << 16));
85         dp[0] = v;     dp[1] = v;     dp[2] = v;     dp[3] = v;
86         dp[4] = v;     dp[5] = v;     dp[6] = v;     dp[7] = v;
87         dp[8] = v;     dp[9] = v;     dp[10] = v;    dp[11] = v;
88         dp[12] = v;    dp[13] = v;    dp[14] = v;    dp[15] = v;
89         dp[16] = v;    dp[17] = v;    dp[18] = v;    dp[19] = v;
90         dp[20] = v;    dp[21] = v;    dp[22] = v;    dp[23] = v;
91         dp[24] = v;    dp[25] = v;    dp[26] = v;    dp[27] = v;
92         dp[28] = v;    dp[29] = v;    dp[30] = v;    dp[31] = v;
93         return;
94     }
95     /* Some other coefficient. */
96     p_dest = (s16*)p_block;
97     p_source = (s16*)&p_vdec->p_pre_idct[i_sparse_pos];
98     coeff = (int)p_dest[i_sparse_pos];
99     for( rr=0 ; rr < 4 ; rr++ )
100     {
101         p_dest[0] = (p_source[0] * coeff) >> SPARSE_SCALE_FACTOR;
102         p_dest[1] = (p_source[1] * coeff) >> SPARSE_SCALE_FACTOR;
103         p_dest[2] = (p_source[2] * coeff) >> SPARSE_SCALE_FACTOR;
104         p_dest[3] = (p_source[3] * coeff) >> SPARSE_SCALE_FACTOR;
105         p_dest[4] = (p_source[4] * coeff) >> SPARSE_SCALE_FACTOR;
106         p_dest[5] = (p_source[5] * coeff) >> SPARSE_SCALE_FACTOR;
107         p_dest[6] = (p_source[6] * coeff) >> SPARSE_SCALE_FACTOR;
108         p_dest[7] = (p_source[7] * coeff) >> SPARSE_SCALE_FACTOR;
109         p_dest[8] = (p_source[8] * coeff) >> SPARSE_SCALE_FACTOR;
110         p_dest[9] = (p_source[9] * coeff) >> SPARSE_SCALE_FACTOR;
111         p_dest[10] = (p_source[10] * coeff) >> SPARSE_SCALE_FACTOR;
112         p_dest[11] = (p_source[11] * coeff) >> SPARSE_SCALE_FACTOR;
113         p_dest[12] = (p_source[12] * coeff) >> SPARSE_SCALE_FACTOR;
114         p_dest[13] = (p_source[13] * coeff) >> SPARSE_SCALE_FACTOR;
115         p_dest[14] = (p_source[14] * coeff) >> SPARSE_SCALE_FACTOR;
116         p_dest[15] = (p_source[15] * coeff) >> SPARSE_SCALE_FACTOR;
117         p_dest += 16;
118         p_source += 16;
119     }
120     return;
121 }
122
123
124 /*****************************************************************************
125  * vdec_IDCT : IDCT function for normal matrices
126  *****************************************************************************/
127
128 #ifndef HAVE_MMX
129 void vdec_IDCT( vdec_thread_t * p_vdec, dctelem_t * p_block, int i_idontcare )
130 {
131 #if 0
132     /* dct classique: pour tester la meilleure entre la classique et la */
133     /* no classique */
134     s32 tmp0, tmp1, tmp2, tmp3;
135     s32 tmp10, tmp11, tmp12, tmp13;
136     s32 z1, z2, z3, z4, z5;
137     dctelem_t * dataptr;
138     int rowctr;
139     SHIFT_TEMPS
140
141   /* Pass 1: process rows. */
142   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
143   /* furthermore, we scale the results by 2**PASS1_BITS. */
144
145     dataptr = p_block;
146     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
147     {
148     /* Due to quantization, we will usually find that many of the input
149      * coefficients are zero, especially the AC terms.  We can exploit this
150      * by short-circuiting the IDCT calculation for any row in which all
151      * the AC terms are zero.  In that case each output is equal to the
152      * DC coefficient (with scale factor as needed).
153      * With typical images and quantization tables, half or more of the
154      * row DCT calculations can be simplified this way.
155      */
156
157         if ((dataptr[1] | dataptr[2] | dataptr[3] | dataptr[4] |
158                 dataptr[5] | dataptr[6] | dataptr[7]) == 0)
159         {
160       /* AC terms all zero */
161             dctelem_t dcval = (dctelem_t) (dataptr[0] << PASS1_BITS);
162
163             dataptr[0] = dcval;
164             dataptr[1] = dcval;
165             dataptr[2] = dcval;
166             dataptr[3] = dcval;
167             dataptr[4] = dcval;
168             dataptr[5] = dcval;
169             dataptr[6] = dcval;
170             dataptr[7] = dcval;
171
172             dataptr += DCTSIZE; /* advance pointer to next row */
173             continue;
174         }
175
176     /* Even part: reverse the even part of the forward DCT. */
177     /* The rotator is sqrt(2)*c(-6). */
178
179         z2 = (s32) dataptr[2];
180         z3 = (s32) dataptr[6];
181
182         z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
183         tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
184         tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
185
186         tmp0 = ((s32) dataptr[0] + (s32) dataptr[4]) << CONST_BITS;
187         tmp1 = ((s32) dataptr[0] - (s32) dataptr[4]) << CONST_BITS;
188
189         tmp10 = tmp0 + tmp3;
190         tmp13 = tmp0 - tmp3;
191         tmp11 = tmp1 + tmp2;
192         tmp12 = tmp1 - tmp2;
193
194     /* Odd part per figure 8; the matrix is unitary and hence its
195      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
196      */
197
198         tmp0 = (s32) dataptr[7];
199         tmp1 = (s32) dataptr[5];
200         tmp2 = (s32) dataptr[3];
201         tmp3 = (s32) dataptr[1];
202
203         z1 = tmp0 + tmp3;
204         z2 = tmp1 + tmp2;
205         z3 = tmp0 + tmp2;
206         z4 = tmp1 + tmp3;
207         z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
208
209         tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
210         tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
211         tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
212         tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
213         z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
214         z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
215         z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
216         z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
217
218         z3 += z5;
219         z4 += z5;
220
221         tmp0 += z1 + z3;
222         tmp1 += z2 + z4;
223         tmp2 += z2 + z3;
224         tmp3 += z1 + z4;
225
226     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
227
228         dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
229         dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
230         dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
231         dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
232         dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
233         dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
234         dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
235         dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
236
237         dataptr += DCTSIZE;             /* advance pointer to next row */
238     }
239
240   /* Pass 2: process columns. */
241   /* Note that we must descale the results by a factor of 8 == 2**3, */
242   /* and also undo the PASS1_BITS scaling. */
243
244     dataptr = p_block;
245     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
246     {
247     /* Columns of zeroes can be exploited in the same way as we did with rows.
248      * However, the row calculation has created many nonzero AC terms, so the
249      * simplification applies less often (typically 5% to 10% of the time).
250      * On machines with very fast multiplication, it's possible that the
251      * test takes more time than it's worth.  In that case this section
252      * may be commented out.
253      */
254
255 #ifndef NO_ZERO_COLUMN_TEST /*ajoute un test mais evite des calculs */
256         if ((dataptr[DCTSIZE*1] | dataptr[DCTSIZE*2] | dataptr[DCTSIZE*3] |
257             dataptr[DCTSIZE*4] | dataptr[DCTSIZE*5] | dataptr[DCTSIZE*6] |
258             dataptr[DCTSIZE*7]) == 0)
259         {
260       /* AC terms all zero */
261             dctelem_t dcval = (dctelem_t) DESCALE((s32) dataptr[0], PASS1_BITS+3);
262
263             dataptr[DCTSIZE*0] = dcval;
264             dataptr[DCTSIZE*1] = dcval;
265             dataptr[DCTSIZE*2] = dcval;
266             dataptr[DCTSIZE*3] = dcval;
267             dataptr[DCTSIZE*4] = dcval;
268             dataptr[DCTSIZE*5] = dcval;
269             dataptr[DCTSIZE*6] = dcval;
270             dataptr[DCTSIZE*7] = dcval;
271
272             dataptr++;          /* advance pointer to next column */
273             continue;
274         }
275 #endif
276
277     /* Even part: reverse the even part of the forward DCT. */
278     /* The rotator is sqrt(2)*c(-6). */
279
280         z2 = (s32) dataptr[DCTSIZE*2];
281         z3 = (s32) dataptr[DCTSIZE*6];
282
283         z1 = MULTIPLY(z2 + z3, FIX(0.541196100));
284         tmp2 = z1 + MULTIPLY(z3, - FIX(1.847759065));
285         tmp3 = z1 + MULTIPLY(z2, FIX(0.765366865));
286
287         tmp0 = ((s32) dataptr[DCTSIZE*0] + (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
288         tmp1 = ((s32) dataptr[DCTSIZE*0] - (s32) dataptr[DCTSIZE*4]) << CONST_BITS;
289
290         tmp10 = tmp0 + tmp3;
291         tmp13 = tmp0 - tmp3;
292         tmp11 = tmp1 + tmp2;
293         tmp12 = tmp1 - tmp2;
294
295     /* Odd part per figure 8; the matrix is unitary and hence its
296      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
297      */
298
299         tmp0 = (s32) dataptr[DCTSIZE*7];
300         tmp1 = (s32) dataptr[DCTSIZE*5];
301         tmp2 = (s32) dataptr[DCTSIZE*3];
302         tmp3 = (s32) dataptr[DCTSIZE*1];
303
304         z1 = tmp0 + tmp3;
305         z2 = tmp1 + tmp2;
306         z3 = tmp0 + tmp2;
307         z4 = tmp1 + tmp3;
308         z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
309
310         tmp0 = MULTIPLY(tmp0, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
311         tmp1 = MULTIPLY(tmp1, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
312         tmp2 = MULTIPLY(tmp2, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
313         tmp3 = MULTIPLY(tmp3, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
314         z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
315         z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
316         z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
317         z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
318
319         z3 += z5;
320         z4 += z5;
321
322         tmp0 += z1 + z3;
323         tmp1 += z2 + z4;
324         tmp2 += z2 + z3;
325         tmp3 += z1 + z4;
326
327     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
328
329         dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
330                                            CONST_BITS+PASS1_BITS+3);
331         dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
332                                            CONST_BITS+PASS1_BITS+3);
333         dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
334                                            CONST_BITS+PASS1_BITS+3);
335         dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
336                                            CONST_BITS+PASS1_BITS+3);
337         dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
338                                            CONST_BITS+PASS1_BITS+3);
339         dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
340                                            CONST_BITS+PASS1_BITS+3);
341         dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
342                                            CONST_BITS+PASS1_BITS+3);
343         dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
344                                            CONST_BITS+PASS1_BITS+3);
345
346         dataptr++;                      /* advance pointer to next column */
347     }
348 #endif
349
350 #if 1  /*dct avec non classique*/
351
352     s32 tmp0, tmp1, tmp2, tmp3;
353     s32 tmp10, tmp11, tmp12, tmp13;
354     s32 z1, z2, z3, z4, z5;
355     s32 d0, d1, d2, d3, d4, d5, d6, d7;
356     dctelem_t * dataptr;
357     int rowctr;
358
359     SHIFT_TEMPS
360
361     /* Pass 1: process rows. */
362     /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
363     /* furthermore, we scale the results by 2**PASS1_BITS. */
364
365     dataptr = p_block;
366
367     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
368     {
369         /* Due to quantization, we will usually find that many of the input
370          * coefficients are zero, especially the AC terms.  We can exploit this
371          * by short-circuiting the IDCT calculation for any row in which all
372          * the AC terms are zero.  In that case each output is equal to the
373          * DC coefficient (with scale factor as needed).
374          * With typical images and quantization tables, half or more of the
375          * row DCT calculations can be simplified this way.
376          */
377
378         register int * idataptr = (int*)dataptr;
379         d0 = dataptr[0];
380         d1 = dataptr[1];
381         if ( (d1 == 0) && ((idataptr[1] | idataptr[2] | idataptr[3]) == 0) )
382         {
383       /* AC terms all zero */
384             if (d0)
385             {
386       /* Compute a 32 bit value to assign. */
387                 dctelem_t dcval = (dctelem_t) (d0 << PASS1_BITS);
388                 register int v = (dcval & 0xffff) | (dcval << 16);
389
390                 idataptr[0] = v;
391                 idataptr[1] = v;
392                 idataptr[2] = v;
393                 idataptr[3] = v;
394             }
395
396             dataptr += DCTSIZE; /* advance pointer to next row */
397             continue;
398         }
399         d2 = dataptr[2];
400         d3 = dataptr[3];
401         d4 = dataptr[4];
402         d5 = dataptr[5];
403         d6 = dataptr[6];
404         d7 = dataptr[7];
405
406     /* Even part: reverse the even part of the forward DCT. */
407     /* The rotator is sqrt(2)*c(-6). */
408         if (d6)
409         {
410             if (d4)
411             {
412                 if (d2)
413                 {
414                     if (d0)
415                     {
416             /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
417                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
418                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
419                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
420
421                         tmp0 = (d0 + d4) << CONST_BITS;
422                         tmp1 = (d0 - d4) << CONST_BITS;
423
424                         tmp10 = tmp0 + tmp3;
425                         tmp13 = tmp0 - tmp3;
426                         tmp11 = tmp1 + tmp2;
427                         tmp12 = tmp1 - tmp2;
428                     }
429                     else
430                     {
431                     /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
432                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
433                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
434                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
435
436                         tmp0 = d4 << CONST_BITS;
437
438                         tmp10 = tmp0 + tmp3;
439                         tmp13 = tmp0 - tmp3;
440                         tmp11 = tmp2 - tmp0;
441                         tmp12 = -(tmp0 + tmp2);
442                         }
443                 }
444                 else
445                 {
446                     if (d0)
447                     {
448             /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
449                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
450                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
451
452                         tmp0 = (d0 + d4) << CONST_BITS;
453                         tmp1 = (d0 - d4) << CONST_BITS;
454
455                         tmp10 = tmp0 + tmp3;
456                         tmp13 = tmp0 - tmp3;
457                         tmp11 = tmp1 + tmp2;
458                         tmp12 = tmp1 - tmp2;
459                         }
460                     else
461                     {
462                     /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
463                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
464                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
465
466                         tmp0 = d4 << CONST_BITS;
467
468                         tmp10 = tmp0 + tmp3;
469                         tmp13 = tmp0 - tmp3;
470                         tmp11 = tmp2 - tmp0;
471                         tmp12 = -(tmp0 + tmp2);
472                         }
473                 }
474             }
475             else
476             {
477                 if (d2)
478                 {
479                     if (d0)
480                     {
481             /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
482                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
483                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
484                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
485
486                         tmp0 = d0 << CONST_BITS;
487
488                         tmp10 = tmp0 + tmp3;
489                         tmp13 = tmp0 - tmp3;
490                         tmp11 = tmp0 + tmp2;
491                         tmp12 = tmp0 - tmp2;
492                     }
493                     else
494                     {
495                     /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
496                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
497                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
498                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
499
500                         tmp10 = tmp3;
501                         tmp13 = -tmp3;
502                         tmp11 = tmp2;
503                         tmp12 = -tmp2;
504                             }
505                 }
506                 else
507                 {
508                     if (d0)
509                     {
510             /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
511                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
512                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
513
514                         tmp0 = d0 << CONST_BITS;
515
516                         tmp10 = tmp0 + tmp3;
517                         tmp13 = tmp0 - tmp3;
518                         tmp11 = tmp0 + tmp2;
519                         tmp12 = tmp0 - tmp2;
520                     }
521                     else
522                     {
523             /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
524                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
525                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
526
527                         tmp10 = tmp3;
528                         tmp13 = -tmp3;
529                         tmp11 = tmp2;
530                         tmp12 = -tmp2;
531                     }
532                 }
533             }
534         }
535         else
536         {
537             if (d4)
538             {
539                 if (d2)
540                 {
541                     if (d0)
542                     {
543                     /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
544                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
545                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
546
547                         tmp0 = (d0 + d4) << CONST_BITS;
548                         tmp1 = (d0 - d4) << CONST_BITS;
549
550                         tmp10 = tmp0 + tmp3;
551                         tmp13 = tmp0 - tmp3;
552                         tmp11 = tmp1 + tmp2;
553                         tmp12 = tmp1 - tmp2;
554                     }
555                     else
556                     {
557             /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
558                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
559                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
560
561                         tmp0 = d4 << CONST_BITS;
562
563                         tmp10 = tmp0 + tmp3;
564                         tmp13 = tmp0 - tmp3;
565                         tmp11 = tmp2 - tmp0;
566                         tmp12 = -(tmp0 + tmp2);
567                     }
568                 }
569                 else
570                 {
571                     if (d0)
572                     {
573             /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
574                         tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
575                         tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
576                     }
577                     else
578                     {
579             /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
580                         tmp10 = tmp13 = d4 << CONST_BITS;
581                         tmp11 = tmp12 = -tmp10;
582                     }
583                 }
584             }
585             else
586             {
587                 if (d2)
588                 {
589                     if (d0)
590                     {
591             /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
592                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
593                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
594
595                         tmp0 = d0 << CONST_BITS;
596
597                         tmp10 = tmp0 + tmp3;
598                         tmp13 = tmp0 - tmp3;
599                         tmp11 = tmp0 + tmp2;
600                         tmp12 = tmp0 - tmp2;
601                     }
602                     else
603                     {
604             /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
605                         tmp2 = MULTIPLY(d2, FIX(0.541196100));
606                         tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
607
608                         tmp10 = tmp3;
609                         tmp13 = -tmp3;
610                         tmp11 = tmp2;
611                         tmp12 = -tmp2;
612                     }
613                 }
614                 else
615                 {
616                     if (d0)
617                     {
618             /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
619                         tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
620                     }
621                     else
622                     {
623             /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
624                         tmp10 = tmp13 = tmp11 = tmp12 = 0;
625                     }
626                 }
627             }
628         }
629
630
631     /* Odd part per figure 8; the matrix is unitary and hence its
632      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
633      */
634
635         if (d7)
636             {
637             if (d5)
638             {
639                 if (d3)
640                 {
641                     if (d1)
642                     {
643             /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
644                         z1 = d7 + d1;
645                         z2 = d5 + d3;
646                         z3 = d7 + d3;
647                         z4 = d5 + d1;
648                         z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
649
650                         tmp0 = MULTIPLY(d7, FIX(0.298631336));
651                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
652                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
653                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
654                         z1 = MULTIPLY(z1, - FIX(0.899976223));
655                         z2 = MULTIPLY(z2, - FIX(2.562915447));
656                         z3 = MULTIPLY(z3, - FIX(1.961570560));
657                         z4 = MULTIPLY(z4, - FIX(0.390180644));
658
659                         z3 += z5;
660                         z4 += z5;
661
662                         tmp0 += z1 + z3;
663                         tmp1 += z2 + z4;
664                         tmp2 += z2 + z3;
665                         tmp3 += z1 + z4;
666                     }
667                     else
668                     {
669             /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
670                         z2 = d5 + d3;
671                         z3 = d7 + d3;
672                         z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
673
674                         tmp0 = MULTIPLY(d7, FIX(0.298631336));
675                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
676                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
677                         z1 = MULTIPLY(d7, - FIX(0.899976223));
678                         z2 = MULTIPLY(z2, - FIX(2.562915447));
679                         z3 = MULTIPLY(z3, - FIX(1.961570560));
680                         z4 = MULTIPLY(d5, - FIX(0.390180644));
681
682                         z3 += z5;
683                         z4 += z5;
684
685                         tmp0 += z1 + z3;
686                         tmp1 += z2 + z4;
687                         tmp2 += z2 + z3;
688                         tmp3 = z1 + z4;
689                         }
690                     }
691                 else
692                 {
693                     if (d1)
694                     {
695             /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
696                         z1 = d7 + d1;
697                         z4 = d5 + d1;
698                         z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
699
700                         tmp0 = MULTIPLY(d7, FIX(0.298631336));
701                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
702                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
703                         z1 = MULTIPLY(z1, - FIX(0.899976223));
704                         z2 = MULTIPLY(d5, - FIX(2.562915447));
705                         z3 = MULTIPLY(d7, - FIX(1.961570560));
706                         z4 = MULTIPLY(z4, - FIX(0.390180644));
707
708                         z3 += z5;
709                         z4 += z5;
710
711                         tmp0 += z1 + z3;
712                         tmp1 += z2 + z4;
713                         tmp2 = z2 + z3;
714                         tmp3 += z1 + z4;
715                     }
716                     else
717                     {
718             /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
719                         z5 = MULTIPLY(d7 + d5, FIX(1.175875602));
720
721                         tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
722                         tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
723                         z1 = MULTIPLY(d7, - FIX(0.899976223));
724                         z3 = MULTIPLY(d7, - FIX(1.961570560));
725                         z2 = MULTIPLY(d5, - FIX(2.562915447));
726                         z4 = MULTIPLY(d5, - FIX(0.390180644));
727
728                         z3 += z5;
729                         z4 += z5;
730
731                         tmp0 += z3;
732                         tmp1 += z4;
733                         tmp2 = z2 + z3;
734                         tmp3 = z1 + z4;
735                     }
736                 }
737             }
738             else
739             {
740                 if (d3)
741                 {
742                     if (d1)
743                     {
744             /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
745                         z1 = d7 + d1;
746                         z3 = d7 + d3;
747                         z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
748
749                         tmp0 = MULTIPLY(d7, FIX(0.298631336));
750                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
751                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
752                         z1 = MULTIPLY(z1, - FIX(0.899976223));
753                         z2 = MULTIPLY(d3, - FIX(2.562915447));
754                         z3 = MULTIPLY(z3, - FIX(1.961570560));
755                         z4 = MULTIPLY(d1, - FIX(0.390180644));
756
757                         z3 += z5;
758                         z4 += z5;
759
760                         tmp0 += z1 + z3;
761                         tmp1 = z2 + z4;
762                         tmp2 += z2 + z3;
763                         tmp3 += z1 + z4;
764                     }
765                     else
766                     {
767             /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
768                         z3 = d7 + d3;
769                         z5 = MULTIPLY(z3, FIX(1.175875602));
770
771                         tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
772                         tmp2 = MULTIPLY(d3, FIX(0.509795579));
773                         z1 = MULTIPLY(d7, - FIX(0.899976223));
774                         z2 = MULTIPLY(d3, - FIX(2.562915447));
775                         z3 = MULTIPLY(z3, - FIX2(0.785694958));
776
777                         tmp0 += z3;
778                         tmp1 = z2 + z5;
779                         tmp2 += z3;
780                         tmp3 = z1 + z5;
781                     }
782                 }
783                 else
784                 {
785                     if (d1)
786                     {
787             /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
788                         z1 = d7 + d1;
789                         z5 = MULTIPLY(z1, FIX(1.175875602));
790
791                         tmp0 = MULTIPLY(d7, - FIX2(1.662939224));
792                         tmp3 = MULTIPLY(d1, FIX2(1.111140466));
793                         z1 = MULTIPLY(z1, FIX2(0.275899379));
794                         z3 = MULTIPLY(d7, - FIX(1.961570560));
795                         z4 = MULTIPLY(d1, - FIX(0.390180644));
796
797                         tmp0 += z1;
798                         tmp1 = z4 + z5;
799                         tmp2 = z3 + z5;
800                         tmp3 += z1;
801                     }
802                 else
803                     {
804             /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
805                         tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
806                         tmp1 = MULTIPLY(d7, FIX(1.175875602));
807                         tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
808                         tmp3 = MULTIPLY(d7, FIX2(0.275899379));
809                     }
810                 }
811             }
812         }
813         else
814         {
815             if (d5)
816             {
817                 if (d3)
818                 {
819                     if (d1)
820                     {
821             /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
822                         z2 = d5 + d3;
823                         z4 = d5 + d1;
824                         z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
825
826                         tmp1 = MULTIPLY(d5, FIX(2.053119869));
827                         tmp2 = MULTIPLY(d3, FIX(3.072711026));
828                         tmp3 = MULTIPLY(d1, FIX(1.501321110));
829                         z1 = MULTIPLY(d1, - FIX(0.899976223));
830                         z2 = MULTIPLY(z2, - FIX(2.562915447));
831                         z3 = MULTIPLY(d3, - FIX(1.961570560));
832                         z4 = MULTIPLY(z4, - FIX(0.390180644));
833
834                         z3 += z5;
835                         z4 += z5;
836
837                         tmp0 = z1 + z3;
838                         tmp1 += z2 + z4;
839                         tmp2 += z2 + z3;
840                         tmp3 += z1 + z4;
841                     }
842                     else
843                     {
844             /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
845                         z2 = d5 + d3;
846                         z5 = MULTIPLY(z2, FIX(1.175875602));
847
848                         tmp1 = MULTIPLY(d5, FIX2(1.662939225));
849                         tmp2 = MULTIPLY(d3, FIX2(1.111140466));
850                         z2 = MULTIPLY(z2, - FIX2(1.387039845));
851                         z3 = MULTIPLY(d3, - FIX(1.961570560));
852                         z4 = MULTIPLY(d5, - FIX(0.390180644));
853
854                         tmp0 = z3 + z5;
855                         tmp1 += z2;
856                         tmp2 += z2;
857                         tmp3 = z4 + z5;
858                     }
859                 }
860                 else
861                 {
862                     if (d1)
863                     {
864             /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
865                         z4 = d5 + d1;
866                         z5 = MULTIPLY(z4, FIX(1.175875602));
867
868                         tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
869                         tmp3 = MULTIPLY(d1, FIX2(0.601344887));
870                         z1 = MULTIPLY(d1, - FIX(0.899976223));
871                         z2 = MULTIPLY(d5, - FIX(2.562915447));
872                         z4 = MULTIPLY(z4, FIX2(0.785694958));
873
874                         tmp0 = z1 + z5;
875                         tmp1 += z4;
876                         tmp2 = z2 + z5;
877                         tmp3 += z4;
878                     }
879                     else
880                     {
881             /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
882                         tmp0 = MULTIPLY(d5, FIX(1.175875602));
883                         tmp1 = MULTIPLY(d5, FIX2(0.275899380));
884                         tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
885                         tmp3 = MULTIPLY(d5, FIX2(0.785694958));
886                     }
887                 }
888             }
889             else
890             {
891                 if (d3)
892                 {
893                     if (d1)
894                     {
895             /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
896                         z5 = d3 + d1;
897
898                         tmp2 = MULTIPLY(d3, - FIX(1.451774981));
899                         tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
900                         z1 = MULTIPLY(d1, FIX(1.061594337));
901                         z2 = MULTIPLY(d3, - FIX(2.172734803));
902                         z4 = MULTIPLY(z5, FIX(0.785694958));
903                         z5 = MULTIPLY(z5, FIX(1.175875602));
904
905                         tmp0 = z1 - z4;
906                         tmp1 = z2 + z4;
907                         tmp2 += z5;
908                         tmp3 += z5;
909                     }
910                     else
911                     {
912             /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
913                         tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
914                         tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
915                         tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
916                         tmp3 = MULTIPLY(d3, FIX(1.175875602));
917                     }
918                 }
919                 else
920                 {
921                     if (d1)
922                     {
923             /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
924                         tmp0 = MULTIPLY(d1, FIX2(0.275899379));
925                         tmp1 = MULTIPLY(d1, FIX2(0.785694958));
926                         tmp2 = MULTIPLY(d1, FIX(1.175875602));
927                         tmp3 = MULTIPLY(d1, FIX2(1.387039845));
928                     }
929                     else
930                     {
931             /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
932                         tmp0 = tmp1 = tmp2 = tmp3 = 0;
933                     }
934                 }
935             }
936         }
937
938     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
939
940         dataptr[0] = (dctelem_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
941         dataptr[7] = (dctelem_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
942         dataptr[1] = (dctelem_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
943         dataptr[6] = (dctelem_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
944         dataptr[2] = (dctelem_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
945         dataptr[5] = (dctelem_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
946         dataptr[3] = (dctelem_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
947         dataptr[4] = (dctelem_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
948
949         dataptr += DCTSIZE;              /* advance pointer to next row */
950     }
951
952   /* Pass 2: process columns. */
953   /* Note that we must descale the results by a factor of 8 == 2**3, */
954   /* and also undo the PASS1_BITS scaling. */
955
956     dataptr = p_block;
957     for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--)
958     {
959     /* Columns of zeroes can be exploited in the same way as we did with rows.
960      * However, the row calculation has created many nonzero AC terms, so the
961      * simplification applies less often (typically 5% to 10% of the time).
962      * On machines with very fast multiplication, it's possible that the
963      * test takes more time than it's worth.  In that case this section
964      * may be commented out.
965      */
966
967         d0 = dataptr[DCTSIZE*0];
968         d1 = dataptr[DCTSIZE*1];
969         d2 = dataptr[DCTSIZE*2];
970         d3 = dataptr[DCTSIZE*3];
971         d4 = dataptr[DCTSIZE*4];
972         d5 = dataptr[DCTSIZE*5];
973         d6 = dataptr[DCTSIZE*6];
974         d7 = dataptr[DCTSIZE*7];
975
976     /* Even part: reverse the even part of the forward DCT. */
977     /* The rotator is sqrt(2)*c(-6). */
978         if (d6)
979         {
980             if (d4)
981             {
982                 if (d2)
983                 {
984                     if (d0)
985                     {
986             /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
987                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
988                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
989                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
990
991                         tmp0 = (d0 + d4) << CONST_BITS;
992                         tmp1 = (d0 - d4) << CONST_BITS;
993
994                         tmp10 = tmp0 + tmp3;
995                         tmp13 = tmp0 - tmp3;
996                         tmp11 = tmp1 + tmp2;
997                         tmp12 = tmp1 - tmp2;
998                     }
999                     else
1000                     {
1001             /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
1002                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1003                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1004                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1005
1006                         tmp0 = d4 << CONST_BITS;
1007
1008                         tmp10 = tmp0 + tmp3;
1009                         tmp13 = tmp0 - tmp3;
1010                         tmp11 = tmp2 - tmp0;
1011                         tmp12 = -(tmp0 + tmp2);
1012                     }
1013                 }
1014                 else
1015                 {
1016                     if (d0)
1017                     {
1018             /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1019                         tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1020                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
1021
1022                         tmp0 = (d0 + d4) << CONST_BITS;
1023                         tmp1 = (d0 - d4) << CONST_BITS;
1024
1025                         tmp10 = tmp0 + tmp3;
1026                         tmp13 = tmp0 - tmp3;
1027                         tmp11 = tmp1 + tmp2;
1028                         tmp12 = tmp1 - tmp2;
1029                     }
1030                     else
1031                     {
1032             /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
1033                         tmp2 = MULTIPLY(d6, -FIX2(1.306562965));
1034                         tmp3 = MULTIPLY(d6, FIX(0.541196100));
1035
1036                         tmp0 = d4 << CONST_BITS;
1037
1038                         tmp10 = tmp0 + tmp3;
1039                         tmp13 = tmp0 - tmp3;
1040                         tmp11 = tmp2 - tmp0;
1041                         tmp12 = -(tmp0 + tmp2);
1042                     }
1043                 }
1044             }
1045             else
1046             {
1047                 if (d2)
1048                 {
1049                     if (d0)
1050                     {
1051             /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
1052                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1053                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1054                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1055
1056                         tmp0 = d0 << CONST_BITS;
1057
1058                         tmp10 = tmp0 + tmp3;
1059                         tmp13 = tmp0 - tmp3;
1060                         tmp11 = tmp0 + tmp2;
1061                         tmp12 = tmp0 - tmp2;
1062                     }
1063                     else
1064                     {
1065             /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
1066                         z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1067                         tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1068                         tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1069
1070                         tmp10 = tmp3;
1071                         tmp13 = -tmp3;
1072                         tmp11 = tmp2;
1073                         tmp12 = -tmp2;
1074                     }
1075                 }
1076                 else
1077                 {
1078                     if (d0)
1079                     {
1080             /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
1081                     tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1082                     tmp3 = MULTIPLY(d6, FIX(0.541196100));
1083
1084                     tmp0 = d0 << CONST_BITS;
1085
1086                     tmp10 = tmp0 + tmp3;
1087                     tmp13 = tmp0 - tmp3;
1088                     tmp11 = tmp0 + tmp2;
1089                     tmp12 = tmp0 - tmp2;
1090                 }
1091                 else
1092                 {
1093             /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
1094                     tmp2 = MULTIPLY(d6, - FIX2(1.306562965));
1095                     tmp3 = MULTIPLY(d6, FIX(0.541196100));
1096                     tmp10 = tmp3;
1097                     tmp13 = -tmp3;
1098                     tmp11 = tmp2;
1099                     tmp12 = -tmp2;
1100                 }
1101             }
1102         }
1103     }
1104     else
1105     {
1106         if (d4)
1107         {
1108             if (d2)
1109             {
1110                 if (d0)
1111                 {
1112             /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1113                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1114                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1115
1116                     tmp0 = (d0 + d4) << CONST_BITS;
1117                     tmp1 = (d0 - d4) << CONST_BITS;
1118
1119                     tmp10 = tmp0 + tmp3;
1120                     tmp13 = tmp0 - tmp3;
1121                     tmp11 = tmp1 + tmp2;
1122                     tmp12 = tmp1 - tmp2;
1123                 }
1124                 else
1125                 {
1126             /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
1127                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1128                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1129
1130                     tmp0 = d4 << CONST_BITS;
1131
1132                     tmp10 = tmp0 + tmp3;
1133                     tmp13 = tmp0 - tmp3;
1134                     tmp11 = tmp2 - tmp0;
1135                     tmp12 = -(tmp0 + tmp2);
1136                 }
1137             }
1138             else
1139             {
1140                 if (d0)
1141                 {
1142             /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1143                     tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1144                     tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1145                 }
1146                 else
1147                 {
1148             /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
1149                     tmp10 = tmp13 = d4 << CONST_BITS;
1150                     tmp11 = tmp12 = -tmp10;
1151                 }
1152             }
1153         }
1154         else
1155         {
1156         if (d2)
1157         {
1158             if (d0)
1159             {
1160             /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
1161                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1162                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1163
1164                     tmp0 = d0 << CONST_BITS;
1165
1166                     tmp10 = tmp0 + tmp3;
1167                     tmp13 = tmp0 - tmp3;
1168                     tmp11 = tmp0 + tmp2;
1169                     tmp12 = tmp0 - tmp2;
1170             }
1171             else
1172             {
1173             /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
1174                     tmp2 = MULTIPLY(d2, FIX(0.541196100));
1175                     tmp3 = MULTIPLY(d2, (FIX(1.306562965) + .5));
1176
1177                     tmp10 = tmp3;
1178                     tmp13 = -tmp3;
1179                     tmp11 = tmp2;
1180                     tmp12 = -tmp2;
1181             }
1182         }
1183         else
1184         {
1185             if (d0)
1186                 {
1187             /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
1188                     tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
1189                 }
1190                 else
1191                 {
1192             /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
1193                     tmp10 = tmp13 = tmp11 = tmp12 = 0;
1194                 }
1195             }
1196         }
1197     }
1198
1199     /* Odd part per figure 8; the matrix is unitary and hence its
1200      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
1201      */
1202     if (d7)
1203     {
1204         if (d5)
1205         {
1206             if (d3)
1207             {
1208                 if (d1)
1209                 {
1210             /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
1211                     z1 = d7 + d1;
1212                     z2 = d5 + d3;
1213                     z3 = d7 + d3;
1214                     z4 = d5 + d1;
1215                     z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
1216
1217                     tmp0 = MULTIPLY(d7, FIX(0.298631336));
1218                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1219                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1220                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1221                     z1 = MULTIPLY(z1, - FIX(0.899976223));
1222                     z2 = MULTIPLY(z2, - FIX(2.562915447));
1223                     z3 = MULTIPLY(z3, - FIX(1.961570560));
1224                     z4 = MULTIPLY(z4, - FIX(0.390180644));
1225
1226                     z3 += z5;
1227                     z4 += z5;
1228
1229                     tmp0 += z1 + z3;
1230                     tmp1 += z2 + z4;
1231                     tmp2 += z2 + z3;
1232                     tmp3 += z1 + z4;
1233                 }
1234                 else
1235                 {
1236             /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
1237                     z2 = d5 + d3;
1238                     z3 = d7 + d3;
1239                     z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
1240
1241                     tmp0 = MULTIPLY(d7, FIX(0.298631336));
1242                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1243                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1244                     z1 = MULTIPLY(d7, - FIX(0.899976223));
1245                     z2 = MULTIPLY(z2, - FIX(2.562915447));
1246                     z3 = MULTIPLY(z3, - FIX(1.961570560));
1247                     z4 = MULTIPLY(d5, - FIX(0.390180644));
1248
1249                     z3 += z5;
1250                     z4 += z5;
1251
1252                     tmp0 += z1 + z3;
1253                     tmp1 += z2 + z4;
1254                     tmp2 += z2 + z3;
1255                     tmp3 = z1 + z4;
1256                 }
1257             }
1258             else
1259             {
1260                 if (d1)
1261                 {
1262             /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
1263                     z1 = d7 + d1;
1264                     z4 = d5 + d1;
1265                     z5 = MULTIPLY(d7 + z4, FIX(1.175875602));
1266
1267                     tmp0 = MULTIPLY(d7, FIX(0.298631336));
1268                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1269                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1270                     z1 = MULTIPLY(z1, - FIX(0.899976223));
1271                     z2 = MULTIPLY(d5, - FIX(2.562915447));
1272                     z3 = MULTIPLY(d7, - FIX(1.961570560));
1273                     z4 = MULTIPLY(z4, - FIX(0.390180644));
1274
1275                     z3 += z5;
1276                     z4 += z5;
1277
1278                     tmp0 += z1 + z3;
1279                     tmp1 += z2 + z4;
1280                     tmp2 = z2 + z3;
1281                     tmp3 += z1 + z4;
1282                 }
1283                 else
1284                 {
1285             /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
1286                     z5 = MULTIPLY(d5 + d7, FIX(1.175875602));
1287
1288                     tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
1289                     tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1290                     z1 = MULTIPLY(d7, - FIX(0.899976223));
1291                     z3 = MULTIPLY(d7, - FIX(1.961570560));
1292                     z2 = MULTIPLY(d5, - FIX(2.562915447));
1293                     z4 = MULTIPLY(d5, - FIX(0.390180644));
1294
1295                     z3 += z5;
1296                     z4 += z5;
1297
1298                     tmp0 += z3;
1299                     tmp1 += z4;
1300                     tmp2 = z2 + z3;
1301                     tmp3 = z1 + z4;
1302                 }
1303             }
1304         }
1305         else
1306         {
1307             if (d3)
1308             {
1309                 if (d1)
1310                 {
1311             /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
1312                     z1 = d7 + d1;
1313                     z3 = d7 + d3;
1314                     z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
1315
1316                     tmp0 = MULTIPLY(d7, FIX(0.298631336));
1317                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1318                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1319                     z1 = MULTIPLY(z1, - FIX(0.899976223));
1320                     z2 = MULTIPLY(d3, - FIX(2.562915447));
1321                     z3 = MULTIPLY(z3, - FIX(1.961570560));
1322                     z4 = MULTIPLY(d1, - FIX(0.390180644));
1323
1324                     z3 += z5;
1325                     z4 += z5;
1326
1327                     tmp0 += z1 + z3;
1328                     tmp1 = z2 + z4;
1329                     tmp2 += z2 + z3;
1330                     tmp3 += z1 + z4;
1331                 }
1332                 else
1333                 {
1334             /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
1335                     z3 = d7 + d3;
1336                     z5 = MULTIPLY(z3, FIX(1.175875602));
1337
1338                     tmp0 = MULTIPLY(d7, - FIX2(0.601344887));
1339                     z1 = MULTIPLY(d7, - FIX(0.899976223));
1340                     tmp2 = MULTIPLY(d3, FIX(0.509795579));
1341                     z2 = MULTIPLY(d3, - FIX(2.562915447));
1342                     z3 = MULTIPLY(z3, - FIX2(0.785694958));
1343
1344                     tmp0 += z3;
1345                     tmp1 = z2 + z5;
1346                     tmp2 += z3;
1347                     tmp3 = z1 + z5;
1348                 }
1349             }
1350             else
1351             {
1352                 if (d1)
1353                 {
1354             /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
1355                     z1 = d7 + d1;
1356                     z5 = MULTIPLY(z1, FIX(1.175875602));
1357
1358                     tmp0 = MULTIPLY(d7, - FIX2(1.662939224));
1359                     tmp3 = MULTIPLY(d1, FIX2(1.111140466));
1360                     z1 = MULTIPLY(z1, FIX2(0.275899379));
1361                     z3 = MULTIPLY(d7, - FIX(1.961570560));
1362                     z4 = MULTIPLY(d1, - FIX(0.390180644));
1363
1364                     tmp0 += z1;
1365                     tmp1 = z4 + z5;
1366                     tmp2 = z3 + z5;
1367                     tmp3 += z1;
1368                 }
1369                 else
1370                 {
1371             /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
1372                     tmp0 = MULTIPLY(d7, - FIX2(1.387039845));
1373                     tmp1 = MULTIPLY(d7, FIX(1.175875602));
1374                     tmp2 = MULTIPLY(d7, - FIX2(0.785694958));
1375                     tmp3 = MULTIPLY(d7, FIX2(0.275899379));
1376                 }
1377             }
1378         }
1379     }
1380     else
1381     {
1382         if (d5)
1383         {
1384             if (d3)
1385             {
1386                 if (d1)
1387                 {
1388             /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
1389                     z2 = d5 + d3;
1390                     z4 = d5 + d1;
1391                     z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
1392
1393                     tmp1 = MULTIPLY(d5, FIX(2.053119869));
1394                     tmp2 = MULTIPLY(d3, FIX(3.072711026));
1395                     tmp3 = MULTIPLY(d1, FIX(1.501321110));
1396                     z1 = MULTIPLY(d1, - FIX(0.899976223));
1397                     z2 = MULTIPLY(z2, - FIX(2.562915447));
1398                     z3 = MULTIPLY(d3, - FIX(1.961570560));
1399                     z4 = MULTIPLY(z4, - FIX(0.390180644));
1400
1401                     z3 += z5;
1402                     z4 += z5;
1403
1404                     tmp0 = z1 + z3;
1405                     tmp1 += z2 + z4;
1406                     tmp2 += z2 + z3;
1407                     tmp3 += z1 + z4;
1408                 }
1409                 else
1410                 {
1411             /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
1412                     z2 = d5 + d3;
1413                     z5 = MULTIPLY(z2, FIX(1.175875602));
1414
1415                     tmp1 = MULTIPLY(d5, FIX2(1.662939225));
1416                     tmp2 = MULTIPLY(d3, FIX2(1.111140466));
1417                     z2 = MULTIPLY(z2, - FIX2(1.387039845));
1418                     z3 = MULTIPLY(d3, - FIX(1.961570560));
1419                     z4 = MULTIPLY(d5, - FIX(0.390180644));
1420
1421                     tmp0 = z3 + z5;
1422                     tmp1 += z2;
1423                     tmp2 += z2;
1424                     tmp3 = z4 + z5;
1425                 }
1426             }
1427             else
1428             {
1429                 if (d1)
1430                 {
1431             /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
1432                     z4 = d5 + d1;
1433                     z5 = MULTIPLY(z4, FIX(1.175875602));
1434
1435                     tmp1 = MULTIPLY(d5, - FIX2(0.509795578));
1436                     tmp3 = MULTIPLY(d1, FIX2(0.601344887));
1437                     z1 = MULTIPLY(d1, - FIX(0.899976223));
1438                     z2 = MULTIPLY(d5, - FIX(2.562915447));
1439                     z4 = MULTIPLY(z4, FIX2(0.785694958));
1440
1441                     tmp0 = z1 + z5;
1442                     tmp1 += z4;
1443                     tmp2 = z2 + z5;
1444                     tmp3 += z4;
1445                 }
1446                 else
1447                 {
1448             /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
1449                     tmp0 = MULTIPLY(d5, FIX(1.175875602));
1450                     tmp1 = MULTIPLY(d5, FIX2(0.275899380));
1451                     tmp2 = MULTIPLY(d5, - FIX2(1.387039845));
1452                     tmp3 = MULTIPLY(d5, FIX2(0.785694958));
1453                 }
1454             }
1455         }
1456         else
1457         {
1458             if (d3)
1459             {
1460                 if (d1)
1461                 {
1462             /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
1463                     z5 = d3 + d1;
1464
1465                     tmp2 = MULTIPLY(d3, - FIX(1.451774981));
1466                     tmp3 = MULTIPLY(d1, (FIX(0.211164243) - 1));
1467                     z1 = MULTIPLY(d1, FIX(1.061594337));
1468                     z2 = MULTIPLY(d3, - FIX(2.172734803));
1469                     z4 = MULTIPLY(z5, FIX(0.785694958));
1470                     z5 = MULTIPLY(z5, FIX(1.175875602));
1471
1472                     tmp0 = z1 - z4;
1473                     tmp1 = z2 + z4;
1474                     tmp2 += z5;
1475                     tmp3 += z5;
1476                 }
1477                 else
1478                 {
1479             /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
1480                     tmp0 = MULTIPLY(d3, - FIX2(0.785694958));
1481                     tmp1 = MULTIPLY(d3, - FIX2(1.387039845));
1482                     tmp2 = MULTIPLY(d3, - FIX2(0.275899379));
1483                     tmp3 = MULTIPLY(d3, FIX(1.175875602));
1484                 }
1485             }
1486             else
1487             {
1488                 if (d1)
1489                 {
1490             /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
1491                     tmp0 = MULTIPLY(d1, FIX2(0.275899379));
1492                     tmp1 = MULTIPLY(d1, FIX2(0.785694958));
1493                     tmp2 = MULTIPLY(d1, FIX(1.175875602));
1494                     tmp3 = MULTIPLY(d1, FIX2(1.387039845));
1495                 }
1496                 else
1497                 {
1498             /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
1499                     tmp0 = tmp1 = tmp2 = tmp3 = 0;
1500                 }
1501             }
1502         }
1503     }
1504
1505     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1506
1507     dataptr[DCTSIZE*0] = (dctelem_t) DESCALE(tmp10 + tmp3,
1508                        CONST_BITS+PASS1_BITS+3);
1509     dataptr[DCTSIZE*7] = (dctelem_t) DESCALE(tmp10 - tmp3,
1510                        CONST_BITS+PASS1_BITS+3);
1511     dataptr[DCTSIZE*1] = (dctelem_t) DESCALE(tmp11 + tmp2,
1512                        CONST_BITS+PASS1_BITS+3);
1513     dataptr[DCTSIZE*6] = (dctelem_t) DESCALE(tmp11 - tmp2,
1514                        CONST_BITS+PASS1_BITS+3);
1515     dataptr[DCTSIZE*2] = (dctelem_t) DESCALE(tmp12 + tmp1,
1516                        CONST_BITS+PASS1_BITS+3);
1517     dataptr[DCTSIZE*5] = (dctelem_t) DESCALE(tmp12 - tmp1,
1518                        CONST_BITS+PASS1_BITS+3);
1519     dataptr[DCTSIZE*3] = (dctelem_t) DESCALE(tmp13 + tmp0,
1520                        CONST_BITS+PASS1_BITS+3);
1521     dataptr[DCTSIZE*4] = (dctelem_t) DESCALE(tmp13 - tmp0,
1522                        CONST_BITS+PASS1_BITS+3);
1523
1524     dataptr++;             /* advance pointer to next column */
1525     }
1526 #endif
1527 }
1528 #endif