git.sesse.net Git - ffmpeg/blob - libavcodec/i386/simple_idct_mmx.c
(commit by michael)
[ffmpeg] / libavcodec / i386 / simple_idct_mmx.c
1 /*
2     Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19 #include <inttypes.h>
20 #include "../dsputil.h"
21
// Fixed-point cosine factors:
//   Ck = cos(k*M_PI/16) * sqrt(2) * (1<<14) + 0.5, truncated to an integer.
#define C0 23170 // k = 0
#define C1 22725 // k = 1
#define C2 21407 // k = 2
#define C3 19266 // k = 3
#define C4 16384 // k = 4, exactly 1<<14
#define C5 12873 // k = 5
#define C6 8867  // k = 6
#define C7 4520  // k = 7

// Right shifts applied to the 32-bit sums of the row pass and the column pass.
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 20 = 14 + 6 (the original line carried only a bare "6" as its note)

// 64-bit mask that keeps 16-bit words 1 and 3 and clears words 0 and 2.
// The DC-test asm in this file ANDs it with the first source quadword before
// OR-ing in the other three and testing the result for zero.
static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;

// 0x40000 in the low 32-bit lane, 0 in the high lane.  The DC shortcut of
// DC_COND_IDCT adds it between "pslld $16" and "psrad $13", so it contributes
// 0x40000>>13 = 32 to the low lane only.
static uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;

// 8x8 block of 16-bit scratch values.
// NOTE(review): presumably the intermediate buffer between the row and column
// passes; the operand binding of the asm is outside this excerpt - confirm.
static int16_t __attribute__((aligned(8))) temp[64];

// Constant table read by the asm through byte offsets (shown on each line).
// Every line below is one 8-byte quadword, i.e. four 16-bit words.
// NOTE(review): the asm comments list words high-to-low, so a row written
// "C2, C6, C2, C6" here appears as "C6 C2 C6 C2" there.
static int16_t __attribute__((aligned(8))) coeffs[]= {
        //   0: row-pass rounder, 1<<(ROW_SHIFT-1) in both 32-bit lanes
        1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
        //   8: same rounder, plus a 1 in word 1 which adds 1<<16 to the low lane;
        //      1<<16 == ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT
        //      NOTE(review): presumably pre-adds the column-pass rounding - confirm
        1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,

         C4,  C4,  C4,  C4,     //  16
         C4, -C4,  C4, -C4,     //  24

         C2,  C6,  C2,  C6,     //  32
         C6, -C2,  C6, -C2,     //  40

         C1,  C3,  C1,  C3,     //  48
         C5,  C7,  C5,  C7,     //  56

         C3, -C7,  C3, -C7,     //  64
        -C1, -C5, -C1, -C5,     //  72

         C5, -C1,  C5, -C1,     //  80
         C7,  C3,  C7,  C3,     //  88

         C7, -C5,  C7, -C5,     //  96
         C3, -C1,  C3, -C1      // 104
};
64
65 static void unused_var_killer(){
66         int a= wm1010 + d40000;
67         temp[0]=a;
68 }
69
70 #if 0
71 static void inline idctCol (int16_t * col, int16_t *input)
72 {
73 #undef C0
74 #undef C1
75 #undef C2
76 #undef C3
77 #undef C4
78 #undef C5
79 #undef C6
80 #undef C7
81         int a0, a1, a2, a3, b0, b1, b2, b3;
82         const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
83         const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
84         const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
85         const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
86         const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
87         const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
88         const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
89         const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
90 /*
91         if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
92                 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
93                         col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
94                 return;
95         }*/
96
97 col[8*0] = input[8*0 + 0];
98 col[8*1] = input[8*2 + 0];
99 col[8*2] = input[8*0 + 1];
100 col[8*3] = input[8*2 + 1];
101 col[8*4] = input[8*4 + 0];
102 col[8*5] = input[8*6 + 0];
103 col[8*6] = input[8*4 + 1];
104 col[8*7] = input[8*6 + 1];
105
106         a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
107         a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
108         a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
109         a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
110
111         b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
112         b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
113         b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
114         b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
115
116         col[8*0] = (a0 + b0) >> COL_SHIFT;
117         col[8*1] = (a1 + b1) >> COL_SHIFT;
118         col[8*2] = (a2 + b2) >> COL_SHIFT;
119         col[8*3] = (a3 + b3) >> COL_SHIFT;
120         col[8*4] = (a3 - b3) >> COL_SHIFT;
121         col[8*5] = (a2 - b2) >> COL_SHIFT;
122         col[8*6] = (a1 - b1) >> COL_SHIFT;
123         col[8*7] = (a0 - b0) >> COL_SHIFT;
124 }
125
126 static void inline idctRow (int16_t * output, int16_t * input)
127 {
128         int16_t row[8];
129
130         int a0, a1, a2, a3, b0, b1, b2, b3;
131         const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
132         const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
133         const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
134         const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
135         const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
136         const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
137         const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
138         const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
139
140 row[0] = input[0];
141 row[2] = input[1];
142 row[4] = input[4];
143 row[6] = input[5];
144 row[1] = input[8];
145 row[3] = input[9];
146 row[5] = input[12];
147 row[7] = input[13];
148
149         if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
150                 row[0] = row[1] = row[2] = row[3] = row[4] =
151                         row[5] = row[6] = row[7] = row[0]<<3;
152         output[0] = row[0];
153         output[2] = row[1];
154         output[4] = row[2];
155         output[6] = row[3];
156         output[8] = row[4];
157         output[10] = row[5];
158         output[12] = row[6];
159         output[14] = row[7];
160                 return;
161         }
162
163         a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
164         a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
165         a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
166         a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
167
168         b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
169         b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
170         b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
171         b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
172
173         row[0] = (a0 + b0) >> ROW_SHIFT;
174         row[1] = (a1 + b1) >> ROW_SHIFT;
175         row[2] = (a2 + b2) >> ROW_SHIFT;
176         row[3] = (a3 + b3) >> ROW_SHIFT;
177         row[4] = (a3 - b3) >> ROW_SHIFT;
178         row[5] = (a2 - b2) >> ROW_SHIFT;
179         row[6] = (a1 - b1) >> ROW_SHIFT;
180         row[7] = (a0 - b0) >> ROW_SHIFT;
181
182         output[0] = row[0];
183         output[2] = row[1];
184         output[4] = row[2];
185         output[6] = row[3];
186         output[8] = row[4];
187         output[10] = row[5];
188         output[12] = row[6];
189         output[14] = row[7];
190 }
191 #endif
192
193 static inline void idct(int16_t *block)
194 {
195         asm volatile(
196 #if 0 //Alternative, simpler variant
197
198 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
199         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
200         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
201         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
202         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
203         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
204         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
205         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
206         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
207         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
208         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
209         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
210         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
211         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
212         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
213         #rounder ", %%mm4                       \n\t"\
214         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
215         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
216         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
217         "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
218         "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
219         #rounder ", %%mm0                       \n\t"\
220         "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
221         "paddd %%mm0, %%mm0                     \n\t" \
222         "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
223         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
224         "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
225         "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
226         "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
227         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
228         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
229         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
230         "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
231         "psrad $" #shift ", %%mm7               \n\t"\
232         "psrad $" #shift ", %%mm4               \n\t"\
233         "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
234         "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
235         "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
236         "psrad $" #shift ", %%mm1               \n\t"\
237         "psrad $" #shift ", %%mm2               \n\t"\
238         "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
239         "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
240         "movq %%mm7, " #dst "                   \n\t"\
241         "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
242         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
243         "movq %%mm2, 24+" #dst "                \n\t"\
244         "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
245         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
246         "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
247         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
248         "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
249         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
250         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
251         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
252         "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
253         "psrad $" #shift ", %%mm2               \n\t"\
254         "psrad $" #shift ", %%mm0               \n\t"\
255         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
256         "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
257         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
258         "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
259         "psrad $" #shift ", %%mm6               \n\t"\
260         "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
261         "movq %%mm2, 8+" #dst "                 \n\t"\
262         "psrad $" #shift ", %%mm4               \n\t"\
263         "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
264         "movq %%mm4, 16+" #dst "                \n\t"\
265
266 #define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
267         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
268         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
269         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
270         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
271         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
272         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
273         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
274         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
275         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
276         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
277         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
278         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
279         #rounder ", %%mm4                       \n\t"\
280         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
281         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
282         #rounder ", %%mm0                       \n\t"\
283         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
284         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
285         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
286         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
287         "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
288         "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
289         "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
290         "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
291         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
292         "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
293         "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
294         "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
295         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
296         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
297         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
298         "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
299         "psrad $" #shift ", %%mm7               \n\t"\
300         "psrad $" #shift ", %%mm4               \n\t"\
301         "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
302         "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
303         "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
304         "psrad $" #shift ", %%mm0               \n\t"\
305         "psrad $" #shift ", %%mm2               \n\t"\
306         "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
307         "movd %%mm7, " #dst "                   \n\t"\
308         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
309         "movd %%mm0, 16+" #dst "                \n\t"\
310         "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
311         "movd %%mm2, 96+" #dst "                \n\t"\
312         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
313         "movd %%mm4, 112+" #dst "               \n\t"\
314         "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
315         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
316         "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
317         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
318         "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
319         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
320         "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
321         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
322         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
323         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
324         "psubd %%mm4, %%mm5                     \n\t" /* a2-B2          a2-b2 */\
325         "psrad $" #shift ", %%mm2               \n\t"\
326         "psrad $" #shift ", %%mm5               \n\t"\
327         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
328         "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
329         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
330         "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
331         "psrad $" #shift ", %%mm6               \n\t"\
332         "psrad $" #shift ", %%mm4               \n\t"\
333         "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
334         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
335         "movd %%mm2, 32+" #dst "                \n\t"\
336         "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
337         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
338         "movd %%mm6, 48+" #dst "                \n\t"\
339         "movd %%mm4, 64+" #dst "                \n\t"\
340         "movd %%mm5, 80+" #dst "                \n\t"\
341
342         
343 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
344         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
345         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
346         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
347         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
348         "movq wm1010, %%mm4                     \n\t"\
349         "pand %%mm0, %%mm4                      \n\t"\
350         "por %%mm1, %%mm4                       \n\t"\
351         "por %%mm2, %%mm4                       \n\t"\
352         "por %%mm3, %%mm4                       \n\t"\
353         "packssdw %%mm4,%%mm4                   \n\t"\
354         "movd %%mm4, %%eax                      \n\t"\
355         "orl %%eax, %%eax                       \n\t"\
356         "jz 1f                                  \n\t"\
357         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
358         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
359         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
360         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
361         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
362         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
363         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
364         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
365         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
366         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
367         #rounder ", %%mm4                       \n\t"\
368         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
369         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
370         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
371         "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
372         "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
373         #rounder ", %%mm0                       \n\t"\
374         "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
375         "paddd %%mm0, %%mm0                     \n\t" \
376         "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
377         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
378         "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
379         "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
380         "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
381         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
382         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
383         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
384         "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
385         "psrad $" #shift ", %%mm7               \n\t"\
386         "psrad $" #shift ", %%mm4               \n\t"\
387         "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
388         "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
389         "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
390         "psrad $" #shift ", %%mm1               \n\t"\
391         "psrad $" #shift ", %%mm2               \n\t"\
392         "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
393         "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
394         "movq %%mm7, " #dst "                   \n\t"\
395         "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
396         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
397         "movq %%mm2, 24+" #dst "                \n\t"\
398         "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
399         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
400         "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
401         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
402         "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
403         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
404         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
405         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
406         "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
407         "psrad $" #shift ", %%mm2               \n\t"\
408         "psrad $" #shift ", %%mm0               \n\t"\
409         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
410         "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
411         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
412         "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
413         "psrad $" #shift ", %%mm6               \n\t"\
414         "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
415         "movq %%mm2, 8+" #dst "                 \n\t"\
416         "psrad $" #shift ", %%mm4               \n\t"\
417         "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
418         "movq %%mm4, 16+" #dst "                \n\t"\
419         "jmp 2f                                 \n\t"\
420         "1:                                     \n\t"\
421         "pslld $16, %%mm0                       \n\t"\
422         "#paddd d40000, %%mm0                   \n\t"\
423         "psrad $13, %%mm0                       \n\t"\
424         "packssdw %%mm0, %%mm0                  \n\t"\
425         "movq %%mm0, " #dst "                   \n\t"\
426         "movq %%mm0, 8+" #dst "                 \n\t"\
427         "movq %%mm0, 16+" #dst "                \n\t"\
428         "movq %%mm0, 24+" #dst "                \n\t"\
429         "2:                                     \n\t"
430
431
432 //IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
433 ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
434 /*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
435 ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
436 ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
437
438 DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
439 DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
440 DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
441
442
443 //IDCT(      src0,   src4,   src1,    src5,    dst, rounder, shift)
444 COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
445 COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
446 COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
447 COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
448
449 #else
450
451 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
452         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
453         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
454         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
455         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
456         "movq wm1010, %%mm4                     \n\t"\
457         "pand %%mm0, %%mm4                      \n\t"\
458         "por %%mm1, %%mm4                       \n\t"\
459         "por %%mm2, %%mm4                       \n\t"\
460         "por %%mm3, %%mm4                       \n\t"\
461         "packssdw %%mm4,%%mm4                   \n\t"\
462         "movd %%mm4, %%eax                      \n\t"\
463         "orl %%eax, %%eax                       \n\t"\
464         "jz 1f                                  \n\t"\
465         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
466         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
467         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
468         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
469         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
470         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
471         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
472         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
473         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
474         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
475         #rounder ", %%mm4                       \n\t"\
476         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
477         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
478         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
479         "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
480         "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
481         #rounder ", %%mm0                       \n\t"\
482         "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
483         "paddd %%mm0, %%mm0                     \n\t" \
484         "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
485         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
486         "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
487         "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
488         "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
489         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
490         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
491         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
492         "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
493         "psrad $" #shift ", %%mm7               \n\t"\
494         "psrad $" #shift ", %%mm4               \n\t"\
495         "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
496         "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
497         "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
498         "psrad $" #shift ", %%mm1               \n\t"\
499         "psrad $" #shift ", %%mm2               \n\t"\
500         "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
501         "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
502         "movq %%mm7, " #dst "                   \n\t"\
503         "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
504         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
505         "movq %%mm2, 24+" #dst "                \n\t"\
506         "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
507         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
508         "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
509         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
510         "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
511         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
512         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
513         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
514         "psubd %%mm4, %%mm0                     \n\t" /* a2-B2          a2-b2 */\
515         "psrad $" #shift ", %%mm2               \n\t"\
516         "psrad $" #shift ", %%mm0               \n\t"\
517         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
518         "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
519         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
520         "psubd %%mm3, %%mm4                     \n\t" /* a3-B3          a3-b3 */\
521         "psrad $" #shift ", %%mm6               \n\t"\
522         "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
523         "movq %%mm2, 8+" #dst "                 \n\t"\
524         "psrad $" #shift ", %%mm4               \n\t"\
525         "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
526         "movq %%mm4, 16+" #dst "                \n\t"\
527         "jmp 2f                                 \n\t"\
528         "1:                                     \n\t"\
529         "pslld $16, %%mm0                       \n\t"\
530         "paddd d40000, %%mm0                    \n\t"\
531         "psrad $13, %%mm0                       \n\t"\
532         "packssdw %%mm0, %%mm0                  \n\t"\
533         "movq %%mm0, " #dst "                   \n\t"\
534         "movq %%mm0, 8+" #dst "                 \n\t"\
535         "movq %%mm0, 16+" #dst "                \n\t"\
536         "movq %%mm0, 24+" #dst "                \n\t"\
537         "2:                                     \n\t"
538
/*
 * Z_COND_IDCT: row transform that is skipped when its input is all zero.
 *
 * Loads the four input quadwords src0, src4, src1 and src5, ORs them together
 * and jumps to the label `bt` when the result is zero; on that path nothing is
 * stored to dst. Otherwise it executes the same instruction sequence as
 * ROW_IDCT and stores four quadwords at dst, 8+dst, 16+dst and 24+dst.
 *
 *   rounder  instruction text without its destination; it is pasted in front
 *            of ", %%mm4" and ", %%mm0" (e.g. `paddd (%2)`).
 *   shift    immediate count of the final psrad applied to every result.
 *   bt       branch target for the all-zero case (e.g. `4f`).
 *
 * Uses %eax and mm0-mm7 as scratch. Constants are read at byte offsets
 * 16..104 from operand %2 (presumably the coeffs table; the asm operand list
 * is not next to this macro -- confirm).
 * The backslash on the last line only continues the macro onto the blank line
 * that follows it.
 */
539 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
540         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
541         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
542         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
543         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
544         "movq %%mm0, %%mm4                      \n\t" /* mm4 = src0 | src4 | src1 | src5 */\
545         "por %%mm1, %%mm4                       \n\t"\
546         "por %%mm2, %%mm4                       \n\t"\
547         "por %%mm3, %%mm4                       \n\t"\
548         "packssdw %%mm4,%%mm4                   \n\t" /* fold 64 bits into 32; stays non-zero if any dword was */\
549         "movd %%mm4, %%eax                      \n\t"\
550         "orl %%eax, %%eax                       \n\t" /* set ZF from eax */\
551         "jz " #bt "                             \n\t" /* all inputs zero: skip, dst is left untouched */\
552         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
553         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
554         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
555         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
556         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
557         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
558         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
559         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
560         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
561         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
562         #rounder ", %%mm4                       \n\t"\
563         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
564         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
565         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
566         "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
567         "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
568         #rounder ", %%mm0                       \n\t"\
569         "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
570         "paddd %%mm0, %%mm0                     \n\t" \
571         "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
572         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
573         "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
574         "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
575         "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
576         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
577         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
578         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
579         "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
580         "psrad $" #shift ", %%mm7               \n\t"\
581         "psrad $" #shift ", %%mm4               \n\t"\
582         "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
583         "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
584         "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
585         "psrad $" #shift ", %%mm1               \n\t"\
586         "psrad $" #shift ", %%mm2               \n\t"\
587         "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
588         "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
589         "movq %%mm7, " #dst "                   \n\t"\
590         "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
591         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
592         "movq %%mm2, 24+" #dst "                \n\t"\
593         "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
594         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
595         "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
596         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
597         "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
598         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
599         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
600         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
601         "psubd %%mm4, %%mm0                     \n\t" /* A2-B2          a2-b2 */\
602         "psrad $" #shift ", %%mm2               \n\t"\
603         "psrad $" #shift ", %%mm0               \n\t"\
604         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
605         "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
606         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
607         "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
608         "psrad $" #shift ", %%mm6               \n\t"\
609         "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
610         "movq %%mm2, 8+" #dst "                 \n\t"\
611         "psrad $" #shift ", %%mm4               \n\t"\
612         "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
613         "movq %%mm4, 16+" #dst "                \n\t"\
614
/*
 * ROW_IDCT: unconditional row transform.
 *
 * Loads the four input quadwords src0, src4, src1 and src5, multiplies them
 * with the constants at byte offsets 16..104 from operand %2 using pmaddwd,
 * forms the sums A0..A3 and B0..B3 named in the per-line comments, and stores
 * the shifted, saturating-packed results as four quadwords:
 *
 *      dst      A1+B1  a1+b1   A0+B0   a0+b0
 *    8+dst      A3+B3  a3+b3   A2+B2   a2+b2
 *   16+dst      A2-B2  a2-b2   A3-B3   a3-b3
 *   24+dst      A0-B0  a0-b0   A1-B1   a1-b1
 *
 *   rounder  instruction text without its destination; it is pasted in front
 *            of ", %%mm4" and ", %%mm0".
 *   shift    immediate count of the final psrad applied to every result.
 *
 * Uses mm0-mm7 as scratch. src1 is loaded a second time half-way through
 * because mm2 has been reused by then.
 * The backslash on the last line only continues the macro onto the blank line
 * that follows it.
 */
615 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
616         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
617         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
618         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
619         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
620         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
621         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
622         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
623         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
624         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
625         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
626         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
627         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
628         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
629         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
630         #rounder ", %%mm4                       \n\t"\
631         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
632         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
633         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
634         "movq 56(%2), %%mm5                     \n\t" /* C7     C5      C7      C5 */\
635         "pmaddwd %%mm3, %%mm5                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
636         #rounder ", %%mm0                       \n\t"\
637         "paddd %%mm0, %%mm1                     \n\t" /* A1             a1 */\
638         "paddd %%mm0, %%mm0                     \n\t" \
639         "psubd %%mm1, %%mm0                     \n\t" /* A2             a2 */\
640         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
641         "paddd %%mm5, %%mm7                     \n\t" /* B0             b0 */\
642         "movq 72(%2), %%mm5                     \n\t" /* -C5    -C1     -C5     -C1 */\
643         "pmaddwd %%mm3, %%mm5                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
644         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
645         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
646         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
647         "paddd %%mm2, %%mm5                     \n\t" /* B1             b1 */\
648         "psrad $" #shift ", %%mm7               \n\t"\
649         "psrad $" #shift ", %%mm4               \n\t"\
650         "movq %%mm1, %%mm2                      \n\t" /* A1             a1 */\
651         "paddd %%mm5, %%mm1                     \n\t" /* A1+B1          a1+b1 */\
652         "psubd %%mm5, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
653         "psrad $" #shift ", %%mm1               \n\t"\
654         "psrad $" #shift ", %%mm2               \n\t"\
655         "packssdw %%mm1, %%mm7                  \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
656         "packssdw %%mm4, %%mm2                  \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
657         "movq %%mm7, " #dst "                   \n\t"\
658         "movq " #src1 ", %%mm1                  \n\t" /* R3     R1      r3      r1 */\
659         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
660         "movq %%mm2, 24+" #dst "                \n\t"\
661         "pmaddwd %%mm1, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
662         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
663         "pmaddwd 96(%2), %%mm1                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
664         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
665         "movq %%mm0, %%mm2                      \n\t" /* A2             a2 */\
666         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
667         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
668         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
669         "psubd %%mm4, %%mm0                     \n\t" /* A2-B2          a2-b2 */\
670         "psrad $" #shift ", %%mm2               \n\t"\
671         "psrad $" #shift ", %%mm0               \n\t"\
672         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
673         "paddd %%mm1, %%mm3                     \n\t" /* B3             b3 */\
674         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
675         "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
676         "psrad $" #shift ", %%mm6               \n\t"\
677         "packssdw %%mm6, %%mm2                  \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
678         "movq %%mm2, 8+" #dst "                 \n\t"\
679         "psrad $" #shift ", %%mm4               \n\t"\
680         "packssdw %%mm0, %%mm4                  \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
681         "movq %%mm4, 16+" #dst "                \n\t"\
682
/*
 * First pass: four groups of 32 bytes are read from %0 and written to the
 * same offsets in %1, all with shift 11.
 * The three Z_COND_IDCT groups take an extra last argument, the label to
 * branch to when that group's 32 input bytes are all zero (the group is then
 * not written to %1):
 *   group at 32(%0) zero -> label 4
 *   group at 64(%0) zero -> label 2
 *   group at 96(%0) zero -> label 1
 * The first group adds the rounder at 8(%2), the others the one at (%2).
 */
683 //IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
684 DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
685 Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
686 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
687 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
688
/*
 * IDCT, variant using all four inputs (src0, src4, src1, src5).
 *
 * Same A0..A3 / B0..B3 arithmetic as ROW_IDCT, but the results are not paired
 * into quadwords: each result register is packed with itself (packssdw) and
 * its low 32 bits are stored with movd, 16 bytes apart:
 *
 *     0+dst  A0+B0      64+dst  A3-B3
 *    16+dst  A1+B1      80+dst  A2-B2
 *    32+dst  A2+B2      96+dst  A1-B1
 *    48+dst  A3+B3     112+dst  A0-B0
 *
 *   rounder  instruction text pasted in front of ", %%mm4" and ", %%mm0".
 *   shift    immediate count of the final psrad applied to every result.
 *
 * IDCT is redefined several times in this file; the #undef drops whichever
 * definition was active before this one.
 */
689 #undef IDCT
690 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
691         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
692         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
693         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
694         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
695         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
696         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
697         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
698         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
699         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
700         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
701         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
702         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
703         #rounder ", %%mm4                       \n\t"\
704         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
705         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
706         #rounder ", %%mm0                       \n\t"\
707         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
708         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
709         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
710         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
711         "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
712         "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
713         "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
714         "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
715         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
716         "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
717         "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
718         "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
719         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
720         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
721         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
722         "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
723         "psrad $" #shift ", %%mm7               \n\t"\
724         "psrad $" #shift ", %%mm4               \n\t"\
725         "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
726         "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
727         "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
728         "psrad $" #shift ", %%mm0               \n\t"\
729         "psrad $" #shift ", %%mm2               \n\t"\
730         "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
731         "movd %%mm7, " #dst "                   \n\t"\
732         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
733         "movd %%mm0, 16+" #dst "                \n\t"\
734         "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
735         "movd %%mm2, 96+" #dst "                \n\t"\
736         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
737         "movd %%mm4, 112+" #dst "               \n\t"\
738         "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
739         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
740         "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
741         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
742         "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
743         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
744         "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
745         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
746         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
747         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
748         "psubd %%mm4, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
749         "psrad $" #shift ", %%mm2               \n\t"\
750         "psrad $" #shift ", %%mm5               \n\t"\
751         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
752         "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
753         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
754         "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
755         "psrad $" #shift ", %%mm6               \n\t"\
756         "psrad $" #shift ", %%mm4               \n\t"\
757         "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
758         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
759         "movd %%mm2, 32+" #dst "                \n\t"\
760         "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
761         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
762         "movd %%mm6, 48+" #dst "                \n\t"\
763         "movd %%mm4, 64+" #dst "                \n\t"\
764         "movd %%mm5, 80+" #dst "                \n\t"
765
766
/*
 * Second pass for the case where none of the Z_COND_IDCT tests above branched:
 * reads from %1 and writes to %0. Each invocation advances the sources by
 * 8 bytes and dst by 4 bytes; shift is 20.
 * The rounder argument is `/nop`, which expands to the text "/nop, %%mmN" --
 * presumably discarded by the assembler so that no rounder is added here
 * (NOTE(review): confirm how the target assembler treats a leading '/').
 */
767 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
768 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
769 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
770 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
771 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
772         "jmp 9f                                 \n\t"
773
/*
 * Label 4: target of the first Z_COND_IDCT when the group at 32(%0) is all
 * zero. The two remaining groups are still tested; a zero group at 64(%0)
 * continues at label 6, a zero group at 96(%0) at label 5.
 * The ".balign" line starts with '#', so it is emitted as an assembler
 * comment rather than as an alignment directive.
 */
774         "#.balign 16                            \n\t"\
775         "4:                                     \n\t"
776 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
777 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
778
/*
 * IDCT, variant that never loads src1 (the parameter is accepted but unused).
 *
 * A0..A3 are computed from src0 and src4 as in the full variant. The B terms
 * are built from src5 alone:
 *   B0 = C7R7+C5R5,  B1 = -C5R7-C1R5,  B2 = C3R7+C7R5,  B3 = -C1R7+C3R5
 * Results are stored with movd at the same dst offsets as in the full variant
 * (0, 16, 32, ... 112 bytes from dst).
 *
 *   rounder  instruction text pasted in front of ", %%mm4" and ", %%mm0".
 *   shift    immediate count of the final psrad applied to every result.
 */
779 #undef IDCT
780 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
781         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
782         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
783         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
784         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
785         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
786         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
787         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
788         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
789         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
790         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
791         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
792         #rounder ", %%mm4                       \n\t"\
793         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
794         #rounder ", %%mm0                       \n\t"\
795         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
796         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
797         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
798         "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
799         "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
800         "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
801         "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
802         "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
803         "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
804         "paddd %%mm4, %%mm1                     \n\t" /* A0+B0          a0+b0 */\
805         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
806         "psubd %%mm1, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
807         "psrad $" #shift ", %%mm1               \n\t"\
808         "psrad $" #shift ", %%mm4               \n\t"\
809         "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
810         "paddd %%mm7, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
811         "psubd %%mm7, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
812         "psrad $" #shift ", %%mm0               \n\t"\
813         "psrad $" #shift ", %%mm2               \n\t"\
814         "packssdw %%mm1, %%mm1                  \n\t" /* A0+B0  a0+b0 */\
815         "movd %%mm1, " #dst "                   \n\t"\
816         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
817         "movd %%mm0, 16+" #dst "                \n\t"\
818         "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
819         "movd %%mm2, 96+" #dst "                \n\t"\
820         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
821         "movd %%mm4, 112+" #dst "               \n\t"\
822         "movq 88(%2), %%mm1                     \n\t" /* C3     C7      C3      C7 */\
823         "pmaddwd %%mm3, %%mm1                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
824         "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
825         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
826         "paddd %%mm1, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
827         "psubd %%mm1, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
828         "psrad $" #shift ", %%mm2               \n\t"\
829         "psrad $" #shift ", %%mm5               \n\t"\
830         "movq %%mm6, %%mm1                      \n\t" /* A3             a3 */\
831         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
832         "psubd %%mm3, %%mm1                     \n\t" /* A3-B3          a3-b3 */\
833         "psrad $" #shift ", %%mm6               \n\t"\
834         "psrad $" #shift ", %%mm1               \n\t"\
835         "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
836         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
837         "movd %%mm2, 32+" #dst "                \n\t"\
838         "packssdw %%mm1, %%mm1                  \n\t" /* A3-B3  a3-b3 */\
839         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
840         "movd %%mm6, 48+" #dst "                \n\t"\
841         "movd %%mm1, 64+" #dst "                \n\t"\
842         "movd %%mm5, 80+" #dst "                \n\t"   
843
/*
 * Second pass for the path through label 4 on which the groups at 64(%0) and
 * 96(%0) were both non-zero. The IDCT variant active here ignores its src1
 * argument (32(%1), 40(%1), ...), which is the group the first pass skipped.
 * Sources advance by 8 bytes and dst by 4 bytes per invocation; shift is 20.
 */
844 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
845 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
846 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
847 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
848 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
849         "jmp 9f                                 \n\t"
850
/*
 * Label 6: reached from label 4's first test when the group at 64(%0) is also
 * all zero. Only the group at 96(%0) is still tested; if it is zero too,
 * execution continues at label 7.
 */
851         "#.balign 16                            \n\t"\
852         "6:                                     \n\t"
853 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
854
/*
 * IDCT, variant that loads only src0 and src5 (src4 and src1 are accepted
 * but unused).
 *
 * With no src4 term, A0 and A3 are the same value (C4R4+C4R0 plus rounder,
 * kept in mm4 and mm6) and A1 and A2 are the same value (-C4R4+C4R0 plus
 * rounder, kept in mm0 and mm5). The B terms are built from src5 alone:
 *   B0 = C7R7+C5R5,  B1 = -C5R7-C1R5,  B2 = C3R7+C7R5,  B3 = -C1R7+C3R5
 * Results are stored with movd at dst + 0, 16, 32, ... 112.
 *
 *   rounder  instruction text pasted in front of ", %%mm4" and ", %%mm0".
 *   shift    immediate count of the final psrad applied to every result.
 */
855 #undef IDCT
856 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
857         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
858         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
859         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
860         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
861         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
862         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
863         #rounder ", %%mm4                       \n\t"\
864         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
865         #rounder ", %%mm0                       \n\t"\
866         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
867         "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
868         "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
869         "movq 72(%2), %%mm7                     \n\t" /* -C5    -C1     -C5     -C1 */\
870         "pmaddwd %%mm3, %%mm7                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
871         "paddd %%mm4, %%mm1                     \n\t" /* A0+B0          a0+b0 */\
872         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
873         "psubd %%mm1, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
874         "psrad $" #shift ", %%mm1               \n\t"\
875         "psrad $" #shift ", %%mm4               \n\t"\
876         "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
877         "paddd %%mm7, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
878         "psubd %%mm7, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
879         "psrad $" #shift ", %%mm0               \n\t"\
880         "psrad $" #shift ", %%mm2               \n\t"\
881         "packssdw %%mm1, %%mm1                  \n\t" /* A0+B0  a0+b0 */\
882         "movd %%mm1, " #dst "                   \n\t"\
883         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
884         "movd %%mm0, 16+" #dst "                \n\t"\
885         "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
886         "movd %%mm2, 96+" #dst "                \n\t"\
887         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
888         "movd %%mm4, 112+" #dst "               \n\t"\
889         "movq 88(%2), %%mm1                     \n\t" /* C3     C7      C3      C7 */\
890         "pmaddwd %%mm3, %%mm1                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
891         "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
892         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
893         "paddd %%mm1, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
894         "psubd %%mm1, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
895         "psrad $" #shift ", %%mm2               \n\t"\
896         "psrad $" #shift ", %%mm5               \n\t"\
897         "movq %%mm6, %%mm1                      \n\t" /* A3             a3 */\
898         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
899         "psubd %%mm3, %%mm1                     \n\t" /* A3-B3          a3-b3 */\
900         "psrad $" #shift ", %%mm6               \n\t"\
901         "psrad $" #shift ", %%mm1               \n\t"\
902         "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
903         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
904         "movd %%mm2, 32+" #dst "                \n\t"\
905         "packssdw %%mm1, %%mm1                  \n\t" /* A3-B3  a3-b3 */\
906         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
907         "movd %%mm6, 48+" #dst "                \n\t"\
908         "movd %%mm1, 64+" #dst "                \n\t"\
909         "movd %%mm5, 80+" #dst "                \n\t"   
910
911
/*
 * Second pass for the path through label 6 on which the group at 96(%0) was
 * non-zero. The IDCT variant active here ignores its src4 and src1 arguments
 * (64(%1)... and 32(%1)...), which are the two groups the first pass skipped.
 * Sources advance by 8 bytes and dst by 4 bytes per invocation; shift is 20.
 */
912 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
913 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
914 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
915 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
916 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
917         "jmp 9f                                 \n\t"
918
/*
 * Label 2: target of the first-pass Z_COND_IDCT for the group at 64(%0) when
 * that group is all zero (the group at 32(%0) was not). The group at 96(%0)
 * is still tested; if it is zero too, execution continues at label 3.
 */
919         "#.balign 16                            \n\t"\
920         "2:                                     \n\t"
921 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
922
/*
 * IDCT pass variant used by the four invocations that directly follow this
 * definition.
 *
 * Inputs: three 64-bit operands, src0, src1 and src5.  The src4 argument is
 * accepted so that all variants share one signature, but it is not referenced
 * in this body.
 *
 * Even part: pmaddwd of src0 with the constants at 16(%2) and 24(%2).  Since
 * src4 is not used, the first sum serves as both A0 and A3 (mm4, copied to
 * mm6) and the second as both A1 and A2 (mm0, copied to mm5).
 * Odd part: each of B0..B3 is the sum of one pmaddwd on src1 and one pmaddwd
 * on src5 (constants at 48..104(%2)).
 *
 * Output: A0+B0, A1+B1, A2+B2, A3+B3, A3-B3, A2-B2, A1-B1, A0-B0 are each
 * shifted right arithmetically by `shift`, packed to 16 bit with signed
 * saturation, and the low 32 bits (two 16-bit values) are stored at
 * dst + 0, 16, 32, 48, 64, 80, 96 and 112 respectively.
 *
 * `rounder` is stringized and pasted in front of ", %%mm4" and ", %%mm0", so
 * it has to supply the mnemonic and source operand of an instruction.  The
 * invocations below pass `/nop`; presumably the assembler then treats the
 * whole line as a comment (no rounding) -- TODO confirm for the assembler in
 * use.
 *
 * The coefficient names in the trailing comments are the original author's
 * annotations; the layout of the table behind %2 is not visible in this part
 * of the file.
 */
923 #undef IDCT
924 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
925         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
926         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
927         "movq " #src5 ", %%mm3                  \n\t" /* R7     R5      r7      r5 */\
928         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
929         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
930         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
931         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
932         #rounder ", %%mm4                       \n\t"\
933         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
934         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
935         #rounder ", %%mm0                       \n\t"\
936         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
937         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
938         "movq 56(%2), %%mm1                     \n\t" /* C7     C5      C7      C5 */\
939         "pmaddwd %%mm3, %%mm1                   \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
940         "pmaddwd 64(%2), %%mm2                  \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
941         "paddd %%mm1, %%mm7                     \n\t" /* B0             b0 */\
942         "movq 72(%2), %%mm1                     \n\t" /* -C5    -C1     -C5     -C1 */\
943         "pmaddwd %%mm3, %%mm1                   \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
944         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
945         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
946         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
947         "paddd %%mm2, %%mm1                     \n\t" /* B1             b1 */\
948         "psrad $" #shift ", %%mm7               \n\t"\
949         "psrad $" #shift ", %%mm4               \n\t"\
950         "movq %%mm0, %%mm2                      \n\t" /* A1             a1 */\
951         "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
952         "psubd %%mm1, %%mm2                     \n\t" /* A1-B1          a1-b1 */\
953         "psrad $" #shift ", %%mm0               \n\t"\
954         "psrad $" #shift ", %%mm2               \n\t"\
955         "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
956         "movd %%mm7, " #dst "                   \n\t"\
957         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
958         "movd %%mm0, 16+" #dst "                \n\t"\
959         "packssdw %%mm2, %%mm2                  \n\t" /* A1-B1  a1-b1 */\
960         "movd %%mm2, 96+" #dst "                \n\t"\
961         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
962         "movd %%mm4, 112+" #dst "               \n\t"\
963         "movq " #src1 ", %%mm0                  \n\t" /* R3     R1      r3      r1 */\
964         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
965         "pmaddwd %%mm0, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
966         "movq 88(%2), %%mm7                     \n\t" /* C3     C7      C3      C7 */\
967         "pmaddwd 96(%2), %%mm0                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
968         "pmaddwd %%mm3, %%mm7                   \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
969         "movq %%mm5, %%mm2                      \n\t" /* A2             a2 */\
970         "pmaddwd 104(%2), %%mm3                 \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
971         "paddd %%mm7, %%mm4                     \n\t" /* B2             b2 */\
972         "paddd %%mm4, %%mm2                     \n\t" /* A2+B2          a2+b2 */\
973         "psubd %%mm4, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
974         "psrad $" #shift ", %%mm2               \n\t"\
975         "psrad $" #shift ", %%mm5               \n\t"\
976         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
977         "paddd %%mm0, %%mm3                     \n\t" /* B3             b3 */\
978         "paddd %%mm3, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
979         "psubd %%mm3, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
980         "psrad $" #shift ", %%mm6               \n\t"\
981         "psrad $" #shift ", %%mm4               \n\t"\
982         "packssdw %%mm2, %%mm2                  \n\t" /* A2+B2  a2+b2 */\
983         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
984         "movd %%mm2, 32+" #dst "                \n\t"\
985         "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
986         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
987         "movd %%mm6, 48+" #dst "                \n\t"\
988         "movd %%mm4, 64+" #dst "                \n\t"\
989         "movd %%mm5, 80+" #dst "                \n\t"
990
991 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
992 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
993 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
994 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
995 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
996         "jmp 9f                                 \n\t"
997
998         "#.balign 16                            \n\t"\
999         "3:                                     \n\t"
/*
 * IDCT pass variant used by the four invocations that directly follow this
 * definition.
 *
 * Inputs: two 64-bit operands, src0 and src1.  The src4 and src5 arguments
 * are accepted so that all variants share one signature, but neither is
 * referenced in this body.
 *
 * Even part: pmaddwd of src0 with the constants at 16(%2) and 24(%2); the
 * first sum serves as both A0 and A3 (mm4, copied to mm6), the second as both
 * A1 and A2 (mm0, copied to mm5).
 * Odd part: each of B0..B3 is a single pmaddwd on src1 (constants at 48, 64,
 * 80 and 96(%2)).
 *
 * Output: A0+B0, A1+B1, A2+B2, A3+B3, A3-B3, A2-B2, A1-B1, A0-B0 are each
 * shifted right arithmetically by `shift`, packed to 16 bit with signed
 * saturation, and the low 32 bits (two 16-bit values) are stored at
 * dst + 0, 16, 32, 48, 64, 80, 96 and 112 respectively.
 *
 * `rounder` is stringized and pasted in front of ", %%mm4" and ", %%mm0"; the
 * invocations below pass `/nop`, presumably turning those lines into
 * assembler comments (no rounding) -- TODO confirm for the assembler in use.
 */
1000 #undef IDCT
1001 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1002         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1003         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
1004         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1005         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1006         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1007         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1008         #rounder ", %%mm4                       \n\t"\
1009         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1010         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
1011         #rounder ", %%mm0                       \n\t"\
1012         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1013         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1014         "movq 64(%2), %%mm3                     \n\t"\
1015         "pmaddwd %%mm2, %%mm3                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1016         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
1017         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
1018         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
1019         "psrad $" #shift ", %%mm7               \n\t"\
1020         "psrad $" #shift ", %%mm4               \n\t"\
1021         "movq %%mm0, %%mm1                      \n\t" /* A1             a1 */\
1022         "paddd %%mm3, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
1023         "psubd %%mm3, %%mm1                     \n\t" /* A1-B1          a1-b1 */\
1024         "psrad $" #shift ", %%mm0               \n\t"\
1025         "psrad $" #shift ", %%mm1               \n\t"\
1026         "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
1027         "movd %%mm7, " #dst "                   \n\t"\
1028         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
1029         "movd %%mm0, 16+" #dst "                \n\t"\
1030         "packssdw %%mm1, %%mm1                  \n\t" /* A1-B1  a1-b1 */\
1031         "movd %%mm1, 96+" #dst "                \n\t"\
1032         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
1033         "movd %%mm4, 112+" #dst "               \n\t"\
1034         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
1035         "pmaddwd %%mm2, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1036         "pmaddwd 96(%2), %%mm2                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1037         "movq %%mm5, %%mm1                      \n\t" /* A2             a2 */\
1038         "paddd %%mm4, %%mm1                     \n\t" /* A2+B2          a2+b2 */\
1039         "psubd %%mm4, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
1040         "psrad $" #shift ", %%mm1               \n\t"\
1041         "psrad $" #shift ", %%mm5               \n\t"\
1042         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
1043         "paddd %%mm2, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
1044         "psubd %%mm2, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
1045         "psrad $" #shift ", %%mm6               \n\t"\
1046         "psrad $" #shift ", %%mm4               \n\t"\
1047         "packssdw %%mm1, %%mm1                  \n\t" /* A2+B2  a2+b2 */\
1048         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
1049         "movd %%mm1, 32+" #dst "                \n\t"\
1050         "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
1051         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
1052         "movd %%mm6, 48+" #dst "                \n\t"\
1053         "movd %%mm4, 64+" #dst "                \n\t"\
1054         "movd %%mm5, 80+" #dst "                \n\t"
1055
1056
1057 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1058 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1059 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1060 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1061 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1062         "jmp 9f                                 \n\t"
1063
1064         "#.balign 16                            \n\t"\
1065         "5:                                     \n\t"
/*
 * IDCT pass variant used by the two invocations that directly follow this
 * definition (the other two invocation lines there are commented out).
 *
 * Inputs: src0 and src4, plus the 8 bytes following each of them
 * ("8+" src0 and "8+" src4).  The src1 and src5 arguments are accepted so that
 * all variants share one signature, but neither is referenced in this body,
 * so there is no odd (B) part here.
 *
 * For each of the two input groups:
 *   A0 = pmaddwd(src0, 16(%2)) + pmaddwd(src4, 32(%2))
 *   A3 = pmaddwd(src0, 16(%2)) - pmaddwd(src4, 32(%2))
 *   A1 = pmaddwd(src0, 24(%2)) + pmaddwd(src4, 40(%2))
 *   A2 = pmaddwd(src0, 24(%2)) - pmaddwd(src4, 40(%2))
 *
 * Output: every value is shifted right arithmetically by `shift`; the results
 * of the first group (low half) and second group (high half) are packed
 * together with signed saturation and written with 64-bit stores:
 *   A0 -> dst + 0 and dst + 112      A1 -> dst + 16 and dst + 96
 *   A2 -> dst + 32 and dst + 80      A3 -> dst + 48 and dst + 64
 *
 * `rounder` is stringized and pasted in front of a ", %%mmN" operand; the
 * invocations below pass `/nop`, presumably turning those lines into
 * assembler comments (no rounding) -- TODO confirm for the assembler in use.
 */
1066 #undef IDCT
1067 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1068         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1069         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
1070         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1071         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1072         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1073         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1074         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
1075         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1076         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
1077         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1078         #rounder ", %%mm4                       \n\t"\
1079         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1080         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
1081         #rounder ", %%mm0                       \n\t"\
1082         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
1083         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1084         "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
1085         "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
1086         "movq 8+" #src0 ", %%mm2                \n\t" /* R4     R0      r4      r0 */\
1087         "movq 8+" #src4 ", %%mm3                \n\t" /* R6     R2      r6      r2 */\
1088         "movq 16(%2), %%mm1                     \n\t" /* C4     C4      C4      C4 */\
1089         "pmaddwd %%mm2, %%mm1                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1090         "movq 24(%2), %%mm7                     \n\t" /* -C4    C4      -C4     C4 */\
1091         "pmaddwd %%mm7, %%mm2                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1092         "movq 32(%2), %%mm7                     \n\t" /* C6     C2      C6      C2 */\
1093         "pmaddwd %%mm3, %%mm7                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1094         "pmaddwd 40(%2), %%mm3                  \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1095         #rounder ", %%mm1                       \n\t"\
1096         "paddd %%mm1, %%mm7                     \n\t" /* A0             a0 */\
1097         "paddd %%mm1, %%mm1                     \n\t" /* 2C0            2c0 */\
1098         #rounder ", %%mm2                       \n\t"\
1099         "psubd %%mm7, %%mm1                     \n\t" /* A3             a3 */\
1100         "paddd %%mm2, %%mm3                     \n\t" /* A1             a1 */\
1101         "paddd %%mm2, %%mm2                     \n\t" /* 2C1            2c1 */\
1102         "psubd %%mm3, %%mm2                     \n\t" /* A2             a2 */\
1103         "psrad $" #shift ", %%mm4               \n\t"\
1104         "psrad $" #shift ", %%mm7               \n\t"\
1105         "psrad $" #shift ", %%mm3               \n\t"\
1106         "packssdw %%mm7, %%mm4                  \n\t" /* A0     a0 */\
1107         "movq %%mm4, " #dst "                   \n\t"\
1108         "psrad $" #shift ", %%mm0               \n\t"\
1109         "packssdw %%mm3, %%mm0                  \n\t" /* A1     a1 */\
1110         "movq %%mm0, 16+" #dst "                \n\t"\
1111         "movq %%mm0, 96+" #dst "                \n\t"\
1112         "movq %%mm4, 112+" #dst "               \n\t"\
1113         "psrad $" #shift ", %%mm5               \n\t"\
1114         "psrad $" #shift ", %%mm6               \n\t"\
1115         "psrad $" #shift ", %%mm2               \n\t"\
1116         "packssdw %%mm2, %%mm5                  \n\t" /* A2     a2 */\
1117         "movq %%mm5, 32+" #dst "                \n\t"\
1118         "psrad $" #shift ", %%mm1               \n\t"\
1119         "packssdw %%mm1, %%mm6                  \n\t" /* A3     a3 */\
1120         "movq %%mm6, 48+" #dst "                \n\t"\
1121         "movq %%mm6, 64+" #dst "                \n\t"\
1122         "movq %%mm5, 80+" #dst "                \n\t"   
1123         
1124
1125 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1126 IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1127 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1128 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1129 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1130         "jmp 9f                                 \n\t"
1131
1132
1133         "#.balign 16                            \n\t"\
1134         "1:                                     \n\t"
/*
 * IDCT pass variant used by the four invocations that directly follow this
 * definition.
 *
 * Inputs: three 64-bit operands, src0, src4 and src1.  The src5 argument is
 * accepted so that all variants share one signature, but it is not referenced
 * in this body.
 *
 * Even part:
 *   A0 = pmaddwd(src0, 16(%2)) + pmaddwd(src4, 32(%2))
 *   A3 = pmaddwd(src0, 16(%2)) - pmaddwd(src4, 32(%2))
 *   A1 = pmaddwd(src0, 24(%2)) + pmaddwd(src4, 40(%2))
 *   A2 = pmaddwd(src0, 24(%2)) - pmaddwd(src4, 40(%2))
 * Odd part: each of B0..B3 is a single pmaddwd on src1 (constants at 48, 64,
 * 80 and 96(%2)).
 *
 * Output: A0+B0, A1+B1, A2+B2, A3+B3, A3-B3, A2-B2, A1-B1, A0-B0 are each
 * shifted right arithmetically by `shift`, packed to 16 bit with signed
 * saturation, and the low 32 bits (two 16-bit values) are stored at
 * dst + 0, 16, 32, 48, 64, 80, 96 and 112 respectively.
 *
 * `rounder` is stringized and pasted in front of ", %%mm4" and ", %%mm0"; the
 * invocations below pass `/nop`, presumably turning those lines into
 * assembler comments (no rounding) -- TODO confirm for the assembler in use.
 */
1135 #undef IDCT
1136 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1137         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1138         "movq " #src4 ", %%mm1                  \n\t" /* R6     R2      r6      r2 */\
1139         "movq " #src1 ", %%mm2                  \n\t" /* R3     R1      r3      r1 */\
1140         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1141         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1142         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1143         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1144         "movq 32(%2), %%mm5                     \n\t" /* C6     C2      C6      C2 */\
1145         "pmaddwd %%mm1, %%mm5                   \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1146         "movq 40(%2), %%mm6                     \n\t" /* -C2    C6      -C2     C6 */\
1147         "pmaddwd %%mm6, %%mm1                   \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1148         #rounder ", %%mm4                       \n\t"\
1149         "movq %%mm4, %%mm6                      \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1150         "movq 48(%2), %%mm7                     \n\t" /* C3     C1      C3      C1 */\
1151         #rounder ", %%mm0                       \n\t"\
1152         "pmaddwd %%mm2, %%mm7                   \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1153         "paddd %%mm5, %%mm4                     \n\t" /* A0             a0 */\
1154         "psubd %%mm5, %%mm6                     \n\t" /* A3             a3 */\
1155         "movq %%mm0, %%mm5                      \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1156         "paddd %%mm1, %%mm0                     \n\t" /* A1             a1 */\
1157         "psubd %%mm1, %%mm5                     \n\t" /* A2             a2 */\
1158         "movq 64(%2), %%mm1                     \n\t"\
1159         "pmaddwd %%mm2, %%mm1                   \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1160         "paddd %%mm4, %%mm7                     \n\t" /* A0+B0          a0+b0 */\
1161         "paddd %%mm4, %%mm4                     \n\t" /* 2A0            2a0 */\
1162         "psubd %%mm7, %%mm4                     \n\t" /* A0-B0          a0-b0 */\
1163         "psrad $" #shift ", %%mm7               \n\t"\
1164         "psrad $" #shift ", %%mm4               \n\t"\
1165         "movq %%mm0, %%mm3                      \n\t" /* A1             a1 */\
1166         "paddd %%mm1, %%mm0                     \n\t" /* A1+B1          a1+b1 */\
1167         "psubd %%mm1, %%mm3                     \n\t" /* A1-B1          a1-b1 */\
1168         "psrad $" #shift ", %%mm0               \n\t"\
1169         "psrad $" #shift ", %%mm3               \n\t"\
1170         "packssdw %%mm7, %%mm7                  \n\t" /* A0+B0  a0+b0 */\
1171         "movd %%mm7, " #dst "                   \n\t"\
1172         "packssdw %%mm0, %%mm0                  \n\t" /* A1+B1  a1+b1 */\
1173         "movd %%mm0, 16+" #dst "                \n\t"\
1174         "packssdw %%mm3, %%mm3                  \n\t" /* A1-B1  a1-b1 */\
1175         "movd %%mm3, 96+" #dst "                \n\t"\
1176         "packssdw %%mm4, %%mm4                  \n\t" /* A0-B0  a0-b0 */\
1177         "movd %%mm4, 112+" #dst "               \n\t"\
1178         "movq 80(%2), %%mm4                     \n\t" /* -C1    C5      -C1     C5 */\
1179         "pmaddwd %%mm2, %%mm4                   \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1180         "pmaddwd 96(%2), %%mm2                  \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1181         "movq %%mm5, %%mm3                      \n\t" /* A2             a2 */\
1182         "paddd %%mm4, %%mm3                     \n\t" /* A2+B2          a2+b2 */\
1183         "psubd %%mm4, %%mm5                     \n\t" /* A2-B2          a2-b2 */\
1184         "psrad $" #shift ", %%mm3               \n\t"\
1185         "psrad $" #shift ", %%mm5               \n\t"\
1186         "movq %%mm6, %%mm4                      \n\t" /* A3             a3 */\
1187         "paddd %%mm2, %%mm6                     \n\t" /* A3+B3          a3+b3 */\
1188         "psubd %%mm2, %%mm4                     \n\t" /* A3-B3          a3-b3 */\
1189         "psrad $" #shift ", %%mm6               \n\t"\
1190         "packssdw %%mm3, %%mm3                  \n\t" /* A2+B2  a2+b2 */\
1191         "movd %%mm3, 32+" #dst "                \n\t"\
1192         "psrad $" #shift ", %%mm4               \n\t"\
1193         "packssdw %%mm6, %%mm6                  \n\t" /* A3+B3  a3+b3 */\
1194         "movd %%mm6, 48+" #dst "                \n\t"\
1195         "packssdw %%mm4, %%mm4                  \n\t" /* A3-B3  a3-b3 */\
1196         "packssdw %%mm5, %%mm5                  \n\t" /* A2-B2  a2-b2 */\
1197         "movd %%mm4, 64+" #dst "                \n\t"\
1198         "movd %%mm5, 80+" #dst "                \n\t"
1199         
1200
1201 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1202 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1203 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1204 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1205 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1206         "jmp 9f                                 \n\t"
1207
1208
1209         "#.balign 16                            \n\t"
1210         "7:                                     \n\t"
/*
 * IDCT pass variant used by the two invocations that directly follow this
 * definition (the other two invocation lines there are commented out).
 *
 * Inputs: src0 and the 8 bytes following it ("8+" src0).  The src4, src1 and
 * src5 arguments are accepted so that all variants share one signature, but
 * none of them is referenced in this body.
 *
 * Only two distinct results are produced per input group:
 *   P = pmaddwd(src0, 16(%2)) >> shift      (labelled A0 below)
 *   Q = pmaddwd(src0, 24(%2)) >> shift      (labelled A1 below)
 * The values of the first group (low half) and second group (high half) are
 * packed together with signed saturation and written with 64-bit stores:
 *   P -> dst + 0, 48, 64 and 112
 *   Q -> dst + 16, 32, 80 and 96
 *
 * NOTE(review): the load of 32(%2) into mm7 is not read by any later
 * instruction of this macro.
 *
 * `rounder` is stringized and pasted in front of a ", %%mmN" operand; the
 * invocations below pass `/nop`, presumably turning those lines into
 * assembler comments (no rounding) -- TODO confirm for the assembler in use.
 */
1211 #undef IDCT
1212 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1213         "movq " #src0 ", %%mm0                  \n\t" /* R4     R0      r4      r0 */\
1214         "movq 16(%2), %%mm4                     \n\t" /* C4     C4      C4      C4 */\
1215         "pmaddwd %%mm0, %%mm4                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1216         "movq 24(%2), %%mm5                     \n\t" /* -C4    C4      -C4     C4 */\
1217         "pmaddwd %%mm5, %%mm0                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1218         #rounder ", %%mm4                       \n\t"\
1219         #rounder ", %%mm0                       \n\t"\
1220         "psrad $" #shift ", %%mm4               \n\t"\
1221         "psrad $" #shift ", %%mm0               \n\t"\
1222         "movq 8+" #src0 ", %%mm2                \n\t" /* R4     R0      r4      r0 */\
1223         "movq 16(%2), %%mm1                     \n\t" /* C4     C4      C4      C4 */\
1224         "pmaddwd %%mm2, %%mm1                   \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1225         "movq 24(%2), %%mm7                     \n\t" /* -C4    C4      -C4     C4 */\
1226         "pmaddwd %%mm7, %%mm2                   \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1227         "movq 32(%2), %%mm7                     \n\t" /* C6     C2      C6      C2 -- not used below */\
1228         #rounder ", %%mm1                       \n\t"\
1229         #rounder ", %%mm2                       \n\t"\
1230         "psrad $" #shift ", %%mm1               \n\t"\
1231         "packssdw %%mm1, %%mm4                  \n\t" /* A0     a0 */\
1232         "movq %%mm4, " #dst "                   \n\t"\
1233         "psrad $" #shift ", %%mm2               \n\t"\
1234         "packssdw %%mm2, %%mm0                  \n\t" /* A1     a1 */\
1235         "movq %%mm0, 16+" #dst "                \n\t"\
1236         "movq %%mm0, 96+" #dst "                \n\t"\
1237         "movq %%mm4, 112+" #dst "               \n\t"\
1238         "movq %%mm0, 32+" #dst "                \n\t"\
1239         "movq %%mm4, 48+" #dst "                \n\t"\
1240         "movq %%mm4, 64+" #dst "                \n\t"\
1241         "movq %%mm0, 80+" #dst "                \n\t"   
1242
1243 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1244 IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1245 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1246 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1247 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1248
1249
1250 #endif
1251
1252 /*
1253 Input
1254  00 40 04 44 20 60 24 64
1255  10 30 14 34 50 70 54 74
1256  01 41 03 43 21 61 23 63
1257  11 31 13 33 51 71 53 73
1258  02 42 06 46 22 62 26 66
1259  12 32 16 36 52 72 56 76
1260  05 45 07 47 25 65 27 67
1261  15 35 17 37 55 75 57 77
1262   
1263 Temp
1264  00 04 10 14 20 24 30 34
1265  40 44 50 54 60 64 70 74
1266  01 03 11 13 21 23 31 33
1267  41 43 51 53 61 63 71 73
1268  02 06 12 16 22 26 32 36
1269  42 46 52 56 62 66 72 76
1270  05 07 15 17 25 27 35 37
1271  45 47 55 57 65 67 75 77
1272 */
1273
1274 "9: \n\t"
1275                 :: "r" (block), "r" (temp), "r" (coeffs)
1276                 : "%eax"
1277         );
1278 }
1279
/*
 * Exported entry point for the MMX inverse DCT in this file.
 *
 * Hands `block` to the file-local idct() routine.  That routine's assembly
 * binds `block` as operand %0 and stores its results through it at byte
 * offsets up to 127, so `block` must point to at least 64 int16_t values
 * and is overwritten in place.
 */
void simple_idct_mmx(int16_t *block)
{
        int16_t *const coefficients = block;

        idct(coefficients);
}