]> git.sesse.net Git - ffmpeg/blob - libavcodec/i386/simple_idct_mmx.c
Chinese AVS decoder
[ffmpeg] / libavcodec / i386 / simple_idct_mmx.c
1 /*
2  * Simple IDCT MMX
3  *
4  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 #include "../dsputil.h"
21 #include "../simple_idct.h"
22
23 /*
24 23170.475006
25 22725.260826
26 21406.727617
27 19265.545870
28 16384.000000
29 12872.826198
30 8866.956905
31 4520.335430
32 */
/*
 * Fixed-point cosine factors Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14), rounded to
 * the nearest integer (the unrounded values are listed in the comment above).
 * C4 is the exception: the exact value is 16384 and the enabled branch
 * deliberately uses 16383 (rounded down with -0.5 instead of +0.5).
 * NOTE(review): presumably so the 16-bit multiplies have headroom - confirm.
 */
33 #define C0 23170 // i=0: cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
34 #define C1 22725 // i=1: cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
35 #define C2 21407 // i=2: cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
36 #define C3 19266 // i=3: cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37 #if 0
38 #define C4 16384 // i=4: cos(4*M_PI/16)*sqrt(2)*(1<<14), exact (disabled)
39 #else
40 #define C4 16383 // i=4: cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5, one below the exact 16384
41 #endif
42 #define C5 12873 // i=5: cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
43 #define C6 8867  // i=6: cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44 #define C7 4520  // i=7: cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45
/* Final right shifts of the row pass and the column pass (see the reference
   idctRow/idctCol code below, which shifts by exactly these amounts). */
46 #define ROW_SHIFT 11
47 #define COL_SHIFT 20 // 6 (NOTE(review): meaning of the "6" is not evident from this file)
48
/*
 * 16-bit word mask that keeps words 1 and 3 of a quadword and clears words 0
 * and 2. The DC_COND_* asm macros AND it with the src0 quadword (whose words
 * 0 and 2 are annotated r0/R0 there) before OR-ing in the other three source
 * quadwords, so the "jz 1f" shortcut is taken when everything except those
 * two words is zero.
 */
49 static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
/*
 * Constant 0x40000 in the low 32-bit lane only (high lane is 0). DC_COND_IDCT
 * adds it with paddd in its shortcut path, between "pslld $16" and
 * "psrad $13"; in DC_COND_ROW_IDCT the same add is commented out.
 */
50 static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
51
/*
 * Constant table read by the MMX code, one 8-byte (four int16_t) row per
 * entry. Values are written here in memory order (lowest word first); the
 * comments inside the asm macros list the same words highest-first.
 * The byte offsets noted on each row match the "N(%2)" operands used by the
 * asm macros. NOTE(review): the binding of %2 to this table is in the asm
 * operand list, which is further down the file - confirm there.
 */
52 static const int16_t __attribute__((aligned(8))) coeffs[]= {
53         1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, /* +0: two 32-bit lanes of 1<<(ROW_SHIFT-1), the rounding term for a shift by ROW_SHIFT */
54 //        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
55 //        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
56         1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, /* +8: same rounding term, plus 1<<16 in the low lane only */
57         // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
           // i.e. ((1<<19)/16383)<<11 = 32<<11 = 1<<16, which is a 1 in the upper 16-bit word of the lane
58 //        0, 0, 0, 0,
59 //        0, 0, 0, 0,
60
61  C4,  C4,  C4,  C4, /* +16 */
62  C4, -C4,  C4, -C4, /* +24 */
63
64  C2,  C6,  C2,  C6, /* +32 */
65  C6, -C2,  C6, -C2, /* +40 */
66
67  C1,  C3,  C1,  C3, /* +48 */
68  C5,  C7,  C5,  C7, /* +56 */
69
70  C3, -C7,  C3, -C7, /* +64 */
71 -C1, -C5, -C1, -C5, /* +72 */
72
73  C5, -C1,  C5, -C1, /* +80 */
74  C7,  C3,  C7,  C3, /* +88 */
75
76  C7, -C5,  C7, -C5, /* +96 */
77  C3, -C1,  C3, -C1  /* +104 */
78 };
79
80 #if 0
81 static void unused_var_killer(){
82         int a= wm1010 + d40000;
83         temp[0]=a;
84 }
85
86 static void inline idctCol (int16_t * col, int16_t *input)
87 {
88 #undef C0
89 #undef C1
90 #undef C2
91 #undef C3
92 #undef C4
93 #undef C5
94 #undef C6
95 #undef C7
96         int a0, a1, a2, a3, b0, b1, b2, b3;
97         const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
98         const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
99         const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
100         const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101         const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102         const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103         const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104         const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105 /*
106         if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
107                 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
108                         col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
109                 return;
110         }*/
111
112 col[8*0] = input[8*0 + 0];
113 col[8*1] = input[8*2 + 0];
114 col[8*2] = input[8*0 + 1];
115 col[8*3] = input[8*2 + 1];
116 col[8*4] = input[8*4 + 0];
117 col[8*5] = input[8*6 + 0];
118 col[8*6] = input[8*4 + 1];
119 col[8*7] = input[8*6 + 1];
120
121         a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
122         a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
123         a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
124         a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
125
126         b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
127         b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
128         b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
129         b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
130
131         col[8*0] = (a0 + b0) >> COL_SHIFT;
132         col[8*1] = (a1 + b1) >> COL_SHIFT;
133         col[8*2] = (a2 + b2) >> COL_SHIFT;
134         col[8*3] = (a3 + b3) >> COL_SHIFT;
135         col[8*4] = (a3 - b3) >> COL_SHIFT;
136         col[8*5] = (a2 - b2) >> COL_SHIFT;
137         col[8*6] = (a1 - b1) >> COL_SHIFT;
138         col[8*7] = (a0 - b0) >> COL_SHIFT;
139 }
140
141 static void inline idctRow (int16_t * output, int16_t * input)
142 {
143         int16_t row[8];
144
145         int a0, a1, a2, a3, b0, b1, b2, b3;
146         const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
147         const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
148         const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149         const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150         const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151         const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152         const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153         const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154
155 row[0] = input[0];
156 row[2] = input[1];
157 row[4] = input[4];
158 row[6] = input[5];
159 row[1] = input[8];
160 row[3] = input[9];
161 row[5] = input[12];
162 row[7] = input[13];
163
164         if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
165                 row[0] = row[1] = row[2] = row[3] = row[4] =
166                         row[5] = row[6] = row[7] = row[0]<<3;
167         output[0]  = row[0];
168         output[2]  = row[1];
169         output[4]  = row[2];
170         output[6]  = row[3];
171         output[8]  = row[4];
172         output[10] = row[5];
173         output[12] = row[6];
174         output[14] = row[7];
175                 return;
176         }
177
178         a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
179         a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
180         a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
181         a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
182
183         b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
184         b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
185         b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
186         b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
187
188         row[0] = (a0 + b0) >> ROW_SHIFT;
189         row[1] = (a1 + b1) >> ROW_SHIFT;
190         row[2] = (a2 + b2) >> ROW_SHIFT;
191         row[3] = (a3 + b3) >> ROW_SHIFT;
192         row[4] = (a3 - b3) >> ROW_SHIFT;
193         row[5] = (a2 - b2) >> ROW_SHIFT;
194         row[6] = (a1 - b1) >> ROW_SHIFT;
195         row[7] = (a0 - b0) >> ROW_SHIFT;
196
197         output[0]  = row[0];
198         output[2]  = row[1];
199         output[4]  = row[2];
200         output[6]  = row[3];
201         output[8]  = row[4];
202         output[10] = row[5];
203         output[12] = row[6];
204         output[14] = row[7];
205 }
206 #endif
207
208 static inline void idct(int16_t *block)
209 {
210         int64_t __attribute__((aligned(8))) align_tmp[16];
211         int16_t * const temp= (int16_t*)align_tmp;
212
213         asm volatile(
214 #if 0 //Alternative, simpler variant
215
216 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
217         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
218         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
219         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
220         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
221         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
222         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
223         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
224         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
225         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
226         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
227         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
228         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
229         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
230         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
231         #rounder ", %%mm4               \n\t"\
232         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
233         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
234         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
235         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
236         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
237         #rounder ", %%mm0               \n\t"\
238         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
239         "paddd %%mm0, %%mm0             \n\t" \
240         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
241         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
242         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
243         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
244         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
245         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
246         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
247         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
248         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
249         "psrad $" #shift ", %%mm7       \n\t"\
250         "psrad $" #shift ", %%mm4       \n\t"\
251         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
252         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
253         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
254         "psrad $" #shift ", %%mm1       \n\t"\
255         "psrad $" #shift ", %%mm2       \n\t"\
256         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
257         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
258         "movq %%mm7, " #dst "           \n\t"\
259         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
260         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
261         "movq %%mm2, 24+" #dst "        \n\t"\
262         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
263         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
264         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
265         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
266         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
267         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
268         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
269         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
270         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
271         "psrad $" #shift ", %%mm2       \n\t"\
272         "psrad $" #shift ", %%mm0       \n\t"\
273         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
274         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
275         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
276         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
277         "psrad $" #shift ", %%mm6       \n\t"\
278         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
279         "movq %%mm2, 8+" #dst "         \n\t"\
280         "psrad $" #shift ", %%mm4       \n\t"\
281         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
282         "movq %%mm4, 16+" #dst "        \n\t"\
283
284 #define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
285         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
286         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
287         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
288         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
289         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
290         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
291         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
292         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
293         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
294         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
295         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
296         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
297         #rounder ", %%mm4               \n\t"\
298         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
299         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
300         #rounder ", %%mm0               \n\t"\
301         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
302         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
303         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
304         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
305         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
306         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
307         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
308         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
309         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
310         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
311         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
312         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
313         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
314         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
315         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
316         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
317         "psrad $" #shift ", %%mm7       \n\t"\
318         "psrad $" #shift ", %%mm4       \n\t"\
319         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
320         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
321         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
322         "psrad $" #shift ", %%mm0       \n\t"\
323         "psrad $" #shift ", %%mm2       \n\t"\
324         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
325         "movd %%mm7, " #dst "           \n\t"\
326         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
327         "movd %%mm0, 16+" #dst "        \n\t"\
328         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
329         "movd %%mm2, 96+" #dst "        \n\t"\
330         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
331         "movd %%mm4, 112+" #dst "       \n\t"\
332         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
333         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
334         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
335         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
336         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
337         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
338         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
339         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
340         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
341         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
342         "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
343         "psrad $" #shift ", %%mm2       \n\t"\
344         "psrad $" #shift ", %%mm5       \n\t"\
345         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
346         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
347         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
348         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
349         "psrad $" #shift ", %%mm6       \n\t"\
350         "psrad $" #shift ", %%mm4       \n\t"\
351         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
352         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
353         "movd %%mm2, 32+" #dst "        \n\t"\
354         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
355         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
356         "movd %%mm6, 48+" #dst "        \n\t"\
357         "movd %%mm4, 64+" #dst "        \n\t"\
358         "movd %%mm5, 80+" #dst "        \n\t"\
359
360
361 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
362         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
363         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
364         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
365         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
366         "movq "MANGLE(wm1010)", %%mm4   \n\t"\
367         "pand %%mm0, %%mm4              \n\t"\
368         "por %%mm1, %%mm4               \n\t"\
369         "por %%mm2, %%mm4               \n\t"\
370         "por %%mm3, %%mm4               \n\t"\
371         "packssdw %%mm4,%%mm4           \n\t"\
372         "movd %%mm4, %%eax              \n\t"\
373         "orl %%eax, %%eax               \n\t"\
374         "jz 1f                          \n\t"\
375         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
376         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
377         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
378         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
379         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
380         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
381         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
382         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
383         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
384         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
385         #rounder ", %%mm4               \n\t"\
386         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
387         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
388         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
389         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
390         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
391         #rounder ", %%mm0               \n\t"\
392         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
393         "paddd %%mm0, %%mm0             \n\t" \
394         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
395         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
396         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
397         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
398         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
399         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
400         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
401         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
402         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
403         "psrad $" #shift ", %%mm7       \n\t"\
404         "psrad $" #shift ", %%mm4       \n\t"\
405         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
406         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
407         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
408         "psrad $" #shift ", %%mm1       \n\t"\
409         "psrad $" #shift ", %%mm2       \n\t"\
410         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
411         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
412         "movq %%mm7, " #dst "           \n\t"\
413         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
414         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
415         "movq %%mm2, 24+" #dst "        \n\t"\
416         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
417         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
418         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
419         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
420         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
421         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
422         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
423         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
424         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
425         "psrad $" #shift ", %%mm2       \n\t"\
426         "psrad $" #shift ", %%mm0       \n\t"\
427         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
428         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
429         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
430         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
431         "psrad $" #shift ", %%mm6       \n\t"\
432         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
433         "movq %%mm2, 8+" #dst "         \n\t"\
434         "psrad $" #shift ", %%mm4       \n\t"\
435         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
436         "movq %%mm4, 16+" #dst "        \n\t"\
437         "jmp 2f                         \n\t"\
438         "1:                             \n\t"\
439         "pslld $16, %%mm0               \n\t"\
440         "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
441         "psrad $13, %%mm0               \n\t"\
442         "packssdw %%mm0, %%mm0          \n\t"\
443         "movq %%mm0, " #dst "           \n\t"\
444         "movq %%mm0, 8+" #dst "         \n\t"\
445         "movq %%mm0, 16+" #dst "        \n\t"\
446         "movq %%mm0, 24+" #dst "        \n\t"\
447         "2:                             \n\t"
448
449
450 //IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
451 ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
452 /*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
453 ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
454 ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
455
456 DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
457 DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
458 DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
459
460
461 //IDCT(      src0,   src4,   src1,    src5,    dst, rounder, shift)
462 COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
463 COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
464 COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
465 COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
466
467 #else
468
469 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
470         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
471         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
472         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
473         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
474         "movq "MANGLE(wm1010)", %%mm4   \n\t"\
475         "pand %%mm0, %%mm4              \n\t"\
476         "por %%mm1, %%mm4               \n\t"\
477         "por %%mm2, %%mm4               \n\t"\
478         "por %%mm3, %%mm4               \n\t"\
479         "packssdw %%mm4,%%mm4           \n\t"\
480         "movd %%mm4, %%eax              \n\t"\
481         "orl %%eax, %%eax               \n\t"\
482         "jz 1f                          \n\t"\
483         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
484         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
485         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
486         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
487         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
488         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
489         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
490         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
491         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
492         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
493         #rounder ", %%mm4               \n\t"\
494         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
495         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
496         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
497         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
498         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
499         #rounder ", %%mm0               \n\t"\
500         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
501         "paddd %%mm0, %%mm0             \n\t" \
502         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
503         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
504         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
505         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
506         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
507         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
508         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
509         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
510         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
511         "psrad $" #shift ", %%mm7       \n\t"\
512         "psrad $" #shift ", %%mm4       \n\t"\
513         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
514         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
515         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
516         "psrad $" #shift ", %%mm1       \n\t"\
517         "psrad $" #shift ", %%mm2       \n\t"\
518         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
519         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
520         "movq %%mm7, " #dst "           \n\t"\
521         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
522         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
523         "movq %%mm2, 24+" #dst "        \n\t"\
524         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
525         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
526         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
527         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
528         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
529         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
530         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
531         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
532         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
533         "psrad $" #shift ", %%mm2       \n\t"\
534         "psrad $" #shift ", %%mm0       \n\t"\
535         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
536         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
537         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
538         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
539         "psrad $" #shift ", %%mm6       \n\t"\
540         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
541         "movq %%mm2, 8+" #dst "         \n\t"\
542         "psrad $" #shift ", %%mm4       \n\t"\
543         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
544         "movq %%mm4, 16+" #dst "        \n\t"\
545         "jmp 2f                         \n\t"\
546         "1:                             \n\t"\
547         "pslld $16, %%mm0               \n\t"\
548         "paddd "MANGLE(d40000)", %%mm0  \n\t"\
549         "psrad $13, %%mm0               \n\t"\
550         "packssdw %%mm0, %%mm0          \n\t"\
551         "movq %%mm0, " #dst "           \n\t"\
552         "movq %%mm0, 8+" #dst "         \n\t"\
553         "movq %%mm0, 16+" #dst "        \n\t"\
554         "movq %%mm0, 24+" #dst "        \n\t"\
555         "2:                             \n\t"
556
/*
 * Z_COND_IDCT: IDCT step that is skipped when its input is entirely zero.
 * The macro expands to inline-asm text.
 *
 * The four quadwords src0, src4, src1 and src5 are loaded into mm0..mm3 and
 * ORed together.  If the result is zero, control jumps to the local label
 * `bt` before anything has been stored to dst.  Otherwise the transform is
 * computed with the coefficient table addressed through %2 (offsets 16..104)
 * and four quadwords are stored at dst, 8+dst, 16+dst and 24+dst.
 *
 *   src0, src4, src1, src5  memory operands of the input quadwords
 *   dst                     memory operand of the first output quadword
 *   rounder                 instruction text (mnemonic plus source operand)
 *                           pasted in front of ", %%mm4" and ", %%mm0"
 *   shift                   immediate count of the final psrad
 *   bt                      branch target used when all inputs are zero
 *
 * Clobbers mm0-mm7, eax and the flags.
 * NOTE(review): the last line of the macro ends in a backslash, so the
 * definition is only terminated by the empty line that follows it.
 */
557 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
558         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
559         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
560         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
561         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
562         "movq %%mm0, %%mm4              \n\t" /* mm4 = OR of all four input quadwords ... */\
563         "por %%mm1, %%mm4               \n\t"\
564         "por %%mm2, %%mm4               \n\t"\
565         "por %%mm3, %%mm4               \n\t"\
566         "packssdw %%mm4,%%mm4           \n\t" /* saturating pack: a nonzero dword stays nonzero */\
567         "movd %%mm4, %%eax              \n\t" /* low dword now covers both dwords of the OR */\
568         "orl %%eax, %%eax               \n\t" /* ZF set iff every input bit was zero */\
569         "jz " #bt "                     \n\t" /* all-zero input: leave without storing to dst */\
570         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
571         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
572         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
573         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
574         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
575         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
576         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
577         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
578         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
579         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
580         #rounder ", %%mm4               \n\t" /* rounder applied to C4R4+C4R0 */\
581         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
582         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
583         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
584         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
585         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
586         #rounder ", %%mm0               \n\t" /* rounder applied to -C4R4+C4R0 */\
587         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
588         "paddd %%mm0, %%mm0             \n\t" /* doubled so the next psubd yields A2 */ \
589         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
590         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
591         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
592         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
593         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
594         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
595         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
596         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
597         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
598         "psrad $" #shift ", %%mm7       \n\t"\
599         "psrad $" #shift ", %%mm4       \n\t"\
600         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
601         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
602         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
603         "psrad $" #shift ", %%mm1       \n\t"\
604         "psrad $" #shift ", %%mm2       \n\t"\
605         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
606         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
607         "movq %%mm7, " #dst "           \n\t"\
608         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
609         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
610         "movq %%mm2, 24+" #dst "        \n\t"\
611         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
612         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
613         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
614         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
615         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
616         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
617         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
618         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
619         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
620         "psrad $" #shift ", %%mm2       \n\t"\
621         "psrad $" #shift ", %%mm0       \n\t"\
622         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
623         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
624         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
625         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
626         "psrad $" #shift ", %%mm6       \n\t"\
627         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
628         "movq %%mm2, 8+" #dst "         \n\t"\
629         "psrad $" #shift ", %%mm4       \n\t"\
630         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
631         "movq %%mm4, 16+" #dst "        \n\t"\
632
/*
 * ROW_IDCT: unconditional IDCT step, expanding to inline-asm text.
 *
 * The instruction sequence matches Z_COND_IDCT without the leading zero test:
 * the quadwords src0, src4, src1 and src5 are loaded into mm0..mm3, combined
 * with the coefficient table addressed through %2 (offsets 16..104), shifted
 * right by `shift` and stored as four quadwords at dst, 8+dst, 16+dst and
 * 24+dst.
 *
 *   src0, src4, src1, src5  memory operands of the input quadwords
 *   dst                     memory operand of the first output quadword
 *   rounder                 instruction text (mnemonic plus source operand)
 *                           pasted in front of ", %%mm4" and ", %%mm0"
 *   shift                   immediate count of the final psrad
 *
 * Clobbers mm0-mm7.
 * NOTE(review): the last line of the macro ends in a backslash, so the
 * definition is only terminated by the empty line that follows it.
 */
633 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
634         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
635         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
636         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
637         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
638         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
639         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
640         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
641         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
642         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
643         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
644         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
645         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
646         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
647         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
648         #rounder ", %%mm4               \n\t" /* rounder applied to C4R4+C4R0 */\
649         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
650         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
651         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
652         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
653         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
654         #rounder ", %%mm0               \n\t" /* rounder applied to -C4R4+C4R0 */\
655         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
656         "paddd %%mm0, %%mm0             \n\t" /* doubled so the next psubd yields A2 */ \
657         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
658         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
659         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
660         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
661         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
662         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
663         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
664         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
665         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
666         "psrad $" #shift ", %%mm7       \n\t"\
667         "psrad $" #shift ", %%mm4       \n\t"\
668         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
669         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
670         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
671         "psrad $" #shift ", %%mm1       \n\t"\
672         "psrad $" #shift ", %%mm2       \n\t"\
673         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
674         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
675         "movq %%mm7, " #dst "           \n\t"\
676         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
677         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
678         "movq %%mm2, 24+" #dst "        \n\t"\
679         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
680         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
681         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
682         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
683         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
684         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
685         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
686         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
687         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
688         "psrad $" #shift ", %%mm2       \n\t"\
689         "psrad $" #shift ", %%mm0       \n\t"\
690         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
691         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
692         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
693         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
694         "psrad $" #shift ", %%mm6       \n\t"\
695         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
696         "movq %%mm2, 8+" #dst "         \n\t"\
697         "psrad $" #shift ", %%mm4       \n\t"\
698         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
699         "movq %%mm4, 16+" #dst "        \n\t"\
700
/*
 * First stage (presumably the row pass -- confirm): every line consumes
 * 32 bytes starting at the given offset from %0 and writes 32 bytes at the
 * same offset from %1, using a shift of 11.
 * The first line goes through DC_COND_IDCT with rounder "paddd 8(%2)".  The
 * other three use Z_COND_IDCT with rounder "paddd (%2)" and branch away,
 * without writing their output, when all 32 input bytes are zero:
 *   bytes 32..63(%0)  all zero -> 4f
 *   bytes 64..95(%0)  all zero -> 2f
 *   bytes 96..127(%0) all zero -> 1f
 */
701 //IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
702 DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
703 Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
704 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
705 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
706
/*
 * IDCT, variant that uses all four inputs.  It stays in effect until the
 * next "#undef IDCT".
 *
 * Loads the quadwords src0, src4, src1 and src5 into mm0..mm3, combines them
 * with the coefficient table addressed through %2 (offsets 16..104), shifts
 * every 32-bit sum right by `shift` and packs each register onto itself.
 * Eight 32-bit results are then written with movd:
 *   dst     A0+B0      16+dst  A1+B1      32+dst  A2+B2      48+dst  A3+B3
 *   64+dst  A3-B3      80+dst  A2-B2      96+dst  A1-B1     112+dst  A0-B0
 *
 *   rounder  instruction text pasted in front of ", %%mm4" and ", %%mm0"
 *   shift    immediate count of the final psrad
 *
 * Clobbers mm0-mm7.
 */
707 #undef IDCT
708 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
709         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
710         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
711         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
712         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
713         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
714         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
715         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
716         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
717         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
718         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
719         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
720         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
721         #rounder ", %%mm4               \n\t" /* rounder applied to C4R4+C4R0 */\
722         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
723         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
724         #rounder ", %%mm0               \n\t" /* rounder applied to -C4R4+C4R0 */\
725         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
726         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
727         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
728         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
729         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
730         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
731         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
732         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
733         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
734         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
735         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
736         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
737         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
738         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
739         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
740         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
741         "psrad $" #shift ", %%mm7       \n\t"\
742         "psrad $" #shift ", %%mm4       \n\t"\
743         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
744         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
745         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
746         "psrad $" #shift ", %%mm0       \n\t"\
747         "psrad $" #shift ", %%mm2       \n\t"\
748         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
749         "movd %%mm7, " #dst "           \n\t"\
750         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
751         "movd %%mm0, 16+" #dst "        \n\t"\
752         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
753         "movd %%mm2, 96+" #dst "        \n\t"\
754         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
755         "movd %%mm4, 112+" #dst "       \n\t"\
756         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
757         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
758         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
759         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
760         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
761         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
762         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
763         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
764         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
765         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
766         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
767         "psrad $" #shift ", %%mm2       \n\t"\
768         "psrad $" #shift ", %%mm5       \n\t"\
769         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
770         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
771         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
772         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
773         "psrad $" #shift ", %%mm6       \n\t"\
774         "psrad $" #shift ", %%mm4       \n\t"\
775         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
776         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
777         "movd %%mm2, 32+" #dst "        \n\t"\
778         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
779         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
780         "movd %%mm6, 48+" #dst "        \n\t"\
781         "movd %%mm4, 64+" #dst "        \n\t"\
782         "movd %%mm5, 80+" #dst "        \n\t"
783
784
/*
 * Second stage for the case where none of the zero tests above branched:
 * four expansions of the IDCT variant defined just above.  Each reads its
 * inputs from %1 (base offsets 0, 8, 16, 24; the other three inputs lie 32,
 * 64 and 96 bytes further on) and writes to %0 at offsets 0, 4, 8 and 12,
 * with a shift of 20.  Control then jumps to label 9.
 * NOTE(review): the rounder argument is "/nop", which pastes as
 * "/nop, %%mm4"; presumably the assembler treats the leading '/' as a
 * comment so that no rounding instruction is emitted -- confirm.
 */
785 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
786 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
787 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
788 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
789 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
790         "jmp 9f                         \n\t"
791
/*
 * Label 4: reached when bytes 32..63(%0) were all zero.  The remaining two
 * input groups are still tested: bytes 64..95(%0) all zero -> 6f,
 * bytes 96..127(%0) all zero -> 5f.
 * NOTE(review): the ".balign" text starts with '#', so it is presumably
 * ignored by the assembler as a comment -- confirm.
 */
792         "#.balign 16                    \n\t"\
793         "4:                             \n\t"
794 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
795 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
796
/*
 * IDCT, variant that never reads src1.  It stays in effect until the next
 * "#undef IDCT".
 *
 * Only src0, src4 and src5 are loaded (into mm0, mm1 and mm3); the src1
 * parameter is accepted but unused, and every product that would involve it
 * is left out.  The B terms therefore consist of the src5 products alone.
 * Eight 32-bit results are written with movd:
 *   dst     A0+B0      16+dst  A1+B1      32+dst  A2+B2      48+dst  A3+B3
 *   64+dst  A3-B3      80+dst  A2-B2      96+dst  A1-B1     112+dst  A0-B0
 *
 *   rounder  instruction text pasted in front of ", %%mm4" and ", %%mm0"
 *   shift    immediate count of the final psrad
 *
 * Clobbers mm0-mm7.
 */
797 #undef IDCT
798 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
799         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
800         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
801         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
802         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
803         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
804         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
805         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
806         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
807         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
808         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
809         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
810         #rounder ", %%mm4               \n\t" /* rounder applied to C4R4+C4R0 */\
811         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
812         #rounder ", %%mm0               \n\t" /* rounder applied to -C4R4+C4R0 */\
813         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
814         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
815         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
816         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
817         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
818         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
819         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
820         "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
821         "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
822         "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
823         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
824         "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
825         "psrad $" #shift ", %%mm1       \n\t"\
826         "psrad $" #shift ", %%mm4       \n\t"\
827         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
828         "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
829         "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
830         "psrad $" #shift ", %%mm0       \n\t"\
831         "psrad $" #shift ", %%mm2       \n\t"\
832         "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
833         "movd %%mm1, " #dst "           \n\t"\
834         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
835         "movd %%mm0, 16+" #dst "        \n\t"\
836         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
837         "movd %%mm2, 96+" #dst "        \n\t"\
838         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
839         "movd %%mm4, 112+" #dst "       \n\t"\
840         "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
841         "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
842         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
843         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
844         "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
845         "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
846         "psrad $" #shift ", %%mm2       \n\t"\
847         "psrad $" #shift ", %%mm5       \n\t"\
848         "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
849         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
850         "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
851         "psrad $" #shift ", %%mm6       \n\t"\
852         "psrad $" #shift ", %%mm1       \n\t"\
853         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
854         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
855         "movd %%mm2, 32+" #dst "        \n\t"\
856         "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
857         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
858         "movd %%mm6, 48+" #dst "        \n\t"\
859         "movd %%mm1, 64+" #dst "        \n\t"\
860         "movd %%mm5, 80+" #dst "        \n\t"
861
/*
 * Second stage for the path that entered at label 4 and did not branch
 * again: four expansions of the IDCT variant defined just above (the one
 * that ignores src1).  Inputs come from %1, results go to %0 at offsets
 * 0, 4, 8 and 12 with a shift of 20; control then jumps to label 9.
 * NOTE(review): the rounder argument is "/nop", which pastes as
 * "/nop, %%mm4"; presumably the assembler treats the leading '/' as a
 * comment so that no rounding instruction is emitted -- confirm.
 */
862 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
863 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
864 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
865 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
866 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
867         "jmp 9f                         \n\t"
868
/*
 * Label 6: reached when bytes 32..63(%0) and bytes 64..95(%0) were all zero.
 * The last input group is still tested: bytes 96..127(%0) all zero -> 7f.
 * NOTE(review): the ".balign" text starts with '#', so it is presumably
 * ignored by the assembler as a comment -- confirm.
 */
869         "#.balign 16                    \n\t"\
870         "6:                             \n\t"
871 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
872
/*
 * IDCT, variant that reads neither src4 nor src1.  It stays in effect until
 * the next "#undef IDCT".
 *
 * Only src0 and src5 are loaded (into mm0 and mm3); the src4 and src1
 * parameters are accepted but unused.  Without the src4 products, A0 and A3
 * are both the rounded C4R4+C4R0 value, and A1 and A2 are both the rounded
 * -C4R4+C4R0 value; the B terms consist of the src5 products alone.
 * Eight 32-bit results are written with movd:
 *   dst     A0+B0      16+dst  A1+B1      32+dst  A2+B2      48+dst  A3+B3
 *   64+dst  A3-B3      80+dst  A2-B2      96+dst  A1-B1     112+dst  A0-B0
 *
 *   rounder  instruction text pasted in front of ", %%mm4" and ", %%mm0"
 *   shift    immediate count of the final psrad
 *
 * Clobbers mm0-mm7.
 */
873 #undef IDCT
874 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
875         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
876         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
877         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
878         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
879         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
880         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
881         #rounder ", %%mm4               \n\t" /* rounder applied to C4R4+C4R0 */\
882         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
883         #rounder ", %%mm0               \n\t" /* rounder applied to -C4R4+C4R0 */\
884         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
885         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
886         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
887         "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
888         "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
889         "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
890         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
891         "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
892         "psrad $" #shift ", %%mm1       \n\t"\
893         "psrad $" #shift ", %%mm4       \n\t"\
894         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
895         "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
896         "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
897         "psrad $" #shift ", %%mm0       \n\t"\
898         "psrad $" #shift ", %%mm2       \n\t"\
899         "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
900         "movd %%mm1, " #dst "           \n\t"\
901         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
902         "movd %%mm0, 16+" #dst "        \n\t"\
903         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
904         "movd %%mm2, 96+" #dst "        \n\t"\
905         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
906         "movd %%mm4, 112+" #dst "       \n\t"\
907         "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
908         "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
909         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
910         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
911         "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
912         "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
913         "psrad $" #shift ", %%mm2       \n\t"\
914         "psrad $" #shift ", %%mm5       \n\t"\
915         "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
916         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
917         "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
918         "psrad $" #shift ", %%mm6       \n\t"\
919         "psrad $" #shift ", %%mm1       \n\t"\
920         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
921         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
922         "movd %%mm2, 32+" #dst "        \n\t"\
923         "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
924         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
925         "movd %%mm6, 48+" #dst "        \n\t"\
926         "movd %%mm1, 64+" #dst "        \n\t"\
927         "movd %%mm5, 80+" #dst "        \n\t"
928
929
/*
 * Second stage for the path that entered at label 6 and did not branch
 * again: four expansions of the IDCT variant defined just above (the one
 * that ignores src4 and src1).  Inputs come from %1, results go to %0 at
 * offsets 0, 4, 8 and 12 with a shift of 20; control then jumps to label 9.
 * NOTE(review): the rounder argument is "/nop", which pastes as
 * "/nop, %%mm4"; presumably the assembler treats the leading '/' as a
 * comment so that no rounding instruction is emitted -- confirm.
 */
930 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
931 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
932 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
933 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
934 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
935         "jmp 9f                         \n\t"
936
/*
 * Label 2: reached from the first chain of zero tests when bytes 64..95(%0)
 * were all zero (bytes 32..63(%0) were not).  The last input group is still
 * tested: bytes 96..127(%0) all zero -> 3f.
 * NOTE(review): the ".balign" text starts with '#', so it is presumably
 * ignored by the assembler as a comment -- confirm.
 */
937         "#.balign 16                    \n\t"\
938         "2:                             \n\t"
939 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
940
/*
 * IDCT, variant that never reads src4.  It stays in effect until the next
 * "#undef IDCT".
 *
 * Only src0, src1 and src5 are loaded (into mm0, mm2 and mm3); the src4
 * parameter is accepted but unused.  Without the src4 products, A0 and A3
 * are both the rounded C4R4+C4R0 value, and A1 and A2 are both the rounded
 * -C4R4+C4R0 value.  The B terms are computed in full from src1 and src5.
 * Eight 32-bit results are written with movd:
 *   dst     A0+B0      16+dst  A1+B1      32+dst  A2+B2      48+dst  A3+B3
 *   64+dst  A3-B3      80+dst  A2-B2      96+dst  A1-B1     112+dst  A0-B0
 *
 *   rounder  instruction text pasted in front of ", %%mm4" and ", %%mm0"
 *   shift    immediate count of the final psrad
 *
 * Clobbers mm0-mm7.
 */
941 #undef IDCT
942 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
943         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
944         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
945         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
946         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
947         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
948         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
949         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
950         #rounder ", %%mm4               \n\t" /* rounder applied to C4R4+C4R0 */\
951         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
952         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
953         #rounder ", %%mm0               \n\t" /* rounder applied to -C4R4+C4R0 */\
954         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
955         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
956         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
957         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
958         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
959         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
960         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
961         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
962         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
963         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
964         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
965         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
966         "psrad $" #shift ", %%mm7       \n\t"\
967         "psrad $" #shift ", %%mm4       \n\t"\
968         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
969         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
970         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
971         "psrad $" #shift ", %%mm0       \n\t"\
972         "psrad $" #shift ", %%mm2       \n\t"\
973         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
974         "movd %%mm7, " #dst "           \n\t"\
975         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
976         "movd %%mm0, 16+" #dst "        \n\t"\
977         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
978         "movd %%mm2, 96+" #dst "        \n\t"\
979         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
980         "movd %%mm4, 112+" #dst "       \n\t"\
981         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
982         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
983         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
984         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
985         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
986         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
987         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
988         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
989         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
990         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
991         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
992         "psrad $" #shift ", %%mm2       \n\t"\
993         "psrad $" #shift ", %%mm5       \n\t"\
994         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
995         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
996         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
997         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
998         "psrad $" #shift ", %%mm6       \n\t"\
999         "psrad $" #shift ", %%mm4       \n\t"\
1000         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
1001         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1002         "movd %%mm2, 32+" #dst "        \n\t"\
1003         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1004         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1005         "movd %%mm6, 48+" #dst "        \n\t"\
1006         "movd %%mm4, 64+" #dst "        \n\t"\
1007         "movd %%mm5, 80+" #dst "        \n\t"
1008
/*
 * Second stage for the path that entered at label 2 and did not branch
 * again: four expansions of the IDCT variant defined just above (the one
 * that ignores src4).  Inputs come from %1, results go to %0 at offsets
 * 0, 4, 8 and 12 with a shift of 20; control then jumps to label 9.
 * NOTE(review): the rounder argument is "/nop", which pastes as
 * "/nop, %%mm4"; presumably the assembler treats the leading '/' as a
 * comment so that no rounding instruction is emitted -- confirm.
 */
1009 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1010 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1011 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1012 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1013 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1014         "jmp 9f                         \n\t"
1015
/*
 * Label 3: reached from label 2 when bytes 96..127(%0) were all zero as
 * well, i.e. both bytes 64..95(%0) and bytes 96..127(%0) were zero.
 * NOTE(review): the ".balign" text starts with '#', so it is presumably
 * ignored by the assembler as a comment -- confirm.
 */
1016         "#.balign 16                    \n\t"\
1017         "3:                             \n\t"
1018 #undef IDCT
1019 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1020         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1021         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1022         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1023         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1024         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1025         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1026         #rounder ", %%mm4               \n\t"\
1027         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1028         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1029         #rounder ", %%mm0               \n\t"\
1030         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1031         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1032         "movq 64(%2), %%mm3             \n\t"\
1033         "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1034         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1035         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1036         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1037         "psrad $" #shift ", %%mm7       \n\t"\
1038         "psrad $" #shift ", %%mm4       \n\t"\
1039         "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
1040         "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1041         "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
1042         "psrad $" #shift ", %%mm0       \n\t"\
1043         "psrad $" #shift ", %%mm1       \n\t"\
1044         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1045         "movd %%mm7, " #dst "           \n\t"\
1046         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1047         "movd %%mm0, 16+" #dst "        \n\t"\
1048         "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
1049         "movd %%mm1, 96+" #dst "        \n\t"\
1050         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1051         "movd %%mm4, 112+" #dst "       \n\t"\
1052         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1053         "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1054         "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1055         "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
1056         "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
1057         "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1058         "psrad $" #shift ", %%mm1       \n\t"\
1059         "psrad $" #shift ", %%mm5       \n\t"\
1060         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1061         "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1062         "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1063         "psrad $" #shift ", %%mm6       \n\t"\
1064         "psrad $" #shift ", %%mm4       \n\t"\
1065         "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
1066         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1067         "movd %%mm1, 32+" #dst "        \n\t"\
1068         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1069         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1070         "movd %%mm6, 48+" #dst "        \n\t"\
1071         "movd %%mm4, 64+" #dst "        \n\t"\
1072         "movd %%mm5, 80+" #dst "        \n\t"
1073
1074
1075 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1076 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1077 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1078 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1079 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1080         "jmp 9f                         \n\t"
1081
1082         "#.balign 16                    \n\t"\
1083         "5:                             \n\t"
1084 #undef IDCT
1085 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1086         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1087         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1088         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1089         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1090         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1091         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1092         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1093         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1094         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1095         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1096         #rounder ", %%mm4               \n\t"\
1097         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1098         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1099         #rounder ", %%mm0               \n\t"\
1100         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1101         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1102         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1103         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1104         "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1105         "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
1106         "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1107         "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1108         "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1109         "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1110         "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1111         "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1112         "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1113         #rounder ", %%mm1               \n\t"\
1114         "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
1115         "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
1116         #rounder ", %%mm2               \n\t"\
1117         "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
1118         "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
1119         "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
1120         "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
1121         "psrad $" #shift ", %%mm4       \n\t"\
1122         "psrad $" #shift ", %%mm7       \n\t"\
1123         "psrad $" #shift ", %%mm3       \n\t"\
1124         "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
1125         "movq %%mm4, " #dst "           \n\t"\
1126         "psrad $" #shift ", %%mm0       \n\t"\
1127         "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
1128         "movq %%mm0, 16+" #dst "        \n\t"\
1129         "movq %%mm0, 96+" #dst "        \n\t"\
1130         "movq %%mm4, 112+" #dst "       \n\t"\
1131         "psrad $" #shift ", %%mm5       \n\t"\
1132         "psrad $" #shift ", %%mm6       \n\t"\
1133         "psrad $" #shift ", %%mm2       \n\t"\
1134         "packssdw %%mm2, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1135         "movq %%mm5, 32+" #dst "        \n\t"\
1136         "psrad $" #shift ", %%mm1       \n\t"\
1137         "packssdw %%mm1, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1138         "movq %%mm6, 48+" #dst "        \n\t"\
1139         "movq %%mm6, 64+" #dst "        \n\t"\
1140         "movq %%mm5, 80+" #dst "        \n\t"
1141
1142
1143 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1144 IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1145 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1146 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1147 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1148         "jmp 9f                         \n\t"
1149
1150
1151         "#.balign 16                    \n\t"\
1152         "1:                             \n\t"
1153 #undef IDCT
1154 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1155         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1156         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1157         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1158         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1159         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1160         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1161         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1162         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1163         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1164         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1165         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1166         #rounder ", %%mm4               \n\t"\
1167         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1168         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1169         #rounder ", %%mm0               \n\t"\
1170         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1171         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1172         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1173         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1174         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1175         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1176         "movq 64(%2), %%mm1             \n\t"\
1177         "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1178         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1179         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1180         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1181         "psrad $" #shift ", %%mm7       \n\t"\
1182         "psrad $" #shift ", %%mm4       \n\t"\
1183         "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
1184         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1185         "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
1186         "psrad $" #shift ", %%mm0       \n\t"\
1187         "psrad $" #shift ", %%mm3       \n\t"\
1188         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1189         "movd %%mm7, " #dst "           \n\t"\
1190         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1191         "movd %%mm0, 16+" #dst "        \n\t"\
1192         "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
1193         "movd %%mm3, 96+" #dst "        \n\t"\
1194         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1195         "movd %%mm4, 112+" #dst "       \n\t"\
1196         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1197         "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1198         "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1199         "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
1200         "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
1201         "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1202         "psrad $" #shift ", %%mm3       \n\t"\
1203         "psrad $" #shift ", %%mm5       \n\t"\
1204         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1205         "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1206         "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1207         "psrad $" #shift ", %%mm6       \n\t"\
1208         "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
1209         "movd %%mm3, 32+" #dst "        \n\t"\
1210         "psrad $" #shift ", %%mm4       \n\t"\
1211         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1212         "movd %%mm6, 48+" #dst "        \n\t"\
1213         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1214         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1215         "movd %%mm4, 64+" #dst "        \n\t"\
1216         "movd %%mm5, 80+" #dst "        \n\t"
1217
1218
1219 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1220 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1221 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1222 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1223 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1224         "jmp 9f                         \n\t"
1225
1226
1227         "#.balign 16                    \n\t"
1228         "7:                             \n\t"
1229 #undef IDCT
1230 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1231         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1232         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1233         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1234         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1235         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1236         #rounder ", %%mm4               \n\t"\
1237         #rounder ", %%mm0               \n\t"\
1238         "psrad $" #shift ", %%mm4       \n\t"\
1239         "psrad $" #shift ", %%mm0       \n\t"\
1240         "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1241         "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1242         "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1243         "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1244         "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1245         "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1246         #rounder ", %%mm1               \n\t"\
1247         #rounder ", %%mm2               \n\t"\
1248         "psrad $" #shift ", %%mm1       \n\t"\
1249         "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
1250         "movq %%mm4, " #dst "           \n\t"\
1251         "psrad $" #shift ", %%mm2       \n\t"\
1252         "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
1253         "movq %%mm0, 16+" #dst "        \n\t"\
1254         "movq %%mm0, 96+" #dst "        \n\t"\
1255         "movq %%mm4, 112+" #dst "       \n\t"\
1256         "movq %%mm0, 32+" #dst "        \n\t"\
1257         "movq %%mm4, 48+" #dst "        \n\t"\
1258         "movq %%mm4, 64+" #dst "        \n\t"\
1259         "movq %%mm0, 80+" #dst "        \n\t"
1260
1261 //IDCT(  src0,   src4,   src1,    src5,    dst, rounder, shift)
1262 IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0),/nop, 20)
1263 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0),/nop, 20)
1264 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0),/nop, 20)
1265 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1266
1267
1268 #endif
1269
1270 /*
1271 Input
1272  00 40 04 44 20 60 24 64
1273  10 30 14 34 50 70 54 74
1274  01 41 03 43 21 61 23 63
1275  11 31 13 33 51 71 53 73
1276  02 42 06 46 22 62 26 66
1277  12 32 16 36 52 72 56 76
1278  05 45 07 47 25 65 27 67
1279  15 35 17 37 55 75 57 77
1280
1281 Temp
1282  00 04 10 14 20 24 30 34
1283  40 44 50 54 60 64 70 74
1284  01 03 11 13 21 23 31 33
1285  41 43 51 53 61 63 71 73
1286  02 06 12 16 22 26 32 36
1287  42 46 52 56 62 66 72 76
1288  05 07 15 17 25 27 35 37
1289  45 47 55 57 65 67 75 77
1290 */
1291
1292 "9: \n\t"
1293                 :: "r" (block), "r" (temp), "r" (coeffs)
1294                 : "%eax"
1295         );
1296 }
1297
/*
 * Public entry point for the MMX simple IDCT.
 *
 * Forwards directly to idct(); the transformed values are stored back
 * through the same pointer, so the contents of block are overwritten.
 *
 * block: coefficient block that is transformed in place.
 */
1298 void ff_simple_idct_mmx(int16_t *block)
1299 {
1300     idct(block);
1301 }
1302
1303 //FIXME merge add/put into the idct
1304
/*
 * IDCT followed by a "put" step.
 *
 * Runs idct() on block first, then passes the transformed block together
 * with dest and line_size to put_pixels_clamped_mmx(). The order matters:
 * the helper consumes the result that idct() left in block.
 *
 * NOTE(review): put_pixels_clamped_mmx() is not defined in this part of the
 * file; presumably it writes the clamped samples to dest using line_size as
 * the distance between rows -- confirm against its definition.
 */
1305 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1306 {
1307     idct(block);
1308     put_pixels_clamped_mmx(block, dest, line_size);
1309 }
/*
 * IDCT followed by an "add" step.
 *
 * Runs idct() on block first, then passes the transformed block together
 * with dest and line_size to add_pixels_clamped_mmx(). The order matters:
 * the helper consumes the result that idct() left in block.
 *
 * NOTE(review): add_pixels_clamped_mmx() is not defined in this part of the
 * file; presumably it adds the samples to what is already in dest (clamping
 * the sum) with line_size as the distance between rows -- confirm against
 * its definition.
 */
1310 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1311 {
1312     idct(block);
1313     add_pixels_clamped_mmx(block, dest, line_size);
1314 }