]> git.sesse.net Git - ffmpeg/blob - libavcodec/x86/simple_idct_mmx.c
check validity of pointer srcC
[ffmpeg] / libavcodec / x86 / simple_idct_mmx.c
1 /*
2  * Simple IDCT MMX
3  *
4  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 #include "libavcodec/dsputil.h"
23 #include "libavcodec/simple_idct.h"
24
25 /*
26 23170.475006
27 22725.260826
28 21406.727617
29 19265.545870
30 16384.000000
31 12872.826198
32 8866.956905
33 4520.335430
34 */
35 #define C0 23170 // cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
36 #define C1 22725 // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37 #define C2 21407 // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
38 #define C3 19266 // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
39 #if 0
40 #define C4 16384 // cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 (the exact value; this branch is disabled)
41 #else
42 #define C4 16383 // cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (one below the exact 16384; this is the value in use)
43 #endif
44 #define C5 12873 // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45 #define C6 8867  // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46 #define C7 4520  // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47
48 #define ROW_SHIFT 11 // right shift applied to the row-pass sums (see the idctRow reference code)
49 #define COL_SHIFT 20 // 6 -- right shift applied to the column-pass sums (see the idctCol reference code)
50
51 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; // mask that keeps 16-bit words 1 and 3 and clears words 0 and 2; DC_COND_IDCT ANDs it with the first source quadword before OR-ing in the other three to test for "everything except those two words is zero"
52 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; // 1<<18 in the low 32 bits; added in the all-zero shortcut of DC_COND_IDCT between "pslld $16" and "psrad $13"
53
54 DECLARE_ALIGNED(8, static const int16_t, coeffs[])= { // 14 rows of four int16_t (8 bytes per row); the inline asm in idct() addresses rows by byte offset such as 16(%2) -- presumably %2 is this table (operand list not visible here, confirm)
55         1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // byte offset 0: rounding term 1<<(ROW_SHIFT-1) in the low word of each 32-bit half
56 //        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
57 //        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
58         1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // byte offset 8: same rounding term plus an extra 1 in 16-bit word 1 (explained on the next line)
59         // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
60 //        0, 0, 0, 0,
61 //        0, 0, 0, 0,
62
63  C4,  C4,  C4,  C4, // byte offset 16
64  C4, -C4,  C4, -C4, // byte offset 24
65
66  C2,  C6,  C2,  C6, // byte offset 32
67  C6, -C2,  C6, -C2, // byte offset 40
68
69  C1,  C3,  C1,  C3, // byte offset 48
70  C5,  C7,  C5,  C7, // byte offset 56
71
72  C3, -C7,  C3, -C7, // byte offset 64
73 -C1, -C5, -C1, -C5, // byte offset 72
74
75  C5, -C1,  C5, -C1, // byte offset 80
76  C7,  C3,  C7,  C3, // byte offset 88
77
78  C7, -C5,  C7, -C5, // byte offset 96
79  C3, -C1,  C3, -C1 // byte offset 104
80 };
81
82 #if 0
83 static void unused_var_killer(void)
84 {
85         int a= wm1010 + d40000;
86         temp[0]=a;
87 }
88
89 static void inline idctCol (int16_t * col, int16_t *input)
90 {
91 #undef C0
92 #undef C1
93 #undef C2
94 #undef C3
95 #undef C4
96 #undef C5
97 #undef C6
98 #undef C7
99         int a0, a1, a2, a3, b0, b1, b2, b3;
100         const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101         const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102         const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103         const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104         const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105         const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
106         const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
107         const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
108 /*
109         if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
110                 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
111                         col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
112                 return;
113         }*/
114
115 col[8*0] = input[8*0 + 0];
116 col[8*1] = input[8*2 + 0];
117 col[8*2] = input[8*0 + 1];
118 col[8*3] = input[8*2 + 1];
119 col[8*4] = input[8*4 + 0];
120 col[8*5] = input[8*6 + 0];
121 col[8*6] = input[8*4 + 1];
122 col[8*7] = input[8*6 + 1];
123
124         a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
125         a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
126         a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
127         a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
128
129         b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
130         b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
131         b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
132         b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
133
134         col[8*0] = (a0 + b0) >> COL_SHIFT;
135         col[8*1] = (a1 + b1) >> COL_SHIFT;
136         col[8*2] = (a2 + b2) >> COL_SHIFT;
137         col[8*3] = (a3 + b3) >> COL_SHIFT;
138         col[8*4] = (a3 - b3) >> COL_SHIFT;
139         col[8*5] = (a2 - b2) >> COL_SHIFT;
140         col[8*6] = (a1 - b1) >> COL_SHIFT;
141         col[8*7] = (a0 - b0) >> COL_SHIFT;
142 }
143
144 static void inline idctRow (int16_t * output, int16_t * input)
145 {
146         int16_t row[8];
147
148         int a0, a1, a2, a3, b0, b1, b2, b3;
149         const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150         const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151         const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152         const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153         const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154         const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
155         const int C6 = 8867;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
156         const int C7 = 4520;  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
157
158 row[0] = input[0];
159 row[2] = input[1];
160 row[4] = input[4];
161 row[6] = input[5];
162 row[1] = input[8];
163 row[3] = input[9];
164 row[5] = input[12];
165 row[7] = input[13];
166
167         if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
168                 row[0] = row[1] = row[2] = row[3] = row[4] =
169                         row[5] = row[6] = row[7] = row[0]<<3;
170         output[0]  = row[0];
171         output[2]  = row[1];
172         output[4]  = row[2];
173         output[6]  = row[3];
174         output[8]  = row[4];
175         output[10] = row[5];
176         output[12] = row[6];
177         output[14] = row[7];
178                 return;
179         }
180
181         a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
182         a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
183         a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
184         a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
185
186         b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
187         b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
188         b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
189         b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
190
191         row[0] = (a0 + b0) >> ROW_SHIFT;
192         row[1] = (a1 + b1) >> ROW_SHIFT;
193         row[2] = (a2 + b2) >> ROW_SHIFT;
194         row[3] = (a3 + b3) >> ROW_SHIFT;
195         row[4] = (a3 - b3) >> ROW_SHIFT;
196         row[5] = (a2 - b2) >> ROW_SHIFT;
197         row[6] = (a1 - b1) >> ROW_SHIFT;
198         row[7] = (a0 - b0) >> ROW_SHIFT;
199
200         output[0]  = row[0];
201         output[2]  = row[1];
202         output[4]  = row[2];
203         output[6]  = row[3];
204         output[8]  = row[4];
205         output[10] = row[5];
206         output[12] = row[6];
207         output[14] = row[7];
208 }
209 #endif
210
211 static inline void idct(int16_t *block)
212 {
213         DECLARE_ALIGNED(8, int64_t, align_tmp[16]);
214         int16_t * const temp= (int16_t*)align_tmp;
215
216         __asm__ volatile(
217 #if 0 //Alternative, simpler variant
218
219 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
220         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
221         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
222         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
223         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
224         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
225         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
226         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
227         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
228         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
229         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
230         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
231         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
232         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
233         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
234         #rounder ", %%mm4               \n\t"\
235         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
236         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
237         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
238         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
239         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
240         #rounder ", %%mm0               \n\t"\
241         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
242         "paddd %%mm0, %%mm0             \n\t" \
243         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
244         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
245         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
246         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
247         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
248         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
249         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
250         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
251         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
252         "psrad $" #shift ", %%mm7       \n\t"\
253         "psrad $" #shift ", %%mm4       \n\t"\
254         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
255         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
256         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
257         "psrad $" #shift ", %%mm1       \n\t"\
258         "psrad $" #shift ", %%mm2       \n\t"\
259         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
260         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
261         "movq %%mm7, " #dst "           \n\t"\
262         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
263         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
264         "movq %%mm2, 24+" #dst "        \n\t"\
265         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
266         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
267         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
268         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
269         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
270         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
271         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
272         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
273         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
274         "psrad $" #shift ", %%mm2       \n\t"\
275         "psrad $" #shift ", %%mm0       \n\t"\
276         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
277         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
278         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
279         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
280         "psrad $" #shift ", %%mm6       \n\t"\
281         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
282         "movq %%mm2, 8+" #dst "         \n\t"\
283         "psrad $" #shift ", %%mm4       \n\t"\
284         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
285         "movq %%mm4, 16+" #dst "        \n\t"\
286
287 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
288         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
289         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
290         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
291         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
292         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
293         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
294         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
295         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
296         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
297         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
298         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
299         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
300         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
301         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
302         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
303         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
304         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
305         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
306         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
307         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
308         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
309         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
310         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
311         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
312         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
313         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
314         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
315         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
316         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
317         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
318         "psrad $" #shift ", %%mm7       \n\t"\
319         "psrad $" #shift ", %%mm4       \n\t"\
320         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
321         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
322         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
323         "psrad $" #shift ", %%mm0       \n\t"\
324         "psrad $" #shift ", %%mm2       \n\t"\
325         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
326         "movd %%mm7, " #dst "           \n\t"\
327         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
328         "movd %%mm0, 16+" #dst "        \n\t"\
329         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
330         "movd %%mm2, 96+" #dst "        \n\t"\
331         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
332         "movd %%mm4, 112+" #dst "       \n\t"\
333         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
334         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
335         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
336         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
337         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
338         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
339         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
340         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
341         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
342         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
343         "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
344         "psrad $" #shift ", %%mm2       \n\t"\
345         "psrad $" #shift ", %%mm5       \n\t"\
346         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
347         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
348         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
349         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
350         "psrad $" #shift ", %%mm6       \n\t"\
351         "psrad $" #shift ", %%mm4       \n\t"\
352         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
353         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
354         "movd %%mm2, 32+" #dst "        \n\t"\
355         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
356         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
357         "movd %%mm6, 48+" #dst "        \n\t"\
358         "movd %%mm4, 64+" #dst "        \n\t"\
359         "movd %%mm5, 80+" #dst "        \n\t"\
360
361
362 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
363         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
364         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
365         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
366         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
367         "movq "MANGLE(wm1010)", %%mm4   \n\t"\
368         "pand %%mm0, %%mm4              \n\t"\
369         "por %%mm1, %%mm4               \n\t"\
370         "por %%mm2, %%mm4               \n\t"\
371         "por %%mm3, %%mm4               \n\t"\
372         "packssdw %%mm4,%%mm4           \n\t"\
373         "movd %%mm4, %%eax              \n\t"\
374         "orl %%eax, %%eax               \n\t"\
375         "jz 1f                          \n\t"\
376         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
377         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
378         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
379         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
380         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
381         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
382         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
383         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
384         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
385         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
386         #rounder ", %%mm4               \n\t"\
387         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
388         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
389         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
390         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
391         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
392         #rounder ", %%mm0               \n\t"\
393         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
394         "paddd %%mm0, %%mm0             \n\t" \
395         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
396         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
397         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
398         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
399         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
400         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
401         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
402         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
403         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
404         "psrad $" #shift ", %%mm7       \n\t"\
405         "psrad $" #shift ", %%mm4       \n\t"\
406         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
407         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
408         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
409         "psrad $" #shift ", %%mm1       \n\t"\
410         "psrad $" #shift ", %%mm2       \n\t"\
411         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
412         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
413         "movq %%mm7, " #dst "           \n\t"\
414         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
415         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
416         "movq %%mm2, 24+" #dst "        \n\t"\
417         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
418         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
419         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
420         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
421         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
422         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
423         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
424         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
425         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
426         "psrad $" #shift ", %%mm2       \n\t"\
427         "psrad $" #shift ", %%mm0       \n\t"\
428         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
429         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
430         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
431         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
432         "psrad $" #shift ", %%mm6       \n\t"\
433         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
434         "movq %%mm2, 8+" #dst "         \n\t"\
435         "psrad $" #shift ", %%mm4       \n\t"\
436         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
437         "movq %%mm4, 16+" #dst "        \n\t"\
438         "jmp 2f                         \n\t"\
439         "1:                             \n\t"\
440         "pslld $16, %%mm0               \n\t"\
441         "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
442         "psrad $13, %%mm0               \n\t"\
443         "packssdw %%mm0, %%mm0          \n\t"\
444         "movq %%mm0, " #dst "           \n\t"\
445         "movq %%mm0, 8+" #dst "         \n\t"\
446         "movq %%mm0, 16+" #dst "        \n\t"\
447         "movq %%mm0, 24+" #dst "        \n\t"\
448         "2:                             \n\t"
449
450
451 //IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
452 ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
453 /*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
454 ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
455 ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
456
457 DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
458 DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
459 DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
460
461
462 //IDCT(      src0,   src4,   src1,    src5,    dst, shift)
463 COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
464 COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
465 COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
466 COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
467
468 #else
469
470 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
471         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
472         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
473         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
474         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
475         "movq "MANGLE(wm1010)", %%mm4   \n\t"\
476         "pand %%mm0, %%mm4              \n\t"\
477         "por %%mm1, %%mm4               \n\t"\
478         "por %%mm2, %%mm4               \n\t"\
479         "por %%mm3, %%mm4               \n\t"\
480         "packssdw %%mm4,%%mm4           \n\t"\
481         "movd %%mm4, %%eax              \n\t"\
482         "orl %%eax, %%eax               \n\t"\
483         "jz 1f                          \n\t"\
484         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
485         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
486         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
487         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
488         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
489         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
490         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
491         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
492         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
493         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
494         #rounder ", %%mm4               \n\t"\
495         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
496         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
497         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
498         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
499         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
500         #rounder ", %%mm0               \n\t"\
501         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
502         "paddd %%mm0, %%mm0             \n\t" \
503         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
504         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
505         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
506         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
507         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
508         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
509         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
510         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
511         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
512         "psrad $" #shift ", %%mm7       \n\t"\
513         "psrad $" #shift ", %%mm4       \n\t"\
514         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
515         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
516         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
517         "psrad $" #shift ", %%mm1       \n\t"\
518         "psrad $" #shift ", %%mm2       \n\t"\
519         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
520         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
521         "movq %%mm7, " #dst "           \n\t"\
522         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
523         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
524         "movq %%mm2, 24+" #dst "        \n\t"\
525         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
526         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
527         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
528         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
529         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
530         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
531         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
532         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
533         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
534         "psrad $" #shift ", %%mm2       \n\t"\
535         "psrad $" #shift ", %%mm0       \n\t"\
536         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
537         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
538         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
539         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
540         "psrad $" #shift ", %%mm6       \n\t"\
541         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
542         "movq %%mm2, 8+" #dst "         \n\t"\
543         "psrad $" #shift ", %%mm4       \n\t"\
544         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
545         "movq %%mm4, 16+" #dst "        \n\t"\
546         "jmp 2f                         \n\t"\
547         "1:                             \n\t"\
548         "pslld $16, %%mm0               \n\t"\
549         "paddd "MANGLE(d40000)", %%mm0  \n\t"\
550         "psrad $13, %%mm0               \n\t"\
551         "packssdw %%mm0, %%mm0          \n\t"\
552         "movq %%mm0, " #dst "           \n\t"\
553         "movq %%mm0, 8+" #dst "         \n\t"\
554         "movq %%mm0, 16+" #dst "        \n\t"\
555         "movq %%mm0, 24+" #dst "        \n\t"\
556         "2:                             \n\t"
557
/*
 * Z_COND_IDCT: one row-pass step that handles two rows at once and is
 * skipped entirely when its input is all zero.
 *
 * The four quadwords src0/src4/src1/src5 are loaded into mm0..mm3; each
 * holds two coefficient words from each of two rows (written R / r in the
 * comments below).  They are ORed together and, if the result is zero, the
 * macro jumps to the local label `bt` without storing anything to dst.
 * Otherwise the even part (A0..A3, from R0/R2/R4/R6) and the odd part
 * (B0..B3, from R1/R3/R5/R7) are formed with pmaddwd against the constants
 * at 16(%2)..104(%2), `rounder` is applied to both C4 accumulators, every
 * sum/difference is shifted right arithmetically by `shift`, packed to
 * signed words with saturation and stored as four quadwords at dst, 8+dst,
 * 16+dst and 24+dst.
 *
 * rounder is the leading part of an instruction (e.g. paddd (%2)); the
 * macro appends the destination register.  shift and bt are stringified,
 * so they must be passed as literal text.  Uses mm0-mm7 and eax.
 */
558 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
559         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
560         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
561         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
562         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
563         "movq %%mm0, %%mm4              \n\t" /* mm4 = OR of all four input quadwords */\
564         "por %%mm1, %%mm4               \n\t"\
565         "por %%mm2, %%mm4               \n\t"\
566         "por %%mm3, %%mm4               \n\t"\
567         "packssdw %%mm4,%%mm4           \n\t" /* fold to 32 bits; saturation keeps non-zero non-zero */\
568         "movd %%mm4, %%eax              \n\t"\
569         "orl %%eax, %%eax               \n\t"\
570         "jz " #bt "                     \n\t" /* all inputs zero: skip, nothing is stored to dst */\
571         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
572         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
573         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
574         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
575         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
576         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
577         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
578         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
579         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
580         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
581         #rounder ", %%mm4               \n\t" /* apply rounder to C4R4+C4R0 */\
582         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
583         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
584         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
585         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
586         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
587         #rounder ", %%mm0               \n\t" /* apply rounder to -C4R4+C4R0 */\
588         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
589         "paddd %%mm0, %%mm0             \n\t" /* 2*(-C4R4+C4R0), rounder included */\
590         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
591         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
592         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
593         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
594         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
595         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
596         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
597         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
598         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
599         "psrad $" #shift ", %%mm7       \n\t"\
600         "psrad $" #shift ", %%mm4       \n\t"\
601         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
602         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
603         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
604         "psrad $" #shift ", %%mm1       \n\t"\
605         "psrad $" #shift ", %%mm2       \n\t"\
606         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
607         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
608         "movq %%mm7, " #dst "           \n\t"\
609         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
610         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
611         "movq %%mm2, 24+" #dst "        \n\t"\
612         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
613         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
614         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
615         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
616         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
617         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
618         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
619         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
620         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
621         "psrad $" #shift ", %%mm2       \n\t"\
622         "psrad $" #shift ", %%mm0       \n\t"\
623         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
624         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
625         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
626         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
627         "psrad $" #shift ", %%mm6       \n\t"\
628         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
629         "movq %%mm2, 8+" #dst "         \n\t"\
630         "psrad $" #shift ", %%mm4       \n\t"\
631         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
632         "movq %%mm4, 16+" #dst "        \n\t"\
633
/*
 * ROW_IDCT: unconditional row-pass step for two rows at once.
 *
 * Loads src0/src4/src1/src5 into mm0..mm3 (two coefficient words from each
 * of two rows per quadword, written R / r below), builds the even part
 * A0..A3 and the odd part B0..B3 with pmaddwd against the constants at
 * 16(%2)..104(%2), applies `rounder` to both C4 accumulators, shifts every
 * sum/difference right arithmetically by `shift`, packs to signed words
 * with saturation and stores four quadwords at dst, 8+dst, 16+dst, 24+dst.
 *
 * There is no all-zero test and no branch; only mm0-mm7 are used.
 * rounder is the leading part of an instruction (the macro appends the
 * destination register); shift is stringified and must be a literal.
 */
634 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
635         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
636         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
637         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
638         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
639         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
640         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
641         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
642         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
643         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
644         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
645         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
646         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
647         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
648         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
649         #rounder ", %%mm4               \n\t" /* apply rounder to C4R4+C4R0 */\
650         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
651         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
652         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
653         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
654         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
655         #rounder ", %%mm0               \n\t" /* apply rounder to -C4R4+C4R0 */\
656         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
657         "paddd %%mm0, %%mm0             \n\t" /* 2*(-C4R4+C4R0), rounder included */\
658         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
659         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
660         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
661         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
662         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
663         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
664         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
665         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
666         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
667         "psrad $" #shift ", %%mm7       \n\t"\
668         "psrad $" #shift ", %%mm4       \n\t"\
669         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
670         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
671         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
672         "psrad $" #shift ", %%mm1       \n\t"\
673         "psrad $" #shift ", %%mm2       \n\t"\
674         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
675         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
676         "movq %%mm7, " #dst "           \n\t"\
677         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
678         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
679         "movq %%mm2, 24+" #dst "        \n\t"\
680         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
681         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
682         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
683         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
684         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
685         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
686         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
687         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
688         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
689         "psrad $" #shift ", %%mm2       \n\t"\
690         "psrad $" #shift ", %%mm0       \n\t"\
691         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
692         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
693         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
694         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
695         "psrad $" #shift ", %%mm6       \n\t"\
696         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
697         "movq %%mm2, 8+" #dst "         \n\t"\
698         "psrad $" #shift ", %%mm4       \n\t"\
699         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
700         "movq %%mm4, 16+" #dst "        \n\t"\
701
/*
 * Row pass: four steps of 32 bytes each, reading through %0 and writing
 * through %1 at the same offsets; constants are addressed through %2.
 * NOTE(review): the asm operand list is outside this chunk, so what %0,
 * %1 and %2 are bound to should be confirmed there.
 *
 * The first step has no skip target.  Each Z_COND_IDCT step jumps to its
 * last argument when its 32 input bytes are all zero, leaving its dst
 * untouched:
 *   4f - bytes 32..63 of the input are zero
 *   2f - bytes 64..95 are zero (32..63 were not)
 *   1f - bytes 96..127 are zero (32..95 were not)
 * Falling through all three means every step stored its result.
 * The literal 11 is the row shift; it cannot be a macro name because the
 * argument is stringified.
 */
702 //IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
703 DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
704 Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
705 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
706 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
707
/*
 * IDCT, column-pass variant for the general case: all four sources are
 * read.  This definition is the one in effect on the fall-through path,
 * i.e. when none of the Z_COND_IDCT row steps above branched away.
 *
 * src0/src4 feed the even part (A0..A3) and src1/src5 the odd part
 * (B0..B3); multipliers come from 16(%2)..104(%2).  No rounding constant
 * is added in this macro.  Every sum/difference is shifted right
 * arithmetically by `shift`, packed to signed 16-bit with saturation, and
 * its low 32 bits (one result from each 32-bit lane) are stored with movd:
 *   A0+B0 -> dst      A1+B1 -> 16+dst   A2+B2 -> 32+dst   A3+B3 -> 48+dst
 *   A3-B3 -> 64+dst   A2-B2 -> 80+dst   A1-B1 -> 96+dst   A0-B0 -> 112+dst
 *
 * The #undef is there because IDCT is redefined for every variant.
 * shift is stringified and must be a literal.  Uses mm0-mm7.
 */
708 #undef IDCT
709 #define IDCT(src0, src4, src1, src5, dst, shift) \
710         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
711         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
712         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
713         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
714         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
715         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
716         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
717         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
718         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
719         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
720         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
721         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
722         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
723         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
724         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
725         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
726         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
727         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
728         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
729         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
730         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
731         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
732         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
733         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
734         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
735         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
736         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
737         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
738         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
739         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
740         "psrad $" #shift ", %%mm7       \n\t"\
741         "psrad $" #shift ", %%mm4       \n\t"\
742         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
743         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
744         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
745         "psrad $" #shift ", %%mm0       \n\t"\
746         "psrad $" #shift ", %%mm2       \n\t"\
747         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
748         "movd %%mm7, " #dst "           \n\t"\
749         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
750         "movd %%mm0, 16+" #dst "        \n\t"\
751         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
752         "movd %%mm2, 96+" #dst "        \n\t"\
753         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
754         "movd %%mm4, 112+" #dst "       \n\t"\
755         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
756         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
757         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
758         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
759         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
760         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
761         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
762         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
763         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
764         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
765         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
766         "psrad $" #shift ", %%mm2       \n\t"\
767         "psrad $" #shift ", %%mm5       \n\t"\
768         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
769         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
770         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
771         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
772         "psrad $" #shift ", %%mm6       \n\t"\
773         "psrad $" #shift ", %%mm4       \n\t"\
774         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
775         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
776         "movd %%mm2, 32+" #dst "        \n\t"\
777         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
778         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
779         "movd %%mm6, 48+" #dst "        \n\t"\
780         "movd %%mm4, 64+" #dst "        \n\t"\
781         "movd %%mm5, 80+" #dst "        \n\t"
782
783
/*
 * Column pass for the case where every row step stored its result.
 * Four steps, each reading src0/src4/src1/src5 at +0/+64/+32/+96 bytes
 * through %1 (advancing 8 bytes per step) and writing through %0
 * (advancing 4 bytes per step).  The literal 20 is the column shift.
 * Afterwards control jumps forward to label 9, which is defined outside
 * this chunk.
 */
784 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
785 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
786 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
787 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
788 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
789         "jmp 9f                         \n\t"
790
/*
 * Label 4: entered from the row pass when input bytes 32..63 were all
 * zero, so 32(%1)..63(%1) has not been written.  The two remaining row
 * steps run here; they branch to 6f / 5f when their own input is zero.
 * NOTE(review): the "#" literal is concatenated in front of ASMALIGN(4);
 * presumably that turns the alignment directive into an assembler comment
 * (alignment disabled) - confirm against ASMALIGN's definition.  The
 * trailing backslash only splices the label line onto this one.
 */
791         "#" ASMALIGN(4)                      \
792         "4:                             \n\t"
793 Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
794 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
795
/*
 * IDCT, column-pass variant that never reads src1 (the parameter is kept
 * so all variants share one call signature, but it is unused here).
 * It is the definition in effect for the path that starts at label 4,
 * where the row step writing 32(%1)..63(%1) was skipped.
 *
 * Even part A0..A3 is computed from src0 and src4 as usual.  The odd part
 * has only the src5 terms:
 *   B0 = C7R7+C5R5   B1 = -C5R7-C1R5   B2 = C3R7+C7R5   B3 = -C1R7+C3R5
 * Results are shifted right arithmetically by `shift`, packed to signed
 * 16-bit with saturation and stored with movd (4 bytes each) at dst,
 * 16+dst, 32+dst, 48+dst, 64+dst, 80+dst, 96+dst and 112+dst.
 */
796 #undef IDCT
797 #define IDCT(src0, src4, src1, src5, dst, shift) \
798         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
799         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
800         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
801         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
802         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
803         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
804         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
805         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
806         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
807         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
808         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
809         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
810         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
811         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
812         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
813         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
814         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
815         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
816         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
817         "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
818         "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
819         "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
820         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
821         "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
822         "psrad $" #shift ", %%mm1       \n\t"\
823         "psrad $" #shift ", %%mm4       \n\t"\
824         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
825         "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
826         "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
827         "psrad $" #shift ", %%mm0       \n\t"\
828         "psrad $" #shift ", %%mm2       \n\t"\
829         "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
830         "movd %%mm1, " #dst "           \n\t"\
831         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
832         "movd %%mm0, 16+" #dst "        \n\t"\
833         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
834         "movd %%mm2, 96+" #dst "        \n\t"\
835         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
836         "movd %%mm4, 112+" #dst "       \n\t"\
837         "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
838         "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
839         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
840         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
841         "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
842         "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
843         "psrad $" #shift ", %%mm2       \n\t"\
844         "psrad $" #shift ", %%mm5       \n\t"\
845         "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
846         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
847         "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
848         "psrad $" #shift ", %%mm6       \n\t"\
849         "psrad $" #shift ", %%mm1       \n\t"\
850         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
851         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
852         "movd %%mm2, 32+" #dst "        \n\t"\
853         "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
854         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
855         "movd %%mm6, 48+" #dst "        \n\t"\
856         "movd %%mm1, 64+" #dst "        \n\t"\
857         "movd %%mm5, 80+" #dst "        \n\t"
858
/*
 * Column pass for the path where only the 32(%1)..63(%1) row step was
 * skipped: the IDCT definition in effect here does not read its src1
 * argument, so the unwritten region is never loaded.  Four steps, then a
 * forward jump to label 9 (defined outside this chunk).
 */
859 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
860 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
861 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
862 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
863 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
864         "jmp 9f                         \n\t"
865
/*
 * Label 6: reached via 6f, i.e. input bytes 32..63 and 64..95 were both
 * all zero, so neither 32(%1)..63(%1) nor 64(%1)..95(%1) was written.
 * The last row step runs here and branches to 7f when its input is zero
 * as well.
 * NOTE(review): "#" in front of ASMALIGN(4) presumably comments the
 * alignment directive out - confirm against ASMALIGN's definition.
 */
866         "#" ASMALIGN(4)                      \
867         "6:                             \n\t"
868 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
869
/*
 * IDCT, column-pass variant that reads only src0 and src5; the src4 and
 * src1 parameters are accepted but unused.  It is the definition in
 * effect after label 6, where the row steps writing 32(%1)..63(%1) and
 * 64(%1)..95(%1) were both skipped.
 *
 * With no src4 contribution the even part collapses to two values:
 *   A0 = A3 = C4R4+C4R0      A1 = A2 = -C4R4+C4R0
 * and the odd part has only the src5 terms:
 *   B0 = C7R7+C5R5   B1 = -C5R7-C1R5   B2 = C3R7+C7R5   B3 = -C1R7+C3R5
 * Results are shifted right arithmetically by `shift`, packed to signed
 * 16-bit with saturation and stored with movd (4 bytes each) at dst,
 * 16+dst, 32+dst, 48+dst, 64+dst, 80+dst, 96+dst and 112+dst.
 */
870 #undef IDCT
871 #define IDCT(src0, src4, src1, src5, dst, shift) \
872         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
873         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
874         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
875         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
876         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
877         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
878         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
879         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
880         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
881         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
882         "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
883         "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
884         "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
885         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
886         "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
887         "psrad $" #shift ", %%mm1       \n\t"\
888         "psrad $" #shift ", %%mm4       \n\t"\
889         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
890         "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
891         "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
892         "psrad $" #shift ", %%mm0       \n\t"\
893         "psrad $" #shift ", %%mm2       \n\t"\
894         "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
895         "movd %%mm1, " #dst "           \n\t"\
896         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
897         "movd %%mm0, 16+" #dst "        \n\t"\
898         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
899         "movd %%mm2, 96+" #dst "        \n\t"\
900         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
901         "movd %%mm4, 112+" #dst "       \n\t"\
902         "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
903         "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
904         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
905         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
906         "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
907         "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
908         "psrad $" #shift ", %%mm2       \n\t"\
909         "psrad $" #shift ", %%mm5       \n\t"\
910         "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
911         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
912         "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
913         "psrad $" #shift ", %%mm6       \n\t"\
914         "psrad $" #shift ", %%mm1       \n\t"\
915         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
916         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
917         "movd %%mm2, 32+" #dst "        \n\t"\
918         "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
919         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
920         "movd %%mm6, 48+" #dst "        \n\t"\
921         "movd %%mm1, 64+" #dst "        \n\t"\
922         "movd %%mm5, 80+" #dst "        \n\t"
923
924
/*
 * Column pass for the path where the row steps writing 32(%1)..63(%1) and
 * 64(%1)..95(%1) were both skipped: the IDCT definition in effect here
 * reads only its src0 and src5 arguments.  Four steps, then a forward
 * jump to label 9 (defined outside this chunk).
 */
925 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
926 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
927 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
928 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
929 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
930         "jmp 9f                         \n\t"
931
/*
 * Label 2: reached via 2f from the third row step, i.e. input bytes
 * 64..95 were all zero while 32..63 were not, so only 64(%1)..95(%1) is
 * unwritten.  The last row step runs here and branches to 3f when its
 * input is zero as well.
 * NOTE(review): "#" in front of ASMALIGN(4) presumably comments the
 * alignment directive out - confirm against ASMALIGN's definition.
 */
932         "#" ASMALIGN(4)                      \
933         "2:                             \n\t"
934 Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
935
/*
 * IDCT, column-pass variant that never reads src4 (the parameter is
 * accepted but unused).  It is the definition in effect after label 2,
 * where the row step writing 64(%1)..95(%1) was skipped.
 *
 * With no src4 contribution the even part collapses to two values:
 *   A0 = A3 = C4R4+C4R0      A1 = A2 = -C4R4+C4R0
 * The odd part B0..B3 is computed in full from src1 and src5.
 * Results are shifted right arithmetically by `shift`, packed to signed
 * 16-bit with saturation and stored with movd (4 bytes each) at dst,
 * 16+dst, 32+dst, 48+dst, 64+dst, 80+dst, 96+dst and 112+dst.
 */
936 #undef IDCT
937 #define IDCT(src0, src4, src1, src5, dst, shift) \
938         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
939         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
940         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
941         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
942         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
943         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
944         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
945         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
946         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
947         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
948         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
949         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
950         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
951         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
952         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
953         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
954         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
955         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
956         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
957         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
958         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
959         "psrad $" #shift ", %%mm7       \n\t"\
960         "psrad $" #shift ", %%mm4       \n\t"\
961         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
962         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
963         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
964         "psrad $" #shift ", %%mm0       \n\t"\
965         "psrad $" #shift ", %%mm2       \n\t"\
966         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
967         "movd %%mm7, " #dst "           \n\t"\
968         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
969         "movd %%mm0, 16+" #dst "        \n\t"\
970         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
971         "movd %%mm2, 96+" #dst "        \n\t"\
972         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
973         "movd %%mm4, 112+" #dst "       \n\t"\
974         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
975         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
976         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
977         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
978         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
979         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
980         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
981         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
982         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
983         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
984         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
985         "psrad $" #shift ", %%mm2       \n\t"\
986         "psrad $" #shift ", %%mm5       \n\t"\
987         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
988         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
989         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
990         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
991         "psrad $" #shift ", %%mm6       \n\t"\
992         "psrad $" #shift ", %%mm4       \n\t"\
993         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
994         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
995         "movd %%mm2, 32+" #dst "        \n\t"\
996         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
997         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
998         "movd %%mm6, 48+" #dst "        \n\t"\
999         "movd %%mm4, 64+" #dst "        \n\t"\
1000         "movd %%mm5, 80+" #dst "        \n\t"
1001
/*
 * Column pass for the path where only the row step writing
 * 64(%1)..95(%1) was skipped: the IDCT definition in effect here does not
 * read its src4 argument.  Four steps, then a forward jump to label 9
 * (defined outside this chunk).
 */
1002 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1003 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1004 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1005 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1006 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1007         "jmp 9f                         \n\t"
1008
/*
 * Label 3: reached via 3f, i.e. input bytes 64..95 and 96..127 were both
 * all zero (32..63 were not), so 64(%1)..127(%1) has not been written.
 * No row step is left; the column pass follows directly.
 * NOTE(review): "#" in front of ASMALIGN(4) presumably comments the
 * alignment directive out - confirm against ASMALIGN's definition.
 */
1009         "#" ASMALIGN(4)                      \
1010         "3:                             \n\t"
/*
 * IDCT, column-pass variant that reads only src0 and src1; the src4 and
 * src5 parameters are accepted but unused.  It is the definition in
 * effect after label 3, where the row steps writing 64(%1)..95(%1) and
 * 96(%1)..127(%1) were both skipped.
 *
 * With no src4 contribution the even part collapses to two values:
 *   A0 = A3 = C4R4+C4R0      A1 = A2 = -C4R4+C4R0
 * and the odd part has only the src1 terms:
 *   B0 = C3R3+C1R1   B1 = -C7R3+C3R1   B2 = -C1R3+C5R1   B3 = -C5R3+C7R1
 * Results are shifted right arithmetically by `shift`, packed to signed
 * 16-bit with saturation and stored with movd (4 bytes each) at dst,
 * 16+dst, 32+dst, 48+dst, 64+dst, 80+dst, 96+dst and 112+dst.
 */
1011 #undef IDCT
1012 #define IDCT(src0, src4, src1, src5, dst, shift) \
1013         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1014         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1015         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1016         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1017         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1018         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1019         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1020         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1021         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1022         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1023         "movq 64(%2), %%mm3             \n\t" /* -C7    C3      -C7     C3 */\
1024         "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1025         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1026         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1027         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1028         "psrad $" #shift ", %%mm7       \n\t"\
1029         "psrad $" #shift ", %%mm4       \n\t"\
1030         "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
1031         "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1032         "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
1033         "psrad $" #shift ", %%mm0       \n\t"\
1034         "psrad $" #shift ", %%mm1       \n\t"\
1035         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1036         "movd %%mm7, " #dst "           \n\t"\
1037         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1038         "movd %%mm0, 16+" #dst "        \n\t"\
1039         "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
1040         "movd %%mm1, 96+" #dst "        \n\t"\
1041         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1042         "movd %%mm4, 112+" #dst "       \n\t"\
1043         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1044         "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1045         "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1046         "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
1047         "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
1048         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
1049         "psrad $" #shift ", %%mm1       \n\t"\
1050         "psrad $" #shift ", %%mm5       \n\t"\
1051         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1052         "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1053         "psubd %%mm2, %%mm4             \n\t" /* A3-B3          a3-b3 */\
1054         "psrad $" #shift ", %%mm6       \n\t"\
1055         "psrad $" #shift ", %%mm4       \n\t"\
1056         "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
1057         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1058         "movd %%mm1, 32+" #dst "        \n\t"\
1059         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1060         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1061         "movd %%mm6, 48+" #dst "        \n\t"\
1062         "movd %%mm4, 64+" #dst "        \n\t"\
1063         "movd %%mm5, 80+" #dst "        \n\t"
1064
1065
1066 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1067 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1068 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1069 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1070 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1071         "jmp 9f                         \n\t"
1072
1073         "#" ASMALIGN(4)                      \
1074         "5:                             \n\t"
1075 #undef IDCT
1076 #define IDCT(src0, src4, src1, src5, dst, shift) \
1077         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1078         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1079         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1080         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1081         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1082         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1083         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1084         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1085         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1086         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1087         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1088         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1089         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1090         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1091         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1092         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1093         "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1094         "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
1095         "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1096         "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1097         "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1098         "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1099         "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1100         "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1101         "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1102         "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
1103         "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
1104         "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
1105         "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
1106         "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
1107         "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
1108         "psrad $" #shift ", %%mm4       \n\t"\
1109         "psrad $" #shift ", %%mm7       \n\t"\
1110         "psrad $" #shift ", %%mm3       \n\t"\
1111         "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
1112         "movq %%mm4, " #dst "           \n\t"\
1113         "psrad $" #shift ", %%mm0       \n\t"\
1114         "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
1115         "movq %%mm0, 16+" #dst "        \n\t"\
1116         "movq %%mm0, 96+" #dst "        \n\t"\
1117         "movq %%mm4, 112+" #dst "       \n\t"\
1118         "psrad $" #shift ", %%mm5       \n\t"\
1119         "psrad $" #shift ", %%mm6       \n\t"\
1120         "psrad $" #shift ", %%mm2       \n\t"\
1121         "packssdw %%mm2, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1122         "movq %%mm5, 32+" #dst "        \n\t"\
1123         "psrad $" #shift ", %%mm1       \n\t"\
1124         "packssdw %%mm1, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1125         "movq %%mm6, 48+" #dst "        \n\t"\
1126         "movq %%mm6, 64+" #dst "        \n\t"\
1127         "movq %%mm5, 80+" #dst "        \n\t"
1128
1129
1130 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1131 IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1132 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1133 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1134 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1135         "jmp 9f                         \n\t"
1136
1137
1138         "#" ASMALIGN(4)                      \
1139         "1:                             \n\t"
1140 #undef IDCT
1141 #define IDCT(src0, src4, src1, src5, dst, shift) \
1142         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1143         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1144         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1145         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1146         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1147         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1148         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1149         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1150         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1151         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1152         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1153         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1154         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1155         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1156         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1157         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1158         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1159         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1160         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1161         "movq 64(%2), %%mm1             \n\t"\
1162         "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1163         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1164         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1165         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1166         "psrad $" #shift ", %%mm7       \n\t"\
1167         "psrad $" #shift ", %%mm4       \n\t"\
1168         "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
1169         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1170         "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
1171         "psrad $" #shift ", %%mm0       \n\t"\
1172         "psrad $" #shift ", %%mm3       \n\t"\
1173         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1174         "movd %%mm7, " #dst "           \n\t"\
1175         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1176         "movd %%mm0, 16+" #dst "        \n\t"\
1177         "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
1178         "movd %%mm3, 96+" #dst "        \n\t"\
1179         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1180         "movd %%mm4, 112+" #dst "       \n\t"\
1181         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1182         "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1183         "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1184         "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
1185         "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
1186         "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1187         "psrad $" #shift ", %%mm3       \n\t"\
1188         "psrad $" #shift ", %%mm5       \n\t"\
1189         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1190         "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1191         "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1192         "psrad $" #shift ", %%mm6       \n\t"\
1193         "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
1194         "movd %%mm3, 32+" #dst "        \n\t"\
1195         "psrad $" #shift ", %%mm4       \n\t"\
1196         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1197         "movd %%mm6, 48+" #dst "        \n\t"\
1198         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1199         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1200         "movd %%mm4, 64+" #dst "        \n\t"\
1201         "movd %%mm5, 80+" #dst "        \n\t"
1202
1203
1204 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1205 IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1206 IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1207 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1208 IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1209         "jmp 9f                         \n\t"
1210
1211
1212         "#" ASMALIGN(4)
1213         "7:                             \n\t"
1214 #undef IDCT
1215 #define IDCT(src0, src4, src1, src5, dst, shift) \
1216         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1217         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1218         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1219         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1220         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1221         "psrad $" #shift ", %%mm4       \n\t"\
1222         "psrad $" #shift ", %%mm0       \n\t"\
1223         "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1224         "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1225         "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1226         "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1227         "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1228         "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1229         "psrad $" #shift ", %%mm1       \n\t"\
1230         "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
1231         "movq %%mm4, " #dst "           \n\t"\
1232         "psrad $" #shift ", %%mm2       \n\t"\
1233         "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
1234         "movq %%mm0, 16+" #dst "        \n\t"\
1235         "movq %%mm0, 96+" #dst "        \n\t"\
1236         "movq %%mm4, 112+" #dst "       \n\t"\
1237         "movq %%mm0, 32+" #dst "        \n\t"\
1238         "movq %%mm4, 48+" #dst "        \n\t"\
1239         "movq %%mm4, 64+" #dst "        \n\t"\
1240         "movq %%mm0, 80+" #dst "        \n\t"
1241
1242 //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1243 IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1244 //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1245 IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1246 //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1247
1248
1249 #endif
1250
1251 /*
1252 Input
1253  00 40 04 44 20 60 24 64
1254  10 30 14 34 50 70 54 74
1255  01 41 03 43 21 61 23 63
1256  11 31 13 33 51 71 53 73
1257  02 42 06 46 22 62 26 66
1258  12 32 16 36 52 72 56 76
1259  05 45 07 47 25 65 27 67
1260  15 35 17 37 55 75 57 77
1261
1262 Temp
1263  00 04 10 14 20 24 30 34
1264  40 44 50 54 60 64 70 74
1265  01 03 11 13 21 23 31 33
1266  41 43 51 53 61 63 71 73
1267  02 06 12 16 22 26 32 36
1268  42 46 52 56 62 66 72 76
1269  05 07 15 17 25 27 35 37
1270  45 47 55 57 65 67 75 77
1271 */
1272
1273 "9: \n\t"
1274                 :: "r" (block), "r" (temp), "r" (coeffs)
1275                 : "%eax"
1276         );
1277 }
1278
/*
 * Public entry point for the MMX simple IDCT.
 * Forwards to idct(), whose asm stores its output through the block pointer,
 * so the coefficients in block are overwritten with the transform result.
 * NOTE(review): this takes int16_t* while the put/add variants take DCTELEM*
 * -- presumably the same underlying type; confirm against dsputil.h.
 */
1279 void ff_simple_idct_mmx(int16_t *block)
1280 {
1281     idct(block);
1282 }
1283
1284 //FIXME merge add/put into the idct
1285
/*
 * Runs idct() on block, then hands the result to put_pixels_clamped_mmx()
 * together with dest and line_size.
 * NOTE(review): put_pixels_clamped_mmx is defined elsewhere; presumably it
 * clamps each value to the 8-bit pixel range and stores it to dest using
 * line_size as the row stride -- confirm against its definition.
 */
1286 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1287 {
1288     idct(block);   /* block now holds the IDCT output */
1289     put_pixels_clamped_mmx(block, dest, line_size);
1290 }
/*
 * Runs idct() on block, then hands the result to add_pixels_clamped_mmx()
 * together with dest and line_size.
 * NOTE(review): add_pixels_clamped_mmx is defined elsewhere; presumably it
 * adds each value to the existing pixel in dest, clamps to the 8-bit range
 * and uses line_size as the row stride -- confirm against its definition.
 */
1291 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1292 {
1293     idct(block);   /* block now holds the IDCT output */
1294     add_pixels_clamped_mmx(block, dest, line_size);
1295 }