4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavcodec/dsputil.h"
23 #include "libavcodec/simple_idct.h"
24 #include "dsputil_mmx.h"
/*
 * Fixed-point cosine factors. The digit in each name is the index i of the
 * formula in the trailing comment: Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14).
 * Every value is rounded with +0.5 except C4, which uses -0.5 and is
 * therefore 16383 rather than 16384.
 */
36 #define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37 #define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
38 #define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
39 #define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
40 #define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
41 #define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42 #define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
43 #define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/* Shift count of the column pass; the macro invocations in this file pass the literal 20. */
46 #define COL_SHIFT 20 // 6
/* 64-bit mask that keeps only the upper 16-bit word of each 32-bit half. */
48 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
/* 64-bit constant holding 0x40000 in its low 32-bit half and zero in the high half. */
49 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
51 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
52 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
53 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
54 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
55 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
56 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
79 static inline void idct(int16_t *block)
81 DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
82 int16_t * const temp= (int16_t*)align_tmp;
85 #if 0 //Alternative, simpler variant
/*
 * ROW_IDCT: unconditional row pass (no zero-input shortcut).
 *
 *   src0, src4, src1, src5  memory operands loaded into mm0..mm3; the
 *                           per-line comments name the coefficient pairs
 *   dst                     memory operand; four quadwords are written at
 *                           dst, 8+dst, 16+dst and 24+dst
 *   rounder                 instruction text (mnemonic plus source operand)
 *                           pasted in front of the mm4 and mm0 destinations
 *   shift                   immediate count of the arithmetic right shifts
 *
 * Multiplier constants are read at byte offsets 16..104 from operand %2.
 * Results are packed with signed saturation (packssdw) before being stored.
 * All of mm0..mm7 are overwritten.
 */
87 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
88 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
89 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
90 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
91 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
92 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
93 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
94 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
95 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
96 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
97 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
98 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
99 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
100 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
101 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
102 #rounder ", %%mm4 \n\t"\
103 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
104 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
105 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
106 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
107 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
108 #rounder ", %%mm0 \n\t"\
109 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
110 "paddd %%mm0, %%mm0 \n\t" \
111 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
112 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
113 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
114 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
115 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
116 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
117 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
118 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
119 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
120 "psrad $" #shift ", %%mm7 \n\t"\
121 "psrad $" #shift ", %%mm4 \n\t"\
122 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
123 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
124 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
125 "psrad $" #shift ", %%mm1 \n\t"\
126 "psrad $" #shift ", %%mm2 \n\t"\
127 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
128 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
129 "movq %%mm7, " #dst " \n\t"\
130 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
131 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
132 "movq %%mm2, 24+" #dst " \n\t"\
133 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
134 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
135 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
136 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
137 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
138 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
139 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
140 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
141 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\
142 "psrad $" #shift ", %%mm2 \n\t"\
143 "psrad $" #shift ", %%mm0 \n\t"\
144 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
145 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
146 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
147 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
148 "psrad $" #shift ", %%mm6 \n\t"\
149 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
150 "movq %%mm2, 8+" #dst " \n\t"\
151 "psrad $" #shift ", %%mm4 \n\t"\
152 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
153 "movq %%mm4, 16+" #dst " \n\t"\
/*
 * COL_IDCT: column pass.
 *
 *   src0, src4, src1, src5  memory operands loaded into mm0..mm3
 *   dst                     memory operand; eight 32-bit values are written
 *                           with movd at dst, 16+dst, 32+dst, ... 112+dst
 *   shift                   immediate count of the arithmetic right shifts
 *
 * Unlike the row macros there is no rounder argument. Multiplier constants
 * are read at byte offsets 16..104 from operand %2. Each result register is
 * packed against itself with signed saturation (packssdw) and only its low
 * 32 bits are stored. All of mm0..mm7 are overwritten.
 */
155 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
156 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
157 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
158 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
159 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
160 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
161 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
162 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
163 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
164 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
165 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
166 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
167 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
168 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
169 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
170 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
171 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
172 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
173 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
174 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
175 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
176 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
177 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
178 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
179 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
180 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
181 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
182 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
183 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
184 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
185 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
186 "psrad $" #shift ", %%mm7 \n\t"\
187 "psrad $" #shift ", %%mm4 \n\t"\
188 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
189 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
190 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
191 "psrad $" #shift ", %%mm0 \n\t"\
192 "psrad $" #shift ", %%mm2 \n\t"\
193 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
194 "movd %%mm7, " #dst " \n\t"\
195 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
196 "movd %%mm0, 16+" #dst " \n\t"\
197 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
198 "movd %%mm2, 96+" #dst " \n\t"\
199 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
200 "movd %%mm4, 112+" #dst " \n\t"\
201 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
202 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
203 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
204 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
205 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
206 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
207 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
208 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
209 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
210 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
211 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
212 "psrad $" #shift ", %%mm2 \n\t"\
213 "psrad $" #shift ", %%mm5 \n\t"\
214 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
215 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
216 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
217 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
218 "psrad $" #shift ", %%mm6 \n\t"\
219 "psrad $" #shift ", %%mm4 \n\t"\
220 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
221 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
222 "movd %%mm2, 32+" #dst " \n\t"\
223 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
224 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
225 "movd %%mm6, 48+" #dst " \n\t"\
226 "movd %%mm4, 64+" #dst " \n\t"\
227 "movd %%mm5, 80+" #dst " \n\t"\
/*
 * DC_COND_ROW_IDCT: row pass preceded by a test of the input words.
 *
 * Arguments are the same as for ROW_IDCT (four source operands, dst,
 * rounder instruction text, shift count).
 *
 * Test: mm0 is masked with wm1010, so the low word of each 32-bit half of
 * src0 is ignored, then OR-ed with mm1..mm3. The result is packed, moved to
 * eax and orl sets the flags from it. eax is overwritten.
 * NOTE(review): the conditional jump that consumes these flags, and the
 * labels separating the two paths below, are not visible in this excerpt.
 *
 * First path: the same computation as ROW_IDCT.
 * Second path (the last eight lines): each 32-bit half of mm0 is shifted
 * left by 16 and arithmetically right by 13, packed against itself and the
 * resulting quadword is stored to all four destination quadwords.
 * NOTE(review): the paddd of d40000 in that path begins with a # inside the
 * string, so it is presumably an assembler comment and not executed; confirm.
 */
230 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
231 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
232 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
233 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
234 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
235 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
236 "pand %%mm0, %%mm4 \n\t"\
237 "por %%mm1, %%mm4 \n\t"\
238 "por %%mm2, %%mm4 \n\t"\
239 "por %%mm3, %%mm4 \n\t"\
240 "packssdw %%mm4,%%mm4 \n\t"\
241 "movd %%mm4, %%eax \n\t"\
242 "orl %%eax, %%eax \n\t"\
244 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
245 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
246 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
247 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
248 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
249 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
250 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
251 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
252 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
253 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
254 #rounder ", %%mm4 \n\t"\
255 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
256 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
257 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
258 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
259 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
260 #rounder ", %%mm0 \n\t"\
261 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
262 "paddd %%mm0, %%mm0 \n\t" \
263 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
264 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
265 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
266 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
267 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
268 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
269 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
270 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
271 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
272 "psrad $" #shift ", %%mm7 \n\t"\
273 "psrad $" #shift ", %%mm4 \n\t"\
274 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
275 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
276 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
277 "psrad $" #shift ", %%mm1 \n\t"\
278 "psrad $" #shift ", %%mm2 \n\t"\
279 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
280 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
281 "movq %%mm7, " #dst " \n\t"\
282 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
283 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
284 "movq %%mm2, 24+" #dst " \n\t"\
285 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
286 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
287 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
288 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
289 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
290 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
291 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
292 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
293 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\
294 "psrad $" #shift ", %%mm2 \n\t"\
295 "psrad $" #shift ", %%mm0 \n\t"\
296 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
297 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
298 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
299 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
300 "psrad $" #shift ", %%mm6 \n\t"\
301 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
302 "movq %%mm2, 8+" #dst " \n\t"\
303 "psrad $" #shift ", %%mm4 \n\t"\
304 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
305 "movq %%mm4, 16+" #dst " \n\t"\
308 "pslld $16, %%mm0 \n\t"\
309 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
310 "psrad $13, %%mm0 \n\t"\
311 "packssdw %%mm0, %%mm0 \n\t"\
312 "movq %%mm0, " #dst " \n\t"\
313 "movq %%mm0, 8+" #dst " \n\t"\
314 "movq %%mm0, 16+" #dst " \n\t"\
315 "movq %%mm0, 24+" #dst " \n\t"\
319 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
320 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
321 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
322 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
323 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
325 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
326 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
327 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
330 //IDCT( src0, src4, src1, src5, dst, shift)
331 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
332 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
333 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
334 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * DC_COND_IDCT: row pass preceded by a test of the input words.
 *
 *   src0, src4, src1, src5  memory operands loaded into mm0..mm3
 *   dst                     memory operand; four quadwords are written at
 *                           dst, 8+dst, 16+dst and 24+dst
 *   rounder                 instruction text pasted in front of the mm4 and
 *                           mm0 destinations
 *   shift                   immediate count of the arithmetic right shifts
 *
 * Test: mm0 is masked with wm1010, so the low word of each 32-bit half of
 * src0 is ignored, then OR-ed with mm1..mm3. The result is packed, moved to
 * eax and orl sets the flags from it. eax is overwritten.
 * NOTE(review): the conditional jump that consumes these flags, and the
 * labels separating the two paths below, are not visible in this excerpt.
 *
 * First path: the full row computation using the constants at byte offsets
 * 16..104 from operand %2.
 * Second path (the last eight lines): each 32-bit half of mm0 is shifted
 * left by 16, d40000 is added, the halves are shifted arithmetically right
 * by 13 and packed, and the quadword is stored to all four destination
 * quadwords.
 */
338 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
339 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
340 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
341 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
342 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
343 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
344 "pand %%mm0, %%mm4 \n\t"\
345 "por %%mm1, %%mm4 \n\t"\
346 "por %%mm2, %%mm4 \n\t"\
347 "por %%mm3, %%mm4 \n\t"\
348 "packssdw %%mm4,%%mm4 \n\t"\
349 "movd %%mm4, %%eax \n\t"\
350 "orl %%eax, %%eax \n\t"\
352 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
353 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
354 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
355 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
356 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
357 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
358 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
359 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
360 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
361 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
362 #rounder ", %%mm4 \n\t"\
363 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
364 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
365 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
366 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
367 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
368 #rounder ", %%mm0 \n\t"\
369 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
370 "paddd %%mm0, %%mm0 \n\t" \
371 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
372 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
373 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
374 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
375 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
376 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
377 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
378 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
379 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
380 "psrad $" #shift ", %%mm7 \n\t"\
381 "psrad $" #shift ", %%mm4 \n\t"\
382 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
383 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
384 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
385 "psrad $" #shift ", %%mm1 \n\t"\
386 "psrad $" #shift ", %%mm2 \n\t"\
387 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
388 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
389 "movq %%mm7, " #dst " \n\t"\
390 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
391 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
392 "movq %%mm2, 24+" #dst " \n\t"\
393 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
394 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
395 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
396 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
397 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
398 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
399 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
400 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
401 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\
402 "psrad $" #shift ", %%mm2 \n\t"\
403 "psrad $" #shift ", %%mm0 \n\t"\
404 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
405 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
406 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
407 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
408 "psrad $" #shift ", %%mm6 \n\t"\
409 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
410 "movq %%mm2, 8+" #dst " \n\t"\
411 "psrad $" #shift ", %%mm4 \n\t"\
412 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
413 "movq %%mm4, 16+" #dst " \n\t"\
416 "pslld $16, %%mm0 \n\t"\
417 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
418 "psrad $13, %%mm0 \n\t"\
419 "packssdw %%mm0, %%mm0 \n\t"\
420 "movq %%mm0, " #dst " \n\t"\
421 "movq %%mm0, 8+" #dst " \n\t"\
422 "movq %%mm0, 16+" #dst " \n\t"\
423 "movq %%mm0, 24+" #dst " \n\t"\
/*
 * Z_COND_IDCT: row pass preceded by a test of all input words.
 *
 *   src0, src4, src1, src5  memory operands loaded into mm0..mm3
 *   dst                     memory operand; four quadwords are written at
 *                           dst, 8+dst, 16+dst and 24+dst
 *   rounder                 instruction text pasted in front of the mm4 and
 *                           mm0 destinations
 *   shift                   immediate count of the arithmetic right shifts
 *   bt                      not referenced by any line visible here;
 *                           presumably the label jumped to when the test
 *                           finds only zeros - TODO confirm
 *
 * Test: mm0..mm3 are OR-ed together without any mask, packed, moved to eax
 * and orl sets the flags from it. eax is overwritten.
 * After the test the full row computation follows, using the constants at
 * byte offsets 16..104 from operand %2.
 */
426 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
427 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
428 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
429 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
430 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
431 "movq %%mm0, %%mm4 \n\t"\
432 "por %%mm1, %%mm4 \n\t"\
433 "por %%mm2, %%mm4 \n\t"\
434 "por %%mm3, %%mm4 \n\t"\
435 "packssdw %%mm4,%%mm4 \n\t"\
436 "movd %%mm4, %%eax \n\t"\
437 "orl %%eax, %%eax \n\t"\
439 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
440 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
441 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
442 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
443 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
444 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
445 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
446 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
447 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
448 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
449 #rounder ", %%mm4 \n\t"\
450 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
451 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
452 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
453 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
454 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
455 #rounder ", %%mm0 \n\t"\
456 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
457 "paddd %%mm0, %%mm0 \n\t" \
458 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
459 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
460 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
461 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
462 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
463 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
464 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
465 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
466 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
467 "psrad $" #shift ", %%mm7 \n\t"\
468 "psrad $" #shift ", %%mm4 \n\t"\
469 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
470 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
471 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
472 "psrad $" #shift ", %%mm1 \n\t"\
473 "psrad $" #shift ", %%mm2 \n\t"\
474 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
475 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
476 "movq %%mm7, " #dst " \n\t"\
477 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
478 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
479 "movq %%mm2, 24+" #dst " \n\t"\
480 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
481 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
482 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
483 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
484 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
485 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
486 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
487 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
488 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\
489 "psrad $" #shift ", %%mm2 \n\t"\
490 "psrad $" #shift ", %%mm0 \n\t"\
491 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
492 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
493 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
494 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
495 "psrad $" #shift ", %%mm6 \n\t"\
496 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
497 "movq %%mm2, 8+" #dst " \n\t"\
498 "psrad $" #shift ", %%mm4 \n\t"\
499 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
500 "movq %%mm4, 16+" #dst " \n\t"\
/*
 * ROW_IDCT: unconditional row pass (no test of the input words).
 *
 *   src0, src4, src1, src5  memory operands loaded into mm0..mm3
 *   dst                     memory operand; four quadwords are written at
 *                           dst, 8+dst, 16+dst and 24+dst
 *   rounder                 instruction text (mnemonic plus source operand)
 *                           pasted in front of the mm4 and mm0 destinations
 *   shift                   immediate count of the arithmetic right shifts
 *
 * Multiplier constants are read at byte offsets 16..104 from operand %2.
 * Results are packed with signed saturation (packssdw) before being stored.
 * All of mm0..mm7 are overwritten.
 */
502 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
503 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
504 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
505 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
506 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
507 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
508 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
509 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
510 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
511 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
512 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
513 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
514 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
515 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
516 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
517 #rounder ", %%mm4 \n\t"\
518 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
519 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
520 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
521 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
522 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
523 #rounder ", %%mm0 \n\t"\
524 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
525 "paddd %%mm0, %%mm0 \n\t" \
526 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
527 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
528 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
529 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
530 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
531 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
532 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
533 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
534 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
535 "psrad $" #shift ", %%mm7 \n\t"\
536 "psrad $" #shift ", %%mm4 \n\t"\
537 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
538 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
539 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
540 "psrad $" #shift ", %%mm1 \n\t"\
541 "psrad $" #shift ", %%mm2 \n\t"\
542 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
543 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
544 "movq %%mm7, " #dst " \n\t"\
545 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
546 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
547 "movq %%mm2, 24+" #dst " \n\t"\
548 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
549 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
550 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
551 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
552 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
553 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
554 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
555 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
556 "psubd %%mm4, %%mm0 \n\t" /* A2-B2 a2-b2 */\
557 "psrad $" #shift ", %%mm2 \n\t"\
558 "psrad $" #shift ", %%mm0 \n\t"\
559 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
560 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
561 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
562 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
563 "psrad $" #shift ", %%mm6 \n\t"\
564 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
565 "movq %%mm2, 8+" #dst " \n\t"\
566 "psrad $" #shift ", %%mm4 \n\t"\
567 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
568 "movq %%mm4, 16+" #dst " \n\t"\
570 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
571 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
572 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
573 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
574 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
/*
 * IDCT: column pass using all four source operands.
 *
 *   src0, src4, src1, src5  memory operands loaded into mm0..mm3
 *   dst                     memory operand; eight 32-bit values are written
 *                           with movd at dst, 16+dst, 32+dst, ... 112+dst
 *   shift                   immediate count of the arithmetic right shifts
 *
 * No rounder is applied in this pass. Multiplier constants are read at byte
 * offsets 16..104 from operand %2. Each result register is packed against
 * itself with signed saturation (packssdw) and only its low 32 bits are
 * stored. All of mm0..mm7 are overwritten.
 */
577 #define IDCT(src0, src4, src1, src5, dst, shift) \
578 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
579 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
580 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
581 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
582 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
583 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
584 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
585 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
586 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
587 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
588 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
589 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
590 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
591 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
592 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
593 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
594 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
595 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
596 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
597 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
598 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
599 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
600 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
601 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
602 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
603 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
604 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
605 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
606 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
607 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
608 "psrad $" #shift ", %%mm7 \n\t"\
609 "psrad $" #shift ", %%mm4 \n\t"\
610 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
611 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
612 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
613 "psrad $" #shift ", %%mm0 \n\t"\
614 "psrad $" #shift ", %%mm2 \n\t"\
615 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
616 "movd %%mm7, " #dst " \n\t"\
617 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
618 "movd %%mm0, 16+" #dst " \n\t"\
619 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
620 "movd %%mm2, 96+" #dst " \n\t"\
621 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
622 "movd %%mm4, 112+" #dst " \n\t"\
623 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
624 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
625 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
626 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
627 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
628 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
629 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
630 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
631 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
632 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
633 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
634 "psrad $" #shift ", %%mm2 \n\t"\
635 "psrad $" #shift ", %%mm5 \n\t"\
636 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
637 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
638 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
639 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
640 "psrad $" #shift ", %%mm6 \n\t"\
641 "psrad $" #shift ", %%mm4 \n\t"\
642 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
643 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
644 "movd %%mm2, 32+" #dst " \n\t"\
645 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
646 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
647 "movd %%mm6, 48+" #dst " \n\t"\
648 "movd %%mm4, 64+" #dst " \n\t"\
649 "movd %%mm5, 80+" #dst " \n\t"
652 //IDCT( src0, src4, src1, src5, dst, shift)
653 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
654 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
655 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
656 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
661 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
662 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
/*
 * IDCT: reduced column pass that never loads src1.
 *
 *   src0, src4, src5  memory operands loaded into mm0, mm1 and mm3
 *   src1              accepted for signature compatibility but unused here
 *   dst               memory operand; eight 32-bit values are written with
 *                     movd at dst, 16+dst, 32+dst, ... 112+dst
 *   shift             immediate count of the arithmetic right shifts
 *
 * The odd-part terms B0..B3 are built from src5 alone (constants at byte
 * offsets 56, 72, 88 and 104 from operand %2); the products that the full
 * variant derives from src1 are left out.
 * NOTE(review): presumably selected when the rows behind src1 are known to
 * be zero; the selecting branch is not visible in this excerpt - confirm.
 */
665 #define IDCT(src0, src4, src1, src5, dst, shift) \
666 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
667 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
668 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
669 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
670 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
671 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
672 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
673 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
674 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
675 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
676 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
677 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
678 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
679 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
680 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
681 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
682 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
683 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
684 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
685 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
686 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
687 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
688 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
689 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
690 "psrad $" #shift ", %%mm1 \n\t"\
691 "psrad $" #shift ", %%mm4 \n\t"\
692 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
693 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
694 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
695 "psrad $" #shift ", %%mm0 \n\t"\
696 "psrad $" #shift ", %%mm2 \n\t"\
697 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
698 "movd %%mm1, " #dst " \n\t"\
699 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
700 "movd %%mm0, 16+" #dst " \n\t"\
701 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
702 "movd %%mm2, 96+" #dst " \n\t"\
703 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
704 "movd %%mm4, 112+" #dst " \n\t"\
705 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
706 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
707 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
708 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
709 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
710 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
711 "psrad $" #shift ", %%mm2 \n\t"\
712 "psrad $" #shift ", %%mm5 \n\t"\
713 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
714 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
715 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
716 "psrad $" #shift ", %%mm6 \n\t"\
717 "psrad $" #shift ", %%mm1 \n\t"\
718 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
719 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
720 "movd %%mm2, 32+" #dst " \n\t"\
721 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
722 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
723 "movd %%mm6, 48+" #dst " \n\t"\
724 "movd %%mm1, 64+" #dst " \n\t"\
725 "movd %%mm5, 80+" #dst " \n\t"
727 //IDCT( src0, src4, src1, src5, dst, shift)
728 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
729 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
730 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
731 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
736 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
/*
 * IDCT: reduced column pass that loads only src0 and src5.
 *
 *   src0, src5  memory operands loaded into mm0 and mm3
 *   src4, src1  accepted for signature compatibility but unused here
 *   dst         memory operand; eight 32-bit values are written with movd
 *               at dst, 16+dst, 32+dst, ... 112+dst
 *   shift       immediate count of the arithmetic right shifts
 *
 * Without the src4 products the even part collapses: mm4 is copied to mm6
 * and mm0 to mm5, so A0 equals A3 and A1 equals A2. The odd-part terms
 * B0..B3 come from src5 alone (constants at byte offsets 56, 72, 88 and 104
 * from operand %2).
 * NOTE(review): presumably selected when the rows behind src4 and src1 are
 * known to be zero; the selecting branch is not visible here - confirm.
 */
739 #define IDCT(src0, src4, src1, src5, dst, shift) \
740 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
741 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
742 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
743 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
744 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
745 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
746 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
747 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
748 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
749 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
750 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
751 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
752 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
753 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
754 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
755 "psrad $" #shift ", %%mm1 \n\t"\
756 "psrad $" #shift ", %%mm4 \n\t"\
757 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
758 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
759 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
760 "psrad $" #shift ", %%mm0 \n\t"\
761 "psrad $" #shift ", %%mm2 \n\t"\
762 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
763 "movd %%mm1, " #dst " \n\t"\
764 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
765 "movd %%mm0, 16+" #dst " \n\t"\
766 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
767 "movd %%mm2, 96+" #dst " \n\t"\
768 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
769 "movd %%mm4, 112+" #dst " \n\t"\
770 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
771 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
772 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
773 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
774 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
775 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
776 "psrad $" #shift ", %%mm2 \n\t"\
777 "psrad $" #shift ", %%mm5 \n\t"\
778 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
779 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
780 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
781 "psrad $" #shift ", %%mm6 \n\t"\
782 "psrad $" #shift ", %%mm1 \n\t"\
783 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
784 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
785 "movd %%mm2, 32+" #dst " \n\t"\
786 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
787 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
788 "movd %%mm6, 48+" #dst " \n\t"\
789 "movd %%mm1, 64+" #dst " \n\t"\
790 "movd %%mm5, 80+" #dst " \n\t"
793 //IDCT( src0, src4, src1, src5, dst, shift)
794 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
795 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
796 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
797 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
802 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
/*
 * IDCT variant that reads src0, src1 and src5; src4 is accepted for
 * call-site uniformity but never referenced in the expansion.
 *
 * Per the register comments, src0 holds R0/R4, src1 holds R1/R3 and src5
 * holds R5/R7 for two adjacent columns (upper-/lower-case letters).  With no
 * R2/R6 term the even part is A0 = A3 = C4*R4 + C4*R0 and
 * A1 = A2 = -C4*R4 + C4*R0.  The odd part sums both odd inputs:
 *   B0 =  C3R3+C1R1 + C7R7+C5R5     B1 = -C7R3+C3R1 - C5R7-C1R5
 *   B2 = -C1R3+C5R1 + C3R7+C7R5     B3 = -C5R3+C7R1 - C1R7+C3R5
 * %2 addresses the coefficient table.
 *
 * Every result is arithmetically shifted right by `shift`, saturated to
 * 16 bits with packssdw and written as one 32-bit pair with movd:
 *   dst+0: A0+B0   dst+16: A1+B1   dst+32: A2+B2   dst+48:  A3+B3
 *   dst+64: A3-B3  dst+80: A2-B2   dst+96: A1-B1   dst+112: A0-B0
 *
 * src1 is loaded a second time halfway through because mm2, which first
 * held it, has been reused for A1-B1 by then.
 */
805 #define IDCT(src0, src4, src1, src5, dst, shift) \
806 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
807 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
808 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
809 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
810 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
811 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
812 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
813 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
814 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
815 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
816 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
817 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
818 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
819 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
820 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
821 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
822 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
823 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
824 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
825 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
826 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
827 "psrad $" #shift ", %%mm7 \n\t"\
828 "psrad $" #shift ", %%mm4 \n\t"\
829 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
830 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
831 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
832 "psrad $" #shift ", %%mm0 \n\t"\
833 "psrad $" #shift ", %%mm2 \n\t"\
834 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
835 "movd %%mm7, " #dst " \n\t"\
836 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
837 "movd %%mm0, 16+" #dst " \n\t"\
838 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
839 "movd %%mm2, 96+" #dst " \n\t"\
840 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
841 "movd %%mm4, 112+" #dst " \n\t"\
842 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
843 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
844 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
845 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
846 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
847 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
848 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
849 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
850 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
851 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
852 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
853 "psrad $" #shift ", %%mm2 \n\t"\
854 "psrad $" #shift ", %%mm5 \n\t"\
855 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
856 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
857 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
858 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
859 "psrad $" #shift ", %%mm6 \n\t"\
860 "psrad $" #shift ", %%mm4 \n\t"\
861 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
862 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
863 "movd %%mm2, 32+" #dst " \n\t"\
864 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
865 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
866 "movd %%mm6, 48+" #dst " \n\t"\
867 "movd %%mm4, 64+" #dst " \n\t"\
868 "movd %%mm5, 80+" #dst " \n\t"
870 //IDCT( src0, src4, src1, src5, dst, shift)
871 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
872 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
873 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
874 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT variant that reads only src0 and src1; src4 and src5 are accepted
 * for call-site uniformity but never referenced in the expansion.
 *
 * Per the register comments, src0 holds R0/R4 and src1 holds R1/R3 for two
 * adjacent columns (upper-/lower-case letters).  With no R2/R6 term the even
 * part is A0 = A3 = C4*R4 + C4*R0 and A1 = A2 = -C4*R4 + C4*R0.  The odd
 * part uses R1/R3 only:
 *   B0 =  C3R3+C1R1   B1 = -C7R3+C3R1   B2 = -C1R3+C5R1   B3 = -C5R3+C7R1
 * %2 addresses the coefficient table; the uncommented load of 64(%2) fetches
 * the multiplier pair that yields -C7R3+C3R1.
 *
 * Every result is arithmetically shifted right by `shift`, saturated to
 * 16 bits with packssdw and written as one 32-bit pair with movd:
 *   dst+0: A0+B0   dst+16: A1+B1   dst+32: A2+B2   dst+48:  A3+B3
 *   dst+64: A3-B3  dst+80: A2-B2   dst+96: A1-B1   dst+112: A0-B0
 */
880 #define IDCT(src0, src4, src1, src5, dst, shift) \
881 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
882 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
883 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
884 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
885 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
886 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
887 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
888 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
889 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
890 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
891 "movq 64(%2), %%mm3 \n\t"\
892 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
893 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
894 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
895 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
896 "psrad $" #shift ", %%mm7 \n\t"\
897 "psrad $" #shift ", %%mm4 \n\t"\
898 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
899 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
900 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
901 "psrad $" #shift ", %%mm0 \n\t"\
902 "psrad $" #shift ", %%mm1 \n\t"\
903 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
904 "movd %%mm7, " #dst " \n\t"\
905 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
906 "movd %%mm0, 16+" #dst " \n\t"\
907 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
908 "movd %%mm1, 96+" #dst " \n\t"\
909 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
910 "movd %%mm4, 112+" #dst " \n\t"\
911 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
912 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
913 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
914 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
915 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
916 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
917 "psrad $" #shift ", %%mm1 \n\t"\
918 "psrad $" #shift ", %%mm5 \n\t"\
919 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
920 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
921 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
922 "psrad $" #shift ", %%mm6 \n\t"\
923 "psrad $" #shift ", %%mm4 \n\t"\
924 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
925 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
926 "movd %%mm1, 32+" #dst " \n\t"\
927 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
928 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
929 "movd %%mm6, 48+" #dst " \n\t"\
930 "movd %%mm4, 64+" #dst " \n\t"\
931 "movd %%mm5, 80+" #dst " \n\t"
934 //IDCT( src0, src4, src1, src5, dst, shift)
935 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
936 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
937 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
938 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT variant that reads only the even inputs: src0 and src4, plus the
 * quadwords 8 bytes after each ("8+" #src0 / "8+" #src4).  src1 and src5
 * are accepted for call-site uniformity but never referenced.
 *
 * No odd part (B0..B3) is computed, so each output is just one of the even
 * terms, per the register comments:
 *   A0 = C4R4+C4R0 + (C6R6+C2R2)     A3 = C4R4+C4R0 - (C6R6+C2R2)
 *   A1 = -C4R4+C4R0 + (-C2R6+C6R2)   A2 = -C4R4+C4R0 - (-C2R6+C6R2)
 * %2 addresses the coefficient table.
 *
 * One expansion handles two input quadword pairs; after the right shift by
 * `shift`, packssdw merges both halves into four saturated 16-bit values
 * that are stored with a single movq:
 *   dst+0, dst+112: A0     dst+16, dst+96: A1
 *   dst+32, dst+80: A2     dst+48, dst+64: A3
 *
 * Because the expansion pastes "8+" in front of src0/src4, those arguments
 * must begin with a numeric displacement (hence 0(%1) rather than (%1) at
 * the call sites), and only every second column group needs an invocation.
 */
944 #define IDCT(src0, src4, src1, src5, dst, shift) \
945 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
946 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
947 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
948 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
949 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
950 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
951 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
952 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
953 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
954 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
955 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
956 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
957 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
958 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
959 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
960 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
961 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
962 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
963 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
964 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
965 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
966 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
967 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
968 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
969 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
970 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
971 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
972 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
973 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
974 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
975 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
976 "psrad $" #shift ", %%mm4 \n\t"\
977 "psrad $" #shift ", %%mm7 \n\t"\
978 "psrad $" #shift ", %%mm3 \n\t"\
979 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
980 "movq %%mm4, " #dst " \n\t"\
981 "psrad $" #shift ", %%mm0 \n\t"\
982 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
983 "movq %%mm0, 16+" #dst " \n\t"\
984 "movq %%mm0, 96+" #dst " \n\t"\
985 "movq %%mm4, 112+" #dst " \n\t"\
986 "psrad $" #shift ", %%mm5 \n\t"\
987 "psrad $" #shift ", %%mm6 \n\t"\
988 "psrad $" #shift ", %%mm2 \n\t"\
989 "packssdw %%mm2, %%mm5 \n\t" /* A2 a2 */\
990 "movq %%mm5, 32+" #dst " \n\t"\
991 "psrad $" #shift ", %%mm1 \n\t"\
992 "packssdw %%mm1, %%mm6 \n\t" /* A3 a3 */\
993 "movq %%mm6, 48+" #dst " \n\t"\
994 "movq %%mm6, 64+" #dst " \n\t"\
995 "movq %%mm5, 80+" #dst " \n\t"
998 //IDCT( src0, src4, src1, src5, dst, shift)
999 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1000 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1001 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1002 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1006 "# .p2align 4 \n\t"\
/*
 * IDCT variant that reads src0, src4 and src1; src5 is accepted for
 * call-site uniformity but never referenced in the expansion.
 *
 * Per the register comments, src0 holds R0/R4, src4 holds R2/R6 and src1
 * holds R1/R3 for two adjacent columns (upper-/lower-case letters).
 * Even part:
 *   A0 = C4R4+C4R0 + (C6R6+C2R2)     A3 = C4R4+C4R0 - (C6R6+C2R2)
 *   A1 = -C4R4+C4R0 + (-C2R6+C6R2)   A2 = -C4R4+C4R0 - (-C2R6+C6R2)
 * Odd part, from R1/R3 only:
 *   B0 =  C3R3+C1R1   B1 = -C7R3+C3R1   B2 = -C1R3+C5R1   B3 = -C5R3+C7R1
 * %2 addresses the coefficient table; the uncommented load of 64(%2) fetches
 * the multiplier pair that yields -C7R3+C3R1.
 *
 * Every result is arithmetically shifted right by `shift`, saturated to
 * 16 bits with packssdw and written as one 32-bit pair with movd:
 *   dst+0: A0+B0   dst+16: A1+B1   dst+32: A2+B2   dst+48:  A3+B3
 *   dst+64: A3-B3  dst+80: A2-B2   dst+96: A1-B1   dst+112: A0-B0
 */
1009 #define IDCT(src0, src4, src1, src5, dst, shift) \
1010 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1011 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1012 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1013 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1014 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1015 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1016 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1017 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1018 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1019 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1020 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1021 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1022 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1023 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1024 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1025 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1026 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1027 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1028 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1029 "movq 64(%2), %%mm1 \n\t"\
1030 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1031 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1032 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1033 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1034 "psrad $" #shift ", %%mm7 \n\t"\
1035 "psrad $" #shift ", %%mm4 \n\t"\
1036 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1037 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1038 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1039 "psrad $" #shift ", %%mm0 \n\t"\
1040 "psrad $" #shift ", %%mm3 \n\t"\
1041 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1042 "movd %%mm7, " #dst " \n\t"\
1043 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1044 "movd %%mm0, 16+" #dst " \n\t"\
1045 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1046 "movd %%mm3, 96+" #dst " \n\t"\
1047 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1048 "movd %%mm4, 112+" #dst " \n\t"\
1049 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1050 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1051 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1052 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1053 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1054 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1055 "psrad $" #shift ", %%mm3 \n\t"\
1056 "psrad $" #shift ", %%mm5 \n\t"\
1057 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1058 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1059 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1060 "psrad $" #shift ", %%mm6 \n\t"\
1061 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1062 "movd %%mm3, 32+" #dst " \n\t"\
1063 "psrad $" #shift ", %%mm4 \n\t"\
1064 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1065 "movd %%mm6, 48+" #dst " \n\t"\
1066 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1067 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1068 "movd %%mm4, 64+" #dst " \n\t"\
1069 "movd %%mm5, 80+" #dst " \n\t"
1072 //IDCT( src0, src4, src1, src5, dst, shift)
1073 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1074 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1075 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1076 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * Smallest IDCT variant: reads only src0 and the quadword 8 bytes after it
 * ("8+" #src0).  src4, src1 and src5 are accepted for call-site uniformity
 * but never referenced in the expansion.
 *
 * Only two distinct values are produced per column, per the register
 * comments: A0 = C4R4+C4R0 and A1 = -C4R4+C4R0.  After the right shift by
 * `shift`, packssdw merges the results of both quadwords into four
 * saturated 16-bit values that are stored with movq:
 *   A0 -> dst+0, dst+48, dst+64, dst+112
 *   A1 -> dst+16, dst+32, dst+80, dst+96
 * %2 addresses the coefficient table.
 *
 * Because the expansion pastes "8+" in front of src0, that argument must
 * begin with a numeric displacement (hence 0(%1) rather than (%1) at the
 * call sites).
 *
 * NOTE(review): mm7 is reloaded from 32(%2) ("C6 C2 C6 C2") but is not read
 * again inside this macro; the load looks like a leftover -- confirm nothing
 * after the expansion depends on it before removing.
 */
1083 #define IDCT(src0, src4, src1, src5, dst, shift) \
1084 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1085 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1086 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1087 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1088 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1089 "psrad $" #shift ", %%mm4 \n\t"\
1090 "psrad $" #shift ", %%mm0 \n\t"\
1091 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1092 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1093 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1094 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1095 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1096 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1097 "psrad $" #shift ", %%mm1 \n\t"\
1098 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1099 "movq %%mm4, " #dst " \n\t"\
1100 "psrad $" #shift ", %%mm2 \n\t"\
1101 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1102 "movq %%mm0, 16+" #dst " \n\t"\
1103 "movq %%mm0, 96+" #dst " \n\t"\
1104 "movq %%mm4, 112+" #dst " \n\t"\
1105 "movq %%mm0, 32+" #dst " \n\t"\
1106 "movq %%mm4, 48+" #dst " \n\t"\
1107 "movq %%mm4, 64+" #dst " \n\t"\
1108 "movq %%mm0, 80+" #dst " \n\t"
1110 //IDCT( src0, src4, src1, src5, dst, shift)
1111 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1112 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1113 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1114 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1121 00 40 04 44 20 60 24 64
1122 10 30 14 34 50 70 54 74
1123 01 41 03 43 21 61 23 63
1124 11 31 13 33 51 71 53 73
1125 02 42 06 46 22 62 26 66
1126 12 32 16 36 52 72 56 76
1127 05 45 07 47 25 65 27 67
1128 15 35 17 37 55 75 57 77
1131 00 04 10 14 20 24 30 34
1132 40 44 50 54 60 64 70 74
1133 01 03 11 13 21 23 31 33
1134 41 43 51 53 61 63 71 73
1135 02 06 12 16 22 26 32 36
1136 42 46 52 56 62 66 72 76
1137 05 07 15 17 25 27 35 37
1138 45 47 55 57 65 67 75 77
1142 :: "r" (block), "r" (temp), "r" (coeffs)
1147 void ff_simple_idct_mmx(int16_t *block)
1152 //FIXME merge add/put into the idct
1154 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1157 ff_put_pixels_clamped_mmx(block, dest, line_size);
1159 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1162 ff_add_pixels_clamped_mmx(block, dest, line_size);