4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "libavutil/mem.h"
24 #include "libavutil/x86/asm.h"
26 #include "libavcodec/idctdsp.h"
29 #include "simple_idct.h"
/*
 * Fixed-point cosine multipliers: Ci = cos(i*M_PI/16)*sqrt(2), scaled by
 * 1<<14 and rounded to nearest. C4 is the one exception: the exact value
 * would be 1<<14 (16384) but it is stored as 16383 (note the "- 0.5").
 */
43 #define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44 #define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45 #define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46 #define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47 #define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
48 #define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
49 #define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
50 #define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/*
 * The IDCT(...) invocations in this file pass the literal 20 as their shift
 * argument, which matches this value.
 * NOTE(review): presumably the right shift of the column pass - confirm.
 */
53 #define COL_SHIFT 20 // 6
/*
 * Mask that keeps the upper 16-bit word of each 32-bit lane. DC_COND_IDCT
 * ANDs it with the src0 quadword before OR-ing in the other inputs, so the
 * low words of src0 (r0/R0 in the lane comments) do not take part in the
 * all-zero test that follows.
 */
55 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
/*
 * 1<<18 in the low 32-bit lane (upper lane zero). DC_COND_IDCT adds it
 * between its "pslld $16" and "psrad $13".
 * NOTE(review): presumably a rounding/bias term for that shortcut - confirm.
 */
56 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
58 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
59 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
60 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
61 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
62 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
63 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
86 static inline void idct(int16_t *block)
88 LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
89 int16_t * const temp= (int16_t*)align_tmp;
92 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
93 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
94 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
95 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
96 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
97 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
98 "pand %%mm0, %%mm4 \n\t"\
99 "por %%mm1, %%mm4 \n\t"\
100 "por %%mm2, %%mm4 \n\t"\
101 "por %%mm3, %%mm4 \n\t"\
102 "packssdw %%mm4,%%mm4 \n\t"\
103 "movd %%mm4, %%eax \n\t"\
104 "orl %%eax, %%eax \n\t"\
106 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
107 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
108 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
109 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
110 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
111 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
112 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
113 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
114 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
115 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
116 #rounder ", %%mm4 \n\t"\
117 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
118 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
119 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
120 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
121 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
122 #rounder ", %%mm0 \n\t"\
123 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
124 "paddd %%mm0, %%mm0 \n\t" \
125 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
126 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
127 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
128 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
129 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
130 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
131 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
132 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
133 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
134 "psrad $" #shift ", %%mm7 \n\t"\
135 "psrad $" #shift ", %%mm4 \n\t"\
136 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
137 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
138 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
139 "psrad $" #shift ", %%mm1 \n\t"\
140 "psrad $" #shift ", %%mm2 \n\t"\
141 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
142 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
143 "movq %%mm7, " #dst " \n\t"\
144 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
145 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
146 "movq %%mm2, 24+" #dst " \n\t"\
147 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
148 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
149 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
150 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
151 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
152 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
153 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
154 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
155 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
156 "psrad $" #shift ", %%mm2 \n\t"\
157 "psrad $" #shift ", %%mm0 \n\t"\
158 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
159 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
160 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
161 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
162 "psrad $" #shift ", %%mm6 \n\t"\
163 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
164 "movq %%mm2, 8+" #dst " \n\t"\
165 "psrad $" #shift ", %%mm4 \n\t"\
166 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
167 "movq %%mm4, 16+" #dst " \n\t"\
170 "pslld $16, %%mm0 \n\t"\
171 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
172 "psrad $13, %%mm0 \n\t"\
173 "packssdw %%mm0, %%mm0 \n\t"\
174 "movq %%mm0, " #dst " \n\t"\
175 "movq %%mm0, 8+" #dst " \n\t"\
176 "movq %%mm0, 16+" #dst " \n\t"\
177 "movq %%mm0, 24+" #dst " \n\t"\
180 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
181 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
182 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
183 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
184 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
185 "movq %%mm0, %%mm4 \n\t"\
186 "por %%mm1, %%mm4 \n\t"\
187 "por %%mm2, %%mm4 \n\t"\
188 "por %%mm3, %%mm4 \n\t"\
189 "packssdw %%mm4,%%mm4 \n\t"\
190 "movd %%mm4, %%eax \n\t"\
191 "orl %%eax, %%eax \n\t"\
193 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
194 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
195 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
196 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
197 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
198 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
199 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
200 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
201 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
202 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
203 #rounder ", %%mm4 \n\t"\
204 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
205 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
206 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
207 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
208 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
209 #rounder ", %%mm0 \n\t"\
210 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
211 "paddd %%mm0, %%mm0 \n\t" \
212 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
213 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
214 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
215 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
216 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
217 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
218 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
219 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
220 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
221 "psrad $" #shift ", %%mm7 \n\t"\
222 "psrad $" #shift ", %%mm4 \n\t"\
223 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
224 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
225 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
226 "psrad $" #shift ", %%mm1 \n\t"\
227 "psrad $" #shift ", %%mm2 \n\t"\
228 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
229 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
230 "movq %%mm7, " #dst " \n\t"\
231 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
232 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
233 "movq %%mm2, 24+" #dst " \n\t"\
234 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
235 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
236 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
237 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
238 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
239 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
240 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
241 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
242 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
243 "psrad $" #shift ", %%mm2 \n\t"\
244 "psrad $" #shift ", %%mm0 \n\t"\
245 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
246 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
247 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
248 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
249 "psrad $" #shift ", %%mm6 \n\t"\
250 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
251 "movq %%mm2, 8+" #dst " \n\t"\
252 "psrad $" #shift ", %%mm4 \n\t"\
253 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
254 "movq %%mm4, 16+" #dst " \n\t"\
256 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
257 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
258 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
259 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
260 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
261 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
262 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
263 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
264 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
265 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
266 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
267 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
268 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
269 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
270 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
271 #rounder ", %%mm4 \n\t"\
272 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
273 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
274 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
275 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
276 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
277 #rounder ", %%mm0 \n\t"\
278 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
279 "paddd %%mm0, %%mm0 \n\t" \
280 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
281 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
282 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
283 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
284 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
285 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
286 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
287 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
288 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
289 "psrad $" #shift ", %%mm7 \n\t"\
290 "psrad $" #shift ", %%mm4 \n\t"\
291 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
292 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
293 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
294 "psrad $" #shift ", %%mm1 \n\t"\
295 "psrad $" #shift ", %%mm2 \n\t"\
296 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
297 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
298 "movq %%mm7, " #dst " \n\t"\
299 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
300 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
301 "movq %%mm2, 24+" #dst " \n\t"\
302 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
303 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
304 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
305 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
306 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
307 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
308 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
309 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
310 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
311 "psrad $" #shift ", %%mm2 \n\t"\
312 "psrad $" #shift ", %%mm0 \n\t"\
313 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
314 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
315 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
316 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
317 "psrad $" #shift ", %%mm6 \n\t"\
318 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
319 "movq %%mm2, 8+" #dst " \n\t"\
320 "psrad $" #shift ", %%mm4 \n\t"\
321 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
322 "movq %%mm4, 16+" #dst " \n\t"\
324 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
325 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
326 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
327 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
328 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
/*
 * IDCT, all-inputs variant: one 8-point inverse transform on two 16-bit
 * lanes at a time, reading all four input quadwords. No rounder is added.
 *   src0, src4, src1, src5 - memory operands holding the R0/R4, R2/R6,
 *                            R1/R3 and R5/R7 pairs (see the lane comments)
 *   dst                    - output base; eight 32-bit stores are made at
 *                            dst+0, 16, 32, 48, 64, 80, 96, 112 in the order
 *                            A0+B0, A1+B1, A2+B2, A3+B3, A3-B3, A2-B2,
 *                            A1-B1, A0-B0
 *   shift                  - arithmetic right shift applied to every sum
 *                            before it is packed with signed saturation
 * Multipliers are read from 16(%2)..104(%2); mm0-mm7 are all overwritten.
 * NOTE(review): invoked with shift 20, reading %1 and writing %0, so this
 * looks like the column pass - confirm.
 */
331 #define IDCT(src0, src4, src1, src5, dst, shift) \
332 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
333 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
334 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
335 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
336 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
337 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
338 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
339 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
340 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
341 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
342 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
343 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
344 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
345 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
346 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
347 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
348 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
349 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
350 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
351 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
352 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
353 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
354 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
355 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
356 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
357 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
358 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
359 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
360 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
361 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
362 "psrad $" #shift ", %%mm7 \n\t"\
363 "psrad $" #shift ", %%mm4 \n\t"\
364 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
365 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
366 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
367 "psrad $" #shift ", %%mm0 \n\t"\
368 "psrad $" #shift ", %%mm2 \n\t"\
369 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
370 "movd %%mm7, " #dst " \n\t"\
371 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
372 "movd %%mm0, 16+" #dst " \n\t"\
373 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
374 "movd %%mm2, 96+" #dst " \n\t"\
375 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
376 "movd %%mm4, 112+" #dst " \n\t"\
377 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
378 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
379 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
380 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
381 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
382 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
383 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
384 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
385 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
386 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
387 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
388 "psrad $" #shift ", %%mm2 \n\t"\
389 "psrad $" #shift ", %%mm5 \n\t"\
390 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
391 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
392 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
393 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
394 "psrad $" #shift ", %%mm6 \n\t"\
395 "psrad $" #shift ", %%mm4 \n\t"\
396 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
397 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
398 "movd %%mm2, 32+" #dst " \n\t"\
399 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
400 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
401 "movd %%mm6, 48+" #dst " \n\t"\
402 "movd %%mm4, 64+" #dst " \n\t"\
403 "movd %%mm5, 80+" #dst " \n\t"
406 //IDCT( src0, src4, src1, src5, dst, shift)
407 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
408 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
409 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
410 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
415 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
416 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
/*
 * IDCT variant that never reads src1: the odd part (B0..B3) is built from
 * the src5 quadword (R7/R5) alone, i.e. the R1/R3 contribution is treated
 * as zero. The even part (A0..A3) uses src0 and src4 as usual.
 *   src0, src4, src5 - input memory operands (src1 is accepted but unused)
 *   dst              - output base; 32-bit stores at dst+0, 16, ..., 112 in
 *                      the order A0+B0, A1+B1, A2+B2, A3+B3, A3-B3, A2-B2,
 *                      A1-B1, A0-B0
 *   shift            - arithmetic right shift applied before the saturating
 *                      pack
 * Multipliers are read from offsets of %2; mm0-mm7 are all overwritten.
 */
419 #define IDCT(src0, src4, src1, src5, dst, shift) \
420 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
421 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
422 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
423 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
424 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
425 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
426 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
427 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
428 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
429 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
430 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
431 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
432 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
433 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
434 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
435 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
436 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
437 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
438 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
439 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
440 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
441 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
442 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
443 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
444 "psrad $" #shift ", %%mm1 \n\t"\
445 "psrad $" #shift ", %%mm4 \n\t"\
446 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
447 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
448 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
449 "psrad $" #shift ", %%mm0 \n\t"\
450 "psrad $" #shift ", %%mm2 \n\t"\
451 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
452 "movd %%mm1, " #dst " \n\t"\
453 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
454 "movd %%mm0, 16+" #dst " \n\t"\
455 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
456 "movd %%mm2, 96+" #dst " \n\t"\
457 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
458 "movd %%mm4, 112+" #dst " \n\t"\
459 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
460 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
461 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
462 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
463 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
464 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
465 "psrad $" #shift ", %%mm2 \n\t"\
466 "psrad $" #shift ", %%mm5 \n\t"\
467 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
468 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
469 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
470 "psrad $" #shift ", %%mm6 \n\t"\
471 "psrad $" #shift ", %%mm1 \n\t"\
472 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
473 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
474 "movd %%mm2, 32+" #dst " \n\t"\
475 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
476 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
477 "movd %%mm6, 48+" #dst " \n\t"\
478 "movd %%mm1, 64+" #dst " \n\t"\
479 "movd %%mm5, 80+" #dst " \n\t"
481 //IDCT( src0, src4, src1, src5, dst, shift)
482 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
483 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
484 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
485 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
490 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
/*
 * IDCT variant that reads only src0 and src5. Neither src4 nor src1 is
 * loaded, so the R2/R6 and R1/R3 contributions are treated as zero:
 *   - even part: A0 and A3 are both C4R4+C4R0, A1 and A2 are both
 *     -C4R4+C4R0 (plain register copies, no C2/C6 products);
 *   - odd part: B0..B3 come from the src5 quadword (R7/R5) alone.
 *   dst   - output base; 32-bit stores at dst+0, 16, ..., 112 in the order
 *           A0+B0, A1+B1, A2+B2, A3+B3, A3-B3, A2-B2, A1-B1, A0-B0
 *   shift - arithmetic right shift applied before the saturating pack
 * Multipliers are read from offsets of %2; mm0-mm7 are all overwritten.
 */
493 #define IDCT(src0, src4, src1, src5, dst, shift) \
494 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
495 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
496 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
497 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
498 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
499 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
500 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
501 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
502 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
503 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
504 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
505 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
506 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
507 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
508 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
509 "psrad $" #shift ", %%mm1 \n\t"\
510 "psrad $" #shift ", %%mm4 \n\t"\
511 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
512 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
513 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
514 "psrad $" #shift ", %%mm0 \n\t"\
515 "psrad $" #shift ", %%mm2 \n\t"\
516 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
517 "movd %%mm1, " #dst " \n\t"\
518 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
519 "movd %%mm0, 16+" #dst " \n\t"\
520 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
521 "movd %%mm2, 96+" #dst " \n\t"\
522 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
523 "movd %%mm4, 112+" #dst " \n\t"\
524 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
525 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
526 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
527 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
528 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
529 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
530 "psrad $" #shift ", %%mm2 \n\t"\
531 "psrad $" #shift ", %%mm5 \n\t"\
532 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
533 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
534 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
535 "psrad $" #shift ", %%mm6 \n\t"\
536 "psrad $" #shift ", %%mm1 \n\t"\
537 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
538 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
539 "movd %%mm2, 32+" #dst " \n\t"\
540 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
541 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
542 "movd %%mm6, 48+" #dst " \n\t"\
543 "movd %%mm1, 64+" #dst " \n\t"\
544 "movd %%mm5, 80+" #dst " \n\t"
547 //IDCT( src0, src4, src1, src5, dst, shift)
548 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
549 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
550 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
551 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
556 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
/*
 * IDCT variant that never reads src4: the even part has no C2/C6 products,
 * so A0 and A3 are both C4R4+C4R0 and A1 and A2 are both -C4R4+C4R0 (the
 * R2/R6 contribution is treated as zero). The odd part (B0..B3) is the full
 * one, built from both src1 (R3/R1) and src5 (R7/R5).
 *   src0, src1, src5 - input memory operands (src4 is accepted but unused)
 *   dst              - output base; 32-bit stores at dst+0, 16, ..., 112 in
 *                      the order A0+B0, A1+B1, A2+B2, A3+B3, A3-B3, A2-B2,
 *                      A1-B1, A0-B0
 *   shift            - arithmetic right shift applied before the saturating
 *                      pack
 * Multipliers are read from offsets of %2; mm0-mm7 are all overwritten.
 */
559 #define IDCT(src0, src4, src1, src5, dst, shift) \
560 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
561 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
562 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
563 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
564 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
565 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
566 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
567 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
568 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
569 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
570 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
571 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
572 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
573 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
574 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
575 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
576 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
577 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
578 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
579 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
580 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
581 "psrad $" #shift ", %%mm7 \n\t"\
582 "psrad $" #shift ", %%mm4 \n\t"\
583 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
584 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
585 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
586 "psrad $" #shift ", %%mm0 \n\t"\
587 "psrad $" #shift ", %%mm2 \n\t"\
588 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
589 "movd %%mm7, " #dst " \n\t"\
590 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
591 "movd %%mm0, 16+" #dst " \n\t"\
592 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
593 "movd %%mm2, 96+" #dst " \n\t"\
594 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
595 "movd %%mm4, 112+" #dst " \n\t"\
596 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
597 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
598 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
599 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
600 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
601 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
602 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
603 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
604 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
605 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
606 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
607 "psrad $" #shift ", %%mm2 \n\t"\
608 "psrad $" #shift ", %%mm5 \n\t"\
609 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
610 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
611 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
612 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
613 "psrad $" #shift ", %%mm6 \n\t"\
614 "psrad $" #shift ", %%mm4 \n\t"\
615 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
616 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
617 "movd %%mm2, 32+" #dst " \n\t"\
618 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
619 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
620 "movd %%mm6, 48+" #dst " \n\t"\
621 "movd %%mm4, 64+" #dst " \n\t"\
622 "movd %%mm5, 80+" #dst " \n\t"
624 //IDCT( src0, src4, src1, src5, dst, shift)
625 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
626 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
627 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
628 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT variant that reads only src0 and src1. Neither src4 nor src5 is
 * loaded, so the R2/R6 and R5/R7 contributions are treated as zero:
 *   - even part: A0 and A3 are both C4R4+C4R0, A1 and A2 are both
 *     -C4R4+C4R0 (plain register copies);
 *   - odd part: B0..B3 are the single products of the src1 quadword (R3/R1)
 *     with the multipliers at 48(%2), 64(%2), 80(%2) and 96(%2).
 *   dst   - output base; 32-bit stores at dst+0, 16, ..., 112 in the order
 *           A0+B0, A1+B1, A2+B2, A3+B3, A3-B3, A2-B2, A1-B1, A0-B0
 *   shift - arithmetic right shift applied before the saturating pack
 * mm0-mm7 are all overwritten.
 */
634 #define IDCT(src0, src4, src1, src5, dst, shift) \
635 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
636 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
637 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
638 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
639 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
640 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
641 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
642 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
643 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
644 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
645 "movq 64(%2), %%mm3 \n\t"\
646 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
647 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
648 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
649 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
650 "psrad $" #shift ", %%mm7 \n\t"\
651 "psrad $" #shift ", %%mm4 \n\t"\
652 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
653 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
654 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
655 "psrad $" #shift ", %%mm0 \n\t"\
656 "psrad $" #shift ", %%mm1 \n\t"\
657 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
658 "movd %%mm7, " #dst " \n\t"\
659 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
660 "movd %%mm0, 16+" #dst " \n\t"\
661 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
662 "movd %%mm1, 96+" #dst " \n\t"\
663 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
664 "movd %%mm4, 112+" #dst " \n\t"\
665 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
666 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
667 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
668 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
669 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
670 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
671 "psrad $" #shift ", %%mm1 \n\t"\
672 "psrad $" #shift ", %%mm5 \n\t"\
673 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
674 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
675 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
676 "psrad $" #shift ", %%mm6 \n\t"\
677 "psrad $" #shift ", %%mm4 \n\t"\
678 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
679 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
680 "movd %%mm1, 32+" #dst " \n\t"\
681 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
682 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
683 "movd %%mm6, 48+" #dst " \n\t"\
684 "movd %%mm4, 64+" #dst " \n\t"\
685 "movd %%mm5, 80+" #dst " \n\t"
688 //IDCT( src0, src4, src1, src5, dst, shift)
689 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
690 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
691 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
692 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT variant with no odd part: only src0 and src4 are read (src1 and src5
 * are accepted but unused), so every output is just an even term A0..A3.
 * Unlike the other variants it handles four 16-bit lanes per invocation: it
 * loads the quadwords at src0/src4 and again at 8+src0/8+src4, packs the two
 * halves together and stores full 64-bit quadwords with movq.
 * Because there are no B terms the output is mirrored: A0 goes to dst+0 and
 * dst+112, A1 to dst+16 and dst+96, A2 to dst+32 and dst+80, A3 to dst+48
 * and dst+64.
 *   shift - arithmetic right shift applied before the saturating pack
 * Multipliers are read from 16(%2)..40(%2); mm0-mm7 are all overwritten.
 */
698 #define IDCT(src0, src4, src1, src5, dst, shift) \
699 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
700 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
701 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
702 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
703 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
704 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
705 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
706 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
707 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
708 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
709 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
710 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
711 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
712 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
713 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
714 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
715 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
716 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
717 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
718 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
719 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
720 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
721 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
722 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
723 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
724 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
725 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
726 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
727 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
728 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
729 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
730 "psrad $" #shift ", %%mm4 \n\t"\
731 "psrad $" #shift ", %%mm7 \n\t"\
732 "psrad $" #shift ", %%mm3 \n\t"\
733 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
734 "movq %%mm4, " #dst " \n\t"\
735 "psrad $" #shift ", %%mm0 \n\t"\
736 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
737 "movq %%mm0, 16+" #dst " \n\t"\
738 "movq %%mm0, 96+" #dst " \n\t"\
739 "movq %%mm4, 112+" #dst " \n\t"\
740 "psrad $" #shift ", %%mm5 \n\t"\
741 "psrad $" #shift ", %%mm6 \n\t"\
742 "psrad $" #shift ", %%mm2 \n\t"\
743 "packssdw %%mm2, %%mm5 \n\t" /* A2 a2 */\
744 "movq %%mm5, 32+" #dst " \n\t"\
745 "psrad $" #shift ", %%mm1 \n\t"\
746 "packssdw %%mm1, %%mm6 \n\t" /* A3 a3 */\
747 "movq %%mm6, 48+" #dst " \n\t"\
748 "movq %%mm6, 64+" #dst " \n\t"\
749 "movq %%mm5, 80+" #dst " \n\t"
752 //IDCT( src0, src4, src1, src5, dst, shift)
753 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
754 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
755 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
756 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT variant that reads only src0, src4 and src1; src5 is part of the
 * parameter list but is never referenced in the body.
 *
 * Register comments use the file's notation: upper-case (R, A, B) is the high
 * dword lane, lower-case (r, a, b) the low dword lane.
 *
 *  - A0..A3 are built from src0 and src4 with the coefficient rows at
 *    16(%2), 24(%2), 32(%2) and 40(%2).
 *  - B0..B3 are built from src1 alone with the rows at 48(%2), 64(%2),
 *    80(%2) and 96(%2).
 *  - Each sum/difference is shifted right arithmetically by `shift`, packed
 *    to 16 bits with signed saturation (packssdw) and the low 32 bits
 *    (two 16-bit results) are stored with movd:
 *        dst+0   = A0+B0      dst+112 = A0-B0
 *        dst+16  = A1+B1      dst+96  = A1-B1
 *        dst+32  = A2+B2      dst+80  = A2-B2
 *        dst+48  = A3+B3      dst+64  = A3-B3
 *
 * %2 is the `coeffs` table (third input operand of the enclosing asm).
 * All eight MMX registers mm0..mm7 are overwritten.
 */
763 #define IDCT(src0, src4, src1, src5, dst, shift) \
764 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
765 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
766 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
767 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
768 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
769 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
770 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
771 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
772 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
773 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
774 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
775 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
776 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
777 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
778 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
779 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
780 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
781 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
782 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
783 "movq 64(%2), %%mm1 \n\t" /* presumably -C7 C3 -C7 C3, going by the product below */\
784 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
785 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
786 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
787 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
788 "psrad $" #shift ", %%mm7 \n\t"\
789 "psrad $" #shift ", %%mm4 \n\t"\
790 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
791 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
792 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
793 "psrad $" #shift ", %%mm0 \n\t"\
794 "psrad $" #shift ", %%mm3 \n\t"\
795 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
796 "movd %%mm7, " #dst " \n\t"\
797 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
798 "movd %%mm0, 16+" #dst " \n\t"\
799 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
800 "movd %%mm3, 96+" #dst " \n\t"\
801 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
802 "movd %%mm4, 112+" #dst " \n\t"\
803 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
804 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
805 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
806 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
807 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
808 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
809 "psrad $" #shift ", %%mm3 \n\t"\
810 "psrad $" #shift ", %%mm5 \n\t"\
811 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
812 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
813 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
814 "psrad $" #shift ", %%mm6 \n\t"\
815 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
816 "movd %%mm3, 32+" #dst " \n\t"\
817 "psrad $" #shift ", %%mm4 \n\t"\
818 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
819 "movd %%mm6, 48+" #dst " \n\t"\
820 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
821 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
822 "movd %%mm4, 64+" #dst " \n\t"\
823 "movd %%mm5, 80+" #dst " \n\t"
/* Source operands are offsets from %1 (temp), the destination is an offset
 * from %0 (block), and the final argument is the right-shift count.
 * This IDCT variant stores only 4 bytes (two 16-bit results) per row with
 * movd, so all four invocations are needed, at dst offsets 0, 4, 8 and 12. */
826 //IDCT( src0, src4, src1, src5, dst, shift)
827 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
828 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
829 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
830 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT variant that reads only src0 and 8+src0; src4, src1 and src5 are part
 * of the parameter list but are never referenced in the body.
 *
 * Both 8-byte inputs are multiplied (pmaddwd) by the coefficient rows at
 * 16(%2) and 24(%2), shifted right arithmetically by `shift`, and packed with
 * signed saturation so that each result register holds four 16-bit values
 * (low half from src0, high half from 8+src0). The two results are then
 * replicated with 8-byte movq stores:
 *     mm4 (labelled A0) -> dst+0,  dst+48, dst+64, dst+112
 *     mm0 (labelled A1) -> dst+16, dst+32, dst+80, dst+96
 *
 * %2 is the `coeffs` table (third input operand of the enclosing asm).
 * NOTE(review): the load of 32(%2) into mm7 is not read again inside this
 * macro; it looks like a leftover from a fuller variant - confirm before
 * removing.
 */
837 #define IDCT(src0, src4, src1, src5, dst, shift) \
838 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
839 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
840 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
841 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
842 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
843 "psrad $" #shift ", %%mm4 \n\t"\
844 "psrad $" #shift ", %%mm0 \n\t"\
845 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
846 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
847 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
848 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
849 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
850 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
851 "psrad $" #shift ", %%mm1 \n\t"\
852 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
853 "movq %%mm4, " #dst " \n\t"\
854 "psrad $" #shift ", %%mm2 \n\t"\
855 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
856 "movq %%mm0, 16+" #dst " \n\t"\
857 "movq %%mm0, 96+" #dst " \n\t"\
858 "movq %%mm4, 112+" #dst " \n\t"\
859 "movq %%mm0, 32+" #dst " \n\t"\
860 "movq %%mm4, 48+" #dst " \n\t"\
861 "movq %%mm4, 64+" #dst " \n\t"\
862 "movq %%mm0, 80+" #dst " \n\t"
/* Source operands are offsets from %1 (temp), the destination is an offset
 * from %0 (block), and the final argument is the right-shift count.
 * This IDCT variant itself reads src0 and 8+src0 and stores 8 bytes per row,
 * so only the 0(%0) and 8(%0) invocations are live; the 4(%0) and 12(%0)
 * ones stay commented out. */
864 //IDCT( src0, src4, src1, src5, dst, shift)
865 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
866 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
867 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
868 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
873 00 40 04 44 20 60 24 64
874 10 30 14 34 50 70 54 74
875 01 41 03 43 21 61 23 63
876 11 31 13 33 51 71 53 73
877 02 42 06 46 22 62 26 66
878 12 32 16 36 52 72 56 76
879 05 45 07 47 25 65 27 67
880 15 35 17 37 55 75 57 77
883 00 04 10 14 20 24 30 34
884 40 44 50 54 60 64 70 74
885 01 03 11 13 21 23 31 33
886 41 43 51 53 61 63 71 73
887 02 06 12 16 22 26 32 36
888 42 46 52 56 62 66 72 76
889 05 07 15 17 25 27 35 37
890 45 47 55 57 65 67 75 77
/* No output operands; the inputs are %0 = block, %1 = temp, %2 = coeffs,
 * each held in a general-purpose register. */
894 :: "r" (block), "r" (temp), "r" (coeffs)
895 NAMED_CONSTRAINTS_ADD(wm1010,d40000)
/* Public MMX IDCT entry point operating on `block`.
 * NOTE(review): presumably a thin wrapper around the static idct() in this
 * file - TODO confirm. */
900 void ff_simple_idct_mmx(int16_t *block)
905 //FIXME merge add/put into the idct
/* "Put" entry point: hands block, dest and line_size to
 * ff_put_pixels_clamped().
 * NOTE(review): presumably idct(block) runs first so that the values passed
 * on are the transformed ones - TODO confirm. */
907 void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
910 ff_put_pixels_clamped(block, dest, line_size);
/* "Add" entry point: hands block, dest and line_size to
 * ff_add_pixels_clamped().
 * NOTE(review): presumably idct(block) runs first so that the values passed
 * on are the transformed ones - TODO confirm. */
912 void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
915 ff_add_pixels_clamped(block, dest, line_size);
918 #endif /* HAVE_INLINE_ASM */