4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of Libav.
8 * Libav is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * Libav is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with Libav; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "libavutil/internal.h"
24 #include "libavutil/mem.h"
25 #include "libavutil/x86/asm.h"
27 #include "simple_idct.h"
/*
 * Fixed-point cosine constants: Ci = cos(i*M_PI/16) * sqrt(2) * (1<<14)
 * for i = 0..7, where i is the digit in the macro name. Adding 0.5 before
 * truncation rounds to the nearest integer. C4 is the exception: its exact
 * value is 1<<14 = 16384 and 0.5 is subtracted instead, giving 16383.
 * NOTE(review): the reason for taking C4 one below the exact value is not
 * stated in this part of the file.
 */
41 #define C0 23170 // cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42 #define C1 22725 // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
43 #define C2 21407 // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44 #define C3 19266 // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45 #define C4 16383 // cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (exact value would be 16384)
46 #define C5 12873 // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47 #define C6 8867 // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
48 #define C7 4520 // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/*
 * Same value as the literal 20 that the column-pass macro invocations pass
 * as their psrad shift count; the visible code only references COL_SHIFT in
 * commented-out coeffs entries.
 * NOTE(review): the meaning of the trailing "6" is not evident from here.
 */
51 #define COL_SHIFT 20 // 6
/*
 * Word mask that keeps the upper 16-bit word of each 32-bit lane and clears
 * the lower one. The DC_COND_* macros AND it with the first source quadword
 * (annotated "R4 R0 r4 r0"), which drops R0/r0, then OR in the other three
 * source quadwords and test the result for zero.
 */
53 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
/*
 * 0x40000 in the low 32-bit lane only (the high lane is zero). DC_COND_IDCT
 * adds it to mm0 between "pslld $16" and "psrad $13"; in DC_COND_ROW_IDCT
 * the same paddd is commented out inside the asm string.
 * NOTE(review): presumably a rounding bias for that shift -- confirm.
 */
54 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
56 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
57 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
58 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
59 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
60 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
61 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
84 static inline void idct(int16_t *block)
86 DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
87 int16_t * const temp= (int16_t*)align_tmp;
90 #if 0 //Alternative, simpler variant
92 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
93 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
94 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
95 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
96 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
97 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
98 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
99 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
100 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
101 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
102 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
103 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
104 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
105 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
106 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
107 #rounder ", %%mm4 \n\t"\
108 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
109 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
110 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
111 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
112 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
113 #rounder ", %%mm0 \n\t"\
114 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
115 "paddd %%mm0, %%mm0 \n\t" \
116 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
117 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
118 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
119 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
120 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
121 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
122 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
123 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
124 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
125 "psrad $" #shift ", %%mm7 \n\t"\
126 "psrad $" #shift ", %%mm4 \n\t"\
127 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
128 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
129 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
130 "psrad $" #shift ", %%mm1 \n\t"\
131 "psrad $" #shift ", %%mm2 \n\t"\
132 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
133 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
134 "movq %%mm7, " #dst " \n\t"\
135 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
136 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
137 "movq %%mm2, 24+" #dst " \n\t"\
138 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
139 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
140 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
141 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
142 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
143 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
144 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
145 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
146 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
147 "psrad $" #shift ", %%mm2 \n\t"\
148 "psrad $" #shift ", %%mm0 \n\t"\
149 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
150 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
151 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
152 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
153 "psrad $" #shift ", %%mm6 \n\t"\
154 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
155 "movq %%mm2, 8+" #dst " \n\t"\
156 "psrad $" #shift ", %%mm4 \n\t"\
157 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
158 "movq %%mm4, 16+" #dst " \n\t"\
160 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
161 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
162 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
163 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
164 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
165 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
166 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
167 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
168 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
169 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
170 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
171 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
172 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
173 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
174 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
175 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
176 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
177 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
178 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
179 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
180 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
181 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
182 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
183 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
184 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
185 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
186 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
187 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
188 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
189 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
190 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
191 "psrad $" #shift ", %%mm7 \n\t"\
192 "psrad $" #shift ", %%mm4 \n\t"\
193 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
194 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
195 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
196 "psrad $" #shift ", %%mm0 \n\t"\
197 "psrad $" #shift ", %%mm2 \n\t"\
198 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
199 "movd %%mm7, " #dst " \n\t"\
200 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
201 "movd %%mm0, 16+" #dst " \n\t"\
202 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
203 "movd %%mm2, 96+" #dst " \n\t"\
204 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
205 "movd %%mm4, 112+" #dst " \n\t"\
206 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
207 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
208 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
209 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
210 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
211 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
212 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
213 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
214 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
215 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
216 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
217 "psrad $" #shift ", %%mm2 \n\t"\
218 "psrad $" #shift ", %%mm5 \n\t"\
219 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
220 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
221 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
222 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
223 "psrad $" #shift ", %%mm6 \n\t"\
224 "psrad $" #shift ", %%mm4 \n\t"\
225 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
226 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
227 "movd %%mm2, 32+" #dst " \n\t"\
228 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
229 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
230 "movd %%mm6, 48+" #dst " \n\t"\
231 "movd %%mm4, 64+" #dst " \n\t"\
232 "movd %%mm5, 80+" #dst " \n\t"\
235 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
236 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
237 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
238 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
239 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
240 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
241 "pand %%mm0, %%mm4 \n\t"\
242 "por %%mm1, %%mm4 \n\t"\
243 "por %%mm2, %%mm4 \n\t"\
244 "por %%mm3, %%mm4 \n\t"\
245 "packssdw %%mm4,%%mm4 \n\t"\
246 "movd %%mm4, %%eax \n\t"\
247 "orl %%eax, %%eax \n\t"\
249 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
250 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
251 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
252 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
253 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
254 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
255 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
256 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
257 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
258 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
259 #rounder ", %%mm4 \n\t"\
260 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
261 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
262 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
263 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
264 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
265 #rounder ", %%mm0 \n\t"\
266 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
267 "paddd %%mm0, %%mm0 \n\t" \
268 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
269 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
270 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
271 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
272 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
273 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
274 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
275 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
276 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
277 "psrad $" #shift ", %%mm7 \n\t"\
278 "psrad $" #shift ", %%mm4 \n\t"\
279 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
280 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
281 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
282 "psrad $" #shift ", %%mm1 \n\t"\
283 "psrad $" #shift ", %%mm2 \n\t"\
284 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
285 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
286 "movq %%mm7, " #dst " \n\t"\
287 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
288 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
289 "movq %%mm2, 24+" #dst " \n\t"\
290 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
291 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
292 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
293 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
294 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
295 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
296 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
297 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
298 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
299 "psrad $" #shift ", %%mm2 \n\t"\
300 "psrad $" #shift ", %%mm0 \n\t"\
301 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
302 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
303 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
304 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
305 "psrad $" #shift ", %%mm6 \n\t"\
306 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
307 "movq %%mm2, 8+" #dst " \n\t"\
308 "psrad $" #shift ", %%mm4 \n\t"\
309 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
310 "movq %%mm4, 16+" #dst " \n\t"\
313 "pslld $16, %%mm0 \n\t"\
314 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
315 "psrad $13, %%mm0 \n\t"\
316 "packssdw %%mm0, %%mm0 \n\t"\
317 "movq %%mm0, " #dst " \n\t"\
318 "movq %%mm0, 8+" #dst " \n\t"\
319 "movq %%mm0, 16+" #dst " \n\t"\
320 "movq %%mm0, 24+" #dst " \n\t"\
324 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
325 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
326 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
327 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
328 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
330 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
331 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
332 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
335 //IDCT( src0, src4, src1, src5, dst, shift)
336 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
337 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
338 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
339 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
343 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
344 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
345 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
346 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
347 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
348 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
349 "pand %%mm0, %%mm4 \n\t"\
350 "por %%mm1, %%mm4 \n\t"\
351 "por %%mm2, %%mm4 \n\t"\
352 "por %%mm3, %%mm4 \n\t"\
353 "packssdw %%mm4,%%mm4 \n\t"\
354 "movd %%mm4, %%eax \n\t"\
355 "orl %%eax, %%eax \n\t"\
357 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
358 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
359 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
360 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
361 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
362 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
363 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
364 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
365 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
366 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
367 #rounder ", %%mm4 \n\t"\
368 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
369 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
370 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
371 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
372 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
373 #rounder ", %%mm0 \n\t"\
374 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
375 "paddd %%mm0, %%mm0 \n\t" \
376 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
377 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
378 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
379 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
380 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
381 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
382 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
383 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
384 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
385 "psrad $" #shift ", %%mm7 \n\t"\
386 "psrad $" #shift ", %%mm4 \n\t"\
387 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
388 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
389 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
390 "psrad $" #shift ", %%mm1 \n\t"\
391 "psrad $" #shift ", %%mm2 \n\t"\
392 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
393 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
394 "movq %%mm7, " #dst " \n\t"\
395 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
396 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
397 "movq %%mm2, 24+" #dst " \n\t"\
398 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
399 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
400 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
401 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
402 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
403 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
404 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
405 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
406 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
407 "psrad $" #shift ", %%mm2 \n\t"\
408 "psrad $" #shift ", %%mm0 \n\t"\
409 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
410 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
411 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
412 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
413 "psrad $" #shift ", %%mm6 \n\t"\
414 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
415 "movq %%mm2, 8+" #dst " \n\t"\
416 "psrad $" #shift ", %%mm4 \n\t"\
417 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
418 "movq %%mm4, 16+" #dst " \n\t"\
421 "pslld $16, %%mm0 \n\t"\
422 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
423 "psrad $13, %%mm0 \n\t"\
424 "packssdw %%mm0, %%mm0 \n\t"\
425 "movq %%mm0, " #dst " \n\t"\
426 "movq %%mm0, 8+" #dst " \n\t"\
427 "movq %%mm0, 16+" #dst " \n\t"\
428 "movq %%mm0, 24+" #dst " \n\t"\
431 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
432 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
433 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
434 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
435 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
436 "movq %%mm0, %%mm4 \n\t"\
437 "por %%mm1, %%mm4 \n\t"\
438 "por %%mm2, %%mm4 \n\t"\
439 "por %%mm3, %%mm4 \n\t"\
440 "packssdw %%mm4,%%mm4 \n\t"\
441 "movd %%mm4, %%eax \n\t"\
442 "orl %%eax, %%eax \n\t"\
444 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
445 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
446 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
447 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
448 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
449 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
450 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
451 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
452 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
453 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
454 #rounder ", %%mm4 \n\t"\
455 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
456 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
457 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
458 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
459 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
460 #rounder ", %%mm0 \n\t"\
461 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
462 "paddd %%mm0, %%mm0 \n\t" \
463 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
464 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
465 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
466 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
467 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
468 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
469 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
470 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
471 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
472 "psrad $" #shift ", %%mm7 \n\t"\
473 "psrad $" #shift ", %%mm4 \n\t"\
474 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
475 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
476 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
477 "psrad $" #shift ", %%mm1 \n\t"\
478 "psrad $" #shift ", %%mm2 \n\t"\
479 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
480 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
481 "movq %%mm7, " #dst " \n\t"\
482 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
483 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
484 "movq %%mm2, 24+" #dst " \n\t"\
485 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
486 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
487 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
488 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
489 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
490 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
491 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
492 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
493 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
494 "psrad $" #shift ", %%mm2 \n\t"\
495 "psrad $" #shift ", %%mm0 \n\t"\
496 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
497 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
498 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
499 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
500 "psrad $" #shift ", %%mm6 \n\t"\
501 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
502 "movq %%mm2, 8+" #dst " \n\t"\
503 "psrad $" #shift ", %%mm4 \n\t"\
504 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
505 "movq %%mm4, 16+" #dst " \n\t"\
507 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
508 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
509 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
510 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
511 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
512 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
513 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
514 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
515 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
516 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
517 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
518 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
519 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
520 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
521 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
522 #rounder ", %%mm4 \n\t"\
523 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
524 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
525 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
526 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
527 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
528 #rounder ", %%mm0 \n\t"\
529 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
530 "paddd %%mm0, %%mm0 \n\t" \
531 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
532 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
533 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
534 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
535 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
536 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
537 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
538 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
539 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
540 "psrad $" #shift ", %%mm7 \n\t"\
541 "psrad $" #shift ", %%mm4 \n\t"\
542 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
543 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
544 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
545 "psrad $" #shift ", %%mm1 \n\t"\
546 "psrad $" #shift ", %%mm2 \n\t"\
547 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
548 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
549 "movq %%mm7, " #dst " \n\t"\
550 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
551 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
552 "movq %%mm2, 24+" #dst " \n\t"\
553 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
554 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
555 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
556 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
557 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
558 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
559 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
560 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
561 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
562 "psrad $" #shift ", %%mm2 \n\t"\
563 "psrad $" #shift ", %%mm0 \n\t"\
564 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
565 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
566 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
567 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
568 "psrad $" #shift ", %%mm6 \n\t"\
569 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
570 "movq %%mm2, 8+" #dst " \n\t"\
571 "psrad $" #shift ", %%mm4 \n\t"\
572 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
573 "movq %%mm4, 16+" #dst " \n\t"\
575 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
576 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
577 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
578 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
579 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
582 #define IDCT(src0, src4, src1, src5, dst, shift) \
583 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
584 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
585 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
586 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
587 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
588 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
589 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
590 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
591 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
592 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
593 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
594 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
595 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
596 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
597 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
598 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
599 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
600 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
601 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
602 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
603 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
604 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
605 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
606 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
607 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
608 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
609 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
610 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
611 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
612 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
613 "psrad $" #shift ", %%mm7 \n\t"\
614 "psrad $" #shift ", %%mm4 \n\t"\
615 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
616 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
617 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
618 "psrad $" #shift ", %%mm0 \n\t"\
619 "psrad $" #shift ", %%mm2 \n\t"\
620 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
621 "movd %%mm7, " #dst " \n\t"\
622 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
623 "movd %%mm0, 16+" #dst " \n\t"\
624 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
625 "movd %%mm2, 96+" #dst " \n\t"\
626 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
627 "movd %%mm4, 112+" #dst " \n\t"\
628 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
629 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
630 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
631 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
632 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
633 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
634 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
635 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
636 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
637 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
638 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
639 "psrad $" #shift ", %%mm2 \n\t"\
640 "psrad $" #shift ", %%mm5 \n\t"\
641 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
642 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
643 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
644 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
645 "psrad $" #shift ", %%mm6 \n\t"\
646 "psrad $" #shift ", %%mm4 \n\t"\
647 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
648 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
649 "movd %%mm2, 32+" #dst " \n\t"\
650 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
651 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
652 "movd %%mm6, 48+" #dst " \n\t"\
653 "movd %%mm4, 64+" #dst " \n\t"\
654 "movd %%mm5, 80+" #dst " \n\t"
657 //IDCT( src0, src4, src1, src5, dst, shift)
658 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
659 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
660 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
661 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
666 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
667 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
670 #define IDCT(src0, src4, src1, src5, dst, shift) \
671 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
672 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
673 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
674 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
675 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
676 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
677 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
678 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
679 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
680 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
681 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
682 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
683 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
684 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
685 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
686 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
687 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
688 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
689 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
690 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
691 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
692 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
693 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
694 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
695 "psrad $" #shift ", %%mm1 \n\t"\
696 "psrad $" #shift ", %%mm4 \n\t"\
697 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
698 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
699 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
700 "psrad $" #shift ", %%mm0 \n\t"\
701 "psrad $" #shift ", %%mm2 \n\t"\
702 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
703 "movd %%mm1, " #dst " \n\t"\
704 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
705 "movd %%mm0, 16+" #dst " \n\t"\
706 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
707 "movd %%mm2, 96+" #dst " \n\t"\
708 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
709 "movd %%mm4, 112+" #dst " \n\t"\
710 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
711 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
712 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
713 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
714 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
715 "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
716 "psrad $" #shift ", %%mm2 \n\t"\
717 "psrad $" #shift ", %%mm5 \n\t"\
718 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
719 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
720 "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
721 "psrad $" #shift ", %%mm6 \n\t"\
722 "psrad $" #shift ", %%mm1 \n\t"\
723 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
724 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
725 "movd %%mm2, 32+" #dst " \n\t"\
726 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
727 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
728 "movd %%mm6, 48+" #dst " \n\t"\
729 "movd %%mm1, 64+" #dst " \n\t"\
730 "movd %%mm5, 80+" #dst " \n\t"
732 //IDCT( src0, src4, src1, src5, dst, shift)
733 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
734 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
735 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
736 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
741 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
/*
 * IDCT variant that reads only src0 and src5; src4 and src1 belong to the
 * common argument list but are not referenced in this body.
 *
 * Even part: only the +/-C4 products of src0 are formed, so A0 == A3
 * (mm4 / mm6) and A1 == A2 (mm0 / mm5).
 * Odd part: each of B0..B3 is a single pmaddwd product of src5.
 * Every A+B / A-B pair is shifted right by `shift`, packed with signed
 * saturation and stored as 32 bits (movd) at dst + 0, 16, 32, ... 112.
 * Constants are fetched through %2 at the byte offsets shown.
 * NOTE(review): presumably used when the inputs behind src4 and src1 are
 * known to be zero -- confirm against the zero checks that select this path.
 */
744 #define IDCT(src0, src4, src1, src5, dst, shift) \
745 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
746 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
747 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
748 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
749 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
750 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
751 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (copy kept as A3) */\
752 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (copy kept as A2) */\
753 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
754 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
755 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
756 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
757 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
758 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
759 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
760 "psrad $" #shift ", %%mm1 \n\t"\
761 "psrad $" #shift ", %%mm4 \n\t"\
762 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
763 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
764 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
765 "psrad $" #shift ", %%mm0 \n\t"\
766 "psrad $" #shift ", %%mm2 \n\t"\
767 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
768 "movd %%mm1, " #dst " \n\t"\
769 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
770 "movd %%mm0, 16+" #dst " \n\t"\
771 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
772 "movd %%mm2, 96+" #dst " \n\t"\
773 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
774 "movd %%mm4, 112+" #dst " \n\t"\
775 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
776 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
777 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
778 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
779 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
780 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
781 "psrad $" #shift ", %%mm2 \n\t"\
782 "psrad $" #shift ", %%mm5 \n\t"\
783 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
784 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
785 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
786 "psrad $" #shift ", %%mm6 \n\t"\
787 "psrad $" #shift ", %%mm1 \n\t"\
788 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
789 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
790 "movd %%mm2, 32+" #dst " \n\t"\
791 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
792 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
793 "movd %%mm6, 48+" #dst " \n\t"\
794 "movd %%mm1, 64+" #dst " \n\t"\
795 "movd %%mm5, 80+" #dst " \n\t"
798 //IDCT( src0, src4, src1, src5, dst, shift)
799 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
800 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
801 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
802 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
807 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
/*
 * IDCT variant that reads src0, src1 and src5; src4 is in the argument list
 * but is not referenced in this body.
 *
 * Even part: only the +/-C4 products of src0 are formed, so A0 == A3
 * (mm4 / mm6) and A1 == A2 (mm0 / mm5).
 * Odd part: each of B0..B3 is the sum of one pmaddwd product of src1 and one
 * of src5. src1 is loaded a second time halfway through because mm2, which
 * held it, has been reused as a temporary by then.
 * Every A+B / A-B pair is shifted right by `shift`, packed with signed
 * saturation and stored as 32 bits (movd) at dst + 0, 16, 32, ... 112.
 * NOTE(review): presumably used when the input behind src4 is known to be
 * zero -- confirm against the zero checks that select this path.
 */
810 #define IDCT(src0, src4, src1, src5, dst, shift) \
811 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
812 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
813 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
814 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
815 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
816 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
817 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
818 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (copy kept as A3) */\
819 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
820 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
821 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (copy kept as A2) */\
822 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
823 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
824 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
825 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
826 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
827 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
828 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
829 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
830 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
831 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
832 "psrad $" #shift ", %%mm7 \n\t"\
833 "psrad $" #shift ", %%mm4 \n\t"\
834 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
835 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
836 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
837 "psrad $" #shift ", %%mm0 \n\t"\
838 "psrad $" #shift ", %%mm2 \n\t"\
839 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
840 "movd %%mm7, " #dst " \n\t"\
841 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
842 "movd %%mm0, 16+" #dst " \n\t"\
843 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
844 "movd %%mm2, 96+" #dst " \n\t"\
845 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
846 "movd %%mm4, 112+" #dst " \n\t"\
847 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 (reloaded, mm2 was overwritten) */\
848 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
849 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
850 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
851 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
852 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
853 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
854 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
855 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
856 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
857 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
858 "psrad $" #shift ", %%mm2 \n\t"\
859 "psrad $" #shift ", %%mm5 \n\t"\
860 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
861 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
862 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
863 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
864 "psrad $" #shift ", %%mm6 \n\t"\
865 "psrad $" #shift ", %%mm4 \n\t"\
866 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
867 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
868 "movd %%mm2, 32+" #dst " \n\t"\
869 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
870 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
871 "movd %%mm6, 48+" #dst " \n\t"\
872 "movd %%mm4, 64+" #dst " \n\t"\
873 "movd %%mm5, 80+" #dst " \n\t"
875 //IDCT( src0, src4, src1, src5, dst, shift)
876 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
877 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
878 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
879 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT variant that reads only src0 and src1; src4 and src5 are in the
 * argument list but are not referenced in this body.
 *
 * Even part: only the +/-C4 products of src0 are formed, so A0 == A3
 * (mm4 / mm6) and A1 == A2 (mm0 / mm5).
 * Odd part: each of B0..B3 is a single pmaddwd product of src1.
 * Every A+B / A-B pair is shifted right by `shift`, packed with signed
 * saturation and stored as 32 bits (movd) at dst + 0, 16, 32, ... 112.
 * NOTE(review): presumably used when the inputs behind src4 and src5 are
 * known to be zero -- confirm against the zero checks that select this path.
 */
885 #define IDCT(src0, src4, src1, src5, dst, shift) \
886 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
887 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
888 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
889 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
890 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
891 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
892 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (copy kept as A3) */\
893 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
894 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
895 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (copy kept as A2) */\
896 "movq 64(%2), %%mm3 \n\t" /* multiplier pair for B1, see the product below */\
897 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
898 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
899 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
900 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
901 "psrad $" #shift ", %%mm7 \n\t"\
902 "psrad $" #shift ", %%mm4 \n\t"\
903 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
904 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
905 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
906 "psrad $" #shift ", %%mm0 \n\t"\
907 "psrad $" #shift ", %%mm1 \n\t"\
908 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
909 "movd %%mm7, " #dst " \n\t"\
910 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
911 "movd %%mm0, 16+" #dst " \n\t"\
912 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
913 "movd %%mm1, 96+" #dst " \n\t"\
914 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
915 "movd %%mm4, 112+" #dst " \n\t"\
916 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
917 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
918 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
919 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
920 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
921 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
922 "psrad $" #shift ", %%mm1 \n\t"\
923 "psrad $" #shift ", %%mm5 \n\t"\
924 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
925 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
926 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
927 "psrad $" #shift ", %%mm6 \n\t"\
928 "psrad $" #shift ", %%mm4 \n\t"\
929 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
930 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
931 "movd %%mm1, 32+" #dst " \n\t"\
932 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
933 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
934 "movd %%mm6, 48+" #dst " \n\t"\
935 "movd %%mm4, 64+" #dst " \n\t"\
936 "movd %%mm5, 80+" #dst " \n\t"
939 //IDCT( src0, src4, src1, src5, dst, shift)
940 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
941 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
942 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
943 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT variant without an odd part: reads src0 and src4 plus the 8 bytes
 * that follow each of them (8+src0, 8+src4); src1 and src5 are in the
 * argument list but are not referenced in this body.
 *
 * A0..A3 are computed for both 8-byte groups (first group in mm4, mm0, mm5,
 * mm6; second group in mm7, mm3, mm2, mm1), shifted right by `shift`, and the
 * two groups are packed together so that every store is a full 64-bit movq.
 * With no B terms each result is written twice:
 *   A0 -> dst and 112+dst,   A1 -> 16+dst and 96+dst,
 *   A2 -> 32+dst and 80+dst, A3 -> 48+dst and 64+dst.
 * NOTE(review): presumably used when the inputs behind src1 and src5 are
 * known to be zero -- confirm against the zero checks that select this path.
 */
949 #define IDCT(src0, src4, src1, src5, dst, shift) \
950 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
951 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
952 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
953 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
954 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
955 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
956 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
957 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
958 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
959 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
960 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
961 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
962 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
963 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
964 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
965 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
966 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 (second 8-byte group) */\
967 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 (second 8-byte group) */\
968 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
969 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
970 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
971 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
972 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
973 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
974 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
975 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
976 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
977 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
978 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
979 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
980 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
981 "psrad $" #shift ", %%mm4 \n\t"\
982 "psrad $" #shift ", %%mm7 \n\t"\
983 "psrad $" #shift ", %%mm3 \n\t"\
984 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
985 "movq %%mm4, " #dst " \n\t"\
986 "psrad $" #shift ", %%mm0 \n\t"\
987 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
988 "movq %%mm0, 16+" #dst " \n\t"\
989 "movq %%mm0, 96+" #dst " \n\t"\
990 "movq %%mm4, 112+" #dst " \n\t"\
991 "psrad $" #shift ", %%mm5 \n\t"\
992 "psrad $" #shift ", %%mm6 \n\t"\
993 "psrad $" #shift ", %%mm2 \n\t"\
994 "packssdw %%mm2, %%mm5 \n\t" /* A2 a2 */\
995 "movq %%mm5, 32+" #dst " \n\t"\
996 "psrad $" #shift ", %%mm1 \n\t"\
997 "packssdw %%mm1, %%mm6 \n\t" /* A3 a3 */\
998 "movq %%mm6, 48+" #dst " \n\t"\
999 "movq %%mm6, 64+" #dst " \n\t"\
1000 "movq %%mm5, 80+" #dst " \n\t"
1003 //IDCT( src0, src4, src1, src5, dst, shift)
1004 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1005 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1006 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1007 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1011 "# .p2align 4 \n\t"\
/*
 * IDCT variant that reads src0, src4 and src1; src5 is in the argument list
 * but is not referenced in this body.
 *
 * Even part: A0..A3 are built from the +/-C4 products of src0 combined with
 * the C2/C6 products of src4 (A0 in mm4, A1 in mm0, A2 in mm5, A3 in mm6).
 * Odd part: each of B0..B3 is a single pmaddwd product of src1.
 * Every A+B / A-B pair is shifted right by `shift`, packed with signed
 * saturation and stored as 32 bits (movd) at dst + 0, 16, 32, ... 112.
 * NOTE(review): presumably used when the input behind src5 is known to be
 * zero -- confirm against the zero checks that select this path.
 */
1014 #define IDCT(src0, src4, src1, src5, dst, shift) \
1015 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1016 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1017 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1018 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1019 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1020 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1021 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1022 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1023 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1024 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1025 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1026 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1027 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1028 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1029 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1030 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1031 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1032 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1033 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1034 "movq 64(%2), %%mm1 \n\t" /* multiplier pair for B1, see the product below */\
1035 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1036 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1037 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1038 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1039 "psrad $" #shift ", %%mm7 \n\t"\
1040 "psrad $" #shift ", %%mm4 \n\t"\
1041 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1042 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1043 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1044 "psrad $" #shift ", %%mm0 \n\t"\
1045 "psrad $" #shift ", %%mm3 \n\t"\
1046 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1047 "movd %%mm7, " #dst " \n\t"\
1048 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1049 "movd %%mm0, 16+" #dst " \n\t"\
1050 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1051 "movd %%mm3, 96+" #dst " \n\t"\
1052 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1053 "movd %%mm4, 112+" #dst " \n\t"\
1054 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1055 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1056 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1057 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1058 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1059 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1060 "psrad $" #shift ", %%mm3 \n\t"\
1061 "psrad $" #shift ", %%mm5 \n\t"\
1062 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1063 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1064 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1065 "psrad $" #shift ", %%mm6 \n\t"\
1066 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1067 "movd %%mm3, 32+" #dst " \n\t"\
1068 "psrad $" #shift ", %%mm4 \n\t"\
1069 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1070 "movd %%mm6, 48+" #dst " \n\t"\
1071 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1072 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1073 "movd %%mm4, 64+" #dst " \n\t"\
1074 "movd %%mm5, 80+" #dst " \n\t"
1077 //IDCT( src0, src4, src1, src5, dst, shift)
1078 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1079 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1080 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1081 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * Smallest IDCT variant: reads only src0 and the 8 bytes that follow it
 * (8+src0); src4, src1 and src5 are in the argument list but are not
 * referenced in this body.
 *
 * For each 8-byte group only the two +/-C4 products are formed and shifted
 * right by `shift`; the groups are then packed together so that mm4 holds the
 * C4R4+C4R0 results and mm0 the -C4R4+C4R0 results.
 * mm4 is stored (64-bit movq) at dst + 0, 48, 64 and 112,
 * mm0 at dst + 16, 32, 80 and 96.
 * NOTE(review): the load from 32(%2) into mm7 is not read again inside this
 * macro -- it looks like a leftover and could probably be dropped; confirm
 * nothing after the expansion relies on mm7.
 * NOTE(review): presumably used when the inputs behind src4, src1 and src5
 * are known to be zero -- confirm against the zero checks that select this
 * path.
 */
1088 #define IDCT(src0, src4, src1, src5, dst, shift) \
1089 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1090 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1091 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1092 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1093 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1094 "psrad $" #shift ", %%mm4 \n\t"\
1095 "psrad $" #shift ", %%mm0 \n\t"\
1096 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 (second 8-byte group) */\
1097 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1098 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1099 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1100 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1101 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 (mm7 is not read again below) */\
1102 "psrad $" #shift ", %%mm1 \n\t"\
1103 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1104 "movq %%mm4, " #dst " \n\t"\
1105 "psrad $" #shift ", %%mm2 \n\t"\
1106 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1107 "movq %%mm0, 16+" #dst " \n\t"\
1108 "movq %%mm0, 96+" #dst " \n\t"\
1109 "movq %%mm4, 112+" #dst " \n\t"\
1110 "movq %%mm0, 32+" #dst " \n\t"\
1111 "movq %%mm4, 48+" #dst " \n\t"\
1112 "movq %%mm4, 64+" #dst " \n\t"\
1113 "movq %%mm0, 80+" #dst " \n\t"
1115 //IDCT( src0, src4, src1, src5, dst, shift)
1116 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1117 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1118 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1119 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1126 00 40 04 44 20 60 24 64
1127 10 30 14 34 50 70 54 74
1128 01 41 03 43 21 61 23 63
1129 11 31 13 33 51 71 53 73
1130 02 42 06 46 22 62 26 66
1131 12 32 16 36 52 72 56 76
1132 05 45 07 47 25 65 27 67
1133 15 35 17 37 55 75 57 77
1136 00 04 10 14 20 24 30 34
1137 40 44 50 54 60 64 70 74
1138 01 03 11 13 21 23 31 33
1139 41 43 51 53 61 63 71 73
1140 02 06 12 16 22 26 32 36
1141 42 46 52 56 62 66 72 76
1142 05 07 15 17 25 27 35 37
1143 45 47 55 57 65 67 75 77
1147 :: "r" (block), "r" (temp), "r" (coeffs)
1152 void ff_simple_idct_mmx(int16_t *block)
1157 //FIXME merge add/put into the idct
/*
 * Passes block, dest and line_size on to ff_put_pixels_clamped_mmx().
 * NOTE(review): presumably block is inverse-transformed in place before that
 * call, as the function name suggests -- confirm.
 */
1159 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
1162 ff_put_pixels_clamped_mmx(block, dest, line_size);
/*
 * Passes block, dest and line_size on to ff_add_pixels_clamped_mmx().
 * NOTE(review): presumably block is inverse-transformed in place before that
 * call, as the function name suggests -- confirm.
 */
1164 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
1167 ff_add_pixels_clamped_mmx(block, dest, line_size);
1170 #endif /* HAVE_INLINE_ASM */