4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavcodec/dsputil.h"
23 #include "libavcodec/simple_idct.h"
/*
 * Fixed-point cosine constants: Ci = cos(i*M_PI/16) * sqrt(2) * (1<<14),
 * converted to an integer with the bias given in each trailing note.
 * NOTE(review): C4 is defined twice below (16384 and 16383); presumably the
 * two are alternatives under a preprocessor conditional that is not visible
 * here -- confirm against the full file.
 */
35 #define C0 23170 //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
36 #define C1 22725 //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37 #define C2 21407 //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
38 #define C3 19266 //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
40 #define C4 16384 //cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42 #define C4 16383 //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
44 #define C5 12873 //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45 #define C6 8867 //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46 #define C7 4520 //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/* Right shift applied to the column-pass sums (see idctCol); 1<<(COL_SHIFT-1) is the matching rounding bias. */
49 #define COL_SHIFT 20 // 6
/*
 * 64-bit constants referenced by name (through MANGLE) from the inline-asm
 * macros further down in this file.
 * NOTE(review): the first macro argument (8) is presumably the alignment in
 * bytes -- DECLARE_ASM_CONST is defined elsewhere, confirm.
 */
/* wm1010: keeps 16-bit lanes 1 and 3 and clears lanes 0 and 2; the DC_COND_* macros AND it with the first source quadword before OR-ing in the other three to test for zero. */
51 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
/* d40000: 0x40000 in the low dword; added in the tail of DC_COND_IDCT between the "pslld $16" and the "psrad $13". */
52 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
/*
 * Table of 16-bit constants, declared through DECLARE_ALIGNED with 8 as its
 * first argument. Only its first two groups of four entries are visible
 * here; both begin with the row-pass rounding term 1<<(ROW_SHIFT-1).
 * NOTE(review): presumably this is the table the asm addresses through %2
 * (rounders at 0(%2) and 8(%2), cosine pairs from 16(%2) onwards) -- the
 * operand binding is not visible in this view, confirm.
 */
54 DECLARE_ALIGNED(8, static const int16_t, coeffs[])= {
55 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
56 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
57 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
/* Second group: same rounder, plus an extra 1 in entry 1 (explained by the note that follows). */
58 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
59 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
/*
 * Reads wm1010 and d40000 from C code.
 * NOTE(review): going by the name, this presumably exists only so the
 * compiler does not report the asm-only constants as unused -- confirm.
 */
83 static void unused_var_killer(void)
/* Sum of the two uint64_t constants, narrowed to int on assignment. */
85 int a= wm1010 + d40000;
/*
 * Scalar C column pass: works on the eight values col[8*0] .. col[8*7]
 * (stride of 8 elements), computes an even part a0..a3 and an odd part
 * b0..b3 from them, and stores the eight butterfly results back into
 * col[8*k], each shifted right by COL_SHIFT.
 *
 * col   : in/out; col[8*0] .. col[8*7] are read and overwritten.
 * input : read at index 8*r + c for r in {0,2,4,6} and c in {0,1}.
 *
 * NOTE(review): lines appear to be missing from this view (no braces, no
 * return after the all-zero shortcut), so how the shortcut relates to the
 * main path cannot be confirmed from here.
 */
89 static void inline idctCol (int16_t * col, int16_t *input)
99 int a0, a1, a2, a3, b0, b1, b2, b3;
/* Local copies of the cosine constants; C0 is not referenced by the arithmetic visible below. */
100 const int C0 = 23170; //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101 const int C1 = 22725; //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102 const int C2 = 21407; //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103 const int C3 = 19266; //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104 const int C4 = 16383; //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
105 const int C5 = 12873; //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
106 const int C6 = 8867; //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
107 const int C7 = 4520; //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/* Shortcut: when col[8*1] .. col[8*7] are all zero, every output becomes col[8*0]<<3. */
109 if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
110 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
111 col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
/* Gather the eight inputs: even col slots take input[8*r + 0] / input[8*r + 1] for r = 0 and 4, odd slots for r = 2 and 6. */
115 col[8*0] = input[8*0 + 0];
116 col[8*1] = input[8*2 + 0];
117 col[8*2] = input[8*0 + 1];
118 col[8*3] = input[8*2 + 1];
119 col[8*4] = input[8*4 + 0];
120 col[8*5] = input[8*6 + 0];
121 col[8*6] = input[8*4 + 1];
122 col[8*7] = input[8*6 + 1];
/* Even part (inputs 0, 2, 4, 6); the added 1<<(COL_SHIFT-1) is the rounding bias for the final >> COL_SHIFT. */
124 a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
125 a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
126 a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
127 a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
/* Odd part (inputs 1, 3, 5, 7); no rounding bias here, it is already carried by a0..a3. */
129 b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
130 b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
131 b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
132 b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
/* Butterfly: outputs k and 7-k are (a_k + b_k) and (a_k - b_k), scaled down by COL_SHIFT. */
134 col[8*0] = (a0 + b0) >> COL_SHIFT;
135 col[8*1] = (a1 + b1) >> COL_SHIFT;
136 col[8*2] = (a2 + b2) >> COL_SHIFT;
137 col[8*3] = (a3 + b3) >> COL_SHIFT;
138 col[8*4] = (a3 - b3) >> COL_SHIFT;
139 col[8*5] = (a2 - b2) >> COL_SHIFT;
140 col[8*6] = (a1 - b1) >> COL_SHIFT;
141 col[8*7] = (a0 - b0) >> COL_SHIFT;
/*
 * Scalar C row pass: from the eight values row[0] .. row[7] it computes an
 * even part a0..a3 and an odd part b0..b3 and stores the eight butterfly
 * results back into row[0] .. row[7], each shifted right by ROW_SHIFT.
 *
 * NOTE(review): in the lines visible here `row` is never declared and the
 * parameters `output` and `input` are never referenced; the declaration and
 * the copies in/out are presumably among the lines missing from this view
 * (as are the braces and any return after the shortcut) -- confirm against
 * the full file.
 */
144 static void inline idctRow (int16_t * output, int16_t * input)
148 int a0, a1, a2, a3, b0, b1, b2, b3;
/* Local copies of the cosine constants; C0 is not referenced by the arithmetic visible below. */
149 const int C0 = 23170; //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150 const int C1 = 22725; //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151 const int C2 = 21407; //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152 const int C3 = 19266; //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153 const int C4 = 16383; //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
154 const int C5 = 12873; //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
155 const int C6 = 8867; //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
156 const int C7 = 4520; //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/* Shortcut: when row[1] .. row[7] are all zero, every element becomes row[0]<<3. */
167 if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
168 row[0] = row[1] = row[2] = row[3] = row[4] =
169 row[5] = row[6] = row[7] = row[0]<<3;
/* Even part (inputs 0, 2, 4, 6); the added 1<<(ROW_SHIFT-1) is the rounding bias for the final >> ROW_SHIFT. */
181 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
182 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
183 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
184 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
/* Odd part (inputs 1, 3, 5, 7); no rounding bias here, it is already carried by a0..a3. */
186 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
187 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
188 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
189 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
/* Butterfly: outputs k and 7-k are (a_k + b_k) and (a_k - b_k), scaled down by ROW_SHIFT. */
191 row[0] = (a0 + b0) >> ROW_SHIFT;
192 row[1] = (a1 + b1) >> ROW_SHIFT;
193 row[2] = (a2 + b2) >> ROW_SHIFT;
194 row[3] = (a3 + b3) >> ROW_SHIFT;
195 row[4] = (a3 - b3) >> ROW_SHIFT;
196 row[5] = (a2 - b2) >> ROW_SHIFT;
197 row[6] = (a1 - b1) >> ROW_SHIFT;
198 row[7] = (a0 - b0) >> ROW_SHIFT;
211 static inline void idct(int16_t *block)
213 DECLARE_ALIGNED(8, int64_t, align_tmp[16]);
214 int16_t * const temp= (int16_t*)align_tmp;
217 #if 0 //Alternative, simpler variant
219 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
220 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
221 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
222 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
223 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
224 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
225 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
226 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
227 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
228 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
229 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
230 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
231 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
232 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
233 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
234 #rounder ", %%mm4 \n\t"\
235 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
236 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
237 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
238 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
239 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
240 #rounder ", %%mm0 \n\t"\
241 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
242 "paddd %%mm0, %%mm0 \n\t" \
243 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
244 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
245 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
246 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
247 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
248 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
249 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
250 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
251 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
252 "psrad $" #shift ", %%mm7 \n\t"\
253 "psrad $" #shift ", %%mm4 \n\t"\
254 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
255 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
256 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
257 "psrad $" #shift ", %%mm1 \n\t"\
258 "psrad $" #shift ", %%mm2 \n\t"\
259 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
260 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
261 "movq %%mm7, " #dst " \n\t"\
262 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
263 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
264 "movq %%mm2, 24+" #dst " \n\t"\
265 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
266 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
267 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
268 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
269 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
270 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
271 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
272 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
273 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
274 "psrad $" #shift ", %%mm2 \n\t"\
275 "psrad $" #shift ", %%mm0 \n\t"\
276 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
277 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
278 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
279 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
280 "psrad $" #shift ", %%mm6 \n\t"\
281 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
282 "movq %%mm2, 8+" #dst " \n\t"\
283 "psrad $" #shift ", %%mm4 \n\t"\
284 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
285 "movq %%mm4, 16+" #dst " \n\t"\
287 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
288 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
289 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
290 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
291 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
292 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
293 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
294 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
295 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
296 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
297 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
298 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
299 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
300 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
301 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
302 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
303 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
304 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
305 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
306 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
307 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
308 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
309 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
310 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
311 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
312 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
313 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
314 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
315 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
316 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
317 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
318 "psrad $" #shift ", %%mm7 \n\t"\
319 "psrad $" #shift ", %%mm4 \n\t"\
320 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
321 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
322 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
323 "psrad $" #shift ", %%mm0 \n\t"\
324 "psrad $" #shift ", %%mm2 \n\t"\
325 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
326 "movd %%mm7, " #dst " \n\t"\
327 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
328 "movd %%mm0, 16+" #dst " \n\t"\
329 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
330 "movd %%mm2, 96+" #dst " \n\t"\
331 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
332 "movd %%mm4, 112+" #dst " \n\t"\
333 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
334 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
335 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
336 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
337 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
338 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
339 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
340 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
341 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
342 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
343 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
344 "psrad $" #shift ", %%mm2 \n\t"\
345 "psrad $" #shift ", %%mm5 \n\t"\
346 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
347 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
348 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
349 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
350 "psrad $" #shift ", %%mm6 \n\t"\
351 "psrad $" #shift ", %%mm4 \n\t"\
352 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
353 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
354 "movd %%mm2, 32+" #dst " \n\t"\
355 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
356 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
357 "movd %%mm6, 48+" #dst " \n\t"\
358 "movd %%mm4, 64+" #dst " \n\t"\
359 "movd %%mm5, 80+" #dst " \n\t"\
362 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
363 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
364 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
365 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
366 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
367 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
368 "pand %%mm0, %%mm4 \n\t"\
369 "por %%mm1, %%mm4 \n\t"\
370 "por %%mm2, %%mm4 \n\t"\
371 "por %%mm3, %%mm4 \n\t"\
372 "packssdw %%mm4,%%mm4 \n\t"\
373 "movd %%mm4, %%eax \n\t"\
374 "orl %%eax, %%eax \n\t"\
376 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
377 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
378 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
379 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
380 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
381 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
382 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
383 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
384 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
385 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
386 #rounder ", %%mm4 \n\t"\
387 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
388 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
389 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
390 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
391 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
392 #rounder ", %%mm0 \n\t"\
393 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
394 "paddd %%mm0, %%mm0 \n\t" \
395 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
396 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
397 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
398 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
399 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
400 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
401 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
402 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
403 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
404 "psrad $" #shift ", %%mm7 \n\t"\
405 "psrad $" #shift ", %%mm4 \n\t"\
406 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
407 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
408 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
409 "psrad $" #shift ", %%mm1 \n\t"\
410 "psrad $" #shift ", %%mm2 \n\t"\
411 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
412 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
413 "movq %%mm7, " #dst " \n\t"\
414 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
415 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
416 "movq %%mm2, 24+" #dst " \n\t"\
417 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
418 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
419 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
420 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
421 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
422 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
423 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
424 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
425 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
426 "psrad $" #shift ", %%mm2 \n\t"\
427 "psrad $" #shift ", %%mm0 \n\t"\
428 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
429 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
430 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
431 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
432 "psrad $" #shift ", %%mm6 \n\t"\
433 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
434 "movq %%mm2, 8+" #dst " \n\t"\
435 "psrad $" #shift ", %%mm4 \n\t"\
436 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
437 "movq %%mm4, 16+" #dst " \n\t"\
440 "pslld $16, %%mm0 \n\t"\
441 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
442 "psrad $13, %%mm0 \n\t"\
443 "packssdw %%mm0, %%mm0 \n\t"\
444 "movq %%mm0, " #dst " \n\t"\
445 "movq %%mm0, 8+" #dst " \n\t"\
446 "movq %%mm0, 16+" #dst " \n\t"\
447 "movq %%mm0, 24+" #dst " \n\t"\
451 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
452 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
453 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
454 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
455 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
457 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
458 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
459 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
462 //IDCT( src0, src4, src1, src5, dst, shift)
463 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
464 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
465 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
466 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
470 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
471 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
472 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
473 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
474 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
475 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
476 "pand %%mm0, %%mm4 \n\t"\
477 "por %%mm1, %%mm4 \n\t"\
478 "por %%mm2, %%mm4 \n\t"\
479 "por %%mm3, %%mm4 \n\t"\
480 "packssdw %%mm4,%%mm4 \n\t"\
481 "movd %%mm4, %%eax \n\t"\
482 "orl %%eax, %%eax \n\t"\
484 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
485 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
486 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
487 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
488 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
489 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
490 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
491 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
492 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
493 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
494 #rounder ", %%mm4 \n\t"\
495 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
496 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
497 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
498 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
499 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
500 #rounder ", %%mm0 \n\t"\
501 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
502 "paddd %%mm0, %%mm0 \n\t" \
503 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
504 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
505 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
506 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
507 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
508 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
509 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
510 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
511 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
512 "psrad $" #shift ", %%mm7 \n\t"\
513 "psrad $" #shift ", %%mm4 \n\t"\
514 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
515 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
516 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
517 "psrad $" #shift ", %%mm1 \n\t"\
518 "psrad $" #shift ", %%mm2 \n\t"\
519 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
520 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
521 "movq %%mm7, " #dst " \n\t"\
522 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
523 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
524 "movq %%mm2, 24+" #dst " \n\t"\
525 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
526 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
527 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
528 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
529 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
530 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
531 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
532 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
533 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
534 "psrad $" #shift ", %%mm2 \n\t"\
535 "psrad $" #shift ", %%mm0 \n\t"\
536 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
537 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
538 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
539 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
540 "psrad $" #shift ", %%mm6 \n\t"\
541 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
542 "movq %%mm2, 8+" #dst " \n\t"\
543 "psrad $" #shift ", %%mm4 \n\t"\
544 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
545 "movq %%mm4, 16+" #dst " \n\t"\
548 "pslld $16, %%mm0 \n\t"\
549 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
550 "psrad $13, %%mm0 \n\t"\
551 "packssdw %%mm0, %%mm0 \n\t"\
552 "movq %%mm0, " #dst " \n\t"\
553 "movq %%mm0, 8+" #dst " \n\t"\
554 "movq %%mm0, 16+" #dst " \n\t"\
555 "movq %%mm0, 24+" #dst " \n\t"\
558 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
559 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
560 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
561 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
562 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
563 "movq %%mm0, %%mm4 \n\t"\
564 "por %%mm1, %%mm4 \n\t"\
565 "por %%mm2, %%mm4 \n\t"\
566 "por %%mm3, %%mm4 \n\t"\
567 "packssdw %%mm4,%%mm4 \n\t"\
568 "movd %%mm4, %%eax \n\t"\
569 "orl %%eax, %%eax \n\t"\
571 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
572 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
573 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
574 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
575 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
576 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
577 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
578 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
579 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
580 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
581 #rounder ", %%mm4 \n\t"\
582 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
583 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
584 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
585 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
586 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
587 #rounder ", %%mm0 \n\t"\
588 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
589 "paddd %%mm0, %%mm0 \n\t" \
590 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
591 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
592 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
593 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
594 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
595 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
596 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
597 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
598 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
599 "psrad $" #shift ", %%mm7 \n\t"\
600 "psrad $" #shift ", %%mm4 \n\t"\
601 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
602 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
603 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
604 "psrad $" #shift ", %%mm1 \n\t"\
605 "psrad $" #shift ", %%mm2 \n\t"\
606 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
607 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
608 "movq %%mm7, " #dst " \n\t"\
609 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
610 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
611 "movq %%mm2, 24+" #dst " \n\t"\
612 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
613 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
614 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
615 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
616 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
617 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
618 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
619 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
620 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
621 "psrad $" #shift ", %%mm2 \n\t"\
622 "psrad $" #shift ", %%mm0 \n\t"\
623 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
624 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
625 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
626 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
627 "psrad $" #shift ", %%mm6 \n\t"\
628 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
629 "movq %%mm2, 8+" #dst " \n\t"\
630 "psrad $" #shift ", %%mm4 \n\t"\
631 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
632 "movq %%mm4, 16+" #dst " \n\t"\
634 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
635 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
636 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
637 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
638 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
639 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
640 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
641 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
642 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
643 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
644 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
645 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
646 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
647 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
648 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
649 #rounder ", %%mm4 \n\t"\
650 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
651 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
652 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
653 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
654 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
655 #rounder ", %%mm0 \n\t"\
656 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
657 "paddd %%mm0, %%mm0 \n\t" \
658 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
659 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
660 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
661 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
662 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
663 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
664 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
665 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
666 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
667 "psrad $" #shift ", %%mm7 \n\t"\
668 "psrad $" #shift ", %%mm4 \n\t"\
669 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
670 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
671 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
672 "psrad $" #shift ", %%mm1 \n\t"\
673 "psrad $" #shift ", %%mm2 \n\t"\
674 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
675 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
676 "movq %%mm7, " #dst " \n\t"\
677 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
678 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
679 "movq %%mm2, 24+" #dst " \n\t"\
680 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
681 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
682 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
683 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
684 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
685 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
686 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
687 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
688 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
689 "psrad $" #shift ", %%mm2 \n\t"\
690 "psrad $" #shift ", %%mm0 \n\t"\
691 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
692 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
693 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
694 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
695 "psrad $" #shift ", %%mm6 \n\t"\
696 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
697 "movq %%mm2, 8+" #dst " \n\t"\
698 "psrad $" #shift ", %%mm4 \n\t"\
699 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
700 "movq %%mm4, 16+" #dst " \n\t"\
702 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
703 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
704 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
705 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
706 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
709 #define IDCT(src0, src4, src1, src5, dst, shift) \
710 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
711 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
712 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
713 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
714 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
715 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
716 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
717 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
718 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
719 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
720 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
721 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
722 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
723 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
724 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
725 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
726 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
727 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
728 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
729 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
730 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
731 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
732 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
733 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
734 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
735 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
736 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
737 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
738 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
739 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
740 "psrad $" #shift ", %%mm7 \n\t"\
741 "psrad $" #shift ", %%mm4 \n\t"\
742 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
743 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
744 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
745 "psrad $" #shift ", %%mm0 \n\t"\
746 "psrad $" #shift ", %%mm2 \n\t"\
747 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
748 "movd %%mm7, " #dst " \n\t"\
749 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
750 "movd %%mm0, 16+" #dst " \n\t"\
751 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
752 "movd %%mm2, 96+" #dst " \n\t"\
753 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
754 "movd %%mm4, 112+" #dst " \n\t"\
755 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
756 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
757 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
758 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
759 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
760 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
761 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
762 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
763 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
764 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
765 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
766 "psrad $" #shift ", %%mm2 \n\t"\
767 "psrad $" #shift ", %%mm5 \n\t"\
768 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
769 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
770 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
771 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
772 "psrad $" #shift ", %%mm6 \n\t"\
773 "psrad $" #shift ", %%mm4 \n\t"\
774 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
775 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
776 "movd %%mm2, 32+" #dst " \n\t"\
777 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
778 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
779 "movd %%mm6, 48+" #dst " \n\t"\
780 "movd %%mm4, 64+" #dst " \n\t"\
781 "movd %%mm5, 80+" #dst " \n\t"
784 //IDCT( src0, src4, src1, src5, dst, shift)
785 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
786 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
787 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
788 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
793 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
794 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
/*
 * IDCT variant that loads only src0 (R4/R0), src4 (R6/R2) and src5 (R7/R5).
 * src1 (R3/R1) is never referenced, so the odd sums contain no R1/R3 terms:
 *   B0 =  C7*R7 + C5*R5      B1 = -C5*R7 - C1*R5
 *   B2 =  C3*R7 + C7*R5      B3 = -C1*R7 + C3*R5
 * A0..A3 are built from the C4 products of src0 and the C2/C6 products of
 * src4. Every sum/difference is shifted right arithmetically by `shift`,
 * packed to 16 bit with signed saturation and stored with movd (two int16)
 * at 16-byte steps from dst:
 *   dst: A0+B0   16: A1+B1   32: A2+B2   48: A3+B3
 *   64:  A3-B3   80: A2-B2   96: A1-B1   112: A0-B0
 * Constants are read from the table bound to asm operand %2.
 * All of mm0-mm7 are overwritten.
 * NOTE(review): presumably selected when the src1 inputs are known to be
 * zero; the dispatch is outside this macro - confirm.
 */
797 #define IDCT(src0, src4, src1, src5, dst, shift) \
798 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
799 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
800 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
801 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
802 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
803 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
804 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
805 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
806 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
807 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
808 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
809 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
810 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
811 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
812 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
813 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
814 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
815 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
816 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 (= B0 b0 here) */\
817 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
818 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 (= B1 b1 here) */\
819 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
820 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
821 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
822 "psrad $" #shift ", %%mm1 \n\t"\
823 "psrad $" #shift ", %%mm4 \n\t"\
824 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
825 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
826 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
827 "psrad $" #shift ", %%mm0 \n\t"\
828 "psrad $" #shift ", %%mm2 \n\t"\
829 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
830 "movd %%mm1, " #dst " \n\t"\
831 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
832 "movd %%mm0, 16+" #dst " \n\t"\
833 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
834 "movd %%mm2, 96+" #dst " \n\t"\
835 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
836 "movd %%mm4, 112+" #dst " \n\t"\
837 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
838 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 (= B2 b2 here) */\
839 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
840 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 (= B3 b3 here) */\
841 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
842 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
843 "psrad $" #shift ", %%mm2 \n\t"\
844 "psrad $" #shift ", %%mm5 \n\t"\
845 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
846 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
847 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
848 "psrad $" #shift ", %%mm6 \n\t"\
849 "psrad $" #shift ", %%mm1 \n\t"\
850 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
851 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
852 "movd %%mm2, 32+" #dst " \n\t"\
853 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
854 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
855 "movd %%mm6, 48+" #dst " \n\t"\
856 "movd %%mm1, 64+" #dst " \n\t"\
857 "movd %%mm5, 80+" #dst " \n\t"
859 //IDCT( src0, src4, src1, src5, dst, shift)
860 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
861 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
862 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
863 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
868 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
/*
 * IDCT variant that loads only src0 (R4/R0) and src5 (R7/R5); src4 and src1
 * are never referenced. With no R2/R6 products the even terms collapse to
 *   A0 = A3 =  C4*R4 + C4*R0      A1 = A2 = -C4*R4 + C4*R0
 * and the odd terms contain only R5/R7 products:
 *   B0 =  C7*R7 + C5*R5      B1 = -C5*R7 - C1*R5
 *   B2 =  C3*R7 + C7*R5      B3 = -C1*R7 + C3*R5
 * Every sum/difference is shifted right arithmetically by `shift`, packed
 * to 16 bit with signed saturation and stored with movd (two int16) at
 * 16-byte steps from dst:
 *   dst: A0+B0   16: A1+B1   32: A2+B2   48: A3+B3
 *   64:  A3-B3   80: A2-B2   96: A1-B1   112: A0-B0
 * Constants are read from the table bound to asm operand %2.
 * All of mm0-mm7 are overwritten.
 * NOTE(review): presumably selected when the src4 and src1 inputs are known
 * to be zero; the dispatch is outside this macro - confirm.
 */
871 #define IDCT(src0, src4, src1, src5, dst, shift) \
872 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
873 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
874 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
875 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 (= A0 a0 here) */\
876 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
877 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (= A1 a1 here) */\
878 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (kept as A3 a3) */\
879 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (kept as A2 a2) */\
880 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
881 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
882 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
883 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
884 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
885 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
886 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
887 "psrad $" #shift ", %%mm1 \n\t"\
888 "psrad $" #shift ", %%mm4 \n\t"\
889 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
890 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
891 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
892 "psrad $" #shift ", %%mm0 \n\t"\
893 "psrad $" #shift ", %%mm2 \n\t"\
894 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
895 "movd %%mm1, " #dst " \n\t"\
896 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
897 "movd %%mm0, 16+" #dst " \n\t"\
898 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
899 "movd %%mm2, 96+" #dst " \n\t"\
900 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
901 "movd %%mm4, 112+" #dst " \n\t"\
902 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
903 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
904 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
905 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
906 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
907 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
908 "psrad $" #shift ", %%mm2 \n\t"\
909 "psrad $" #shift ", %%mm5 \n\t"\
910 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
911 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
912 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
913 "psrad $" #shift ", %%mm6 \n\t"\
914 "psrad $" #shift ", %%mm1 \n\t"\
915 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
916 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
917 "movd %%mm2, 32+" #dst " \n\t"\
918 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
919 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
920 "movd %%mm6, 48+" #dst " \n\t"\
921 "movd %%mm1, 64+" #dst " \n\t"\
922 "movd %%mm5, 80+" #dst " \n\t"
925 //IDCT( src0, src4, src1, src5, dst, shift)
926 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
927 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
928 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
929 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
934 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
/*
 * IDCT variant that loads src0 (R4/R0), src1 (R3/R1) and src5 (R7/R5); src4
 * (R6/R2) is never referenced. With no R2/R6 products the even terms are
 *   A0 = A3 =  C4*R4 + C4*R0      A1 = A2 = -C4*R4 + C4*R0
 * while the odd terms use all four odd inputs:
 *   B0 =  C3*R3 + C1*R1 + C7*R7 + C5*R5
 *   B1 = -C7*R3 + C3*R1 - C5*R7 - C1*R5
 *   B2 = -C1*R3 + C5*R1 + C3*R7 + C7*R5
 *   B3 = -C5*R3 + C7*R1 - C1*R7 + C3*R5
 * src1 is loaded twice (into mm2 first, later again into mm0) because mm2
 * is reused as a temporary in between.
 * Every sum/difference is shifted right arithmetically by `shift`, packed
 * to 16 bit with signed saturation and stored with movd (two int16) at
 * 16-byte steps from dst:
 *   dst: A0+B0   16: A1+B1   32: A2+B2   48: A3+B3
 *   64:  A3-B3   80: A2-B2   96: A1-B1   112: A0-B0
 * Constants are read from the table bound to asm operand %2.
 * All of mm0-mm7 are overwritten.
 * NOTE(review): presumably selected when the src4 inputs are known to be
 * zero; the dispatch is outside this macro - confirm.
 */
937 #define IDCT(src0, src4, src1, src5, dst, shift) \
938 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
939 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
940 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
941 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
942 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 (= A0 a0 here) */\
943 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
944 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (= A1 a1 here) */\
945 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (kept as A3 a3) */\
946 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
947 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
948 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (kept as A2 a2) */\
949 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
950 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
951 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
952 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
953 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
954 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
955 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
956 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
957 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
958 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
959 "psrad $" #shift ", %%mm7 \n\t"\
960 "psrad $" #shift ", %%mm4 \n\t"\
961 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
962 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
963 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
964 "psrad $" #shift ", %%mm0 \n\t"\
965 "psrad $" #shift ", %%mm2 \n\t"\
966 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
967 "movd %%mm7, " #dst " \n\t"\
968 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
969 "movd %%mm0, 16+" #dst " \n\t"\
970 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
971 "movd %%mm2, 96+" #dst " \n\t"\
972 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
973 "movd %%mm4, 112+" #dst " \n\t"\
974 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 (reloaded) */\
975 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
976 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
977 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
978 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
979 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
980 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
981 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
982 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
983 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
984 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
985 "psrad $" #shift ", %%mm2 \n\t"\
986 "psrad $" #shift ", %%mm5 \n\t"\
987 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
988 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
989 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
990 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
991 "psrad $" #shift ", %%mm6 \n\t"\
992 "psrad $" #shift ", %%mm4 \n\t"\
993 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
994 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
995 "movd %%mm2, 32+" #dst " \n\t"\
996 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
997 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
998 "movd %%mm6, 48+" #dst " \n\t"\
999 "movd %%mm4, 64+" #dst " \n\t"\
1000 "movd %%mm5, 80+" #dst " \n\t"
1002 //IDCT( src0, src4, src1, src5, dst, shift)
1003 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1004 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1005 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1006 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT variant that loads only src0 (R4/R0) and src1 (R3/R1); src4 and src5
 * are never referenced. With no R2/R6 products the even terms are
 *   A0 = A3 =  C4*R4 + C4*R0      A1 = A2 = -C4*R4 + C4*R0
 * and the odd terms contain only R1/R3 products:
 *   B0 =  C3*R3 + C1*R1      B1 = -C7*R3 + C3*R1
 *   B2 = -C1*R3 + C5*R1      B3 = -C5*R3 + C7*R1
 * Every sum/difference is shifted right arithmetically by `shift`, packed
 * to 16 bit with signed saturation and stored with movd (two int16) at
 * 16-byte steps from dst:
 *   dst: A0+B0   16: A1+B1   32: A2+B2   48: A3+B3
 *   64:  A3-B3   80: A2-B2   96: A1-B1   112: A0-B0
 * Constants are read from the table bound to asm operand %2.
 * All of mm0-mm7 are overwritten.
 * NOTE(review): presumably selected when the src4 and src5 inputs are known
 * to be zero; the dispatch is outside this macro - confirm.
 */
1012 #define IDCT(src0, src4, src1, src5, dst, shift) \
1013 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1014 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1015 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1016 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 (= A0 a0 here) */\
1017 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1018 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (= A1 a1 here) */\
1019 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (kept as A3 a3) */\
1020 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1021 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 (= B0 b0 here) */\
1022 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (kept as A2 a2) */\
1023 "movq 64(%2), %%mm3 \n\t" /* -C7 C3 -C7 C3 (per the product on the next line) */\
1024 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 (= B1 b1 here) */\
1025 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1026 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1027 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1028 "psrad $" #shift ", %%mm7 \n\t"\
1029 "psrad $" #shift ", %%mm4 \n\t"\
1030 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
1031 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1032 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1033 "psrad $" #shift ", %%mm0 \n\t"\
1034 "psrad $" #shift ", %%mm1 \n\t"\
1035 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1036 "movd %%mm7, " #dst " \n\t"\
1037 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1038 "movd %%mm0, 16+" #dst " \n\t"\
1039 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1040 "movd %%mm1, 96+" #dst " \n\t"\
1041 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1042 "movd %%mm4, 112+" #dst " \n\t"\
1043 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1044 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 (= B2 b2 here) */\
1045 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 (= B3 b3 here) */\
1046 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
1047 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1048 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1049 "psrad $" #shift ", %%mm1 \n\t"\
1050 "psrad $" #shift ", %%mm5 \n\t"\
1051 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1052 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1053 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1054 "psrad $" #shift ", %%mm6 \n\t"\
1055 "psrad $" #shift ", %%mm4 \n\t"\
1056 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1057 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1058 "movd %%mm1, 32+" #dst " \n\t"\
1059 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1060 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1061 "movd %%mm6, 48+" #dst " \n\t"\
1062 "movd %%mm4, 64+" #dst " \n\t"\
1063 "movd %%mm5, 80+" #dst " \n\t"
1066 //IDCT( src0, src4, src1, src5, dst, shift)
1067 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1068 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1069 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1070 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT variant that uses only the even inputs: it loads src0 (R4/R0) and
 * src4 (R6/R2), plus the quadwords 8 bytes after each of them, and never
 * references src1 or src5. No odd (B) terms are computed, so each even
 * term is the final result and is written to two destination rows:
 *   A0 -> dst and 112+dst      A1 -> 16+dst and 96+dst
 *   A2 -> 32+dst and 80+dst    A3 -> 48+dst and 64+dst
 * One expansion handles two source quadwords at once: the first quadword's
 * results end up in the low half and the second's in the high half of each
 * packed register, and stores are full 8-byte movq (four int16) rather
 * than movd. Values are shifted right arithmetically by `shift` and packed
 * to 16 bit with signed saturation before being stored.
 * Constants are read from the table bound to asm operand %2.
 * All of mm0-mm7 are overwritten.
 * NOTE(review): presumably selected when the src1 and src5 inputs are known
 * to be zero; the dispatch is outside this macro - confirm.
 */
1076 #define IDCT(src0, src4, src1, src5, dst, shift) \
1077 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1078 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1079 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1080 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1081 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1082 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1083 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1084 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1085 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1086 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1087 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1088 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1089 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1090 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1091 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1092 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1093 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 (second quadword) */\
1094 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 (second quadword) */\
1095 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1096 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1097 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1098 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1099 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1100 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1101 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1102 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
1103 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
1104 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
1105 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
1106 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
1107 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
1108 "psrad $" #shift ", %%mm4 \n\t"\
1109 "psrad $" #shift ", %%mm7 \n\t"\
1110 "psrad $" #shift ", %%mm3 \n\t"\
1111 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
1112 "movq %%mm4, " #dst " \n\t"\
1113 "psrad $" #shift ", %%mm0 \n\t"\
1114 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
1115 "movq %%mm0, 16+" #dst " \n\t"\
1116 "movq %%mm0, 96+" #dst " \n\t"\
1117 "movq %%mm4, 112+" #dst " \n\t"\
1118 "psrad $" #shift ", %%mm5 \n\t"\
1119 "psrad $" #shift ", %%mm6 \n\t"\
1120 "psrad $" #shift ", %%mm2 \n\t"\
1121 "packssdw %%mm2, %%mm5 \n\t" /* A2 a2 (no B terms in this variant) */\
1122 "movq %%mm5, 32+" #dst " \n\t"\
1123 "psrad $" #shift ", %%mm1 \n\t"\
1124 "packssdw %%mm1, %%mm6 \n\t" /* A3 a3 (no B terms in this variant) */\
1125 "movq %%mm6, 48+" #dst " \n\t"\
1126 "movq %%mm6, 64+" #dst " \n\t"\
1127 "movq %%mm5, 80+" #dst " \n\t"
1130 //IDCT( src0, src4, src1, src5, dst, shift)
1131 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1132 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1133 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1134 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT variant that loads src0 (R4/R0), src4 (R6/R2) and src1 (R3/R1); src5
 * (R7/R5) is never referenced, so the odd sums contain no R5/R7 terms:
 *   B0 =  C3*R3 + C1*R1      B1 = -C7*R3 + C3*R1
 *   B2 = -C1*R3 + C5*R1      B3 = -C5*R3 + C7*R1
 * A0..A3 are built from the C4 products of src0 and the C2/C6 products of
 * src4. Every sum/difference is shifted right arithmetically by `shift`,
 * packed to 16 bit with signed saturation and stored with movd (two int16)
 * at 16-byte steps from dst:
 *   dst: A0+B0   16: A1+B1   32: A2+B2   48: A3+B3
 *   64:  A3-B3   80: A2-B2   96: A1-B1   112: A0-B0
 * Constants are read from the table bound to asm operand %2.
 * All of mm0-mm7 are overwritten.
 * NOTE(review): presumably selected when the src5 inputs are known to be
 * zero; the dispatch is outside this macro - confirm.
 */
1141 #define IDCT(src0, src4, src1, src5, dst, shift) \
1142 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1143 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1144 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1145 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1146 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1147 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1148 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1149 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1150 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1151 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1152 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1153 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1154 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1155 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 (= B0 b0 here) */\
1156 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1157 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1158 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1159 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1160 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1161 "movq 64(%2), %%mm1 \n\t" /* -C7 C3 -C7 C3 (per the product on the next line) */\
1162 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 (= B1 b1 here) */\
1163 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1164 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1165 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1166 "psrad $" #shift ", %%mm7 \n\t"\
1167 "psrad $" #shift ", %%mm4 \n\t"\
1168 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1169 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1170 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1171 "psrad $" #shift ", %%mm0 \n\t"\
1172 "psrad $" #shift ", %%mm3 \n\t"\
1173 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1174 "movd %%mm7, " #dst " \n\t"\
1175 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1176 "movd %%mm0, 16+" #dst " \n\t"\
1177 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1178 "movd %%mm3, 96+" #dst " \n\t"\
1179 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1180 "movd %%mm4, 112+" #dst " \n\t"\
1181 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1182 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 (= B2 b2 here) */\
1183 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 (= B3 b3 here) */\
1184 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1185 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1186 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1187 "psrad $" #shift ", %%mm3 \n\t"\
1188 "psrad $" #shift ", %%mm5 \n\t"\
1189 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1190 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1191 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1192 "psrad $" #shift ", %%mm6 \n\t"\
1193 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1194 "movd %%mm3, 32+" #dst " \n\t"\
1195 "psrad $" #shift ", %%mm4 \n\t"\
1196 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1197 "movd %%mm6, 48+" #dst " \n\t"\
1198 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1199 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1200 "movd %%mm4, 64+" #dst " \n\t"\
1201 "movd %%mm5, 80+" #dst " \n\t"
1204 //IDCT( src0, src4, src1, src5, dst, shift)
1205 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1206 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1207 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1208 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
/*
 * IDCT variant that uses src0 (R4/R0) only: it loads src0 and the quadword
 * 8 bytes after it and never references src4, src1 or src5. Only the two
 * C4 products are computed per quadword:
 *   P =  C4*R4 + C4*R0      Q = -C4*R4 + C4*R0
 * Both are shifted right arithmetically by `shift`; the results of the
 * first quadword (low half) and of the second quadword (high half) are
 * packed to 16 bit with signed saturation and stored with 8-byte movq:
 *   P -> dst, 48+dst, 64+dst, 112+dst
 *   Q -> 16+dst, 32+dst, 80+dst, 96+dst
 * Constants are read from the table bound to asm operand %2.
 * Overwrites mm0, mm1, mm2, mm4, mm5 and mm7.
 *
 * The original sequence also loaded 32(%2) (C6 C2 C6 C2) into mm7 after
 * its last use; nothing in this macro read that value and the following
 * expansion reloads mm7 before using it, so the dead load was dropped.
 * NOTE(review): presumably selected when the src4, src1 and src5 inputs are
 * known to be zero; the dispatch is outside this macro - confirm.
 */
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"psrad $" #shift ", %%mm4 \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 (second quadword) */\
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
"movq %%mm4, " #dst " \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
"movq %%mm0, 16+" #dst " \n\t"\
"movq %%mm0, 96+" #dst " \n\t"\
"movq %%mm4, 112+" #dst " \n\t"\
"movq %%mm0, 32+" #dst " \n\t"\
"movq %%mm4, 48+" #dst " \n\t"\
"movq %%mm4, 64+" #dst " \n\t"\
"movq %%mm0, 80+" #dst " \n\t"
1242 //IDCT( src0, src4, src1, src5, dst, shift)
1243 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1244 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1245 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1246 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1253 00 40 04 44 20 60 24 64
1254 10 30 14 34 50 70 54 74
1255 01 41 03 43 21 61 23 63
1256 11 31 13 33 51 71 53 73
1257 02 42 06 46 22 62 26 66
1258 12 32 16 36 52 72 56 76
1259 05 45 07 47 25 65 27 67
1260 15 35 17 37 55 75 57 77
1263 00 04 10 14 20 24 30 34
1264 40 44 50 54 60 64 70 74
1265 01 03 11 13 21 23 31 33
1266 41 43 51 53 61 63 71 73
1267 02 06 12 16 22 26 32 36
1268 42 46 52 56 62 66 72 76
1269 05 07 15 17 25 27 35 37
1270 45 47 55 57 65 67 75 77
1274 :: "r" (block), "r" (temp), "r" (coeffs)
/**
 * MMX simple IDCT entry point that takes only the coefficient block; unlike
 * ff_simple_idct_put_mmx() and ff_simple_idct_add_mmx() it has no
 * destination pixel buffer or line size parameter.
 * NOTE(review): declared with int16_t* while the put/add variants take
 * DCTELEM* - presumably the same underlying type; confirm.
 *
 * @param block pointer to the int16_t coefficients to process
 */
1279 void ff_simple_idct_mmx(int16_t *block)
1284 //FIXME merge add/put into the idct
/**
 * "put" variant of the MMX simple IDCT entry points: hands the coefficient
 * block to put_pixels_clamped_mmx() together with the destination pointer
 * and line size.
 * NOTE(review): presumably the block is inverse-transformed before that
 * call and the clamped result overwrites dest - confirm against the full
 * function body.
 *
 * @param dest      destination buffer passed on to put_pixels_clamped_mmx()
 * @param line_size line size passed on to put_pixels_clamped_mmx()
 * @param block     coefficient block passed on to put_pixels_clamped_mmx()
 */
1286 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1289 put_pixels_clamped_mmx(block, dest, line_size);
/**
 * "add" variant of the MMX simple IDCT entry points: hands the coefficient
 * block to add_pixels_clamped_mmx() together with the destination pointer
 * and line size.
 * NOTE(review): presumably the block is inverse-transformed before that
 * call and the result is added to the existing contents of dest with
 * clamping - confirm against the full function body.
 *
 * @param dest      destination buffer passed on to add_pixels_clamped_mmx()
 * @param line_size line size passed on to add_pixels_clamped_mmx()
 * @param block     coefficient block passed on to add_pixels_clamped_mmx()
 */
1291 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1294 add_pixels_clamped_mmx(block, dest, line_size);