4 ; Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
6 ; Conversion from gcc syntax to x264asm syntax with minimal modifications
7 ; by James Darnley <jdarnley@obe.tv>.
9 ; This file is part of FFmpeg.
11 ; FFmpeg is free software; you can redistribute it and/or
12 ; modify it under the terms of the GNU Lesser General Public
13 ; License as published by the Free Software Foundation; either
14 ; version 2.1 of the License, or (at your option) any later version.
16 ; FFmpeg is distributed in the hope that it will be useful,
17 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 ; Lesser General Public License for more details.
21 ; You should have received a copy of the GNU Lesser General Public
22 ; License along with FFmpeg; if not, write to the Free Software
23 ; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 %include "libavutil/x86/x86util.asm"
; Mask of four 16-bit words: 0xffff in word positions 1 and 3, zero in 0 and 2.
32 wm1010: dw 0, 0xffff, 0, 0xffff
; Fixed-point cosine constants: Ci = cos(i*M_PI/16) * sqrt(2) * (1 << 14),
; rounded by adding 0.5 before truncation. C4 subtracts 0.5 instead, so it is
; stored as 16383 rather than 16384.
44 %define C0 23170 ; cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45 %define C1 22725 ; cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46 %define C2 21407 ; cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47 %define C3 19266 ; cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
48 %define C4 16383 ; cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
49 %define C5 12873 ; cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
50 %define C6 8867 ; cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
51 %define C7 4520 ; cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
; NOTE(review): the IDCT1..IDCT8 invocations in this file pass a literal 20 as
; their last argument rather than this symbol; the meaning of the trailing 6 is
; not evident from the visible lines - confirm.
54 %define COL_SHIFT 20 ; 6
; Four dwords, each with 1 << (ROW_SHIFT - 1) in its low word: the usual
; round-to-nearest bias for a right shift by ROW_SHIFT (ROW_SHIFT is defined
; outside the visible lines). The third dword also has 1 in its high word,
; i.e. an extra 1 << 16.
; NOTE(review): presumably the first 16 bytes of the coeffs table - confirm.
57 dw 1 << (ROW_SHIFT - 1), 0
58 dw 1 << (ROW_SHIFT - 1), 0
59 dw 1 << (ROW_SHIFT - 1), 1
60 dw 1 << (ROW_SHIFT - 1), 0
; Transform body reading its input from blockq (presumably DC_COND_IDCT; the
; macro header line is not visible here).
; %1..%4 are byte offsets into blockq; each selects four packed 16-bit inputs:
;   %1 = R4 R0 r4 r0   %2 = R6 R2 r6 r2   %3 = R3 R1 r3 r1   %4 = R7 R5 r7 r5
; Lane comments appear to list elements from most to least significant; the
; upper-case and lower-case names are two independent data sets processed in
; the two dword lanes.
; A0..A3 are the even-input sums (R0, R2, R4, R6), B0..B3 the odd-input sums
; (R1, R3, R5, R7); the packed results are An+Bn and An-Bn.
83 movq mm0, [blockq + %1] ; R4 R0 r4 r0
84 movq mm1, [blockq + %2] ; R6 R2 r6 r2
85 movq mm2, [blockq + %3] ; R3 R1 r3 r1
86 movq mm3, [blockq + %4] ; R7 R5 r7 r5
; Even part: pmaddwd forms one two-term dot product per dword lane.
96 movq mm4, [coeffs + 16] ; C4 C4 C4 C4
97 pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
98 movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
99 pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
100 movq mm5, [coeffs + 32] ; C6 C2 C6 C2
101 pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
102 movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
103 pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
104 movq mm7, [coeffs + 48] ; C3 C1 C3 C1
105 pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
106 paddd mm4, [coeffs + 8] ; presumably the rounding bias kept at coeffs + 8
107 movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
108 paddd mm4, mm5 ; A0 a0
109 psubd mm6, mm5 ; A3 a3
110 movq mm5, [coeffs + 56] ; C7 C5 C7 C5
111 pmaddwd mm5, mm3 ; C7R7+C5R5 C7r7+C5r5
112 paddd mm0, [coeffs + 8] ; presumably the same rounding bias as above
113 paddd mm1, mm0 ; A1 a1
; NOTE(review): mm0 - mm1 equals A2 only if mm0 was doubled after the add above
; (compare the paddd mm4, mm4 / psubd mm4, mm7 pair below); that doubling is
; not among the visible lines - confirm it exists.
115 psubd mm0, mm1 ; A2 a2
116 pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
117 paddd mm7, mm5 ; B0 b0
118 movq mm5, [coeffs + 72] ; -C5 -C1 -C5 -C1
119 pmaddwd mm5, mm3 ; -C5R7-C1R5 -C5r7-C1r5
120 paddd mm7, mm4 ; A0+B0 a0+b0
; A0-B0 is derived as 2*A0 - (A0+B0), which avoids keeping a copy of B0.
121 paddd mm4, mm4 ; 2A0 2a0
122 psubd mm4, mm7 ; A0-B0 a0-b0
123 paddd mm5, mm2 ; B1 b1
126 movq mm2, mm1 ; A1 a1
127 paddd mm1, mm5 ; A1+B1 a1+b1
128 psubd mm2, mm5 ; A1-B1 a1-b1
; packssdw narrows the dword sums to signed words with saturation.
131 packssdw mm7, mm1 ; A1+B1 a1+b1 A0+B0 a0+b0
132 packssdw mm2, mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
; Second half: reload the R3/R1 input (mm2 was overwritten) and form B2, B3.
134 movq mm1, [blockq + %3] ; R3 R1 r3 r1
135 movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
137 pmaddwd mm4, mm1 ; -C1R3+C5R1 -C1r3+C5r1
138 movq mm7, [coeffs + 88] ; C3 C7 C3 C7
139 pmaddwd mm1, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
140 pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
141 movq mm2, mm0 ; A2 a2
142 pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
143 paddd mm4, mm7 ; B2 b2
144 paddd mm2, mm4 ; A2+B2 a2+b2
145 psubd mm0, mm4 ; A2-B2 a2-b2
148 movq mm4, mm6 ; A3 a3
149 paddd mm3, mm1 ; B3 b3
150 paddd mm6, mm3 ; A3+B3 a3+b3
151 psubd mm4, mm3 ; A3-B3 a3-b3
153 packssdw mm2, mm6 ; A3+B3 a3+b3 A2+B2 a2+b2
156 packssdw mm4, mm0 ; A2-B2 a2-b2 A3-B3 a3-b3
; Transform body reading its input from blockq (presumably Z_COND_IDCT; the
; macro header line is not visible here).
; %1..%4 are byte offsets into blockq; each selects four packed 16-bit inputs:
;   %1 = R4 R0 r4 r0   %2 = R6 R2 r6 r2   %3 = R3 R1 r3 r1   %4 = R7 R5 r7 r5
; A0..A3 are the even-input sums, B0..B3 the odd-input sums; the packed results
; are An+Bn and An-Bn. Unlike the preceding body, no add of [coeffs + 8] is
; among the visible lines.
172 movq mm0, [blockq + %1] ; R4 R0 r4 r0
173 movq mm1, [blockq + %2] ; R6 R2 r6 r2
174 movq mm2, [blockq + %3] ; R3 R1 r3 r1
175 movq mm3, [blockq + %4] ; R7 R5 r7 r5
; Even part: pmaddwd forms one two-term dot product per dword lane.
184 movq mm4, [coeffs + 16] ; C4 C4 C4 C4
185 pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
186 movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
187 pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
188 movq mm5, [coeffs + 32] ; C6 C2 C6 C2
189 pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
190 movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
191 pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
192 movq mm7, [coeffs + 48] ; C3 C1 C3 C1
193 pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
195 movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
196 paddd mm4, mm5 ; A0 a0
197 psubd mm6, mm5 ; A3 a3
198 movq mm5, [coeffs + 56] ; C7 C5 C7 C5
199 pmaddwd mm5, mm3 ; C7R7+C5R5 C7r7+C5r5
201 paddd mm1, mm0 ; A1 a1
; NOTE(review): mm0 - mm1 equals A2 only if mm0 was doubled after the add above
; (compare the paddd mm4, mm4 / psubd mm4, mm7 pair below); that doubling is
; not among the visible lines - confirm it exists.
203 psubd mm0, mm1 ; A2 a2
204 pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
205 paddd mm7, mm5 ; B0 b0
206 movq mm5, [coeffs + 72] ; -C5 -C1 -C5 -C1
207 pmaddwd mm5, mm3 ; -C5R7-C1R5 -C5r7-C1r5
208 paddd mm7, mm4 ; A0+B0 a0+b0
; A0-B0 is derived as 2*A0 - (A0+B0), which avoids keeping a copy of B0.
209 paddd mm4, mm4 ; 2A0 2a0
210 psubd mm4, mm7 ; A0-B0 a0-b0
211 paddd mm5, mm2 ; B1 b1
214 movq mm2, mm1 ; A1 a1
215 paddd mm1, mm5 ; A1+B1 a1+b1
216 psubd mm2, mm5 ; A1-B1 a1-b1
; packssdw narrows the dword sums to signed words with saturation.
219 packssdw mm7, mm1 ; A1+B1 a1+b1 A0+B0 a0+b0
220 packssdw mm2, mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
; Second half: reload the R3/R1 input (mm2 was overwritten) and form B2, B3.
222 movq mm1, [blockq + %3] ; R3 R1 r3 r1
223 movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
225 pmaddwd mm4, mm1 ; -C1R3+C5R1 -C1r3+C5r1
226 movq mm7, [coeffs + 88] ; C3 C7 C3 C7
227 pmaddwd mm1, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
228 pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
229 movq mm2, mm0 ; A2 a2
230 pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
231 paddd mm4, mm7 ; B2 b2
232 paddd mm2, mm4 ; A2+B2 a2+b2
233 psubd mm0, mm4 ; A2-B2 a2-b2
236 movq mm4, mm6 ; A3 a3
237 paddd mm3, mm1 ; B3 b3
238 paddd mm6, mm3 ; A3+B3 a3+b3
239 psubd mm4, mm3 ; A3-B3 a3-b3
241 packssdw mm2, mm6 ; A3+B3 a3+b3 A2+B2 a2+b2
244 packssdw mm4, mm0 ; A2-B2 a2-b2 A3-B3 a3-b3
; Transform body taking its inputs as ready-made operands (presumably IDCT1;
; the macro header line is not visible here). All four inputs are used:
;   %1 = R4 R0 r4 r0   %2 = R6 R2 r6 r2   %3 = R3 R1 r3 r1   %4 = R7 R5 r7 r5
; A0..A3 are the even-input sums, B0..B3 the odd-input sums. Each result
; An+Bn / An-Bn is narrowed separately by packing a register with itself.
249 movq mm0, %1 ; R4 R0 r4 r0
250 movq mm1, %2 ; R6 R2 r6 r2
251 movq mm2, %3 ; R3 R1 r3 r1
252 movq mm3, %4 ; R7 R5 r7 r5
253 movq mm4, [coeffs + 16] ; C4 C4 C4 C4
254 pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
255 movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
256 pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
257 movq mm5, [coeffs + 32] ; C6 C2 C6 C2
258 pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
259 movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
260 pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
261 movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
262 movq mm7, [coeffs + 48] ; C3 C1 C3 C1
263 pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
264 paddd mm4, mm5 ; A0 a0
265 psubd mm6, mm5 ; A3 a3
266 movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
267 paddd mm0, mm1 ; A1 a1
268 psubd mm5, mm1 ; A2 a2
269 movq mm1, [coeffs + 56] ; C7 C5 C7 C5
270 pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
271 pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
272 paddd mm7, mm1 ; B0 b0
273 movq mm1, [coeffs + 72] ; -C5 -C1 -C5 -C1
274 pmaddwd mm1, mm3 ; -C5R7-C1R5 -C5r7-C1r5
275 paddd mm7, mm4 ; A0+B0 a0+b0
; A0-B0 is derived as 2*A0 - (A0+B0), which avoids keeping a copy of B0.
276 paddd mm4, mm4 ; 2A0 2a0
277 psubd mm4, mm7 ; A0-B0 a0-b0
278 paddd mm1, mm2 ; B1 b1
281 movq mm2, mm0 ; A1 a1
282 paddd mm0, mm1 ; A1+B1 a1+b1
283 psubd mm2, mm1 ; A1-B1 a1-b1
; Packing a register with itself duplicates the two saturated words in both halves.
286 packssdw mm7, mm7 ; A0+B0 a0+b0
288 packssdw mm0, mm0 ; A1+B1 a1+b1
290 packssdw mm2, mm2 ; A1-B1 a1-b1
292 packssdw mm4, mm4 ; A0-B0 a0-b0
; Second half: reload the R3/R1 input (mm2 was overwritten) and form B2, B3.
294 movq mm0, %3 ; R3 R1 r3 r1
295 movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
296 pmaddwd mm4, mm0 ; -C1R3+C5R1 -C1r3+C5r1
297 movq mm7, [coeffs + 88] ; C3 C7 C3 C7
298 pmaddwd mm0, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
299 pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
300 movq mm2, mm5 ; A2 a2
301 pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
302 paddd mm4, mm7 ; B2 b2
303 paddd mm2, mm4 ; A2+B2 a2+b2
304 psubd mm5, mm4 ; A2-B2 a2-b2
307 movq mm4, mm6 ; A3 a3
308 paddd mm3, mm0 ; B3 b3
309 paddd mm6, mm3 ; A3+B3 a3+b3
310 psubd mm4, mm3 ; A3-B3 a3-b3
313 packssdw mm2, mm2 ; A2+B2 a2+b2
314 packssdw mm6, mm6 ; A3+B3 a3+b3
316 packssdw mm4, mm4 ; A3-B3 a3-b3
317 packssdw mm5, mm5 ; A2-B2 a2-b2
; Transform body taking its inputs as ready-made operands (presumably IDCT2;
; the macro header line is not visible here).
; Reads %1 (R4 R0 r4 r0), %2 (R6 R2 r6 r2) and %4 (R7 R5 r7 r5); %3 is never
; loaded in the visible lines, so every B term is built from R7/R5 only.
324 movq mm0, %1 ; R4 R0 r4 r0
325 movq mm1, %2 ; R6 R2 r6 r2
326 movq mm3, %4 ; R7 R5 r7 r5
327 movq mm4, [coeffs + 16] ; C4 C4 C4 C4
328 pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
329 movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
330 pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
331 movq mm5, [coeffs + 32] ; C6 C2 C6 C2
332 pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
333 movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
334 pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
335 movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
336 paddd mm4, mm5 ; A0 a0
337 psubd mm6, mm5 ; A3 a3
338 movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
339 paddd mm0, mm1 ; A1 a1
340 psubd mm5, mm1 ; A2 a2
; With R3/R1 absent, B0 is just C7R7+C5R5 and B1 is -C5R7-C1R5.
341 movq mm1, [coeffs + 56] ; C7 C5 C7 C5
342 pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
343 movq mm7, [coeffs + 72] ; -C5 -C1 -C5 -C1
344 pmaddwd mm7, mm3 ; -C5R7-C1R5 -C5r7-C1r5
345 paddd mm1, mm4 ; A0+B0 a0+b0
; A0-B0 is derived as 2*A0 - (A0+B0), which avoids keeping a copy of B0.
346 paddd mm4, mm4 ; 2A0 2a0
347 psubd mm4, mm1 ; A0-B0 a0-b0
350 movq mm2, mm0 ; A1 a1
351 paddd mm0, mm7 ; A1+B1 a1+b1
352 psubd mm2, mm7 ; A1-B1 a1-b1
; Packing a register with itself duplicates the two saturated words in both halves.
355 packssdw mm1, mm1 ; A0+B0 a0+b0
357 packssdw mm0, mm0 ; A1+B1 a1+b1
359 packssdw mm2, mm2 ; A1-B1 a1-b1
361 packssdw mm4, mm4 ; A0-B0 a0-b0
; Second half: B2 = C3R7+C7R5 and B3 = -C1R7+C3R5.
363 movq mm1, [coeffs + 88] ; C3 C7 C3 C7
364 pmaddwd mm1, mm3 ; C3R7+C7R5 C3r7+C7r5
365 movq mm2, mm5 ; A2 a2
366 pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
367 paddd mm2, mm1 ; A2+B2 a2+b2
368 psubd mm5, mm1 ; A2-B2 a2-b2
371 movq mm1, mm6 ; A3 a3
372 paddd mm6, mm3 ; A3+B3 a3+b3
373 psubd mm1, mm3 ; A3-B3 a3-b3
376 packssdw mm2, mm2 ; A2+B2 a2+b2
377 packssdw mm6, mm6 ; A3+B3 a3+b3
379 packssdw mm1, mm1 ; A3-B3 a3-b3
380 packssdw mm5, mm5 ; A2-B2 a2-b2
; Transform body taking its inputs as ready-made operands (presumably IDCT3;
; the macro header line is not visible here).
; Reads only %1 (R4 R0 r4 r0) and %4 (R7 R5 r7 r5). With R6/R2 absent the even
; sums collapse: A0 and A3 are both C4R4+C4R0, A1 and A2 are both -C4R4+C4R0.
; With R3/R1 absent every B term is built from R7/R5 only.
387 movq mm0, %1 ; R4 R0 r4 r0
388 movq mm3, %4 ; R7 R5 r7 r5
389 movq mm4, [coeffs + 16] ; C4 C4 C4 C4
390 pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
391 movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
392 pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
; mm6 keeps a copy for the A3 results and mm5 a copy for the A2 results.
393 movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
394 movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
395 movq mm1, [coeffs + 56] ; C7 C5 C7 C5
396 pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
397 movq mm7, [coeffs + 72] ; -C5 -C1 -C5 -C1
398 pmaddwd mm7, mm3 ; -C5R7-C1R5 -C5r7-C1r5
399 paddd mm1, mm4 ; A0+B0 a0+b0
; A0-B0 is derived as 2*A0 - (A0+B0), which avoids keeping a copy of B0.
400 paddd mm4, mm4 ; 2A0 2a0
401 psubd mm4, mm1 ; A0-B0 a0-b0
404 movq mm2, mm0 ; A1 a1
405 paddd mm0, mm7 ; A1+B1 a1+b1
406 psubd mm2, mm7 ; A1-B1 a1-b1
; Packing a register with itself duplicates the two saturated words in both halves.
409 packssdw mm1, mm1 ; A0+B0 a0+b0
411 packssdw mm0, mm0 ; A1+B1 a1+b1
413 packssdw mm2, mm2 ; A1-B1 a1-b1
415 packssdw mm4, mm4 ; A0-B0 a0-b0
; Second half: B2 = C3R7+C7R5 and B3 = -C1R7+C3R5.
417 movq mm1, [coeffs + 88] ; C3 C7 C3 C7
418 pmaddwd mm1, mm3 ; C3R7+C7R5 C3r7+C7r5
419 movq mm2, mm5 ; A2 a2
420 pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
421 paddd mm2, mm1 ; A2+B2 a2+b2
422 psubd mm5, mm1 ; A2-B2 a2-b2
425 movq mm1, mm6 ; A3 a3
426 paddd mm6, mm3 ; A3+B3 a3+b3
427 psubd mm1, mm3 ; A3-B3 a3-b3
430 packssdw mm2, mm2 ; A2+B2 a2+b2
431 packssdw mm6, mm6 ; A3+B3 a3+b3
433 packssdw mm1, mm1 ; A3-B3 a3-b3
434 packssdw mm5, mm5 ; A2-B2 a2-b2
; Transform body taking its inputs as ready-made operands (presumably IDCT4;
; the macro header line is not visible here).
; Reads %1 (R4 R0 r4 r0), %3 (R3 R1 r3 r1) and %4 (R7 R5 r7 r5); %2 is never
; loaded in the visible lines. With R6/R2 absent the even sums collapse:
; A0 and A3 are both C4R4+C4R0, A1 and A2 are both -C4R4+C4R0.
441 movq mm0, %1 ; R4 R0 r4 r0
442 movq mm2, %3 ; R3 R1 r3 r1
443 movq mm3, %4 ; R7 R5 r7 r5
444 movq mm4, [coeffs + 16] ; C4 C4 C4 C4
445 pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
446 movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
447 pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
; mm6 keeps a copy for the A3 results and mm5 a copy for the A2 results.
448 movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
449 movq mm7, [coeffs + 48] ; C3 C1 C3 C1
450 pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
451 movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
452 movq mm1, [coeffs + 56] ; C7 C5 C7 C5
453 pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
454 pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
455 paddd mm7, mm1 ; B0 b0
456 movq mm1, [coeffs + 72] ; -C5 -C1 -C5 -C1
457 pmaddwd mm1, mm3 ; -C5R7-C1R5 -C5r7-C1r5
458 paddd mm7, mm4 ; A0+B0 a0+b0
; A0-B0 is derived as 2*A0 - (A0+B0), which avoids keeping a copy of B0.
459 paddd mm4, mm4 ; 2A0 2a0
460 psubd mm4, mm7 ; A0-B0 a0-b0
461 paddd mm1, mm2 ; B1 b1
464 movq mm2, mm0 ; A1 a1
465 paddd mm0, mm1 ; A1+B1 a1+b1
466 psubd mm2, mm1 ; A1-B1 a1-b1
; Packing a register with itself duplicates the two saturated words in both halves.
469 packssdw mm7, mm7 ; A0+B0 a0+b0
471 packssdw mm0, mm0 ; A1+B1 a1+b1
473 packssdw mm2, mm2 ; A1-B1 a1-b1
475 packssdw mm4, mm4 ; A0-B0 a0-b0
; Second half: reload the R3/R1 input (mm2 was overwritten) and form B2, B3.
477 movq mm0, %3 ; R3 R1 r3 r1
478 movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
479 pmaddwd mm4, mm0 ; -C1R3+C5R1 -C1r3+C5r1
480 movq mm7, [coeffs + 88] ; C3 C7 C3 C7
481 pmaddwd mm0, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
482 pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
483 movq mm2, mm5 ; A2 a2
484 pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
485 paddd mm4, mm7 ; B2 b2
486 paddd mm2, mm4 ; A2+B2 a2+b2
487 psubd mm5, mm4 ; A2-B2 a2-b2
490 movq mm4, mm6 ; A3 a3
491 paddd mm3, mm0 ; B3 b3
492 paddd mm6, mm3 ; A3+B3 a3+b3
493 psubd mm4, mm3 ; A3-B3 a3-b3
496 packssdw mm2, mm2 ; A2+B2 a2+b2
497 packssdw mm6, mm6 ; A3+B3 a3+b3
499 packssdw mm4, mm4 ; A3-B3 a3-b3
500 packssdw mm5, mm5 ; A2-B2 a2-b2
; Transform body taking its inputs as ready-made operands (presumably IDCT5;
; the macro header line is not visible here).
; Reads only %1 (R4 R0 r4 r0) and %3 (R3 R1 r3 r1). With R6/R2 absent the even
; sums collapse: A0 and A3 are both C4R4+C4R0, A1 and A2 are both -C4R4+C4R0.
; With R7/R5 absent every B term is built from R3/R1 only.
507 movq mm0, %1 ; R4 R0 r4 r0
508 movq mm2, %3 ; R3 R1 r3 r1
509 movq mm4, [coeffs + 16] ; C4 C4 C4 C4
510 pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
511 movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
512 pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
; mm6 keeps a copy for the A3 results and mm5 a copy for the A2 results.
513 movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
; B0 = C3R3+C1R1 is left in mm7.
514 movq mm7, [coeffs + 48] ; C3 C1 C3 C1
515 pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
516 movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
; B1 = -C7R3+C3R1 is left in mm3.
517 movq mm3, [coeffs + 64] ; -C7 C3 -C7 C3
518 pmaddwd mm3, mm2 ; -C7R3+C3R1 -C7r3+C3r1
519 paddd mm7, mm4 ; A0+B0 a0+b0
; A0-B0 is derived as 2*A0 - (A0+B0), which avoids keeping a copy of B0.
520 paddd mm4, mm4 ; 2A0 2a0
521 psubd mm4, mm7 ; A0-B0 a0-b0
524 movq mm1, mm0 ; A1 a1
525 paddd mm0, mm3 ; A1+B1 a1+b1
526 psubd mm1, mm3 ; A1-B1 a1-b1
; Packing a register with itself duplicates the two saturated words in both halves.
529 packssdw mm7, mm7 ; A0+B0 a0+b0
531 packssdw mm0, mm0 ; A1+B1 a1+b1
533 packssdw mm1, mm1 ; A1-B1 a1-b1
535 packssdw mm4, mm4 ; A0-B0 a0-b0
; Second half: B2 = -C1R3+C5R1 and B3 = -C5R3+C7R1.
537 movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
538 pmaddwd mm4, mm2 ; -C1R3+C5R1 -C1r3+C5r1
539 pmaddwd mm2, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
540 movq mm1, mm5 ; A2 a2
541 paddd mm1, mm4 ; A2+B2 a2+b2
542 psubd mm5, mm4 ; A2-B2 a2-b2
545 movq mm4, mm6 ; A3 a3
546 paddd mm6, mm2 ; A3+B3 a3+b3
547 psubd mm4, mm2 ; A3-B3 a3-b3
550 packssdw mm1, mm1 ; A2+B2 a2+b2
551 packssdw mm6, mm6 ; A3+B3 a3+b3
553 packssdw mm4, mm4 ; A3-B3 a3-b3
554 packssdw mm5, mm5 ; A2-B2 a2-b2
; Transform body taking its inputs as addresses (presumably IDCT6; the macro
; header line is not visible here).
; Only the even inputs are read: [%1] = R4 R0 r4 r0 and [%2] = R6 R2 r6 r2, so
; the results are the plain even sums A0..A3 with no B terms.
; Two adjacent 8-byte groups are handled per expansion: the one at [%1] / [%2]
; and the next one at [8 + %1] / [8 + %2]; their results are packed together.
561 movq mm0, [%1] ; R4 R0 r4 r0
562 movq mm1, [%2] ; R6 R2 r6 r2
563 movq mm4, [coeffs + 16] ; C4 C4 C4 C4
564 pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
565 movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
566 pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
567 movq mm5, [coeffs + 32] ; C6 C2 C6 C2
568 pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
569 movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
570 pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
571 movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
572 paddd mm4, mm5 ; A0 a0
573 psubd mm6, mm5 ; A3 a3
574 movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
575 paddd mm0, mm1 ; A1 a1
576 psubd mm5, mm1 ; A2 a2
; Same even-part computation for the next 8-byte group.
577 movq mm2, [8 + %1] ; R4 R0 r4 r0
578 movq mm3, [8 + %2] ; R6 R2 r6 r2
579 movq mm1, [coeffs + 16] ; C4 C4 C4 C4
580 pmaddwd mm1, mm2 ; C4R4+C4R0 C4r4+C4r0
581 movq mm7, [coeffs + 24] ; -C4 C4 -C4 C4
582 pmaddwd mm2, mm7 ; -C4R4+C4R0 -C4r4+C4r0
583 movq mm7, [coeffs + 32] ; C6 C2 C6 C2
584 pmaddwd mm7, mm3 ; C6R6+C2R2 C6r6+C2r2
585 pmaddwd mm3, [coeffs + 40] ; -C2R6+C6R2 -C2r6+C6r2
586 paddd mm7, mm1 ; A0 a0
; A3 is derived as 2*(C4R4+C4R0) - A0 and A2 as 2*(-C4R4+C4R0) - A1.
587 paddd mm1, mm1 ; 2C0 2c0
588 psubd mm1, mm7 ; A3 a3
589 paddd mm3, mm2 ; A1 a1
590 paddd mm2, mm2 ; 2C1 2c1
591 psubd mm2, mm3 ; A2 a2
; Each pack joins the first group (low half) with the second group (high half).
595 packssdw mm4, mm7 ; A0 a0
598 packssdw mm0, mm3 ; A1 a1
605 packssdw mm5, mm2 ; A2 a2
608 packssdw mm6, mm1 ; A3 a3
; Transform body taking its inputs as ready-made operands (presumably IDCT7;
; the macro header line is not visible here).
; Reads %1 (R4 R0 r4 r0), %2 (R6 R2 r6 r2) and %3 (R3 R1 r3 r1); %4 is never
; loaded in the visible lines, so every B term is built from R3/R1 only.
615 movq mm0, %1 ; R4 R0 r4 r0
616 movq mm1, %2 ; R6 R2 r6 r2
617 movq mm2, %3 ; R3 R1 r3 r1
618 movq mm4, [coeffs + 16] ; C4 C4 C4 C4
619 pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
620 movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
621 pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
622 movq mm5, [coeffs + 32] ; C6 C2 C6 C2
623 pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
624 movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
625 pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
626 movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
; B0 = C3R3+C1R1 is left in mm7.
627 movq mm7, [coeffs + 48] ; C3 C1 C3 C1
628 pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
629 paddd mm4, mm5 ; A0 a0
630 psubd mm6, mm5 ; A3 a3
631 movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
632 paddd mm0, mm1 ; A1 a1
633 psubd mm5, mm1 ; A2 a2
; B1 = -C7R3+C3R1 is left in mm1.
634 movq mm1, [coeffs + 64] ; -C7 C3 -C7 C3
635 pmaddwd mm1, mm2 ; -C7R3+C3R1 -C7r3+C3r1
636 paddd mm7, mm4 ; A0+B0 a0+b0
; A0-B0 is derived as 2*A0 - (A0+B0), which avoids keeping a copy of B0.
637 paddd mm4, mm4 ; 2A0 2a0
638 psubd mm4, mm7 ; A0-B0 a0-b0
641 movq mm3, mm0 ; A1 a1
642 paddd mm0, mm1 ; A1+B1 a1+b1
643 psubd mm3, mm1 ; A1-B1 a1-b1
; Packing a register with itself duplicates the two saturated words in both halves.
646 packssdw mm7, mm7 ; A0+B0 a0+b0
648 packssdw mm0, mm0 ; A1+B1 a1+b1
650 packssdw mm3, mm3 ; A1-B1 a1-b1
652 packssdw mm4, mm4 ; A0-B0 a0-b0
; Second half: B2 = -C1R3+C5R1 and B3 = -C5R3+C7R1.
654 movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
655 pmaddwd mm4, mm2 ; -C1R3+C5R1 -C1r3+C5r1
656 pmaddwd mm2, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
657 movq mm3, mm5 ; A2 a2
658 paddd mm3, mm4 ; A2+B2 a2+b2
659 psubd mm5, mm4 ; A2-B2 a2-b2
662 movq mm4, mm6 ; A3 a3
663 paddd mm6, mm2 ; A3+B3 a3+b3
664 psubd mm4, mm2 ; A3-B3 a3-b3
666 packssdw mm3, mm3 ; A2+B2 a2+b2
669 packssdw mm6, mm6 ; A3+B3 a3+b3
671 packssdw mm4, mm4 ; A3-B3 a3-b3
672 packssdw mm5, mm5 ; A2-B2 a2-b2
; Transform body taking its input as an address (presumably IDCT8; the macro
; header line is not visible here).
; Only [%1] (R4 R0 r4 r0) is read, for two adjacent 8-byte groups at [%1] and
; [8 + %1]. The only values formed are C4R4+C4R0 and -C4R4+C4R0 per group.
678 movq mm0, [%1] ; R4 R0 r4 r0
679 movq mm4, [coeffs + 16] ; C4 C4 C4 C4
680 pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
681 movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
682 pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
; Same computation for the next 8-byte group.
685 movq mm2, [8 + %1] ; R4 R0 r4 r0
686 movq mm1, [coeffs + 16] ; C4 C4 C4 C4
687 pmaddwd mm1, mm2 ; C4R4+C4R0 C4r4+C4r0
688 movq mm7, [coeffs + 24] ; -C4 C4 -C4 C4
689 pmaddwd mm2, mm7 ; -C4R4+C4R0 -C4r4+C4r0
; NOTE(review): mm7 is not read in the visible lines after this load - confirm
; whether the load is needed.
690 movq mm7, [coeffs + 32] ; C6 C2 C6 C2
; Each pack joins the first group (low half) with the second group (high half).
692 packssdw mm4, mm1 ; A0 a0
695 packssdw mm0, mm2 ; A1 a1
; Invocation sequence of the two transform passes (presumably the body of a
; driver macro; its header line and the local labels are not visible here).
;
; First pass: four invocations, each given four consecutive 8-byte offsets into
; blockq (0..24, 32..56, 64..88, 96..120), an rsp-relative address, the token
; null and the constant 11. NOTE(review): the rsp address is presumably the
; destination in the stack scratch area and 11 the right-shift amount; the
; lines that would use them are not visible here - confirm.
; Z_COND_IDCT additionally takes a local label. NOTE(review): presumably the
; jump target when its input is all zero, selecting one of the reduced
; second-pass variants below - confirm.
706 DC_COND_IDCT 0, 8, 16, 24, rsp + 0, null, 11
707 Z_COND_IDCT 32, 40, 48, 56, rsp + 32, null, 11, %%4
708 Z_COND_IDCT 64, 72, 80, 88, rsp + 64, null, 11, %%2
709 Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%1
; Second pass, full variant. Each line passes the operands in the order
; rsp + k, rsp + 64 + k, rsp + 32 + k, rsp + 96 + k, followed by a blockq-based
; address and the constant 20.
711 IDCT1 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
712 IDCT1 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
713 IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
714 IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
; Remaining first-pass invocations followed by the IDCT2 variant.
719 Z_COND_IDCT 64, 72, 80, 88, rsp + 64, null, 11, %%6
720 Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%5
722 IDCT2 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
723 IDCT2 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
724 IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
725 IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
; Last first-pass invocation followed by the IDCT3 variant.
730 Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%7
732 IDCT3 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
733 IDCT3 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
734 IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
735 IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
; Last first-pass invocation followed by the IDCT4 variant.
740 Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%3
742 IDCT4 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
743 IDCT4 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
744 IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
745 IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
; IDCT5 variant.
751 IDCT5 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
752 IDCT5 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
753 IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
754 IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
; IDCT6 variant: operands are passed without brackets and advance by 16 bytes
; per line, so only two invocations appear instead of four.
760 IDCT6 rsp + 0, rsp + 64, rsp + 32, rsp + 96, blockq + 0, 20
761 IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20
; IDCT7 variant.
767 IDCT7 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
768 IDCT7 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
769 IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
770 IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
; IDCT8 variant: operands are passed without brackets and advance by 16 bytes
; per line, so only two invocations appear instead of four.
776 IDCT8 rsp + 0, rsp + 64, rsp + 32, rsp + 96, blockq + 0, 20
777 IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20
; PUT_PIXELS_CLAMPED_HALF offset
; Reads 16-bit values from blockq starting at the given byte offset, narrows
; them to unsigned bytes with saturation (packuswb) and stores 8-byte groups
; to pixelsq at strides lsizeq, 2*lsizeq and lsize3q.
; Uses m0..m3 as scratch.
; NOTE(review): the visible stores overlap (both m2 and m1 are written to
; 2*lsizeq+pixelsq, both m1 and m0 to lsizeq+pixelsq), so these lines presumably
; belong to alternative mmsize-dependent branches whose conditional lines and
; macro terminator are not visible here - confirm.
782 %macro PUT_PIXELS_CLAMPED_HALF 1
783 mova m0, [blockq+mmsize*0+%1]
784 mova m1, [blockq+mmsize*2+%1]
786 mova m2, [blockq+mmsize*4+%1]
787 mova m3, [blockq+mmsize*6+%1]
; Each even-numbered vector is packed with the odd-numbered vector after it.
789 packuswb m0, [blockq+mmsize*1+%1]
790 packuswb m1, [blockq+mmsize*3+%1]
792 packuswb m2, [blockq+mmsize*5+%1]
793 packuswb m3, [blockq+mmsize*7+%1]
; Stores of the low 8 bytes of m1, m2, m3.
795 movq [lsizeq+pixelsq], m1
796 movq [2*lsizeq+pixelsq], m2
797 movq [lsize3q+pixelsq], m3
; Stores that also use the high 8 bytes (movhps) of m0 and m1.
800 movhps [lsizeq+pixelsq], m0
801 movq [2*lsizeq+pixelsq], m1
802 movhps [lsize3q+pixelsq], m1
; ADD_PIXELS_CLAMPED offset
; Loads four vectors of 16-bit values from blockq starting at the given byte
; offset into m0, m1, m5 and m6, loads 8 bytes from pixelsq+lsizeq into m3,
; and stores results back to pixelsq+lsizeq.
; NOTE(review): most of the body (the add and pack steps, any conditional
; branches and the macro terminator) is not visible here, so the exact
; arithmetic cannot be described from these lines - confirm against the
; complete macro.
806 %macro ADD_PIXELS_CLAMPED 1
807 mova m0, [blockq+mmsize*0+%1]
808 mova m1, [blockq+mmsize*1+%1]
810 mova m5, [blockq+mmsize*2+%1]
811 mova m6, [blockq+mmsize*3+%1]
; Existing pixel bytes of the line at pixelsq+lsizeq.
814 movq m3, [pixelsq+lsizeq]
; The two stores below target the same address; presumably they sit in
; alternative mmsize-dependent branches - confirm.
836 movq [pixelsq+lsizeq], m5
839 movhps [pixelsq+lsizeq], m0
; simple_idct: entry point declared with 1 argument (block), 2 general-purpose
; registers (block, t0), 8 vector registers and 128 bytes of stack.
; NOTE(review): the function body is not visible here.
845 cglobal simple_idct, 1, 2, 8, 128, block, t0
; simple_idct_put: entry point declared with 3 arguments (pixels, lsize, block),
; 5 general-purpose registers (adds lsize3 and t0), 8 vector registers and
; 128 bytes of stack.
; NOTE(review): the transform step that presumably precedes the stores is not
; visible here.
849 cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
851 lea lsize3q, [lsizeq*3] ; lsize3 = 3 * lsize, used as a store stride
852 PUT_PIXELS_CLAMPED_HALF 0 ; block bytes starting at offset 0
853 lea pixelsq, [pixelsq+lsizeq*4] ; advance the destination by four lines
854 PUT_PIXELS_CLAMPED_HALF 64 ; block bytes starting at offset 64
; simple_idct_add: entry point declared with 3 arguments (pixels, lsize, block),
; 4 general-purpose registers (adds t0), 8 vector registers and 128 bytes of
; stack.
; NOTE(review): the transform step and an ADD_PIXELS_CLAMPED call for block
; offset 0 presumably precede these lines but are not visible here.
857 cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
861 lea pixelsq, [pixelsq+lsizeq*2] ; advance the destination by two lines
862 ADD_PIXELS_CLAMPED 32 ; block bytes starting at offset 32
863 lea pixelsq, [pixelsq+lsizeq*2] ; advance the destination by two lines
864 ADD_PIXELS_CLAMPED 64 ; block bytes starting at offset 64
865 lea pixelsq, [pixelsq+lsizeq*2] ; advance the destination by two lines
866 ADD_PIXELS_CLAMPED 96 ; block bytes starting at offset 96
; simple_idct_put, second declaration with the same signature: 3 arguments
; (pixels, lsize, block), 5 general-purpose registers, 8 vector registers and
; 128 bytes of stack.
; NOTE(review): presumably assembled for a different instruction set than the
; earlier declaration; the selecting directive and the transform step are not
; visible here - confirm.
871 cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
873 lea lsize3q, [lsizeq*3] ; lsize3 = 3 * lsize, used as a store stride
874 PUT_PIXELS_CLAMPED_HALF 0 ; block bytes starting at offset 0
875 lea pixelsq, [pixelsq+lsizeq*4] ; advance the destination by four lines
876 PUT_PIXELS_CLAMPED_HALF 64 ; block bytes starting at offset 64
; simple_idct_add, second declaration with the same signature: 3 arguments
; (pixels, lsize, block), 4 general-purpose registers, 8 vector registers and
; 128 bytes of stack.
; NOTE(review): presumably assembled for a different instruction set than the
; earlier declaration; the selecting directive, the transform step and an
; ADD_PIXELS_CLAMPED call for block offset 0 are not visible here - confirm.
879 cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
883 lea pixelsq, [pixelsq+lsizeq*2] ; advance the destination by two lines
884 ADD_PIXELS_CLAMPED 32 ; block bytes starting at offset 32
885 lea pixelsq, [pixelsq+lsizeq*2] ; advance the destination by two lines
886 ADD_PIXELS_CLAMPED 64 ; block bytes starting at offset 64
887 lea pixelsq, [pixelsq+lsizeq*2] ; advance the destination by two lines
888 ADD_PIXELS_CLAMPED 96 ; block bytes starting at offset 96