1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt
5 ;* This algorithm (though not any of the implementation details) is
6 ;* based on libdjbfft by D. J. Bernstein.
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 ; These functions are not individually interchangeable with the C versions.
26 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
27 ; in blocks as convenient to the vector size.
28 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
51 %define M_SQRT1_2 0.70710678118654752440
; 1/sqrt(2) broadcast to all four single-precision lanes
52 ps_root2: times 4 dd M_SQRT1_2
; 1/sqrt(2) with per-lane sign pattern -,+,+,- ("mppm" in the name)
53 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; XOR mask with only the sign bit (1<<31) of the third float set; used by
; the xorps in the SSE 4-point transform below to negate that lane only
54 ps_p1p1m1p1: dd 0, 0, 1<<31, 0
75 section .text align=16
; 2-point FFT butterfly, 3DNow! version (macro body elided in this listing).
77 %macro T2_3DN 4 ; z0, z1, mem0, mem1
; 4-point FFT butterfly, 3DNow! version. pfadd/pfsub/pxor operate on 64-bit
; MMX registers holding one complex value {re,im} each.
; NOTE(review): interior lines (including the loads producing the t-terms,
; the ps_m1p1 constant definition, and %endmacro) are elided in this listing.
84 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
87 pfadd %5, %4 ; {t6,t5}
88 pxor %3, [ps_m1p1] ; {t8,t7}
91 pfadd %1, %5 ; {r0,i0}
92 pfsub %6, %5 ; {r2,i2}
94 pfadd %2, %3 ; {r1,i1}
95 pfsub %4, %3 ; {r3,i3}
; 4-point complex FFT, SSE version: two interleaved complex values per XMM
; register, finishing with a de-interleave into all-real / all-imaginary regs.
; NOTE(review): the %macro header and some interior lines are elided in this
; listing; %3 is presumably a temp copy of %1 made in an elided line — confirm
; against the full source.
99 ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
100 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
103 addps %1, %2 ; {t1,t2,t6,t5}
104 subps %3, %2 ; {t3,t4,-t8,t7}
; negate lane 2 only (-t8 -> t8); ps_p1p1m1p1 = {0, 0, 1<<31, 0}
105 xorps %3, [ps_p1p1m1p1]
107 shufps %1, %3, 0x44 ; {t1,t2,t3,t4}
108 shufps %2, %3, 0xbe ; {t6,t5,t7,t8}
110 addps %1, %2 ; {r0,i0,r1,i1}
111 subps %3, %2 ; {r2,i2,r3,i3}
; split interleaved {re,im} pairs: even lanes -> reals, odd lanes -> imaginaries
113 shufps %1, %3, 0x88 ; {r0,r1,r2,r3}
114 shufps %2, %3, 0xdd ; {i0,i1,i2,i3}
; Second half of the 8-point SSE transform: folds the upper four complex
; values (held interleaved in %3/%4 on entry) into the transformed lower half.
; NOTE(review): interior lines are elided in this listing.
117 ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; NOTE(review): the original "out" comment listed %1/%2 twice; the final subps
; below write the upper-half results into %3 and %5, so the comment is fixed
; to match the visible code.
118 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %5={i4,i5,i6,i7}
121 subps %3, %4 ; {r5,i5,r7,i7}
122 addps %6, %4 ; {t1,t2,t3,t4}
124 shufps %4, %4, 0xb1 ; {i5,r5,i7,r7}
; scale by 1/sqrt(2) with signs -,+,+,- (twiddle factor for the odd terms)
125 mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
127 addps %3, %4 ; {t8,t7,ta,t9}
129 shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
130 shufps %4, %3, 0x9c ; {t1,t4,t7,ta}
132 addps %6, %4 ; {t1,t2,t9,ta}
133 subps %3, %4 ; {t6,t5,tc,tb}
135 shufps %6, %3, 0xd8 ; {t1,t9,t5,tb}
136 shufps %4, %3, 0x8d ; {t2,ta,t6,tc}
139 addps %1, %6 ; {r0,r1,r2,r3}
140 addps %2, %4 ; {i0,i1,i2,i3}
141 subps %3, %6 ; {r4,r5,r6,r7}
142 subps %5, %4 ; {i4,i5,i6,i7}
146 ; scheduled for cpu-bound sizes
; Radix-4 combine pass: forms the two complex twiddle products
;   z2 * w  = (r2*wre + i2*wim,  i2*wre - r2*wim)   -> m2, m5
;   z3 * w' = (r3*wre - i3*wim,  i3*wre + r3*wim)   -> m4, m0
; (products read directly from the per-line comments below).
; NOTE(review): the loads of m4-m7, the butterfly that combines the products
; with z0/z1, the loop control and %endmacro are elided in this listing.
147 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
154 mulps m2, m0 ; r2*wre
156 mulps m3, m1 ; i2*wim
158 mulps m4, m1 ; r2*wim
159 mulps m5, m0 ; i2*wre
160 addps m2, m3 ; r2*wre + i2*wim
162 mulps m1, m6 ; r3*wim
163 subps m5, m4 ; i2*wre - r2*wim
165 mulps m3, m7 ; i3*wim
166 mulps m4, m6 ; r3*wre
167 mulps m0, m7 ; i3*wre
168 subps m4, m3 ; r3*wre - i3*wim
170 addps m0, m1 ; i3*wre + r3*wim
200 ; scheduled to avoid store->load aliasing
; Same twiddle products as PASS_SMALL, but the imaginary twiddle (wim) is
; loaded from memory at [wq+o1q]; used for the larger, memory-bound passes.
; %1 selects whether results are left deinterleaved or not (per the macro
; argument comment).
; NOTE(review): loads, stores, loop control and %endmacro are elided in this
; listing.
201 %macro PASS_BIG 1 ; (!interleave)
207 mova m1, [wq+o1q] ; wim
208 mulps m2, m0 ; r2*wre
210 mulps m3, m1 ; i2*wim
212 mulps m4, m1 ; r2*wim
213 mulps m5, m0 ; i2*wre
214 addps m2, m3 ; r2*wre + i2*wim
216 mulps m1, m6 ; r3*wim
217 subps m5, m4 ; i2*wre - r2*wim
219 mulps m3, m7 ; i3*wim
220 mulps m4, m6 ; r3*wre
221 mulps m0, m7 ; i3*wre
222 subps m4, m3 ; r3*wre - i3*wim
224 addps m0, m1 ; i3*wre + r3*wim
; Addressing helpers for the fixed-size kernels: both index the work buffer
; at r0 in mmsize-byte blocks (Z and Z2 are identical here; both are
; redefined with different strides for the generic passes further down).
287 %define Z(x) [r0+mmsize*x]
288 %define Z2(x) [r0+mmsize*x]
; NOTE(review): the entry labels/cglobal lines of the small fixed-size FFT
; kernels (presumably fft4/fft8/fft16 for SSE and 3DNow! — confirm against
; the full source) are elided in this listing; only scattered macro
; invocations from their bodies remain below.
306 T8_SSE m0, m1, m2, m3, m4, m5
320 T8_SSE m0, m1, m2, m3, m4, m5
; 16-point: combine with precomputed cos_16 twiddle tables
331 PASS_SMALL 0, [cos_16], [cos_16+16]
340 T2_3DN m0, m1, Z(0), Z(1)
343 T4_3DN m0, m1, m2, m3, m4, m5
354 T2_3DN m0, m1, Z(0), Z(1)
357 T4_3DN m0, m1, m2, m3, m4, m5
360 T2_3DN m4, m5, Z(4), Z(5)
361 T2_3DN m6, m7, Z2(6), Z2(7)
370 T4_3DN m1, m3, m5, m7, m0, m2
375 T4_3DN m0, m2, m4, m6, m5, m7
; Strided addressing for the generic passes: consecutive z blocks are o1q
; apart, with Z2 reaching the block at offset o3q.
407 %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
408 %define Z2(x) [zq + o3q + mmsize*(x&1)]
; Wrap a pass payload macro in a callable body with args z, w, n, o1, o3.
; NOTE(review): the loop body and %endmacro are elided in this listing.
410 %macro DECL_PASS 2+ ; name, payload
413 DEFINE_ARGS z, w, n, o1, o3
428 DECL_PASS pass_sse, PASS_BIG 1
429 DECL_PASS pass_interleave_sse, PASS_BIG 0
; Map the SSE unpack mnemonics onto their MMX integer equivalents so the
; same pass macros assemble for the 3DNow! build.
435 %define unpcklps punpckldq
436 %define unpckhps punpckhdq
437 DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
438 DECL_PASS pass_interleave_3dn, PASS_BIG 0
; The 3DNow!2 build reuses the plain 3DNow! passes unchanged.
439 %define pass_3dn2 pass_3dn
440 %define pass_interleave_3dn2 pass_interleave_3dn
; Make dispatch-table entries section-relative offsets (subtract the section
; start $$), keeping the table position-independent.
443 %define SECTION_REL - $$
; Look up and jump to the FFT kernel for a given transform size; the table
; is indexed by nbits-2 since the smallest entry is fft4 (2 bits).
448 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
449 lea r2, [dispatch_tab%1]
450 mov r2, [r2 + (%2q-2)*gprsize]
456 %endmacro ; FFT_DISPATCH
; Build the list of kernel entry points (fft4, fft8, fft16, then the larger
; fft<n> combine steps) and emit the dispatch table for one cpu flavor.
; NOTE(review): most of the macro body — loops, %if guards, %endmacro — is
; elided in this listing.
458 %macro DECL_FFT 2-3 ; nbits, cpu, suffix
459 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
461 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
468 %xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL
; pointer adjustments between the half/quarter-size recursions; n2 is
; presumably n/2 (its definition is elided) — TODO confirm vs. full source
473 add r0, n*4 - (n&(-2<<%1))
475 add r0, n*2 - (n2&(-2<<%1))
477 sub r0, n*6 + (n2&(-2<<%1))
487 dispatch_tab%3%2: pointer list_of_fft
491 ; On x86_32, this function does the register saving and restoring for all of fft.
492 ; The others pass args in registers and don't spill anything.
; Public entry point (still inside DECL_FFT — hence the %3%2 name suffix):
; cglobal with 2 args (z, nbits), 5 GPRs, 8 XMM registers.
493 cglobal fft_dispatch%3%2, 2,5,8, z, nbits
494 FFT_DISPATCH %3%2, nbits
; Instantiate the dispatchers per cpu flavor; the plain (non-interleave)
; invocations are elided in this listing.
499 DECL_FFT 5, _sse, _interleave
501 DECL_FFT 4, _3dn, _interleave
503 DECL_FFT 4, _3dn2, _interleave
; IMDCT pre-rotation helper: reads input samples from both ends of the buffer
; (%2 counts forward, %1 is the negative/backward index) and loads the
; matching cos/sin twiddles into xmm4/xmm5.
; NOTE(review): xmm2 used on line 517 is produced by an elided line
; (presumably a copy of xmm0 — confirm); further lines incl. %endmacro are
; also elided in this listing.
512 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
513 movaps xmm0, [%3+%2*4]
514 movaps xmm1, [%3+%1*4-0x10]
516 shufps xmm0, xmm1, 0x88 ; lanes 0,2 of each source
517 shufps xmm1, xmm2, 0x77 ; lanes 3,1 of each source
518 movlps xmm4, [%4+%2*2] ; cos, forward half
519 movlps xmm5, [%5+%2*2+0x0] ; sin, forward half
520 movhps xmm4, [%4+%1*2-0x8] ; cos, backward half
521 movhps xmm5, [%5+%1*2-0x8] ; sin, backward half
; Complex multiply helper for the post-rotation: the two visible loads pull
; 8 floats from %4 (the z buffer, when invoked from POSROTATESHUF below)
; into xmm6 and %2.
; NOTE(review): the actual multiply/shuffle body and %endmacro are elided in
; this listing.
535 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
536 movaps xmm6, [%4+%1*2]
537 movaps %2, [%4+%1*2+0x10]
; IMDCT post-rotation: complex-multiply two blocks of FFT output by the
; cos/sin twiddles (via CMUL), reverse the lane order of one register of each
; pair (shufps imm 0x1b = lanes 3,2,1,0), and store back to the z buffer from
; both ends (%1 backward, %2 forward).
; NOTE(review): the shuffles producing the xmm2/xmm6 values stored below, the
; loop control and %endmacro are elided in this listing.
548 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
550 CMUL %1, xmm0, xmm1, %3, %4, %5
551 CMUL %2, xmm4, xmm5, %3, %4, %5
552 shufps xmm1, xmm1, 0x1b ; reverse lanes
553 shufps xmm5, xmm5, 0x1b ; reverse lanes
560 movaps [%3+%2*2], xmm6
561 movaps [%3+%2*2+0x10], xmm4
562 movaps [%3+%1*2], xmm0
563 movaps [%3+%1*2+0x10], xmm2
; imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
; 3 args, 7 GPRs, 8 XMMs (per the cglobal declaration). Pre-rotates the
; input while scattering it through s->revtab, runs the FFT via
; FFT_DISPATCH, then post-rotates the result in place.
; NOTE(review): large parts of the body — loop control, the %if guards that
; select between the two scatter paths, register setup (rtcos/rtsin/rrevtab
; name bindings, the save of s into r5) — are elided in this listing.
569 cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
582 mov r3d, [r0+FFTContext.mdctsize]
585 mov rtcos, [r0+FFTContext.tcos]
586 mov rtsin, [r0+FFTContext.tsin]
594 mov rrevtab, [r0+FFTContext.revtab]
; pre-rotate four complex values from both ends of the input
614 PREROTATER r4, r3, r2, rtcos, rtsin
; revtab holds 16-bit indices; each output slot is 8 bytes (one complex)
616 movzx r5, word [rrevtab+r4-4]
617 movzx r6, word [rrevtab+r4-2]
618 movzx r13, word [rrevtab+r3]
619 movzx r14, word [rrevtab+r3+2]
620 movlps [r1+r5 *8], xmm0
621 movhps [r1+r6 *8], xmm0
622 movlps [r1+r13*8], xmm1
623 movhps [r1+r14*8], xmm1
; alternate scatter path with the revtab base in r6; the r13/r14 path above
; needs x86_64, so this is presumably the x86_32 variant (the selecting
; directives are elided) — TODO confirm against the full source
627 movzx r5, word [r6+r4-4]
628 movzx r4, word [r6+r4-2]
629 movlps [r1+r5*8], xmm0
630 movhps [r1+r4*8], xmm0
631 movzx r5, word [r6+r3]
632 movzx r4, word [r6+r3+2]
633 movlps [r1+r5*8], xmm1
634 movhps [r1+r4*8], xmm1
; run the FFT over the pre-rotated, bit-reversal-scattered data;
; r5 holds the FFTContext pointer here (the move into r5 is elided)
642 mov r1d, [r5+FFTContext.nbits]
644 FFT_DISPATCH _sse, r1
646 mov r0d, [r5+FFTContext.mdctsize]
658 POSROTATESHUF r0, r1, r6, rtcos, rtsin