1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt
4 ;* Copyright (c) 2011 Vitor Sessak
6 ;* This algorithm (though not any of the implementation details) is
7 ;* based on libdjbfft by D. J. Bernstein.
9 ;* This file is part of FFmpeg.
11 ;* FFmpeg is free software; you can redistribute it and/or
12 ;* modify it under the terms of the GNU Lesser General Public
13 ;* License as published by the Free Software Foundation; either
14 ;* version 2.1 of the License, or (at your option) any later version.
16 ;* FFmpeg is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 ;* Lesser General Public License for more details.
21 ;* You should have received a copy of the GNU Lesser General Public
22 ;* License along with FFmpeg; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 ;******************************************************************************
26 ; These functions are not individually interchangeable with the C versions.
27 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
28 ; in blocks as convenient to the vector size.
29 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
31 %include "libavutil/x86/x86inc.asm"
; Twiddle-factor constants and lane-mask tables shared by the FFT kernels.
; Entries written as 1<<31 are IEEE-754 sign-bit masks: xorps/pxor against
; them negates that float lane, while 0 entries leave the lane unchanged.
52 %define M_SQRT1_2 0.70710678118654752440
53 %define M_COS_PI_1_8 0.923879532511287
54 %define M_COS_PI_3_8 0.38268343236509
; cos/sin twiddles for the 16-point pass, replicated to fill 8 dwords
57 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
58 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
60 ps_root2: times 8 dd M_SQRT1_2
; alternating +/- sqrt(1/2) pattern, multiplied in by the fft8 kernel (T8_SSE)
61 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; sign mask: flips lanes 2 and 6 when xor'ed (used by T4_SSE)
62 ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
; vpermilps lane-selection control vectors used by the AVX fft8 (T8_AVX)
64 perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
65 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
66 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
; sign mask: negates lanes 0,1,3,5,6,7 (used by T8_AVX, see "s *=" comment there)
67 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
; 3DNow! radix-2 / radix-4 butterfly helpers. pfadd/pfsub operate on one
; packed pair of floats per mmx register, i.e. one {re,im} complex value.
; NOTE(review): the bodies of these macros are partially elided in this
; excerpt (the T2_3DN body and several T4_3DN lines are missing); only the
; visible lines are annotated below.
90 %macro T2_3DN 4 ; z0, z1, mem0, mem1
97 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
100 pfadd %5, %4 ; {t6,t5}
101 pxor %3, [ps_m1p1] ; {t8,t7}  (sign-mask xor negates one lane)
104 pfadd %1, %5 ; {r0,i0}
105 pfsub %6, %5 ; {r2,i2}
107 pfadd %2, %3 ; {r1,i1}
108 pfsub %4, %3 ; {r3,i3}
; One 8-point FFT computed across a pair of ymm registers (AVX 3-operand
; forms). NOTE(review): the %macro header line is elided in this excerpt;
; the body is invoked below as "T8_AVX m0, m1, ..." (5 arguments).
112 ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
113 ; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
115 ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
116 ; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
118 vsubps %5, %1, %2 ; v = %1 - %2
119 vaddps %3, %1, %2 ; w = %1 + %2
120 vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
121 vpermilps %2, %2, [perm1]
122 vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
123 vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
124 vsubps %4, %5, %1 ; s = r - q
125 vaddps %1, %5, %1 ; u = r + q
126 vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
127 vshufps %5, %4, %1, 0xbb
128 vshufps %3, %4, %1, 0xee
129 vperm2f128 %3, %3, %5, 0x13 ; cross 128-bit-lane shuffle of the halves
130 vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
131 vshufps %2, %1, %4, 0xdd
132 vshufps %1, %1, %4, 0x88
133 vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
134 vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
136 vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
137 vsubps %2, %4, %1 ; %2 = v - w
138 vaddps %1, %4, %1 ; %1 = v + w
; fft4 kernel. NOTE(review): the %macro header line is elided in this
; excerpt; the visible body below is self-contained.
141 ; In SSE mode do one fft4 transform
142 ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
143 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
145 ; In AVX mode do two fft4 transforms
146 ; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
147 ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
149 subps %3, %1, %2 ; {t3,t4,-t8,t7}
150 addps %1, %1, %2 ; {t1,t2,t6,t5}
151 xorps %3, %3, [ps_p1p1m1p1] ; negate lane 2 (and 6) via sign-bit mask: -t8 -> t8
152 shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
153 shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
154 subps %3, %1, %2 ; {r2,i2,r3,i3}
155 addps %1, %1, %2 ; {r0,i0,r1,i1}
156 shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}  (deinterleave imag parts)
157 shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}  (deinterleave real parts)
; fft8 kernel. NOTE(review): the %macro header line is elided in this
; excerpt. The original out-comment repeated %1/%2 for the second register
; pair; the code below writes the r4..r7 / i4..i7 results into %3 / %4
; (see the last four instructions), so the comment is corrected to %3/%4.
160 ; In SSE mode do one FFT8
161 ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
162 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
164 ; In AVX mode do two FFT8
165 ; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
166 ; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
167 ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
168 ; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
170 addps %6, %3, %4 ; {t1,t2,t3,t4}
171 subps %3, %3, %4 ; {r5,i5,r7,i7}
172 shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}  (swap re/im within each pair)
173 mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
174 mulps %4, %4, [ps_root2] ; scale swapped pair by sqrt(1/2)
175 addps %3, %3, %4 ; {t8,t7,ta,t9}
176 shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
177 shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
178 subps %3, %6, %4 ; {t6,t5,tc,tb}
179 addps %6, %6, %4 ; {t1,t2,t9,ta}
180 shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
181 shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
182 subps %3, %1, %6 ; {r4,r5,r6,r7}
183 addps %1, %1, %6 ; {r0,r1,r2,r3}
184 subps %4, %2, %5 ; {i4,i5,i6,i7}
185 addps %2, %2, %5 ; {i0,i1,i2,i3}
; One butterfly pass combining two half-size FFTs using twiddle factors.
; %2/%3 (wre/wim) are the real/imaginary twiddle tables; Z(x) addresses
; the in-place work buffer (defined elsewhere in the file). NOTE(review):
; several body lines (register loads/stores between the visible ones) are
; elided in this excerpt.
188 ; scheduled for cpu-bound sizes
189 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
; complex multiply of the upper-half inputs by the twiddles:
; (r+ji)*(wre+j*wim) computed as cross products, then combined
194 mulps m2, m4, m0 ; r2*wre
196 mulps m3, m5, m1 ; i2*wim
198 mulps m4, m4, m1 ; r2*wim
199 mulps m5, m5, m0 ; i2*wre
200 addps m2, m2, m3 ; r2*wre + i2*wim
201 mulps m3, m1, m7 ; i3*wim
202 subps m5, m5, m4 ; i2*wre - r2*wim
203 mulps m1, m1, m6 ; r3*wim
204 mulps m4, m0, m6 ; r3*wre
205 mulps m0, m0, m7 ; i3*wre
206 subps m4, m4, m3 ; r3*wre - i3*wim
208 addps m0, m0, m1 ; i3*wre + r3*wim
; butterfly: add/sub the rotated halves against the lower-half data
209 subps m1, m4, m2 ; t3
210 addps m4, m4, m2 ; t5
211 subps m3, m3, m4 ; r2
212 addps m4, m4, Z(0) ; r0
216 subps m3, m5, m0 ; t4
217 subps m4, m6, m3 ; r3
218 addps m3, m3, m6 ; r1
222 addps m3, m5, m0 ; t6
223 subps m2, m2, m1 ; i3
225 addps m1, m1, Z(3) ; i1
228 subps m4, m7, m3 ; i2
229 addps m3, m3, m7 ; i0
; Same butterfly pass as PASS_SMALL but for memory-bound sizes; twiddles
; are loaded from [wq]/[wq+o1q] inside the loop and the instruction order
; is scheduled so stores do not alias subsequent loads. %1 selects whether
; the results are written deinterleaved (via the INTERL macro) or not.
; NOTE(review): several body lines are elided in this excerpt.
234 ; scheduled to avoid store->load aliasing
235 %macro PASS_BIG 1 ; (!interleave)
239 mova m1, [wq+o1q] ; wim
; complex multiply of the upper-half inputs by the twiddles
240 mulps m2, m4, m0 ; r2*wre
242 mulps m3, m5, m1 ; i2*wim
244 mulps m4, m4, m1 ; r2*wim
245 mulps m5, m5, m0 ; i2*wre
246 addps m2, m2, m3 ; r2*wre + i2*wim
247 mulps m3, m1, m7 ; i3*wim
248 mulps m1, m1, m6 ; r3*wim
249 subps m5, m5, m4 ; i2*wre - r2*wim
250 mulps m4, m0, m6 ; r3*wre
251 mulps m0, m0, m7 ; i3*wre
252 subps m4, m4, m3 ; r3*wre - i3*wim
254 addps m0, m0, m1 ; i3*wre + r3*wim
; butterfly against the lower-half data held in Z(0)/Z(3) etc.
255 subps m1, m4, m2 ; t3
256 addps m4, m4, m2 ; t5
257 subps m3, m3, m4 ; r2
258 addps m4, m4, Z(0) ; r0
262 subps m3, m5, m0 ; t4
263 subps m4, m6, m3 ; r3
264 addps m3, m3, m6 ; r1
268 addps m5, m5, m0 ; t6
269 subps m2, m2, m1 ; i3
271 addps m1, m1, Z(3) ; i1
274 subps m6, m7, m5 ; i2
275 addps m5, m5, m7 ; i0
; store results; INTERL is bound to INTERL_AVX/INTERL_SSE elsewhere and
; optionally interleaves re/im back into FFTComplex order
279 INTERL m1, m3, m7, Z, 2
280 INTERL m2, m4, m0, Z2, 6
285 INTERL m5, m1, m3, Z, 0
286 INTERL m6, m2, m7, Z, 4
; Small fixed-size FFT entry points (fft8/fft16/fft32 for AVX, SSE and
; 3DNow!). Z/Z2/ZH address the caller's in-place buffer at r0; the ZH
; variants address the upper half of an mmsize-wide slot. NOTE(review):
; the function labels and many interleaving/store lines between the
; visible instructions are elided in this excerpt.
296 %define Z(x) [r0+mmsize*x]
297 %define Z2(x) [r0+mmsize*x]
298 %define ZH(x) [r0+mmsize*x+mmsize/2]
307 T8_AVX m0, m1, m2, m3, m4
321 T8_AVX m0, m1, m4, m5, m7
; apply the 16-point twiddle tables defined at the top of the file
323 mova m4, [ps_cos16_1]
324 mova m5, [ps_cos16_2]
331 vblendps m2, m7, m3, 0xf0
332 vperm2f128 m3, m7, m3, 0x21
335 vperm2f128 m2, m2, m2, 0x01 ; swap the two 128-bit lanes
; write results back as xmm-sized halves of the ymm registers
340 vextractf128 Z(0), m0, 0
341 vextractf128 ZH(0), m1, 0
342 vextractf128 Z(1), m0, 1
343 vextractf128 ZH(1), m1, 1
344 vextractf128 Z(2), m5, 0
345 vextractf128 ZH(2), m3, 0
346 vextractf128 Z(3), m5, 1
347 vextractf128 ZH(3), m3, 1
362 T8_SSE m0, m1, m2, m3, m4, m6
363 ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
364 ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
; regroup low/high 128-bit lanes before the combining pass
366 vperm2f128 m4, m0, m2, 0x20
367 vperm2f128 m5, m1, m3, 0x20
368 vperm2f128 m6, m0, m2, 0x31
369 vperm2f128 m7, m1, m3, 0x31
371 PASS_SMALL 0, [cos_32], [cos_32+32]
375 fft32_interleave_avx:
383 vextractf128 Z(0), m0, 0
384 vextractf128 ZH(0), m1, 0
385 vextractf128 Z(1), m0, 1
386 vextractf128 ZH(1), m1, 1
395 %define movdqa movaps ; SSE1-only build: alias movdqa to movaps
414 T8_SSE m0, m1, m2, m3, m4, m5
428 T8_SSE m0, m1, m2, m3, m4, m5
439 PASS_SMALL 0, [cos_16], [cos_16+16]
; 3DNow! variants built from the T2/T4 butterfly macros
448 T2_3DN m0, m1, Z(0), Z(1)
451 T4_3DN m0, m1, m2, m3, m4, m5
462 T2_3DN m0, m1, Z(0), Z(1)
465 T4_3DN m0, m1, m2, m3, m4, m5
468 T2_3DN m4, m5, Z(4), Z(5)
469 T2_3DN m6, m7, Z2(6), Z2(7)
478 T4_3DN m1, m3, m5, m7, m0, m2
483 T4_3DN m0, m2, m4, m6, m5, m7
; Addressing macros for the big-pass loop: zq is the data pointer, o1q/o3q
; are precomputed stride offsets (see DEFINE_ARGS below); x&6 selects the
; block, x&1 selects the low/high mmsize half, ZH/Z2H add half a register.
515 %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
516 %define Z2(x) [zq + o3q + mmsize*(x&1)]
517 %define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
518 %define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2]
; DECL_PASS instantiates one looping pass function around a payload macro
; (PASS_BIG/PASS_SMALL). NOTE(review): its loop body is elided here.
520 %macro DECL_PASS 2+ ; name, payload
523 DEFINE_ARGS z, w, n, o1, o3
; INTERL_AVX store helper: writes both xmm halves of a ymm pair to
; consecutive Z-slots (fragment; macro header elided)
542 vextractf128 %4(%5), %2, 0
543 vextractf128 %4 %+ H(%5), %3, 0
544 vextractf128 %4(%5 + 1), %2, 1
545 vextractf128 %4 %+ H(%5 + 1), %3, 1
548 %define INTERL INTERL_AVX
550 DECL_PASS pass_avx, PASS_BIG 1
551 DECL_PASS pass_interleave_avx, PASS_BIG 0
564 %define INTERL INTERL_SSE
566 DECL_PASS pass_sse, PASS_BIG 1
567 DECL_PASS pass_interleave_sse, PASS_BIG 0
; 3DNow! has no float unpack; reuse the integer dword unpacks instead
573 %define unpcklps punpckldq
574 %define unpckhps punpckhdq
575 DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
576 DECL_PASS pass_interleave_3dn, PASS_BIG 0
; 3DNow!ext uses the same pass bodies as plain 3DNow!
577 %define pass_3dn2 pass_3dn
578 %define pass_interleave_3dn2 pass_interleave_3dn
; Dispatch machinery: each FFT size gets an entry in a per-cpu-flavor
; pointer table; SECTION_REL stores entries relative to the section start
; ($$) so the table is position-independent.
581 %define SECTION_REL - $$
; Jump to the fft function for 2^nbits points; table is indexed by
; nbits-2 since the smallest entry is fft4. NOTE(review): the lines that
; convert the relative entry back to an absolute address are elided here.
586 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
587 lea r2, [dispatch_tab%1]
588 mov r2, [r2 + (%2q-2)*gprsize]
594 %endmacro ; FFT_DISPATCH
; DECL_FFT builds the list of size-specific entry points for one cpu
; flavor (%2) / suffix (%3) and emits its dispatch table plus the
; public fft_dispatch wrapper. NOTE(review): partially elided.
596 %macro DECL_FFT 2-3 ; nbits, cpu, suffix
597 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
599 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
602 %xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL
609 %xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL
; pointer bookkeeping between the recursive half-size calls
614 add r0, n*4 - (n&(-2<<%1))
616 add r0, n*2 - (n2&(-2<<%1))
618 sub r0, n*6 + (n2&(-2<<%1))
628 dispatch_tab%3%2: pointer list_of_fft
632 ; On x86_32, this function does the register saving and restoring for all of fft.
633 ; The others pass args in registers and don't spill anything.
634 cglobal fft_dispatch%3%2, 2,5,8, z, nbits
635 FFT_DISPATCH %3%2, nbits
; instantiate dispatchers: max nbits 6 (AVX), 5 (SSE), 4 (3DNow!/ext)
645 DECL_FFT 6, _avx, _interleave
649 DECL_FFT 5, _sse, _interleave
652 DECL_FFT 4, _3dn, _interleave
654 DECL_FFT 4, _3dn2, _interleave
; IMDCT helper macros. PREROTATER gathers input samples from both ends of
; the buffer and pairs them with tcos/tsin entries for the pre-rotation
; (complex multiply by the twiddle). NOTE(review): the combining arithmetic
; after the loads is elided in this excerpt.
663 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
664 movaps xmm0, [%3+%2*4]
665 movaps xmm1, [%3+%1*4-0x10]
667 shufps xmm0, xmm1, 0x88 ; gather even elements from both ends
668 shufps xmm1, xmm2, 0x77 ; gather odd elements, reversed order
669 movlps xmm4, [%4+%2*2]
670 movlps xmm5, [%5+%2*2+0x0]
671 movhps xmm4, [%4+%1*2-0x8]
672 movhps xmm5, [%5+%1*2-0x8]
; CMUL: cross-products of %2/%3 with the tcos (%5) and tsin (%6) tables —
; the first half of a complex multiply. NOTE(review): the add/sub lines
; that combine m6/m7 with %2/%3 are elided here.
686 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
687 mulps m6, %3, [%5+%1]
688 mulps m7, %2, [%5+%1]
689 mulps %2, %2, [%6+%1]
690 mulps %3, %3, [%6+%1]
; Post-rotation + shuffle, AVX version: rotate two ymm-loads from each end
; of the buffer, reverse one side (shufps 0x1b + lane swap), interleave,
; and store the xmm halves back. NOTE(review): loop control is elided.
695 %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
697 vmovaps ymm1, [%3+%1*2]
698 vmovaps ymm0, [%3+%1*2+0x20]
699 vmovaps ymm3, [%3+%2*2]
700 vmovaps ymm2, [%3+%2*2+0x20]
702 CMUL %1, ymm0, ymm1, %3, %4, %5
703 CMUL %2, ymm2, ymm3, %3, %4, %5
704 vshufps ymm1, ymm1, ymm1, 0x1b ; reverse elements within each lane
705 vshufps ymm3, ymm3, ymm3, 0x1b
706 vperm2f128 ymm1, ymm1, ymm1, 0x01 ; swap lanes -> full 8-element reverse
707 vperm2f128 ymm3, ymm3, ymm3, 0x01
708 vunpcklps ymm6, ymm2, ymm1
709 vunpckhps ymm4, ymm2, ymm1
710 vunpcklps ymm7, ymm0, ymm3
711 vunpckhps ymm5, ymm0, ymm3
713 vextractf128 [%3+%1*2], ymm7, 0
714 vextractf128 [%3+%1*2+0x10], ymm5, 0
715 vextractf128 [%3+%1*2+0x20], ymm7, 1
716 vextractf128 [%3+%1*2+0x30], ymm5, 1
718 vextractf128 [%3+%2*2], ymm6, 0
719 vextractf128 [%3+%2*2+0x10], ymm4, 0
720 vextractf128 [%3+%2*2+0x20], ymm6, 1
721 vextractf128 [%3+%2*2+0x30], ymm4, 1
; SSE version of the same post-rotation. NOTE(review): the interleaving
; unpck lines between the shuffles and the stores are elided here.
727 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
729 movaps xmm1, [%3+%1*2]
730 movaps xmm0, [%3+%1*2+0x10]
731 CMUL %1, xmm0, xmm1, %3, %4, %5
732 movaps xmm5, [%3+%2*2]
733 movaps xmm4, [%3+%2*2+0x10]
734 CMUL %2, xmm4, xmm5, %3, %4, %5
735 shufps xmm1, xmm1, 0x1b ; reverse 4 elements
736 shufps xmm5, xmm5, 0x1b
743 movaps [%3+%2*2], xmm6
744 movaps [%3+%2*2+0x10], xmm4
745 movaps [%3+%1*2], xmm0
746 movaps [%3+%1*2+0x10], xmm2
; imdct_half entry point, instantiated per cpu flavor by DECL_IMDCT (the
; enclosing %macro line is elided in this excerpt; %1 is the cpu suffix,
; %2 the post-rotation macro). Reads twiddle/permutation tables out of the
; FFTContext, pre-rotates the input, scatters it through the bit-reversal
; table, runs the FFT, then post-rotates in place.
753 cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
763 mov r3d, [r0+FFTContext.mdctsize]
766 mov rtcos, [r0+FFTContext.tcos]
767 mov rtsin, [r0+FFTContext.tsin]
775 mov rrevtab, [r0+FFTContext.revtab]
795 PREROTATER r4, r3, r2, rtcos, rtsin
; scatter the 4 pre-rotated complex values through revtab (16-bit indices,
; 8 bytes per FFTComplex) — x86_64 path with revtab pointer in rrevtab
797 movzx r5, word [rrevtab+r4-4]
798 movzx r6, word [rrevtab+r4-2]
799 movzx r10, word [rrevtab+r3]
800 movzx r11, word [rrevtab+r3+2]
801 movlps [r1+r5 *8], xmm0
802 movhps [r1+r6 *8], xmm0
803 movlps [r1+r10*8], xmm1
804 movhps [r1+r11*8], xmm1
; alternate (x86_32, presumably) path with the revtab pointer in r6
808 movzx r5, word [r6+r4-4]
809 movzx r4, word [r6+r4-2]
810 movlps [r1+r5*8], xmm0
811 movhps [r1+r4*8], xmm0
812 movzx r5, word [r6+r3]
813 movzx r4, word [r6+r3+2]
814 movlps [r1+r5*8], xmm1
815 movhps [r1+r4*8], xmm1
823 mov r1d, [r5+FFTContext.nbits]
827 mov r0d, [r5+FFTContext.mdctsize]
839 %2 r0, r1, r6, rtcos, rtsin ; post-rotation (POSROTATESHUF[_AVX])
843 %ifidn avx_enabled, 1
; instantiate the SSE and AVX imdct_half variants
849 DECL_IMDCT _sse, POSROTATESHUF
854 DECL_IMDCT _avx, POSROTATESHUF_AVX