1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt
4 ;* Copyright (c) 2011 Vitor Sessak
6 ;* This algorithm (though not any of the implementation details) is
7 ;* based on libdjbfft by D. J. Bernstein.
9 ;* This file is part of Libav.
11 ;* Libav is free software; you can redistribute it and/or
12 ;* modify it under the terms of the GNU Lesser General Public
13 ;* License as published by the Free Software Foundation; either
14 ;* version 2.1 of the License, or (at your option) any later version.
16 ;* Libav is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 ;* Lesser General Public License for more details.
21 ;* You should have received a copy of the GNU Lesser General Public
22 ;* License along with Libav; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 ;******************************************************************************
26 ; These functions are not individually interchangeable with the C versions.
27 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
28 ; in blocks as convenient to the vector size.
29 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
; Float constants: sqrt(1/2) and cos(pi/8), cos(3*pi/8) (radix-16 twiddles).
52 %define M_SQRT1_2 0.70710678118654752440
53 %define M_COS_PI_1_8 0.923879532511287
54 %define M_COS_PI_3_8 0.38268343236509
; Twiddle tables for the 16-point pass.  8 dwords each so a single 256-bit
; AVX load fills both lanes; ps_cos16_2's upper lane is the negated lower lane.
57 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
58 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
; sqrt(1/2) broadcast, and a +/- patterned variant ("mppm" = minus,plus,plus,minus).
60 ps_root2: times 8 dd M_SQRT1_2
61 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; Sign-bit mask: 1<<31 set in lane 2 of each 4-lane group; XORing a float with
; it negates that lane (used via xorps in the fft4 core below).
62 ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
; Per-lane selectors consumed by vpermilps in T8_AVX.
64 perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
65 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
; Multiplicative constants for T8_AVX: low lane {1,1,-1,1}, high lane sqrt(1/2).
66 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
; Sign-bit mask; sign flipped in lanes 0,1,3,5,6,7 (matches the label name),
; applied with vxorps in T8_AVX.
67 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
; 3DNow! butterflies on packed {re,im} pairs (pfadd/pfsub = 3DNow packed
; single add/sub).  NOTE(review): excerpt — the T2_3DN body and parts of
; T4_3DN are elided here; documented from the visible lines only.
; T2_3DN: 2-point butterfly; loads operands from %3/%4 (memory).
90 %macro T2_3DN 4 ; z0, z1, mem0, mem1
; T4_3DN: 4-point transform; %1-%4 in/out registers, %5/%6 scratch.
97 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
100 pfadd %5, %4 ; {t6,t5}
; XOR with sign mask negates one half of the pair (ps_m1p1 defined outside
; this excerpt).
101 pxor %3, [ps_m1p1] ; {t8,t7}
; Final butterfly: sum/difference pairs produce the four complex outputs.
104 pfadd %1, %5 ; {r0,i0}
105 pfsub %6, %5 ; {r2,i2}
107 pfadd %2, %3 ; {r1,i1}
108 pfsub %4, %3 ; {r3,i3}
; 8-point transform core for AVX: takes interleaved {re,im} input in two ymm
; registers and leaves de-interleaved real/imag halves.  %3-%5 are scratch.
; NOTE(review): excerpt — the %macro header line is elided from this view.
112 ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
113 ; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
115 ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
116 ; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
118 vsubps %5, %1, %2 ; v = %1 - %2
119 vaddps %3, %1, %2 ; w = %1 + %2
; Scale the difference: lanes 0-3 by {1,1,-1,1}, lanes 4-7 by sqrt(1/2).
120 vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
121 vpermilps %2, %2, [perm1]
; Reassemble v/w lanes into the operand order the next butterfly needs.
122 vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
123 vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
124 vsubps %4, %5, %1 ; s = r - q
125 vaddps %1, %5, %1 ; u = r + q
126 vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
127 vshufps %5, %4, %1, 0xbb
128 vshufps %3, %4, %1, 0xee
129 vperm2f128 %3, %3, %5, 0x13
; XOR with sign-bit mask flips signs without a multiply.
130 vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
131 vshufps %2, %1, %4, 0xdd
132 vshufps %1, %1, %4, 0x88
; Cross-lane swaps: vperm2f128 moves whole 128-bit halves between lanes.
133 vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
134 vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
136 vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
137 vsubps %2, %4, %1 ; %2 = v - w
138 vaddps %1, %4, %1 ; %1 = v + w
; 4-point transform core.  In SSE it does one fft4 on one xmm pair; the same
; instruction sequence under AVX does two independent fft4s (one per 128-bit
; lane).  %3 is scratch.  NOTE(review): excerpt — the %macro header is elided.
141 ; In SSE mode do one fft4 transform
142 ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
143 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
145 ; In AVX mode do two fft4 transforms
146 ; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
147 ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
148 subps %3, %1, %2 ; {t3,t4,-t8,t7}
149 addps %1, %1, %2 ; {t1,t2,t6,t5}
; Flip the sign of lane 2 (-t8 -> t8) via the ps_p1p1m1p1 sign-bit mask.
151 xorps %3, %3, [ps_p1p1m1p1]
152 shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
153 shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
; Second butterfly stage, then de-interleave re/im with 0x88/0xdd shuffles.
154 subps %3, %1, %2 ; {r2,i2,r3,i3}
155 addps %1, %1, %2 ; {r0,i0,r1,i1}
156 shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
157 shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
; 8-point transform core (SSE: one FFT8; AVX: two FFT8s, one per lane).
; %5/%6 are scratch.  NOTE(review): excerpt — the %macro header is elided.
160 ; In SSE mode do one FFT8
161 ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; (fixed: the second output pair lands in %3/%4 — see the subps into %3/%4
; at the end of this macro — not in %1/%2 as previously stated)
162 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
164 ; In AVX mode do two FFT8
165 ; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
166 ; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
167 ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
168 ; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
170 addps %6, %3, %4 ; {t1,t2,t3,t4}
171 subps %3, %3, %4 ; {r5,i5,r7,i7}
; 0xb1 swaps re/im within each pair; combined with the +/- sqrt(1/2)
; patterns below this performs the odd-element twiddle multiply.
172 shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
173 mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
174 mulps %4, %4, [ps_root2]
175 addps %3, %3, %4 ; {t8,t7,ta,t9}
176 shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
177 shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
178 subps %3, %6, %4 ; {t6,t5,tc,tb}
179 addps %6, %6, %4 ; {t1,t2,t9,ta}
180 shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
181 shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
; Final butterfly against the even half: sums in %1/%2, differences in %3/%4.
182 subps %3, %1, %6 ; {r4,r5,r6,r7}
183 addps %1, %1, %6 ; {r0,r1,r2,r3}
184 subps %4, %2, %5 ; {i4,i5,i6,i7}
185 addps %2, %2, %5 ; {i0,i1,i2,i3}
; Radix-4 FFT pass for small (register-resident) sizes.  One complex twiddle
; multiply of the z2/z3 quarters (wre/wim = %2/%3), then the cross butterfly
; against z0/z1 loaded via the Z() addressing macro.  Instruction order is
; deliberate scheduling — do not reorder.
; NOTE(review): excerpt — interleaved load/store lines are elided here.
188 ; scheduled for cpu-bound sizes
189 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
; Complex multiply: (r2+i*i2)*(wre+i*wim) and (r3+i*i3)*conj-style pairing.
194 mulps m2, m4, m0 ; r2*wre
196 mulps m3, m5, m1 ; i2*wim
198 mulps m4, m4, m1 ; r2*wim
199 mulps m5, m5, m0 ; i2*wre
200 addps m2, m2, m3 ; r2*wre + i2*wim
201 mulps m3, m1, m7 ; i3*wim
202 subps m5, m5, m4 ; i2*wre - r2*wim
203 mulps m1, m1, m6 ; r3*wim
204 mulps m4, m0, m6 ; r3*wre
205 mulps m0, m0, m7 ; i3*wre
206 subps m4, m4, m3 ; r3*wre - i3*wim
208 addps m0, m0, m1 ; i3*wre + r3*wim
; Butterfly of the twiddled products with the untouched z0/z1 halves (Z(n)
; expands to the current memory-addressing scheme).
209 subps m1, m4, m2 ; t3
210 addps m4, m4, m2 ; t5
211 subps m3, m3, m4 ; r2
212 addps m4, m4, Z(0) ; r0
216 subps m3, m5, m0 ; t4
217 subps m4, m6, m3 ; r3
218 addps m3, m3, m6 ; r1
222 addps m3, m5, m0 ; t6
223 subps m2, m2, m1 ; i3
225 addps m1, m1, Z(3) ; i1
228 subps m4, m7, m3 ; i2
229 addps m3, m3, m7 ; i0
; Radix-4 FFT pass for large (memory-resident) sizes.  Same math as
; PASS_SMALL but with loads/stores interleaved and scheduled so stores do
; not alias pending loads; %1 selects interleaved vs split-radix output via
; the INTERL macro (bound to INTERL_AVX or INTERL_SSE at expansion time).
; NOTE(review): excerpt — the loop control and several loads are elided.
234 ; scheduled to avoid store->load aliasing
235 %macro PASS_BIG 1 ; (!interleave)
; wq points at the twiddle table; [wq+o1q] is the imaginary part.
239 mova m1, [wq+o1q] ; wim
; Complex twiddle multiply of the z2/z3 quarters (same scheme as PASS_SMALL).
240 mulps m2, m4, m0 ; r2*wre
242 mulps m3, m5, m1 ; i2*wim
244 mulps m4, m4, m1 ; r2*wim
245 mulps m5, m5, m0 ; i2*wre
246 addps m2, m2, m3 ; r2*wre + i2*wim
247 mulps m3, m1, m7 ; i3*wim
248 mulps m1, m1, m6 ; r3*wim
249 subps m5, m5, m4 ; i2*wre - r2*wim
250 mulps m4, m0, m6 ; r3*wre
251 mulps m0, m0, m7 ; i3*wre
252 subps m4, m4, m3 ; r3*wre - i3*wim
254 addps m0, m0, m1 ; i3*wre + r3*wim
; Butterfly against the z0/z1 halves.
255 subps m1, m4, m2 ; t3
256 addps m4, m4, m2 ; t5
257 subps m3, m3, m4 ; r2
258 addps m4, m4, Z(0) ; r0
262 subps m3, m5, m0 ; t4
263 subps m4, m6, m3 ; r3
264 addps m3, m3, m6 ; r1
268 addps m5, m5, m0 ; t6
269 subps m2, m2, m1 ; i3
271 addps m1, m1, Z(3) ; i1
274 subps m6, m7, m5 ; i2
275 addps m5, m5, m7 ; i0
; Store the four result quarters; INTERL re-interleaves re/im when %1 == 0.
279 INTERL m1, m3, m7, Z, 2
280 INTERL m2, m4, m0, Z2, 6
285 INTERL m5, m1, m3, Z, 0
286 INTERL m6, m2, m7, Z, 4
; Addressing scheme for the small, register-blocked FFTs: the buffer is a
; flat array at r0, mmsize bytes per slot; ZH(x) is the upper half of slot x.
296 %define Z(x) [r0+mmsize*x]
297 %define Z2(x) [r0+mmsize*x]
298 %define ZH(x) [r0+mmsize*x+mmsize/2]
; NOTE(review): excerpt — these are fragments of the fft8_avx and fft16_avx
; bodies; loads, labels and several shuffles between the lines below are
; elided from this view.
307 T8_AVX m0, m1, m2, m3, m4
321 T8_AVX m0, m1, m4, m5, m7
; Radix-16 twiddle multiply using the replicated cos tables.
323 mova m4, [ps_cos16_1]
324 mova m5, [ps_cos16_2]
; Cross-lane recombination of the two 8-point halves.
331 vblendps m2, m7, m3, 0xf0
332 vperm2f128 m3, m7, m3, 0x21
335 vperm2f128 m2, m2, m2, 0x01
; Write out: split each ymm into its two xmm lanes across the Z/ZH slots.
340 vextractf128 Z(0), m0, 0
341 vextractf128 ZH(0), m1, 0
342 vextractf128 Z(1), m0, 1
343 vextractf128 ZH(1), m1, 1
344 vextractf128 Z(2), m5, 0
345 vextractf128 ZH(2), m3, 0
346 vextractf128 Z(3), m5, 1
347 vextractf128 ZH(3), m3, 1
; NOTE(review): excerpt — fragments of fft32_avx, fft32_interleave_avx and
; the SSE fft16; surrounding loads/stores and labels are elided.
; Two FFT8s (one per AVX lane) on the upper half of the 32-point input.
362 T8_SSE m0, m1, m2, m3, m4, m6
363 ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
364 ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
; Regroup lanes so each register holds one contiguous 8-element quarter.
366 vperm2f128 m4, m0, m2, 0x20
367 vperm2f128 m5, m1, m3, 0x20
368 vperm2f128 m6, m0, m2, 0x31
369 vperm2f128 m7, m1, m3, 0x31
; Final radix pass with the 32-point twiddle table (cos_32, defined outside
; this excerpt; +32 bytes = imaginary half).
371 PASS_SMALL 0, [cos_32], [cos_32+32]
375 fft32_interleave_avx:
383 vextractf128 Z(0), m0, 0
384 vextractf128 ZH(0), m1, 0
385 vextractf128 Z(1), m0, 1
386 vextractf128 ZH(1), m1, 1
; SSE path reuses mova-style moves for integer data.
394 %define movdqa movaps
413 T8_SSE m0, m1, m2, m3, m4, m5
427 T8_SSE m0, m1, m2, m3, m4, m5
; 16-point final pass (cos_16 table defined outside this excerpt).
438 PASS_SMALL 0, [cos_16], [cos_16+16]
; NOTE(review): excerpt — fragments of the 3DNow! fft4/fft8 bodies; loads,
; stores and labels between these macro invocations are elided.
447 T2_3DN m0, m1, Z(0), Z(1)
450 T4_3DN m0, m1, m2, m3, m4, m5
461 T2_3DN m0, m1, Z(0), Z(1)
464 T4_3DN m0, m1, m2, m3, m4, m5
467 T2_3DN m4, m5, Z(4), Z(5)
468 T2_3DN m6, m7, Z2(6), Z2(7)
477 T4_3DN m1, m3, m5, m7, m0, m2
482 T4_3DN m0, m2, m4, m6, m5, m7
; Addressing scheme for the big passes: z/o1/o3 registers (bound by
; DEFINE_ARGS in DECL_PASS below) give the four quarter offsets; x&6 selects
; the quarter, x&1 the vector within it.
514 %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
515 %define Z2(x) [zq + o3q + mmsize*(x&1)]
516 %define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
517 %define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2]
; DECL_PASS wraps a pass payload (a PASS_BIG/PASS_SMALL expansion) in a
; named, callable routine.  NOTE(review): excerpt — loop setup and the
; %endmacro lines are elided throughout this section.
519 %macro DECL_PASS 2+ ; name, payload
; Argument registers for the pass routines: z = data, w = twiddles,
; n = count, o1/o3 = quarter offsets.
522 DEFINE_ARGS z, w, n, o1, o3
; INTERL_AVX fragment: store a re/im register pair as two 128-bit halves
; into consecutive Z/ZH slots.
541 vextractf128 %4(%5), %2, 0
542 vextractf128 %4 %+ H(%5), %3, 0
543 vextractf128 %4(%5 + 1), %2, 1
544 vextractf128 %4 %+ H(%5 + 1), %3, 1
547 %define INTERL INTERL_AVX
549 DECL_PASS pass_avx, PASS_BIG 1
550 DECL_PASS pass_interleave_avx, PASS_BIG 0
563 %define INTERL INTERL_SSE
565 DECL_PASS pass_sse, PASS_BIG 1
566 DECL_PASS pass_interleave_sse, PASS_BIG 0
; 3DNow! has no unpck*ps; map them onto the MMX dword unpacks.
572 %define unpcklps punpckldq
573 %define unpckhps punpckhdq
574 DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
575 DECL_PASS pass_interleave_3dn, PASS_BIG 0
; 3DNow!ext reuses the plain 3DNow! passes.
576 %define pass_3dn2 pass_3dn
577 %define pass_interleave_3dn2 pass_interleave_3dn
; Dispatch tables store section-relative offsets ("- $$") so they need no
; load-time relocation.
580 %define SECTION_REL - $$
; Tail-dispatch into the size-specific fft via the table; index is
; nbits - 2 (smallest entry is fft4).
585 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
586 lea r2, [dispatch_tab%1]
587 mov r2, [r2 + (%2q-2)*gprsize]
593 %endmacro ; FFT_DISPATCH
; DECL_FFT emits the fft bodies for sizes 4..2^%1 for one cpu flavor (%2)
; plus an optional suffix (%3, e.g. _interleave), and its dispatch table.
595 %macro DECL_FFT 2-3 ; nbits, cpu, suffix
596 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
598 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
601 %xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL
608 %xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL
; Pointer walk between the recursive half/quarter sub-transforms; the mask
; (-2<<%1) rounds against the smallest dispatched size.
613 add r0, n*4 - (n&(-2<<%1))
615 add r0, n*2 - (n2&(-2<<%1))
617 sub r0, n*6 + (n2&(-2<<%1))
627 dispatch_tab%3%2: pointer list_of_fft
631 ; On x86_32, this function does the register saving and restoring for all of fft.
632 ; The others pass args in registers and don't spill anything.
633 cglobal fft_dispatch%3%2, 2,5,8, z, nbits
634 FFT_DISPATCH %3%2, nbits
; Instantiate: AVX up to 2^6-per-entry, SSE to 2^5, 3DNow!/3DNow!ext to 2^4.
644 DECL_FFT 6, _avx, _interleave
648 DECL_FFT 5, _sse, _interleave
651 DECL_FFT 4, _3dn, _interleave
653 DECL_FFT 4, _3dn2, _interleave
; IMDCT helpers.  NOTE(review): excerpt — several arithmetic/store lines and
; the %endmacro lines are elided from each macro below.
; PREROTATER: pre-rotation for imdct_half — gathers input samples from both
; ends of the buffer (%2 walks forward, %1 backward) and the matching
; tcos/tsin twiddles.
662 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
663 movaps xmm0, [%3+%2*4]
664 movaps xmm1, [%3+%1*4-0x10]
; 0x88/0x77 split even/odd dwords of the two loads.
666 shufps xmm0, xmm1, 0x88
667 shufps xmm1, xmm2, 0x77
; Twiddles are pairs: low qword from the forward index, high qword from the
; backward index.
668 movlps xmm4, [%4+%2*2]
669 movlps xmm5, [%5+%2*2+0x0]
670 movhps xmm4, [%4+%1*2-0x8]
671 movhps xmm5, [%5+%1*2-0x8]
; CMUL: packed complex multiply of (%2 + i*%3) by the twiddles at
; [%5+%1]/[%6+%1]; clobbers m6/m7 as scratch.
685 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
686 mulps m6, %3, [%5+%1]
687 mulps m7, %2, [%5+%1]
688 mulps %2, %2, [%6+%1]
689 mulps %3, %3, [%6+%1]
; POSROTATESHUF_AVX: post-rotation + output shuffle, AVX flavor.  Processes
; two 32-byte blocks from each end of z per iteration, reverses the
; backward block (0x1b shuffle + lane swap) and stores interleaved results.
694 %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
696 vmovaps ymm1, [%3+%1*2]
697 vmovaps ymm0, [%3+%1*2+0x20]
698 vmovaps ymm3, [%3+%2*2]
699 vmovaps ymm2, [%3+%2*2+0x20]
701 CMUL %1, ymm0, ymm1, %3, %4, %5
702 CMUL %2, ymm2, ymm3, %3, %4, %5
; 0x1b reverses the 4 floats within each lane; vperm2f128 0x01 swaps the
; lanes, giving a full 8-element reversal.
703 vshufps ymm1, ymm1, ymm1, 0x1b
704 vshufps ymm3, ymm3, ymm3, 0x1b
705 vperm2f128 ymm1, ymm1, ymm1, 0x01
706 vperm2f128 ymm3, ymm3, ymm3, 0x01
; Interleave forward and reversed-backward halves.
707 vunpcklps ymm6, ymm2, ymm1
708 vunpckhps ymm4, ymm2, ymm1
709 vunpcklps ymm7, ymm0, ymm3
710 vunpckhps ymm5, ymm0, ymm3
; Store in-place as four 16-byte pieces per end.
712 vextractf128 [%3+%1*2], ymm7, 0
713 vextractf128 [%3+%1*2+0x10], ymm5, 0
714 vextractf128 [%3+%1*2+0x20], ymm7, 1
715 vextractf128 [%3+%1*2+0x30], ymm5, 1
717 vextractf128 [%3+%2*2], ymm6, 0
718 vextractf128 [%3+%2*2+0x10], ymm4, 0
719 vextractf128 [%3+%2*2+0x20], ymm6, 1
720 vextractf128 [%3+%2*2+0x30], ymm4, 1
; POSROTATESHUF: SSE flavor of the same post-rotation/shuffle.
726 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
728 movaps xmm1, [%3+%1*2]
729 movaps xmm0, [%3+%1*2+0x10]
730 CMUL %1, xmm0, xmm1, %3, %4, %5
731 movaps xmm5, [%3+%2*2]
732 movaps xmm4, [%3+%2*2+0x10]
733 CMUL %2, xmm4, xmm5, %3, %4, %5
; 0x1b = reverse the 4 floats of the backward-end blocks.
734 shufps xmm1, xmm1, 0x1b
735 shufps xmm5, xmm5, 0x1b
742 movaps [%3+%2*2], xmm6
743 movaps [%3+%2*2+0x10], xmm4
744 movaps [%3+%1*2], xmm0
745 movaps [%3+%1*2+0x10], xmm2
; imdct_half template (instantiated per cpu flavor by DECL_IMDCT, outside
; this excerpt): pre-rotate input into the output buffer in bit-reversed
; order, run the FFT, then post-rotate in place.
; NOTE(review): excerpt — loop control, stack handling and the x86_32/x86_64
; %if branches are elided; lines below are fragments of both paths.
752 cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
; Pull sizes/tables out of the FFTContext (struct offsets declared elsewhere).
765 mov r3d, [r0+FFTContext.mdctsize]
768 mov rtcos, [r0+FFTContext.tcos]
769 mov rtsin, [r0+FFTContext.tsin]
777 mov rrevtab, [r0+FFTContext.revtab]
797 PREROTATER r4, r3, r2, rtcos, rtsin
; x86_64 path: revtab entries are 16-bit indices; scatter the 4 rotated
; complex values to their bit-reversed output slots (8 bytes each).
799 movzx r5, word [rrevtab+r4-4]
800 movzx r6, word [rrevtab+r4-2]
801 movzx r13, word [rrevtab+r3]
802 movzx r14, word [rrevtab+r3+2]
803 movlps [r1+r5 *8], xmm0
804 movhps [r1+r6 *8], xmm0
805 movlps [r1+r13*8], xmm1
806 movhps [r1+r14*8], xmm1
; x86_32 path: same scatter, with the revtab pointer kept in r6.
810 movzx r5, word [r6+r4-4]
811 movzx r4, word [r6+r4-2]
812 movlps [r1+r5*8], xmm0
813 movhps [r1+r4*8], xmm0
814 movzx r5, word [r6+r3]
815 movzx r4, word [r6+r3+2]
816 movlps [r1+r5*8], xmm1
817 movhps [r1+r4*8], xmm1
; FFT stage: fetch nbits/mdctsize again for the dispatch call (elided).
825 mov r1d, [r5+FFTContext.nbits]
829 mov r0d, [r5+FFTContext.mdctsize]
; %2 is the POSROTATESHUF/POSROTATESHUF_AVX macro passed in by DECL_IMDCT.
841 %2 r0, r1, r6, rtcos, rtsin
849 %ifidn avx_enabled, 1
; Instantiations: SSE and AVX flavors of imdct_half.
855 DECL_IMDCT _sse, POSROTATESHUF
860 DECL_IMDCT _avx, POSROTATESHUF_AVX