1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt
4 ;* Copyright (c) 2011 Vitor Sessak
6 ;* This algorithm (though not any of the implementation details) is
7 ;* based on libdjbfft by D. J. Bernstein.
9 ;* This file is part of Libav.
11 ;* Libav is free software; you can redistribute it and/or
12 ;* modify it under the terms of the GNU Lesser General Public
13 ;* License as published by the Free Software Foundation; either
14 ;* version 2.1 of the License, or (at your option) any later version.
16 ;* Libav is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 ;* Lesser General Public License for more details.
21 ;* You should have received a copy of the GNU Lesser General Public
22 ;* License along with Libav; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 ;******************************************************************************
26 ; These functions are not individually interchangeable with the C versions.
27 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
28 ; in blocks as convenient to the vector size.
29 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
; Trigonometric constants used to build the tables below:
;   M_SQRT1_2    = 1/sqrt(2) = cos(pi/4)
;   M_COS_PI_1_8 = cos(pi/8)
;   M_COS_PI_3_8 = cos(3*pi/8)
52 %define M_SQRT1_2 0.70710678118654752440
53 %define M_COS_PI_1_8 0.923879532511287
54 %define M_COS_PI_3_8 0.38268343236509
; {1, cos(pi/8), cos(pi/4), cos(3pi/8)}, stored twice (8 floats in total).
57 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
; {0, cos(3pi/8), cos(pi/4), cos(pi/8)} followed by the same four values
; negated (the zero stays 0).
58 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
; Eight copies of 1/sqrt(2).
60 ps_root2: times 8 dd M_SQRT1_2
; 1/sqrt(2) with the sign pattern {-,+,+,-}, stored twice.
61 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; XOR mask: only bit 31 (the float sign bit) of elements 2 and 6 is set, so
; xorps with it negates those elements and leaves the others unchanged.
62 ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
; Dword index tables, used as the control operand of vpermilps.
64 perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
65 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
; Multiplier vector: {1,1,-1,1} in the first four elements, 1/sqrt(2) in
; the last four.
66 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
; XOR mask with the sign bit set in elements 0, 1, 3, 5, 6 and 7
; (elements 2 and 4 are left unchanged).
67 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
90 %macro T2_3DN 4 ; z0, z1, mem0, mem1
97 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
100 pfadd %5, %4 ; {t6,t5}
101 pxor %3, [ps_m1p1] ; {t8,t7}
104 pfadd %1, %5 ; {r0,i0}
105 pfsub %6, %5 ; {r2,i2}
107 pfadd %2, %3 ; {r1,i1}
108 pfsub %4, %3 ; {r3,i3}
112 ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
113 ; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
115 ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
116 ; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
; Transform of eight complex values held in two 256-bit registers.
; %1 and %2 carry the data in and the results out; %3, %4 and %5 are
; overwritten as scratch.
; NOTE(review): presumably the body of T8_AVX, which is invoked with five
; register arguments elsewhere in this file -- confirm.
118 vsubps %5, %1, %2 ; v = %1 - %2
119 vaddps %3, %1, %2 ; w = %1 + %2
120 vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
; Reorder v with the per-element indices stored in perm1.
121 vpermilps %2, %2, [perm1]
; 0x33 takes elements 0,1,4,5 from w (%3) and the rest from v (%2).
122 vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
123 vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
124 vsubps %4, %5, %1 ; s = r - q
125 vaddps %1, %5, %1 ; u = r + q
126 vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
127 vshufps %5, %4, %1, 0xbb
128 vshufps %3, %4, %1, 0xee
129 vperm2f128 %3, %3, %5, 0x13
; NOTE(review): ps_m1m1p1m1p1m1m1m1 has the sign bit set in elements
; 0,1,3,5,6,7, which does not match the multiplier list in the comment
; below if that list is read in element order -- confirm which is intended.
130 vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
; 0xdd gathers the odd elements, 0x88 the even elements, of k and s.
131 vshufps %2, %1, %4, 0xdd
132 vshufps %1, %1, %4, 0x88
133 vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
134 vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
136 vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
137 vsubps %2, %4, %1 ; %2 = v - w
138 vaddps %1, %4, %1 ; %1 = v + w
141 ; In SSE mode do one fft4 transform
142 ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
143 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
145 ; In AVX mode do two fft4 transforms
146 ; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
147 ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
; 4-point transform: %1/%2 carry the interleaved input and receive the
; split real/imaginary results; %3 is overwritten as scratch.
; NOTE(review): the three-operand addps/subps/shufps forms are presumably
; macro wrappers that let the same text assemble for SSE and AVX -- confirm.
149 subps %3, %1, %2 ; {t3,t4,-t8,t7}
150 addps %1, %1, %2 ; {t1,t2,t6,t5}
; The mask has only the sign bit of element 2 set: -t8 becomes t8.
151 xorps %3, %3, [ps_p1p1m1p1]
152 shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
153 shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
154 subps %3, %1, %2 ; {r2,i2,r3,i3}
155 addps %1, %1, %2 ; {r0,i0,r1,i1}
; 0xdd picks the odd (imaginary) elements, 0x88 the even (real) elements.
156 shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
157 shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
160 ; In SSE mode do one FFT8
161 ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
162 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
164 ; In AVX mode do two FFT8
165 ; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
166 ; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
167 ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
168 ; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
; FFT8 step: %1/%2 hold points 0-3 as split real/imaginary vectors, %3/%4
; hold points 4-7 still interleaved. Results are left in %1-%4; %5 and %6
; are overwritten as scratch.
170 addps %6, %3, %4 ; {t1,t2,t3,t4}
171 subps %3, %3, %4 ; {r5,i5,r7,i7}
; 0xb1 swaps the two elements of each adjacent pair.
172 shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
; Both products below also scale by 1/sqrt(2); ps_root2mppm additionally
; applies the sign pattern {-,+,+,-}.
173 mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
174 mulps %4, %4, [ps_root2]
175 addps %3, %3, %4 ; {t8,t7,ta,t9}
176 shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
177 shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
178 subps %3, %6, %4 ; {t6,t5,tc,tb}
179 addps %6, %6, %4 ; {t1,t2,t9,ta}
180 shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
181 shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
; Final butterflies against the already-transformed points 0-3.
182 subps %3, %1, %6 ; {r4,r5,r6,r7}
183 addps %1, %1, %6 ; {r0,r1,r2,r3}
184 subps %4, %2, %5 ; {i4,i5,i6,i7}
185 addps %2, %2, %5 ; {i0,i1,i2,i3}
188 ; scheduled for cpu-bound sizes
189 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
194 mulps m2, m4, m0 ; r2*wre
196 mulps m3, m5, m1 ; i2*wim
198 mulps m4, m4, m1 ; r2*wim
199 mulps m5, m5, m0 ; i2*wre
200 addps m2, m2, m3 ; r2*wre + i2*wim
201 mulps m3, m1, m7 ; i3*wim
202 subps m5, m5, m4 ; i2*wre - r2*wim
203 mulps m1, m1, m6 ; r3*wim
204 mulps m4, m0, m6 ; r3*wre
205 mulps m0, m0, m7 ; i3*wre
206 subps m4, m4, m3 ; r3*wre - i3*wim
208 addps m0, m0, m1 ; i3*wre + r3*wim
209 subps m1, m4, m2 ; t3
210 addps m4, m4, m2 ; t5
211 subps m3, m3, m4 ; r2
212 addps m4, m4, Z(0) ; r0
216 subps m3, m5, m0 ; t4
217 subps m4, m6, m3 ; r3
218 addps m3, m3, m6 ; r1
222 addps m3, m5, m0 ; t6
223 subps m2, m2, m1 ; i3
225 addps m1, m1, Z(3) ; i1
228 subps m4, m7, m3 ; i2
229 addps m3, m3, m7 ; i0
234 ; scheduled to avoid store->load aliasing
235 %macro PASS_BIG 1 ; (!interleave)
239 mova m1, [wq+o1q] ; wim
240 mulps m2, m4, m0 ; r2*wre
242 mulps m3, m5, m1 ; i2*wim
244 mulps m4, m4, m1 ; r2*wim
245 mulps m5, m5, m0 ; i2*wre
246 addps m2, m2, m3 ; r2*wre + i2*wim
247 mulps m3, m1, m7 ; i3*wim
248 mulps m1, m1, m6 ; r3*wim
249 subps m5, m5, m4 ; i2*wre - r2*wim
250 mulps m4, m0, m6 ; r3*wre
251 mulps m0, m0, m7 ; i3*wre
252 subps m4, m4, m3 ; r3*wre - i3*wim
254 addps m0, m0, m1 ; i3*wre + r3*wim
255 subps m1, m4, m2 ; t3
256 addps m4, m4, m2 ; t5
257 subps m3, m3, m4 ; r2
258 addps m4, m4, Z(0) ; r0
262 subps m3, m5, m0 ; t4
263 subps m4, m6, m3 ; r3
264 addps m3, m3, m6 ; r1
268 addps m5, m5, m0 ; t6
269 subps m2, m2, m1 ; i3
271 addps m1, m1, Z(3) ; i1
274 subps m6, m7, m5 ; i2
275 addps m5, m5, m7 ; i0
279 INTERL m1, m3, m7, Z, 2
280 INTERL m2, m4, m0, Z2, 6
285 INTERL m5, m1, m3, Z, 0
286 INTERL m6, m2, m7, Z, 4
; Linear addressing helpers: Z(x) is the x-th mmsize-byte block starting at
; r0, ZH(x) is the upper half of that block. Z2 is defined identically to Z.
; The argument is parenthesized so that an expression argument such as
; `n + 1` is scaled as a whole instead of only its first term.
296 %define Z(x) [r0+mmsize*(x)]
297 %define Z2(x) [r0+mmsize*(x)]
298 %define ZH(x) [r0+mmsize*(x)+mmsize/2]
307 T8_AVX m0, m1, m2, m3, m4
321 T8_AVX m0, m1, m4, m5, m7
323 mova m4, [ps_cos16_1]
324 mova m5, [ps_cos16_2]
331 vblendps m2, m7, m3, 0xf0
332 vperm2f128 m3, m7, m3, 0x21
335 vperm2f128 m2, m2, m2, 0x01
340 vextractf128 Z(0), m0, 0
341 vextractf128 ZH(0), m1, 0
342 vextractf128 Z(1), m0, 1
343 vextractf128 ZH(1), m1, 1
344 vextractf128 Z(2), m5, 0
345 vextractf128 ZH(2), m3, 0
346 vextractf128 Z(3), m5, 1
347 vextractf128 ZH(3), m3, 1
362 T8_SSE m0, m1, m2, m3, m4, m6
363 ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
364 ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
366 vperm2f128 m4, m0, m2, 0x20
367 vperm2f128 m5, m1, m3, 0x20
368 vperm2f128 m6, m0, m2, 0x31
369 vperm2f128 m7, m1, m3, 0x31
371 PASS_SMALL 0, [cos_32], [cos_32+32]
375 fft32_interleave_avx:
383 vextractf128 Z(0), m0, 0
384 vextractf128 ZH(0), m1, 0
385 vextractf128 Z(1), m0, 1
386 vextractf128 ZH(1), m1, 1
394 %define movdqa movaps
413 T8_SSE m0, m1, m2, m3, m4, m5
427 T8_SSE m0, m1, m2, m3, m4, m5
438 PASS_SMALL 0, [cos_16], [cos_16+16]
447 T2_3DN m0, m1, Z(0), Z(1)
450 T4_3DN m0, m1, m2, m3, m4, m5
461 T2_3DN m0, m1, Z(0), Z(1)
464 T4_3DN m0, m1, m2, m3, m4, m5
467 T2_3DN m4, m5, Z(4), Z(5)
468 T2_3DN m6, m7, Z2(6), Z2(7)
477 T4_3DN m1, m3, m5, m7, m0, m2
482 T4_3DN m0, m2, m4, m6, m5, m7
; Strided addressing helpers. Bit 0 of x selects one of two adjacent
; mmsize-byte blocks; the remaining bits (x&6) multiply the o1q stride.
; ZH/Z2H address the upper half (mmsize/2 bytes in) of the same block.
; Z2/Z2H use o3q as the offset instead of a multiple of o1q; the visible
; users pass 6 or 7.
; NOTE(review): presumably o3q holds the offset that x&6 == 6 would need,
; since o1q*6 is not an encodable index scale -- confirm.
; x is substituted without parentheses: an argument such as `n + 1` still
; works because NASM's `+` binds tighter than `&`.
514 %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
515 %define Z2(x) [zq + o3q + mmsize*(x&1)]
516 %define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
517 %define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2]
519 %macro DECL_PASS 2+ ; name, payload
522 DEFINE_ARGS z, w, n, o1, o3
541 vextractf128 %4(%5), %2, 0
542 vextractf128 %4 %+ H(%5), %3, 0
543 vextractf128 %4(%5 + 1), %2, 1
544 vextractf128 %4 %+ H(%5 + 1), %3, 1
547 %define INTERL INTERL_AVX
549 DECL_PASS pass_avx, PASS_BIG 1
550 DECL_PASS pass_interleave_avx, PASS_BIG 0
563 %define INTERL INTERL_SSE
565 DECL_PASS pass_sse, PASS_BIG 1
566 DECL_PASS pass_interleave_sse, PASS_BIG 0
572 %define unpcklps punpckldq
573 %define unpckhps punpckhdq
574 DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
575 DECL_PASS pass_interleave_3dn, PASS_BIG 0
576 %define pass_3dn2 pass_3dn
577 %define pass_interleave_3dn2 pass_interleave_3dn
580 %define SECTION_REL - $$
; Look up the transform routine for a given size in a dispatch table.
;   %1 = suffix pasted onto `dispatch_tab` to name the table
;   %2 = base name of the register holding nbits (the `q` suffix is added here)
585 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
586 lea r2, [dispatch_tab%1]
; Entries are gprsize bytes each and the table starts at nbits == 2, hence
; the -2 bias on the index.
587 mov r2, [r2 + (%2q-2)*gprsize]
593 %endmacro ; FFT_DISPATCH
595 %macro DECL_FFT 2-3 ; nbits, cpu, suffix
596 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
598 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
601 %xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL
608 %xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL
613 add r0, n*4 - (n&(-2<<%1))
615 add r0, n*2 - (n2&(-2<<%1))
617 sub r0, n*6 + (n2&(-2<<%1))
627 dispatch_tab%3%2: pointer list_of_fft
631 ; On x86_32, this function does the register saving and restoring for all of fft.
632 ; The others pass args in registers and don't spill anything.
633 cglobal fft_dispatch%3%2, 2,5,8, z, nbits
634 FFT_DISPATCH %3%2, nbits
643 DECL_FFT 6, _avx, _interleave
646 DECL_FFT 5, _sse, _interleave
648 DECL_FFT 4, _3dn, _interleave
650 DECL_FFT 4, _3dn2, _interleave
659 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
660 movaps xmm0, [%3+%2*4]
661 movaps xmm1, [%3+%1*4-0x10]
663 shufps xmm0, xmm1, 0x88
664 shufps xmm1, xmm2, 0x77
665 movlps xmm4, [%4+%2*2]
666 movlps xmm5, [%5+%2*2+0x0]
667 movhps xmm4, [%4+%1*2-0x8]
668 movhps xmm5, [%5+%1*2-0x8]
; Products for a complex multiply against two coefficient tables.
;   %1     = byte offset into both tables
;   %2, %3 = data registers, multiplied in place by the table at %6
;   %5, %6 = table base addresses (callers in this file pass their
;            tcos+n8 and tsin+n8 arguments here)
;   %4     = not referenced by the multiplies below
; m6 and m7 are overwritten with the products against the table at %5.
682 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
683 mulps m6, %3, [%5+%1]
684 mulps m7, %2, [%5+%1]
685 mulps %2, %2, [%6+%1]
686 mulps %3, %3, [%6+%1]
; Post-rotation and reordering for two 64-byte groups of the buffer at %3,
; one at offset %1*2 and one at offset %2*2. Uses ymm0-ymm7.
691 %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
; Load 64 bytes from each of the two positions (aligned loads).
693 vmovaps ymm1, [%3+%1*2]
694 vmovaps ymm0, [%3+%1*2+0x20]
695 vmovaps ymm3, [%3+%2*2]
696 vmovaps ymm2, [%3+%2*2+0x20]
; Multiply each pair by the tcos/tsin entries at the matching offset.
698 CMUL %1, ymm0, ymm1, %3, %4, %5
699 CMUL %2, ymm2, ymm3, %3, %4, %5
; Reverse all eight floats of ymm1 and ymm3: 0x1b reverses the four
; elements inside each 128-bit half, then 0x01 swaps the two halves.
700 vshufps ymm1, ymm1, ymm1, 0x1b
701 vshufps ymm3, ymm3, ymm3, 0x1b
702 vperm2f128 ymm1, ymm1, ymm1, 0x01
703 vperm2f128 ymm3, ymm3, ymm3, 0x01
; Interleave each forward vector with the reversed vector from the other
; position.
704 vunpcklps ymm6, ymm2, ymm1
705 vunpckhps ymm4, ymm2, ymm1
706 vunpcklps ymm7, ymm0, ymm3
707 vunpckhps ymm5, ymm0, ymm3
; The unpacks work per 128-bit half, so the results are written back as
; 16-byte pieces: low halves of the lo/hi unpack first, then high halves.
709 vextractf128 [%3+%1*2], ymm7, 0
710 vextractf128 [%3+%1*2+0x10], ymm5, 0
711 vextractf128 [%3+%1*2+0x20], ymm7, 1
712 vextractf128 [%3+%1*2+0x30], ymm5, 1
714 vextractf128 [%3+%2*2], ymm6, 0
715 vextractf128 [%3+%2*2+0x10], ymm4, 0
716 vextractf128 [%3+%2*2+0x20], ymm6, 1
717 vextractf128 [%3+%2*2+0x30], ymm4, 1
723 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
725 movaps xmm1, [%3+%1*2]
726 movaps xmm0, [%3+%1*2+0x10]
727 CMUL %1, xmm0, xmm1, %3, %4, %5
728 movaps xmm5, [%3+%2*2]
729 movaps xmm4, [%3+%2*2+0x10]
730 CMUL %2, xmm4, xmm5, %3, %4, %5
731 shufps xmm1, xmm1, 0x1b
732 shufps xmm5, xmm5, 0x1b
739 movaps [%3+%2*2], xmm6
740 movaps [%3+%2*2+0x10], xmm4
741 movaps [%3+%1*2], xmm0
742 movaps [%3+%1*2+0x10], xmm2
749 cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
762 mov r3d, [r0+FFTContext.mdctsize]
765 mov rtcos, [r0+FFTContext.tcos]
766 mov rtsin, [r0+FFTContext.tsin]
774 mov rrevtab, [r0+FFTContext.revtab]
794 PREROTATER r4, r3, r2, rtcos, rtsin
796 movzx r5, word [rrevtab+r4-4]
797 movzx r6, word [rrevtab+r4-2]
798 movzx r13, word [rrevtab+r3]
799 movzx r14, word [rrevtab+r3+2]
800 movlps [r1+r5 *8], xmm0
801 movhps [r1+r6 *8], xmm0
802 movlps [r1+r13*8], xmm1
803 movhps [r1+r14*8], xmm1
807 movzx r5, word [r6+r4-4]
808 movzx r4, word [r6+r4-2]
809 movlps [r1+r5*8], xmm0
810 movhps [r1+r4*8], xmm0
811 movzx r5, word [r6+r3]
812 movzx r4, word [r6+r3+2]
813 movlps [r1+r5*8], xmm1
814 movhps [r1+r4*8], xmm1
822 mov r1d, [r5+FFTContext.nbits]
826 mov r0d, [r5+FFTContext.mdctsize]
838 %2 r0, r1, r6, rtcos, rtsin
846 %ifidn avx_enabled, 1
852 DECL_IMDCT _sse, POSROTATESHUF
857 DECL_IMDCT _avx, POSROTATESHUF_AVX