1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt
4 ;* Copyright (c) 2011 Vitor Sessak
6 ;* This algorithm (though not any of the implementation details) is
7 ;* based on libdjbfft by D. J. Bernstein.
9 ;* This file is part of FFmpeg.
11 ;* FFmpeg is free software; you can redistribute it and/or
12 ;* modify it under the terms of the GNU Lesser General Public
13 ;* License as published by the Free Software Foundation; either
14 ;* version 2.1 of the License, or (at your option) any later version.
16 ;* FFmpeg is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 ;* Lesser General Public License for more details.
21 ;* You should have received a copy of the GNU Lesser General Public
22 ;* License along with FFmpeg; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 ;******************************************************************************
26 ; These functions are not individually interchangeable with the C versions.
27 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
28 ; in blocks as convenient to the vector size.
29 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
31 %include "libavutil/x86/x86inc.asm"
; Twiddle-factor constants: cos(pi/8) and cos(3pi/8) for the radix-16 stage,
; sqrt(1/2) for the radix-8 stage.
52 %define M_SQRT1_2 0.70710678118654752440
53 %define M_COS_PI_1_8 0.923879532511287
54 %define M_COS_PI_3_8 0.38268343236509
; Packed cos tables for fft16_avx (loaded at the "mova m4/m5, [ps_cos16_*]"
; site below); the pattern repeats across both 128-bit lanes of a ymm reg.
57 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
58 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
60 ps_root2: times 8 dd M_SQRT1_2
; Mixed-sign sqrt(1/2) multiplier used by the fft8 kernel (T8_SSE).
61 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; Sign masks: 1<<31 is the IEEE-754 single-precision sign bit, so xorps/vxorps
; with these negates exactly the lanes holding 1<<31 and passes the rest through.
62 ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
; Per-lane shuffle index vectors consumed by vpermilps in T8_AVX.
64 perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
65 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
; Combined sign/scale multiplier for T8_AVX: low lane flips one element's sign,
; high lane scales by sqrt(1/2).
66 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
; Sign mask for the "s *= {1,1,-1,-1,1,-1,-1,-1}" step in T8_AVX.
67 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
; 2-point butterfly on {real,imag} pairs held in mm registers (3DNow pfadd/pfsub).
; NOTE(review): this excerpt is line-sampled; the T2_3DN body and both
; %endmacro lines fall outside the visible lines.
90 %macro T2_3DN 4 ; z0, z1, mem0, mem1
; 4-point butterfly: combines two T2 results into {r0..r3,i0..i3} outputs.
97 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
100 pfadd %5, %4 ; {t6,t5}
; ps_m1p1 is referenced here but its definition is outside this excerpt;
; by analogy with ps_p1p1m1p1 above it is presumably a sign mask — confirm.
101 pxor %3, [ps_m1p1] ; {t8,t7}
104 pfadd %1, %5 ; {r0,i0}
105 pfsub %6, %5 ; {r2,i2}
107 pfadd %2, %3 ; {r1,i1}
108 pfsub %4, %3 ; {r3,i3}
; Body of the AVX 8-point FFT kernel (T8_AVX); NOTE(review): the %macro header
; and %endmacro lines are outside this line-sampled excerpt. Interleaved
; {r,i} input pairs in two ymm regs are transformed into split real/imag
; output halves, per the in/out maps below.
112 ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
113 ; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
115 ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
116 ; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
118 vsubps %5, %1, %2 ; v = %1 - %2
119 vaddps %3, %1, %2 ; w = %1 + %2
; Fused sign-flip + sqrt(1/2) scaling via the constant defined above.
120 vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
121 vpermilps %2, %2, [perm1]
122 vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
123 vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
124 vsubps %4, %5, %1 ; s = r - q
125 vaddps %1, %5, %1 ; u = r + q
126 vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
127 vshufps %5, %4, %1, 0xbb
128 vshufps %3, %4, %1, 0xee
129 vperm2f128 %3, %3, %5, 0x13
; Negate the lanes selected by the sign mask (1<<31 lanes flip sign).
130 vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
131 vshufps %2, %1, %4, 0xdd
132 vshufps %1, %1, %4, 0x88
133 vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
134 vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
; NOTE(review): an intermediate line (internal 135) is elided here, so the
; exact operand state feeding this blend cannot be confirmed from this view.
136 vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
137 vsubps %2, %4, %1 ; %2 = v - w
138 vaddps %1, %4, %1 ; %1 = v + w
; 4-point FFT kernel; with AVX the same code processes two independent fft4s,
; one per 128-bit lane. NOTE(review): the enclosing %macro header/%endmacro
; are outside this line-sampled excerpt.
141 ; In SSE mode do one fft4 transform
142 ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
143 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
145 ; In AVX mode do two fft4 transforms
146 ; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
147 ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
149 subps %3, %1, %2 ; {t3,t4,-t8,t7}
150 addps %1, %1, %2 ; {t1,t2,t6,t5}
; Flip the sign of the -t8 lane: ps_p1p1m1p1 carries the sign bit in lane 2.
151 xorps %3, %3, [ps_p1p1m1p1]
152 shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
153 shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
154 subps %3, %1, %2 ; {r2,i2,r3,i3}
155 addps %1, %1, %2 ; {r0,i0,r1,i1}
; Deinterleave {r,i} pairs into separate real and imaginary vectors.
156 shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
157 shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
; 8-point FFT kernel built on top of the fft4 above; with AVX it runs two
; independent fft8s, one per 128-bit lane. NOTE(review): the %macro header
; and %endmacro are outside this line-sampled excerpt.
160 ; In SSE mode do one FFT8
161 ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
162 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
164 ; In AVX mode do two FFT8
165 ; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
166 ; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
167 ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
168 ; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
170 addps %6, %3, %4 ; {t1,t2,t3,t4}
171 subps %3, %3, %4 ; {r5,i5,r7,i7}
172 shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
; Odd-element twiddle by (1+-i)*sqrt(1/2): mixed-sign and uniform sqrt(1/2)
; multipliers, then summed.
173 mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
174 mulps %4, %4, [ps_root2]
175 addps %3, %3, %4 ; {t8,t7,ta,t9}
176 shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
177 shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
178 subps %3, %6, %4 ; {t6,t5,tc,tb}
179 addps %6, %6, %4 ; {t1,t2,t9,ta}
180 shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
181 shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
; Final butterflies against the even half already split in %1 (reals) / %2 (imags).
182 subps %3, %1, %6 ; {r4,r5,r6,r7}
183 addps %1, %1, %6 ; {r0,r1,r2,r3}
184 subps %4, %2, %5 ; {i4,i5,i6,i7}
185 addps %2, %2, %5 ; {i0,i1,i2,i3}
; Radix-4 combining pass for in-register (cpu-bound) FFT sizes: multiplies the
; upper half (m4..m7, loaded by the %1 payload) by the twiddles wre/wim and
; butterflies against the lower half in Z(0)..Z(3).
; NOTE(review): this excerpt is line-sampled — several loads/stores between
; the visible lines (and the %endmacro) are elided, so register lifetimes
; across the gaps cannot be fully confirmed from this view.
188 ; scheduled for cpu-bound sizes
189 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
; Complex multiply (r2+i*i2)*(wre+i*wim) spread across four mulps so the
; products can be scheduled independently.
194 mulps m2, m4, m0 ; r2*wre
196 mulps m3, m5, m1 ; i2*wim
198 mulps m4, m4, m1 ; r2*wim
199 mulps m5, m5, m0 ; i2*wre
200 addps m2, m2, m3 ; r2*wre + i2*wim
201 mulps m3, m1, m7 ; i3*wim
202 subps m5, m5, m4 ; i2*wre - r2*wim
203 mulps m1, m1, m6 ; r3*wim
204 mulps m4, m0, m6 ; r3*wre
205 mulps m0, m0, m7 ; i3*wre
206 subps m4, m4, m3 ; r3*wre - i3*wim
208 addps m0, m0, m1 ; i3*wre + r3*wim
209 subps m1, m4, m2 ; t3
210 addps m4, m4, m2 ; t5
211 subps m3, m3, m4 ; r2
212 addps m4, m4, Z(0) ; r0
216 subps m3, m5, m0 ; t4
217 subps m4, m6, m3 ; r3
218 addps m3, m3, m6 ; r1
222 addps m3, m5, m0 ; t6
223 subps m2, m2, m1 ; i3
225 addps m1, m1, Z(3) ; i1
228 subps m4, m7, m3 ; i2
229 addps m3, m3, m7 ; i0
; Radix-4 combining pass for memory-bound (large) FFT sizes; same complex
; butterfly math as PASS_SMALL, rescheduled so stores do not alias pending
; loads. %1 selects whether outputs are written interleaved (via INTERL).
; NOTE(review): line-sampled excerpt — loads of m0/m4..m7, the loop control,
; and %endmacro are elided between the visible lines.
234 ; scheduled to avoid store->load aliasing
235 %macro PASS_BIG 1 ; (!interleave)
; Twiddles streamed from memory: wq points at wre, wq+o1q at wim.
239 mova m1, [wq+o1q] ; wim
240 mulps m2, m4, m0 ; r2*wre
242 mulps m3, m5, m1 ; i2*wim
244 mulps m4, m4, m1 ; r2*wim
245 mulps m5, m5, m0 ; i2*wre
246 addps m2, m2, m3 ; r2*wre + i2*wim
247 mulps m3, m1, m7 ; i3*wim
248 mulps m1, m1, m6 ; r3*wim
249 subps m5, m5, m4 ; i2*wre - r2*wim
250 mulps m4, m0, m6 ; r3*wre
251 mulps m0, m0, m7 ; i3*wre
252 subps m4, m4, m3 ; r3*wre - i3*wim
254 addps m0, m0, m1 ; i3*wre + r3*wim
255 subps m1, m4, m2 ; t3
256 addps m4, m4, m2 ; t5
257 subps m3, m3, m4 ; r2
258 addps m4, m4, Z(0) ; r0
262 subps m3, m5, m0 ; t4
263 subps m4, m6, m3 ; r3
264 addps m3, m3, m6 ; r1
268 addps m5, m5, m0 ; t6
269 subps m2, m2, m1 ; i3
271 addps m1, m1, Z(3) ; i1
274 subps m6, m7, m5 ; i2
275 addps m5, m5, m7 ; i0
; INTERL is bound to INTERL_AVX or INTERL_SSE before DECL_PASS instantiates
; this macro; it stores a result pair to the Z()/Z2() slot given last.
279 INTERL m1, m3, m7, Z, 2
280 INTERL m2, m4, m0, Z2, 6
285 INTERL m5, m1, m3, Z, 0
286 INTERL m6, m2, m7, Z, 4
; Z()/Z2()/ZH() address the in-place transform buffer at r0 for the small
; fixed-size kernels below; for these sizes Z and Z2 are the same linear layout.
296 %define Z(x) [r0+mmsize*x]
297 %define Z2(x) [r0+mmsize*x]
298 %define ZH(x) [r0+mmsize*x+mmsize/2]
; NOTE(review): the following lines are a sampled view of several kernel
; bodies (fft8_avx, fft16_avx, fft32_avx, fft8/16_sse); labels, loads and
; many intermediate instructions are elided, so only per-line intent is noted.
307 T8_AVX m0, m1, m2, m3, m4
321 T8_AVX m0, m1, m4, m5, m7
; fft16 twiddles from the .rodata tables defined at the top of the file.
323 mova m4, [ps_cos16_1]
324 mova m5, [ps_cos16_2]
331 vblendps m2, m7, m3, 0xf0
332 vperm2f128 m3, m7, m3, 0x21
335 vperm2f128 m2, m2, m2, 0x01
; Spill ymm halves back to the buffer: Z(n) takes the low 128 bits, ZH(n) the high.
340 vextractf128 Z(0), m0, 0
341 vextractf128 ZH(0), m1, 0
342 vextractf128 Z(1), m0, 1
343 vextractf128 ZH(1), m1, 1
344 vextractf128 Z(2), m5, 0
345 vextractf128 ZH(2), m3, 0
346 vextractf128 Z(3), m5, 1
347 vextractf128 ZH(3), m3, 1
362 T8_SSE m0, m1, m2, m3, m4, m6
363 ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
364 ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
; Regroup the two fft8 halves across 128-bit lanes before the radix-4 pass.
366 vperm2f128 m4, m0, m2, 0x20
367 vperm2f128 m5, m1, m3, 0x20
368 vperm2f128 m6, m0, m2, 0x31
369 vperm2f128 m7, m1, m3, 0x31
371 PASS_SMALL 0, [cos_32], [cos_32+32]
375 fft32_interleave_avx:
383 vextractf128 Z(0), m0, 0
384 vextractf128 ZH(0), m1, 0
385 vextractf128 Z(1), m0, 1
386 vextractf128 ZH(1), m1, 1
; On SSE, movdqa and movaps are interchangeable for this data; aliasing keeps
; one mnemonic in the shared code paths.
395 %define movdqa movaps
414 T8_SSE m0, m1, m2, m3, m4, m5
428 T8_SSE m0, m1, m2, m3, m4, m5
439 PASS_SMALL 0, [cos_16], [cos_16+16]
; Sampled bodies of the 3DNow fft4/fft8 kernels: built entirely from the
; T2_3DN/T4_3DN butterfly macros declared earlier.
; NOTE(review): labels and intermediate moves are elided in this excerpt.
446 T2_3DN m0, m1, Z(0), Z(1)
449 T4_3DN m0, m1, m2, m3, m4, m5
460 T2_3DN m0, m1, Z(0), Z(1)
463 T4_3DN m0, m1, m2, m3, m4, m5
466 T2_3DN m4, m5, Z(4), Z(5)
467 T2_3DN m6, m7, Z2(6), Z2(7)
476 T4_3DN m1, m3, m5, m7, m0, m2
481 T4_3DN m0, m2, m4, m6, m5, m7
; Redefine Z/Z2 for the big passes: zq is the buffer base, o1q/o3q are the
; block-offset registers set up by DECL_PASS's DEFINE_ARGS below. x&6 picks
; the quarter, x&1 the vector within it; *H variants address the upper half.
515 %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
516 %define Z2(x) [zq + o3q + mmsize*(x&1)]
517 %define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
518 %define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2]
; DECL_PASS wraps a PASS_BIG/PASS_SMALL payload (%2+) in a named callable
; pass routine (%1). NOTE(review): the macro body between the header and the
; instantiations below is elided in this line-sampled excerpt.
520 %macro DECL_PASS 2+ ; name, payload
; Named register args for the pass loop: z = data, w = twiddles, n = count,
; o1/o3 = quarter/three-quarter byte offsets used by the Z()/Z2() defines.
523 DEFINE_ARGS z, w, n, o1, o3
; Fragment of INTERL_AVX: stores an interleaved ymm pair into consecutive
; Z-slots, low halves first then high halves (header line elided from view).
542 vextractf128 %4(%5), %2, 0
543 vextractf128 %4 %+ H(%5), %3, 0
544 vextractf128 %4(%5 + 1), %2, 1
545 vextractf128 %4 %+ H(%5 + 1), %3, 1
548 %define INTERL INTERL_AVX
550 DECL_PASS pass_avx, PASS_BIG 1
551 DECL_PASS pass_interleave_avx, PASS_BIG 0
564 %define INTERL INTERL_SSE
566 DECL_PASS pass_sse, PASS_BIG 1
567 DECL_PASS pass_interleave_sse, PASS_BIG 0
; 3DNow path reuses the SSE pass source: integer unpacks substitute for the
; float unpacks on mm registers.
573 %define unpcklps punpckldq
574 %define unpckhps punpckhdq
575 DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
576 DECL_PASS pass_interleave_3dnow, PASS_BIG 0
577 %define pass_3dnow2 pass_3dnow
578 %define pass_interleave_3dnow2 pass_interleave_3dnow
; Dispatch-table entries are stored section-relative (minus $$) so the table
; is position independent; NOTE(review): the absolute-address variant of this
; define (for non-PIC builds) is presumably in the elided lines — confirm.
581 %define SECTION_REL - $$
; Indirect dispatch: index dispatch_tab<suffix> by (nbits-2) and jump to the
; selected fftN entry point.
586 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
587 lea r2, [dispatch_tab%1]
588 mov r2, [r2 + (%2q-2)*gprsize]
594 %endmacro ; FFT_DISPATCH
; DECL_FFT emits all fftN entry points up to 2^%1 for one instruction-set
; suffix, plus the dispatch table. Each large fftN is the classic
; split-radix recursion: one half-size and two quarter-size sub-transforms,
; then a combining pass.
596 %macro DECL_FFT 1-2 ; nbits, suffix
598 %xdefine fullsuffix SUFFIX
600 %xdefine fullsuffix %2 %+ SUFFIX
602 %xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
604 %xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
607 %xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
614 %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
617 fft %+ n %+ fullsuffix:
618 call fft %+ n2 %+ SUFFIX
; r0 walks through the sub-transform regions; the (n&...) terms adjust for
; the interleaved layouts at small n.
619 add r0, n*4 - (n&(-2<<%1))
620 call fft %+ n4 %+ SUFFIX
621 add r0, n*2 - (n2&(-2<<%1))
622 call fft %+ n4 %+ SUFFIX
623 sub r0, n*6 + (n2&(-2<<%1))
; Tail-jump into the combining pass rather than call/ret.
626 jmp pass %+ fullsuffix
633 dispatch_tab %+ fullsuffix: pointer list_of_fft
637 ; On x86_32, this function does the register saving and restoring for all of fft.
638 ; The others pass args in registers and don't spill anything.
639 cglobal fft_dispatch%2, 2,5,8, z, nbits
640 FFT_DISPATCH fullsuffix, nbits
; Instantiations per instruction set; NOTE(review): the plain (non-interleave)
; DECL_FFT invocations paired with these are in the elided lines.
650 DECL_FFT 6, _interleave
654 DECL_FFT 5, _interleave
657 DECL_FFT 4, _interleave
660 DECL_FFT 4, _interleave
; IMDCT pre-rotation: gathers input samples from both ends of the buffer,
; deinterleaves them, and loads the matching cos/sin twiddles.
; NOTE(review): the complex-multiply tail of this macro (and %endmacro) is
; elided in this line-sampled excerpt.
669 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
670 movaps xmm0, [%3+%2*4]
671 movaps xmm1, [%3+%1*4-0x10]
673 shufps xmm0, xmm1, 0x88
674 shufps xmm1, xmm2, 0x77
675 movlps xmm4, [%4+%2*2]
676 movlps xmm5, [%5+%2*2+0x0]
677 movhps xmm4, [%4+%1*2-0x8]
678 movhps xmm5, [%5+%1*2-0x8]
; Complex multiply of (%2,%3) by twiddles loaded from tables %5/%6 at offset
; %1; products land in m6/m7 and %2/%3, combined in the (elided) tail.
; Clobbers m6/m7.
692 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
693 mulps m6, %3, [%5+%1]
694 mulps m7, %2, [%5+%1]
695 mulps %2, %2, [%6+%1]
696 mulps %3, %3, [%6+%1]
; AVX post-rotation: walks the output from both ends (j up, k down),
; applies CMUL twiddling, reverses lane order (shufps 0x1b + cross-lane
; vperm2f128 0x01), reinterleaves, and stores the four 128-bit halves back
; in place.
701 %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
703 vmovaps ymm1, [%3+%1*2]
704 vmovaps ymm0, [%3+%1*2+0x20]
705 vmovaps ymm3, [%3+%2*2]
706 vmovaps ymm2, [%3+%2*2+0x20]
708 CMUL %1, ymm0, ymm1, %3, %4, %5
709 CMUL %2, ymm2, ymm3, %3, %4, %5
; 0x1b reverses the four floats within each 128-bit lane; the following
; vperm2f128 0x01 swaps the lanes, producing a full 8-element reversal.
710 vshufps ymm1, ymm1, ymm1, 0x1b
711 vshufps ymm3, ymm3, ymm3, 0x1b
712 vperm2f128 ymm1, ymm1, ymm1, 0x01
713 vperm2f128 ymm3, ymm3, ymm3, 0x01
714 vunpcklps ymm6, ymm2, ymm1
715 vunpckhps ymm4, ymm2, ymm1
716 vunpcklps ymm7, ymm0, ymm3
717 vunpckhps ymm5, ymm0, ymm3
719 vextractf128 [%3+%1*2], ymm7, 0
720 vextractf128 [%3+%1*2+0x10], ymm5, 0
721 vextractf128 [%3+%1*2+0x20], ymm7, 1
722 vextractf128 [%3+%1*2+0x30], ymm5, 1
724 vextractf128 [%3+%2*2], ymm6, 0
725 vextractf128 [%3+%2*2+0x10], ymm4, 0
726 vextractf128 [%3+%2*2+0x20], ymm6, 1
727 vextractf128 [%3+%2*2+0x30], ymm4, 1
; SSE post-rotation, same structure at 128-bit width; the unpack/interleave
; steps between the shufps reversals and the stores are elided in this view.
733 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
735 movaps xmm1, [%3+%1*2]
736 movaps xmm0, [%3+%1*2+0x10]
737 CMUL %1, xmm0, xmm1, %3, %4, %5
738 movaps xmm5, [%3+%2*2]
739 movaps xmm4, [%3+%2*2+0x10]
740 CMUL %2, xmm4, xmm5, %3, %4, %5
741 shufps xmm1, xmm1, 0x1b
742 shufps xmm5, xmm5, 0x1b
749 movaps [%3+%2*2], xmm6
750 movaps [%3+%2*2+0x10], xmm4
751 movaps [%3+%1*2], xmm0
752 movaps [%3+%1*2+0x10], xmm2
; void ff_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input)
; Pre-rotates the input with PREROTATER, bit-reverse-scatters the pairs into
; the output via s->revtab, runs the FFT through FFT_DISPATCH, then
; post-rotates in place (POSROTATESHUF*, bound by DECL_IMDCT).
; NOTE(review): loop control and several register setups are elided in this
; line-sampled excerpt; the register map below reflects only visible lines.
759 cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
769 mov r3d, [r0+FFTContext.mdctsize]
772 mov rtcos, [r0+FFTContext.tcos]
773 mov rtsin, [r0+FFTContext.tsin]
781 mov rrevtab, [r0+FFTContext.revtab]
801 PREROTATER r4, r3, r2, rtcos, rtsin
; x86_64 path: enough registers to keep revtab in rrevtab and use r10/r11
; for the high-end indices. revtab entries are 16-bit; each output element
; is an 8-byte complex, hence the *8 scaling.
803 movzx r5, word [rrevtab+r4-4]
804 movzx r6, word [rrevtab+r4-2]
805 movzx r10, word [rrevtab+r3]
806 movzx r11, word [rrevtab+r3+2]
807 movlps [r1+r5 *8], xmm0
808 movhps [r1+r6 *8], xmm0
809 movlps [r1+r10*8], xmm1
810 movhps [r1+r11*8], xmm1
; x86_32 path: revtab base lives in r6, and r4 is reused as a scratch index.
814 movzx r5, word [r6+r4-4]
815 movzx r4, word [r6+r4-2]
816 movlps [r1+r5*8], xmm0
817 movhps [r1+r4*8], xmm0
818 movzx r5, word [r6+r3]
819 movzx r4, word [r6+r3+2]
820 movlps [r1+r5*8], xmm1
821 movhps [r1+r4*8], xmm1
829 mov r1d, [r5+FFTContext.nbits]
831 FFT_DISPATCH SUFFIX, r1
833 mov r0d, [r5+FFTContext.mdctsize]
; DECL_IMDCT body fragment: invoke the post-rotation macro passed as %1.
845 %1 r0, r1, r6, rtcos, rtsin
855 DECL_IMDCT POSROTATESHUF
860 DECL_IMDCT POSROTATESHUF_AVX