1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt
4 ;* Copyright (c) 2011 Vitor Sessak
6 ;* This algorithm (though not any of the implementation details) is
7 ;* based on libdjbfft by D. J. Bernstein.
9 ;* This file is part of FFmpeg.
11 ;* FFmpeg is free software; you can redistribute it and/or
12 ;* modify it under the terms of the GNU Lesser General Public
13 ;* License as published by the Free Software Foundation; either
14 ;* version 2.1 of the License, or (at your option) any later version.
16 ;* FFmpeg is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 ;* Lesser General Public License for more details.
21 ;* You should have received a copy of the GNU Lesser General Public
22 ;* License along with FFmpeg; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 ;******************************************************************************
26 ; These functions are not individually interchangeable with the C versions.
27 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
28 ; in blocks as convenient to the vector size.
29 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
31 %include "libavutil/x86/x86inc.asm"
56 %define M_SQRT1_2 0.70710678118654752440 ; sqrt(1/2) = cos(pi/4)
57 %define M_COS_PI_1_8 0.923879532511287 ; cos(pi/8)
58 %define M_COS_PI_3_8 0.38268343236509 ; cos(3*pi/8)
61 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 ; {cos(k*pi/8), k=0..3}, stored twice (one copy per 128-bit half)
62 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 ; low half {0, cos(3pi/8), sqrt(1/2), cos(pi/8)}; high half repeats it with the nonzero entries negated
64 ps_root2: times 8 dd M_SQRT1_2 ; sqrt(1/2) broadcast to 8 dwords; used as a mulps operand
65 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 ; sqrt(1/2) with sign pattern {-,+,+,-} per group of four; used as a mulps operand
66 ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 ; XOR mask: bit 31 (float sign bit) set only in lane 2 of each group of four, so xorps negates that lane
68 perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 ; per-lane selector table passed to vpermilps
69 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 ; per-lane selector table passed to vpermilps
70 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 ; multiplier: {1,1,-1,1} in the low half, sqrt(1/2) in every lane of the high half
71 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 ; XOR mask: sign bit set in every lane except lanes 2 and 4
72 ps_m1m1m1m1: times 4 dd 1<<31 ; XOR mask: sign bit set in all four lanes
95 %macro T2_3DN 4 ; z0, z1, mem0, mem1
102 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
105 pfadd %5, %4 ; {t6,t5}
106 pxor %3, [ps_m1p1] ; {t8,t7}
109 pfadd %1, %5 ; {r0,i0}
110 pfsub %6, %5 ; {r2,i2}
112 pfadd %2, %3 ; {r1,i1}
113 pfsub %4, %3 ; {r3,i3}
117 ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
118 ; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
120 ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
121 ; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
123 vsubps %5, %1, %2 ; v = %1 - %2
124 vaddps %3, %1, %2 ; w = %1 + %2
125 vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
126 vpermilps %2, %2, [perm1]
127 vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
128 vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
129 vsubps %4, %5, %1 ; s = r - q
130 vaddps %1, %5, %1 ; u = r + q
131 vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
132 vshufps %5, %4, %1, 0xbb
133 vshufps %3, %4, %1, 0xee
134 vperm2f128 %3, %3, %5, 0x13
135 vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
136 vshufps %2, %1, %4, 0xdd
137 vshufps %1, %1, %4, 0x88
138 vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
139 vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
141 vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
142 vsubps %2, %4, %1 ; %2 = v - w
143 vaddps %1, %4, %1 ; %1 = v + w
146 ; In SSE mode do one fft4 transform
147 ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
148 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
150 ; In AVX mode do two fft4 transforms
151 ; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
152 ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
154 subps %3, %1, %2 ; {t3,t4,-t8,t7}
155 addps %1, %1, %2 ; {t1,t2,t6,t5}
156 xorps %3, %3, [ps_p1p1m1p1]
157 shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
158 shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
159 subps %3, %1, %2 ; {r2,i2,r3,i3}
160 addps %1, %1, %2 ; {r0,i0,r1,i1}
161 shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
162 shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
165 ; In SSE mode do one FFT8
166 ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
167 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
169 ; In AVX mode do two FFT8
170 ; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
171 ; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
172 ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
173 ; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
175 addps %6, %3, %4 ; {t1,t2,t3,t4}
176 subps %3, %3, %4 ; {r5,i5,r7,i7}
177 shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
178 mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
179 mulps %4, %4, [ps_root2]
180 addps %3, %3, %4 ; {t8,t7,ta,t9}
181 shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
182 shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
183 subps %3, %6, %4 ; {t6,t5,tc,tb}
184 addps %6, %6, %4 ; {t1,t2,t9,ta}
185 shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
186 shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
187 subps %3, %1, %6 ; {r4,r5,r6,r7}
188 addps %1, %1, %6 ; {r0,r1,r2,r3}
189 subps %4, %2, %5 ; {i4,i5,i6,i7}
190 addps %2, %2, %5 ; {i0,i1,i2,i3}
193 ; scheduled for cpu-bound sizes
194 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
199 mulps m2, m4, m0 ; r2*wre
201 mulps m3, m5, m1 ; i2*wim
203 mulps m4, m4, m1 ; r2*wim
204 mulps m5, m5, m0 ; i2*wre
205 addps m2, m2, m3 ; r2*wre + i2*wim
206 mulps m3, m1, m7 ; i3*wim
207 subps m5, m5, m4 ; i2*wre - r2*wim
208 mulps m1, m1, m6 ; r3*wim
209 mulps m4, m0, m6 ; r3*wre
210 mulps m0, m0, m7 ; i3*wre
211 subps m4, m4, m3 ; r3*wre - i3*wim
213 addps m0, m0, m1 ; i3*wre + r3*wim
214 subps m1, m4, m2 ; t3
215 addps m4, m4, m2 ; t5
216 subps m3, m3, m4 ; r2
217 addps m4, m4, Z(0) ; r0
221 subps m3, m5, m0 ; t4
222 subps m4, m6, m3 ; r3
223 addps m3, m3, m6 ; r1
227 addps m3, m5, m0 ; t6
228 subps m2, m2, m1 ; i3
230 addps m1, m1, Z(3) ; i1
233 subps m4, m7, m3 ; i2
234 addps m3, m3, m7 ; i0
239 ; scheduled to avoid store->load aliasing
240 %macro PASS_BIG 1 ; (!interleave)
244 mova m1, [wq+o1q] ; wim
245 mulps m2, m4, m0 ; r2*wre
247 mulps m3, m5, m1 ; i2*wim
249 mulps m4, m4, m1 ; r2*wim
250 mulps m5, m5, m0 ; i2*wre
251 addps m2, m2, m3 ; r2*wre + i2*wim
252 mulps m3, m1, m7 ; i3*wim
253 mulps m1, m1, m6 ; r3*wim
254 subps m5, m5, m4 ; i2*wre - r2*wim
255 mulps m4, m0, m6 ; r3*wre
256 mulps m0, m0, m7 ; i3*wre
257 subps m4, m4, m3 ; r3*wre - i3*wim
259 addps m0, m0, m1 ; i3*wre + r3*wim
260 subps m1, m4, m2 ; t3
261 addps m4, m4, m2 ; t5
262 subps m3, m3, m4 ; r2
263 addps m4, m4, Z(0) ; r0
267 subps m3, m5, m0 ; t4
268 subps m4, m6, m3 ; r3
269 addps m3, m3, m6 ; r1
273 addps m5, m5, m0 ; t6
274 subps m2, m2, m1 ; i3
276 addps m1, m1, Z(3) ; i1
279 subps m6, m7, m5 ; i2
280 addps m5, m5, m7 ; i0
284 INTERL m1, m3, m7, Z, 2
285 INTERL m2, m4, m0, Z2, 6
290 INTERL m5, m1, m3, Z, 0
291 INTERL m6, m2, m7, Z, 4
301 %define Z(x) [r0+mmsize*x]
302 %define Z2(x) [r0+mmsize*x]
303 %define ZH(x) [r0+mmsize*x+mmsize/2]
312 T8_AVX m0, m1, m2, m3, m4
326 T8_AVX m0, m1, m4, m5, m7
328 mova m4, [ps_cos16_1]
329 mova m5, [ps_cos16_2]
336 vblendps m2, m7, m3, 0xf0
337 vperm2f128 m3, m7, m3, 0x21
340 vperm2f128 m2, m2, m2, 0x01
345 vextractf128 Z(0), m0, 0
346 vextractf128 ZH(0), m1, 0
347 vextractf128 Z(1), m0, 1
348 vextractf128 ZH(1), m1, 1
349 vextractf128 Z(2), m5, 0
350 vextractf128 ZH(2), m3, 0
351 vextractf128 Z(3), m5, 1
352 vextractf128 ZH(3), m3, 1
367 T8_SSE m0, m1, m2, m3, m4, m6
368 ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
369 ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
371 vperm2f128 m4, m0, m2, 0x20
372 vperm2f128 m5, m1, m3, 0x20
373 vperm2f128 m6, m0, m2, 0x31
374 vperm2f128 m7, m1, m3, 0x31
376 PASS_SMALL 0, [cos_32], [cos_32+32]
380 fft32_interleave_avx:
388 vextractf128 Z(0), m0, 0
389 vextractf128 ZH(0), m1, 0
390 vextractf128 Z(1), m0, 1
391 vextractf128 ZH(1), m1, 1
400 %define movdqa movaps
419 T8_SSE m0, m1, m2, m3, m4, m5
433 T8_SSE m0, m1, m2, m3, m4, m5
444 PASS_SMALL 0, [cos_16], [cos_16+16]
451 T2_3DN m0, m1, Z(0), Z(1)
454 T4_3DN m0, m1, m2, m3, m4, m5
465 T2_3DN m0, m1, Z(0), Z(1)
468 T4_3DN m0, m1, m2, m3, m4, m5
471 T2_3DN m4, m5, Z(4), Z(5)
472 T2_3DN m6, m7, Z2(6), Z2(7)
481 T4_3DN m1, m3, m5, m7, m0, m2
486 T4_3DN m0, m2, m4, m6, m5, m7
520 %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
521 %define Z2(x) [zq + o3q + mmsize*(x&1)]
522 %define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
523 %define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2]
525 %macro DECL_PASS 2+ ; name, payload
528 DEFINE_ARGS z, w, n, o1, o3
541 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
542 lea r2, [dispatch_tab%1]
543 mov r2, [r2 + (%2q-2)*gprsize]
549 %endmacro ; FFT_DISPATCH
557 vextractf128 %4(%5), %2, 0
558 vextractf128 %4 %+ H(%5), %3, 0
559 vextractf128 %4(%5 + 1), %2, 1
560 vextractf128 %4 %+ H(%5 + 1), %3, 1
563 %define INTERL INTERL_AVX
565 DECL_PASS pass_avx, PASS_BIG 1
566 DECL_PASS pass_interleave_avx, PASS_BIG 0
568 cglobal fft_calc, 2,5,8
569 mov r3d, [r0 + FFTContext.nbits]
572 FFT_DISPATCH _interleave %+ SUFFIX, r1
587 %define INTERL INTERL_SSE
589 DECL_PASS pass_sse, PASS_BIG 1
590 DECL_PASS pass_interleave_sse, PASS_BIG 0
592 cglobal fft_calc, 2,5,8
593 mov r3d, [r0 + FFTContext.nbits]
598 FFT_DISPATCH _interleave %+ SUFFIX, r1
608 movaps xmm0, [r4 + r2]
610 unpcklps xmm0, [r4 + r2 + 16]
611 unpckhps xmm1, [r4 + r2 + 16]
612 movaps [r4 + r2], xmm0
613 movaps [r4 + r2 + 16], xmm1
619 cglobal fft_permute, 2,7,1
620 mov r4, [r0 + FFTContext.revtab]
621 mov r5, [r0 + FFTContext.tmpbuf]
622 mov ecx, [r0 + FFTContext.nbits]
630 movaps xmm0, [r1 + 8*r0]
631 movzx r6, word [r4 + 2*r0]
632 movzx r3, word [r4 + 2*r0 + 2]
633 movlps [r5 + 8*r6], xmm0
634 movhps [r5 + 8*r3], xmm0
642 ; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
644 movaps xmm0, [r5 + r2]
645 movaps xmm1, [r5 + r2 + 16]
646 movaps [r1 + r2], xmm0
647 movaps [r1 + r2 + 16], xmm1
652 cglobal imdct_calc, 3,5,3
653 mov r3d, [r0 + FFTContext.mdctsize]
654 mov r4, [r0 + FFTContext.imdcthalf]
677 movaps xmm2, [ps_m1m1m1m1]
679 movaps xmm0, [r1 + r3]
680 movaps xmm1, [r0 + r2]
681 shufps xmm0, xmm0, 0x1b
682 shufps xmm1, xmm1, 0x1b
684 movaps [r0 + r3], xmm1
685 movaps [r1 + r2], xmm0
695 %define unpcklps punpckldq
696 %define unpckhps punpckhdq
697 DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
698 DECL_PASS pass_interleave_3dnow, PASS_BIG 0
699 %define pass_3dnow2 pass_3dnow
700 %define pass_interleave_3dnow2 pass_interleave_3dnow
703 %define SECTION_REL - $$
708 %macro DECL_FFT 1-2 ; nbits, suffix
710 %xdefine fullsuffix SUFFIX
712 %xdefine fullsuffix %2 %+ SUFFIX
714 %xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
716 %xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
719 %xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
726 %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
729 fft %+ n %+ fullsuffix:
730 call fft %+ n2 %+ SUFFIX
731 add r0, n*4 - (n&(-2<<%1))
732 call fft %+ n4 %+ SUFFIX
733 add r0, n*2 - (n2&(-2<<%1))
734 call fft %+ n4 %+ SUFFIX
735 sub r0, n*6 + (n2&(-2<<%1))
738 jmp pass %+ fullsuffix
745 dispatch_tab %+ fullsuffix: pointer list_of_fft
749 ; On x86_32, this function does the register saving and restoring for all of fft.
750 ; The others pass args in registers and don't spill anything.
751 cglobal fft_dispatch%2, 2,5,8, z, nbits
752 FFT_DISPATCH fullsuffix, nbits
762 DECL_FFT 6, _interleave
766 DECL_FFT 5, _interleave
769 DECL_FFT 4, _interleave
772 DECL_FFT 4, _interleave
781 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
782 movaps xmm0, [%3+%2*4]
783 movaps xmm1, [%3+%1*4-0x10]
785 shufps xmm0, xmm1, 0x88
786 shufps xmm1, xmm2, 0x77
787 movlps xmm4, [%4+%2*2]
788 movlps xmm5, [%5+%2*2+0x0]
789 movhps xmm4, [%4+%1*2-0x8]
790 movhps xmm5, [%5+%1*2-0x8]
804 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
805 mulps m6, %3, [%5+%1]
806 mulps m7, %2, [%5+%1]
807 mulps %2, %2, [%6+%1]
808 mulps %3, %3, [%6+%1]
813 %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
815 vmovaps ymm1, [%3+%1*2]
816 vmovaps ymm0, [%3+%1*2+0x20]
817 vmovaps ymm3, [%3+%2*2]
818 vmovaps ymm2, [%3+%2*2+0x20]
820 CMUL %1, ymm0, ymm1, %3, %4, %5
821 CMUL %2, ymm2, ymm3, %3, %4, %5
822 vshufps ymm1, ymm1, ymm1, 0x1b
823 vshufps ymm3, ymm3, ymm3, 0x1b
824 vperm2f128 ymm1, ymm1, ymm1, 0x01
825 vperm2f128 ymm3, ymm3, ymm3, 0x01
826 vunpcklps ymm6, ymm2, ymm1
827 vunpckhps ymm4, ymm2, ymm1
828 vunpcklps ymm7, ymm0, ymm3
829 vunpckhps ymm5, ymm0, ymm3
831 vextractf128 [%3+%1*2], ymm7, 0
832 vextractf128 [%3+%1*2+0x10], ymm5, 0
833 vextractf128 [%3+%1*2+0x20], ymm7, 1
834 vextractf128 [%3+%1*2+0x30], ymm5, 1
836 vextractf128 [%3+%2*2], ymm6, 0
837 vextractf128 [%3+%2*2+0x10], ymm4, 0
838 vextractf128 [%3+%2*2+0x20], ymm6, 1
839 vextractf128 [%3+%2*2+0x30], ymm4, 1
845 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
847 movaps xmm1, [%3+%1*2]
848 movaps xmm0, [%3+%1*2+0x10]
849 CMUL %1, xmm0, xmm1, %3, %4, %5
850 movaps xmm5, [%3+%2*2]
851 movaps xmm4, [%3+%2*2+0x10]
852 CMUL %2, xmm4, xmm5, %3, %4, %5
853 shufps xmm1, xmm1, 0x1b
854 shufps xmm5, xmm5, 0x1b
861 movaps [%3+%2*2], xmm6
862 movaps [%3+%2*2+0x10], xmm4
863 movaps [%3+%1*2], xmm0
864 movaps [%3+%1*2+0x10], xmm2
871 cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
881 mov r3d, [r0+FFTContext.mdctsize]
884 mov rtcos, [r0+FFTContext.tcos]
885 mov rtsin, [r0+FFTContext.tsin]
893 mov rrevtab, [r0+FFTContext.revtab]
913 PREROTATER r4, r3, r2, rtcos, rtsin
915 movzx r5, word [rrevtab+r4-4]
916 movzx r6, word [rrevtab+r4-2]
917 movzx r10, word [rrevtab+r3]
918 movzx r11, word [rrevtab+r3+2]
919 movlps [r1+r5 *8], xmm0
920 movhps [r1+r6 *8], xmm0
921 movlps [r1+r10*8], xmm1
922 movhps [r1+r11*8], xmm1
926 movzx r5, word [r6+r4-4]
927 movzx r4, word [r6+r4-2]
928 movlps [r1+r5*8], xmm0
929 movhps [r1+r4*8], xmm0
930 movzx r5, word [r6+r3]
931 movzx r4, word [r6+r3+2]
932 movlps [r1+r5*8], xmm1
933 movhps [r1+r4*8], xmm1
941 mov r1d, [r5+FFTContext.nbits]
943 FFT_DISPATCH SUFFIX, r1
945 mov r0d, [r5+FFTContext.mdctsize]
957 %1 r0, r1, r6, rtcos, rtsin
967 DECL_IMDCT POSROTATESHUF
972 DECL_IMDCT POSROTATESHUF_AVX