1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt
4 ;* Copyright (c) 2011 Vitor Sessak
6 ;* This algorithm (though not any of the implementation details) is
7 ;* based on libdjbfft by D. J. Bernstein.
9 ;* This file is part of FFmpeg.
11 ;* FFmpeg is free software; you can redistribute it and/or
12 ;* modify it under the terms of the GNU Lesser General Public
13 ;* License as published by the Free Software Foundation; either
14 ;* version 2.1 of the License, or (at your option) any later version.
16 ;* FFmpeg is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 ;* Lesser General Public License for more details.
21 ;* You should have received a copy of the GNU Lesser General Public
22 ;* License along with FFmpeg; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 ;******************************************************************************
26 ; These functions are not individually interchangeable with the C versions.
27 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
28 ; in blocks as convenient to the vector size.
29 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
31 %include "libavutil/x86/x86util.asm"
; Constant tables shared by the SSE/AVX/3DNow FFT kernels.
; Trig constants: sqrt(1/2), cos(pi/8), cos(3*pi/8).
56 %define M_SQRT1_2 0.70710678118654752440
57 %define M_COS_PI_1_8 0.923879532511287
58 %define M_COS_PI_3_8 0.38268343236509
; Twiddle factors for the size-16 pass, duplicated to fill a 256-bit lane.
60 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
61 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
63 ps_root2: times 8 dd M_SQRT1_2
64 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; 1<<31 entries are IEEE-754 sign-bit masks: xorps/pxor against these tables
; negates the corresponding float lanes (name encodes the pattern, p1=+1, m1=-1).
65 ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
; Lane permutations consumed by vpermilps in T8_AVX.
67 perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
68 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
69 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
70 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
; T2_3DNOW: radix-2 butterfly on a register pair, with memory operands
; (body partially elided in this excerpt).
95 %macro T2_3DNOW 4 ; z0, z1, mem0, mem1
; T4_3DNOW: radix-4 butterfly using 3DNow! packed-float add/sub.
; Per the inline result comments, the four complex outputs land in
; %1={r0,i0}, %6={r2,i2}, %2={r1,i1}, %4={r3,i3}.
102 %macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
105 pfadd %5, %4 ; {t6,t5}
; sign-flip of one lane via pxor with a sign-bit mask
; (NOTE(review): ps_m1p1 is declared outside this excerpt — confirm)
106 pxor %3, [ps_m1p1] ; {t8,t7}
110 pfadd %1, %5 ; {r0,i0}
111 pfsub %6, %5 ; {r2,i2}
113 pfadd %2, %3 ; {r1,i1}
114 pfsub %4, %3 ; {r3,i3}
; T8_AVX: one 8-point FFT stage on a pair of 256-bit registers
; (macro header elided in this excerpt).
118 ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
119 ; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
121 ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
122 ; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
124 vsubps %5, %1, %2 ; v = %1 - %2
125 vaddps %3, %1, %2 ; w = %1 + %2
126 vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
127 vpermilps %2, %2, [perm1]
128 vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
129 vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
130 vsubps %4, %5, %1 ; s = r - q
131 vaddps %1, %5, %1 ; u = r + q
132 vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
133 vshufps %5, %4, %1, 0xbb
134 vshufps %3, %4, %1, 0xee
135 vperm2f128 %3, %3, %5, 0x13
; sign pattern below corrected to match the mask data of ps_m1m1p1m1p1m1m1m1
; (lanes 0,1,3,5,6,7 are negated)
136 vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {-1,-1,1,-1,1,-1,-1,-1}
137 vshufps %2, %1, %4, 0xdd
138 vshufps %1, %1, %4, 0x88
139 vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
140 vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
142 vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
143 vsubps %2, %4, %1 ; %2 = v - w
144 vaddps %1, %4, %1 ; %1 = v + w
; T4_SSE: radix-4 butterfly (macro header elided in this excerpt).
147 ; In SSE mode do one fft4 transform
148 ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
149 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
151 ; In AVX mode do two fft4 transforms
152 ; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
153 ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
155 subps %3, %1, %2 ; {t3,t4,-t8,t7}
; xorps with ps_p1p1m1p1 flips the sign of lane 2 (-t8 -> t8)
156 addps %1, %1, %2 ; {t1,t2,t6,t5}
157 xorps %3, %3, [ps_p1p1m1p1]
158 shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
159 shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
160 subps %3, %1, %2 ; {r2,i2,r3,i3}
161 addps %1, %1, %2 ; {r0,i0,r1,i1}
; final shuffles de-interleave real and imaginary parts
162 shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
163 shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
; T8_SSE: radix-8 butterfly (macro header elided in this excerpt).
166 ; In SSE mode do one FFT8
167 ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out-comment corrected: per lines below, r4..r7 land in %3 and i4..i7 in %4
168 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
170 ; In AVX mode do two FFT8
171 ; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
172 ; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
173 ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
174 ; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
176 addps %6, %3, %4 ; {t1,t2,t3,t4}
177 subps %3, %3, %4 ; {r5,i5,r7,i7}
178 shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
; multiply the odd terms by sqrt(1/2) with per-lane signs from ps_root2mppm
179 mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
180 mulps %4, %4, [ps_root2]
181 addps %3, %3, %4 ; {t8,t7,ta,t9}
182 shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
183 shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
184 subps %3, %6, %4 ; {t6,t5,tc,tb}
185 addps %6, %6, %4 ; {t1,t2,t9,ta}
186 shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
187 shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
; combine with the even half already held in %1 (reals) and %2 (imaginaries)
188 subps %3, %1, %6 ; {r4,r5,r6,r7}
189 addps %1, %1, %6 ; {r0,r1,r2,r3}
190 subps %4, %2, %5 ; {i4,i5,i6,i7}
191 addps %2, %2, %5 ; {i0,i1,i2,i3}
; Interleave/store helper (AVX variant; macro header elided in this excerpt).
; Splits the 256-bit registers %2/%3 into 128-bit halves and stores them to
; consecutive output slots; %4 is presumably the Z/Z2 addressing macro and
; %5 the slot index — NOTE(review): confirm against the full macro definition.
198 vextractf128 %4(%5), %2, 0
199 vextractf128 %4 %+ H(%5), %3, 0
200 vextractf128 %4(%5 + 1), %2, 1
201 vextractf128 %4 %+ H(%5 + 1), %3, 1
211 ; scheduled for cpu-bound sizes
212 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
; Complex multiplication of the two upper butterfly inputs by the twiddle
; factor (wre, wim), expanded as 4 mulps + add/sub (pre-FMA code):
217 mulps m2, m4, m0 ; r2*wre
219 mulps m3, m5, m1 ; i2*wim
221 mulps m4, m4, m1 ; r2*wim
222 mulps m5, m5, m0 ; i2*wre
223 addps m2, m2, m3 ; r2*wre + i2*wim
224 mulps m3, m1, m7 ; i3*wim
225 subps m5, m5, m4 ; i2*wre - r2*wim
226 mulps m1, m1, m6 ; r3*wim
227 mulps m4, m0, m6 ; r3*wre
228 mulps m0, m0, m7 ; i3*wre
229 subps m4, m4, m3 ; r3*wre - i3*wim
231 addps m0, m0, m1 ; i3*wre + r3*wim
; Combine into the radix-4 outputs; Z(n) are the in-place operand slots.
; NOTE(review): interleaved load/store lines are elided in this excerpt.
232 subps m1, m4, m2 ; t3
233 addps m4, m4, m2 ; t5
234 subps m3, m3, m4 ; r2
235 addps m4, m4, Z(0) ; r0
239 subps m3, m5, m0 ; t4
240 subps m4, m6, m3 ; r3
241 addps m3, m3, m6 ; r1
245 addps m3, m5, m0 ; t6
246 subps m2, m2, m1 ; i3
248 addps m1, m1, Z(3) ; i1
251 subps m4, m7, m3 ; i2
252 addps m3, m3, m7 ; i0
257 ; scheduled to avoid store->load aliasing
258 %macro PASS_BIG 1 ; (!interleave)
; One butterfly pass for large transforms; same complex-multiply structure
; as PASS_SMALL, with twiddles streamed from [wq]/[wq+o1q].
262 mova m1, [wq+o1q] ; wim
263 mulps m2, m4, m0 ; r2*wre
265 mulps m3, m5, m1 ; i2*wim
267 mulps m4, m4, m1 ; r2*wim
268 mulps m5, m5, m0 ; i2*wre
269 addps m2, m2, m3 ; r2*wre + i2*wim
270 mulps m3, m1, m7 ; i3*wim
271 mulps m1, m1, m6 ; r3*wim
272 subps m5, m5, m4 ; i2*wre - r2*wim
273 mulps m4, m0, m6 ; r3*wre
274 mulps m0, m0, m7 ; i3*wre
275 subps m4, m4, m3 ; r3*wre - i3*wim
277 addps m0, m0, m1 ; i3*wre + r3*wim
278 subps m1, m4, m2 ; t3
279 addps m4, m4, m2 ; t5
280 subps m3, m3, m4 ; r2
281 addps m4, m4, Z(0) ; r0
285 subps m3, m5, m0 ; t4
286 subps m4, m6, m3 ; r3
287 addps m3, m3, m6 ; r1
291 addps m5, m5, m0 ; t6
292 subps m2, m2, m1 ; i3
294 addps m1, m1, Z(3) ; i1
297 subps m6, m7, m5 ; i2
298 addps m5, m5, m7 ; i0
; %1 (!interleave) selects how results are written back through INTERL
; (NOTE(review): the surrounding %if on %1 is elided in this excerpt).
302 INTERL m1, m3, m7, Z, 2
303 INTERL m2, m4, m0, Z2, 6
308 INTERL m5, m1, m3, Z, 0
309 INTERL m6, m2, m7, Z, 4
; Addressing macros for the small in-place kernels: the work buffer is
; reached through r0, in units of one vector register (mmsize bytes).
319 %define Z(x) [r0+mmsize*x]
320 %define Z2(x) [r0+mmsize*x]
321 %define ZH(x) [r0+mmsize*x+mmsize/2]
; AVX fft8/fft16/fft32 kernel bodies (labels and several loads are elided
; in this excerpt).
325 %if HAVE_AVX_EXTERNAL
330 T8_AVX m0, m1, m2, m3, m4
344 T8_AVX m0, m1, m4, m5, m7
; apply the size-16 twiddle tables
346 mova m4, [ps_cos16_1]
347 mova m5, [ps_cos16_2]
354 vblendps m2, m7, m3, 0xf0
355 vperm2f128 m3, m7, m3, 0x21
358 vperm2f128 m2, m2, m2, 0x01
; write back the 256-bit results as 128-bit halves into the Z/ZH slots
363 vextractf128 Z(0), m0, 0
364 vextractf128 ZH(0), m1, 0
365 vextractf128 Z(1), m0, 1
366 vextractf128 ZH(1), m1, 1
367 vextractf128 Z(2), m5, 0
368 vextractf128 ZH(2), m3, 0
369 vextractf128 Z(3), m5, 1
370 vextractf128 ZH(3), m3, 1
385 T8_SSE m0, m1, m2, m3, m4, m6
386 ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
387 ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
; regroup low/high 128-bit lanes before the final size-32 pass
389 vperm2f128 m4, m0, m2, 0x20
390 vperm2f128 m5, m1, m3, 0x20
391 vperm2f128 m6, m0, m2, 0x31
392 vperm2f128 m7, m1, m3, 0x31
394 PASS_SMALL 0, [cos_32], [cos_32+32]
398 fft32_interleave_avx:
406 vextractf128 Z(0), m0, 0
407 vextractf128 ZH(0), m1, 0
408 vextractf128 Z(1), m0, 1
409 vextractf128 ZH(1), m1, 1
; 3DNow! fft4/fft8 kernel bodies built from the T2/T4 butterfly macros
; (labels and interleaving code are elided in this excerpt).
468 T2_3DNOW m0, m1, Z(0), Z(1)
471 T4_3DNOW m0, m1, m2, m3, m4, m5
482 T2_3DNOW m0, m1, Z(0), Z(1)
485 T4_3DNOW m0, m1, m2, m3, m4, m5
488 T2_3DNOW m4, m5, Z(4), Z(5)
489 T2_3DNOW m6, m7, Z2(6), Z2(7)
498 T4_3DNOW m1, m3, m5, m7, m0, m2
503 T4_3DNOW m0, m2, m4, m6, m5, m7
; Addressing macros for the big passes: zcq is the buffer base, o1q/o3q are
; precomputed offsets; x&6 selects the pair, x&1 the vector within the pair.
527 %define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
528 %define Z2(x) [zcq + o3q + mmsize*(x&1)]
529 %define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
530 %define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
; DECL_PASS: declares a named pass function whose loop body is the given
; payload macro invocation (loop code elided in this excerpt).
532 %macro DECL_PASS 2+ ; name, payload
535 DEFINE_ARGS zc, w, n, o1, o3
; FFT_DISPATCH: indirect call through a per-suffix table indexed by
; nbits-2 (table entries are section-relative pointers, see SECTION_REL).
548 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
549 lea r2, [dispatch_tab%1]
550 mov r2, [r2 + (%2q-2)*gprsize]
556 %endmacro ; FFT_DISPATCH
; Instantiate the big passes and the public fft_calc entry point per ISA.
560 %if HAVE_AVX_EXTERNAL
561 DECL_PASS pass_avx, PASS_BIG 1
562 DECL_PASS pass_interleave_avx, PASS_BIG 0
564 cglobal fft_calc, 2,5,8
565 mov r3d, [r0 + FFTContext.nbits]
568 FFT_DISPATCH _interleave %+ SUFFIX, r1
575 DECL_PASS pass_sse, PASS_BIG 1
576 DECL_PASS pass_interleave_sse, PASS_BIG 0
; FFT_CALC_FUNC: emits fft_calc(FFTContext *s, FFTComplex *z) for the
; current instruction set (control flow partially elided in this excerpt).
578 %macro FFT_CALC_FUNC 0
579 cglobal fft_calc, 2,5,8
580 mov r3d, [r0 + FFTContext.nbits]
585 FFT_DISPATCH _interleave %+ SUFFIX, r1
588 cmp rcx, 3+(mmsize/16)
; fixup loop: re-pair real/imaginary data after the block-format transform
596 PSWAPD m0, [r4 + r2 + 4]
597 mova [r4 + r2 + 4], m0
599 movaps xmm0, [r4 + r2]
601 unpcklps xmm0, [r4 + r2 + 16]
602 unpckhps xmm1, [r4 + r2 + 16]
603 movaps [r4 + r2], xmm0
604 movaps [r4 + r2 + 16], xmm1
; fft_permute(FFTContext *s, FFTComplex *z): scatter z through revtab into
; tmpbuf, then copy the permuted data back over z.
626 cglobal fft_permute, 2,7,1
627 mov r4, [r0 + FFTContext.revtab]
628 mov r5, [r0 + FFTContext.tmpbuf]
629 mov ecx, [r0 + FFTContext.nbits]
; two FFTComplex (2*8 bytes) per iteration, scattered via two revtab entries
637 movaps xmm0, [r1 + 8*r0]
638 movzx r6, word [r4 + 2*r0]
639 movzx r3, word [r4 + 2*r0 + 2]
640 movlps [r5 + 8*r6], xmm0
641 movhps [r5 + 8*r3], xmm0
649 ; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
651 movaps xmm0, [r5 + r2]
652 movaps xmm1, [r5 + r2 + 16]
653 movaps [r1 + r2], xmm0
654 movaps [r1 + r2 + 16], xmm1
; IMDCT_CALC_FUNC: emits imdct_calc, wrapping imdcthalf plus mirroring
; (body mostly elided in this excerpt).
659 %macro IMDCT_CALC_FUNC 0
660 cglobal imdct_calc, 3,5,3
661 mov r3d, [r0 + FFTContext.mdctsize]
662 mov r4, [r0 + FFTContext.imdcthalf]
671 sub rsp, 8+32*WIN64 ; allocate win64 shadow space
; 3DNow! builds: map the SSE unpack mnemonics onto their MMX integer
; equivalents so the shared pass macros assemble for 64-bit registers.
726 %define unpcklps punpckldq
727 %define unpckhps punpckhdq
728 DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
729 DECL_PASS pass_interleave_3dnow, PASS_BIG 0
730 %define pass_3dnowext pass_3dnow
731 %define pass_interleave_3dnowext pass_interleave_3dnow
; SECTION_REL makes dispatch-table entries section-relative (subtract the
; section start $$) so the tables are position-independent.
735 %define SECTION_REL - $$
; DECL_FFT nbits[, suffix]: builds fft(2^nbits) recursively as one half-size
; and two quarter-size transforms followed by a combining pass, and emits the
; dispatch table used by FFT_DISPATCH.
740 %macro DECL_FFT 1-2 ; nbits, suffix
742 %xdefine fullsuffix SUFFIX
744 %xdefine fullsuffix %2 %+ SUFFIX
746 %xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
748 %xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
751 %xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
758 %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
761 fft %+ n %+ fullsuffix:
762 call fft %+ n2 %+ SUFFIX
; r0 adjustments step the in-place buffer between the sub-transforms;
; the (n&(-2<<%1)) terms compensate for the callee's own pointer advance
763 add r0, n*4 - (n&(-2<<%1))
764 call fft %+ n4 %+ SUFFIX
765 add r0, n*2 - (n2&(-2<<%1))
766 call fft %+ n4 %+ SUFFIX
767 sub r0, n*6 + (n2&(-2<<%1))
; tail-jump into the combining pass (no ret: the pass returns to our caller)
770 jmp pass %+ fullsuffix
777 dispatch_tab %+ fullsuffix: pointer list_of_fft
; Instantiations per ISA; the _interleave variants emit interleaved output.
780 %if HAVE_AVX_EXTERNAL
783 DECL_FFT 6, _interleave
787 DECL_FFT 5, _interleave
791 DECL_FFT 4, _interleave
794 DECL_FFT 4, _interleave
; PREROTATER: IMDCT pre-rotation — load mirrored input samples and the
; tcos/tsin twiddles for index j and n4-j-1 (complex multiply itself is
; elided in this excerpt).
804 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
805 %if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
811 movd m1, [%4+%1*2-4] ; tcos[j]
812 movd m3, [%4+%2*2] ; tcos[n4-j-1]
813 punpckldq m1, [%5+%1*2-4] ; tsin[j]
814 punpckldq m3, [%5+%2*2] ; tsin[n4-j-1]
824 %if cpuflag(3dnowext)
828 SBUTTERFLY dq, 0, 4, 1
829 SBUTTERFLY dq, 2, 6, 3
; SSE path: gather 4 mirrored complex inputs and 4 twiddle pairs per step
836 movaps xmm0, [%3+%2*4]
837 movaps xmm1, [%3+%1*4-0x10]
839 shufps xmm0, xmm1, 0x88
840 shufps xmm1, xmm2, 0x77
841 movlps xmm4, [%4+%2*2]
842 movlps xmm5, [%5+%2*2+0x0]
843 movhps xmm4, [%4+%1*2-0x8]
844 movhps xmm5, [%5+%1*2-0x8]
; CMUL: packed complex multiply of (%2,%3) by the tcos/tsin tables at
; offset %1; m6/m7 hold the cross products (combine step elided here).
859 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
861 mulps m6, %3, [%5+%1]
862 mulps m7, %2, [%5+%1]
863 mulps %2, %2, [%6+%1]
864 mulps %3, %3, [%6+%1]
; POSROTATESHUF: IMDCT post-rotation — complex-multiply the FFT output by
; tcos/tsin, then write it back mirrored (front half stored forward, back
; half reversed). AVX, SSE and MMX paths follow; the %if selectors are
; elided in this excerpt.
881 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
884 vmovaps ymm1, [%3+%1*2]
885 vmovaps ymm0, [%3+%1*2+0x20]
886 vmovaps ymm3, [%3+%2*2]
887 vmovaps ymm2, [%3+%2*2+0x20]
889 CMUL %1, ymm0, ymm1, %3, %4, %5
890 CMUL %2, ymm2, ymm3, %3, %4, %5
; reverse element order within each lane (0x1b) and swap the lanes (0x01)
891 vshufps ymm1, ymm1, ymm1, 0x1b
892 vshufps ymm3, ymm3, ymm3, 0x1b
893 vperm2f128 ymm1, ymm1, ymm1, 0x01
894 vperm2f128 ymm3, ymm3, ymm3, 0x01
895 vunpcklps ymm6, ymm2, ymm1
896 vunpckhps ymm4, ymm2, ymm1
897 vunpcklps ymm7, ymm0, ymm3
898 vunpckhps ymm5, ymm0, ymm3
900 vextractf128 [%3+%1*2], ymm7, 0
901 vextractf128 [%3+%1*2+0x10], ymm5, 0
902 vextractf128 [%3+%1*2+0x20], ymm7, 1
903 vextractf128 [%3+%1*2+0x30], ymm5, 1
905 vextractf128 [%3+%2*2], ymm6, 0
906 vextractf128 [%3+%2*2+0x10], ymm4, 0
907 vextractf128 [%3+%2*2+0x20], ymm6, 1
908 vextractf128 [%3+%2*2+0x30], ymm4, 1
; SSE path: same rotate-and-mirror, 4 complex values per iteration
913 movaps xmm1, [%3+%1*2]
914 movaps xmm0, [%3+%1*2+0x10]
915 CMUL %1, xmm0, xmm1, %3, %4, %5
916 movaps xmm5, [%3+%2*2]
917 movaps xmm4, [%3+%2*2+0x10]
918 CMUL %2, xmm4, xmm5, %3, %4, %5
919 shufps xmm1, xmm1, 0x1b
920 shufps xmm5, xmm5, 0x1b
927 movaps [%3+%2*2], xmm6
928 movaps [%3+%2*2+0x10], xmm4
929 movaps [%3+%1*2], xmm0
930 movaps [%3+%1*2+0x10], xmm2
; MMX/3DNow path: scalar movd stores scatter the rotated values mirrored
935 CMUL %3, %1, m0, m1, %4, %5
936 CMUL %3, %2, m2, m3, %4, %5
937 movd [%3+%1*2+ 0], m0
938 movd [%3+%2*2+12], m1
939 movd [%3+%2*2+ 0], m2
940 movd [%3+%1*2+12], m3
945 movd [%3+%1*2+ 8], m0
946 movd [%3+%2*2+ 4], m1
947 movd [%3+%2*2+ 8], m2
948 movd [%3+%1*2+ 4], m3
; imdct_half: pre-rotate the input (PREROTATER) while permuting through
; revtab, run the in-place FFT (FFT_DISPATCH), then post-rotate and mirror
; the output (POSROTATESHUF). Loop structure partially elided here.
956 cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
966 mov r3d, [r0+FFTContext.mdctsize]
969 mov rtcos, [r0+FFTContext.tcos]
970 mov rtsin, [r0+FFTContext.tsin]
978 mov rrevtab, [r0+FFTContext.revtab]
979 ; NOTE(review): x86-32 path spills rrevtab to the stack (reloaded from
; [esp] below); x86-64 keeps it in a register
989 %if ARCH_X86_64 || mmsize == 8
993 %if notcpuflag(3dnowext) && mmsize == 8
1007 PREROTATER r4, r3, r2, rtcos, rtsin
1009 mov r6, [esp] ; rrevtab = ptr+n8
1010 movzx r5, word [rrevtab+r4-2] ; rrevtab[j]
1011 movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1]
; 64-bit scatter: four 16-bit revtab indices address FFTComplex (8-byte) slots
1018 movzx r5, word [rrevtab+r4-4]
1019 movzx r6, word [rrevtab+r4-2]
1020 movzx r10, word [rrevtab+r3]
1021 movzx r11, word [rrevtab+r3+2]
1022 movlps [r1+r5 *8], xmm0
1023 movhps [r1+r6 *8], xmm0
1024 movlps [r1+r10*8], xmm1
1025 movhps [r1+r11*8], xmm1
; 32-bit scatter: revtab pointer held in r6, indices reuse r4/r5
1029 movzx r5, word [r6+r4-4]
1030 movzx r4, word [r6+r4-2]
1031 movlps [r1+r5*8], xmm0
1032 movhps [r1+r4*8], xmm0
1033 movzx r5, word [r6+r3]
1034 movzx r4, word [r6+r3+2]
1035 movlps [r1+r5*8], xmm1
1036 movhps [r1+r4*8], xmm1
; run the forward FFT over the permuted buffer, then post-rotate in place
1045 mov r1d, [r5+FFTContext.nbits]
1047 FFT_DISPATCH SUFFIX, r1
1049 mov r0d, [r5+FFTContext.mdctsize]
1052 %if ARCH_X86_64 == 0
1061 POSROTATESHUF r0, r1, r6, rtcos, rtsin
1062 %if ARCH_X86_64 == 0
1083 %if HAVE_AVX_EXTERNAL