1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt
4 ;* Copyright (c) 2011 Vitor Sessak
6 ;* This algorithm (though not any of the implementation details) is
7 ;* based on libdjbfft by D. J. Bernstein.
9 ;* This file is part of Libav.
11 ;* Libav is free software; you can redistribute it and/or
12 ;* modify it under the terms of the GNU Lesser General Public
13 ;* License as published by the Free Software Foundation; either
14 ;* version 2.1 of the License, or (at your option) any later version.
16 ;* Libav is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 ;* Lesser General Public License for more details.
21 ;* You should have received a copy of the GNU Lesser General Public
22 ;* License along with Libav; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 ;******************************************************************************
26 ; These functions are not individually interchangeable with the C versions.
27 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
28 ; in blocks as convenient to the vector size.
29 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
31 %include "libavutil/x86/x86util.asm"
56 %define M_SQRT1_2 0.70710678118654752440
57 %define M_COS_PI_1_8 0.923879532511287
58 %define M_COS_PI_3_8 0.38268343236509
61 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
62 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
64 ps_root2: times 8 dd M_SQRT1_2
65 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
66 ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
68 perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
69 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
70 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
71 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
72 ps_m1m1m1m1: times 4 dd 1<<31
95 %macro T2_3DNOW 4 ; z0, z1, mem0, mem1
102 %macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
105 pfadd %5, %4 ; {t6,t5}
106 pxor %3, [ps_m1p1] ; {t8,t7}
110 pfadd %1, %5 ; {r0,i0}
111 pfsub %6, %5 ; {r2,i2}
113 pfadd %2, %3 ; {r1,i1}
114 pfsub %4, %3 ; {r3,i3}
118 ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
119 ; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
121 ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
122 ; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
124 vsubps %5, %1, %2 ; v = %1 - %2
125 vaddps %3, %1, %2 ; w = %1 + %2
126 vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
127 vpermilps %2, %2, [perm1]
128 vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
129 vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
130 vsubps %4, %5, %1 ; s = r - q
131 vaddps %1, %5, %1 ; u = r + q
132 vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
133 vshufps %5, %4, %1, 0xbb
134 vshufps %3, %4, %1, 0xee
135 vperm2f128 %3, %3, %5, 0x13
136 vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; flip sign bits: s *= {-1,-1,1,-1,1,-1,-1,-1}
137 vshufps %2, %1, %4, 0xdd
138 vshufps %1, %1, %4, 0x88
139 vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
140 vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
142 vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
143 vsubps %2, %4, %1 ; %2 = v - w
144 vaddps %1, %4, %1 ; %1 = v + w
147 ; In SSE mode do one fft4 transform
148 ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
149 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
151 ; In AVX mode do two fft4 transforms
152 ; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
153 ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
155 subps %3, %1, %2 ; {t3,t4,-t8,t7}
156 addps %1, %1, %2 ; {t1,t2,t6,t5}
157 xorps %3, %3, [ps_p1p1m1p1]
158 shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
159 shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
160 subps %3, %1, %2 ; {r2,i2,r3,i3}
161 addps %1, %1, %2 ; {r0,i0,r1,i1}
162 shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
163 shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
166 ; In SSE mode do one FFT8
167 ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
168 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
170 ; In AVX mode do two FFT8
171 ; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
172 ; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
173 ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
174 ; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
176 addps %6, %3, %4 ; {t1,t2,t3,t4}
177 subps %3, %3, %4 ; {r5,i5,r7,i7}
178 shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
179 mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
180 mulps %4, %4, [ps_root2]
181 addps %3, %3, %4 ; {t8,t7,ta,t9}
182 shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
183 shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
184 subps %3, %6, %4 ; {t6,t5,tc,tb}
185 addps %6, %6, %4 ; {t1,t2,t9,ta}
186 shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
187 shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
188 subps %3, %1, %6 ; {r4,r5,r6,r7}
189 addps %1, %1, %6 ; {r0,r1,r2,r3}
190 subps %4, %2, %5 ; {i4,i5,i6,i7}
191 addps %2, %2, %5 ; {i0,i1,i2,i3}
194 ; scheduled for cpu-bound sizes
195 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
200 mulps m2, m4, m0 ; r2*wre
202 mulps m3, m5, m1 ; i2*wim
204 mulps m4, m4, m1 ; r2*wim
205 mulps m5, m5, m0 ; i2*wre
206 addps m2, m2, m3 ; r2*wre + i2*wim
207 mulps m3, m1, m7 ; i3*wim
208 subps m5, m5, m4 ; i2*wre - r2*wim
209 mulps m1, m1, m6 ; r3*wim
210 mulps m4, m0, m6 ; r3*wre
211 mulps m0, m0, m7 ; i3*wre
212 subps m4, m4, m3 ; r3*wre - i3*wim
214 addps m0, m0, m1 ; i3*wre + r3*wim
215 subps m1, m4, m2 ; t3
216 addps m4, m4, m2 ; t5
217 subps m3, m3, m4 ; r2
218 addps m4, m4, Z(0) ; r0
222 subps m3, m5, m0 ; t4
223 subps m4, m6, m3 ; r3
224 addps m3, m3, m6 ; r1
228 addps m3, m5, m0 ; t6
229 subps m2, m2, m1 ; i3
231 addps m1, m1, Z(3) ; i1
234 subps m4, m7, m3 ; i2
235 addps m3, m3, m7 ; i0
240 ; scheduled to avoid store->load aliasing
241 %macro PASS_BIG 1 ; (!interleave)
245 mova m1, [wq+o1q] ; wim
246 mulps m2, m4, m0 ; r2*wre
248 mulps m3, m5, m1 ; i2*wim
250 mulps m4, m4, m1 ; r2*wim
251 mulps m5, m5, m0 ; i2*wre
252 addps m2, m2, m3 ; r2*wre + i2*wim
253 mulps m3, m1, m7 ; i3*wim
254 mulps m1, m1, m6 ; r3*wim
255 subps m5, m5, m4 ; i2*wre - r2*wim
256 mulps m4, m0, m6 ; r3*wre
257 mulps m0, m0, m7 ; i3*wre
258 subps m4, m4, m3 ; r3*wre - i3*wim
260 addps m0, m0, m1 ; i3*wre + r3*wim
261 subps m1, m4, m2 ; t3
262 addps m4, m4, m2 ; t5
263 subps m3, m3, m4 ; r2
264 addps m4, m4, Z(0) ; r0
268 subps m3, m5, m0 ; t4
269 subps m4, m6, m3 ; r3
270 addps m3, m3, m6 ; r1
274 addps m5, m5, m0 ; t6
275 subps m2, m2, m1 ; i3
277 addps m1, m1, Z(3) ; i1
280 subps m6, m7, m5 ; i2
281 addps m5, m5, m7 ; i0
285 INTERL m1, m3, m7, Z, 2
286 INTERL m2, m4, m0, Z2, 6
291 INTERL m5, m1, m3, Z, 0
292 INTERL m6, m2, m7, Z, 4
302 %define Z(x) [r0+mmsize*x]
303 %define Z2(x) [r0+mmsize*x]
304 %define ZH(x) [r0+mmsize*x+mmsize/2]
312 T8_AVX m0, m1, m2, m3, m4
326 T8_AVX m0, m1, m4, m5, m7
328 mova m4, [ps_cos16_1]
329 mova m5, [ps_cos16_2]
336 vblendps m2, m7, m3, 0xf0
337 vperm2f128 m3, m7, m3, 0x21
340 vperm2f128 m2, m2, m2, 0x01
345 vextractf128 Z(0), m0, 0
346 vextractf128 ZH(0), m1, 0
347 vextractf128 Z(1), m0, 1
348 vextractf128 ZH(1), m1, 1
349 vextractf128 Z(2), m5, 0
350 vextractf128 ZH(2), m3, 0
351 vextractf128 Z(3), m5, 1
352 vextractf128 ZH(3), m3, 1
367 T8_SSE m0, m1, m2, m3, m4, m6
368 ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
369 ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
371 vperm2f128 m4, m0, m2, 0x20
372 vperm2f128 m5, m1, m3, 0x20
373 vperm2f128 m6, m0, m2, 0x31
374 vperm2f128 m7, m1, m3, 0x31
376 PASS_SMALL 0, [cos_32], [cos_32+32]
380 fft32_interleave_avx:
388 vextractf128 Z(0), m0, 0
389 vextractf128 ZH(0), m1, 0
390 vextractf128 Z(1), m0, 1
391 vextractf128 ZH(1), m1, 1
416 T8_SSE m0, m1, m2, m3, m4, m5
430 T8_SSE m0, m1, m2, m3, m4, m5
441 PASS_SMALL 0, [cos_16], [cos_16+16]
448 T2_3DNOW m0, m1, Z(0), Z(1)
451 T4_3DNOW m0, m1, m2, m3, m4, m5
462 T2_3DNOW m0, m1, Z(0), Z(1)
465 T4_3DNOW m0, m1, m2, m3, m4, m5
468 T2_3DNOW m4, m5, Z(4), Z(5)
469 T2_3DNOW m6, m7, Z2(6), Z2(7)
478 T4_3DNOW m1, m3, m5, m7, m0, m2
483 T4_3DNOW m0, m2, m4, m6, m5, m7
507 %define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
508 %define Z2(x) [zcq + o3q + mmsize*(x&1)]
509 %define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
510 %define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
512 %macro DECL_PASS 2+ ; name, payload
515 DEFINE_ARGS zc, w, n, o1, o3
528 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
529 lea r2, [dispatch_tab%1]
530 mov r2, [r2 + (%2q-2)*gprsize]
536 %endmacro ; FFT_DISPATCH
543 vextractf128 %4(%5), %2, 0
544 vextractf128 %4 %+ H(%5), %3, 0
545 vextractf128 %4(%5 + 1), %2, 1
546 vextractf128 %4 %+ H(%5 + 1), %3, 1
549 %define INTERL INTERL_AVX
551 DECL_PASS pass_avx, PASS_BIG 1
552 DECL_PASS pass_interleave_avx, PASS_BIG 0
554 cglobal fft_calc, 2,5,8
555 mov r3d, [r0 + FFTContext.nbits]
558 FFT_DISPATCH _interleave %+ SUFFIX, r1
572 %define INTERL INTERL_SSE
574 DECL_PASS pass_sse, PASS_BIG 1
575 DECL_PASS pass_interleave_sse, PASS_BIG 0
577 %macro FFT_CALC_FUNC 0
578 cglobal fft_calc, 2,5,8
579 mov r3d, [r0 + FFTContext.nbits]
584 FFT_DISPATCH _interleave %+ SUFFIX, r1
587 cmp rcx, 3+(mmsize/16)
595 PSWAPD m0, [r4 + r2 + 4]
596 mova [r4 + r2 + 4], m0
598 movaps xmm0, [r4 + r2]
600 unpcklps xmm0, [r4 + r2 + 16]
601 unpckhps xmm1, [r4 + r2 + 16]
602 movaps [r4 + r2], xmm0
603 movaps [r4 + r2 + 16], xmm1
625 cglobal fft_permute, 2,7,1
626 mov r4, [r0 + FFTContext.revtab]
627 mov r5, [r0 + FFTContext.tmpbuf]
628 mov ecx, [r0 + FFTContext.nbits]
636 movaps xmm0, [r1 + 8*r0]
637 movzx r6, word [r4 + 2*r0]
638 movzx r3, word [r4 + 2*r0 + 2]
639 movlps [r5 + 8*r6], xmm0
640 movhps [r5 + 8*r3], xmm0
648 ; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
650 movaps xmm0, [r5 + r2]
651 movaps xmm1, [r5 + r2 + 16]
652 movaps [r1 + r2], xmm0
653 movaps [r1 + r2 + 16], xmm1
663 %define unpcklps punpckldq
664 %define unpckhps punpckhdq
665 DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
666 DECL_PASS pass_interleave_3dnow, PASS_BIG 0
667 %define pass_3dnowext pass_3dnow
668 %define pass_interleave_3dnowext pass_interleave_3dnow
672 %define SECTION_REL - $$
677 %macro DECL_FFT 1-2 ; nbits, suffix
679 %xdefine fullsuffix SUFFIX
681 %xdefine fullsuffix %2 %+ SUFFIX
683 %xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
685 %xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
688 %xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
695 %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
698 fft %+ n %+ fullsuffix:
699 call fft %+ n2 %+ SUFFIX
700 add r0, n*4 - (n&(-2<<%1))
701 call fft %+ n4 %+ SUFFIX
702 add r0, n*2 - (n2&(-2<<%1))
703 call fft %+ n4 %+ SUFFIX
704 sub r0, n*6 + (n2&(-2<<%1))
707 jmp pass %+ fullsuffix
714 dispatch_tab %+ fullsuffix: pointer list_of_fft
719 DECL_FFT 6, _interleave
722 DECL_FFT 5, _interleave
726 DECL_FFT 4, _interleave
729 DECL_FFT 4, _interleave
734 %macro IMDCT_CALC_FUNC 0
735 cglobal imdct_calc, 3,5,3
736 mov r3d, [r0 + FFTContext.mdctsize]
737 mov r4, [r0 + FFTContext.imdcthalf]
746 sub rsp, 8+32*WIN64 ; allocate win64 shadow space
760 mova m2, [ps_m1m1m1m1]
803 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
804 %if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
810 movd m1, [%4+%1*2-4] ; tcos[j]
811 movd m3, [%4+%2*2] ; tcos[n4-j-1]
812 punpckldq m1, [%5+%1*2-4] ; tsin[j]
813 punpckldq m3, [%5+%2*2] ; tsin[n4-j-1]
823 %if cpuflag(3dnowext)
827 SBUTTERFLY dq, 0, 4, 1
828 SBUTTERFLY dq, 2, 6, 3
835 movaps xmm0, [%3+%2*4]
836 movaps xmm1, [%3+%1*4-0x10]
838 shufps xmm0, xmm1, 0x88
839 shufps xmm1, xmm2, 0x77
840 movlps xmm4, [%4+%2*2]
841 movlps xmm5, [%5+%2*2+0x0]
842 movhps xmm4, [%4+%1*2-0x8]
843 movhps xmm5, [%5+%1*2-0x8]
858 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
859 mulps m6, %3, [%5+%1]
860 mulps m7, %2, [%5+%1]
861 mulps %2, %2, [%6+%1]
862 mulps %3, %3, [%6+%1]
867 %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
869 vmovaps ymm1, [%3+%1*2]
870 vmovaps ymm0, [%3+%1*2+0x20]
871 vmovaps ymm3, [%3+%2*2]
872 vmovaps ymm2, [%3+%2*2+0x20]
874 CMUL %1, ymm0, ymm1, %3, %4, %5
875 CMUL %2, ymm2, ymm3, %3, %4, %5
876 vshufps ymm1, ymm1, ymm1, 0x1b
877 vshufps ymm3, ymm3, ymm3, 0x1b
878 vperm2f128 ymm1, ymm1, ymm1, 0x01
879 vperm2f128 ymm3, ymm3, ymm3, 0x01
880 vunpcklps ymm6, ymm2, ymm1
881 vunpckhps ymm4, ymm2, ymm1
882 vunpcklps ymm7, ymm0, ymm3
883 vunpckhps ymm5, ymm0, ymm3
885 vextractf128 [%3+%1*2], ymm7, 0
886 vextractf128 [%3+%1*2+0x10], ymm5, 0
887 vextractf128 [%3+%1*2+0x20], ymm7, 1
888 vextractf128 [%3+%1*2+0x30], ymm5, 1
890 vextractf128 [%3+%2*2], ymm6, 0
891 vextractf128 [%3+%2*2+0x10], ymm4, 0
892 vextractf128 [%3+%2*2+0x20], ymm6, 1
893 vextractf128 [%3+%2*2+0x30], ymm4, 1
899 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
901 movaps xmm1, [%3+%1*2]
902 movaps xmm0, [%3+%1*2+0x10]
903 CMUL %1, xmm0, xmm1, %3, %4, %5
904 movaps xmm5, [%3+%2*2]
905 movaps xmm4, [%3+%2*2+0x10]
906 CMUL %2, xmm4, xmm5, %3, %4, %5
907 shufps xmm1, xmm1, 0x1b
908 shufps xmm5, xmm5, 0x1b
915 movaps [%3+%2*2], xmm6
916 movaps [%3+%2*2+0x10], xmm4
917 movaps [%3+%1*2], xmm0
918 movaps [%3+%1*2+0x10], xmm2
937 %macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
939 CMUL_3DNOW %3, %1, m0, m1, %4, %5
940 CMUL_3DNOW %3, %2, m2, m3, %4, %5
941 movd [%3+%1*2+ 0], m0
942 movd [%3+%2*2+12], m1
943 movd [%3+%2*2+ 0], m2
944 movd [%3+%1*2+12], m3
949 movd [%3+%1*2+ 8], m0
950 movd [%3+%2*2+ 4], m1
951 movd [%3+%2*2+ 8], m2
952 movd [%3+%1*2+ 4], m3
959 cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
969 mov r3d, [r0+FFTContext.mdctsize]
972 mov rtcos, [r0+FFTContext.tcos]
973 mov rtsin, [r0+FFTContext.tsin]
981 mov rrevtab, [r0+FFTContext.revtab]
992 %if ARCH_X86_64 || mmsize == 8
996 %if notcpuflag(3dnowext) && mmsize == 8
997 movd m7, [ps_m1m1m1m1]
1000 %if ARCH_X86_64 == 0
1010 PREROTATER r4, r3, r2, rtcos, rtsin
1012 mov r6, [esp] ; rrevtab = ptr+n8
1013 movzx r5, word [rrevtab+r4-2] ; rrevtab[j]
1014 movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1]
1021 movzx r5, word [rrevtab+r4-4]
1022 movzx r6, word [rrevtab+r4-2]
1023 movzx r10, word [rrevtab+r3]
1024 movzx r11, word [rrevtab+r3+2]
1025 movlps [r1+r5 *8], xmm0
1026 movhps [r1+r6 *8], xmm0
1027 movlps [r1+r10*8], xmm1
1028 movhps [r1+r11*8], xmm1
1032 movzx r5, word [r6+r4-4]
1033 movzx r4, word [r6+r4-2]
1034 movlps [r1+r5*8], xmm0
1035 movhps [r1+r4*8], xmm0
1036 movzx r5, word [r6+r3]
1037 movzx r4, word [r6+r3+2]
1038 movlps [r1+r5*8], xmm1
1039 movhps [r1+r4*8], xmm1
1048 mov r1d, [r5+FFTContext.nbits]
1050 FFT_DISPATCH SUFFIX, r1
1052 mov r0d, [r5+FFTContext.mdctsize]
1055 %if ARCH_X86_64 == 0
1064 %1 r0, r1, r6, rtcos, rtsin
1065 %if ARCH_X86_64 == 0
1074 DECL_IMDCT POSROTATESHUF
1078 DECL_IMDCT POSROTATESHUF_3DNOW
1081 DECL_IMDCT POSROTATESHUF_3DNOW
1085 DECL_IMDCT POSROTATESHUF_AVX
1087 %endif ; CONFIG_MDCT