;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;* This file is part of FFmpeg.
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
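; For example, after an SSE pass a block of four complex values is stored as
; {r0,r1,r2,r3, i0,i1,i2,i3} rather than the C-side interleaved order
; {r0,i0, r1,i1, r2,i2, r3,i3}.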
%include "libavutil/x86/x86inc.asm"
%include "libavutil/x86/x86util.asm"

%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1m1m1m1: times 4 dd 1<<31
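; The 1<<31 entries above are IEEE-754 sign-bit masks: xorps/pxor with one of
; these constants negates the selected float lanes, which is how the sign
; patterns implied by the table names are applied.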
%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
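    ; radix-2 butterfly on the two complex values in memory:
    ; %1 = %3 + %4, %2 = %3 - %4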
%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
    pfadd    %5, %4          ; {t6,t5}
    pxor     %3, [ps_m1p1]   ; {t8,t7}
    pfadd    %1, %5          ; {r0,i0}
    pfsub    %6, %5          ; {r2,i2}
    pfadd    %2, %3          ; {r1,i1}
    pfsub    %4, %3          ; {r3,i3}
; in:  %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
    vsubps     %5, %1, %2       ; v  = %1 - %2
    vaddps     %3, %1, %2       ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1       ; s = r - q
    vaddps     %1, %5, %1       ; u = r + q
    vpermilps  %1, %1, [perm2]  ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1       ; %2 = v - w
    vaddps     %1, %4, %1       ; %1 = v + w
; In SSE mode do one fft4 transform
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}

; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
    addps    %1, %1, %2       ; {t1,t2,t6,t5}
    xorps    %3, %3, [ps_p1p1m1p1]
    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps    %3, %1, %2       ; {r2,i2,r3,i3}
    addps    %1, %1, %2       ; {r0,i0,r1,i1}
    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
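    ; (shufps imm8: two bits per output lane, low half taken from the
    ;  destination, high half from the source; 0x88 gathers the even lanes and
    ;  0xdd the odd lanes, which is what splits reals from imaginaries above)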
; In SSE mode do one FFT8
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}

; In AVX mode do two FFT8
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
    addps    %6, %3, %4       ; {t1,t2,t3,t4}
    subps    %3, %3, %4       ; {r5,i5,r7,i7}
    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, %4, [ps_root2]
    addps    %3, %3, %4       ; {t8,t7,ta,t9}
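    ; the sqrt(1/2) multiplies above apply the odd 8-point twiddles
    ; (45-degree rotations) to the difference terms before recombining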
    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps    %3, %6, %4       ; {t6,t5,tc,tb}
    addps    %6, %6, %4       ; {t1,t2,t9,ta}
    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps    %3, %1, %6       ; {r4,r5,r6,r7}
    addps    %1, %1, %6       ; {r0,r1,r2,r3}
    subps    %4, %2, %5       ; {i4,i5,i6,i7}
    addps    %2, %2, %5       ; {i0,i1,i2,i3}
; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
    mulps    m2, m4, m0   ; r2*wre
    mulps    m3, m5, m1   ; i2*wim
    mulps    m4, m4, m1   ; r2*wim
    mulps    m5, m5, m0   ; i2*wre
    addps    m2, m2, m3   ; r2*wre + i2*wim
    mulps    m3, m1, m7   ; i3*wim
    subps    m5, m5, m4   ; i2*wre - r2*wim
    mulps    m1, m1, m6   ; r3*wim
    mulps    m4, m0, m6   ; r3*wre
    mulps    m0, m0, m7   ; i3*wre
    subps    m4, m4, m3   ; r3*wre - i3*wim
    addps    m0, m0, m1   ; i3*wre + r3*wim
    subps    m1, m4, m2   ; t3
    addps    m4, m4, m2   ; t5
    subps    m3, m3, m4   ; r2
    addps    m4, m4, Z(0) ; r0
    subps    m3, m5, m0   ; t4
    subps    m4, m6, m3   ; r3
    addps    m3, m3, m6   ; r1
    addps    m3, m5, m0   ; t6
    subps    m2, m2, m1   ; i3
    addps    m1, m1, Z(3) ; i1
    subps    m4, m7, m3   ; i2
    addps    m3, m3, m7   ; i0
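    ; net effect of the twiddle multiplies: the z2 block is rotated by
    ; (wre - j*wim) and the z3 block by (wre + j*wim); the resulting t values
    ; are then combined with the untwiddled Z(0)..Z(3) data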
; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m4, m0   ; r2*wre
    mulps    m3, m5, m1   ; i2*wim
    mulps    m4, m4, m1   ; r2*wim
    mulps    m5, m5, m0   ; i2*wre
    addps    m2, m2, m3   ; r2*wre + i2*wim
    mulps    m3, m1, m7   ; i3*wim
    mulps    m1, m1, m6   ; r3*wim
    subps    m5, m5, m4   ; i2*wre - r2*wim
    mulps    m4, m0, m6   ; r3*wre
    mulps    m0, m0, m7   ; i3*wre
    subps    m4, m4, m3   ; r3*wre - i3*wim
    addps    m0, m0, m1   ; i3*wre + r3*wim
    subps    m1, m4, m2   ; t3
    addps    m4, m4, m2   ; t5
    subps    m3, m3, m4   ; r2
    addps    m4, m4, Z(0) ; r0
    subps    m3, m5, m0   ; t4
    subps    m4, m6, m3   ; r3
    addps    m3, m3, m6   ; r1
    addps    m5, m5, m0   ; t6
    subps    m2, m2, m1   ; i3
    addps    m1, m1, Z(3) ; i1
    subps    m6, m7, m5   ; i2
    addps    m5, m5, m7   ; i0
    INTERL m1, m3, m7, Z, 2
    INTERL m2, m4, m0, Z2, 6
    INTERL m5, m1, m3, Z, 0
    INTERL m6, m2, m7, Z, 4
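    ; INTERL (INTERL_AVX/INTERL_SSE below) stores a result block back in
    ; interleaved FFTComplex order; the pass_interleave_* variants (PASS_BIG 0)
    ; use it to deliver the final, C-ordered output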
%define Z(x)  [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]

%if HAVE_AVX_EXTERNAL
    T8_AVX   m0, m1, m2, m3, m4
    T8_AVX   m0, m1, m4, m5, m7
    mova     m4, [ps_cos16_1]
    mova     m5, [ps_cos16_2]
    vblendps m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vperm2f128 m2, m2, m2, 0x01
    vextractf128 Z(0),  m0, 0
    vextractf128 ZH(0), m1, 0
    vextractf128 Z(1),  m0, 1
    vextractf128 ZH(1), m1, 1
    vextractf128 Z(2),  m5, 0
    vextractf128 ZH(2), m3, 0
    vextractf128 Z(3),  m5, 1
    vextractf128 ZH(3), m3, 1

    T8_SSE   m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
    vperm2f128 m4, m0, m2, 0x20
    vperm2f128 m5, m1, m3, 0x20
    vperm2f128 m6, m0, m2, 0x31
    vperm2f128 m7, m1, m3, 0x31
    PASS_SMALL 0, [cos_32], [cos_32+32]

fft32_interleave_avx:
    vextractf128 Z(0),  m0, 0
    vextractf128 ZH(0), m1, 0
    vextractf128 Z(1),  m0, 1
    vextractf128 ZH(1), m1, 1

    T8_SSE   m0, m1, m2, m3, m4, m5
    T8_SSE   m0, m1, m2, m3, m4, m5
    PASS_SMALL 0, [cos_16], [cos_16+16]

    T2_3DNOW m0, m1, Z(0), Z(1)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    T2_3DNOW m0, m1, Z(0), Z(1)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    T2_3DNOW m4, m5, Z(4), Z(5)
    T2_3DNOW m6, m7, Z2(6), Z2(7)
    T4_3DNOW m1, m3, m5, m7, m0, m2
    T4_3DNOW m0, m2, m4, m6, m5, m7

%if cpuflag(3dnowext)

%define Z(x)   [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x)  [zcq + o3q + mmsize*(x&1)]
%define ZH(x)  [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
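; the big passes address the work buffer through these macros: x86 cannot
; scale a register by 6, so the last block pair is reached through the
; precomputed o3q offset (Z2/Z2H) instead of o1q*(x&6); the *H forms address
; the upper half of a register-sized block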
%macro DECL_PASS 2+ ; name, payload
DEFINE_ARGS zc, w, n, o1, o3

%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
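    ; dispatch_tab<suffix> (built by DECL_FFT below) holds one entry per
    ; transform size; it is indexed by nbits-2, so entry 0 is the 4-point FFT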
    lea      r2, [dispatch_tab%1]
    mov      r2, [r2 + (%2q-2)*gprsize]
%endmacro ; FFT_DISPATCH

%if HAVE_AVX_EXTERNAL
    vextractf128 %4(%5),          %2, 0
    vextractf128 %4 %+ H(%5),     %3, 0
    vextractf128 %4(%5 + 1),      %2, 1
    vextractf128 %4 %+ H(%5 + 1), %3, 1

%define INTERL INTERL_AVX

DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

cglobal fft_calc, 2,5,8
    mov      r3d, [r0 + FFTContext.nbits]
    FFT_DISPATCH _interleave %+ SUFFIX, r1

%define INTERL INTERL_SSE

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

%macro FFT_CALC_FUNC 0
cglobal fft_calc, 2,5,8
    mov      r3d, [r0 + FFTContext.nbits]
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    cmp      rcx, 3+(mmsize/16)
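    ; small transforms are produced in the split {re...,im...} layout (no
    ; interleave pass), so convert them back to FFTComplex order below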
    PSWAPD   m0, [r4 + r2 + 4]
    mova     [r4 + r2 + 4], m0
    movaps   xmm0, [r4 + r2]
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2], xmm0
    movaps   [r4 + r2 + 16], xmm1

cglobal fft_permute, 2,7,1
    mov      r4,  [r0 + FFTContext.revtab]
    mov      r5,  [r0 + FFTContext.tmpbuf]
    mov      ecx, [r0 + FFTContext.nbits]
    movaps   xmm0, [r1 + 8*r0]
    movzx    r6,  word [r4 + 2*r0]
    movzx    r3,  word [r4 + 2*r0 + 2]
    movlps   [r5 + 8*r6], xmm0
    movhps   [r5 + 8*r3], xmm0
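    ; each iteration loads two consecutive FFTComplex values and scatters them
    ; into tmpbuf at the bit-reversed positions revtab[2*i] and revtab[2*i+1]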
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
    movaps   xmm0, [r5 + r2]
    movaps   xmm1, [r5 + r2 + 16]
    movaps   [r1 + r2], xmm0
    movaps   [r1 + r2 + 16], xmm1

%macro IMDCT_CALC_FUNC 0
cglobal imdct_calc, 3,5,3
    mov      r3d, [r0 + FFTContext.mdctsize]
    mov      r4,  [r0 + FFTContext.imdcthalf]
    mova     m2,  [ps_m1m1m1m1]

%define unpcklps punpckldq
%define unpckhps punpckhdq

DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define pass_3dnowext pass_3dnow
%define pass_interleave_3dnowext pass_interleave_3dnow

%define SECTION_REL - $$

%macro DECL_FFT 1-2 ; nbits, suffix
%xdefine fullsuffix SUFFIX
%xdefine fullsuffix %2 %+ SUFFIX
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
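; split-radix recursion: each fftN is one N/2-point transform on the first
; half of the buffer plus two N/4-point transforms on the second half,
; followed by a single combining pass over all N points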
fft %+ n %+ fullsuffix:
    call     fft %+ n2 %+ SUFFIX
    add      r0, n*4 - (n&(-2<<%1))
    call     fft %+ n4 %+ SUFFIX
    add      r0, n*2 - (n2&(-2<<%1))
    call     fft %+ n4 %+ SUFFIX
    sub      r0, n*6 + (n2&(-2<<%1))
    jmp      pass %+ fullsuffix

dispatch_tab %+ fullsuffix: pointer list_of_fft

%if HAVE_AVX_EXTERNAL
DECL_FFT 6, _interleave
DECL_FFT 5, _interleave
DECL_FFT 4, _interleave
DECL_FFT 4, _interleave

%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
    movd      m1, [%4+%1*2-4] ; tcos[j]
    movd      m3, [%4+%2*2]   ; tcos[n4-j-1]
    punpckldq m1, [%5+%1*2-4] ; tsin[j]
    punpckldq m3, [%5+%2*2]   ; tsin[n4-j-1]
%if cpuflag(3dnowext)
    SBUTTERFLY dq, 0, 4, 1
    SBUTTERFLY dq, 2, 6, 3
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]

%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
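    ; complex rotation of the pairs in (%2, %3) by the twiddles tcos[%1] and
    ; tsin[%1] read from the tables at %5/%6; m6 and m7 are used as scratch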
    mulps    m6, %3, [%5+%1]
    mulps    m7, %2, [%5+%1]
    mulps    %2, %2, [%6+%1]
    mulps    %3, %3, [%6+%1]

%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
    vmovaps  ymm1, [%3+%1*2]
    vmovaps  ymm0, [%3+%1*2+0x20]
    vmovaps  ymm3, [%3+%2*2]
    vmovaps  ymm2, [%3+%2*2+0x20]
    CMUL     %1, ymm0, ymm1, %3, %4, %5
    CMUL     %2, ymm2, ymm3, %3, %4, %5
    vshufps  ymm1, ymm1, ymm1, 0x1b
    vshufps  ymm3, ymm3, ymm3, 0x1b
    vperm2f128 ymm1, ymm1, ymm1, 0x01
    vperm2f128 ymm3, ymm3, ymm3, 0x01
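    ; shufps 0x1b reverses the four floats within each 128-bit lane and
    ; vperm2f128 0x01 swaps the two lanes, so ymm1/ymm3 are now fully reversed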
    vunpcklps ymm6, ymm2, ymm1
    vunpckhps ymm4, ymm2, ymm1
    vunpcklps ymm7, ymm0, ymm3
    vunpckhps ymm5, ymm0, ymm3
    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1
    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1

%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1, xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2, xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2

%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
    CMUL_3DNOW %3, %1, m0, m1, %4, %5
    CMUL_3DNOW %3, %2, m2, m3, %4, %5
    movd     [%3+%1*2+ 0], m0
    movd     [%3+%2*2+12], m1
    movd     [%3+%2*2+ 0], m2
    movd     [%3+%1*2+12], m3
    movd     [%3+%1*2+ 8], m0
    movd     [%3+%2*2+ 4], m1
    movd     [%3+%2*2+ 8], m2
    movd     [%3+%1*2+ 4], m3

cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
    mov      r3d, [r0+FFTContext.mdctsize]
    mov      rtcos, [r0+FFTContext.tcos]
    mov      rtsin, [r0+FFTContext.tsin]
    mov      rrevtab, [r0+FFTContext.revtab]
%if ARCH_X86_64 == 0
%if ARCH_X86_64 || mmsize == 8
%if notcpuflag(3dnowext) && mmsize == 8
    movd     m7, [ps_m1m1m1m1]
%if ARCH_X86_64 == 0
    PREROTATER r4, r3, r2, rtcos, rtsin
    mov      r6, [esp] ; rrevtab = ptr+n8
    movzx    r5,  word [rrevtab+r4-2] ; rrevtab[j]
    movzx    r6,  word [rrevtab+r3]   ; rrevtab[n4-j-1]
    movzx    r5,  word [rrevtab+r4-4]
    movzx    r6,  word [rrevtab+r4-2]
    movzx    r10, word [rrevtab+r3]
    movzx    r11, word [rrevtab+r3+2]
    movlps   [r1+r5 *8], xmm0
    movhps   [r1+r6 *8], xmm0
    movlps   [r1+r10*8], xmm1
    movhps   [r1+r11*8], xmm1
    movzx    r5, word [r6+r4-4]
    movzx    r4, word [r6+r4-2]
    movlps   [r1+r5*8], xmm0
    movhps   [r1+r4*8], xmm0
    movzx    r5, word [r6+r3]
    movzx    r4, word [r6+r3+2]
    movlps   [r1+r5*8], xmm1
    movhps   [r1+r4*8], xmm1
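    ; both paths above scatter the four pre-rotated FFTComplex values held in
    ; xmm0/xmm1 to the bit-reversed output positions given by revtab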
    mov      r1d, [r5+FFTContext.nbits]
    FFT_DISPATCH SUFFIX, r1
    mov      r0d, [r5+FFTContext.mdctsize]
%if ARCH_X86_64 == 0
    %1 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0

DECL_IMDCT POSROTATESHUF
DECL_IMDCT POSROTATESHUF_3DNOW
DECL_IMDCT POSROTATESHUF_3DNOW

%if HAVE_AVX_EXTERNAL
DECL_IMDCT POSROTATESHUF_AVX