1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt
5 ;* This algorithm (though not any of the implementation details) is
6 ;* based on libdjbfft by D. J. Bernstein.
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 ; These functions are not individually interchangeable with the C versions.
26 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
27 ;* in blocks as convenient to the vector size.
28 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
51 %define M_SQRT1_2 0.70710678118654752440
52 ps_root2: times 4 dd M_SQRT1_2
53 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
74 section .text align=16
76 %macro T2_3DN 4 ; z0, z1, mem0, mem1
83 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
86 pfadd %5, %4 ; {t6,t5}
87 pxor %3, [ps_m1p1] ; {t8,t7}
90 pfadd %1, %5 ; {r0,i0}
91 pfsub %6, %5 ; {r2,i2}
93 pfadd %2, %3 ; {r1,i1}
94 pfsub %4, %3 ; {r3,i3}
98 ; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
99 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
102 shufps %1, %2, 0x64 ; {r0,i0,r3,i2}
103 shufps %3, %2, 0xce ; {r1,i1,r2,i3}
105 addps %1, %3 ; {t1,t2,t6,t5}
106 subps %2, %3 ; {t3,t4,t8,t7}
108 shufps %1, %2, 0x44 ; {t1,t2,t3,t4}
109 shufps %3, %2, 0xbe ; {t6,t5,t7,t8}
111 addps %1, %3 ; {r0,i0,r1,i1}
112 subps %2, %3 ; {r2,i2,r3,i3}
114 shufps %1, %2, 0x88 ; {r0,r1,r2,r3}
115 shufps %3, %2, 0xdd ; {i0,i1,i2,i3}
119 %macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
121 shufps %3, %4, 0x44 ; {r4,i4,r6,i6}
122 shufps %5, %4, 0xee ; {r5,i5,r7,i7}
124 subps %3, %5 ; {r5,i5,r7,i7}
125 addps %6, %5 ; {t1,t2,t3,t4}
127 shufps %5, %5, 0xb1 ; {i5,r5,i7,r7}
128 mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
130 addps %3, %5 ; {t8,t7,ta,t9}
132 shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
133 shufps %5, %3, 0x9c ; {t1,t4,t7,ta}
135 addps %6, %5 ; {t1,t2,t9,ta}
136 subps %3, %5 ; {t6,t5,tc,tb}
138 shufps %6, %3, 0xd8 ; {t1,t9,t5,tb}
139 shufps %5, %3, 0x8d ; {t2,ta,t6,tc}
142 addps %1, %6 ; {r0,r1,r2,r3}
143 addps %2, %5 ; {i0,i1,i2,i3}
144 subps %3, %6 ; {r4,r5,r6,r7}
145 subps %4, %5 ; {i4,i5,i6,i7}
148 ; scheduled for cpu-bound sizes
149 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
156 mulps m2, m0 ; r2*wre
158 mulps m3, m1 ; i2*wim
160 mulps m4, m1 ; r2*wim
161 mulps m5, m0 ; i2*wre
162 addps m2, m3 ; r2*wre + i2*wim
164 mulps m1, m6 ; r3*wim
165 subps m5, m4 ; i2*wre - r2*wim
167 mulps m3, m7 ; i3*wim
168 mulps m4, m6 ; r3*wre
169 mulps m0, m7 ; i3*wre
170 subps m4, m3 ; r3*wre - i3*wim
172 addps m0, m1 ; i3*wre + r3*wim
202 ; scheduled to avoid store->load aliasing
203 %macro PASS_BIG 1 ; (!interleave)
209 mova m1, [wq+o1q] ; wim
210 mulps m2, m0 ; r2*wre
212 mulps m3, m1 ; i2*wim
214 mulps m4, m1 ; r2*wim
215 mulps m5, m0 ; i2*wre
216 addps m2, m3 ; r2*wre + i2*wim
218 mulps m1, m6 ; r3*wim
219 subps m5, m4 ; i2*wre - r2*wim
221 mulps m3, m7 ; i3*wim
222 mulps m4, m6 ; r3*wre
223 mulps m0, m7 ; i3*wre
224 subps m4, m3 ; r3*wre - i3*wim
226 addps m0, m1 ; i3*wre + r3*wim
289 %define Z(x) [r0+mmsize*x]
290 %define Z2(x) [r0+mmsize*x]
308 T8_SSE m0, m1, m2, m3, m4, m5
322 T8_SSE m0, m1, m2, m3, m4, m5
333 PASS_SMALL 0, [cos_16], [cos_16+16]
342 T2_3DN m0, m1, Z(0), Z(1)
345 T4_3DN m0, m1, m2, m3, m4, m5
356 T2_3DN m0, m1, Z(0), Z(1)
359 T4_3DN m0, m1, m2, m3, m4, m5
362 T2_3DN m4, m5, Z(4), Z(5)
363 T2_3DN m6, m7, Z2(6), Z2(7)
372 T4_3DN m1, m3, m5, m7, m0, m2
377 T4_3DN m0, m2, m4, m6, m5, m7
409 %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
410 %define Z2(x) [zq + o3q + mmsize*(x&1)]
412 %macro DECL_PASS 2+ ; name, payload
415 DEFINE_ARGS z, w, n, o1, o3
430 DECL_PASS pass_sse, PASS_BIG 1
431 DECL_PASS pass_interleave_sse, PASS_BIG 0
437 %define unpcklps punpckldq
438 %define unpckhps punpckhdq
439 DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
440 DECL_PASS pass_interleave_3dn, PASS_BIG 0
441 %define pass_3dn2 pass_3dn
442 %define pass_interleave_3dn2 pass_interleave_3dn
445 %define SECTION_REL - $$
450 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
451 lea r2, [dispatch_tab%1]
452 mov r2, [r2 + (%2q-2)*gprsize]
458 %endmacro ; FFT_DISPATCH
460 %macro DECL_FFT 2-3 ; nbits, cpu, suffix
461 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
463 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
470 %xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL
475 add r0, n*4 - (n&(-2<<%1))
477 add r0, n*2 - (n2&(-2<<%1))
479 sub r0, n*6 + (n2&(-2<<%1))
489 dispatch_tab%3%2: pointer list_of_fft
493 ; On x86_32, this function does the register saving and restoring for all of fft.
494 ; The others pass args in registers and don't spill anything.
495 cglobal fft_dispatch%3%2, 2,5,8, z, nbits
496 FFT_DISPATCH %3%2, nbits
501 DECL_FFT 5, _sse, _interleave
503 DECL_FFT 4, _3dn, _interleave
505 DECL_FFT 4, _3dn2, _interleave
514 %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
515 movaps xmm0, [%3+%2*4]
516 movaps xmm1, [%3+%1*4-0x10]
518 shufps xmm0, xmm1, 0x88
519 shufps xmm1, xmm2, 0x77
520 movlps xmm4, [%4+%2*2]
521 movlps xmm5, [%5+%2*2+0x0]
522 movhps xmm4, [%4+%1*2-0x8]
523 movhps xmm5, [%5+%1*2-0x8]
537 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
538 movaps xmm6, [%4+%1*2]
539 movaps %2, [%4+%1*2+0x10]
550 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
552 CMUL %1, xmm0, xmm1, %3, %4, %5
553 CMUL %2, xmm4, xmm5, %3, %4, %5
554 shufps xmm1, xmm1, 0x1b
555 shufps xmm5, xmm5, 0x1b
562 movaps [%3+%2*2], xmm6
563 movaps [%3+%2*2+0x10], xmm4
564 movaps [%3+%1*2], xmm0
565 movaps [%3+%1*2+0x10], xmm2
571 cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
584 mov r3d, [r0+FFTContext.mdctsize]
587 mov rtcos, [r0+FFTContext.tcos]
588 mov rtsin, [r0+FFTContext.tsin]
596 mov rrevtab, [r0+FFTContext.revtab]
616 PREROTATER r4, r3, r2, rtcos, rtsin
618 movzx r5, word [rrevtab+r4-4]
619 movzx r6, word [rrevtab+r4-2]
620 movzx r13, word [rrevtab+r3]
621 movzx r14, word [rrevtab+r3+2]
622 movlps [r1+r5 *8], xmm0
623 movhps [r1+r6 *8], xmm0
624 movlps [r1+r13*8], xmm1
625 movhps [r1+r14*8], xmm1
629 movzx r5, word [r6+r4-4]
630 movzx r4, word [r6+r4-2]
631 movlps [r1+r5*8], xmm0
632 movhps [r1+r4*8], xmm0
633 movzx r5, word [r6+r3]
634 movzx r4, word [r6+r3+2]
635 movlps [r1+r5*8], xmm1
636 movhps [r1+r4*8], xmm1
644 mov r1d, [r5+FFTContext.nbits]
646 FFT_DISPATCH _sse, r1
648 mov r0d, [r5+FFTContext.mdctsize]
660 POSROTATESHUF r0, r1, r6, rtcos, rtsin