1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 ; These functions are not individually interchangeable with the C versions.
23 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
24 ; in blocks as convenient to the vector size.
25 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
31 %define M_SQRT1_2 0.70710678118654752440
32 ps_root2: times 4 dd M_SQRT1_2
33 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
54 section .text align=16
56 %macro T2_3DN 4 ; z0, z1, mem0, mem1
63 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
66 pfadd %5, %4 ; {t6,t5}
67 pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7}
70 pfadd %1, %5 ; {r0,i0}
71 pfsub %6, %5 ; {r2,i2}
73 pfadd %2, %3 ; {r1,i1}
74 pfsub %4, %3 ; {r3,i3}
78 ; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
79 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
82 shufps %1, %2, 0x64 ; {r0,i0,r3,i2}
83 shufps %3, %2, 0xce ; {r1,i1,r2,i3}
85 addps %1, %3 ; {t1,t2,t6,t5}
86 subps %2, %3 ; {t3,t4,t8,t7}
88 shufps %1, %2, 0x44 ; {t1,t2,t3,t4}
89 shufps %3, %2, 0xbe ; {t6,t5,t7,t8}
91 addps %1, %3 ; {r0,i0,r1,i1}
92 subps %2, %3 ; {r2,i2,r3,i3}
94 shufps %1, %2, 0x88 ; {r0,r1,r2,r3}
95 shufps %3, %2, 0xdd ; {i0,i1,i2,i3}
99 %macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
101 shufps %3, %4, 0x44 ; {r4,i4,r6,i6}
102 shufps %5, %4, 0xee ; {r5,i5,r7,i7}
104 subps %3, %5 ; {r5,i5,r7,i7}
105 addps %6, %5 ; {t1,t2,t3,t4}
107 shufps %5, %5, 0xb1 ; {i5,r5,i7,r7}
108 mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7}
109 mulps %5, [ps_root2 GLOBAL]
110 addps %3, %5 ; {t8,t7,ta,t9}
112 shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
113 shufps %5, %3, 0x9c ; {t1,t4,t7,ta}
115 addps %6, %5 ; {t1,t2,t9,ta}
116 subps %3, %5 ; {t6,t5,tc,tb}
118 shufps %6, %3, 0xd8 ; {t1,t9,t5,tb}
119 shufps %5, %3, 0x8d ; {t2,ta,t6,tc}
122 addps %1, %6 ; {r0,r1,r2,r3}
123 addps %2, %5 ; {i0,i1,i2,i3}
124 subps %3, %6 ; {r4,r5,r6,r7}
125 subps %4, %5 ; {i4,i5,i6,i7}
128 ; scheduled for cpu-bound sizes
129 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
136 mulps m2, m0 ; r2*wre
138 mulps m3, m1 ; i2*wim
140 mulps m4, m1 ; r2*wim
141 mulps m5, m0 ; i2*wre
142 addps m2, m3 ; r2*wre + i2*wim
144 mulps m1, m6 ; r3*wim
145 subps m5, m4 ; i2*wre - r2*wim
147 mulps m3, m7 ; i3*wim
148 mulps m4, m6 ; r3*wre
149 mulps m0, m7 ; i3*wre
150 subps m4, m3 ; r3*wre - i3*wim
152 addps m0, m1 ; i3*wre + r3*wim
182 ; scheduled to avoid store->load aliasing
183 %macro PASS_BIG 1 ; (!interleave)
189 mova m1, [wq+o1q] ; wim
190 mulps m2, m0 ; r2*wre
192 mulps m3, m1 ; i2*wim
194 mulps m4, m1 ; r2*wim
195 mulps m5, m0 ; i2*wre
196 addps m2, m3 ; r2*wre + i2*wim
198 mulps m1, m6 ; r3*wim
199 subps m5, m4 ; i2*wre - r2*wim
201 mulps m3, m7 ; i3*wim
202 mulps m4, m6 ; r3*wre
203 mulps m0, m7 ; i3*wre
204 subps m4, m3 ; r3*wre - i3*wim
206 addps m0, m1 ; i3*wre + r3*wim
269 %define Z(x) [r0+mmsize*x]
287 T8_SSE m0, m1, m2, m3, m4, m5
301 T8_SSE m0, m1, m2, m3, m4, m5
312 PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL]
321 T2_3DN m0, m1, Z(0), Z(1)
324 T4_3DN m0, m1, m2, m3, m4, m5
335 T2_3DN m0, m1, Z(0), Z(1)
338 T4_3DN m0, m1, m2, m3, m4, m5
341 T2_3DN m4, m5, Z(4), Z(5)
342 T2_3DN m6, m7, Z(6), Z(7)
345 pxor m0, [ps_m1p1 GLOBAL]
346 pxor m2, [ps_m1p1 GLOBAL]
349 pfmul m5, [ps_root2 GLOBAL]
350 pfmul m7, [ps_root2 GLOBAL]
351 T4_3DN m1, m3, m5, m7, m0, m2
356 T4_3DN m0, m2, m4, m6, m5, m7
388 %define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)]
390 %macro DECL_PASS 2+ ; name, payload
393 DEFINE_ARGS z, w, n, o1, o3
408 DECL_PASS pass_sse, PASS_BIG 1
409 DECL_PASS pass_interleave_sse, PASS_BIG 0
415 %define unpcklps punpckldq
416 %define unpckhps punpckhdq
417 DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
418 DECL_PASS pass_interleave_3dn, PASS_BIG 0
419 %define pass_3dn2 pass_3dn
420 %define pass_interleave_3dn2 pass_interleave_3dn
423 %macro DECL_FFT 2-3 ; nbits, cpu, suffix
424 %xdefine list_of_fft fft4%2, fft8%2
426 %xdefine list_of_fft list_of_fft, fft16%2
433 %xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2
438 add r0, n*4 - (n&(-2<<%1))
440 add r0, n*2 - (n2&(-2<<%1))
442 sub r0, n*6 + (n2&(-2<<%1))
443 lea r1, [ff_cos_ %+ n GLOBAL]
451 %ifidn __OUTPUT_FORMAT__,macho64
456 dispatch_tab%3%2: pointer list_of_fft
460 ; On x86_32, this function does the register saving and restoring for all of fft.
461 ; The others pass args in registers and don't spill anything.
462 cglobal fft_dispatch%3%2, 2,5,8, z, nbits
463 lea r2, [dispatch_tab%3%2 GLOBAL]
464 mov r2, [r2 + (nbitsq-2)*gprsize]
470 DECL_FFT 5, _sse, _interleave
472 DECL_FFT 4, _3dn, _interleave
474 DECL_FFT 4, _3dn2, _interleave