1 ;******************************************************************************
4 ;* This file is part of FFmpeg.
6 ;* FFmpeg is free software; you can redistribute it and/or
7 ;* modify it under the terms of the GNU Lesser General Public
8 ;* License as published by the Free Software Foundation; either
9 ;* version 2.1 of the License, or (at your option) any later version.
11 ;* FFmpeg is distributed in the hope that it will be useful,
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 ;* Lesser General Public License for more details.
16 ;* You should have received a copy of the GNU Lesser General Public
17 ;* License along with FFmpeg; if not, write to the Free Software
18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 ;******************************************************************************
; Open `doc/transforms.md` to see the code upon which the transforms here are
; based, and to compare against it.
25 ; carry over registers from smaller transforms to save on ~8 loads/stores
; check if vinsertf could be faster than vperm2f128 for duplication
27 ; even faster FFT8 (current one is very #instructions optimized)
28 ; replace some xors with blends + addsubs?
29 ; replace some shuffles with vblends?
32 %include "x86util.asm"
42 cextern cos_ %+ i %+ _float ; ff_cos_i_float...
47 .n: resd 1 ; Non-power-of-two part
48 .m: resd 1 ; Power-of-two part
49 .inv: resd 1 ; Is inverse
51 .flags: resq 1 ; Flags
52 .scale: resq 1 ; Scale
54 .exptab: ptr 1 ; MDCT exptab
55 .tmp: ptr 1 ; Temporary buffer needed for all compound transforms
56 .pfatab: ptr 1 ; Input/Output mapping for compound transforms
57 .revtab: ptr 1 ; Input mapping for power of two transforms
58 .inplace_idx: ptr 1 ; Required indices to revtab for in-place transforms
60 .top_tx ptr 1 ; Used for transforms derived from other transforms
65 %define POS 0x00000000
66 %define NEG 0x80000000
68 %define M_SQRT1_2 0.707106781186547524401
69 %define COS16_1 0.92387950420379638671875
70 %define COS16_3 0.3826834261417388916015625
72 d8_mult_odd: dd M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, \
73 M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2
75 s8_mult_odd: dd 1.0, 1.0, -1.0, 1.0, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
76 s8_perm_even: dd 1, 3, 0, 2, 1, 3, 2, 0
77 s8_perm_odd1: dd 3, 3, 1, 1, 1, 1, 3, 3
78 s8_perm_odd2: dd 1, 2, 0, 3, 1, 0, 0, 1
80 s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2
81 s16_mult_odd1: dd COS16_1, COS16_1, COS16_3, COS16_3, COS16_1, -COS16_1, COS16_3, -COS16_3
82 s16_mult_odd2: dd COS16_3, -COS16_3, COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
83 s16_perm: dd 0, 1, 2, 3, 1, 0, 3, 2
85 mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
86 mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
87 mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
88 mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG
89 mask_pmmppmmp: dd POS, NEG, NEG, POS, POS, NEG, NEG, POS
90 mask_pmpmpmpm: times 4 dd POS, NEG
94 ; Load complex values (64 bits) via a lookup table
95 ; %1 - output register
96 ; %2 - GRP of base input memory address
97 ; %3 - GPR of LUT (int32_t indices) address
99 ; %5 - temporary GPR (only used if vgather is not used)
100 ; %6 - temporary register (for avx only)
101 ; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set)
102 %macro LOAD64_LUT 5-7
103 %if %0 > 6 && cpuflag(avx2)
104 pcmpeqd %6, %6 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
105 movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction
106 vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
108 mov %5d, [%3 + %4 + 0]
109 movsd xmm%1, [%2 + %5q*8]
111 mov %5d, [%3 + %4 + 8]
112 movsd xmm%6, [%2 + %5q*8]
114 mov %5d, [%3 + %4 + 4]
115 movhps xmm%1, [%2 + %5q*8]
117 mov %5d, [%3 + %4 + 12]
118 movhps xmm%6, [%2 + %5q*8]
119 vinsertf128 %1, %1, xmm%6, 1
124 ; Single 2-point in-place complex FFT (will do 2 transforms at once in AVX mode)
125 ; %1 - coefficients (r0.reim, r1.reim)
128 shufps %2, %1, %1, q3322
129 shufps %1, %1, %1, q1100
133 shufps %1, %1, %1, q2031
136 ; Single 4-point in-place complex FFT (will do 2 transforms at once in [AVX] mode)
137 ; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
138 ; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
141 subps %3, %1, %2 ; r1234, [r5678]
142 addps %1, %1, %2 ; t1234, [t5678]
144 shufps %2, %1, %3, q1010 ; t12, r12
145 shufps %1, %1, %3, q2332 ; t34, r43
147 subps %3, %2, %1 ; a34, b32
148 addps %2, %2, %1 ; a12, b14
150 shufps %1, %2, %3, q1010 ; a1234 even
152 shufps %2, %2, %3, q2332 ; b1423
153 shufps %2, %2, %2, q1320 ; b1234 odd
156 ; Single/Dual 8-point in-place complex FFT (will do 2 transforms in [AVX] mode)
157 ; %1 - even coefficients (a0.reim, a2.reim, [b0.reim, b2.reim])
158 ; %2 - even coefficients (a4.reim, a6.reim, [b4.reim, b6.reim])
159 ; %3 - odd coefficients (a1.reim, a3.reim, [b1.reim, b3.reim])
160 ; %4 - odd coefficients (a5.reim, a7.reim, [b5.reim, b7.reim])
164 addps %5, %1, %3 ; q1-8
165 addps %6, %2, %4 ; k1-8
167 subps %1, %1, %3 ; r1-8
168 subps %2, %2, %4 ; j1-8
170 shufps %4, %1, %1, q2323 ; r4343
171 shufps %3, %5, %6, q3032 ; q34, k14
173 shufps %1, %1, %1, q1010 ; r1212
174 shufps %5, %5, %6, q1210 ; q12, k32
176 xorps %4, %4, [mask_pmmppmmp] ; r4343 * pmmp
177 addps %6, %5, %3 ; s12, g12
179 mulps %2, %2, [d8_mult_odd] ; r8 * d8_mult_odd
180 subps %5, %5, %3 ; s34, g43
182 addps %3, %1, %4 ; z1234
183 unpcklpd %1, %6, %5 ; s1234
185 shufps %4, %2, %2, q2301 ; j2143
186 shufps %6, %6, %5, q2332 ; g1234
188 addsubps %2, %2, %4 ; l2143
189 shufps %5, %2, %2, q0123 ; l3412
190 addsubps %5, %5, %2 ; t1234
192 subps %2, %1, %6 ; h1234 even
193 subps %4, %3, %5 ; u1234 odd
195 addps %1, %1, %6 ; w1234 even
196 addps %3, %3, %5 ; o1234 odd
199 ; Single 8-point in-place complex FFT in 20 instructions
200 ; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
201 ; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
205 subps %3, %1, %2 ; r1234, r5678
206 addps %1, %1, %2 ; q1234, q5678
208 vpermilps %2, %3, [s8_perm_odd1] ; r4422, r6688
209 shufps %4, %1, %1, q3322 ; q1122, q5566
211 movsldup %3, %3 ; r1133, r5577
212 shufps %1, %1, %1, q1100 ; q3344, q7788
214 addsubps %3, %3, %2 ; z1234, z5678
215 addsubps %1, %1, %4 ; s3142, s7586
217 mulps %3, %3, [s8_mult_odd] ; z * s8_mult_odd
218 vpermilps %1, %1, [s8_perm_even] ; s1234, s5687 !
220 shufps %2, %3, %3, q2332 ; junk, z7887
221 xorps %4, %1, [mask_mmmmpppm] ; e1234, e5687 !
223 vpermilps %3, %3, [s8_perm_odd2] ; z2314, z6556
224 vperm2f128 %1, %1, %4, 0x03 ; e5687, s1234
226 addsubps %2, %2, %3 ; junk, t5678
227 subps %1, %1, %4 ; w1234, w5678 even
229 vperm2f128 %2, %2, %2, 0x11 ; t5678, t5678
230 vperm2f128 %3, %3, %3, 0x00 ; z2314, z2314
232 xorps %2, %2, [mask_ppmpmmpm] ; t * ppmpmmpm
233 addps %2, %3, %2 ; u1234, u5678 odd
236 ; Single 16-point in-place complex FFT
237 ; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
238 ; %2 - even coefficients (r8.reim, r10.reim, r12.reim, r14.reim)
239 ; %3 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
240 ; %4 - odd coefficients (r9.reim, r11.reim, r13.reim, r15.reim)
242 ; %7, %8 - temporary (optional)
246 FFT8_AVX %1, %2, %6, %7
247 movaps %8, [mask_mpmppmpm]
248 movaps %7, [s16_perm]
252 FFT8_AVX %1, %2, %6, %7
253 movaps %7, [s16_perm]
254 %define mask [mask_mpmppmpm]
257 FFT8_AVX %1, %2, %6, %5
258 %define mask [mask_mpmppmpm]
259 %define perm [s16_perm]
263 shufps %6, %4, %4, q2301 ; z12.imre, z13.imre...
264 shufps %5, %5, %3, q2301 ; 0, 0, z8.imre...
266 mulps %4, %4, [s16_mult_odd1] ; z.reim * costab
267 xorps %5, %5, [mask_mppmmpmp]
269 fmaddps %6, %6, [s16_mult_odd2], %4 ; s[8..15]
270 addps %5, %3, %5 ; s[0...7]
272 mulps %6, %6, [s16_mult_odd2] ; z.imre * costab
274 addps %5, %3, %5 ; s[0...7]
275 addps %6, %4, %6 ; s[8..15]
277 mulps %5, %5, [s16_mult_even] ; s[0...7]*costab
279 xorps %4, %6, mask ; s[8..15]*mpmppmpm
280 xorps %3, %5, mask ; s[0...7]*mpmppmpm
282 vperm2f128 %4, %4, %4, 0x01 ; s[12..15, 8..11]
283 vperm2f128 %3, %3, %3, 0x01 ; s[4..7, 0..3]
285 addps %6, %6, %4 ; y56, u56, y34, u34
286 addps %5, %5, %3 ; w56, x56, w34, x34
288 vpermilps %6, %6, perm ; y56, u56, y43, u43
289 vpermilps %5, %5, perm ; w56, x56, w43, x43
291 subps %4, %2, %6 ; odd part 2
292 addps %3, %2, %6 ; odd part 1
294 subps %2, %1, %5 ; even part 2
295 addps %1, %1, %5 ; even part 1
300 ; Cobmines m0...m8 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs
301 ; Uses all 16 of registers.
302 ; Output is slightly permuted such that tx2,3's coefficients are interleaved
303 ; on a 2-point basis (look at `doc/transforms.md`)
304 %macro SPLIT_RADIX_COMBINE 17
305 %if %1 && mmsize == 32
306 vperm2f128 %14, %6, %7, 0x20 ; m2[0], m2[1], m3[0], m3[1] even
307 vperm2f128 %16, %9, %8, 0x20 ; m2[0], m2[1], m3[0], m3[1] odd
308 vperm2f128 %15, %6, %7, 0x31 ; m2[2], m2[3], m3[2], m3[3] even
309 vperm2f128 %17, %9, %8, 0x31 ; m2[2], m2[3], m3[2], m3[3] odd
312 shufps %12, %10, %10, q2200 ; cos00224466
313 shufps %13, %11, %11, q1133 ; wim77553311
314 movshdup %10, %10 ; cos11335577
315 shufps %11, %11, %11, q0022 ; wim66442200
317 %if %1 && mmsize == 32
318 shufps %6, %14, %14, q2301 ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even
319 shufps %8, %16, %16, q2301 ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd
320 shufps %7, %15, %15, q2301 ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even
321 shufps %9, %17, %17, q2301 ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd
323 mulps %14, %14, %13 ; m2[0123]reim * wim7531 even
324 mulps %16, %16, %11 ; m2[0123]reim * wim7531 odd
325 mulps %15, %15, %13 ; m3[0123]reim * wim7531 even
326 mulps %17, %17, %11 ; m3[0123]reim * wim7531 odd
328 mulps %14, %6, %13 ; m2,3[01]reim * wim7531 even
329 mulps %16, %8, %11 ; m2,3[01]reim * wim7531 odd
330 mulps %15, %7, %13 ; m2,3[23]reim * wim7531 even
331 mulps %17, %9, %11 ; m2,3[23]reim * wim7531 odd
332 ; reorder the multiplies to save movs reg, reg in the %if above
333 shufps %6, %6, %6, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
334 shufps %8, %8, %8, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd
335 shufps %7, %7, %7, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
336 shufps %9, %9, %9, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd
339 %if cpuflag(fma3) ; 11 - 5 = 6 instructions saved through FMA!
340 fmaddsubps %6, %6, %12, %14 ; w[0..8] even
341 fmaddsubps %8, %8, %10, %16 ; w[0..8] odd
342 fmsubaddps %7, %7, %12, %15 ; j[0..8] even
343 fmsubaddps %9, %9, %10, %17 ; j[0..8] odd
344 movaps %13, [mask_pmpmpmpm] ; "subaddps? pfft, who needs that!"
346 mulps %6, %6, %12 ; m2,3[01]imre * cos0246
347 mulps %8, %8, %10 ; m2,3[01]imre * cos0246
348 movaps %13, [mask_pmpmpmpm] ; "subaddps? pfft, who needs that!"
349 mulps %7, %7, %12 ; m2,3[23]reim * cos0246
350 mulps %9, %9, %10 ; m2,3[23]reim * cos0246
351 addsubps %6, %6, %14 ; w[0..8]
352 addsubps %8, %8, %16 ; w[0..8]
353 xorps %15, %15, %13 ; +-m2,3[23]imre * wim7531
354 xorps %17, %17, %13 ; +-m2,3[23]imre * wim7531
355 addps %7, %7, %15 ; j[0..8]
356 addps %9, %9, %17 ; j[0..8]
359 addps %14, %6, %7 ; t10235476 even
360 addps %16, %8, %9 ; t10235476 odd
361 subps %15, %6, %7 ; +-r[0..7] even
362 subps %17, %8, %9 ; +-r[0..7] odd
364 shufps %14, %14, %14, q2301 ; t[0..7] even
365 shufps %16, %16, %16, q2301 ; t[0..7] odd
366 xorps %15, %15, %13 ; r[0..7] even
367 xorps %17, %17, %13 ; r[0..7] odd
369 subps %6, %2, %14 ; m2,3[01] even
370 subps %8, %4, %16 ; m2,3[01] odd
371 subps %7, %3, %15 ; m2,3[23] even
372 subps %9, %5, %17 ; m2,3[23] odd
374 addps %2, %2, %14 ; m0 even
375 addps %4, %4, %16 ; m0 odd
376 addps %3, %3, %15 ; m1 even
377 addps %5, %5, %17 ; m1 odd
380 ; Same as above, only does one parity at a time, takes 3 temporary registers,
381 ; however, if the twiddles aren't needed after this, the registers they use
382 ; can be used as any of the temporary registers.
383 %macro SPLIT_RADIX_COMBINE_HALF 10
385 shufps %8, %6, %6, q2200 ; cos00224466
386 shufps %9, %7, %7, q1133 ; wim77553311
388 shufps %8, %6, %6, q3311 ; cos11335577
389 shufps %9, %7, %7, q0022 ; wim66442200
392 mulps %10, %4, %9 ; m2,3[01]reim * wim7531 even
393 mulps %9, %9, %5 ; m2,3[23]reim * wim7531 even
395 shufps %4, %4, %4, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
396 shufps %5, %5, %5, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
399 fmaddsubps %4, %4, %8, %10 ; w[0..8] even
400 fmsubaddps %5, %5, %8, %9 ; j[0..8] even
401 movaps %10, [mask_pmpmpmpm]
403 mulps %4, %4, %8 ; m2,3[01]imre * cos0246
404 mulps %5, %5, %8 ; m2,3[23]reim * cos0246
405 addsubps %4, %4, %10 ; w[0..8]
406 movaps %10, [mask_pmpmpmpm]
407 xorps %9, %9, %10 ; +-m2,3[23]imre * wim7531
408 addps %5, %5, %9 ; j[0..8]
411 addps %8, %4, %5 ; t10235476
412 subps %9, %4, %5 ; +-r[0..7]
414 shufps %8, %8, %8, q2301 ; t[0..7]
415 xorps %9, %9, %10 ; r[0..7]
417 subps %4, %2, %8 ; %3,3[01]
418 subps %5, %3, %9 ; %3,3[23]
420 addps %2, %2, %8 ; m0
421 addps %3, %3, %9 ; m1
424 ; Same as above, tries REALLY hard to use 2 temporary registers.
425 %macro SPLIT_RADIX_COMBINE_LITE 9
427 shufps %8, %6, %6, q2200 ; cos00224466
428 shufps %9, %7, %7, q1133 ; wim77553311
430 shufps %8, %6, %6, q3311 ; cos11335577
431 shufps %9, %7, %7, q0022 ; wim66442200
434 mulps %9, %9, %4 ; m2,3[01]reim * wim7531 even
435 shufps %4, %4, %4, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
438 fmaddsubps %4, %4, %8, %9 ; w[0..8] even
440 mulps %4, %4, %8 ; m2,3[01]imre * cos0246
441 addsubps %4, %4, %9 ; w[0..8]
445 shufps %9, %7, %7, q1133 ; wim77553311
447 shufps %9, %7, %7, q0022 ; wim66442200
450 mulps %9, %9, %5 ; m2,3[23]reim * wim7531 even
451 shufps %5, %5, %5, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
453 fmsubaddps %5, %5, %8, %9 ; j[0..8] even
455 mulps %5, %5, %8 ; m2,3[23]reim * cos0246
456 xorps %9, %9, [mask_pmpmpmpm] ; +-m2,3[23]imre * wim7531
457 addps %5, %5, %9 ; j[0..8]
460 addps %8, %4, %5 ; t10235476
461 subps %9, %4, %5 ; +-r[0..7]
463 shufps %8, %8, %8, q2301 ; t[0..7]
464 xorps %9, %9, [mask_pmpmpmpm] ; r[0..7]
466 subps %4, %2, %8 ; %3,3[01]
467 subps %5, %3, %9 ; %3,3[23]
469 addps %2, %2, %8 ; m0
470 addps %3, %3, %9 ; m1
473 %macro SPLIT_RADIX_COMBINE_64 0
474 SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
476 movaps [outq + 0*mmsize], m0
477 movaps [outq + 4*mmsize], m1
478 movaps [outq + 8*mmsize], tx1_e0
479 movaps [outq + 12*mmsize], tx2_e0
481 SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, m0
483 movaps [outq + 2*mmsize], m2
484 movaps [outq + 6*mmsize], m3
485 movaps [outq + 10*mmsize], tx1_o0
486 movaps [outq + 14*mmsize], tx2_o0
488 movaps tw_e, [cos_64_float + mmsize]
489 vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7 - mmsize], 0x23
491 movaps m0, [outq + 1*mmsize]
492 movaps m1, [outq + 3*mmsize]
493 movaps m2, [outq + 5*mmsize]
494 movaps m3, [outq + 7*mmsize]
496 SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
497 tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers
499 movaps [outq + 1*mmsize], m0
500 movaps [outq + 3*mmsize], m1
501 movaps [outq + 5*mmsize], m2
502 movaps [outq + 7*mmsize], m3
504 movaps [outq + 9*mmsize], tx1_e1
505 movaps [outq + 11*mmsize], tx1_o1
506 movaps [outq + 13*mmsize], tx2_e1
507 movaps [outq + 15*mmsize], tx2_o1
510 ; Perform a single even/odd split radix combination with loads and stores
511 ; The _4 indicates this is a quarter of the iterations required to complete a full
513 ; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6
514 %macro SPLIT_RADIX_LOAD_COMBINE_4 8
515 movaps m8, [rtabq + (%5)*mmsize + %7]
516 vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23
518 movaps m0, [outq + (0 + %4)*mmsize + %6]
519 movaps m2, [outq + (2 + %4)*mmsize + %6]
520 movaps m1, [outq + %1 + (0 + %4)*mmsize + %6]
521 movaps m3, [outq + %1 + (2 + %4)*mmsize + %6]
523 movaps m4, [outq + %2 + (0 + %4)*mmsize + %6]
524 movaps m6, [outq + %2 + (2 + %4)*mmsize + %6]
525 movaps m5, [outq + %3 + (0 + %4)*mmsize + %6]
526 movaps m7, [outq + %3 + (2 + %4)*mmsize + %6]
528 SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
531 m10, m11, m12, m13, m14, m15
533 movaps [outq + (0 + %4)*mmsize + %6], m0
534 movaps [outq + (2 + %4)*mmsize + %6], m2
535 movaps [outq + %1 + (0 + %4)*mmsize + %6], m1
536 movaps [outq + %1 + (2 + %4)*mmsize + %6], m3
538 movaps [outq + %2 + (0 + %4)*mmsize + %6], m4
539 movaps [outq + %2 + (2 + %4)*mmsize + %6], m6
540 movaps [outq + %3 + (0 + %4)*mmsize + %6], m5
541 movaps [outq + %3 + (2 + %4)*mmsize + %6], m7
544 %macro SPLIT_RADIX_LOAD_COMBINE_FULL 2-5
561 SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, offset_c, offset_r, offset_i
562 SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, offset_c, offset_r, offset_i
563 SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 4, 2, offset_c, offset_r, offset_i
564 SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 5, 3, offset_c, offset_r, offset_i
567 ; Perform a single even/odd split radix combination with loads, deinterleaves and
568 ; stores. The _2 indicates this is a half of the iterations required to complete
569 ; a full combine+deinterleave loop
570 ; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6
571 %macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6
572 movaps m8, [rtabq + (0 + %2)*mmsize]
573 vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23
575 movaps m0, [outq + (0 + 0 + %1)*mmsize + %6]
576 movaps m2, [outq + (2 + 0 + %1)*mmsize + %6]
577 movaps m1, [outq + %3 + (0 + 0 + %1)*mmsize + %6]
578 movaps m3, [outq + %3 + (2 + 0 + %1)*mmsize + %6]
580 movaps m4, [outq + %4 + (0 + 0 + %1)*mmsize + %6]
581 movaps m6, [outq + %4 + (2 + 0 + %1)*mmsize + %6]
582 movaps m5, [outq + %5 + (0 + 0 + %1)*mmsize + %6]
583 movaps m7, [outq + %5 + (2 + 0 + %1)*mmsize + %6]
585 SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
588 m10, m11, m12, m13, m14, m15
599 vextractf128 [outq + (0 + 0 + %1)*mmsize + %6 + 0], m0, 0
600 vextractf128 [outq + (0 + 0 + %1)*mmsize + %6 + 16], m10, 0
601 vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 0], m1, 0
602 vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 16], m11, 0
604 vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 0], m4, 0
605 vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 16], m12, 0
606 vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 0], m5, 0
607 vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 16], m13, 0
609 vperm2f128 m10, m10, m0, 0x13
610 vperm2f128 m11, m11, m1, 0x13
611 vperm2f128 m12, m12, m4, 0x13
612 vperm2f128 m13, m13, m5, 0x13
614 movaps m8, [rtabq + (1 + %2)*mmsize]
615 vperm2f128 m9, m9, [itabq - (1 + %2)*mmsize], 0x23
617 movaps m0, [outq + (0 + 1 + %1)*mmsize + %6]
618 movaps m2, [outq + (2 + 1 + %1)*mmsize + %6]
619 movaps m1, [outq + %3 + (0 + 1 + %1)*mmsize + %6]
620 movaps m3, [outq + %3 + (2 + 1 + %1)*mmsize + %6]
622 movaps [outq + (0 + 1 + %1)*mmsize + %6], m10 ; m0 conflict
623 movaps [outq + %3 + (0 + 1 + %1)*mmsize + %6], m11 ; m1 conflict
625 movaps m4, [outq + %4 + (0 + 1 + %1)*mmsize + %6]
626 movaps m6, [outq + %4 + (2 + 1 + %1)*mmsize + %6]
627 movaps m5, [outq + %5 + (0 + 1 + %1)*mmsize + %6]
628 movaps m7, [outq + %5 + (2 + 1 + %1)*mmsize + %6]
630 movaps [outq + %4 + (0 + 1 + %1)*mmsize + %6], m12 ; m4 conflict
631 movaps [outq + %5 + (0 + 1 + %1)*mmsize + %6], m13 ; m5 conflict
633 SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
636 m10, m11, m12, m13, m14, m15 ; temporary registers
647 vextractf128 [outq + (2 + 0 + %1)*mmsize + %6 + 0], m8, 0
648 vextractf128 [outq + (2 + 0 + %1)*mmsize + %6 + 16], m0, 0
649 vextractf128 [outq + (2 + 1 + %1)*mmsize + %6 + 0], m8, 1
650 vextractf128 [outq + (2 + 1 + %1)*mmsize + %6 + 16], m0, 1
652 vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 0], m9, 0
653 vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 16], m1, 0
654 vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 0], m9, 1
655 vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 16], m1, 1
657 vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 0], m10, 0
658 vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 16], m4, 0
659 vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 0], m10, 1
660 vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 16], m4, 1
662 vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 0], m11, 0
663 vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 16], m5, 0
664 vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 0], m11, 1
665 vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 16], m5, 1
668 %macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL 2-3
674 SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, offset
675 SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, offset
679 cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
687 cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
688 movaps m0, [inq + 0*mmsize]
689 movaps m1, [inq + 1*mmsize]
692 shufps m2, m1, m0, q3210
693 shufps m0, m0, m1, q3210
702 movaps [outq + 0*mmsize], m2
703 movaps [outq + 1*mmsize], m0
712 cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
713 mov ctxq, [ctxq + AVTXContext.revtab]
715 LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
716 LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
717 LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
718 LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
720 FFT8 m0, m1, m2, m3, m4, m5
727 movups [outq + 0*mmsize], m4
728 movups [outq + 1*mmsize], m0
729 movups [outq + 2*mmsize], m5
730 movups [outq + 3*mmsize], m1
735 cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
736 mov ctxq, [ctxq + AVTXContext.revtab]
738 LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
739 LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
741 FFT8_AVX m0, m1, m2, m3
746 ; Around 2% faster than 2x vperm2f128 + 2x movapd
747 vextractf128 [outq + 16*0], m2, 0
748 vextractf128 [outq + 16*1], m0, 0
749 vextractf128 [outq + 16*2], m2, 1
750 vextractf128 [outq + 16*3], m0, 1
756 cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
757 mov ctxq, [ctxq + AVTXContext.revtab]
759 LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
760 LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
761 LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6
762 LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7
764 FFT16 m0, m1, m2, m3, m4, m5, m6, m7
771 vextractf128 [outq + 16*0], m4, 0
772 vextractf128 [outq + 16*1], m0, 0
773 vextractf128 [outq + 16*2], m4, 1
774 vextractf128 [outq + 16*3], m0, 1
775 vextractf128 [outq + 16*4], m5, 0
776 vextractf128 [outq + 16*5], m1, 0
777 vextractf128 [outq + 16*6], m5, 1
778 vextractf128 [outq + 16*7], m1, 1
788 cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
789 mov ctxq, [ctxq + AVTXContext.revtab]
791 LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m9
792 LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11
793 LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m12, m13
794 LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m14, m15
796 FFT8 m4, m5, m6, m7, m8, m9
798 LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8, m9
799 LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m10, m11
800 LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13
801 LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15
803 movaps m8, [cos_32_float]
804 vperm2f128 m9, m9, [cos_32_float + 4*8 - 4*7], 0x23
806 FFT16 m0, m1, m2, m3, m10, m11, m12, m13
808 SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
809 m10, m11, m12, m13, m14, m15 ; temporary registers
820 vextractf128 [outq + 16* 0], m8, 0
821 vextractf128 [outq + 16* 1], m0, 0
822 vextractf128 [outq + 16* 2], m8, 1
823 vextractf128 [outq + 16* 3], m0, 1
824 vextractf128 [outq + 16* 4], m9, 0
825 vextractf128 [outq + 16* 5], m1, 0
826 vextractf128 [outq + 16* 6], m9, 1
827 vextractf128 [outq + 16* 7], m1, 1
829 vextractf128 [outq + 16* 8], m11, 0
830 vextractf128 [outq + 16* 9], m4, 0
831 vextractf128 [outq + 16*10], m11, 1
832 vextractf128 [outq + 16*11], m4, 1
833 vextractf128 [outq + 16*12], m10, 0
834 vextractf128 [outq + 16*13], m5, 0
835 vextractf128 [outq + 16*14], m10, 1
836 vextractf128 [outq + 16*15], m5, 1
846 %macro FFT_SPLIT_RADIX_DEF 1-2
852 add outq, (%1*4) - (%1/1)
855 add outq, (%1*2) - (%1/2) ; the synth loops also increment outq
859 sub outq, (%1*4) + (%1*2) + (%1/2)
861 lea rtabq, [cos_ %+ %1 %+ _float]
862 lea itabq, [cos_ %+ %1 %+ _float + %1 - 4*7]
871 SPLIT_RADIX_LOAD_COMBINE_FULL 2*%1, 6*%1, 0, 0, 0
879 jg %2 ; can't do math here, nasm doesn't get it
884 %macro FFT_SPLIT_RADIX_FN 1
886 cglobal split_radix_fft_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
887 mov lenq, [lutq + AVTXContext.m]
888 mov lutq, [lutq + AVTXContext.revtab]
891 ; Bottom-most/32-point transform ===============================================
894 LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8, m9
895 LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m10, m11
896 LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m12, m13
897 LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m14, m15
899 FFT8 m4, m5, m6, m7, m8, m9
901 LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8, m9
902 LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m10, m11
903 LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13
904 LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15
906 movaps m8, [cos_32_float]
907 vperm2f128 m9, m9, [cos_32_float + 32 - 4*7], 0x23
909 FFT16 m0, m1, m2, m3, m10, m11, m12, m13
911 SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
912 m10, m11, m12, m13, m14, m15 ; temporary registers
914 movaps [outq + 1*mmsize], m1
915 movaps [outq + 3*mmsize], m3
916 movaps [outq + 5*mmsize], m5
917 movaps [outq + 7*mmsize], m7
919 add lutq, (mmsize/2)*8
923 movaps [outq + 0*mmsize], m0
924 movaps [outq + 2*mmsize], m2
925 movaps [outq + 4*mmsize], m4
926 movaps [outq + 6*mmsize], m6
930 ; 64-point transform ===========================================================
933 ; Helper defines, these make it easier to track what's happening
950 LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tw_o
951 LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tmp1, tmp2
952 LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tw_o
953 LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tmp1, tmp2
955 FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1
957 LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tmp1, tmp2
958 LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_e, tw_o
959 LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tmp1, tmp2
960 LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_e, tw_o
962 FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o
964 movaps tw_e, [cos_64_float]
965 vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7], 0x23
967 add lutq, (mmsize/2)*8
971 SPLIT_RADIX_COMBINE_64
977 ; 128-point transform ==========================================================
992 lea rtabq, [cos_128_float]
993 lea itabq, [cos_128_float + 128 - 4*7]
998 SPLIT_RADIX_LOAD_COMBINE_FULL 2*128, 6*128
1004 ; 256-point transform ==========================================================
1019 lea rtabq, [cos_256_float]
1020 lea itabq, [cos_256_float + 256 - 4*7]
1025 SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256
1026 SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256, 8*mmsize, 4*mmsize, -4*mmsize
1032 ; 512-point transform ==========================================================
1047 lea rtabq, [cos_512_float]
1048 lea itabq, [cos_512_float + 512 - 4*7]
1056 SPLIT_RADIX_LOAD_COMBINE_FULL 2*512, 6*512
1067 ; 1024-point transform ==========================================================
1080 sub outq, 192*mmsize
1082 lea rtabq, [cos_1024_float]
1083 lea itabq, [cos_1024_float + 1024 - 4*7]
1091 SPLIT_RADIX_LOAD_COMBINE_FULL 2*1024, 6*1024
1102 ; 2048 to 131072-point transforms ==============================================
1103 FFT_SPLIT_RADIX_DEF 2048, .4096pt
1104 FFT_SPLIT_RADIX_DEF 4096, .8192pt
1105 FFT_SPLIT_RADIX_DEF 8192, .16384pt
1106 FFT_SPLIT_RADIX_DEF 16384, .32768pt
1107 FFT_SPLIT_RADIX_DEF 32768, .65536pt
1108 FFT_SPLIT_RADIX_DEF 65536, .131072pt
1109 FFT_SPLIT_RADIX_DEF 131072
1111 ;===============================================================================
1112 ; Final synthesis + deinterleaving code
1113 ;===============================================================================
1119 lea lutq, [4*lenq + tmpq]
1121 .synth_deinterleave:
1122 SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL tmpq, lutq
1127 jg .synth_deinterleave
1131 ; 64-point deinterleave which only has to load 4 registers =====================
1133 SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
1134 SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, tw_e
1136 unpcklpd tmp1, m0, m2
1137 unpcklpd tmp2, m1, m3
1138 unpcklpd tw_o, tx1_e0, tx1_o0
1139 unpcklpd tw_e, tx2_e0, tx2_o0
1142 unpckhpd tx1_e0, tx1_e0, tx1_o0
1143 unpckhpd tx2_e0, tx2_e0, tx2_o0
1145 vextractf128 [outq + 0*mmsize + 0], tmp1, 0
1146 vextractf128 [outq + 0*mmsize + 16], m0, 0
1147 vextractf128 [outq + 4*mmsize + 0], tmp2, 0
1148 vextractf128 [outq + 4*mmsize + 16], m1, 0
1150 vextractf128 [outq + 8*mmsize + 0], tw_o, 0
1151 vextractf128 [outq + 8*mmsize + 16], tx1_e0, 0
1152 vextractf128 [outq + 9*mmsize + 0], tw_o, 1
1153 vextractf128 [outq + 9*mmsize + 16], tx1_e0, 1
1155 vperm2f128 tmp1, tmp1, m0, 0x31
1156 vperm2f128 tmp2, tmp2, m1, 0x31
1158 vextractf128 [outq + 12*mmsize + 0], tw_e, 0
1159 vextractf128 [outq + 12*mmsize + 16], tx2_e0, 0
1160 vextractf128 [outq + 13*mmsize + 0], tw_e, 1
1161 vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1
1163 movaps tw_e, [cos_64_float + mmsize]
1164 vperm2f128 tw_o, tw_o, [cos_64_float + 64 - 4*7 - mmsize], 0x23
1166 movaps m0, [outq + 1*mmsize]
1167 movaps m1, [outq + 3*mmsize]
1168 movaps m2, [outq + 5*mmsize]
1169 movaps m3, [outq + 7*mmsize]
1171 movaps [outq + 1*mmsize], tmp1
1172 movaps [outq + 5*mmsize], tmp2
1174 SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
1175 tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers
1177 unpcklpd tmp1, m0, m1
1178 unpcklpd tmp2, m2, m3
1179 unpcklpd tw_e, tx1_e1, tx1_o1
1180 unpcklpd tw_o, tx2_e1, tx2_o1
1183 unpckhpd tx1_e1, tx1_e1, tx1_o1
1184 unpckhpd tx2_e1, tx2_e1, tx2_o1
1186 vextractf128 [outq + 2*mmsize + 0], tmp1, 0
1187 vextractf128 [outq + 2*mmsize + 16], m0, 0
1188 vextractf128 [outq + 3*mmsize + 0], tmp1, 1
1189 vextractf128 [outq + 3*mmsize + 16], m0, 1
1191 vextractf128 [outq + 6*mmsize + 0], tmp2, 0
1192 vextractf128 [outq + 6*mmsize + 16], m2, 0
1193 vextractf128 [outq + 7*mmsize + 0], tmp2, 1
1194 vextractf128 [outq + 7*mmsize + 16], m2, 1
1196 vextractf128 [outq + 10*mmsize + 0], tw_e, 0
1197 vextractf128 [outq + 10*mmsize + 16], tx1_e1, 0
1198 vextractf128 [outq + 11*mmsize + 0], tw_e, 1
1199 vextractf128 [outq + 11*mmsize + 16], tx1_e1, 1
1201 vextractf128 [outq + 14*mmsize + 0], tw_o, 0
1202 vextractf128 [outq + 14*mmsize + 16], tx2_e1, 0
1203 vextractf128 [outq + 15*mmsize + 0], tw_o, 1
1204 vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1
1210 FFT_SPLIT_RADIX_FN avx
1211 %if HAVE_AVX2_EXTERNAL
1212 FFT_SPLIT_RADIX_FN avx2