2 * FFT transform with Altivec optimizations
3 * Copyright (c) 2009 Loren Merritt
5 * This algorithm (though not any of the implementation details) is
6 * based on libdjbfft by D. J. Bernstein.
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * These functions are not individually interchangeable with the C versions.
27 * While C takes arrays of FFTComplex, Altivec leaves intermediate results
28 * in blocks as convenient to the vector size.
29 * i.e. {4x real, 4x imaginary, 4x real, ...}
31 * I ignore standard calling convention.
32 * Instead, the following registers are treated as global constants:
35 * v19..v29: permutations
38 * and the rest are free for local use.
// addi2: add the immediate imm to register ra, in place.
// The visible instruction adds the high-adjusted upper 16 bits (imm@ha)
// through addis.
// NOTE(review): the step that adds the low 16 bits and the closing .endm
// are not visible in this excerpt -- confirm against the full file.
46 .macro addi2 ra, imm // add 32-bit immediate
51 addis \ra, \ra, \imm@ha
// FFT4: 4-point complex FFT on one pair of vector registers.
//   a0, a1  in:  per the vcprm comments on the first two vperm lines, the
//                inputs are interleaved: a0 = {r0,i0,r1,i1},
//                a1 = {r2,i2,r3,i3}. Both registers are overwritten.
//   a2, a3  out: a2 = {r0,r1,r2,r3}, a3 = {i0,i1,i2,i3}, i.e. reals and
//                imaginaries split into separate vectors.
// Requires v20-v24 to already hold the permutation masks named in the
// vcprm(...) comment next to each vperm.
// NOTE(review): the closing .endm is not visible in this excerpt.
55 .macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
// Stage 1: regroup the inputs, then form sums and differences.
56 vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
57 vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
58 vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
59 vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
// Stage 2: regroup the intermediates, then a second sum/difference.
60 vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
61 vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
62 vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
63 vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
// Final shuffle: gather all reals into a2 and all imaginaries into a3.
64 vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
65 vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
// FFT4x2: takes two register sets, (a0,a1 -> a2,a3) and (b0,b1 -> b2,b3).
// The visible instructions are the same sequence as FFT4, applied to the
// a registers only: a0,a1 are consumed and overwritten, a2 receives
// {r0,r1,r2,r3} and a3 receives {i0,i1,i2,i3}.
// NOTE(review): no instruction touching b0-b3 is visible here; presumably
// the matching b-lane instructions are interleaved between the a-lane ones
// in the full file -- confirm. The closing .endm is also not visible.
68 .macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
69 vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
70 vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
73 vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
74 vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
77 vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
78 vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
81 vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
82 vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
85 vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
86 vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
// FFT8: 8-point complex FFT, in place in a0,a1,b0,b1.
//   in:  a0,a1 hold points 0-3 in the same layout FFT4 takes (they go
//        through the same v20 vperm). Per the vmrghw/vmrglw comments,
//        b0 = {r4,i4,r5,i5} and b1 = {r6,i6,r7,i7}.
//   out: a0 = {r0..r3}, a1 = {i0..i3}, b0 = {r4..r7}, b1 = {i4..i7}.
//   a2, a3, b2, b3, b4 are scratch and are overwritten.
// Requires the permutation masks in v20, v22-v29 and the constant vectors
// v17 and v18 described in the inline comments. v14 is the addend of the
// first vmaddfp; since the comment treats the result as a plain product,
// v14 presumably holds zero -- TODO confirm.
// The a-register lines (tagged "FFT4 ...") are an FFT4 of points 0-3
// interleaved with the b-register work.
// NOTE(review): some of those FFT4 steps and the closing .endm are not
// visible in this excerpt.
91 .macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
92 vmrghw \b2,\b0,\b1 // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
93 vmrglw \b3,\b0,\b1 // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
94 vperm \a2,\a0,\a1,v20 // FFT4 ...
96 vaddfp \b0,\b2,\b3 // {t1,t3,t2,t4}
97 vsubfp \b1,\b2,\b3 // {r5,r7,i5,i7}
98 vperm \b4,\b1,\b1,v25 // vcprm(2,3,0,1) // {i5,i7,r5,r7}
// Combine b1 with its half-swapped copy b4, scaled by the 1/sqrt(2)
// constant vectors: b1 = b1*v17 + v14, then b1 = b4*v18 + b1.
101 vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
102 vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
104 vperm \a3,\a0,\a1,v22
105 vperm \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
106 vperm \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
109 vaddfp \b0,\b2,\b3 // {t1,t2,t9,ta}
110 vsubfp \b1,\b2,\b3 // {t6,t5,tc,tb}
111 vperm \a2,\a0,\a1,v23
112 vperm \a3,\a0,\a1,v24
113 vperm \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
114 vperm \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
// Final butterfly between the FFT4 result (a2,a3) and the odd-half
// terms (b2,b3): differences give points 4-7, sums give points 0-3.
115 vsubfp \b0,\a2,\b2 // {r4,r5,r6,r7}
116 vsubfp \b1,\a3,\b3 // {i4,i5,i6,i7}
117 vaddfp \a0,\a2,\b2 // {r0,r1,r2,r3}
118 vaddfp \a1,\a3,\b3 // {i0,i1,i2,i3}
// BF: helper macro taking two destination registers (d0, d1) and two
// source registers (s0, s1).
// NOTE(review): the body is not visible in this excerpt; the name suggests
// a butterfly (sum and difference of s0 and s1) -- confirm against the
// full file before relying on which destination gets which result.
121 .macro BF d0,d1,s0,s1
// zip: helper macro taking two destination registers (d0, d1) and two
// source registers (s0, s1).
// NOTE(review): the body is not visible in this excerpt; presumably it
// interleaves the elements of s0 and s1 into d0/d1 -- confirm against the
// full file.
126 .macro zip d0,d1,s0,s1
// def_fft4: emits the label fft4<interleave>_altivec, where the macro
// argument is pasted into the symbol name (the "\()" only terminates the
// argument name so that "_altivec" can follow it directly).
// NOTE(review): the routine body and the closing .endm are not visible in
// this excerpt.
131 .macro def_fft4 interleave
132 fft4\interleave\()_altivec:
// def_fft8: emits the label fft8<interleave>_altivec and runs FFT8 with
// v0-v3 as the in/out data registers and v4-v8 as scratch.
// NOTE(review): the loads that fill v0-v3, the stores of the result, the
// return and the closing .endm are not visible in this excerpt.
147 .macro def_fft8 interleave
148 fft8\interleave\()_altivec:
154 FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8
// def_fft16: emits the label fft16<interleave>_altivec.
// Visible steps:
//   1. FFT4x2 on v0-v3, results in v4-v7. The twiddle step below reads
//      them as v4 = r2, v5 = i2, v6 = r3, v7 = i3.
//   2. FFT8 on v0-v3 with v8-v12 as scratch. FFT4x2 overwrites v0,v1 (and
//      presumably v2,v3), so they must be reloaded in between; those
//      loads are not visible here.
//   3. Twiddle multiply with v15 = wre and v16 = wim, per the inline
//      comments. v14 is the addend of the first four multiplies and is
//      presumably zero -- TODO confirm.
//        v8  = r2*wre + i2*wim
//        v9  = i2*wre - r2*wim
//        v10 = r3*wre - i3*wim
//        v11 = i3*wre + r3*wim
// NOTE(review): the loads, the final combination and stores, and the
// closing .endm are not visible in this excerpt.
171 .macro def_fft16 interleave
172 fft16\interleave\()_altivec:
180 FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
185 FFT8 v0,v1,v2,v3,v8,v9,v10,v11,v12
186 vmaddfp v8,v4,v15,v14 // r2*wre
187 vmaddfp v9,v5,v15,v14 // i2*wre
188 vmaddfp v10,v6,v15,v14 // r3*wre
189 vmaddfp v11,v7,v15,v14 // i3*wre
// vmaddfp accumulates a*c + b; vnmsubfp computes b - a*c.
190 vmaddfp v8,v5,v16,v8 // i2*wim
191 vnmsubfp v9,v4,v16,v9 // r2*wim
192 vnmsubfp v10,v7,v16,v10 // i3*wim
193 vmaddfp v11,v6,v16,v11 // r3*wim
226 // void pass(float *z, float *wre, int n)
// PASS: emits the label fft_pass<suffix>_altivec, the routine described by
// the prototype comment directly above.
// Register use in the visible lines:
//   r3          base address of z (every lvx is indexed from it)
//   r5, r7, r10 byte offsets o1, o2, o3; r6, r8, r11 are those plus 16
//   r9          offset used to load z[16]; presumably the constant 16,
//               its setup is not visible -- TODO confirm
//   v8, v9      twiddle vectors wre and wim; v9 is first rearranged with
//               the v19 mask so that it reads wim[0 .. -3]
//   v14         addend of the first multiply of each pair; presumably
//               zero -- TODO confirm
// The multiply-adds leave in v10-v13:
//   v10 = r2*wre + i2*wim     v11 = i2*wre - r2*wim
//   v12 = r3*wre - i3*wim     v13 = i3*wre + r3*wim
// The loads of v0-v3 (r0, i0, r1, i1) sit between the arithmetic
// instructions, presumably to hide load latency.
// NOTE(review): the interleave parameter is not referenced in the visible
// lines; the twiddle loads, butterflies, stores, loop control and the
// closing .endm are not visible in this excerpt.
227 .macro PASS interleave, suffix
228 fft_pass\suffix\()_altivec:
235 addi r6,r5,16 // o1+16
236 addi r8,r7,16 // o2+16
237 addi r11,r10,16 // o3+16
243 vperm v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. -3]
244 lvx v4,r3,r7 // r2 = z[o2]
245 lvx v5,r3,r8 // i2 = z[o2+16]
246 lvx v6,r3,r10 // r3 = z[o3]
247 lvx v7,r3,r11 // i3 = z[o3+16]
248 vmaddfp v10,v4,v8,v14 // r2*wre
249 vmaddfp v11,v5,v8,v14 // i2*wre
250 vmaddfp v12,v6,v8,v14 // r3*wre
251 vmaddfp v13,v7,v8,v14 // i3*wre
252 lvx v0, 0,r3 // r0 = z[0]
253 lvx v3,r3,r6 // i1 = z[o1+16]
254 vmaddfp v10,v5,v9,v10 // i2*wim
255 vnmsubfp v11,v4,v9,v11 // r2*wim
256 vnmsubfp v12,v7,v9,v12 // i3*wim
257 vmaddfp v13,v6,v9,v13 // r3*wim
258 lvx v1,r3,r9 // i0 = z[16]
259 lvx v2,r3,r5 // r1 = z[o1]
// Preprocessor helpers for the constant tables.
302 #define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */
// Byte-index groups for building vperm control vectors. WORD_0..WORD_3
// list the four byte indices of 32-bit word 0..3 of the first vperm source
// (indices 0x00-0x0f); WORD_s0..WORD_s3 do the same for the second source
// (indices 0x10-0x1f).
304 #define WORD_0 0x00,0x01,0x02,0x03
305 #define WORD_1 0x04,0x05,0x06,0x07
306 #define WORD_2 0x08,0x09,0x0a,0x0b
307 #define WORD_3 0x0c,0x0d,0x0e,0x0f
308 #define WORD_s0 0x10,0x11,0x12,0x13
309 #define WORD_s1 0x14,0x15,0x16,0x17
310 #define WORD_s2 0x18,0x19,0x1a,0x1b
311 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
// vcprm(a,b,c,d) emits one 16-byte permute mask as .byte data: the result
// words are, in order, words a, b, c and d, each chosen from 0..3 (first
// source) or s0..s3 (second source). This is the notation used in the
// comments next to the vperm instructions in this file.
313 #define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
// Constant rows, four floats (one 16-byte vector) each:
//   row 1: cos(k*pi/8) for k = 0..3
//   row 2: sin(k*pi/8) for k = 0..3
//   row 3: {-1, 1, 1,-1}/sqrt(2)
//   row 4: { 1, 1, 1, 1}/sqrt(2)
// Rows 3 and 4 match the multipliers that the FFT8 comments expect in v17
// and v18. Rows 1 and 2 are presumably the wre/wim vectors that def_fft16
// reads from v15/v16.
// NOTE(review): the section directive, alignment, label and any
// neighbouring rows are not visible in this excerpt, so the exact
// row-to-register mapping should be confirmed against the lvm calls in
// fft_calc.
319 .float 1, 0.92387953, M_SQRT1_2, 0.38268343
320 .float 0, 0.38268343, M_SQRT1_2, 0.92387953
321 .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
322 .float M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
// lvm: variadic helper invoked as "lvm <base>, <vreg>, <more vregs...>".
// In the calls visible in fft_calc, b is r6, r is the first vector register
// and regs is the remaining list.
// NOTE(review): the body is not visible in this excerpt; presumably it
// loads consecutive 16-byte vectors from the address in b into each listed
// register -- confirm, including whether b is advanced.
335 .macro lvm b, r, regs:vararg
// stvm: variadic helper invoked as "stvm <base>, <vreg>, <more vregs...>".
// In the call visible in fft_calc, b is r6 and the list is v20..v29.
// NOTE(review): the body is not visible in this excerpt; presumably it is
// the store counterpart of lvm, writing each listed vector register to
// consecutive 16-byte slots at the address in b -- confirm.
343 .macro stvm b, r, regs:vararg
// fft_calc: emits the entry point ff_fft_calc<interleave>_altivec through
// the extfunc helper.
// Visible steps, in order:
//   - stpu r1,-(160+16*PS)(r1): presumably a pointer-sized
//     store-with-update that allocates a stack frame of 160+16*PS bytes.
//   - stvm saves v20..v29; the final lvm with the same register list
//     reloads them, since the constant loads in between overwrite them.
//   - two lvm calls fill v14..v29; v19..v29 are the permutation constants
//     named in the file header.
//   - r12 receives the address of ff_cos_tabs.
//   - r6 receives the address of this variant's dispatch table.
//   - r3 is shifted left by 2+ARCH_PPC64, i.e. multiplied by 4 or 8;
//     presumably this scales an index by the pointer size to address the
//     PTR entries of the dispatch table.
// NOTE(review): extfunc, stpu, movrel, X, PS and ARCH_PPC64 are defined
// outside this excerpt, and the lines that set r6 before each stvm/lvm,
// perform the indirect call, and return are not visible -- confirm.
351 .macro fft_calc interleave
352 extfunc ff_fft_calc\interleave\()_altivec
355 stpu r1, -(160+16*PS)(r1)
357 stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
364 lvm r6, v14, v15, v16, v17, v18, v19, v20, v21
365 lvm r6, v22, v23, v24, v25, v26, v27, v28, v29
368 movrel r12, X(ff_cos_tabs)
370 movrel r6, fft_dispatch_tab\interleave\()_altivec
373 slwi r3, r3, 2+ARCH_PPC64
380 lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
// DECL_FFT: emits the label fft<n><suffix>_altivec. In every invocation
// inside DECL_FFTS, n equals 2^bits.
// r0 is stored to, and later reloaded from, the stack slot
// PS*(bits-3)(r1), so each transform size uses its own slot. Presumably r0
// carries the saved link register across the nested calls; the mflr/mtlr
// and the calls themselves are not visible -- TODO confirm.
// The routine ends with a plain branch (no link) to
// fft_pass<suffix>_altivec, so that routine returns to this one's caller.
// NOTE(review): n2 and n4 are not referenced in the visible lines; the
// sub-transform calls, argument setup and the closing .endm are not
// visible in this excerpt.
389 .macro DECL_FFT suffix, bits, n, n2, n4
390 fft\n\suffix\()_altivec:
392 stp r0,PS*(\bits-3)(r1)
399 lp r0,PS*(\bits-3)(r1)
403 b fft_pass\suffix\()_altivec
// DECL_FFTS: instantiates one complete family of transforms for a given
// (interleave, suffix) pair.
// Visible contents:
//   - PASS, which emits fft_pass<suffix>_altivec.
//   - DECL_FFT for every power-of-two size from 32 (bits = 5) to 65536
//     (bits = 16); the arguments are suffix, bits, n, n/2, n/4.
//   - fft_dispatch_tab<suffix>_altivec: 15 PTR entries ordered by size
//     from fft4 to fft65536, so entry k is the 2^(k+2)-point routine.
// NOTE(review): the lines that define fft4/fft8/fft16 for this suffix, the
// section and alignment directives for the table, and the closing .endm
// are not visible in this excerpt.
406 .macro DECL_FFTS interleave, suffix
411 PASS \interleave, \suffix
412 DECL_FFT \suffix, 5, 32, 16, 8
413 DECL_FFT \suffix, 6, 64, 32, 16
414 DECL_FFT \suffix, 7, 128, 64, 32
415 DECL_FFT \suffix, 8, 256, 128, 64
416 DECL_FFT \suffix, 9, 512, 256, 128
417 DECL_FFT \suffix,10, 1024, 512, 256
418 DECL_FFT \suffix,11, 2048, 1024, 512
419 DECL_FFT \suffix,12, 4096, 2048, 1024
420 DECL_FFT \suffix,13, 8192, 4096, 2048
421 DECL_FFT \suffix,14,16384, 8192, 4096
422 DECL_FFT \suffix,15,32768,16384, 8192
423 DECL_FFT \suffix,16,65536,32768,16384
// Dispatch table of routine addresses, smallest transform first.
429 fft_dispatch_tab\suffix\()_altivec:
430 PTR fft4\suffix\()_altivec
431 PTR fft8\suffix\()_altivec
432 PTR fft16\suffix\()_altivec
433 PTR fft32\suffix\()_altivec
434 PTR fft64\suffix\()_altivec
435 PTR fft128\suffix\()_altivec
436 PTR fft256\suffix\()_altivec
437 PTR fft512\suffix\()_altivec
438 PTR fft1024\suffix\()_altivec
439 PTR fft2048\suffix\()_altivec
440 PTR fft4096\suffix\()_altivec
441 PTR fft8192\suffix\()_altivec
442 PTR fft16384\suffix\()_altivec
443 PTR fft32768\suffix\()_altivec
444 PTR fft65536\suffix\()_altivec
// Instantiate the family with interleave = 1 and suffix = _interleave.
// This produces symbols such as fft_pass_interleave_altivec,
// fft32_interleave_altivec and fft_dispatch_tab_interleave_altivec.
448 DECL_FFTS 1, _interleave