git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/fft_neon.S
avcodec/adpcm_ima_apm: cosmetics
[ffmpeg] / libavcodec / aarch64 / fft_neon.S
1 /*
2  * ARM NEON optimised FFT
3  *
4  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
5  * Copyright (c) 2009 Naotoshi Nojiri
6  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
7  *
8  * This algorithm (though not any of the implementation details) is
9  * based on libdjbfft by D. J. Bernstein.
10  *
11  * This file is part of FFmpeg.
12  *
13  * FFmpeg is free software; you can redistribute it and/or
14  * modify it under the terms of the GNU Lesser General Public
15  * License as published by the Free Software Foundation; either
16  * version 2.1 of the License, or (at your option) any later version.
17  *
18  * FFmpeg is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21  * Lesser General Public License for more details.
22  *
23  * You should have received a copy of the GNU Lesser General Public
24  * License along with FFmpeg; if not, write to the Free Software
25  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26  */
27
28 #include "libavutil/aarch64/asm.S"
29 // 1/sqrt(2); supplies the lane magnitudes of the mppm constant vector defined at the end of this file.
30 #define M_SQRT1_2 0.70710678118654752440
31 // transpose d0, d1, s0, s1: d0 = trn1(s0, s1), d1 = trn2(s0, s1). With the .2d operands used in this file that is d0 = {s0[0], s1[0]}, d1 = {s0[1], s1[1]}. d0 is written before s0/s1 are read again, so it must not alias either source.
32 .macro transpose d0, d1, s0, s1
33         trn1            \d0, \s0, \s1
34         trn2            \d1, \s0, \s1
35 .endm
36
37 // fft4_neon: 4-point FFT butterfly, in place, on the four {re,im} float pairs at x0. Uses v0-v7, v16, v17; x0 is not modified.
38 function fft4_neon
39         ld1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]   // v0..v3 = z[0..3], lanes {re,im}
40
41         fadd            v4.2s,  v0.2s,  v1.2s   // r0+r1,i0+i1
42         fsub            v6.2s,  v0.2s,  v1.2s   // r0-r1,i0-i1
43
44         ext             v16.8b, v2.8b,  v3.8b,  #4   // {i2,r3}
45         ext             v17.8b, v3.8b,  v2.8b,  #4   // {i3,r2}
46
47         fadd            v5.2s,  v2.2s,  v3.2s   // r2+r3,i2+i3
48         fsub            v7.2s,  v16.2s, v17.2s  // i2-i3,r3-r2
49
50         fadd            v0.2s,  v4.2s,  v5.2s   // new z[0]
51         fsub            v2.2s,  v4.2s,  v5.2s   // new z[2]
52         fadd            v1.2s,  v6.2s,  v7.2s   // new z[1]
53         fsub            v3.2s,  v6.2s,  v7.2s   // new z[3]
54
55         st1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]   // write z[0..3] back in place
56
57         ret
58 endfunc
59 // fft8_neon: 8-point FFT, in place, on the eight {re,im} float pairs at x0. Expects v28 = mppm (loaded by ff_fft_calc_neon). On return x1 = original x0 and x0 = original x0 + 32.
60 function fft8_neon
61         mov             x1,  x0                          // keep the base; x0 is post-incremented below
62         ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32   // z[0..3]
63         ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]        // z[4..7]
64         ext             v22.8b, v2.8b,  v3.8b,  #4       // {i2,r3}
65         ext             v23.8b, v3.8b,  v2.8b,  #4       // {i3,r2}
66         fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
67         fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
68         fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
69         fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
70         rev64           v27.2s, v28.2s  // v27 = {+w,-w}: low lanes of v28 ({-w,+w}, w = M_SQRT1_2) swapped
71         fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
72         fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
73         fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
74         ext             v6.8b,  v4.8b,  v5.8b,  #4       // {i4+i5,r6+r7}
75         ext             v7.8b,  v5.8b,  v4.8b,  #4       // {i6+i7,r4+r5}
76         fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
77         fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
78         fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
79         fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
80         fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
81         fadd            v0.2s,  v20.2s, v21.2s           // 4-point butterfly on z[0..3], same pattern as fft4_neon
82         fsub            v2.2s,  v20.2s, v21.2s
83         fadd            v1.2s,  v22.2s, v23.2s
84         rev64           v26.2s, v26.2s                   // a2i*w,-a2r*w
85         rev64           v27.2s, v27.2s                   // -a3i*w,a3r*w
86         fsub            v3.2s,  v22.2s, v23.2s
87         fsub            v6.2s,  v6.2s,  v7.2s            // (i4+i5)-(i6+i7),(r6+r7)-(r4+r5)
88         fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
89         fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
90         fadd            v7.2s,  v4.2s,  v5.2s            // r4+r5+r6+r7,i4+i5+i6+i7
91         fsub            v18.2s, v2.2s,  v6.2s            // new z[6]
92         ext             v26.8b, v24.8b, v25.8b, #4       // {v24[1],v25[0]}
93         ext             v27.8b, v25.8b, v24.8b, #4       // {v25[1],v24[0]}
94         fadd            v2.2s,  v2.2s,  v6.2s            // new z[2]
95         fsub            v16.2s, v0.2s,  v7.2s            // new z[4]
96         fadd            v5.2s,  v25.2s, v24.2s
97         fsub            v4.2s,  v26.2s, v27.2s
98         fadd            v0.2s,  v0.2s,  v7.2s            // new z[0]
99         fsub            v17.2s, v1.2s,  v5.2s            // new z[5]
100         fsub            v19.2s, v3.2s,  v4.2s            // new z[7]
101         fadd            v3.2s,  v3.2s,  v4.2s            // new z[3]
102         fadd            v1.2s,  v1.2s,  v5.2s            // new z[1]
103
104         st1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]   // z[4..7] (x0 was advanced by 32 above)
105         st1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x1]   // z[0..3]
106
107         ret
108 endfunc
109 // fft16_neon: 16-point FFT, in place, on the 16 {re,im} float pairs at x0. Expects v28-v31 and x14 as set up by ff_fft_calc_neon. On return x0 and x1 both equal the original x0 + 128.
110 function fft16_neon
111         mov             x1,  x0                          // keep the base for the final stores
112         ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32   // z[0..3]
113         ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32   // z[4..7]
114         ext             v22.8b, v2.8b,  v3.8b,  #4       // {i2,r3}
115         ext             v23.8b, v3.8b,  v2.8b,  #4       // {i3,r2}
116         fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
117         fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
118         fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
119         fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
120         rev64           v27.2s, v28.2s  // v27 = {+w,-w}: low lanes of v28 ({-w,+w}, w = M_SQRT1_2) swapped
121         fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
122         fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
123         fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
124         ext             v6.8b,  v4.8b,  v5.8b,  #4       // {i4+i5,r6+r7}
125         ext             v7.8b,  v5.8b,  v4.8b,  #4       // {i6+i7,r4+r5}
126         fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
127         fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
128         fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
129         fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
130         fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
131         fadd            v0.2s,  v20.2s, v21.2s
132         fsub            v2.2s,  v20.2s, v21.2s
133         fadd            v1.2s,  v22.2s, v23.2s
134         rev64           v26.2s, v26.2s                   // a2i*w,-a2r*w
135         rev64           v27.2s, v27.2s                   // -a3i*w,a3r*w
136         fsub            v3.2s,  v22.2s, v23.2s
137         fsub            v6.2s,  v6.2s,  v7.2s            // (i4+i5)-(i6+i7),(r6+r7)-(r4+r5)
138         fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
139         fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
140         fadd            v7.2s,  v4.2s,  v5.2s            // r4+r5+r6+r7,i4+i5+i6+i7
141         fsub            v18.2s, v2.2s,  v6.2s
142         ld1             {v20.4s,v21.4s}, [x0], #32       // z[8..11], two complex values per register
143         ld1             {v22.4s,v23.4s}, [x0], #32       // z[12..15]
144         ext             v26.8b, v24.8b, v25.8b, #4
145         ext             v27.8b, v25.8b, v24.8b, #4
146         fadd            v2.2s,  v2.2s,  v6.2s
147         fsub            v16.2s, v0.2s,  v7.2s
148         fadd            v5.2s,  v25.2s, v24.2s
149         fsub            v4.2s,  v26.2s, v27.2s
150         transpose       v24.2d, v25.2d, v20.2d, v22.2d   // v24 = {z[8],z[12]}, v25 = {z[9],z[13]}
151         transpose       v26.2d, v27.2d, v21.2d, v23.2d   // v26 = {z[10],z[14]}, v27 = {z[11],z[15]}
152         fadd            v0.2s,  v0.2s,  v7.2s
153         fsub            v17.2s, v1.2s,  v5.2s
154         fsub            v19.2s, v3.2s,  v4.2s
155         fadd            v3.2s,  v3.2s,  v4.2s
156         fadd            v1.2s,  v1.2s,  v5.2s
157         ext             v20.16b, v21.16b, v21.16b,  #4   // v21 rotated by one float: {i10,r11,i11,r10}
158         ext             v21.16b, v23.16b, v23.16b,  #4   // v23 rotated by one float: {i14,r15,i15,r14}
159         // pack the 8-point results (low halves of each register) two complex values per register
160         zip1            v0.2d,  v0.2d,  v1.2d   // {z[0],   z[1]}
161         zip1            v1.2d,  v2.2d,  v3.2d   // {z[2],   z[3]}
162         zip1            v2.2d,  v16.2d, v17.2d  // {z[o1],  z[o1+1]}
163         zip1            v3.2d,  v18.2d, v19.2d  // {z[o1+2],z[o1+3]}
164
165         // 2 x fft4: z[8..11] in the low 64 bits, z[12..15] in the high 64 bits of each register
166         transpose       v22.2d, v23.2d, v20.2d, v21.2d   // v22 = {i10,r11,i14,r15}, v23 = {i11,r10,i15,r14}
167
168         fadd            v4.4s,  v24.4s, v25.4s   // {z8+z9,   z12+z13}
169         fadd            v5.4s,  v26.4s, v27.4s   // {z10+z11, z14+z15}
170         fsub            v6.4s,  v24.4s, v25.4s   // {z8-z9,   z12-z13}
171         fsub            v7.4s,  v22.4s, v23.4s   // {i10-i11,r11-r10,i14-i15,r15-r14}
172
173         ld1             {v23.4s},  [x14]         // first four floats of ff_cos_16 (x14 is set by ff_fft_calc_neon)
174
175         fadd            v24.4s, v4.4s,  v5.4s   // {z[o2+0],z[o3+0]}  (o2 = 8, o3 = 12)
176         fsub            v26.4s, v4.4s,  v5.4s   // {z[o2+2],z[o3+2]}
177         fadd            v25.4s, v6.4s,  v7.4s   // {z[o2+1],z[o3+1]}
178         fsub            v27.4s, v6.4s,  v7.4s   // {z[o2+3],z[o3+3]}
179
180         // combine pass, same instruction sequence as fft_pass_neon; first half: v24 is used unscaled, v25 is scaled by table entries 1 and 3
181         rev64           v7.4s,  v25.4s           // swap re/im within each complex value
182         fmul            v25.4s, v25.4s, v23.s[1]
183         fmul            v7.4s,  v7.4s,  v29.4s   // apply the pmmp sign pattern {+,-,-,+}
184         fmla            v25.4s, v7.4s,  v23.s[3] // {t1a,t2a,t5a,t6a}
185
186         zip1            v20.4s, v24.4s, v25.4s
187         zip2            v21.4s, v24.4s, v25.4s
188         fneg            v22.4s, v20.4s
189         fadd            v4.4s,  v21.4s, v20.4s
190         fsub            v6.4s,  v20.4s, v21.4s  // just the second half
191         fadd            v5.4s,  v21.4s, v22.4s  // just the first half
192
193         tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
194         tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
195
196         fsub            v20.4s, v0.4s,  v4.4s   // {z[o2],z[o2+1]}
197         fadd            v16.4s, v0.4s,  v4.4s   // {z[0], z[1]}
198         fsub            v22.4s, v2.4s,  v5.4s   // {z[o3],z[o3+1]}
199         fadd            v18.4s, v2.4s,  v5.4s   // {z[o1],z[o1+1]}
200
201 //second half: v26 and v27 are both scaled, using table entries 2, 3 and 2, 1
202         rev64           v6.4s,  v26.4s
203         fmul            v26.4s, v26.4s, v23.s[2]
204         rev64           v7.4s,  v27.4s
205         fmul            v27.4s, v27.4s, v23.s[3]
206         fmul            v6.4s,  v6.4s,  v29.4s
207         fmul            v7.4s,  v7.4s,  v29.4s
208         fmla            v26.4s, v6.4s,  v23.s[2] // {t1,t2,t5,t6}
209         fmla            v27.4s, v7.4s,  v23.s[1] // {t1a,t2a,t5a,t6a}
210
211         zip1            v24.4s, v26.4s, v27.4s
212         zip2            v25.4s, v26.4s, v27.4s
213         fneg            v26.4s, v24.4s
214         fadd            v4.4s,  v25.4s, v24.4s
215         fsub            v6.4s,  v24.4s, v25.4s  // just the second half
216         fadd            v5.4s,  v25.4s, v26.4s  // just the first half
217
218         tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
219         tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
220
221         fadd            v17.4s, v1.4s, v4.4s    // {z[2], z[3]}
222         fsub            v21.4s, v1.4s, v4.4s    // {z[o2+2],z[o2+3]}
223         fadd            v19.4s, v3.4s, v5.4s    // {z[o1+2],z[o1+3]}
224         fsub            v23.4s, v3.4s, v5.4s    // {z[o3+2],z[o3+3]}
225
226         st1             {v16.4s,v17.4s}, [x1], #32   // z[0..3]
227         st1             {v18.4s,v19.4s}, [x1], #32   // z[4..7]
228         st1             {v20.4s,v21.4s}, [x1], #32   // z[8..11]
229         st1             {v22.4s,v23.4s}, [x1], #32   // z[12..15]
230
231         ret
232 endfunc
233
234 // trans4_float: byte-index vector for a one-register tbl (loaded into v30 by ff_fft_calc_neon); it reorders the four 32-bit lanes as {0, 2, 1, 3}.
235 const  trans4_float, align=4
236         .byte    0,  1,  2,  3          // lane 0
237         .byte    8,  9, 10, 11          // lane 2
238         .byte    4,  5,  6,  7          // lane 1
239         .byte   12, 13, 14, 15          // lane 3
240 endconst
241 // trans8_float: byte-index vector for a two-register tbl (loaded into v31 by ff_fft_calc_neon); indices 0-15 select from the first table register, 16-31 from the second.
242 const  trans8_float, align=4
243         .byte   24, 25, 26, 27          // second register, lane 2
244         .byte    0,  1,  2,  3          // first register,  lane 0
245         .byte   28, 29, 30, 31          // second register, lane 3
246         .byte    4,  5,  6,  7          // first register,  lane 1
247 endconst
248 // fft_pass_neon: combine pass over four quarter blocks of z. In: x0 = z, x2 = n (iteration count, two complex values per quarter per iteration; must be >= 2), x4 = twiddle table, x7 = -8, v29 = pmmp, v30/v31 = tbl indices (all set by ff_fft_calc_neon / def_fft). Overwrites x0-x6 and v4-v7, v16, v17, v20-v27.
249 function fft_pass_neon
250         sub             x6,  x2,  #1            // n - 1, loop counter
251         lsl             x5,  x2,  #3            // 2 * n * sizeof FFTSample
252         lsl             x1,  x2,  #4            // 2 * n * sizeof FFTComplex
253         add             x5,  x4,  x5            // wim
254         add             x3,  x1,  x2,  lsl #5   // 16n + 32n bytes = 6 * n * sizeof FFTComplex (offset of o3)
255         add             x2,  x0,  x2,  lsl #5   // &z[o2]
256         add             x3,  x0,  x3            // &z[o3]
257         add             x1,  x0,  x1            // &z[o1]
258         ld1             {v20.4s},[x2]           // {z[o2],z[o2+1]}
259         ld1             {v22.4s},[x3]           // {z[o3],z[o3+1]}
260         ld1             {v4.2s},  [x4], #8      // {wre[0],wre[1]}
261         trn2            v25.2d, v20.2d, v22.2d  // {z[o2+1],z[o3+1]}
262         sub             x5,  x5,  #4            // wim--
263         trn1            v24.2d, v20.2d, v22.2d  // {z[o2],z[o3]}; used without a twiddle multiply in this first step
264         ld1             {v5.s}[0],  [x5], x7    // v5.s[0] = wim[-1]; x5 then steps back by 8 bytes (x7 = -8)
265         rev64           v7.4s,  v25.4s          // swap re/im within each complex value
266         fmul            v25.4s, v25.4s, v4.s[1]
267         ld1             {v16.4s}, [x0]          // {z[0],z[1]}
268         fmul            v7.4s,  v7.4s,  v29.4s  // apply the pmmp sign pattern {+,-,-,+}
269         ld1             {v17.4s}, [x1]          // {z[o1],z[o1+1]}
270         prfm            pldl1keep, [x2, #16]    // prefetch hints for the data the loop reads next
271         prfm            pldl1keep, [x3, #16]
272         fmla            v25.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
273         prfm            pldl1keep, [x0, #16]
274         prfm            pldl1keep, [x1, #16]
275
276         zip1            v20.4s, v24.4s, v25.4s
277         zip2            v21.4s, v24.4s, v25.4s
278         fneg            v22.4s, v20.4s
279         fadd            v4.4s,  v21.4s, v20.4s
280         fsub            v6.4s,  v20.4s, v21.4s  // just the second half
281         fadd            v5.4s,  v21.4s, v22.4s  // just the first half
282
283         tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
284         tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
285
286         fadd            v20.4s, v16.4s, v4.4s
287         fsub            v22.4s, v16.4s, v4.4s
288         fadd            v21.4s, v17.4s, v5.4s
289         st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
290         fsub            v23.4s, v17.4s, v5.4s
291
292         st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
293         st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
294         st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
295 1:      // main loop, runs x6 = n - 1 times; here both element pairs get a twiddle multiply
296         ld1             {v20.4s},[x2]    // {z[o2],z[o2+1]}
297         ld1             {v22.4s},[x3]    // {z[o3],z[o3+1]}
298         ld1             {v4.2s}, [x4], #8       // {wre[0],wre[1]}
299         transpose       v26.2d, v27.2d, v20.2d, v22.2d   // v26 = {z[o2],z[o3]}, v27 = {z[o2+1],z[o3+1]}
300         ld1             {v5.2s}, [x5], x7       // {wim[-1],wim[0]}
301         rev64           v6.4s,  v26.4s
302         fmul            v26.4s, v26.4s, v4.s[0]
303         rev64           v7.4s,  v27.4s
304         fmul            v27.4s, v27.4s, v4.s[1]
305         fmul            v6.4s,  v6.4s,  v29.4s
306         fmul            v7.4s,  v7.4s,  v29.4s
307         ld1             {v16.4s},[x0]           // {z[0],z[1]}
308         fmla            v26.4s, v6.4s,  v5.s[1] // {t1,t2,t5,t6}
309         fmla            v27.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
310         ld1             {v17.4s},[x1]           // {z[o1],z[o1+1]}
311
312         subs            x6,  x6,  #1            // n--
313
314         zip1            v20.4s, v26.4s, v27.4s
315         zip2            v21.4s, v26.4s, v27.4s
316         fneg            v22.4s, v20.4s
317         fadd            v4.4s,  v21.4s, v20.4s
318         fsub            v6.4s,  v20.4s, v21.4s  // just the second half
319         fadd            v5.4s,  v21.4s, v22.4s  // just the first half
320
321         tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
322         tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
323
324         fadd            v20.4s, v16.4s, v4.4s
325         fsub            v22.4s, v16.4s, v4.4s
326         fadd            v21.4s, v17.4s, v5.4s
327         st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
328         fsub            v23.4s, v17.4s, v5.4s
329
330         st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
331         st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
332         st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
333         b.ne            1b                      // flags still come from the subs above; the NEON ops and stores do not touch them
334
335         ret
336 endfunc
337 // def_fft n, n2, n4: emits fft<n>_neon, which runs fft<n2>_neon on the first half of z, fft<n4>_neon on the third and fourth quarters, then tail-branches to fft_pass_neon to combine them. In: x0 = z.
338 .macro  def_fft n, n2, n4
339 function fft\n\()_neon, align=6
340         sub             sp,  sp,  #16
341         stp             x28, x30, [sp]          // save callee-saved x28 and the return address across the bl calls
342         add             x28, x0,  #\n4*2*8      // x28 = z + 2*n4 elements (8 bytes each) = start of the second half
343         bl              fft\n2\()_neon          // first half, x0 = z
344         mov             x0,  x28                // callees may advance x0, so it is rebuilt from x28 each time
345         bl              fft\n4\()_neon          // third quarter
346         add             x0,  x28, #\n4*1*8
347         bl              fft\n4\()_neon          // fourth quarter
348         sub             x0,  x28, #\n4*2*8      // x0 = z again
349         ldp             x28, x30, [sp], #16     // restore before the tail branch so fft_pass_neon returns to our caller
350         movrel          x4,  X(ff_cos_\n)       // twiddle table for this size, passed in x4
351         mov             x2,  #\n4>>1            // iteration count for fft_pass_neon, passed in x2
352         b               fft_pass_neon
353 endfunc
354 .endm
355 // Instantiate fft32_neon .. fft65536_neon; each size is built from the half-size (n2) and quarter-size (n4) functions, so the order below defines them smallest first.
356         def_fft    32,    16,     8
357         def_fft    64,    32,    16
358         def_fft   128,    64,    32
359         def_fft   256,   128,    64
360         def_fft   512,   256,   128
361         def_fft  1024,   512,   256
362         def_fft  2048,  1024,   512
363         def_fft  4096,  2048,  1024
364         def_fft  8192,  4096,  2048
365         def_fft 16384,  8192,  4096
366         def_fft 32768, 16384,  8192
367         def_fft 65536, 32768, 16384
368 // ff_fft_calc_neon (exported): In: x0 = context whose first 32-bit word selects the size (read as nbits by ff_fft_permute_neon), x1 = z. Loads the shared constants into v28-v31, x7, x14 and tail-branches to fft_tab_neon[nbits - 2] with x0 = z. No range check; the table has 15 entries (nbits 2..16).
369 function ff_fft_calc_neon, export=1
370         prfm            pldl1keep, [x1]         // prefetch hint for the start of z
371         movrel          x10, trans4_float
372         ldr             w2,  [x0]               // first word of the context
373         movrel          x11, trans8_float
374         sub             w2,  w2,  #2            // table index; fft_tab_neon starts at fft4_neon
375         movrel          x3,  fft_tab_neon
376         ld1             {v30.16b}, [x10]        // v30 = trans4_float tbl indices
377         mov             x7,  #-8                // post-index step used by fft_pass_neon for its backward table reads
378         movrel          x12, pmmp
379         ldr             x3,  [x3, x2, lsl #3]   // 8-byte function pointer for the selected size
380         movrel          x13, mppm
381         movrel          x14, X(ff_cos_16)       // table read by fft16_neon
382         ld1             {v31.16b}, [x11]        // v31 = trans8_float tbl indices
383         mov             x0,  x1                 // the fft functions take z in x0
384         ld1             {v29.4s},  [x12]         // pmmp
385         ld1             {v28.4s},  [x13]         // mppm
386         br              x3                       // tail call; the selected function returns to our caller
387 endfunc
388 // ff_fft_permute_neon (exported): In: x0 = context (nbits at +0, revtab at +8, tmp_buf at +16), x1 = z. Scatters the 1 << nbits 8-byte elements of z into tmp_buf at the indices given by revtab, then copies tmp_buf back over z.
389 function ff_fft_permute_neon, export=1
390         mov             x6,  #1
391         ldr             w2,  [x0]       // nbits
392         ldr             x3,  [x0, #16]  // tmp_buf
393         ldr             x0,  [x0, #8]   // revtab
394         lsl             x6,  x6, x2     // element count = 1 << nbits
395         mov             x2,  x6         // second copy of the count for the copy-back loop
396 1:      // scatter loop, two elements per iteration
397         ld1             {v0.2s,v1.2s}, [x1], #16   // two consecutive elements of z
398         ldr             w4,  [x0], #4              // 32 bits of revtab = two 16-bit indices
399         uxth            w5,  w4                    // low 16 bits: destination index for v0
400         lsr             w4,  w4,  #16              // high 16 bits: destination index for v1
401         add             x5,  x3,  x5,  lsl #3      // &tmp_buf[index], 8 bytes per element
402         add             x4,  x3,  x4,  lsl #3
403         st1             {v0.2s}, [x5]
404         st1             {v1.2s}, [x4]
405         subs            x6,  x6, #2
406         b.gt            1b
407
408         sub             x1,  x1,  x2,  lsl #3      // rewind x1 to the start of z (count * 8 bytes)
409 1:      // copy-back loop, four elements (32 bytes) per iteration
410         ld1             {v0.4s,v1.4s}, [x3], #32
411         st1             {v0.4s,v1.4s}, [x1], #32
412         subs            x2,  x2,  #4
413         b.gt            1b
414
415         ret
416 endfunc
417 // fft_tab_neon: 8-byte function pointers indexed by nbits - 2 (see ff_fft_calc_neon); entry k transforms 1 << (k + 2) points.
418 const   fft_tab_neon, relocate=1
419         .quad fft4_neon         // index 0
420         .quad fft8_neon
421         .quad fft16_neon
422         .quad fft32_neon
423         .quad fft64_neon
424         .quad fft128_neon
425         .quad fft256_neon
426         .quad fft512_neon
427         .quad fft1024_neon
428         .quad fft2048_neon
429         .quad fft4096_neon
430         .quad fft8192_neon
431         .quad fft16384_neon
432         .quad fft32768_neon
433         .quad fft65536_neon     // index 14
434 endconst
435 // pmmp: sign pattern {+,-,-,+} (plus, minus, minus, plus), loaded into v29 by ff_fft_calc_neon and multiplied into the re/im-swapped operands in fft16_neon and fft_pass_neon.
436 const   pmmp, align=4
437         .float          +1.0, -1.0, -1.0, +1.0
438 endconst
439 // mppm: {-w,+w,+w,-w} with w = M_SQRT1_2, loaded into v28 by ff_fft_calc_neon; fft8_neon and fft16_neon use its low two lanes and lane 1 (+w).
440 const   mppm, align=4
441         .float          -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
442 endconst