2 * Copyright (c) 2013 RISC OS Open Ltd
3 * Author: Ben Avison <bavison@riscosopen.org>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/arm/asm.S"
24 @ TODO: * FFTs wider than 16
@ 4-point FFT butterfly over the four complex floats at [a1] (fragment —
@ the enclosing function/macro label is outside this chunk).
@ a1 = pointer to z[0..3]; each element is 2 floats (re,im) = 8 bytes, so a
@ single double-word vldr fetches one complex value into an s-register pair.
28 vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
29 vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
30 vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
31 vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
@ Stage 1: radix-2 butterflies — sums i0..i3, differences i4..i7.
33 vadd.f s12, s0, s8 @ i0
34 vadd.f s13, s1, s9 @ i1
35 vadd.f s14, s2, s10 @ i2
36 vadd.f s15, s3, s11 @ i3
37 vsub.f s8, s0, s8 @ i4
38 vsub.f s9, s1, s9 @ i5
39 vsub.f s10, s2, s10 @ i6
40 vsub.f s11, s3, s11 @ i7
@ Stage 2: combine the partial results into the final 4-point outputs.
@ NOTE(review): the cross-pairing of i5/i6 and i4/i7 implements the
@ multiply-by-(-j) of the odd difference term — confirm the sign convention
@ (forward vs inverse transform) against the caller.
43 vadd.f s0, s12, s14 @ z[0].re
44 vsub.f s4, s12, s14 @ z[2].re
45 vadd.f s1, s13, s15 @ z[0].im
46 vsub.f s5, s13, s15 @ z[2].im
47 vadd.f s7, s9, s10 @ z[3].im
48 vsub.f s3, s9, s10 @ z[1].im
49 vadd.f s2, s8, s11 @ z[1].re
50 vsub.f s6, s8, s11 @ z[3].re
@ macro_fft8_head — first stage of an 8-point FFT over the complex floats
@ at [a1]. FRAGMENT: several interior instruction runs and the closing
@ .endm are outside this chunk; only the visible lines are annotated.
@ The "@ vector op" annotations rely on VFP short-vector mode being active
@ (FPSCR LEN=4, STRIDE=1 — the 0x03030000 "RunFast" value loaded elsewhere
@ in this file), so one mnemonic operates on a bank of 4 s-registers.
63 .macro macro_fft8_head
@ Load z[0..3] into s8-s15 and (interleaved with the first butterflies)
@ z[4..7] into s24-s31; loads are scheduled between arithmetic ops to hide
@ VFP load latency.
65 vldr d4, [a1, #0 * 2*4]
66 vldr d6, [a1, #1 * 2*4]
67 vldr d5, [a1, #2 * 2*4]
68 vldr d7, [a1, #3 * 2*4]
70 vldr d12, [a1, #4 * 2*4]
71 vadd.f s16, s8, s12 @ vector op
72 vldr d14, [a1, #5 * 2*4]
73 vldr d13, [a1, #6 * 2*4]
74 vldr d15, [a1, #7 * 2*4]
75 vsub.f s20, s8, s12 @ vector op
@ (interior lines elided from this view)
84 vsub.f s20, s24, s28 @ vector op
@ VFP has no register-to-register move of a whole vector bank, so s0-s7 are
@ bounced through the z[0..3] memory slots to reach s24-s31.
85 vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
86 vstr d1, [a1, #1 * 2*4]
88 vadd.f s16, s24, s28 @ vector op
89 vstr d2, [a1, #2 * 2*4]
90 vstr d3, [a1, #3 * 2*4]
91 vldr d12, [a1, #0 * 2*4]
@ Scalar operand s0 here presumably holds a twiddle constant loaded in an
@ elided line — TODO confirm against the full source.
93 vmul.f s20, s20, s0 @ vector x scalar op
94 vldr d13, [a1, #1 * 2*4]
95 vldr d14, [a1, #2 * 2*4]
96 vldr d15, [a1, #3 * 2*4]
@ (interior lines elided from this view)
106 vadd.f s8, s0, s24 @ vector op
107 vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
108 vstr d1, [a1, #1 * 2*4]
109 vldr d6, [a1, #0 * 2*4]
110 vldr d7, [a1, #1 * 2*4]
115 vsub.f s12, s24, s12 @ vector op
120 vadd.f s16, s0, s28 @ vector op
@ Store the head-stage results back into z[0..7]; note the interleaved
@ slot order (0,2,4,6 from d4-d7, 1,3,5,7 continued in macro_fft8_tail).
121 vstr d6, [a1, #4 * 2*4]
122 vstr d7, [a1, #6 * 2*4]
123 vstr d4, [a1, #0 * 2*4]
124 vstr d5, [a1, #2 * 2*4]
125 vstr d2, [a1, #5 * 2*4]
126 vstr d3, [a1, #7 * 2*4]
@ macro_fft8_tail — final stores of the 8-point FFT: writes the remaining
@ results (held in s16-s19 via d8/d9) to the odd z slots left unfilled by
@ macro_fft8_head. FRAGMENT: the closing .endm is outside this chunk.
129 .macro macro_fft8_tail
130 vstr d8, [a1, #1 * 2*4]
131 vstr d9, [a1, #3 * 2*4]
@ FPSCR configuration value: bits 24/25 (FZ/DN) give flush-to-zero +
@ default-NaN ("RunFast"), LEN field = 3 selects short-vector length 4,
@ STRIDE = 0 selects stride 1. FRAGMENT: the enclosing function label and
@ the fmxr that writes this value to FPSCR are outside this chunk.
135 ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
@ Twiddle-factor constants for the 16-point transforms below, stored as
@ single-precision literals (values are the float-rounded exact constants).
@ cos1pi4 and cos1pi8 are laid out consecutively so a single vldr of a
@ d-register can fetch both (see the "s2 = cos1pi4, s3 = cos1pi8" load).
149 cos1pi4: @ cos(1*pi/4) = sqrt(2)/2
150 .float 0.707106769084930419921875
151 cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
152 .float 0.92387950420379638671875
153 cos3pi8: @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
154 .float 0.3826834261417388916015625
@ void ff_fft16_vfp(FFTComplex *z)
@ In-place 16-point complex-to-complex FFT. a1 (= r0, APCS name) points to
@ z[0..15], each element 2 floats (re,im). Uses VFP short-vector mode
@ (FPSCR LEN=4) so single mnemonics marked "vector op" act on banks of 4
@ s-registers; twiddles come from the cos1pi4/cos1pi8/cos3pi8 pool above.
@ FRAGMENT: many interior instruction runs, the FPSCR write/restore, and
@ the function epilogue/endfunc are outside this chunk — only the visible
@ lines are annotated, and elision points are marked.
156 function ff_fft16_vfp, export=1
157 ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
@ (elided: FPSCR setup and the first loads; resumes mid-schedule)
@ Load z[8..11] into s20-s27 while butterflying the upper half.
164 vldr d10, [a1, #8 * 2*4]
165 vldr d12, [a1, #9 * 2*4]
166 vldr d11, [a1, #10 * 2*4]
167 vldr d13, [a1, #11 * 2*4]
169 vadd.f s16, s20, s24 @ vector op
171 vldr d4, [a1, #12 * 2*4]
172 vldr d6, [a1, #13 * 2*4]
173 vldr d5, [a1, #14 * 2*4]
174 vsub.f s20, s20, s24 @ vector op
175 vldr d7, [a1, #15 * 2*4]
@ (interior lines elided from this view)
184 vadd.f s16, s8, s12 @ vector op
185 vstr d0, [a1, #8 * 2*4]
186 vstr d2, [a1, #10 * 2*4]
187 vstr d1, [a1, #9 * 2*4]
189 vstr d3, [a1, #11 * 2*4]
@ TRANSFORM steps: each combines one quartet z[k],z[k+4],z[k+8],z[k+12]
@ with the named cosine twiddles (macro expansions; bodies partly elided).
190 @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
191 vldr d12, [a1, #10 * 2*4]
200 vstr d0, [a1, #12 * 2*4]
202 @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
203 vldr d6, [a1, #9 * 2*4]
204 vstr d1, [a1, #13 * 2*4]
@ One d-load picks up both adjacent pool constants (see layout above).
205 vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
206 vstr d2, [a1, #15 * 2*4]
207 vldr d7, [a1, #13 * 2*4]
212 vmul.f s20, s12, s3 @ vector op
213 @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
214 vldr d4, [a1, #11 * 2*4]
215 vldr d5, [a1, #15 * 2*4]
@ s1/s2/s3 below are scalar twiddles applied across 4-register vectors.
217 vmul.f s24, s4, s2 @ vector * scalar op
218 vmul.f s28, s12, s1 @ vector * scalar op
219 vmul.f s12, s8, s1 @ vector * scalar op
224 vmul.f s8, s8, s3 @ vector * scalar op
@ Load the lower-half inputs z[1],z[5],z[3],z[7],z[2] for the final passes.
225 vldr d8, [a1, #1 * 2*4]
226 vldr d9, [a1, #5 * 2*4]
227 vldr d10, [a1, #3 * 2*4]
228 vldr d11, [a1, #7 * 2*4]
229 vldr d14, [a1, #2 * 2*4]
@ (interior lines elided from this view)
238 vadd.f s12, s0, s16 @ vector op
@ Memory bounce through the z[1]/z[5] slots to move values between
@ register banks (same trick as in macro_fft8_head).
239 vstr d0, [a1, #1 * 2*4]
240 vstr d1, [a1, #5 * 2*4]
241 vldr d4, [a1, #1 * 2*4]
242 vldr d5, [a1, #5 * 2*4]
247 vsub.f s8, s16, s8 @ vector op
248 vstr d6, [a1, #1 * 2*4]
249 vstr d7, [a1, #5 * 2*4]
250 vldr d15, [a1, #6 * 2*4]
255 vadd.f s20, s0, s20 @ vector op
256 vstr d4, [a1, #9 * 2*4]
257 @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
258 vldr d6, [a1, #8 * 2*4]
259 vstr d5, [a1, #13 * 2*4]
260 vldr d7, [a1, #12 * 2*4]
261 vstr d2, [a1, #11 * 2*4]
262 vldr d8, [a1, #0 * 2*4]
263 vstr d3, [a1, #15 * 2*4]
264 vldr d9, [a1, #4 * 2*4]
@ (interior lines elided from this view)
273 vadd.f s8, s0, s28 @ vector op
274 vstr d0, [a1, #3 * 2*4]
275 vstr d1, [a1, #7 * 2*4]
276 vldr d6, [a1, #3 * 2*4]
277 vldr d7, [a1, #7 * 2*4]
282 vsub.f s12, s28, s12 @ vector op
283 vadd.f s16, s4, s16 @ vector op
@ Final result stores covering every z slot touched by the last passes.
284 vstr d10, [a1, #3 * 2*4]
285 vstr d11, [a1, #7 * 2*4]
286 vstr d4, [a1, #2 * 2*4]
287 vstr d5, [a1, #6 * 2*4]
288 vstr d0, [a1, #8 * 2*4]
289 vstr d1, [a1, #12 * 2*4]
290 vstr d6, [a1, #10 * 2*4]
291 vstr d7, [a1, #14 * 2*4]
292 vstr d8, [a1, #0 * 2*4]
293 vstr d9, [a1, #4 * 2*4]