2 * Copyright (c) 2013 RISC OS Open Ltd
3 * Author: Ben Avison <bavison@riscosopen.org>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/arm/asm.S"
24 @ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
25 @ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
26 @ all single-precision VFP registers may be corrupted on exit. The a2
27 @ register may not be clobbered in these functions, as it holds the
28 @ stored original FPSCR.
@ void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z)
@ Entry point: loads s->nbits (first word of the context at a1) and
@ jumps through a table of size-specific FFT routines indexed by it.
@ NOTE(review): interior lines appear elided in this excerpt (the
@ embedded original line numbers jump), and the jump table below is
@ visibly truncated to a single entry — confirm against upstream.
30 function ff_fft_calc_vfp, export=1
31 ldr ip, [a1, #0] @ nbits
@ ARM state: table dispatch directly off PC-relative load
33 A ldr pc, [pc, ip, lsl #2]
@ Thumb state: PC-relative table reads need an explicit base address
37 T movrel a2, (fft_tab_vfp - 8)
38 T ldr pc, [a2, ip, lsl #2]
43 .word X(ff_fft16_vfp) @ this one alone is exported
@ NOTE(review): this reads as the body of the 4-point FFT
@ (.Lfft4_internal_vfp upstream); its function label and the final
@ stores are not visible in this excerpt.
@ a1 points at interleaved complex floats: element i at offset i*2*4.
@ Load z[0..3] into s0-s3 / s8-s11 (two registers per complex value).
60 vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
61 vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
62 vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
63 vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
@ First radix-2 stage: butterflies between z[0]/z[1] and z[2]/z[3].
65 vadd.f s12, s0, s8 @ i0
66 vadd.f s13, s1, s9 @ i1
67 vadd.f s14, s2, s10 @ i2
68 vadd.f s15, s3, s11 @ i3
69 vsub.f s8, s0, s8 @ i4
70 vsub.f s9, s1, s9 @ i5
71 vsub.f s10, s2, s10 @ i6
72 vsub.f s11, s3, s11 @ i7
@ Second stage: combine intermediates into the four complex outputs.
75 vadd.f s0, s12, s14 @ z[0].re
76 vsub.f s4, s12, s14 @ z[2].re
77 vadd.f s1, s13, s15 @ z[0].im
78 vsub.f s5, s13, s15 @ z[2].im
79 vadd.f s7, s9, s10 @ z[3].im
80 vsub.f s3, s9, s10 @ z[1].im
81 vadd.f s2, s8, s11 @ z[1].re
82 vsub.f s6, s8, s11 @ z[3].re
@ First part of the 8-point FFT over z[0..7] at a1.
@ Written for VFP RunFast short-vector mode (length 4): each vadd.f /
@ vsub.f below operates on a bank of four single registers at once.
@ Spare slots of the complex array are reused as scratch memory to
@ shuffle values between VFP register banks (see "transfer" comments).
@ NOTE(review): many interior instructions and the closing .endm are
@ elided from this excerpt (embedded line numbers jump 107->116 etc.);
@ do not treat the visible sequence as complete.
95 .macro macro_fft8_head
97 vldr d4, [a1, #0 * 2*4]
98 vldr d6, [a1, #1 * 2*4]
99 vldr d5, [a1, #2 * 2*4]
100 vldr d7, [a1, #3 * 2*4]
102 vldr d12, [a1, #4 * 2*4]
103 vadd.f s16, s8, s12 @ vector op
104 vldr d14, [a1, #5 * 2*4]
105 vldr d13, [a1, #6 * 2*4]
106 vldr d15, [a1, #7 * 2*4]
107 vsub.f s20, s8, s12 @ vector op
116 vsub.f s20, s24, s28 @ vector op
117 vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
118 vstr d1, [a1, #1 * 2*4]
120 vadd.f s16, s24, s28 @ vector op
121 vstr d2, [a1, #2 * 2*4]
122 vstr d3, [a1, #3 * 2*4]
123 vldr d12, [a1, #0 * 2*4]
125 vmul.f s20, s20, s0 @ vector x scalar op
126 vldr d13, [a1, #1 * 2*4]
127 vldr d14, [a1, #2 * 2*4]
128 vldr d15, [a1, #3 * 2*4]
138 vadd.f s8, s0, s24 @ vector op
139 vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
140 vstr d1, [a1, #1 * 2*4]
141 vldr d6, [a1, #0 * 2*4]
142 vldr d7, [a1, #1 * 2*4]
147 vsub.f s12, s24, s12 @ vector op
152 vadd.f s16, s0, s28 @ vector op
@ Write back results for z[0..7]; note the interleaved store order.
153 vstr d6, [a1, #4 * 2*4]
154 vstr d7, [a1, #6 * 2*4]
155 vstr d4, [a1, #0 * 2*4]
156 vstr d5, [a1, #2 * 2*4]
157 vstr d2, [a1, #5 * 2*4]
158 vstr d3, [a1, #7 * 2*4]
@ Final part of the 8-point FFT: stores the remaining results that
@ macro_fft8_head left in registers (here s16-s19 via d8/d9).
@ NOTE(review): the closing .endm is not visible in this excerpt.
161 .macro macro_fft8_tail
162 vstr d8, [a1, #1 * 2*4]
163 vstr d9, [a1, #3 * 2*4]
@ 8-point FFT, internal calling convention (see file-top comment:
@ RunFast vector mode assumed, a2 preserved as the saved FPSCR).
166 function .Lfft8_internal_vfp
@ NOTE(review): the two lines below belong to the exported ff_fft8_vfp
@ wrapper whose `function` line and FPSCR save/restore sequence are
@ elided from this excerpt — it loads the RunFast/vector-mode FPSCR
@ value and then calls the internal routine above.
173 ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
178 bl .Lfft8_internal_vfp
@ Single-precision twiddle-factor constants for the 16-point FFT.
@ (Comment fixes: cos(pi/4) is sqrt(2)/2, and cos3pi8 holds cos(3*pi/8).)
185 cos1pi4: @ cos(1*pi/4) = sqrt(2)/2
186 .float 0.707106769084930419921875
187 cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
188 .float 0.92387950420379638671875
189 cos3pi8: @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
190 .float 0.3826834261417388916015625
@ 16-point FFT over the interleaved complex array at a1
@ (element i at byte offset i*2*4). Internal calling convention:
@ VFP already in RunFast short-vector mode (length 4, stride 1),
@ a2 holds the caller's saved FPSCR and is never touched.
@ Structure (per the TRANSFORM comments): two 4-point sub-FFTs on the
@ upper half, then four combining transforms with the cos1pi4/cos1pi8/
@ cos3pi8 twiddles, using spare array slots as scratch to move data
@ between VFP register banks.
@ NOTE(review): embedded original line numbers jump (206->215 etc.),
@ so interior instructions are elided from this excerpt.
192 function .Lfft16_internal_vfp
@ Load z[8..11] and start the first radix-2 butterflies.
195 vldr d10, [a1, #8 * 2*4]
196 vldr d12, [a1, #9 * 2*4]
197 vldr d11, [a1, #10 * 2*4]
198 vldr d13, [a1, #11 * 2*4]
200 vadd.f s16, s20, s24 @ vector op
202 vldr d4, [a1, #12 * 2*4]
203 vldr d6, [a1, #13 * 2*4]
204 vldr d5, [a1, #14 * 2*4]
205 vsub.f s20, s20, s24 @ vector op
206 vldr d7, [a1, #15 * 2*4]
215 vadd.f s16, s8, s12 @ vector op
216 vstr d0, [a1, #8 * 2*4]
217 vstr d2, [a1, #10 * 2*4]
218 vstr d1, [a1, #9 * 2*4]
220 vstr d3, [a1, #11 * 2*4]
221 @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
222 vldr d12, [a1, #10 * 2*4]
231 vstr d0, [a1, #12 * 2*4]
233 @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
234 vldr d6, [a1, #9 * 2*4]
235 vstr d1, [a1, #13 * 2*4]
236 vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
237 vstr d2, [a1, #15 * 2*4]
238 vldr d7, [a1, #13 * 2*4]
243 vmul.f s20, s12, s3 @ vector op
244 @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
245 vldr d4, [a1, #11 * 2*4]
246 vldr d5, [a1, #15 * 2*4]
248 vmul.f s24, s4, s2 @ vector * scalar op
249 vmul.f s28, s12, s1 @ vector * scalar op
250 vmul.f s12, s8, s1 @ vector * scalar op
255 vmul.f s8, s8, s3 @ vector * scalar op
256 vldr d8, [a1, #1 * 2*4]
257 vldr d9, [a1, #5 * 2*4]
258 vldr d10, [a1, #3 * 2*4]
259 vldr d11, [a1, #7 * 2*4]
260 vldr d14, [a1, #2 * 2*4]
269 vadd.f s12, s0, s16 @ vector op
270 vstr d0, [a1, #1 * 2*4]
271 vstr d1, [a1, #5 * 2*4]
272 vldr d4, [a1, #1 * 2*4]
273 vldr d5, [a1, #5 * 2*4]
278 vsub.f s8, s16, s8 @ vector op
279 vstr d6, [a1, #1 * 2*4]
280 vstr d7, [a1, #5 * 2*4]
281 vldr d15, [a1, #6 * 2*4]
286 vadd.f s20, s0, s20 @ vector op
287 vstr d4, [a1, #9 * 2*4]
288 @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
289 vldr d6, [a1, #8 * 2*4]
290 vstr d5, [a1, #13 * 2*4]
291 vldr d7, [a1, #12 * 2*4]
292 vstr d2, [a1, #11 * 2*4]
293 vldr d8, [a1, #0 * 2*4]
294 vstr d3, [a1, #15 * 2*4]
295 vldr d9, [a1, #4 * 2*4]
304 vadd.f s8, s0, s28 @ vector op
305 vstr d0, [a1, #3 * 2*4]
306 vstr d1, [a1, #7 * 2*4]
307 vldr d6, [a1, #3 * 2*4]
308 vldr d7, [a1, #7 * 2*4]
313 vsub.f s12, s28, s12 @ vector op
314 vadd.f s16, s4, s16 @ vector op
@ Final write-back of all sixteen complex results.
315 vstr d10, [a1, #3 * 2*4]
316 vstr d11, [a1, #7 * 2*4]
317 vstr d4, [a1, #2 * 2*4]
318 vstr d5, [a1, #6 * 2*4]
319 vstr d0, [a1, #8 * 2*4]
320 vstr d1, [a1, #12 * 2*4]
321 vstr d6, [a1, #10 * 2*4]
322 vstr d7, [a1, #14 * 2*4]
323 vstr d8, [a1, #0 * 2*4]
324 vstr d9, [a1, #4 * 2*4]
@ Exported AAPCS wrapper for the 16-point FFT: loads the FPSCR value
@ selecting RunFast mode with short-vector length 4, stride 1, then
@ calls the internal routine.
@ NOTE(review): the FPSCR save (into a2) / write / restore and return
@ instructions are elided from this excerpt — confirm upstream.
329 function ff_fft16_vfp, export=1
330 ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
335 bl .Lfft16_internal_vfp
@ One radix-4 combination pass over interleaved complex data.
@ \z0..\z3 are pointers to the four sub-blocks being combined; v5
@ walks the twiddle-factor table (advanced by vldmia below). The loop
@ body is software-pipelined for VFP short-vector issue, hence the
@ explicit "stall" annotations.
@ NOTE(review): o1/o2/o3 are referenced without macro-argument escapes,
@ so they are presumably assembler symbols (.set) derived from \n on
@ lines elided from this excerpt — confirm upstream. The loop branch
@ back to label 1 and the closing .endm are also not visible here.
341 .macro pass n, z0, z1, z2, z3
343 @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
344 @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
345 @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
346 @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
347 vldr d8, [\z2, #8*(o2+1)] @ s16,s17
349 vldr d9, [\z3, #8*(o3+1)] @ s18,s19
350 vldmia v5!, {s0,s1} @ s0 is unused
351 vldr s7, [\z2, #8*o2] @ t1
352 vmul.f s20, s16, s2 @ vector * scalar
353 vldr s0, [\z3, #8*o3] @ t5
354 vldr s6, [\z2, #8*o2+4] @ t2
355 vldr s3, [\z3, #8*o3+4] @ t6
356 vmul.f s16, s16, s1 @ vector * scalar
@ Loop top: advance all four block pointers by one complex pair.
358 1: add \z0, \z0, #8*2
368 @ up to 2 stalls (VFP vector issuing / waiting for s0)
369 @ depending upon whether this is the first iteration and
370 @ how many add instructions are inserted above
371 vadd.f s4, s0, s7 @ t5
372 vadd.f s5, s6, s3 @ t6
373 vsub.f s6, s6, s3 @ t4
374 vsub.f s7, s0, s7 @ t3
375 vldr d6, [\z0, #8*0-8*2] @ s12,s13
376 vadd.f s0, s16, s21 @ t1
377 vldr d7, [\z1, #8*o1-8*2] @ s14,s15
378 vsub.f s1, s18, s23 @ t5
379 vadd.f s8, s4, s12 @ vector + vector
380 @ stall (VFP vector issuing)
381 @ stall (VFP vector issuing)
382 @ stall (VFP vector issuing)
387 vsub.f s2, s17, s20 @ t2
388 vadd.f s3, s19, s22 @ t6
389 vstr d4, [\z0, #8*0-8*2] @ s8,s9
390 vstr d5, [\z1, #8*o1-8*2] @ s10,s11
391 @ stall (waiting for s5)
392 vstr d2, [\z2, #8*o2-8*2] @ s4,s5
393 vadd.f s4, s1, s0 @ t5
394 vstr d3, [\z3, #8*o3-8*2] @ s6,s7
395 vsub.f s7, s1, s0 @ t3
396 vadd.f s5, s2, s3 @ t6
397 vsub.f s6, s2, s3 @ t4
398 vldr d6, [\z0, #8*1-8*2] @ s12,s13
399 vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15
400 vldr d4, [\z2, #8*o2] @ s8,s9
402 vldr d5, [\z3, #8*o3] @ s10,s11
403 vadd.f s20, s4, s12 @ vector + vector
405 vldr d8, [\z2, #8*(o2+1)] @ s16,s17
406 @ stall (VFP vector issuing)
411 vmul.f s12, s8, s3 @ vector * scalar
412 vstr d10, [\z0, #8*1-8*2] @ s20,s21
413 vldr d9, [\z3, #8*(o3+1)] @ s18,s19
414 vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23
415 vmul.f s8, s8, s0 @ vector * scalar
416 vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5
417 @ stall (waiting for s7)
418 vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7
419 vmul.f s20, s16, s2 @ vector * scalar
420 @ stall (VFP vector issuing)
421 @ stall (VFP vector issuing)
422 @ stall (VFP vector issuing)
423 vadd.f s7, s8, s13 @ t1
424 vsub.f s6, s9, s12 @ t2
425 vsub.f s0, s10, s15 @ t5
426 vadd.f s3, s11, s14 @ t6
427 vmul.f s16, s16, s1 @ vector * scalar
430 @ What remains is identical to the first two indentations of
431 @ the above, but without the increment of z
432 vadd.f s4, s0, s7 @ t5
433 vadd.f s5, s6, s3 @ t6
434 vsub.f s6, s6, s3 @ t4
435 vsub.f s7, s0, s7 @ t3
436 vldr d6, [\z0, #8*0] @ s12,s13
437 vadd.f s0, s16, s21 @ t1
438 vldr d7, [\z1, #8*o1] @ s14,s15
439 vsub.f s1, s18, s23 @ t5
440 vadd.f s8, s4, s12 @ vector + vector
445 vsub.f s2, s17, s20 @ t2
446 vadd.f s3, s19, s22 @ t6
447 vstr d4, [\z0, #8*0] @ s8,s9
448 vstr d5, [\z1, #8*o1] @ s10,s11
449 vstr d2, [\z2, #8*o2] @ s4,s5
450 vadd.f s4, s1, s0 @ t5
451 vstr d3, [\z3, #8*o3] @ s6,s7
452 vsub.f s7, s1, s0 @ t3
453 vadd.f s5, s2, s3 @ t6
454 vsub.f s6, s2, s3 @ t4
455 vldr d6, [\z0, #8*1] @ s12,s13
456 vldr d7, [\z1, #8*(o1+1)] @ s14,s15
457 vadd.f s20, s4, s12 @ vector + vector
462 vstr d10, [\z0, #8*1] @ s20,s21
463 vstr d11, [\z1, #8*(o1+1)] @ s22,s23
464 vstr d2, [\z2, #8*(o2+1)] @ s4,s5
465 vstr d3, [\z3, #8*(o3+1)] @ s6,s7
@ Generator for composite FFT sizes: builds .Lfft\n\()_internal_vfp as
@ one fft\n2 on the first half plus two fft\n4 on the quarters,
@ followed by radix-4 combination passes, and an exported-style
@ wrapper fft\n\()_vfp that enters RunFast vector mode first.
@ NOTE(review): several interior lines (pointer setup, FPSCR handling,
@ epilogues, .endm) are elided from this excerpt — confirm upstream.
468 .macro def_fft n, n2, n4
469 function .Lfft\n\()_internal_vfp
473 push {v1-v2,v5-v6,lr}
@ Recurse: half-size FFT on z[0..n/2), quarter-size on each remaining
@ quarter (a1 advanced by 8 bytes per complex element).
478 bl .Lfft\n2\()_internal_vfp
479 add a1, v1, #8*(\n/4)*2
480 bl .Lfft\n4\()_internal_vfp
481 movrelx v5, X(ff_cos_\n), a1 @ v5 = twiddle table for this size
482 add a1, v1, #8*(\n/4)*3
483 bl .Lfft\n4\()_internal_vfp
@ Combination passes; block strides shrink as the pointers converge.
488 add v2, v1, #8*2*(\n/4/2)
489 add v3, v1, #8*4*(\n/4/2)
490 add v4, v1, #8*6*(\n/4/2)
491 pass (\n/4/2), v1, v2, v3, v4
497 add v2, v1, #8*4*(\n/4/2)
498 pass (\n/4/2), v1, v1, v2, v2
504 pass (\n/4/2), v1, v1, v1, v1
@ Wrapper obeying the normal AAPCS: switch to RunFast vector mode,
@ then run the internal routine.
509 function fft\n\()_vfp
510 ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */
515 bl .Lfft\n\()_internal_vfp
@ Instantiate the composite sizes: def_fft n, n2, n4 builds fft{n}
@ from one fft{n2} and two fft{n4}.
@ NOTE(review): upstream also instantiates the smaller sizes
@ (32..256) before these; they are not visible in this excerpt.
528 def_fft 512, 256, 128
529 def_fft 1024, 512, 256
530 def_fft 2048, 1024, 512
531 def_fft 4096, 2048, 1024
532 def_fft 8192, 4096, 2048
533 def_fft 16384, 8192, 4096
534 def_fft 32768, 16384, 8192
535 def_fft 65536, 32768, 16384