/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/arm/asm.S"
@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
@ all single-precision VFP registers may be corrupted on exit. The a2
@ register may not be clobbered in these functions, as it holds the
@ stored original FPSCR.
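@ A caller-side sketch of that contract (illustrative only, label name
@ hypothetical): each exported wrapper saves FPSCR into a2, switches to
@ the vector configuration, calls the internal routine, then restores
@ the caller's FPSCR, along the lines of:
@         fmrx    a2, FPSCR
@         fmxr    FPSCR, a3               @ a3 = RunFast, length 4, stride 1
@         bl      .LfftN_internal_vfp
@         fmxr    FPSCR, a2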
function ff_fft_calc_vfp, export=1
        ldr     ip, [a1, #0]        @ nbits
        movrel  a2, (fft_tab_vfp - 8)
        ldr     pc, [a2, ip, lsl #2]
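@ The dispatch above is, roughly, the following C (names and context
@ layout illustrative, after libavcodec/fft.h):
@
@     void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z)
@     {
@         /* fft_tab_vfp[0] is the nbits == 2 (4-point) entry, hence
@          * the -8 byte bias against the word-indexed table */
@         fft_tab_vfp[s->nbits - 2](z);
@     }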
const   fft_tab_vfp, relocate=1
        .word   X(ff_fft16_vfp)     @ this one alone is exported
        vldr    d0, [a1, #0*2*4]    @ s0,s1   = z[0]
        vldr    d4, [a1, #1*2*4]    @ s8,s9   = z[1]
        vldr    d1, [a1, #2*2*4]    @ s2,s3   = z[2]
        vldr    d5, [a1, #3*2*4]    @ s10,s11 = z[3]
        vadd.f  s12, s0, s8         @ i0
        vadd.f  s13, s1, s9         @ i1
        vadd.f  s14, s2, s10        @ i2
        vadd.f  s15, s3, s11        @ i3
        vsub.f  s8, s0, s8          @ i4
        vsub.f  s9, s1, s9          @ i5
        vsub.f  s10, s2, s10        @ i6
        vsub.f  s11, s3, s11        @ i7
        vadd.f  s0, s12, s14        @ z[0].re
        vsub.f  s4, s12, s14        @ z[2].re
        vadd.f  s1, s13, s15        @ z[0].im
        vsub.f  s5, s13, s15        @ z[2].im
        vadd.f  s7, s9, s10         @ z[3].im
        vsub.f  s3, s9, s10         @ z[1].im
        vadd.f  s2, s8, s11         @ z[1].re
        vsub.f  s6, s8, s11         @ z[3].re
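@ Spelled out, the butterfly above is the 4-point kernel: with complex
@ t1 = z[0]+z[1], t2 = z[2]+z[3], t3 = z[0]-z[1], t4 = z[2]-z[3],
@     z[0] = t1 + t2        z[2] = t1 - t2
@     z[1] = t3 - i*t4      z[3] = t3 + i*t4
@ so that e.g. z[1].re = i4 + i7 and z[1].im = i5 - i6, as computed.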
.macro macro_fft8_head
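@ An 8-point transform is built as the 4-point kernel above on z[0..3]
@ plus butterflies folding in z[4..7]; it is split into head and tail
@ macros so the 16-point transform can interleave its own work between
@ them.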
        vldr    d4, [a1, #0 * 2*4]
        vldr    d6, [a1, #1 * 2*4]
        vldr    d5, [a1, #2 * 2*4]
        vldr    d7, [a1, #3 * 2*4]
        vldr    d12, [a1, #4 * 2*4]
        vadd.f  s16, s8, s12        @ vector op
        vldr    d14, [a1, #5 * 2*4]
        vldr    d13, [a1, #6 * 2*4]
        vldr    d15, [a1, #7 * 2*4]
        vsub.f  s20, s8, s12        @ vector op
        vsub.f  s20, s24, s28       @ vector op
        vstr    d0, [a1, #0 * 2*4]  @ transfer s0-s7 to s24-s31 via memory
        vstr    d1, [a1, #1 * 2*4]
        vadd.f  s16, s24, s28       @ vector op
        vstr    d2, [a1, #2 * 2*4]
        vstr    d3, [a1, #3 * 2*4]
        vldr    d12, [a1, #0 * 2*4]
        vmul.f  s20, s20, s0        @ vector * scalar op
        vldr    d13, [a1, #1 * 2*4]
        vldr    d14, [a1, #2 * 2*4]
        vldr    d15, [a1, #3 * 2*4]
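@ Note the copies between register banks go via memory: under the
@ current FPSCR a register-to-register vmov would itself execute as a
@ vector operation (and a bank-0 source acts as a single scalar),
@ whereas loads and stores are never vectorised.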
        vadd.f  s8, s0, s24         @ vector op
        vstr    d0, [a1, #0 * 2*4]  @ transfer s0-s3 to s12-s15 via memory
        vstr    d1, [a1, #1 * 2*4]
        vldr    d6, [a1, #0 * 2*4]
        vldr    d7, [a1, #1 * 2*4]
        vsub.f  s12, s24, s12       @ vector op
        vadd.f  s16, s0, s28        @ vector op
        vstr    d6, [a1, #4 * 2*4]
        vstr    d7, [a1, #6 * 2*4]
        vstr    d4, [a1, #0 * 2*4]
        vstr    d5, [a1, #2 * 2*4]
        vstr    d2, [a1, #5 * 2*4]
        vstr    d3, [a1, #7 * 2*4]
.macro macro_fft8_tail
        vstr    d8, [a1, #1 * 2*4]
        vstr    d9, [a1, #3 * 2*4]
function .Lfft8_internal_vfp
        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
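@ Decoding 0x03030000: bits 25 and 24 set DN (default NaN) and FZ
@ (flush-to-zero), which together select RunFast mode on VFP11; the
@ LEN field (bits 18:16) = 0b011 gives a vector length of 4, and
@ STRIDE (bits 21:20) = 0b00 gives a stride of 1.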
        bl      .Lfft8_internal_vfp
cos1pi4:    @ cos(1*pi/4) = sqrt(2)/2
        .float  0.707106769084930419921875
cos1pi8:    @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
        .float  0.92387950420379638671875
cos3pi8:    @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
        .float  0.3826834261417388916015625
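@ These are the only twiddle factors the 8- and 16-point transforms
@ need. Since cos(3*pi/8) = sin(pi/8), the cos1pi8/cos3pi8 pair acts as
@ the (cos, sin) of the pi/8 twiddle used by the odd-index TRANSFORM
@ steps below.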
function .Lfft16_internal_vfp
        vldr    d10, [a1, #8 * 2*4]
        vldr    d12, [a1, #9 * 2*4]
        vldr    d11, [a1, #10 * 2*4]
        vldr    d13, [a1, #11 * 2*4]
        vadd.f  s16, s20, s24       @ vector op
        vldr    d4, [a1, #12 * 2*4]
        vldr    d6, [a1, #13 * 2*4]
        vldr    d5, [a1, #14 * 2*4]
        vsub.f  s20, s20, s24       @ vector op
        vldr    d7, [a1, #15 * 2*4]
        vadd.f  s16, s8, s12        @ vector op
        vstr    d0, [a1, #8 * 2*4]
        vstr    d2, [a1, #10 * 2*4]
        vstr    d1, [a1, #9 * 2*4]
        vstr    d3, [a1, #11 * 2*4]
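@ The TRANSFORM(a,b,c,d,wre,wim) annotations below name the same
@ butterfly-with-twiddle step as the C macros in
@ libavcodec/fft_template.c: c and d are rotated by the twiddle
@ (wre -/+ i*wim) and the four values are then cross-combined;
@ TRANSFORM_ZERO is the w = 1 special case, needing no multiplies.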
        @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
        vldr    d12, [a1, #10 * 2*4]
        vstr    d0, [a1, #12 * 2*4]
        @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
        vldr    d6, [a1, #9 * 2*4]
        vstr    d1, [a1, #13 * 2*4]
        vldr    d1, cos1pi4         @ s2 = cos1pi4, s3 = cos1pi8
        vstr    d2, [a1, #15 * 2*4]
        vldr    d7, [a1, #13 * 2*4]
        vmul.f  s20, s12, s3        @ vector * scalar op
        @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
        vldr    d4, [a1, #11 * 2*4]
        vldr    d5, [a1, #15 * 2*4]
        vmul.f  s24, s4, s2         @ vector * scalar op
        vmul.f  s28, s12, s1        @ vector * scalar op
        vmul.f  s12, s8, s1         @ vector * scalar op
        vmul.f  s8, s8, s3          @ vector * scalar op
        vldr    d8, [a1, #1 * 2*4]
        vldr    d9, [a1, #5 * 2*4]
        vldr    d10, [a1, #3 * 2*4]
        vldr    d11, [a1, #7 * 2*4]
        vldr    d14, [a1, #2 * 2*4]
        vadd.f  s12, s0, s16        @ vector op
        vstr    d0, [a1, #1 * 2*4]  @ transfer s0-s3 to s12-s15 via memory
        vstr    d1, [a1, #5 * 2*4]
        vldr    d4, [a1, #1 * 2*4]
        vldr    d5, [a1, #5 * 2*4]
        vsub.f  s8, s16, s8         @ vector op
        vstr    d6, [a1, #1 * 2*4]
        vstr    d7, [a1, #5 * 2*4]
        vldr    d15, [a1, #6 * 2*4]
        vadd.f  s20, s0, s20        @ vector op
        vstr    d4, [a1, #9 * 2*4]
        @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
        vldr    d6, [a1, #8 * 2*4]
        vstr    d5, [a1, #13 * 2*4]
        vldr    d7, [a1, #12 * 2*4]
        vstr    d2, [a1, #11 * 2*4]
        vldr    d8, [a1, #0 * 2*4]
        vstr    d3, [a1, #15 * 2*4]
        vldr    d9, [a1, #4 * 2*4]
        vadd.f  s8, s0, s28         @ vector op
        vstr    d0, [a1, #3 * 2*4]
        vstr    d1, [a1, #7 * 2*4]
        vldr    d6, [a1, #3 * 2*4]
        vldr    d7, [a1, #7 * 2*4]
        vsub.f  s12, s28, s12       @ vector op
        vadd.f  s16, s4, s16        @ vector op
        vstr    d10, [a1, #3 * 2*4]
        vstr    d11, [a1, #7 * 2*4]
        vstr    d4, [a1, #2 * 2*4]
        vstr    d5, [a1, #6 * 2*4]
        vstr    d0, [a1, #8 * 2*4]
        vstr    d1, [a1, #12 * 2*4]
        vstr    d6, [a1, #10 * 2*4]
        vstr    d7, [a1, #14 * 2*4]
        vstr    d8, [a1, #0 * 2*4]
        vstr    d9, [a1, #4 * 2*4]
function ff_fft16_vfp, export=1
        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
        bl      .Lfft16_internal_vfp
.macro pass n, z0, z1, z2, z3
        @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
            @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
                @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
                    @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
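        @ The increasing comment indentation above marks the butterfly
        @ strands that the loop below software-pipelines: loads,
        @ multiplies, adds and stores from neighbouring strands are
        @ interleaved to hide VFP vector issue latency and result stalls.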
        vldr    d8, [\z2, #8*(o2+1)]    @ s16,s17
        vldr    d9, [\z3, #8*(o3+1)]    @ s18,s19
        vldmia  v5!, {s0,s1}            @ s0 is unused
        vldr    s7, [\z2, #8*o2]        @ t1
        vmul.f  s20, s16, s2            @ vector * scalar
        vldr    s0, [\z3, #8*o3]        @ t5
        vldr    s6, [\z2, #8*o2+4]      @ t2
        vldr    s3, [\z3, #8*o3+4]      @ t6
        vmul.f  s16, s16, s1            @ vector * scalar
1:      add     \z0, \z0, #8*2
        @ up to 2 stalls (VFP vector issuing / waiting for s0)
        @ depending upon whether this is the first iteration and
        @ how many add instructions are inserted above
        vadd.f  s4, s0, s7              @ t5
        vadd.f  s5, s6, s3              @ t6
        vsub.f  s6, s6, s3              @ t4
        vsub.f  s7, s0, s7              @ t3
        vldr    d6, [\z0, #8*0-8*2]     @ s12,s13
        vadd.f  s0, s16, s21            @ t1
        vldr    d7, [\z1, #8*o1-8*2]    @ s14,s15
        vsub.f  s1, s18, s23            @ t5
        vadd.f  s8, s4, s12             @ vector + vector
        @ stall (VFP vector issuing)
        @ stall (VFP vector issuing)
        @ stall (VFP vector issuing)
        vsub.f  s2, s17, s20            @ t2
        vadd.f  s3, s19, s22            @ t6
        vstr    d4, [\z0, #8*0-8*2]     @ s8,s9
        vstr    d5, [\z1, #8*o1-8*2]    @ s10,s11
        @ stall (waiting for s5)
        vstr    d2, [\z2, #8*o2-8*2]    @ s4,s5
        vadd.f  s4, s1, s0              @ t5
        vstr    d3, [\z3, #8*o3-8*2]    @ s6,s7
        vsub.f  s7, s1, s0              @ t3
        vadd.f  s5, s2, s3              @ t6
        vsub.f  s6, s2, s3              @ t4
        vldr    d6, [\z0, #8*1-8*2]     @ s12,s13
        vldr    d7, [\z1, #8*(o1+1)-8*2] @ s14,s15
        vldr    d4, [\z2, #8*o2]        @ s8,s9
        vldr    d5, [\z3, #8*o3]        @ s10,s11
        vadd.f  s20, s4, s12            @ vector + vector
        vldr    d8, [\z2, #8*(o2+1)]    @ s16,s17
        @ stall (VFP vector issuing)
        vmul.f  s12, s8, s3             @ vector * scalar
        vstr    d10, [\z0, #8*1-8*2]    @ s20,s21
        vldr    d9, [\z3, #8*(o3+1)]    @ s18,s19
        vstr    d11, [\z1, #8*(o1+1)-8*2] @ s22,s23
        vmul.f  s8, s8, s0              @ vector * scalar
        vstr    d2, [\z2, #8*(o2+1)-8*2] @ s4,s5
        @ stall (waiting for s7)
        vstr    d3, [\z3, #8*(o3+1)-8*2] @ s6,s7
        vmul.f  s20, s16, s2            @ vector * scalar
        @ stall (VFP vector issuing)
        @ stall (VFP vector issuing)
        @ stall (VFP vector issuing)
        vadd.f  s7, s8, s13             @ t1
        vsub.f  s6, s9, s12             @ t2
        vsub.f  s0, s10, s15            @ t5
        vadd.f  s3, s11, s14            @ t6
        vmul.f  s16, s16, s1            @ vector * scalar
        @ What remains is identical to the first two indentations of
        @ the above, but without the increment of z
        vadd.f  s4, s0, s7              @ t5
        vadd.f  s5, s6, s3              @ t6
        vsub.f  s6, s6, s3              @ t4
        vsub.f  s7, s0, s7              @ t3
        vldr    d6, [\z0, #8*0]         @ s12,s13
        vadd.f  s0, s16, s21            @ t1
        vldr    d7, [\z1, #8*o1]        @ s14,s15
        vsub.f  s1, s18, s23            @ t5
        vadd.f  s8, s4, s12             @ vector + vector
        vsub.f  s2, s17, s20            @ t2
        vadd.f  s3, s19, s22            @ t6
        vstr    d4, [\z0, #8*0]         @ s8,s9
        vstr    d5, [\z1, #8*o1]        @ s10,s11
        vstr    d2, [\z2, #8*o2]        @ s4,s5
        vadd.f  s4, s1, s0              @ t5
        vstr    d3, [\z3, #8*o3]        @ s6,s7
        vsub.f  s7, s1, s0              @ t3
        vadd.f  s5, s2, s3              @ t6
        vsub.f  s6, s2, s3              @ t4
        vldr    d6, [\z0, #8*1]         @ s12,s13
        vldr    d7, [\z1, #8*(o1+1)]    @ s14,s15
        vadd.f  s20, s4, s12            @ vector + vector
        vstr    d10, [\z0, #8*1]        @ s20,s21
        vstr    d11, [\z1, #8*(o1+1)]   @ s22,s23
        vstr    d2, [\z2, #8*(o2+1)]    @ s4,s5
        vstr    d3, [\z3, #8*(o3+1)]    @ s6,s7
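@ def_fft assembles each size-N transform from smaller ones by the
@ usual split-radix recursion; in rough C terms (names illustrative,
@ ff_cos_N being FFmpeg's twiddle tables):
@
@     void fftN(FFTComplex *z)
@     {
@         fft_N2(z);                  /* N/2-point on z[0]    .. z[N/2-1]  */
@         fft_N4(z + N/2);            /* N/4-point on z[N/2]  .. z[3N/4-1] */
@         fft_N4(z + 3*N/4);          /* N/4-point on z[3N/4] .. z[N-1]    */
@         pass(z, ff_cos_N, N/8);     /* combine with the twiddles         */
@     }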
.macro def_fft n, n2, n4
function .Lfft\n\()_internal_vfp
        push    {v1-v2,v5-v6,lr}
        bl      .Lfft\n2\()_internal_vfp
        add     a1, v1, #8*(\n/4)*2
        bl      .Lfft\n4\()_internal_vfp
        movrelx v5, X(ff_cos_\n), a1
        add     a1, v1, #8*(\n/4)*3
        bl      .Lfft\n4\()_internal_vfp
        add     v2, v1, #8*2*(\n/4/2)
        add     v3, v1, #8*4*(\n/4/2)
        add     v4, v1, #8*6*(\n/4/2)
        pass    (\n/4/2), v1, v2, v3, v4
        add     v2, v1, #8*4*(\n/4/2)
        pass    (\n/4/2), v1, v1, v2, v2
        pass    (\n/4/2), v1, v1, v1, v1
function fft\n\()_vfp
        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
        bl      .Lfft\n\()_internal_vfp
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384
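@ Together with the 4-, 8- and 16-point routines above, these
@ expansions populate every fft_tab_vfp slot reachable from
@ ff_fft_calc_vfp for nbits 2..16 (transform sizes 4 through 65536).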