2 * Copyright (c) 2013 RISC OS Open Ltd
3 * Author: Ben Avison <bavison@riscosopen.org>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/arm/asm.S"
@ prerotation_innerloop
@ One straight-line pre-rotation step whose memory offsets are all
@ assemble-time constants derived from the symbols k and n4.
@ Reads:  IN, TCOS, TSIN, REVTAB (base registers, not modified here).
@ Writes: s0-s15 (inputs and products), s16-s23 (twiddles), J0-J3.
@ NOTE(review): trig_lo, k and n4 are not defined in the visible lines;
@ they are presumably set by the caller / an omitted line - confirm.
42 .macro prerotation_innerloop
@ trig_hi is the index mirrored from the top of an n4-entry range;
@ the input offsets are twice the corresponding trig indices.
44 .set trig_hi, n4 - k - 2
45 .set in_lo, trig_lo * 2
46 .set in_hi, trig_hi * 2
@ Two consecutive TCOS words from the low index and two from the high index.
47 vldr d8, [TCOS, #trig_lo*4] @ s16,s17
48 vldr d9, [TCOS, #trig_hi*4] @ s18,s19
@ s0-s3: the odd-numbered input words, high pair first (descending
@ addresses), then the low pair in the same descending order.
49 vldr s0, [IN, #in_hi*4 + 12]
50 vldr s1, [IN, #in_hi*4 + 4]
51 vldr s2, [IN, #in_lo*4 + 12]
52 vldr s3, [IN, #in_lo*4 + 4]
@ With a short-vector length of 4 in FPSCR this is s8..s11 = s0..s3 * s16..s19
@ (FPSCR setup is outside this macro - assumption to confirm).
53 vmul.f s8, s0, s16 @ vector operation
@ Matching TSIN words for the same low/high indices.
54 vldr d10, [TSIN, #trig_lo*4] @ s20,s21
55 vldr d11, [TSIN, #trig_hi*4] @ s22,s23
@ s4-s7: the even-numbered input words, low pair first, ascending addresses.
56 vldr s4, [IN, #in_lo*4]
57 vldr s5, [IN, #in_lo*4 + 8]
58 vldr s6, [IN, #in_hi*4]
59 vldr s7, [IN, #in_hi*4 + 8]
@ One 32-bit load fetches two adjacent 16-bit REVTAB entries at once.
60 ldr J0, [REVTAB, #trig_lo*2]
61 vmul.f s12, s0, s20 @ vector operation
62 ldr J2, [REVTAB, #trig_hi*2]
@ Keep only the low 8 bits of the loaded word as the first index.
64 and J0, J0, #255 @ halfword value will be < n4
@ s8.. -= s4.. * s20.. (multiply-subtract into the first product).
65 vmls.f s8, s4, s20 @ vector operation
67 and J2, J2, #255 @ halfword value will be < n4
@ Turn each index into an address: OUT + index * 8.
68 add J0, OUT, J0, lsl #3
@ s12.. += s4.. * s16.. (multiply-accumulate into the second product).
69 vmla.f s12, s4, s16 @ vector operation
@ NOTE(review): J1 and J3 are consumed here but not written in the visible
@ lines; presumably extracted from the upper halves of the REVTAB words
@ by omitted instructions - confirm.
70 add J1, OUT, J1, lsl #3
71 add J2, OUT, J2, lsl #3
72 add J3, OUT, J3, lsl #3
@ prerotation_innerloop_rolled
@ Pre-rotation step for a rolled (looping) implementation: instead of
@ constant offsets it walks pointer pairs towards each other, advancing the
@ low pointers upwards and the *_HI pointers downwards with writeback.
@ Updates: TCOS, TCOS_HI, TSIN, TSIN_HI, IN_HI, REVTAB, REVTAB_HI.
@ Writes:  s8-s15 (products), s16-s23 (twiddles), J0-J3 (output addresses).
@ NOTE(review): the loads of s0, s2-s5 and s7 are not in the visible lines.
84 .macro prerotation_innerloop_rolled
@ Low twiddles: load two words, then post-increment TCOS.
85 vldmia TCOS!, {s16,s17}
@ High twiddles: pre-decrement TCOS_HI, then load two words.
86 vldmdb TCOS_HI!, {s18,s19}
88 vldr s1, [IN_HI, #-12]
@ With a short-vector length of 4 in FPSCR this is s8..s11 = s0..s3 * s16..s19
@ (FPSCR setup is outside this macro - assumption to confirm).
91 vmul.f s8, s0, s16 @ vector operation
92 vldmia TSIN!, {s20,s21}
93 vldmdb TSIN_HI!, {s22,s23}
96 vldr s6, [IN_HI, #-16]
98 vmul.f s12, s0, s20 @ vector operation
@ Step the high input pointer down by 16 bytes for the next iteration.
100 sub IN_HI, IN_HI, #16
@ Two 16-bit indices from the low end, post-incrementing REVTAB by 2 each.
101 ldrh J0, [REVTAB], #2
102 ldrh J1, [REVTAB], #2
103 vmls.f s8, s4, s20 @ vector operation
@ Two 16-bit indices from the high end, pre-decrementing REVTAB_HI; J3 is
@ read first so that J2 ends up holding the entry at the lower address.
104 ldrh J3, [REVTAB_HI, #-2]!
105 ldrh J2, [REVTAB_HI, #-2]!
@ Turn each index into an address: OUT + index * 8.
106 add J0, OUT, J0, lsl #3
107 vmla.f s12, s4, s16 @ vector operation
108 add J1, OUT, J1, lsl #3
109 add J2, OUT, J2, lsl #3
110 add J3, OUT, J3, lsl #3
@ postrotation_innerloop tail, head
@ Software-pipelined post-rotation step with assemble-time constant offsets.
@ The "head" part loads data and starts the products for index k; the
@ "tail" part finishes and stores the results for index k - 2.
@ Parameters:
@   tail, head - not referenced in the visible lines; presumably tested by
@                conditional-assembly directives that are not shown, so
@                that either half can be omitted - TODO confirm.
@ Reads/writes memory through OUT; reads TCOS and TSIN; uses s0-s15 plus
@ the twiddle registers named below.
121 .macro postrotation_innerloop tail, head
@ Index/offset symbols for the head (current k) ...
122 .set trig_lo_head, n8 - k - 2
123 .set trig_hi_head, n8 + k
124 .set out_lo_head, trig_lo_head * 2
125 .set out_hi_head, trig_hi_head * 2
@ ... and for the tail (same formulas with k - 2).
126 .set trig_lo_tail, n8 - (k - 2) - 2
127 .set trig_hi_tail, n8 + (k - 2)
128 .set out_lo_tail, trig_lo_tail * 2
129 .set out_hi_tail, trig_hi_tail * 2
@ Two alternative register assignments for the TCOS values: the head of
@ one assignment uses the registers that the tail of the other reads
@ (s20 <-> s24).
@ NOTE(review): both sets appear unconditionally here; the directives that
@ select between them are presumably among the lines not shown - confirm.
131 TCOS_D0_HEAD .req d10 @ s20,s21
132 TCOS_D1_HEAD .req d11 @ s22,s23
133 TCOS_S0_TAIL .req s24
135 TCOS_D0_HEAD .req d12 @ s24,s25
136 TCOS_D1_HEAD .req d13 @ s26,s27
137 TCOS_S0_TAIL .req s20
@ Tail: subtract s0.. * TCOS from the first product started by the previous head.
140 vmls.f s8, s0, TCOS_S0_TAIL @ vector operation
@ Head: fetch TSIN pairs (low/high index) and the low TCOS pair.
143 vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17
144 vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19
145 vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
@ Tail: accumulate s4.. * TCOS into the second product.
148 vmla.f s12, s4, TCOS_S0_TAIL @ vector operation
@ Head: s0-s3 take the even-numbered words, s4-s7 the odd-numbered words,
@ of the low and high head positions in OUT.
151 vldr s0, [OUT, #out_lo_head*4]
152 vldr s1, [OUT, #out_lo_head*4 + 8]
153 vldr s2, [OUT, #out_hi_head*4]
154 vldr s3, [OUT, #out_hi_head*4 + 8]
155 vldr s4, [OUT, #out_lo_head*4 + 4]
156 vldr s5, [OUT, #out_lo_head*4 + 12]
157 vldr s6, [OUT, #out_hi_head*4 + 4]
158 vldr s7, [OUT, #out_hi_head*4 + 12]
@ Tail: s8-s11 go to the even-numbered words, low position then high.
161 vstr s8, [OUT, #out_lo_tail*4]
162 vstr s9, [OUT, #out_lo_tail*4 + 8]
163 vstr s10, [OUT, #out_hi_tail*4]
164 vstr s11, [OUT, #out_hi_tail*4 + 8]
@ Head: start the first product (s8 may be reused now that it is stored).
167 vmul.f s8, s4, s16 @ vector operation
@ Tail: s12-s15 go to the odd-numbered words in reversed order
@ (high position first, descending addresses).
170 vstr s12, [OUT, #out_hi_tail*4 + 12]
171 vstr s13, [OUT, #out_hi_tail*4 + 4]
172 vstr s14, [OUT, #out_lo_tail*4 + 12]
173 vstr s15, [OUT, #out_lo_tail*4 + 4]
@ Head: start the second product and fetch the high TCOS pair.
176 vmul.f s12, s0, s16 @ vector operation
177 vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
@ postrotation_innerloop_rolled
@ Pointer-walking counterpart of the software-pipelined post-rotation step:
@ the "head" work loads inputs and starts products for one position while
@ the "tail" work finishes and stores the previous position.
@ Parameters:
@   tail, head           - not referenced in the visible lines; presumably
@                          tested by conditional-assembly directives that
@                          are not shown - TODO confirm.
@   tcos_s0_head..s3_head - S registers that receive the TCOS values loaded
@                          for the head (s0/s1 from TCOS, s2/s3 from TCOS_HI).
@   tcos_s0_tail         - first S register of the TCOS values used by the tail.
@   out_offset_head      - byte offset applied to OUT (added) and OUT_HI
@                          (subtracted) for the head loads.
@   out_offset_tail      - same, for the tail stores.
@ Updates: TSIN, TSIN_HI, TCOS, TCOS_HI (writeback addressing).
187 .macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
@ Tail: subtract s0.. * TCOS from the first product started by the previous head.
189 vmls.f s8, s0, \tcos_s0_tail @ vector operation
@ Head: TSIN pairs from both ends (post-increment low, pre-decrement high)
@ and the low TCOS pair.
192 vldmia TSIN!, {s16,s17}
193 vldmdb TSIN_HI!, {s18,s19}
194 vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head}
@ Tail: accumulate s4.. * TCOS into the second product.
197 vmla.f s12, s4, \tcos_s0_tail @ vector operation
@ Head: s0/s1 and s4/s5 come from ascending offsets above OUT, s2/s3 and
@ s6/s7 from the 16 bytes ending at OUT_HI - out_offset_head.
200 vldr s0, [OUT, #+\out_offset_head+0]
201 vldr s1, [OUT, #+\out_offset_head+8]
202 vldr s2, [OUT_HI, #-\out_offset_head-16]
203 vldr s3, [OUT_HI, #-\out_offset_head-8]
204 vldr s4, [OUT, #+\out_offset_head+4]
205 vldr s5, [OUT, #+\out_offset_head+12]
206 vldr s6, [OUT_HI, #-\out_offset_head-12]
207 vldr s7, [OUT_HI, #-\out_offset_head-4]
@ Tail: s8-s11 go to word offsets +0/+8 from OUT and -16/-8 from OUT_HI.
210 vstr s8, [OUT, #+\out_offset_tail+0]
211 vstr s9, [OUT, #+\out_offset_tail+8]
212 vstr s10, [OUT_HI, #-\out_offset_tail-16]
213 vstr s11, [OUT_HI, #-\out_offset_tail-8]
@ Head: start the first product (s8 may be reused now that it is stored).
216 vmul.f s8, s4, s16 @ vector operation
@ Tail: s12-s15 fill the remaining words in reversed order
@ (OUT_HI side first, descending addresses).
219 vstr s12, [OUT_HI, #-\out_offset_tail-4]
220 vstr s13, [OUT_HI, #-\out_offset_tail-12]
221 vstr s14, [OUT, #+\out_offset_tail+12]
222 vstr s15, [OUT, #+\out_offset_tail+4]
@ Head: start the second product and fetch the high TCOS pair
@ (pre-decrementing TCOS_HI).
225 vmul.f s12, s0, s16 @ vector operation
226 vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
231 /* void ff_imdct_half_vfp(FFTContext *s,
233 * const FFTSample *input)
235 function ff_imdct_half_vfp, export=1
236 ldr ip, [CONTEXT, #5*4] @ mdct_bits
248 ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
251 ldr REVTAB, [CONTEXT, #2*4]
252 ldr TCOS, [CONTEXT, #6*4]
253 ldr TSIN, [CONTEXT, #7*4]
257 prerotation_innerloop
263 ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
267 postrotation_innerloop , head
269 postrotation_innerloop tail, head
271 postrotation_innerloop tail
278 push {v1-v6,sl,fp,lr}
281 ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
285 ldr REVTAB, [CONTEXT, #2*4]
286 ldr TCOS, [CONTEXT, #6*4]
287 ldr TSIN, [CONTEXT, #7*4]
290 push {CONTEXT,OLDFPSCR}
291 add IN_HI, IN, lr, lsl #1
292 add REVTAB_HI, REVTAB, lr, lsr #1
293 add TCOS_HI, TCOS, lr
294 add TSIN_HI, TSIN, lr
295 0: prerotation_innerloop_rolled
298 ldmia sp, {CONTEXT,OLDFPSCR}
302 ldr ip, [CONTEXT, #9*4]
303 blx ip @ s->fft_calc(s, output)
305 pop {CONTEXT,OLDFPSCR}
306 ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
307 ldr ip, [CONTEXT, #5*4] @ mdct_bits
311 sub TCOS, TCOS, lr, lsr #1
312 sub TSIN, TSIN, lr, lsr #1
313 add OUT_HI, OUT, lr, lsl #1
314 add TCOS_HI, TCOS, lr
315 add TSIN_HI, TSIN, lr
316 postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
319 sub OUT_HI, OUT_HI, #32
320 postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
321 1: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
324 postrotation_innerloop_rolled tail,,,,,, s24,, 16