2 * Copyright (c) 2013 RISC OS Open Ltd
3 * Author: Ben Avison <bavison@riscosopen.org>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/arm/asm.S"
@ Register aliases shared by the LFE FIR kernels below. Only the first
@ element of each 4-long VFP short vector is named here; the companion
@ aliases referenced by inner_loop (COEF1-COEF7, ACCUM4, POST1-POST3,
@ IN0-IN7, PIN, PCOEF, POUT, COUNTER) appear to be defined on lines
@ elided from this view -- confirm against the full file.
31 SCALE32 .req s28 @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
32 SCALE64 .req s0 @ spare register in scalar bank when decifactor=64 / JMAX=4
41 COEF0 .req s8 @ coefficient elements
49 ACCUM0 .req s16 @ double-buffered multiply-accumulate results
51 POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
57 .macro inner_loop decifactor, dir, tail, head
@ One software-pipelined step of the LFE FIR inner loop, operating on VFP
@ short vectors of length 4 (FPSCR LEN set by the caller).
@   \head  - if non-empty, load the next coefficient columns and start the
@            multiply-accumulate chain for the NEXT group of 4 outputs
@   \tail  - if non-empty, finish the PREVIOUS group: sum the two accumulator
@            banks, apply the scale factor, and store the 4 results
@ NOTE(review): the .set lines defining X and Y from \dir (up/down -- walk
@ the coefficient matrix forwards or backwards) and the .ifnc "\head"/""
@ and .ifnc "\tail" guards around each section are elided from this view;
@ the two identical "vmul.f ACCUM4, COEF4, IN1" lines below are presumably
@ separate conditional arms, not a duplicate -- confirm against full file.
66 vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
67 vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
68 vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
69 vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
@ --- tail of previous iteration: combine the two accumulator banks ---
72 vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
@ --- head: begin MACs for the next 4 outputs while tail work drains ---
75 vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
76 vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
77 vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
78 vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
@ Long-latency post-multiply runs in the POST bank so it overlaps the MACs.
81 vmul.f POST0, POST0, SCALE\decifactor @ vector operation (SCALE may be scalar)
84 vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
86 vmul.f ACCUM4, COEF4, IN1 @ vector operation
88 vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
89 vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
91 vmul.f ACCUM4, COEF4, IN1 @ vector operation
93 vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
94 vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
@ Store the 4 finished, scaled outputs of the previous iteration.
97 vstmia POUT!, {POST0-POST3}
100 vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
101 vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
102 vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
103 vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
104 vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
105 vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
@ decifactor == 32 means JMAX == 8: four extra taps (IN4-IN7) per step.
106 .if \decifactor == 32
107 vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
108 vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
109 vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
110 vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
111 vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
112 vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
113 vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
114 vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
115 vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
116 vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
117 vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
118 vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
119 vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
120 vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
121 vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
122 vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
123 vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
124 vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
125 vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
126 vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
@ Emit the body of an LFE FIR interpolation for one decimation factor
@ (32 or 64). Two passes over the coefficient matrix -- "up" then "down"
@ -- each producing decifactor/4 groups of 4 outputs via the pipelined
@ inner_loop (head-only first call, steady-state head+tail in the loop,
@ tail-only last call).
131 .macro dca_lfe_fir decifactor
132 .if \decifactor == 32
@ Replicate the scalar scale factor across a 4-long vector so the
@ post-multiply in inner_loop can be a full vector op.
135 vmov SCALE32, s0 @ duplicate scalar across vector
@ decifactor 32 needs 8 input taps; load the extra 4 (newest-first layout,
@ hence the negative offsets from PIN).
136 vldr IN4, [PIN, #-4*4]
137 vldr IN5, [PIN, #-5*4]
138 vldr IN6, [PIN, #-6*4]
139 vldr IN7, [PIN, #-7*4]
@ --- first pass: walk coefficients upwards ---
@ NOTE(review): the loop-closing branches (bne 1b) after each "subs" are
@ elided from this view -- confirm against the full file.
145 mov COUNTER, #\decifactor/4 - 1
146 inner_loop \decifactor, up,, head
147 1: add PCOEF, PCOEF, #4*JMAX*4
148 subs COUNTER, COUNTER, #1
149 inner_loop \decifactor, up, tail, head
151 inner_loop \decifactor, up, tail
@ --- second pass: walk coefficients downwards ---
153 mov COUNTER, #\decifactor/4 - 1
154 inner_loop \decifactor, down,, head
155 1: sub PCOEF, PCOEF, #4*JMAX*4
156 subs COUNTER, COUNTER, #1
157 inner_loop \decifactor, down, tail, head
159 inner_loop \decifactor, down, tail
161 .if \decifactor == 32
171 /* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
172 * int decifactor, float scale)
@ Entry point for the LFE FIR filter. Configures the VFP for RunFast mode
@ with 4-long short vectors (the vmsr writing FPSCR from ip appears elided
@ from this view) and preloads the 4 newest input samples -- the input is
@ addressed backwards from PIN, hence the negative offsets. The remainder
@ of the function (dca_lfe_fir expansions for decifactor 32/64, FPSCR
@ restore, endfunc) lies past this view.
174 function ff_dca_lfe_fir_vfp, export=1
177 ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
180 vldr IN0, [PIN, #-0*4]
181 vldr IN1, [PIN, #-1*4]
182 vldr IN2, [PIN, #-2*4]
183 vldr IN3, [PIN, #-3*4]
@ Aliases and stack-frame documentation for ff_dca_qmf_32_subbands_vfp.
@ Two frame shapes exist because the scale argument arrives in a core
@ register under softfp (NOVFP) but in a VFP register under hardfp (VFP).
230 SCALEINT .req v4 @ only used in softfp case
235 /* Stack layout differs in softfp and hardfp cases:
238 * fp -> 6 arg words saved by caller
239 * a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
242 * buf -> 8*32*4 bytes buffer
244 * sp -> 3 arg words for callee
247 * fp -> 7 arg words saved by caller
248 * a4,v1-v5,fp,lr on entry
251 * buf -> 8*32*4 bytes buffer
252 * sp -> 4 arg words for callee
255 /* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
256 * SynthFilterContext *synth, FFTContext *imdct,
257 * float (*synth_buf_ptr)[512],
258 * int *synth_buf_offset, float (*synth_buf2)[32],
259 * const float (*window)[512], float *samples_out,
260 * float (*raXin)[32], float scale);
262 function ff_dca_qmf_32_subbands_vfp, export=1
@ Prologue differs per ABI: hardfp (VFP) pushes a3 only as 8-byte padding.
@ NOTE(review): many interior lines of this function (frame setup, FPSCR
@ write, loop branches and labels) are elided from this view -- the
@ commentary below covers only the visible instructions.
263 VFP push {a3-a4,v1-v3,v5,fp,lr}
264 NOVFP push {a4,v1-v5,fp,lr}
267 @ The buffer pointed at by raXin isn't big enough for us to do a
268 @ complete matrix transposition as we want to, so allocate an
269 @ alternative buffer from the stack. Align to 4 words for speed.
273 ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
276 @ COUNT is used to count down 2 things at once:
277 @ bits 0-4 are the number of word pairs remaining in the output row
278 @ bits 5-31 are the number of words to copy (with possible negation)
279 @ from the source matrix before we start zeroing the remainder
280 mov COUNT, #(-4 << 5) + 16
281 adds COUNT, COUNT, SBACT, lsl #5
@ --- transpose samples_in[32][8] into BUF[8][32], 4 input rows at a time.
@ Row pairs are loaded into the even/odd halves of d4-d11 so that each
@ vstr of a d register writes two consecutive words of a transposed row.
284 vldr s8, [IN, #(0*8+0)*4]
285 vldr s10, [IN, #(0*8+1)*4]
286 vldr s12, [IN, #(0*8+2)*4]
287 vldr s14, [IN, #(0*8+3)*4]
288 vldr s16, [IN, #(0*8+4)*4]
289 vldr s18, [IN, #(0*8+5)*4]
290 vldr s20, [IN, #(0*8+6)*4]
291 vldr s22, [IN, #(0*8+7)*4]
@ Second input row goes into the odd-numbered singles (high half of each d).
293 vldr s9, [IN, #(1*8+0)*4]
294 vldr s11, [IN, #(1*8+1)*4]
295 vldr s13, [IN, #(1*8+2)*4]
296 vldr s15, [IN, #(1*8+3)*4]
298 vldr s17, [IN, #(1*8+4)*4]
299 vldr s19, [IN, #(1*8+5)*4]
300 vldr s21, [IN, #(1*8+6)*4]
301 vldr s23, [IN, #(1*8+7)*4]
302 vstr d4, [BUF, #(0*32+0)*4]
303 vstr d5, [BUF, #(1*32+0)*4]
304 vstr d6, [BUF, #(2*32+0)*4]
305 vstr d7, [BUF, #(3*32+0)*4]
306 vstr d8, [BUF, #(4*32+0)*4]
307 vstr d9, [BUF, #(5*32+0)*4]
308 vstr d10, [BUF, #(6*32+0)*4]
309 vstr d11, [BUF, #(7*32+0)*4]
@ Rows 2 and 3 of the 4-row group, stored two columns further along.
310 vldr s9, [IN, #(3*8+0)*4]
311 vldr s11, [IN, #(3*8+1)*4]
312 vldr s13, [IN, #(3*8+2)*4]
313 vldr s15, [IN, #(3*8+3)*4]
314 vldr s17, [IN, #(3*8+4)*4]
315 vldr s19, [IN, #(3*8+5)*4]
316 vldr s21, [IN, #(3*8+6)*4]
317 vldr s23, [IN, #(3*8+7)*4]
319 vldr s8, [IN, #(2*8+0)*4]
320 vldr s10, [IN, #(2*8+1)*4]
321 vldr s12, [IN, #(2*8+2)*4]
322 vldr s14, [IN, #(2*8+3)*4]
324 vldr s16, [IN, #(2*8+4)*4]
325 vldr s18, [IN, #(2*8+5)*4]
326 vldr s20, [IN, #(2*8+6)*4]
327 vldr s22, [IN, #(2*8+7)*4]
328 vstr d4, [BUF, #(0*32+2)*4]
329 vstr d5, [BUF, #(1*32+2)*4]
330 vstr d6, [BUF, #(2*32+2)*4]
331 vstr d7, [BUF, #(3*32+2)*4]
332 vstr d8, [BUF, #(4*32+2)*4]
333 vstr d9, [BUF, #(5*32+2)*4]
334 vstr d10, [BUF, #(6*32+2)*4]
335 vstr d11, [BUF, #(7*32+2)*4]
@ Consumed 4 source rows and produced 2 output word pairs per row.
338 subs COUNT, COUNT, #(4 << 5) + 2
340 2: @ Now deal with trailing < 4 samples
341 adds COUNT, COUNT, #3 << 5
342 bmi 4f @ sb_act was a multiple of 4
@ lr != 0 here means at least 2 trailing rows remain (handled at 3:).
343 bics lr, COUNT, #0x1F
@ --- single trailing row: load it and store with whatever is already in
@ the odd singles (the partner column is zeroed/ignored downstream).
346 vldr s8, [IN, #(0*8+0)*4]
347 vldr s10, [IN, #(0*8+1)*4]
348 vldr s12, [IN, #(0*8+2)*4]
349 vldr s14, [IN, #(0*8+3)*4]
350 vldr s16, [IN, #(0*8+4)*4]
351 vldr s18, [IN, #(0*8+5)*4]
352 vldr s20, [IN, #(0*8+6)*4]
353 vldr s22, [IN, #(0*8+7)*4]
364 vstr d4, [BUF, #(0*32+0)*4]
365 vstr d5, [BUF, #(1*32+0)*4]
366 vstr d6, [BUF, #(2*32+0)*4]
367 vstr d7, [BUF, #(3*32+0)*4]
368 vstr d8, [BUF, #(4*32+0)*4]
369 vstr d9, [BUF, #(5*32+0)*4]
370 vstr d10, [BUF, #(6*32+0)*4]
371 vstr d11, [BUF, #(7*32+0)*4]
375 3: @ sb_act was n*4+2 or n*4+3, so do the first 2
376 vldr s8, [IN, #(0*8+0)*4]
377 vldr s10, [IN, #(0*8+1)*4]
378 vldr s12, [IN, #(0*8+2)*4]
379 vldr s14, [IN, #(0*8+3)*4]
380 vldr s16, [IN, #(0*8+4)*4]
381 vldr s18, [IN, #(0*8+5)*4]
382 vldr s20, [IN, #(0*8+6)*4]
383 vldr s22, [IN, #(0*8+7)*4]
385 vldr s9, [IN, #(1*8+0)*4]
386 vldr s11, [IN, #(1*8+1)*4]
387 vldr s13, [IN, #(1*8+2)*4]
388 vldr s15, [IN, #(1*8+3)*4]
390 vldr s17, [IN, #(1*8+4)*4]
391 vldr s19, [IN, #(1*8+5)*4]
392 vldr s21, [IN, #(1*8+6)*4]
393 vldr s23, [IN, #(1*8+7)*4]
394 vstr d4, [BUF, #(0*32+0)*4]
395 vstr d5, [BUF, #(1*32+0)*4]
396 vstr d6, [BUF, #(2*32+0)*4]
397 vstr d7, [BUF, #(3*32+0)*4]
398 vstr d8, [BUF, #(4*32+0)*4]
399 vstr d9, [BUF, #(5*32+0)*4]
400 vstr d10, [BUF, #(6*32+0)*4]
401 vstr d11, [BUF, #(7*32+0)*4]
@ Two rows consumed, one word pair produced.
403 sub COUNT, COUNT, #(2 << 5) + 1
@ Third trailing row present (n*4+3 case) iff lr != 0.
404 bics lr, COUNT, #0x1F
407 vldr s8, [IN, #(2*8+0)*4]
408 vldr s10, [IN, #(2*8+1)*4]
409 vldr s12, [IN, #(2*8+2)*4]
410 vldr s14, [IN, #(2*8+3)*4]
411 vldr s16, [IN, #(2*8+4)*4]
412 vldr s18, [IN, #(2*8+5)*4]
413 vldr s20, [IN, #(2*8+6)*4]
414 vldr s22, [IN, #(2*8+7)*4]
423 vstr d4, [BUF, #(0*32+0)*4]
424 vstr d5, [BUF, #(1*32+0)*4]
425 vstr d6, [BUF, #(2*32+0)*4]
426 vstr d7, [BUF, #(3*32+0)*4]
427 vstr d8, [BUF, #(4*32+0)*4]
428 vstr d9, [BUF, #(5*32+0)*4]
429 vstr d10, [BUF, #(6*32+0)*4]
430 vstr d11, [BUF, #(7*32+0)*4]
433 4: @ Now fill the remainder with 0
@ Isolate the remaining word-pair count from bits 0-4 of COUNT.
436 ands COUNT, COUNT, #0x1F
@ d4 is presumably zeroed on an elided line before this loop -- confirm.
438 5: vstr d4, [BUF, #(0*32+0)*4]
439 vstr d4, [BUF, #(1*32+0)*4]
440 vstr d4, [BUF, #(2*32+0)*4]
441 vstr d4, [BUF, #(3*32+0)*4]
442 vstr d4, [BUF, #(4*32+0)*4]
443 vstr d4, [BUF, #(5*32+0)*4]
444 vstr d4, [BUF, #(6*32+0)*4]
445 vstr d4, [BUF, #(7*32+0)*4]
447 subs COUNT, COUNT, #1
@ --- per-row loop: hand each transposed row to the synth filter. Under
@ softfp the scale argument travels in SCALEINT on the stack; under hardfp
@ it is reloaded into SCALE from the outgoing argument area.
451 ldr WINDOW, [fp, #3*4]
454 NOVFP ldr SCALEINT, [fp, #6*4]
458 NOVFP sub sp, sp, #4*4
460 VFP ldr a1, [fp, #-7*4] @ imdct
461 NOVFP ldr a1, [fp, #-8*4]
463 VFP stmia sp, {WINDOW, OUT, BUF}
464 NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
465 VFP vldr SCALE, [sp, #3*4]
466 bl X(ff_synth_filter_float_vfp)
469 subs COUNT, COUNT, #1
@ Epilogue: rewind sp to just below the saved registers (A = ARM-mode,
@ T = Thumb-mode encoding of the same adjustment), then restore and return.
472 A sub sp, fp, #(8+8)*4
473 T sub fp, fp, #(8+8)*4
476 VFP pop {a3-a4,v1-v3,v5,fp,pc}
477 NOVFP pop {a4,v1-v5,fp,pc}