2 * Copyright (c) 2014 RISC OS Open Ltd
3 * Author: Ben Avison <bavison@riscosopen.org>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavutil/arm/asm.S"
// Size limits. Further down they are turned into byte offsets into the
// coefficient and state arrays (4 * MAX_FIR_ORDER and
// 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)) and into the post-increment stride
// of the sample pointers (4 * MAX_CHANNELS).
24 #define MAX_CHANNELS 8
25 #define MAX_FIR_ORDER 8
26 #define MAX_IIR_ORDER 4
27 #define MAX_RATEFACTOR 4
// Evaluates to 160 with the MAX_RATEFACTOR defined above.
28 #define MAX_BLOCKSIZE (40 * MAX_RATEFACTOR)
46 // Some macros that do loads/multiplies where the register number is determined
47 // from an assembly-time expression. Boy is GNU assembler's syntax ugly...
// load: front end for load_. Wraps \index in %( ) so that it is evaluated
// as an assembly-time expression and reaches load_ as a plain number.
//   group  - register name prefix (used below with CO and ST)
//   index  - expression giving the register number within the group
//   base   - base address register
//   offset - byte offset from \base
49 .macro load group, index, base, offset
51 load_ \group, %(\index), \base, \offset
// load_: load one word from [\base + \offset] into the register whose name
// is \group immediately followed by \index (for example CO and 2 give CO2).
55 .macro load_ group, index, base, offset
56 ldr \group\index, [\base, #\offset]
// loadd: front end for loadd_. Evaluates \index and \index+1 at assembly
// time and passes both numbers on, so that loadd_ can name two
// consecutively numbered registers of \group.
59 .macro loadd group, index, base, offset
61 loadd_ \group, %(\index), %(\index+1), \base, \offset
// loadd_: load the two consecutive words at [\base + \offset] and
// [\base + \offset + 4] into \group\index0 and \group\index1.
// Two forms are visible: a pair of single-word ldr instructions on the
// A-prefixed lines, and one ldrd.
// NOTE(review): the conditional directives that choose between the two
// forms are not visible in this excerpt - confirm against the full file.
65 .macro loadd_ group, index0, index1, base, offset
67 A ldr \group\index0, [\base, #\offset]
68 A ldr \group\index1, [\base, #(\offset) + 4]
70 ldrd \group\index0, \group\index1, [\base, #\offset]
// multiply: front end for multiply_. Evaluates \index at assembly time and
// forwards \accumulate and \long unchanged.
74 .macro multiply index, accumulate, long
76 multiply_ %(\index), \accumulate, \long
// multiply_: multiply coefficient register CO\index by state register
// ST\index. Four instruction forms are visible below.
// NOTE(review): the conditionals that select one of them are not visible in
// this excerpt. Presumably \long picks the 64-bit forms and \accumulate
// picks the accumulating forms - confirm against the full file.
80 .macro multiply_ index, accumulate, long
// 64-bit multiply-accumulate: AC1:AC0 += CO * ST (AC0 low word, AC1 high word)
83 smlal AC0, AC1, CO\index, ST\index
// 64-bit multiply: AC1:AC0 = CO * ST
85 smull AC0, AC1, CO\index, ST\index
// 32-bit multiply-accumulate: AC0 = CO * ST + AC0
89 mla AC0, CO\index, ST\index, AC0
// 32-bit multiply: AC0 = CO * ST
91 mul AC0, CO\index, ST\index
96 // A macro to update the load register number and load offsets
// NOTE(review): the .macro header line and several conditional directives of
// this macro are not visible in this excerpt. The body uses one argument,
// \howmany, the number of words that have just been loaded.
// Advance the register number, wrapping within a group of four registers.
99 .set LOAD_REG, (LOAD_REG + \howmany) & 3
// Advance both byte offsets by one word per register loaded.
100 .set OFFSET_CO, OFFSET_CO + 4 * \howmany
101 .set OFFSET_ST, OFFSET_ST + 4 * \howmany
103 .set FIR_REMAIN, FIR_REMAIN - \howmany
// Move both offsets to the second group of words (the IIR terms).
// NOTE(review): presumably done once FIR_REMAIN reaches zero - the guarding
// directive is not visible here.
105 .set OFFSET_CO, 4 * MAX_FIR_ORDER
106 .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
108 .elseif IIR_REMAIN > 0
109 .set IIR_REMAIN, IIR_REMAIN - \howmany
113 // Macro to implement the inner loop for one specific combination of parameters
// Parameters (all assembly-time constants):
//   mask_minus1 - passed as 1 or 0 by the function entry code.
//                 NOTE(review): presumably 1 when mask == -1 - confirm
//   shift_0     - 1 in the instances specialised for filter_shift == 0
//   shift_8     - 1 in the instances specialised for filter_shift == 8
//   iir_taps    - number of IIR taps handled by this instance
//   fir_taps    - number of FIR taps handled by this instance
// NOTE(review): many conditional directives, the loop label and the macro
// terminator are not visible in this excerpt. The comments added here only
// describe the lines that are visible.
115 .macro implement_filter mask_minus1, shift_0, shift_8, iir_taps, fir_taps
116 .set TOTAL_TAPS, \iir_taps + \fir_taps
118 // Deal with register allocation...
// DEFINED_SHIFT and SHUFFLE_SHIFT start at 0 and are set to 1 in some of the
// register-allocation cases below.
119 .set DEFINED_SHIFT, 0
121 .set SHUFFLE_SHIFT, 0
126 // Little register pressure in this case - just keep MASK where it was
134 // AC1 is unused with shift 0
142 // All coefficients are preloaded (so pointer not needed)
150 .else // shift not 0 or 8
152 // All coefficients are preloaded, and at least one CO register is unused
155 .set DEFINED_SHIFT, 1
156 .set SHUFFLE_SHIFT, 1
159 .set DEFINED_SHIFT, 1
160 .set SHUFFLE_SHIFT, 1
167 .elseif TOTAL_TAPS == 4
168 // All coefficients are preloaded
170 .set DEFINED_SHIFT, 1
171 .set SHUFFLE_SHIFT, 1
185 .set DEFINED_SHIFT, 1
192 // Preload coefficients if possible
// Each preload reads one coefficient word from PCO, then steps the register
// number (modulo 4) and the coefficient offset by one word.
201 load CO, LOAD_REG, PCO, OFFSET_CO
202 .set LOAD_REG, (LOAD_REG + 1) & 3
203 .set OFFSET_CO, OFFSET_CO + 4
// Restart the coefficient offset at the second group of coefficients.
205 .set OFFSET_CO, 4 * MAX_FIR_ORDER
207 load CO, LOAD_REG, PCO, OFFSET_CO
208 .set LOAD_REG, (LOAD_REG + 1) & 3
209 .set OFFSET_CO, OFFSET_CO + 4
213 // Move mask/shift to final positions if necessary
214 // Need to do this after preloading, because in some cases we
215 // reuse the coefficient pointer register
226 // Things simplify a lot in this case
227 // In fact this could be pipelined further if it's worth it...
// Store ST0 both into the state array (second group of words) and into the
// sample buffer, stepping PSAMP on by 4 * MAX_CHANNELS bytes.
234 str ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
235 str ST0, [PSAMP], #4 * MAX_CHANNELS
// Assembly-time bookkeeping for the load and multiply sequence.
244 .set FIR_REMAIN, \fir_taps
245 .set IIR_REMAIN, \iir_taps
246 .if FIR_REMAIN == 0 // only IIR terms
247 .set OFFSET_CO, 4 * MAX_FIR_ORDER
248 .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
// Multiplies start at the same register number as the loads.
253 .set MUL_REG, LOAD_REG
257 .if FIR_REMAIN != 0 || IIR_REMAIN != 0
260 load CO, LOAD_REG, PCO, OFFSET_CO
262 load ST, LOAD_REG, PST, OFFSET_ST
264 .elseif COUNTER == 1 && (\fir_taps & 1) == 0
266 load CO, LOAD_REG, PCO, OFFSET_CO
268 load ST, LOAD_REG, PST, OFFSET_ST
270 .elseif LOAD_BANK == 0
// A single-word load is used when only one IIR term is left, otherwise a
// two-word load.
272 .if FIR_REMAIN == 0 && IIR_REMAIN == 1
273 load CO, LOAD_REG, PCO, OFFSET_CO
275 loadd CO, LOAD_REG, PCO, OFFSET_CO
280 .if FIR_REMAIN == 0 && IIR_REMAIN == 1
281 load ST, LOAD_REG, PST, OFFSET_ST
284 loadd ST, LOAD_REG, PST, OFFSET_ST
291 // Do interleaved multiplies, slightly delayed
// Second argument: accumulate once COUNTER exceeds 2.
// Third argument: request the long form whenever shift_0 is 0.
293 multiply MUL_REG, COUNTER > 2, !\shift_0
294 .set MUL_REG, (MUL_REG + 1) & 3
296 .set COUNTER, COUNTER + 1
299 // Post-process the result of the multiplies
// Reload SHIFT and MASK from the stack slots at sp + 9*4 and sp + 9*4 + 4,
// the same base offset the function entry uses for its stacked arguments.
301 ldr SHIFT, [sp, #9*4 + 0*4]
304 ldr MASK, [sp, #9*4 + 1*4]
// Merge the low 8 bits of AC1 into the top byte of AC0.
// NOTE(review): presumably paired with a right shift of AC0 by 8 that is
// not visible here.
310 orr AC0, AC0, AC1, lsl #24
// Variable shift: AC0 is shifted right by SHIFT and bits from AC1 shifted
// left by ST3 are merged in.
// NOTE(review): presumably ST3 holds 32 - SHIFT at this point - confirm.
313 mov AC0, AC0, lsr SHIFT
314 A orr AC0, AC0, AC1, lsl ST3
315 T mov AC1, AC1, lsl ST3
// Store ST2 into the state array (second group of words) and ST3 into the
// sample buffer, stepping PSAMP on by 4 * MAX_CHANNELS bytes.
326 str ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
327 str ST3, [PSAMP], #4 * MAX_CHANNELS
// switch_on_fir_taps: jump-table dispatch on a3 to one implement_filter
// instance per FIR tap count. The first four arguments of every instance
// are forwarded unchanged. Labels 70 to 78 correspond to 0 to 8 FIR taps.
// NOTE(review): the local label 0 referenced as 0b, the conditionals around
// the later table entries and instances, and the macro terminator are not
// visible in this excerpt.
340 .macro switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps
// A-prefixed lines: load pc from a table of word addresses.
// T-prefixed lines: tbh branches via a table of halfword offsets.
341 A ldr pc, [pc, a3, lsl #2] // firorder is in range 0-(8-iir_taps)
342 T tbh [pc, a3, lsl #1]
// In ARM state a read of pc yields the address of the ldr plus 8, so the
// leading 0 in the word table is a filler and entry 0 is 70f.
344 A .word 0, 70f, 71f, 72f, 73f, 74f
// Halfword entries hold (target - 0b) / 2, the form tbh expects.
345 T .hword (70f - 0b) / 2, (71f - 0b) / 2, (72f - 0b) / 2, (73f - 0b) / 2, (74f - 0b) / 2
348 T .hword (75f - 0b) / 2
351 T .hword (76f - 0b) / 2
354 T .hword (77f - 0b) / 2
357 T .hword (78f - 0b) / 2
362 70: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
363 71: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
364 72: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
365 73: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
366 74: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
368 75: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
370 76: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
372 77: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
374 78: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
// switch_on_iir_taps: jump-table dispatch on a4 to one switch_on_fir_taps
// instance per IIR tap count. Labels 60 to 64 correspond to 0 to 4 IIR taps.
// NOTE(review): the local label 0 referenced as 0b and the macro terminator
// are not visible in this excerpt.
381 .macro switch_on_iir_taps mask_minus1, shift_0, shift_8
382 A ldr pc, [pc, a4, lsl #2] // iirorder is in range 0-4
383 T tbh [pc, a4, lsl #1]
// In ARM state a read of pc yields the address of the ldr plus 8, so the
// leading 0 in the word table is a filler and entry 0 is 60f.
385 A .word 0, 60f, 61f, 62f, 63f, 64f
// Halfword entries hold (target - 0b) / 2, the form tbh expects.
386 T .hword (60f - 0b) / 2, (61f - 0b) / 2, (62f - 0b) / 2, (63f - 0b) / 2, (64f - 0b) / 2
387 60: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 0
388 61: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 1
389 62: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 2
390 63: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 3
391 64: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 4
394 /* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
395 * int firorder, int iirorder,
396 * unsigned int filter_shift, int32_t mask,
397 * int blocksize, int32_t *sample_buffer);
399 function ff_mlp_filter_channel_arm, export=1
// NOTE(review): the register save, the conditional branches to labels 10 to
// 50 and the function exit are not visible in this excerpt.
// The offset 9*4 presumably skips nine registers saved on entry - confirm.
401 add v1, sp, #9*4 // point at arguments on stack
// Fetch the four stacked arguments in signature order: filter_shift into
// ST0, mask into ST1, blocksize into I and sample_buffer into PSAMP.
402 ldm v1, {ST0,ST1,I,PSAMP}
// Shifting left by 29 leaves only bits 0 to 2 of the shift in ST2, so Z is
// set when the shift is 0 or 8, and the carry receives bit 3, which tells
// the two apart.
405 movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
// Six specialised copies follow. The arguments are mask_minus1, shift_0 and
// shift_8, so the first three are the mask_minus1 == 1 variants for shift 0,
// shift 8 and any other shift.
408 switch_on_iir_taps 1, 1, 0
409 10: switch_on_iir_taps 1, 0, 1
410 20: switch_on_iir_taps 1, 0, 0
// Same shift test again for the mask_minus1 == 0 variants.
411 30: movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
414 switch_on_iir_taps 0, 1, 0
415 40: switch_on_iir_taps 0, 0, 1
416 50: switch_on_iir_taps 0, 0, 0
435 /********************************************************************/
// Register aliases used by the rematrix code below. The first three name the
// registers a1 to a3, matching the first three parameters of
// ff_mlp_rematrix_channel_arm.
// NOTE(review): further aliases used below (CO0 to CO3, SA0 to SA3, AC0, AC1,
// NOISE, LSB, MASK, INDEX) are not visible in this excerpt.
437 PSA .req a1 // samples
438 PCO .req a2 // coeffs
439 PBL .req a3 // bypassed_lsbs
// DCH shares a register with SA2.
453 DCH .req SA2 // dest_ch
456 // INDEX is used as follows:
457 // bits 0..6 index2 (values up to 17, but wider so that we can
458 // add to index field without needing to mask)
459 // bits 7..14 i (values up to 160)
460 // bit 15 underflow detect for i
461 // bits 25..31 (if access_unit_size_pow2 == 128) \ index
462 // bits 26..31 (if access_unit_size_pow2 == 64) /
// implement_rematrix: loop body for one combination of parameters.
//   shift       - assembly-time constant added to 7 to form the left shift
//                 applied to the noise value
//   index_mask  - 63 or 127, selects the width of the index field at the top
//                 of INDEX (it is passed as "undefined" in one case)
//   mask_minus1 - passed as 1 or 0 by switch_on_mask.
//                 NOTE(review): presumably 1 when mask == -1 - confirm
//   maxchan     - 1, 5 or 7 in the instances generated by switch_on_maxchan
// NOTE(review): many conditional directives, several loads, the loop control
// and the macro terminator are not visible in this excerpt. The comments
// added here only describe the lines that are visible.
464 .macro implement_rematrix shift, index_mask, mask_minus1, maxchan
466 // We can just leave the coefficients in registers in this case
472 smull AC0, AC1, CO0, SA0
473 .elseif \maxchan == 5
// The 64-bit sum of products is built up in AC1:AC0. Loads of the next pair
// of coefficients or samples are interleaved with the multiplies.
478 ldrd CO2, CO3, [PCO, #8]
479 smull AC0, AC1, CO0, SA0
480 ldrd SA2, SA3, [PSA, #8]
481 smlal AC0, AC1, CO1, SA1
482 ldrd CO0, CO1, [PCO, #16]
483 smlal AC0, AC1, CO2, SA2
484 ldrd SA0, SA1, [PSA, #16]
485 smlal AC0, AC1, CO3, SA3
486 smlal AC0, AC1, CO0, SA0
487 .else // \maxchan == 7
// Same pattern with one more pair of coefficients and samples.
492 ldrd CO0, CO1, [PCO, #8]
493 smull AC0, AC1, CO2, SA2
494 ldrd SA0, SA1, [PSA, #8]
495 smlal AC0, AC1, CO3, SA3
496 ldrd CO2, CO3, [PCO, #16]
497 smlal AC0, AC1, CO0, SA0
498 ldrd SA2, SA3, [PSA, #16]
499 smlal AC0, AC1, CO1, SA1
500 ldrd CO0, CO1, [PCO, #24]
501 smlal AC0, AC1, CO2, SA2
502 ldrd SA0, SA1, [PSA, #24]
503 smlal AC0, AC1, CO3, SA3
504 smlal AC0, AC1, CO0, SA0
// Reload NOISE, DCH and MASK from the three words at the top of the stack.
// The function entry pushes them there.
506 ldm sp, {NOISE, DCH, MASK}
507 smlal AC0, AC1, CO1, SA1
509 .if \index_mask == 63
// Add the 6-bit index field held in the top bits of INDEX to NOISE.
510 add NOISE, NOISE, INDEX, lsr #32-6
// Fetch one byte from PBL and step PBL on by MAX_CHANNELS bytes.
511 ldrb LSB, [PBL], #MAX_CHANNELS
// Add the low bits of INDEX (index2) into the index field. Overflow out of
// the top of the register is discarded, so the field wraps.
513 add INDEX, INDEX, INDEX, lsl #32-6
514 .else // \index_mask == 127
// Same steps with a 7-bit index field.
515 add NOISE, NOISE, INDEX, lsr #32-7
516 ldrb LSB, [PBL], #MAX_CHANNELS
518 add INDEX, INDEX, INDEX, lsl #32-7
// Decrement the i field of INDEX, which starts at bit 7.
520 sub INDEX, INDEX, #1<<7
// 64-bit add of NOISE shifted left by shift + 7 into AC1:AC0. The high word
// receives the sign of NOISE plus the carry.
// NOTE(review): the instruction that loads the noise value into NOISE is not
// visible here.
521 adds AC0, AC0, NOISE, lsl #\shift + 7
522 adc AC1, AC1, NOISE, asr #31
524 ldrb LSB, [PBL], #MAX_CHANNELS
525 sub INDEX, INDEX, #1<<7
// Step the sample pointer on by MAX_CHANNELS words.
527 add PSA, PSA, #MAX_CHANNELS*4
// Take bits 14 to 45 of the 64-bit accumulator as the 32-bit result.
528 mov AC0, AC0, lsr #14
529 orr AC0, AC0, AC1, lsl #18
535 str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA
// switch_on_maxchan: emit three implement_rematrix instances, for maxchan
// values 7, 5 and 1, forwarding the other three arguments unchanged.
// NOTE(review): the instructions that test maxchan and branch to labels 50
// and 51, and the macro terminator, are not visible in this excerpt.
540 .macro switch_on_maxchan shift, index_mask, mask_minus1
544 implement_rematrix \shift, \index_mask, \mask_minus1, 7
545 50: implement_rematrix \shift, \index_mask, \mask_minus1, 5
546 51: implement_rematrix \shift, \index_mask, \mask_minus1, 1
// switch_on_mask: emit two switch_on_maxchan instances, one with
// mask_minus1 = 1 and one, at label 40, with mask_minus1 = 0.
// NOTE(review): the test that branches to label 40 and the macro terminator
// are not visible in this excerpt.
549 .macro switch_on_mask shift, index_mask
552 switch_on_maxchan \shift, \index_mask, 1
553 40: switch_on_maxchan \shift, \index_mask, 0
// switch_on_au_size: emit switch_on_mask instances for the index field
// widths. One instance is given "undefined" as index_mask, the others 63
// and 127.
// NOTE(review): the conditionals, the test that branches to label 30 and the
// macro terminator are not visible in this excerpt.
556 .macro switch_on_au_size shift
558 switch_on_mask \shift, undefined
// Place v1 in the top 6 bits of INDEX (the index field for index_mask 63).
562 orr INDEX, INDEX, v1, lsl #32-6
563 switch_on_mask \shift, 63
// Place v1 in the top 7 bits of INDEX (the index field for index_mask 127).
564 30: orr INDEX, INDEX, v1, lsl #32-7
565 switch_on_mask \shift, 127
569 /* void ff_mlp_rematrix_channel_arm(int32_t *samples,
570 * const int32_t *coeffs,
571 * const uint8_t *bypassed_lsbs,
572 * const int8_t *noise_buffer,
574 * unsigned int dest_ch,
576 * unsigned int maxchan,
577 * int matrix_noise_shift,
578 * int access_unit_size_pow2,
581 function ff_mlp_rematrix_channel_arm, export=1
// NOTE(review): some parameter lines of the prototype above, the register
// save, the loads of the stacked arguments, the parameter checks that branch
// to label 99, and the exit at label 98 are not visible in this excerpt.
583 add v1, sp, #9*4 // point at arguments on stack
// Bias v2 by -MAX_CHANNELS. It is pushed below as the initial DCH, which is
// later used as a word index relative to a sample pointer that has already
// been advanced by MAX_CHANNELS words.
594 sub v2, v2, #MAX_CHANNELS
595 push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned
// Put v3 into the i field of INDEX, which starts at bit 7. Z is set when v3
// is zero.
596 movs INDEX, v3, lsl #7
597 beq 98f // just in case, do nothing if blockpos = 0
598 subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
// lr = 2 * v1 + carry
599 adc lr, v1, v1 // calculate index2 (C was set by preceding subs)
601 // Switch on matrix_noise_shift: values 0 and 1 are
602 // disproportionately common so do those in a form the branch
603 // predictor can accelerate. Values can only go up to 15.
// Table dispatch on v5 for the remaining values. The table slots for values
// 0 and 1 hold 0 and the first real target is 12f.
// NOTE(review): the compare and branches that handle values 0 and 1, and the
// local label 0 referenced as 0b, are not visible in this excerpt.
607 A ldr pc, [pc, v5, lsl #2]
608 T tbh [pc, v5, lsl #1]
// In ARM state a read of pc yields the address of the ldr plus 8, so the
// word table carries one extra leading filler word.
610 A .word 0, 0, 0, 12f, 13f, 14f, 15f, 16f, 17f, 18f, 19f, 20f, 21f, 22f, 23f, 24f, 25f
611 T .hword 0, 0, (12f - 0b) / 2, (13f - 0b) / 2, (14f - 0b) / 2, (15f - 0b) / 2
612 T .hword (16f - 0b) / 2, (17f - 0b) / 2, (18f - 0b) / 2, (19f - 0b) / 2
613 T .hword (20f - 0b) / 2, (21f - 0b) / 2, (22f - 0b) / 2, (23f - 0b) / 2, (24f - 0b) / 2, (25f - 0b) / 2
// One specialised copy of the loop per shift value 0 to 15, at labels 10 to 25.
614 10: switch_on_au_size 0
615 11: switch_on_au_size 1
616 12: switch_on_au_size 2
617 13: switch_on_au_size 3
618 14: switch_on_au_size 4
619 15: switch_on_au_size 5
620 16: switch_on_au_size 6
621 17: switch_on_au_size 7
622 18: switch_on_au_size 8
623 19: switch_on_au_size 9
624 20: switch_on_au_size 10
625 21: switch_on_au_size 11
626 22: switch_on_au_size 12
627 23: switch_on_au_size 13
628 24: switch_on_au_size 14
629 25: switch_on_au_size 15
633 99: // Can't handle these parameters, drop back to C
// Branch (not a call) to the C implementation, so it returns straight to the
// original caller.
635 b X(ff_mlp_rematrix_channel)