2 * Copyright (c) 2014 RISC OS Open Ltd
3 * Author: Ben Avison <bavison@riscosopen.org>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
// Shared definitions from libavutil (contents not visible in this excerpt).
22 #include "libavutil/arm/asm.S"
// Compile-time limits. They are used below to compute byte offsets into the
// coefficient, state and sample buffers.
24 #define MAX_CHANNELS 8
25 #define MAX_FIR_ORDER 8
26 #define MAX_IIR_ORDER 4
27 #define MAX_RATEFACTOR 4
// Evaluates to 160 with the values above.
28 #define MAX_BLOCKSIZE (40 * MAX_RATEFACTOR)
// Recursive helper used to lay out the dispatch tables that follow the
// "ldr ..., [pc, ...]" / "tbh" instructions further down: it takes the first
// argument separately and re-invokes itself on the remaining arguments.
// NOTE(review): the lines that emit the entry for the first argument and that
// stop the recursion are not visible in this excerpt.
46 .macro branch_pic_label first, remainder:vararg
50 branch_pic_label \remainder
54 // Some macros that do loads/multiplies where the register number is determined
55 // from an assembly-time expression. Boy is GNU assembler's syntax ugly...
// load: load one word from [base + offset] into register <group><index>,
// where index is an expression. %(...) hands the evaluated value of the
// expression to the worker macro load_.
57 .macro load group, index, base, offset
59 load_ \group, %(\index), \base, \offset
// load_: worker for load. The group and index arguments are pasted together
// to form the destination register name (e.g. CO and 2 give CO2).
63 .macro load_ group, index, base, offset
64 ldr \group\index, [\base, #\offset]
// loadd: load two consecutive words into <group><index> and <group><index+1>.
67 .macro loadd group, index, base, offset
69 loadd_ \group, %(\index), %(\index+1), \base, \offset
// loadd_: worker for loadd. Two forms are present: a pair of ldr instructions
// (offsets +0 and +4) and a single ldrd covering the same two words.
// NOTE(review): the conditional directives choosing between the two forms are
// not visible in this excerpt; the "A" prefix presumably comes from the
// included asm.S - confirm its meaning there.
73 .macro loadd_ group, index0, index1, base, offset
75 A ldr \group\index0, [\base, #\offset]
76 A ldr \group\index1, [\base, #(\offset) + 4]
78 ldrd \group\index0, \group\index1, [\base, #\offset]
// multiply: multiply coefficient register CO<index> by state register
// ST<index>, where index is an assembly-time expression that is evaluated
// with %(...) before being passed to the worker macro multiply_.
//   accumulate - presumably selects the accumulating forms (smlal / mla)
//   long       - presumably selects the 64-bit forms (smull / smlal), which
//                use AC0 as the low word and AC1 as the high word; the
//                32-bit forms (mul / mla) write AC0 only
// NOTE(review): the conditional directives that pick one of the four
// instructions are not visible in this excerpt, so the mapping above is
// inferred from the parameter names - confirm against the full file.
82 .macro multiply index, accumulate, long
84 multiply_ %(\index), \accumulate, \long
88 .macro multiply_ index, accumulate, long
91 smlal AC0, AC1, CO\index, ST\index
93 smull AC0, AC1, CO\index, ST\index
97 mla AC0, CO\index, ST\index, AC0
99 mul AC0, CO\index, ST\index
104 // A macro to update the load register number and load offsets
// NOTE(review): the .macro header line and several conditional directives of
// this macro are not visible in this excerpt; "howmany" is its argument.
// LOAD_REG cycles through 0..3; both offsets advance by one 32-bit word per
// register consumed.
107 .set LOAD_REG, (LOAD_REG + \howmany) & 3
108 .set OFFSET_CO, OFFSET_CO + 4 * \howmany
109 .set OFFSET_ST, OFFSET_ST + 4 * \howmany
111 .set FIR_REMAIN, FIR_REMAIN - \howmany
// Restart both offsets at the values that implement_filter also uses as the
// starting point when there are only IIR terms.
// NOTE(review): the condition guarding this reset is not visible here -
// presumably it applies once the FIR terms are exhausted.
113 .set OFFSET_CO, 4 * MAX_FIR_ORDER
114 .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
116 .elseif IIR_REMAIN > 0
117 .set IIR_REMAIN, IIR_REMAIN - \howmany
121 // Macro to implement the inner loop for one specific combination of parameters
// Parameters (all assembly-time constants):
//   mask_minus1       - flag chosen by the entry point (presumably "mask is -1";
//                       the test itself is not visible in this excerpt)
//   shift_0, shift_8  - flags for the special-cased filter shifts 0 and 8
//   iir_taps, fir_taps - number of IIR and FIR taps handled by this instance
// NOTE(review): many directives of this macro (.if/.else/.endif, register
// aliases, loop labels and branches) are not visible in this excerpt.
123 .macro implement_filter mask_minus1, shift_0, shift_8, iir_taps, fir_taps
124 .set TOTAL_TAPS, \iir_taps + \fir_taps
126 // Deal with register allocation...
// DEFINED_SHIFT / SHUFFLE_SHIFT are assembly-time flags set in the branches
// below; their consumers are not visible in this excerpt.
127 .set DEFINED_SHIFT, 0
129 .set SHUFFLE_SHIFT, 0
134 // Little register pressure in this case - just keep MASK where it was
142 // AC1 is unused with shift 0
150 // All coefficients are preloaded (so pointer not needed)
158 .else // shift not 0 or 8
160 // All coefficients are preloaded, and at least one CO register is unused
163 .set DEFINED_SHIFT, 1
164 .set SHUFFLE_SHIFT, 1
167 .set DEFINED_SHIFT, 1
168 .set SHUFFLE_SHIFT, 1
175 .elseif TOTAL_TAPS == 4
176 // All coefficients are preloaded
178 .set DEFINED_SHIFT, 1
179 .set SHUFFLE_SHIFT, 1
193 .set DEFINED_SHIFT, 1
200 // Preload coefficients if possible
// Each preload advances LOAD_REG (modulo 4) and OFFSET_CO by one word.
209 load CO, LOAD_REG, PCO, OFFSET_CO
210 .set LOAD_REG, (LOAD_REG + 1) & 3
211 .set OFFSET_CO, OFFSET_CO + 4
// Continue from byte offset 4 * MAX_FIR_ORDER in the coefficient array, the
// same offset that is used below when there are only IIR terms.
213 .set OFFSET_CO, 4 * MAX_FIR_ORDER
215 load CO, LOAD_REG, PCO, OFFSET_CO
216 .set LOAD_REG, (LOAD_REG + 1) & 3
217 .set OFFSET_CO, OFFSET_CO + 4
221 // Move mask/shift to final positions if necessary
222 // Need to do this after preloading, because in some cases we
223 // reuse the coefficient pointer register
234 // Things simplify a lot in this case
235 // In fact this could be pipelined further if it's worth it...
// Write ST0 to the state buffer at a fixed offset and to the sample buffer;
// PSAMP is post-incremented by 4 * MAX_CHANNELS bytes.
242 str ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
243 str ST0, [PSAMP], #4 * MAX_CHANNELS
// Assembly-time counters of the taps that still have to be loaded.
252 .set FIR_REMAIN, \fir_taps
253 .set IIR_REMAIN, \iir_taps
254 .if FIR_REMAIN == 0 // only IIR terms
255 .set OFFSET_CO, 4 * MAX_FIR_ORDER
256 .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
// Multiplies start at the same register index as the loads.
261 .set MUL_REG, LOAD_REG
265 .if FIR_REMAIN != 0 || IIR_REMAIN != 0
268 load CO, LOAD_REG, PCO, OFFSET_CO
270 load ST, LOAD_REG, PST, OFFSET_ST
272 .elseif COUNTER == 1 && (\fir_taps & 1) == 0
274 load CO, LOAD_REG, PCO, OFFSET_CO
276 load ST, LOAD_REG, PST, OFFSET_ST
278 .elseif LOAD_BANK == 0
// A single-word load is used when exactly one IIR term is left; otherwise two
// words are loaded at once.
280 .if FIR_REMAIN == 0 && IIR_REMAIN == 1
281 load CO, LOAD_REG, PCO, OFFSET_CO
283 loadd CO, LOAD_REG, PCO, OFFSET_CO
288 .if FIR_REMAIN == 0 && IIR_REMAIN == 1
289 load ST, LOAD_REG, PST, OFFSET_ST
292 loadd ST, LOAD_REG, PST, OFFSET_ST
299 // Do interleaved multiplies, slightly delayed
// The "accumulate" argument becomes true once COUNTER exceeds 2; the "long"
// argument is true unless shift_0 is set.
301 multiply MUL_REG, COUNTER > 2, !\shift_0
302 .set MUL_REG, (MUL_REG + 1) & 3
304 .set COUNTER, COUNTER + 1
307 // Post-process the result of the multiplies
// Reload SHIFT and MASK from the stack. sp + 9*4 is the address the entry
// point uses for its stack arguments, so these are presumably filter_shift
// and mask - confirm that sp has not moved in between.
309 ldr SHIFT, [sp, #9*4 + 0*4]
312 ldr MASK, [sp, #9*4 + 1*4]
// Merge the low bits of the high word AC1 into the top 8 bits of AC0.
// NOTE(review): presumably paired with a right shift of AC0 by 8 that is not
// visible in this excerpt.
318 orr AC0, AC0, AC1, lsl #24
// Variable shift: AC0 is shifted right by SHIFT, then bits of AC1 shifted left
// by ST3 are brought in. NOTE(review): ST3 presumably holds 32 - SHIFT here.
321 mov AC0, AC0, lsr SHIFT
322 A orr AC0, AC0, AC1, lsl ST3
323 T mov AC1, AC1, lsl ST3
// Store the results: ST2 to the state buffer, ST3 to the sample buffer
// (post-incrementing PSAMP by 4 * MAX_CHANNELS bytes).
334 str ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
335 str ST3, [PSAMP], #4 * MAX_CHANNELS
// Dispatch on firorder (held in a3) through a table of label offsets, then
// instantiate implement_filter once per possible FIR tap count (labels
// 70..78 correspond to 0..8 FIR taps). The other parameters are passed
// through unchanged.
// Two dispatch instructions are given with "A" and "T" prefixes (tbh is a
// Thumb-2 table-branch instruction); the prefixes presumably come from the
// included asm.S.
// NOTE(review): the entries for 5..8 taps are emitted by separate
// branch_pic_label lines, presumably each guarded by a condition on iir_taps
// (see the range comment on the ldr); those directives are not visible here.
348 .macro switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps
349 A ldr CO0, [pc, a3, lsl #2] // firorder is in range 0-(8-iir_taps)
351 T tbh [pc, a3, lsl #1]
353 branch_pic_label (70f - 0b), (71f - 0b), (72f - 0b), (73f - 0b)
354 branch_pic_label (74f - 0b)
356 branch_pic_label (75f - 0b)
358 branch_pic_label (76f - 0b)
360 branch_pic_label (77f - 0b)
362 branch_pic_label (78f - 0b)
367 70: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
368 71: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
369 72: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
370 73: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
371 74: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
373 75: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
375 76: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
377 77: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
379 78: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
// Dispatch on iirorder (held in a4) through a table of label offsets, then
// expand switch_on_fir_taps once per possible IIR tap count (labels 60..64
// correspond to 0..4 IIR taps). The mask/shift flags are passed through
// unchanged.
386 .macro switch_on_iir_taps mask_minus1, shift_0, shift_8
387 A ldr CO0, [pc, a4, lsl #2] // iirorder is in range 0-4
389 T tbh [pc, a4, lsl #1]
391 branch_pic_label (60f - 0b), (61f - 0b), (62f - 0b), (63f - 0b)
392 branch_pic_label (64f - 0b)
393 60: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 0
394 61: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 1
395 62: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 2
396 63: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 3
397 64: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 4
400 /* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
401 * int firorder, int iirorder,
402 * unsigned int filter_shift, int32_t mask,
403 * int blocksize, int32_t *sample_buffer);
// Entry point. firorder and iirorder are used from a3 and a4 by the dispatch
// macros; the remaining arguments are read from the stack below.
// NOTE(review): the register save, the test that selects between the
// mask_minus1 variants, the branches to labels 10..50 and the function
// epilogue are not visible in this excerpt.
405 function ff_mlp_filter_channel_arm, export=1
407 add v1, sp, #9*4 // point at arguments on stack
// Presumably filter_shift, mask, blocksize and sample_buffer, following the
// order of the prototype (ST0 is treated as the shift just below).
408 ldm v1, {ST0,ST1,I,PSAMP}
// lsl #29 keeps only the low three bits of the shift, so Z is set when the
// shift is 0 or 8; the carry flag receives bit 3, which tells the two apart.
411 movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
// Six specialisations follow; the arguments are mask_minus1, shift_0, shift_8.
414 switch_on_iir_taps 1, 1, 0
415 10: switch_on_iir_taps 1, 0, 1
416 20: switch_on_iir_taps 1, 0, 0
417 30: movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
420 switch_on_iir_taps 0, 1, 0
421 40: switch_on_iir_taps 0, 0, 1
422 50: switch_on_iir_taps 0, 0, 0
441 /********************************************************************/
// Register aliases for the rematrix code.
// NOTE(review): most of the alias definitions are not visible in this excerpt.
443 PSA .req a1 // samples
444 PCO .req a2 // coeffs
445 PBL .req a3 // bypassed_lsbs
// DCH shares a register with SA2, so it is only valid while SA2 is not in use.
459 DCH .req SA2 // dest_ch
462 // INDEX is used as follows:
463 // bits 0..6 index2 (values up to 17, but wider so that we can
464 // add to index field without needing to mask)
465 // bits 7..14 i (values up to 160)
466 // bit 15 underflow detect for i
467 // bits 25..31 (if access_unit_size_pow2 == 128) \ index
468 // bits 26..31 (if access_unit_size_pow2 == 64) /
// Body of the rematrix loop for one combination of assembly-time parameters:
//   shift       - noise shift (used in the "lsl #shift + 7" operand below)
//   index_mask  - 63 or 127, selecting a 6-bit or 7-bit index field in INDEX
//   mask_minus1 - flag chosen by switch_on_mask (its use is not visible here)
//   maxchan     - 1, 5 or 7, selecting how many products are summed
// NOTE(review): the initial loads, several conditionals, the loop label and
// branch, and the macro terminator are not visible in this excerpt.
470 .macro implement_rematrix shift, index_mask, mask_minus1, maxchan
472 // We can just leave the coefficients in registers in this case
478 smull AC0, AC1, CO0, SA0
479 .elseif \maxchan == 5
// Loads of the next coefficient/sample pair are interleaved with the
// multiply-accumulates that consume the previously loaded pair.
484 ldrd CO2, CO3, [PCO, #8]
485 smull AC0, AC1, CO0, SA0
486 ldrd SA2, SA3, [PSA, #8]
487 smlal AC0, AC1, CO1, SA1
488 ldrd CO0, CO1, [PCO, #16]
489 smlal AC0, AC1, CO2, SA2
490 ldrd SA0, SA1, [PSA, #16]
491 smlal AC0, AC1, CO3, SA3
492 smlal AC0, AC1, CO0, SA0
493 .else // \maxchan == 7
498 ldrd CO0, CO1, [PCO, #8]
499 smull AC0, AC1, CO2, SA2
500 ldrd SA0, SA1, [PSA, #8]
501 smlal AC0, AC1, CO3, SA3
502 ldrd CO2, CO3, [PCO, #16]
503 smlal AC0, AC1, CO0, SA0
504 ldrd SA2, SA3, [PSA, #16]
505 smlal AC0, AC1, CO1, SA1
506 ldrd CO0, CO1, [PCO, #24]
507 smlal AC0, AC1, CO2, SA2
508 ldrd SA0, SA1, [PSA, #24]
509 smlal AC0, AC1, CO3, SA3
510 smlal AC0, AC1, CO0, SA0
// Reload the three words that the entry point pushed as NOISE, DCH and MASK.
// DCH is an alias of SA2, whose last use as a sample register is above.
512 ldm sp, {NOISE, DCH, MASK}
513 smlal AC0, AC1, CO1, SA1
515 .if \index_mask == 63
// Add the 6-bit index field (top bits of INDEX) to NOISE.
516 add NOISE, NOISE, INDEX, lsr #32-6
// Fetch one byte through PBL and step it on by MAX_CHANNELS bytes.
517 ldrb LSB, [PBL], #MAX_CHANNELS
// Add the low bits of INDEX (index2) into the index field; overflow out of
// bit 31 is discarded, so no explicit masking is needed.
519 add INDEX, INDEX, INDEX, lsl #32-6
520 .else // \index_mask == 127
// Same as above with a 7-bit index field.
521 add NOISE, NOISE, INDEX, lsr #32-7
522 ldrb LSB, [PBL], #MAX_CHANNELS
524 add INDEX, INDEX, INDEX, lsl #32-7
// Decrement the i field (bit 7 upwards) of INDEX.
526 sub INDEX, INDEX, #1<<7
// Add NOISE << (shift + 7) to the low word; the high word receives the carry
// plus the sign of NOISE (0 or -1).
// NOTE(review): NOISE presumably holds a sign-extended noise byte loaded by
// an instruction that is not visible in this excerpt.
527 adds AC0, AC0, NOISE, lsl #\shift + 7
528 adc AC1, AC1, NOISE, asr #31
530 ldrb LSB, [PBL], #MAX_CHANNELS
531 sub INDEX, INDEX, #1<<7
// Advance the sample pointer by MAX_CHANNELS words.
533 add PSA, PSA, #MAX_CHANNELS*4
// AC0 = bits 14..45 of the 64-bit accumulator AC1:AC0.
534 mov AC0, AC0, lsr #14
535 orr AC0, AC0, AC1, lsl #18
541 str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA
// Instantiate implement_rematrix for maxchan 7, 5 and 1, passing the other
// parameters through unchanged. Labels 50 and 51 mark the second and third
// variants.
// NOTE(review): the tests and branches that select a variant are not visible
// in this excerpt.
546 .macro switch_on_maxchan shift, index_mask, mask_minus1
550 implement_rematrix \shift, \index_mask, \mask_minus1, 7
551 50: implement_rematrix \shift, \index_mask, \mask_minus1, 5
552 51: implement_rematrix \shift, \index_mask, \mask_minus1, 1
// Expand switch_on_maxchan twice, with mask_minus1 set to 1 and then to 0
// (label 40 marks the second variant).
// NOTE(review): the test and branch that choose between the two are not
// visible in this excerpt.
555 .macro switch_on_mask shift, index_mask
558 switch_on_maxchan \shift, \index_mask, 1
559 40: switch_on_maxchan \shift, \index_mask, 0
// Expand switch_on_mask for the index_mask variants: "undefined", 63 and 127.
// NOTE(review): the conditionals and the branch to label 30 are not visible
// in this excerpt, so when each variant is chosen cannot be stated here.
562 .macro switch_on_au_size shift
564 switch_on_mask \shift, undefined
// Place v1 in the top 6 bits of INDEX (the index field for an access unit
// size of 64, per the INDEX layout comment).
568 orr INDEX, INDEX, v1, lsl #32-6
569 switch_on_mask \shift, 63
// Same with a 7-bit index field (access unit size of 128).
570 30: orr INDEX, INDEX, v1, lsl #32-7
571 switch_on_mask \shift, 127
575 /* void ff_mlp_rematrix_channel_arm(int32_t *samples,
576 * const int32_t *coeffs,
577 * const uint8_t *bypassed_lsbs,
578 * const int8_t *noise_buffer,
580 * unsigned int dest_ch,
582 * unsigned int maxchan,
583 * int matrix_noise_shift,
584 * int access_unit_size_pow2,
// Entry point. Sets up INDEX and the three stacked values NOISE/DCH/MASK,
// then dispatches on matrix_noise_shift to one of sixteen switch_on_au_size
// expansions (labels 10..25 correspond to shifts 0..15).
// NOTE(review): the register save, the loading of the stack arguments into
// v1..v6, the direct branches for shifts 0 and 1, labels 98/99 set-up and the
// epilogue are not fully visible in this excerpt.
587 function ff_mlp_rematrix_channel_arm, export=1
589 add v1, sp, #9*4 // point at arguments on stack
// v2 becomes DCH (see the push below). Subtracting MAX_CHANNELS compensates
// for PSA being advanced by MAX_CHANNELS words before the final store in
// implement_rematrix.
600 sub v2, v2, #MAX_CHANNELS
601 push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned
// Put the count from v3 into the i field of INDEX (bit 7 upwards); Z is set
// when the result is zero.
602 movs INDEX, v3, lsl #7
603 beq 98f // just in case, do nothing if blockpos = 0
604 subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
// lr = 2 * v1 + carry.
605 adc lr, v1, v1 // calculate index2 (C was set by preceding subs)
607 // Switch on matrix_noise_shift: values 0 and 1 are
608 // disproportionately common so do those in a form the branch
609 // predictor can accelerate. Values can only go up to 15.
613 A ldr v5, [pc, v5, lsl #2]
615 T tbh [pc, v5, lsl #1]
// The first two table entries are zero placeholders; per the comment above,
// shifts 0 and 1 are presumably dispatched separately (code not visible).
617 branch_pic_label 0, 0, (12f - 0b), (13f - 0b)
618 branch_pic_label (14f - 0b), (15f - 0b), (16f - 0b), (17f - 0b)
619 branch_pic_label (18f - 0b), (19f - 0b), (20f - 0b), (21f - 0b)
620 branch_pic_label (22f - 0b), (23f - 0b), (24f - 0b), (25f - 0b)
621 10: switch_on_au_size 0
622 11: switch_on_au_size 1
623 12: switch_on_au_size 2
624 13: switch_on_au_size 3
625 14: switch_on_au_size 4
626 15: switch_on_au_size 5
627 16: switch_on_au_size 6
628 17: switch_on_au_size 7
629 18: switch_on_au_size 8
630 19: switch_on_au_size 9
631 20: switch_on_au_size 10
632 21: switch_on_au_size 11
633 22: switch_on_au_size 12
634 23: switch_on_au_size 13
635 24: switch_on_au_size 14
636 25: switch_on_au_size 15
640 99: // Can't handle these parameters, drop back to C
// Tail-branch to the C implementation.
642 b X(ff_mlp_rematrix_channel)