From: Rémi Denis-Courmont Date: Mon, 28 Sep 2009 19:59:01 +0000 (+0300) Subject: NEON converter: unroll fi32->s16n conversion X-Git-Tag: 1.1.0-ff~3132 X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=56bbc3365ba0ec871b68564ab9b6b15db46194b9;p=vlc NEON converter: unroll fi32->s16n conversion Main loop now burns 3 cycles per 8 values, if I compute right. This is not quite the main bottleneck, but it is pretty much always used (since we output 16-bit PCM). --- diff --git a/modules/audio_filter/converter/Modules.am b/modules/audio_filter/converter/Modules.am index 6306500b48..1f0f611e14 100644 --- a/modules/audio_filter/converter/Modules.am +++ b/modules/audio_filter/converter/Modules.am @@ -1,6 +1,8 @@ SOURCES_converter_fixed = fixed.c SOURCES_converter_float = float.c -SOURCES_converter_neon = neon.c +SOURCES_converter_neon = \ + neon_s32_s16.S \ + neon.c SOURCES_a52tospdif = a52tospdif.c SOURCES_a52tofloat32 = a52tofloat32.c SOURCES_dtstospdif = dtstospdif.c diff --git a/modules/audio_filter/converter/neon.c b/modules/audio_filter/converter/neon.c index 21fd13d61e..e1c682a005 100644 --- a/modules/audio_filter/converter/neon.c +++ b/modules/audio_filter/converter/neon.c @@ -28,6 +28,8 @@ #include #include +#include <assert.h> + static int Open (vlc_object_t *); vlc_module_begin () @@ -129,63 +131,32 @@ static block_t *Do_F32_S32 (filter_t *filter, block_t *inbuf) return inbuf; } +void s32_s16_neon_unaligned (int16_t *out, const int32_t *in, unsigned nb); +void s32_s16_neon (int16_t *out, const int32_t *in, unsigned nb); + /** * Signed 32-bits fixed point to signed 16-bits integer */ static block_t *Do_S32_S16 (filter_t *filter, block_t *inbuf) { - unsigned nb_samples = inbuf->i_nb_samples - * aout_FormatNbChannels (&filter->fmt_in.audio); - int32_t *inp = (int32_t *)inbuf->p_buffer; - const int32_t *endp = inp + nb_samples; - int16_t *outp = (int16_t *)inp; - - while (nb_samples & 3) - { - const int16_t roundup = 1 << 12; - asm volatile ( - "qadd r0, %[inv], %[roundup]\n" -
"ssat %[outv], #16, r0, asr #13\n" - : [outv] "=r" (*outp) - : [inv] "r" (*inp), [roundup] "r" (roundup) - : "r0"); - inp++; - outp++; - nb_samples--; - } + const int32_t *in = (int32_t *)inbuf->p_buffer; + int16_t *out = (int16_t *)in; + unsigned nb; - if (nb_samples & 4) - asm volatile ( - "vld1.s32 {q0}, [%[inp]]!\n" - "vrshrn.i32 d0, q0, #13\n" - "vst1.s16 {d0}, [%[outp]]!\n" - : [outp] "+r" (outp), [inp] "+r" (inp) - : - : "q0", "memory"); + nb = ((-(uintptr_t)in) & 12) >> 2; + out += nb; /* fix up misalignment */ + inbuf->p_buffer += 2 * nb; - if (nb_samples & 8) - asm volatile ( - "vld1.s32 {q0-q1}, [%[inp]]!\n" - "vrshrn.i32 d0, q0, #13\n" - "vrshrn.i32 d1, q1, #13\n" - "vst1.s16 {q0}, [%[outp]]!\n" - : [outp] "+r" (outp), [inp] "+r" (inp) - : - : "q0", "q1", "memory"); + s32_s16_neon_unaligned (out, in, nb); + in += nb; + out += nb; - while (inp != endp) - asm volatile ( - "vld1.s32 {q0-q1}, [%[inp]]!\n" - "vld1.s32 {q2-q3}, [%[inp]]!\n" - "vrshrn.s32 d0, q0, #13\n" - "vrshrn.s32 d1, q1, #13\n" - "vrshrn.s32 d2, q2, #13\n" - "vrshrn.s32 d3, q3, #13\n" - "vst1.s16 {q0-q1}, [%[outp]]!\n" - : [outp] "+r" (outp), [inp] "+r" (inp) - : - : "q0", "q1", "q2", "q3", "memory"); + nb = inbuf->i_nb_samples + * aout_FormatNbChannels (&filter->fmt_in.audio) - nb; + assert (!(((uintptr_t)in) & 15)); + assert (!(((uintptr_t)out) & 15)); + s32_s16_neon (out, in, nb); inbuf->i_buffer /= 2; return inbuf; } diff --git a/modules/audio_filter/converter/neon_s32_s16.S b/modules/audio_filter/converter/neon_s32_s16.S new file mode 100644 index 0000000000..88effca4e5 --- /dev/null +++ b/modules/audio_filter/converter/neon_s32_s16.S @@ -0,0 +1,100 @@ + @***************************************************************************** + @ neon_s32_s16.S : ARM NEONv1 fi32 to s16n audio sample conversion + @***************************************************************************** + @ Copyright (C) 2009 Rémi Denis-Courmont + @ + @ This program is free software; you can redistribute it 
and/or modify + @ it under the terms of the GNU General Public License as published by + @ the Free Software Foundation; either version 2 of the License, or + @ (at your option) any later version. + @ + @ This program is distributed in the hope that it will be useful, + @ but WITHOUT ANY WARRANTY; without even the implied warranty of + @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + @ GNU General Public License for more details. + @ + @ You should have received a copy of the GNU General Public License + @ along with this program; if not, write to the Free Software Foundation, + @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA. + @****************************************************************************/ + + .fpu neon + .text + +#define OUT r0 +#define IN r1 +#define N r2 +#define BUF r3 +#define HALF ip + + .align + .global s32_s16_neon + .type s32_s16_neon, %function + @ Converts fixed-point 32-bits to signed 16-bits + @ Input and output must be on 128-bits boundary +s32_s16_neon: + pld [IN] +2: + cmp N, #8 + blt s32_s16_neon_unaligned + vld1.s32 {q8-q9}, [IN,:128]! + +3: @ Main loop + pld [IN, #64] + sub N, #8 + vqrshrn.s32 d16, q8, #13 + vqrshrn.s32 d17, q9, #13 + cmp N, #8 + blt 4f + vld1.s32 {q10-q11}, [IN,:128]! + sub N, #8 + vqrshrn.s32 d18, q10, #13 + vqrshrn.s32 d19, q11, #13 + cmp N, #8 + blt 5f + vld1.s32 {q12-q13}, [IN,:128]! + sub N, #8 + vqrshrn.s32 d20, q12, #13 + vqrshrn.s32 d21, q13, #13 + vst1.s16 {d16-d19}, [OUT,:128]! + cmp N, #8 + blt 6f + vld1.s32 {q8-q9}, [IN,:128]! + vst1.s16 {d20-d21}, [OUT,:128]! + b 3b +4: + vst1.s16 {d16-d17}, [OUT,:128]! + b 7f +5: + vst1.s16 {d16-d19}, [OUT,:128]! + b 7f +6: + vst1.s16 {d20-d21}, [OUT,:128]! +7: + cmp N, #4 + blt s32_s16_neon_unaligned + vld1.s32 {q8}, [IN,:128]! + sub N, #4 + vqrshrn.s32 d16, q8, #13 + vst1.s16 {d16}, [OUT,:64]! 
+ + @ Fall through for last 0-3 samples + + .global s32_s16_neon_unaligned + .type s32_s16_neon_unaligned, %function + @ Converts fixed-point 32-bits to signed 16-bits + @ Input must be on 32-bits boundary, output on 16-bits +s32_s16_neon_unaligned: + mov HALF, #4096 +1: + cmp N, #0 + bxeq lr + + ldr BUF, [IN] + add IN, #4 + add OUT, #2 + qadd BUF, HALF, BUF + sub N, #1 + ssat BUF, #16, BUF, asr #13 + strh BUF, [OUT, #-2] + b 1b