git.sesse.net Git - vlc/commitdiff
NEON converter: unroll fi32->s16n conversion
author Rémi Denis-Courmont <remi@remlab.net>
Mon, 28 Sep 2009 19:59:01 +0000 (22:59 +0300)
committer Rémi Denis-Courmont <remi@remlab.net>
Tue, 29 Sep 2009 19:07:48 +0000 (22:07 +0300)
The main loop now burns 3 cycles per 8 values, if I computed correctly.
This is not quite the main bottleneck, but it is pretty much always used
(since we output 16-bit PCM).

modules/audio_filter/converter/Modules.am
modules/audio_filter/converter/neon.c
modules/audio_filter/converter/neon_s32_s16.S [new file with mode: 0644]

index 6306500b486ae0838927ffe4c323fdb28787e1ae..1f0f611e14be3e52714e08ba4c513bc1b4d1c31f 100644 (file)
@@ -1,6 +1,8 @@
 SOURCES_converter_fixed = fixed.c
 SOURCES_converter_float = float.c
-SOURCES_converter_neon = neon.c
+SOURCES_converter_neon = \
+       neon_s32_s16.S \
+       neon.c
 SOURCES_a52tospdif = a52tospdif.c
 SOURCES_a52tofloat32 = a52tofloat32.c
 SOURCES_dtstospdif = dtstospdif.c
index 21fd13d61eab65b9388b518fbccf0fb81ba8f7b7..e1c682a00513058a8b4d4486d36eeb1a05b3f0b6 100644 (file)
@@ -28,6 +28,8 @@
 #include <vlc_filter.h>
 #include <vlc_cpu.h>
 
+#include <assert.h>
+
 static int Open (vlc_object_t *);
 
 vlc_module_begin ()
@@ -129,63 +131,32 @@ static block_t *Do_F32_S32 (filter_t *filter, block_t *inbuf)
     return inbuf;
 }
 
+void s32_s16_neon_unaligned (int16_t *out, const int32_t *in, unsigned nb);
+void s32_s16_neon (int16_t *out, const int32_t *in, unsigned nb);
+
 /**
  * Signed 32-bits fixed point to signed 16-bits integer
  */
 static block_t *Do_S32_S16 (filter_t *filter, block_t *inbuf)
 {
-    unsigned nb_samples = inbuf->i_nb_samples
-                     * aout_FormatNbChannels (&filter->fmt_in.audio);
-    int32_t *inp = (int32_t *)inbuf->p_buffer;
-    const int32_t *endp = inp + nb_samples;
-    int16_t *outp = (int16_t *)inp;
-
-    while (nb_samples & 3)
-    {
-        const int16_t roundup = 1 << 12;
-        asm volatile (
-            "qadd r0, %[inv], %[roundup]\n"
-            "ssat %[outv], #16, r0, asr #13\n"
-            : [outv] "=r" (*outp)
-            : [inv] "r" (*inp), [roundup] "r" (roundup)
-            : "r0");
-        inp++;
-        outp++;
-        nb_samples--;
-    }
+    const int32_t *in = (int32_t *)inbuf->p_buffer; /* 32-bit input samples */
+    int16_t *out = (int16_t *)in; /* results are written into the same buffer */
+    unsigned nb; /* sample count handed to each conversion routine */
 
-    if (nb_samples & 4)
-        asm volatile (
-            "vld1.s32 {q0}, [%[inp]]!\n"
-            "vrshrn.i32 d0, q0, #13\n"
-            "vst1.s16 {d0}, [%[outp]]!\n"
-            : [outp] "+r" (outp), [inp] "+r" (inp)
-            :
-            : "q0", "memory");
+    nb = ((-(uintptr_t)in) & 12) >> 2; /* 32-bit words (0-3) up to the next 16-byte boundary */
+    out += nb; /* fix up misalignment */
+    inbuf->p_buffer += 2 * nb; /* converted data begins 2*nb bytes into the old buffer */
 
-    if (nb_samples & 8)
-        asm volatile (
-            "vld1.s32 {q0-q1}, [%[inp]]!\n"
-            "vrshrn.i32 d0, q0, #13\n"
-            "vrshrn.i32 d1, q1, #13\n"
-            "vst1.s16 {q0}, [%[outp]]!\n"
-            : [outp] "+r" (outp), [inp] "+r" (inp)
-            :
-            : "q0", "q1", "memory");
+    s32_s16_neon_unaligned (out, in, nb); /* NOTE(review): out leads in by only 2*nb bytes; for nb >= 2 a store may hit a sample not yet read -- confirm */
+    in += nb; /* advances 4*nb bytes */
+    out += nb; /* advances 2*nb bytes, so out now equals in */
 
-    while (inp != endp)
-        asm volatile (
-            "vld1.s32 {q0-q1}, [%[inp]]!\n"
-            "vld1.s32 {q2-q3}, [%[inp]]!\n"
-            "vrshrn.s32 d0, q0, #13\n"
-            "vrshrn.s32 d1, q1, #13\n"
-            "vrshrn.s32 d2, q2, #13\n"
-            "vrshrn.s32 d3, q3, #13\n"
-            "vst1.s16 {q0-q1}, [%[outp]]!\n"
-            : [outp] "+r" (outp), [inp] "+r" (inp)
-            :
-            : "q0", "q1", "q2", "q3", "memory");
+    nb = inbuf->i_nb_samples
+         * aout_FormatNbChannels (&filter->fmt_in.audio) - nb; /* NOTE(review): wraps around if the block holds fewer than nb samples -- confirm */
+    assert (!(((uintptr_t)in) & 15)); /* input must sit on a 16-byte boundary here */
+    assert (!(((uintptr_t)out) & 15)); /* and so must the output */
 
+    s32_s16_neon (out, in, nb); /* convert the remaining samples */
     inbuf->i_buffer /= 2;
     return inbuf;
 }
diff --git a/modules/audio_filter/converter/neon_s32_s16.S b/modules/audio_filter/converter/neon_s32_s16.S
new file mode 100644 (file)
index 0000000..88effca
--- /dev/null
@@ -0,0 +1,100 @@
+ @*****************************************************************************
+ @ neon_s32_s16.S : ARM NEONv1 fi32 to s16n audio sample conversion
+ @*****************************************************************************
+ @ Copyright (C) 2009 Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU General Public License as published by
+ @ the Free Software Foundation; either version 2 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ @ GNU General Public License for more details.
+ @
+ @ You should have received a copy of the GNU General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+       .fpu neon
+       .text
+
+#define        OUT     r0
+#define        IN      r1
+#define        N       r2
+#define        BUF     r3
+#define HALF   ip
+
+       .align
+       .global s32_s16_neon
+       .type   s32_s16_neon, %function
+       @ Converts fixed-point 32-bits to signed 16-bits
+       @ Input and output must be on 128-bits boundary
+s32_s16_neon:
+       pld             [IN]    @ preload hint for the first input bytes
+2:     @ (no branch in this file targets this label)
+       cmp             N,      #8
+       blt             s32_s16_neon_unaligned  @ under 8 samples: one-by-one routine
+       vld1.s32        {q8-q9},        [IN,:128]!      @ load 8 samples, IN += 32
+
+3:     @ Main loop
+       pld             [IN, #64]       @ preload hint, 64 bytes past IN
+       sub             N,      #8      @ account for the 8 samples held in q8-q9
+       vqrshrn.s32     d16,    q8,     #13     @ d16 = 4 results: rounding shift right by 13, saturated to 16 bits
+       vqrshrn.s32     d17,    q9,     #13     @ d17 = next 4 results
+       cmp             N,      #8
+       blt             4f      @ under 8 left: store d16-d17 and leave the loop
+       vld1.s32        {q10-q11},      [IN,:128]!      @ second group of 8 samples
+       sub             N,      #8
+       vqrshrn.s32     d18,    q10,    #13
+       vqrshrn.s32     d19,    q11,    #13
+       cmp             N,      #8
+       blt             5f      @ under 8 left: store d16-d19 and leave the loop
+       vld1.s32        {q12-q13},      [IN,:128]!      @ third group of 8 samples
+       sub             N,      #8
+       vqrshrn.s32     d20,    q12,    #13
+       vqrshrn.s32     d21,    q13,    #13
+       vst1.s16        {d16-d19},      [OUT,:128]!     @ store the first 16 results, OUT += 32
+       cmp             N,      #8
+       blt             6f      @ under 8 left: store d20-d21 and leave the loop
+       vld1.s32        {q8-q9},        [IN,:128]!      @ input for the next iteration
+       vst1.s16        {d20-d21},      [OUT,:128]!     @ store the last 8 results
+       b               3b
+4:
+       vst1.s16        {d16-d17},      [OUT,:128]!     @ flush 8 pending results
+       b               7f
+5:
+       vst1.s16        {d16-d19},      [OUT,:128]!     @ flush 16 pending results
+       b               7f
+6:
+       vst1.s16        {d20-d21},      [OUT,:128]!     @ flush 8 pending results
+7:     @ fewer than 8 samples remain
+       cmp             N,      #4
+       blt             s32_s16_neon_unaligned  @ under 4 left: one-by-one routine
+       vld1.s32        {q8},           [IN,:128]!      @ one group of 4 samples
+       sub             N,      #4
+       vqrshrn.s32     d16,    q8,     #13
+       vst1.s16        {d16},          [OUT,:64]!      @ 8-byte store, 64-bit alignment hint
+
+       @ Fall through for last 0-3 samples
+
+       .global s32_s16_neon_unaligned
+       .type   s32_s16_neon_unaligned, %function
+       @ Converts fixed-point 32-bits to signed 16-bits
+       @ Input must be on 32-bits boundary, output on 16-bits
+s32_s16_neon_unaligned:
+       mov             HALF,   #4096   @ rounding bias, 1 << 12
+1:
+       cmp             N,      #0
+       bxeq            lr      @ return once no samples are left
+
+       ldr             BUF,    [IN]    @ fetch one 32-bit sample
+       add             IN,     #4
+       add             OUT,    #2
+       qadd            BUF,    HALF,   BUF     @ saturating add of the bias
+       sub             N,      #1
+       ssat            BUF,    #16,    BUF, asr #13    @ shift right by 13, saturate to signed 16 bits
+       strh            BUF,    [OUT, #-2]      @ OUT was already advanced, hence the -2 offset
+       b               1b