+ @*****************************************************************************
+ @ neon_s32_s16.S : ARM NEONv1 fi32 to s16n audio sample conversion
+ @*****************************************************************************
+ @ Copyright (C) 2009 Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU General Public License as published by
+ @ the Free Software Foundation; either version 2 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU General Public License for more details.
+ @
+ @ You should have received a copy of the GNU General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @*****************************************************************************
+
+ .fpu neon @ let the assembler accept the NEON instructions used below
+ .text @ emit the following code into the text section
+ @ Register aliases, substituted by the C preprocessor. OUT is the store pointer, IN the load pointer, N the number of samples left, BUF a scratch register for the scalar loop, HALF the rounding constant. Presumably r0-r2 are the three C arguments under the ARM procedure call standard (TODO confirm against the C prototype).
#define OUT r0
#define IN r1
#define N r2
#define BUF r3
#define HALF ip
+
+ .align @ align the entry point (no explicit boundary given, the assembler default applies)
+ .global s32_s16_neon @ export the symbol
+ .type s32_s16_neon, %function @ mark the symbol as a function
+ @ Converts N fixed-point 32-bit samples read from IN into signed 16-bit samples written to OUT: each value is shifted right by 13 bits with rounding, then saturated to 16 bits.
+ @ Input and output must be on a 128-bit boundary (the vector loads and stores carry :128 alignment hints). Uses q8-q13 as vector scratch and falls through into s32_s16_neon_unaligned, which converts the last 0-3 samples and performs the return.
+s32_s16_neon:
+ pld [IN] @ prefetch hint for the start of the input
+2: @ NOTE(review): no branch to this label is visible in this section
+ cmp N, #8
+ blt s32_s16_neon_unaligned @ fewer than 8 samples: scalar loop only (so counts of 4-7 skip the 4-sample vector step at label 7)
+ vld1.s32 {q8-q9}, [IN,:128]! @ preload the first 8 samples (32 bytes) and advance IN
+ @ Each pass of the loop below handles up to 24 samples in three stages of 8; later loads and stores are interleaved with the conversions of earlier data.
+3: @ Main loop: on entry q8-q9 hold 8 unconverted samples and N is at least 8
+ pld [IN, #64] @ prefetch hint 64 bytes ahead of the read pointer
+ sub N, #8 @ account for the 8 samples held in q8-q9
+ vqrshrn.s32 d16, q8, #13 @ saturating rounding shift right by 13, narrowed to 16 bits: 4 results in d16
+ vqrshrn.s32 d17, q9, #13 @ next 4 results in d17, so q8 now holds 8 output samples
+ cmp N, #8
+ blt 4f @ fewer than 8 left: store these 8 outputs and go to the tail
+ vld1.s32 {q10-q11}, [IN,:128]! @ stage 2: next 8 input samples
+ sub N, #8
+ vqrshrn.s32 d18, q10, #13 @ d18-d19 are free again (q9 was consumed above) and receive outputs 8-15
+ vqrshrn.s32 d19, q11, #13
+ cmp N, #8
+ blt 5f @ fewer than 8 left: store the 16 pending outputs and go to the tail
+ vld1.s32 {q12-q13}, [IN,:128]! @ stage 3: next 8 input samples
+ sub N, #8
+ vqrshrn.s32 d20, q12, #13 @ d20-d21 are free again (q10 was consumed above) and receive outputs 16-23
+ vqrshrn.s32 d21, q13, #13
+ vst1.s16 {d16-d19}, [OUT,:128]! @ store the first 16 outputs (32 bytes) and advance OUT
+ cmp N, #8
+ blt 6f @ fewer than 8 left: store the last 8 outputs and go to the tail
+ vld1.s32 {q8-q9}, [IN,:128]! @ reload q8-q9 for the next pass (their previous contents were stored just above)
+ vst1.s16 {d20-d21}, [OUT,:128]! @ store outputs 16-23
+ b 3b
+4: @ exit after stage 1: 8 outputs pending in d16-d17
+ vst1.s16 {d16-d17}, [OUT,:128]!
+ b 7f
+5: @ exit after stage 2: 16 outputs pending in d16-d19
+ vst1.s16 {d16-d19}, [OUT,:128]!
+ b 7f
+6: @ exit after stage 3: 8 outputs pending in d20-d21
+ vst1.s16 {d20-d21}, [OUT,:128]!
+7: @ tail: fewer than 8 samples remain
+ cmp N, #4
+ blt s32_s16_neon_unaligned @ 0-3 left: nothing more to do with vectors
+ vld1.s32 {q8}, [IN,:128]! @ load 4 samples (16 bytes)
+ sub N, #4
+ vqrshrn.s32 d16, q8, #13 @ same conversion as in the main loop, 4 results in d16
+ vst1.s16 {d16}, [OUT,:64]! @ 4 outputs are 8 bytes, hence the 64-bit alignment hint
+
+ @ Fall through into s32_s16_neon_unaligned for the last 0-3 samples
+
+ .global s32_s16_neon_unaligned @ second exported entry point
+ .type s32_s16_neon_unaligned, %function @ mark the symbol as a function
+ @ Converts N fixed-point 32-bit samples read from IN into signed 16-bit samples written to OUT, one sample per iteration: add 4096 with saturation, shift right by 13, saturate to 16 bits. Returns through bx lr once N reaches 0.
+ @ Input must be on a 32-bit boundary, output on a 16-bit boundary. Modifies OUT, IN, N, BUF, HALF and the flags; does not touch the stack.
+s32_s16_neon_unaligned:
+ mov HALF, #4096 @ rounding bias: 4096 is 1 << 12, half of the 1 << 13 divisor
+1:
+ cmp N, #0
+ bxeq lr @ nothing left: return to the caller
+
+ ldr BUF, [IN] @ fetch one 32-bit sample
+ add IN, #4
+ add OUT, #2 @ advance OUT early; the store below compensates with offset -2
+ qadd BUF, HALF, BUF @ saturating add of the bias, so a large positive sample cannot wrap around
+ sub N, #1
+ ssat BUF, #16, BUF, asr #13 @ arithmetic shift right by 13, then saturate to the signed 16-bit range
+ strh BUF, [OUT, #-2] @ write the result where OUT pointed before the increment
+ b 1b