1 From 9faa77ebb077e629ad5c5e1a84fa637cda30538d Mon Sep 17 00:00:00 2001
2 From: Jyri Sarha <jsarha@ti.com>
3 Date: Thu, 1 Sep 2011 15:30:27 +0300
4 Subject: [PATCH 1/4] resample: Calculate full sinc table (i.e. no sinc interpolation)
7 On many systems, spending a couple of kilobytes on a full sinc table is
8 not a problem. With a straight filter table, the number of MAC operations
9 needed drops to one quarter. The inner loop of the filter also becomes
10 simpler and easier to optimize, so the full sinc table version
11 runs much faster in most cases. The trade-off is a few kilobytes
12 more memory usage and more CPU time needed when initializing the resampler.
15 configure.ac: Add --enable-resample-full-sinc-table conf flag
17 Use this flag to enable resampling with a full sinc table.
19 configure.ac | 5 +++++
20 libspeex/resample.c | 4 ++++
21 2 files changed, 9 insertions(+), 0 deletions(-)
23 diff --git a/configure.ac b/configure.ac
24 index 3179521..2f87461 100644
27 @@ -198,6 +198,11 @@ AC_ARG_ENABLE(fixed-point-debug, [ --enable-fixed-point-debug Debug fixed-poin
28 AC_DEFINE([FIXED_DEBUG], , [Debug fixed-point implementation])
31 +AC_ARG_ENABLE(resample-full-sinc-table, [ --enable-resample-full-sinc-table Resample full SINC table (no interpolation)],
32 +[if test "$enableval" = yes; then
33 + AC_DEFINE([RESAMPLE_FULL_SINC_TABLE], , [Resample with full SINC table (no interpolation)])
36 AC_ARG_ENABLE(ti-c55x, [ --enable-ti-c55x Enable support for TI C55X DSP],
37 [if test "$enableval" = yes; then
39 diff --git a/libspeex/resample.c b/libspeex/resample.c
40 index 7b5a308..3829715 100644
41 --- a/libspeex/resample.c
42 +++ b/libspeex/resample.c
43 @@ -585,7 +585,11 @@ static void update_filter(SpeexResamplerState *st)
46 /* Choose the resampling type that requires the least amount of memory */
47 +#ifdef RESAMPLE_FULL_SINC_TABLE
50 if (st->den_rate <= st->oversample)
58 From 3da94c7da3357438e39b5b6d110529fdd6796c25 Mon Sep 17 00:00:00 2001
59 From: Jyri Sarha <jsarha@ti.com>
60 Date: Thu, 1 Sep 2011 15:30:30 +0300
61 Subject: [PATCH 2/4] configure.ac: Add ARM NEON support
63 Use --enable-neon to force NEON optimization on. Auto-detection
64 should also work if your CFLAGS support NEON.
66 configure.ac | 30 ++++++++++++++++++++++++++++++
67 1 files changed, 30 insertions(+), 0 deletions(-)
69 diff --git a/configure.ac b/configure.ac
70 index 2f87461..08d3d5f 100644
73 @@ -89,6 +89,23 @@ has_sse=no
75 AC_MSG_RESULT($has_sse)
77 +AC_MSG_CHECKING(for NEON in current arch/CFLAGS)
80 +#include <arm_neon.h>
81 +int32x4_t testfunc(int16_t *a, int16_t *b) {
82 + return vmull_s16(vld1_s16(a), vld1_s16(b));
92 +AC_MSG_RESULT($has_neon)
95 CFLAGS="$CFLAGS -fvisibility=hidden"
96 AC_MSG_CHECKING(for ELF visibility)
97 @@ -148,6 +165,15 @@ has_sse=no
101 +AC_ARG_ENABLE(neon, [ --enable-neon Enable NEON support], [
102 +if test "x$enableval" != xno; then
104 +CFLAGS="$CFLAGS -O3 -march=armv7-a -mfpu=neon"
113 @@ -165,6 +191,10 @@ if test "$has_sse" = yes; then
114 AC_DEFINE([_USE_SSE], , [Enable SSE support])
117 +if test "$has_neon" = yes; then
118 + AC_DEFINE([_USE_NEON], , [Enable NEON support])
121 AC_ARG_ENABLE(float-api, [ --disable-float-api Disable the floating-point API],
122 [if test "$enableval" = no; then
123 AC_DEFINE([DISABLE_FLOAT_API], , [Disable all parts of the API that are using floats])
127 From bf0e909164995b820066111d401c793e121d55ac Mon Sep 17 00:00:00 2001
128 From: Jyri Sarha <jsarha@ti.com>
129 Date: Thu, 1 Sep 2011 15:30:29 +0300
130 Subject: [PATCH 3/4] resample: Add NEON optimized inner_product_single for fixed point
133 The semantics of inner_product_single have also been changed to include
134 the final right shift and saturation, so that it can be implemented in
135 the optimal way for the target platform. This change affects fixed point
138 I also added a new fixed-point macro, SATURATE32PSHR(x, shift, a). It
139 does much the same thing as SATURATE32(PSHR32(x, shift), a),
140 but it avoids overflowing in the rounding-up phase on the rare occasion
141 where x has already been saturated. It should also be slightly faster.
143 libspeex/arch.h | 1 +
144 libspeex/fixed_generic.h | 4 ++
145 libspeex/resample.c | 10 ++++-
146 libspeex/resample_neon.h | 100 ++++++++++++++++++++++++++++++++++++++++++++++
147 4 files changed, 113 insertions(+), 2 deletions(-)
148 create mode 100644 libspeex/resample_neon.h
150 diff --git a/libspeex/arch.h b/libspeex/arch.h
151 index 3b47ed9..daa72a7 100644
152 --- a/libspeex/arch.h
153 +++ b/libspeex/arch.h
154 @@ -171,6 +171,7 @@ typedef float spx_word32_t;
155 #define VSHR32(a,shift) (a)
156 #define SATURATE16(x,a) (x)
157 #define SATURATE32(x,a) (x)
158 +#define SATURATE32PSHR(x,shift,a) (x)
160 #define PSHR(a,shift) (a)
161 #define SHR(a,shift) (a)
162 diff --git a/libspeex/fixed_generic.h b/libspeex/fixed_generic.h
163 index 3fb096e..0e012e9 100644
164 --- a/libspeex/fixed_generic.h
165 +++ b/libspeex/fixed_generic.h
167 #define SATURATE16(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
168 #define SATURATE32(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
170 +#define SATURATE32PSHR(x,shift,a) (((x)>=(SHL32(a,shift))) ? (a) : \
171 + (x)<=-(SHL32(a,shift)) ? -(a) : \
172 + (PSHR32(x, shift)))
174 #define SHR(a,shift) ((a) >> (shift))
175 #define SHL(a,shift) ((spx_word32_t)(a) << (shift))
176 #define PSHR(a,shift) (SHR((a)+((EXTEND32(1)<<((shift))>>1)),shift))
177 diff --git a/libspeex/resample.c b/libspeex/resample.c
178 index 3829715..7957c61 100644
179 --- a/libspeex/resample.c
180 +++ b/libspeex/resample.c
181 @@ -99,6 +99,10 @@ static void speex_free (void *ptr) {free(ptr);}
182 #include "resample_sse.h"
186 +#include "resample_neon.h"
189 /* Numer of elements to allocate on the stack */
191 #define FIXED_STACK_ALLOC 8192
192 @@ -360,11 +364,12 @@ static int resampler_basic_direct_single(SpeexResamplerState *st, spx_uint32_t c
194 sum = accum[0] + accum[1] + accum[2] + accum[3];
196 + sum = SATURATE32PSHR(sum, 15, 32767);
198 sum = inner_product_single(sinc, iptr, N);
201 - out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
202 + out[out_stride * out_sample++] = sum;
203 last_sample += int_advance;
204 samp_frac_num += frac_advance;
205 if (samp_frac_num >= den_rate)
206 @@ -470,12 +475,13 @@ static int resampler_basic_interpolate_single(SpeexResamplerState *st, spx_uint3
208 cubic_coef(frac, interp);
209 sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
210 + sum = SATURATE32PSHR(sum, 15, 32767);
212 cubic_coef(frac, interp);
213 sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
216 - out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
217 + out[out_stride * out_sample++] = sum;
218 last_sample += int_advance;
219 samp_frac_num += frac_advance;
220 if (samp_frac_num >= den_rate)
221 diff --git a/libspeex/resample_neon.h b/libspeex/resample_neon.h
223 index 0000000..ba93e41
225 +++ b/libspeex/resample_neon.h
227 +/* Copyright (C) 2007-2008 Jean-Marc Valin
228 + * Copyright (C) 2008 Thorvald Natvig
229 + * Copyright (C) 2011 Texas Instruments
230 + * author Jyri Sarha
233 + @file resample_neon.h
234 + @brief Resampler functions (NEON version)
237 + Redistribution and use in source and binary forms, with or without
238 + modification, are permitted provided that the following conditions
241 + - Redistributions of source code must retain the above copyright
242 + notice, this list of conditions and the following disclaimer.
244 + - Redistributions in binary form must reproduce the above copyright
245 + notice, this list of conditions and the following disclaimer in the
246 + documentation and/or other materials provided with the distribution.
248 + - Neither the name of the Xiph.org Foundation nor the names of its
249 + contributors may be used to endorse or promote products derived from
250 + this software without specific prior written permission.
252 + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
253 + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
254 + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
255 + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
256 + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
257 + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
258 + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
259 + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
260 + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
261 + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
262 + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
265 +#include <arm_neon.h>
268 +#define OVERRIDE_INNER_PRODUCT_SINGLE
269 +/* Only works when len % 4 == 0 */
270 +static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
273 + uint32_t remainder = len % 16;
274 + len = len - remainder;
276 + asm volatile (" cmp %[len], #0\n"
278 + " vld1.16 {d16}, [%[b]]!\n"
279 + " vld1.16 {d20}, [%[a]]!\n"
280 + " subs %[remainder], %[remainder], #4\n"
281 + " vmull.s16 q0, d16, d20\n"
285 + " vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
286 + " vld1.16 {d20, d21, d22, d23}, [%[a]]!\n"
287 + " subs %[len], %[len], #16\n"
288 + " vmull.s16 q0, d16, d20\n"
289 + " vmlal.s16 q0, d17, d21\n"
290 + " vmlal.s16 q0, d18, d22\n"
291 + " vmlal.s16 q0, d19, d23\n"
294 + " vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
295 + " vld1.16 {d20, d21, d22, d23}, [%[a]]!\n"
296 + " subs %[len], %[len], #16\n"
297 + " vmlal.s16 q0, d16, d20\n"
298 + " vmlal.s16 q0, d17, d21\n"
299 + " vmlal.s16 q0, d18, d22\n"
300 + " vmlal.s16 q0, d19, d23\n"
303 + " cmp %[remainder], #0\n"
306 + " vld1.16 {d16}, [%[b]]!\n"
307 + " vld1.16 {d20}, [%[a]]!\n"
308 + " subs %[remainder], %[remainder], #4\n"
309 + " vmlal.s16 q0, d16, d20\n"
312 + " vaddl.s32 q0, d0, d1\n"
313 + " vadd.s64 d0, d0, d1\n"
314 + " vqmovn.s64 d0, q0\n"
315 + " vqrshrn.s32 d0, q0, #15\n"
316 + " vmov.s16 %[ret], d0[0]\n"
317 + : [ret] "=&r" (ret), [a] "+r" (a), [b] "+r" (b),
318 + [len] "+r" (len), [remainder] "+r" (remainder)
321 + "d16", "d17", "d18", "d19",
322 + "d20", "d21", "d22", "d23");
330 From e2127a3db9e43b4336e79e5746f3053c0c799562 Mon Sep 17 00:00:00 2001
331 From: Jyri Sarha <jsarha@ti.com>
332 Date: Thu, 1 Sep 2011 15:30:31 +0300
333 Subject: [PATCH 4/4] resample: Add NEON optimized inner_product_single for floating point
336 Also adds inline asm implementations of the WORD2INT(x) macro for fixed and floating point.
339 libspeex/resample_neon.h | 101 ++++++++++++++++++++++++++++++++++++++++++++++
340 1 files changed, 101 insertions(+), 0 deletions(-)
342 diff --git a/libspeex/resample_neon.h b/libspeex/resample_neon.h
343 index ba93e41..e7e981e 100644
344 --- a/libspeex/resample_neon.h
345 +++ b/libspeex/resample_neon.h
347 #include <arm_neon.h>
351 +static inline int32_t saturate_32bit_to_16bit(int32_t a) {
353 + asm ("ssat %[ret], #16, %[a]"
354 + : [ret] "=&r" (ret)
360 +static inline int32_t saturate_32bit_to_16bit(int32_t a) {
362 + asm ("vmov.s32 d0[0], %[a]\n"
363 + "vqmovn.s32 d0, q0\n"
364 + "vmov.s16 %[ret], d0[0]\n"
365 + : [ret] "=&r" (ret)
372 +#define WORD2INT(x) (saturate_32bit_to_16bit(x))
374 #define OVERRIDE_INNER_PRODUCT_SINGLE
375 /* Only works when len % 4 == 0 */
376 static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
377 @@ -97,4 +121,81 @@ static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, u
381 +#elif defined(FLOATING_POINT)
383 +static inline int32_t saturate_float_to_16bit(float a) {
385 + asm ("vmov.f32 d0[0], %[a]\n"
386 + "vcvt.s32.f32 d0, d0, #15\n"
387 + "vqrshrn.s32 d0, q0, #15\n"
388 + "vmov.s16 %[ret], d0[0]\n"
389 + : [ret] "=&r" (ret)
395 +#define WORD2INT(x) (saturate_float_to_16bit(x))
397 +#define OVERRIDE_INNER_PRODUCT_SINGLE
398 +/* Only works when len % 4 == 0 */
399 +static inline float inner_product_single(const float *a, const float *b, unsigned int len)
402 + uint32_t remainder = len % 16;
403 + len = len - remainder;
405 + asm volatile (" cmp %[len], #0\n"
407 + " vld1.32 {q4}, [%[b]]!\n"
408 + " vld1.32 {q8}, [%[a]]!\n"
409 + " subs %[remainder], %[remainder], #4\n"
410 + " vmul.f32 q0, q4, q8\n"
414 + " vld1.32 {q4, q5}, [%[b]]!\n"
415 + " vld1.32 {q8, q9}, [%[a]]!\n"
416 + " vld1.32 {q6, q7}, [%[b]]!\n"
417 + " vld1.32 {q10, q11}, [%[a]]!\n"
418 + " subs %[len], %[len], #16\n"
419 + " vmul.f32 q0, q4, q8\n"
420 + " vmul.f32 q1, q5, q9\n"
421 + " vmul.f32 q2, q6, q10\n"
422 + " vmul.f32 q3, q7, q11\n"
425 + " vld1.32 {q4, q5}, [%[b]]!\n"
426 + " vld1.32 {q8, q9}, [%[a]]!\n"
427 + " vld1.32 {q6, q7}, [%[b]]!\n"
428 + " vld1.32 {q10, q11}, [%[a]]!\n"
429 + " subs %[len], %[len], #16\n"
430 + " vmla.f32 q0, q4, q8\n"
431 + " vmla.f32 q1, q5, q9\n"
432 + " vmla.f32 q2, q6, q10\n"
433 + " vmla.f32 q3, q7, q11\n"
436 + " vadd.f32 q4, q0, q1\n"
437 + " vadd.f32 q5, q2, q3\n"
438 + " cmp %[remainder], #0\n"
439 + " vadd.f32 q0, q4, q5\n"
442 + " vld1.32 {q6}, [%[b]]!\n"
443 + " vld1.32 {q10}, [%[a]]!\n"
444 + " subs %[remainder], %[remainder], #4\n"
445 + " vmla.f32 q0, q6, q10\n"
448 + " vadd.f32 d0, d0, d1\n"
449 + " vpadd.f32 d0, d0, d0\n"
450 + " vmov.f32 %[ret], d0[0]\n"
451 + : [ret] "=&r" (ret), [a] "+r" (a), [b] "+r" (b),
452 + [len] "+l" (len), [remainder] "+l" (remainder)
454 + : "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
455 + "q9", "q10", "q11");