git.sesse.net Git - vlc/blob - contrib/src/speexdsp/neon.patch

   1 From b1654a302361ec9f612fca84da941ae99c669795 Mon Sep 17 00:00:00 2001
   2 From: Jyri Sarha <jsarha@ti.com>
   3 Date: Thu, 1 Sep 2011 15:30:27 +0300
   4 Subject: [PATCH 1/4] resample: Calculate full sinc table (i.e. no sinc
   5  interpolation)
   6
   7 On many systems, spending a couple of kilobytes on a full sinc table is
   8 not a problem. With a straight filter table the number of MAC operations
   9 needed drops to one quarter. Also, the inner loop of the filter becomes
  10 simpler and easier to optimize. So the full sinc table version
  11 runs much faster in most cases. The trade-off is some kilobytes
  12 more memory usage and more CPU time needed when initializing the
  13 resampler.
  14
  15 configure.ac: Add --enable-resample-full-sinc-table conf flag
  16
  17 Use this flag to enable resampling with full sinc table.
  18 ---
  19  configure.ac           | 5 +++++
  20  libspeexdsp/resample.c | 4 ++++
  21  2 files changed, 9 insertions(+)
  22
  23 diff --git a/configure.ac b/configure.ac
  24 index 2ba0b13..9ce4f97 100644
  25 --- a/configure.ac
  26 +++ b/configure.ac
  27 @@ -168,6 +168,11 @@ AC_ARG_ENABLE(fixed-point-debug, [  --enable-fixed-point-debug  Debug fixed-poin
  28    AC_DEFINE([FIXED_DEBUG], , [Debug fixed-point implementation])
  29  fi])
  30
  31 +AC_ARG_ENABLE(resample-full-sinc-table, [  --enable-resample-full-sinc-table Resample full SINC table (no interpolation)],
  32 +[if test "$enableval" = yes; then
  33 +  AC_DEFINE([RESAMPLE_FULL_SINC_TABLE], , [Resample with full SINC table (no interpolation)])
  34 +fi])
  35 +
  36  AC_ARG_ENABLE(ti-c55x, [  --enable-ti-c55x        Enable support for TI C55X DSP],
  37  [if test "$enableval" = yes; then
  38    has_char16=yes;
  39 diff --git a/libspeexdsp/resample.c b/libspeexdsp/resample.c
  40 index 7121445..3588f7b 100644
  41 --- a/libspeexdsp/resample.c
  42 +++ b/libspeexdsp/resample.c
  43 @@ -585,7 +585,11 @@ static void update_filter(SpeexResamplerState *st)
  44     }
  45
  46     /* Choose the resampling type that requires the least amount of memory */
  47 +#ifdef RESAMPLE_FULL_SINC_TABLE
  48 +   if (1)
  49 +#else
  50     if (st->filt_len*st->den_rate <= st->filt_len*st->oversample+8)
  51 +#endif
  52     {
  53        spx_uint32_t i;
  54        if (!st->sinc_table)
  55 --
  56 1.8.3.2
  57
  58 From 6a882b92234b0715d0227777217e4b869a2fade1 Mon Sep 17 00:00:00 2001
  59 From: Jyri Sarha <jsarha@ti.com>
  60 Date: Thu, 1 Sep 2011 15:30:30 +0300
  61 Subject: [PATCH 2/4] configure.ac: Add ARM NEON support
  62
  63 Use --enable-neon to force NEON optimization on. Auto-detection
  64 should also work if your CFLAGS enable NEON.
  65 ---
  66  configure.ac | 30 ++++++++++++++++++++++++++++++
  67  1 file changed, 30 insertions(+)
  68
  69 diff --git a/configure.ac b/configure.ac
  70 index 9ce4f97..d851a52 100644
  71 --- a/configure.ac
  72 +++ b/configure.ac
  73 @@ -81,6 +81,23 @@ has_sse=no
  74  )
  75  AC_MSG_RESULT($has_sse)
  76
  77 +AC_MSG_CHECKING(for NEON in current arch/CFLAGS)
  78 +AC_LINK_IFELSE([
  79 +AC_LANG_PROGRAM([[
  80 +#include <arm_neon.h>
  81 +int32x4_t testfunc(int16_t *a, int16_t *b) { /* configure probe only: has_neon is set from whether this links with the current CFLAGS */
  82 +      return vmull_s16(vld1_s16(a), vld1_s16(b)); /* load int16 lanes from a and b, widening multiply to an int32x4_t */
  83 +}
  84 +]])],
  85 +[
  86 +has_neon=yes
  87 +],
  88 +[
  89 +has_neon=no
  90 +]
  91 +)
  92 +AC_MSG_RESULT($has_neon)
  93 +
  94  SAVE_CFLAGS="$CFLAGS"
  95  CFLAGS="$CFLAGS -fvisibility=hidden"
  96  AC_MSG_CHECKING(for ELF visibility)
  97 @@ -123,6 +140,15 @@ has_sse=no
  98  fi
  99  ])
 100
 101 +AC_ARG_ENABLE(neon, [  --enable-neon           Enable NEON support], [
 102 +if test "x$enableval" != xno; then
 103 +has_neon=yes
 104 +CFLAGS="$CFLAGS -O3 -march=armv7-a -mfpu=neon"
 105 +else
 106 +has_neon=no
 107 +fi
 108 +])
 109 +
 110
 111  FFT=smallft
 112
 113 @@ -140,6 +166,10 @@ if test "$has_sse" = yes; then
 114    AC_DEFINE([_USE_SSE], , [Enable SSE support])
 115  fi
 116
 117 +if test "$has_neon" = yes; then
 118 +  AC_DEFINE([_USE_NEON], , [Enable NEON support])
 119 +fi
 120 +
 121  AC_ARG_ENABLE(float-api, [  --disable-float-api     Disable the floating-point API],
 122  [if test "$enableval" = no; then
 123    AC_DEFINE([DISABLE_FLOAT_API], , [Disable all parts of the API that are using floats])
 124 --
 125 1.8.3.2
 126
 127 From fcb1b3668d6efbf52bc229da0769d66edc19d483 Mon Sep 17 00:00:00 2001
 128 From: Jyri Sarha <jsarha@ti.com>
 129 Date: Thu, 1 Sep 2011 15:30:29 +0300
 130 Subject: [PATCH 3/4] resample: Add NEON optimized inner_product_single for
 131  fixed point
 132
 133 Semantics of inner_product_single have also been changed to contain
 134 the final right shift and saturation so it can also be implemented in
 135 the optimal way for the used platform. This change affects fixed point
 136 calculations only.
 137
 138 I also added a new fixed-point macro SATURATE32PSHR(x, shift, a). It
 139 does pretty much the same thing as SATURATE32(PSHR32(x, shift), a),
 140 but it avoids overflowing in the rounding-up phase on the rare occasion
 141 where x is already saturated. It should also be slightly faster.
 142 ---
 143  libspeexdsp/arch.h          |   1 +
 144  libspeexdsp/fixed_generic.h |   4 ++
 145  libspeexdsp/resample.c      |  10 ++++-
 146  libspeexdsp/resample_neon.h | 100 ++++++++++++++++++++++++++++++++++++++++++++
 147  4 files changed, 113 insertions(+), 2 deletions(-)
 148  create mode 100644 libspeexdsp/resample_neon.h
 149
 150 diff --git a/libspeexdsp/arch.h b/libspeexdsp/arch.h
 151 index 18446dd..535d308 100644
 152 --- a/libspeexdsp/arch.h
 153 +++ b/libspeexdsp/arch.h
 154 @@ -163,6 +163,7 @@ typedef float spx_word32_t;
 155  #define VSHR32(a,shift) (a)
 156  #define SATURATE16(x,a) (x)
 157  #define SATURATE32(x,a) (x)
 158 +#define SATURATE32PSHR(x,shift,a) (x)
 159
 160  #define PSHR(a,shift)       (a)
 161  #define SHR(a,shift)       (a)
 162 diff --git a/libspeexdsp/fixed_generic.h b/libspeexdsp/fixed_generic.h
 163 index 3fb096e..0e012e9 100644
 164 --- a/libspeexdsp/fixed_generic.h
 165 +++ b/libspeexdsp/fixed_generic.h
 166 @@ -52,6 +52,10 @@
 167  #define SATURATE16(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
 168  #define SATURATE32(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
 169
 170 +#define SATURATE32PSHR(x,shift,a) (((x)>=(SHL32(a,shift))) ? (a) : /* x at or above SHL32(a,shift): yield a without evaluating PSHR32 */ \
 171 +                                   (x)<=-(SHL32(a,shift)) ? -(a) : /* x at or below -SHL32(a,shift): yield -a */ \
 172 +                                   (PSHR32(x, shift))) /* otherwise PSHR32(x, shift); PSHR32 is defined outside this hunk. x may be evaluated up to three times, so pass no side effects */
 173 +
 174  #define SHR(a,shift) ((a) >> (shift))
 175  #define SHL(a,shift) ((spx_word32_t)(a) << (shift))
 176  #define PSHR(a,shift) (SHR((a)+((EXTEND32(1)<<((shift))>>1)),shift))
 177 diff --git a/libspeexdsp/resample.c b/libspeexdsp/resample.c
 178 index 3588f7b..ad59b97 100644
 179 --- a/libspeexdsp/resample.c
 180 +++ b/libspeexdsp/resample.c
 181 @@ -99,6 +99,10 @@ static void speex_free (void *ptr) {free(ptr);}
 182  #include "resample_sse.h"
 183  #endif
 184
 185 +#ifdef _USE_NEON
 186 +#include "resample_neon.h"
 187 +#endif
 188 +
 189  /* Numer of elements to allocate on the stack */
 190  #ifdef VAR_ARRAYS
 191  #define FIXED_STACK_ALLOC 8192
 192 @@ -360,11 +364,12 @@ static int resampler_basic_direct_single(SpeexResamplerState *st, spx_uint32_t c
 193        }
 194        sum = accum[0] + accum[1] + accum[2] + accum[3];
 195  */
 196 +      sum = SATURATE32PSHR(sum, 15, 32767);
 197  #else
 198        sum = inner_product_single(sinc, iptr, N);
 199  #endif
 200
 201 -      out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
 202 +      out[out_stride * out_sample++] = sum;
 203        last_sample += int_advance;
 204        samp_frac_num += frac_advance;
 205        if (samp_frac_num >= den_rate)
 206 @@ -470,12 +475,13 @@ static int resampler_basic_interpolate_single(SpeexResamplerState *st, spx_uint3
 207
 208        cubic_coef(frac, interp);
 209        sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
 210 +      sum = SATURATE32PSHR(sum, 15, 32767);
 211  #else
 212        cubic_coef(frac, interp);
 213        sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
 214  #endif
 215
 216 -      out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
 217 +      out[out_stride * out_sample++] = sum;
 218        last_sample += int_advance;
 219        samp_frac_num += frac_advance;
 220        if (samp_frac_num >= den_rate)
 221 diff --git a/libspeexdsp/resample_neon.h b/libspeexdsp/resample_neon.h
 222 new file mode 100644
 223 index 0000000..ba93e41
 224 --- /dev/null
 225 +++ b/libspeexdsp/resample_neon.h
 226 @@ -0,0 +1,100 @@
 227 +/* Copyright (C) 2007-2008 Jean-Marc Valin
 228 + * Copyright (C) 2008 Thorvald Natvig
 229 + * Copyright (C) 2011 Texas Instruments
 230 + *               author Jyri Sarha
 231 + */
 232 +/**
 233 +   @file resample_neon.h
 234 +   @brief Resampler functions (NEON version)
 235 +*/
 236 +/*
 237 +   Redistribution and use in source and binary forms, with or without
 238 +   modification, are permitted provided that the following conditions
 239 +   are met:
 240 +
 241 +   - Redistributions of source code must retain the above copyright
 242 +   notice, this list of conditions and the following disclaimer.
 243 +
 244 +   - Redistributions in binary form must reproduce the above copyright
 245 +   notice, this list of conditions and the following disclaimer in the
 246 +   documentation and/or other materials provided with the distribution.
 247 +
 248 +   - Neither the name of the Xiph.org Foundation nor the names of its
 249 +   contributors may be used to endorse or promote products derived from
 250 +   this software without specific prior written permission.
 251 +
 252 +   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 253 +   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 254 +   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 255 +   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 256 +   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 257 +   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 258 +   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 259 +   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 260 +   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 261 +   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 262 +   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 263 +*/
 264 +
 265 +#include <arm_neon.h>
 266 +
 267 +#ifdef FIXED_POINT
 268 +#define OVERRIDE_INNER_PRODUCT_SINGLE
 269 +/* Only works when len % 4 == 0 */
 270 +static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
 271 +{ /* Fixed-point dot product of a[0..len-1] and b[0..len-1]; the returned value is already rounded, shifted right by 15 and saturated to 16 bits */
 272 +    int32_t ret;
 273 +    uint32_t remainder = len % 16; /* elements left over after the whole groups of 16 */
 274 +    len = len - remainder; /* from here on len counts only whole groups of 16 */
 275 +
 276 +    asm volatile ("     cmp %[len], #0\n" /* is there at least one whole group of 16? */
 277 +                 "      bne 1f\n" /* yes: start with the 16-wide path */
 278 +                 "      vld1.16 {d16}, [%[b]]!\n" /* no: load 4 x int16 from b, post-increment. NOTE(review): also reached when the original len is 0; 4 elements are still loaded and remainder wraps - presumably callers never pass 0, confirm */
 279 +                 "      vld1.16 {d20}, [%[a]]!\n" /* load 4 x int16 from a, post-increment */
 280 +                 "      subs %[remainder], %[remainder], #4\n" /* 4 tail elements consumed; sets the flags tested by beq below */
 281 +                 "      vmull.s16 q0, d16, d20\n" /* q0 = four widened products; this initialises the accumulator */
 282 +                 "      beq 5f\n" /* nothing left: go reduce */
 283 +                 "      b 4f\n" /* more tail elements: continue in the 4-wide loop */
 284 +                 "1:" /* first group of 16: vmull initialises q0, vmlal adds to it */
 285 +                 "      vld1.16 {d16, d17, d18, d19}, [%[b]]!\n" /* 16 x int16 from b */
 286 +                 "      vld1.16 {d20, d21, d22, d23}, [%[a]]!\n" /* 16 x int16 from a */
 287 +                 "      subs %[len], %[len], #16\n" /* one group consumed; flags tested by beq below */
 288 +                 "      vmull.s16 q0, d16, d20\n"
 289 +                 "      vmlal.s16 q0, d17, d21\n"
 290 +                 "      vmlal.s16 q0, d18, d22\n"
 291 +                 "      vmlal.s16 q0, d19, d23\n"
 292 +                 "      beq 3f\n" /* that was the only group */
 293 +                 "2:" /* remaining groups of 16: accumulate into q0 */
 294 +                 "      vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
 295 +                 "      vld1.16 {d20, d21, d22, d23}, [%[a]]!\n"
 296 +                 "      subs %[len], %[len], #16\n"
 297 +                 "      vmlal.s16 q0, d16, d20\n"
 298 +                 "      vmlal.s16 q0, d17, d21\n"
 299 +                 "      vmlal.s16 q0, d18, d22\n"
 300 +                 "      vmlal.s16 q0, d19, d23\n"
 301 +                 "      bne 2b\n" /* loop until len reaches exactly 0 */
 302 +                 "3:" /* groups of 16 done: any tail left? */
 303 +                 "      cmp %[remainder], #0\n"
 304 +                 "      beq 5f\n"
 305 +                 "4:" /* tail loop, 4 elements per pass */
 306 +                 "      vld1.16 {d16}, [%[b]]!\n"
 307 +                 "      vld1.16 {d20}, [%[a]]!\n"
 308 +                 "      subs %[remainder], %[remainder], #4\n"
 309 +                 "      vmlal.s16 q0, d16, d20\n"
 310 +                 "      bne 4b\n" /* ends only on an exact zero, hence the len % 4 == 0 requirement */
 311 +                 "5:" /* horizontal reduction of the four 32-bit lanes in q0 */
 312 +                 "      vaddl.s32 q0, d0, d1\n" /* widen and add lane-wise: two 64-bit partial sums */
 313 +                 "      vadd.s64 d0, d0, d1\n" /* d0 = 64-bit total */
 314 +                 "      vqmovn.s64 d0, q0\n" /* saturating narrow to 32 bits; lane 0 of d0 holds the total */
 315 +                 "      vqrshrn.s32 d0, q0, #15\n" /* rounding shift right by 15 with saturating narrow to 16 bits */
 316 +                 "      vmov.s16 %[ret], d0[0]\n" /* sign-extend 16-bit lane 0 into ret */
 317 +                 : [ret] "=&r" (ret), [a] "+r" (a), [b] "+r" (b), /* a and b are read-write because the loads post-increment them */
 318 +                   [len] "+r" (len), [remainder] "+r" (remainder) /* both counters are decremented inside the asm */
 319 +                 : /* no input-only operands */
 320 +                 : "cc", "q0", /* flags and accumulator */
 321 +                   "d16", "d17", "d18", "d19", /* registers loaded from b */
 322 +                   "d20", "d21", "d22", "d23"); /* registers loaded from a. NOTE(review): the asm reads *a and *b but declares no "memory" clobber or memory operands - confirm this is safe against compiler reordering */
 323 +
 324 +    return ret;
 325 +}
 326 +#endif
 327 --
 328 1.8.3.2
 329
 330 From 9864346021128b8a985542811612225d7b4acee7 Mon Sep 17 00:00:00 2001
 331 From: Jyri Sarha <jsarha@ti.com>
 332 Date: Thu, 1 Sep 2011 15:30:31 +0300
 333 Subject: [PATCH 4/4] resample: Add NEON optimized inner_product_single for
 334  floating point
 335
 336 Also adds inline asm implementations of the WORD2INT(x) macro for fixed
 337 and floating point.
 338 ---
 339  libspeexdsp/resample_neon.h | 101 ++++++++++++++++++++++++++++++++++++++++++++
 340  1 file changed, 101 insertions(+)
 341
 342 diff --git a/libspeexdsp/resample_neon.h b/libspeexdsp/resample_neon.h
 343 index ba93e41..e7e981e 100644
 344 --- a/libspeexdsp/resample_neon.h
 345 +++ b/libspeexdsp/resample_neon.h
 346 @@ -39,6 +39,30 @@
 347  #include <arm_neon.h>
 348
 349  #ifdef FIXED_POINT
 350 +#ifdef __thumb2__
 351 +static inline int32_t saturate_32bit_to_16bit(int32_t a) { /* variant compiled when __thumb2__ is defined; installed as WORD2INT(x) further down */
 352 +    int32_t ret;
 353 +    asm ("ssat %[ret], #16, %[a]" /* signed saturate a to a 16-bit range in one core instruction */
 354 +         : [ret] "=&r" (ret)
 355 +         : [a] "r" (a)
 356 +         : ); /* no clobbers: only core registers named as operands are touched */
 357 +    return ret;
 358 +}
 359 +#else
 360 +static inline int32_t saturate_32bit_to_16bit(int32_t a) { /* variant compiled when __thumb2__ is not defined: same clamp done on the NEON unit */
 361 +    int32_t ret;
 362 +    asm ("vmov.s32 d0[0], %[a]\n" /* place a in lane 0 of d0; the other lanes of q0 are left as they were */
 363 +         "vqmovn.s32 d0, q0\n" /* saturating narrow 32 -> 16 bits; only lane 0 is used afterwards */
 364 +         "vmov.s16 %[ret], d0[0]\n" /* sign-extend 16-bit lane 0 into ret */
 365 +         : [ret] "=&r" (ret)
 366 +         : [a] "r" (a)
 367 +         : "q0"); /* q0 is used as scratch */
 368 +    return ret;
 369 +}
 370 +#endif
 371 +#undef WORD2INT
 372 +#define WORD2INT(x) (saturate_32bit_to_16bit(x))
 373 +
 374  #define OVERRIDE_INNER_PRODUCT_SINGLE
 375  /* Only works when len % 4 == 0 */
 376  static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
 377 @@ -97,4 +121,81 @@ static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, u
 378
 379      return ret;
 380  }
 381 +#elif defined(FLOATING_POINT)
 382 +
 383 +static inline int32_t saturate_float_to_16bit(float a) { /* float -> 16-bit integer conversion done on the NEON unit; installed as WORD2INT(x) just below */
 384 +    int32_t ret;
 385 +    asm ("vmov.f32 d0[0], %[a]\n" /* place a in lane 0 of d0 */
 386 +         "vcvt.s32.f32 d0, d0, #15\n" /* convert to signed 32-bit fixed point with 15 fraction bits */
 387 +         "vqrshrn.s32 d0, q0, #15\n" /* rounding shift right by 15 drops the fraction bits, with saturating narrow to 16 bits */
 388 +         "vmov.s16 %[ret], d0[0]\n" /* sign-extend 16-bit lane 0 into ret */
 389 +         : [ret] "=&r" (ret)
 390 +         : [a] "r" (a) /* NOTE(review): the float is passed through a core register - confirm this suits the float ABI in use */
 391 +         : "q0"); /* q0 is used as scratch */
 392 +    return ret;
 393 +}
 394 +#undef WORD2INT
 395 +#define WORD2INT(x) (saturate_float_to_16bit(x))
 396 +
 397 +#define OVERRIDE_INNER_PRODUCT_SINGLE
 398 +/* Only works when len % 4 == 0 (and len > 0): elements are consumed four at a time and the loops stop only on an exactly zero count */
 399 +static inline float inner_product_single(const float *a, const float *b, unsigned int len)
 400 +{ /* Single-precision dot product of a[0..len-1] and b[0..len-1] */
 401 +    float ret;
 402 +    uint32_t remainder = len % 16; /* elements left over after the whole groups of 16 */
 403 +    len = len - remainder; /* from here on len counts only whole groups of 16 */
 404 +
 405 +    asm volatile ("     cmp %[len], #0\n" /* is there at least one whole group of 16? */
 406 +                 "      bne 1f\n" /* yes: start with the 16-wide path */
 407 +                 "      vld1.32 {q4}, [%[b]]!\n" /* no: load 4 floats from b, post-increment. NOTE(review): also reached when the original len is 0 - presumably callers never pass 0, confirm */
 408 +                 "      vld1.32 {q8}, [%[a]]!\n" /* load 4 floats from a, post-increment */
 409 +                 "      subs %[remainder], %[remainder], #4\n" /* 4 tail elements consumed; sets the flags tested by bne below */
 410 +                 "      vmul.f32 q0, q4, q8\n" /* q0 = four products; this initialises the accumulator */
 411 +                 "      bne 4f\n" /* more tail elements: continue in the 4-wide loop */
 412 +                 "      b 5f\n" /* nothing left: go reduce */
 413 +                 "1:" /* first group of 16: vmul initialises four independent accumulators q0..q3 */
 414 +                 "      vld1.32 {q4, q5}, [%[b]]!\n" /* 8 floats from b */
 415 +                 "      vld1.32 {q8, q9}, [%[a]]!\n" /* 8 floats from a */
 416 +                 "      vld1.32 {q6, q7}, [%[b]]!\n" /* next 8 floats from b */
 417 +                 "      vld1.32 {q10, q11}, [%[a]]!\n" /* next 8 floats from a */
 418 +                 "      subs %[len], %[len], #16\n" /* one group consumed; flags tested by beq below */
 419 +                 "      vmul.f32 q0, q4, q8\n"
 420 +                 "      vmul.f32 q1, q5, q9\n"
 421 +                 "      vmul.f32 q2, q6, q10\n"
 422 +                 "      vmul.f32 q3, q7, q11\n"
 423 +                 "      beq 3f\n" /* that was the only group */
 424 +                 "2:" /* remaining groups of 16: multiply-accumulate into q0..q3 */
 425 +                 "      vld1.32 {q4, q5}, [%[b]]!\n"
 426 +                 "      vld1.32 {q8, q9}, [%[a]]!\n"
 427 +                 "      vld1.32 {q6, q7}, [%[b]]!\n"
 428 +                 "      vld1.32 {q10, q11}, [%[a]]!\n"
 429 +                 "      subs %[len], %[len], #16\n"
 430 +                 "      vmla.f32 q0, q4, q8\n"
 431 +                 "      vmla.f32 q1, q5, q9\n"
 432 +                 "      vmla.f32 q2, q6, q10\n"
 433 +                 "      vmla.f32 q3, q7, q11\n"
 434 +                 "      bne 2b\n" /* loop until len reaches exactly 0 */
 435 +                 "3:" /* fold the four accumulators into q0, then check for a tail */
 436 +                 "      vadd.f32 q4, q0, q1\n"
 437 +                 "      vadd.f32 q5, q2, q3\n"
 438 +                 "      cmp %[remainder], #0\n" /* flags for the beq two lines down; the vadd in between leaves them alone */
 439 +                 "      vadd.f32 q0, q4, q5\n"
 440 +                 "      beq 5f\n"
 441 +                 "4:" /* tail loop, 4 elements per pass, accumulating into q0 */
 442 +                 "      vld1.32 {q6}, [%[b]]!\n"
 443 +                 "      vld1.32 {q10}, [%[a]]!\n"
 444 +                 "      subs %[remainder], %[remainder], #4\n"
 445 +                 "      vmla.f32 q0, q6, q10\n"
 446 +                 "      bne 4b\n" /* ends only on an exact zero, hence the len % 4 == 0 requirement */
 447 +                 "5:" /* horizontal reduction of the four lanes in q0 */
 448 +                 "      vadd.f32 d0, d0, d1\n" /* add upper half to lower half: two partial sums */
 449 +                 "      vpadd.f32 d0, d0, d0\n" /* pairwise add: lane 0 now holds the total */
 450 +                 "      vmov.f32 %[ret], d0[0]\n" /* copy lane 0 to ret */
 451 +                 : [ret] "=&r" (ret), [a] "+r" (a), [b] "+r" (b), /* a and b are read-write because the loads post-increment them */
 452 +                   [len] "+l" (len), [remainder] "+l" (remainder) /* NOTE(review): "l" here versus "r" in the fixed-point variant - presumably restricts to low registers in Thumb state, confirm against the GCC ARM constraint docs */
 453 +                 : /* no input-only operands */
 454 +                 : "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", /* flags plus every NEON register written above */
 455 +                    "q9", "q10", "q11"); /* NOTE(review): the asm reads *a and *b but declares no "memory" clobber or memory operands - confirm this is safe against compiler reordering */
 456 +    return ret;
 457 +}
 458  #endif
 459 --
 460 1.8.3.2
 461