Yet more moving into subdirectories.

[nageru] / nageru / patches / zita-resampler-sse.diff
diff --git a/nageru/patches/zita-resampler-sse.diff b/nageru/patches/zita-resampler-sse.diff

new file mode 100644 (file)

index 0000000..4954515
--- /dev/null
+++ b/nageru/patches/zita-resampler-sse.diff
@@ -0,0 +1,417 @@
+diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/resampler.cc
+--- orig/zita-resampler-1.3.0/libs/resampler.cc        2012-10-26 22:58:55.000000000 +0200
++++ zita-resampler-1.3.0/libs/resampler.cc     2016-09-05 00:30:34.520191288 +0200
+@@ -24,6 +24,10 @@
+ #include <math.h>
+ #include <zita-resampler/resampler.h>
+ 
++#ifdef __SSE2__
++#include <xmmintrin.h>
++#endif
++
+ 
+ static unsigned int gcd (unsigned int a, unsigned int b)
+ {
+@@ -47,6 +51,118 @@
+     return 1; 
+ }
+ 
++#ifdef __SSE2__
++// Mono FIR sample, four taps per iteration: sum over k < hl of q1 [k] * c1 [k] + q2 [-1 - k] * c2 [k]. hl must be a multiple of 4.
++static inline float calc_mono_sample_sse (unsigned int hl,
++                                          const float *c1,
++                                          const float *c2,
++                                          const float *q1,
++                                          const float *q2)
++{
++    unsigned int   i;
++    __m128         denorm, s, w1, w2, shuf;
++
++    denorm = _mm_set1_ps (1e-20f);  // same 1e-20f bias as the scalar fallback in Resampler::process; subtracted again after the loop
++    s = denorm;
++    for (i = 0; i < hl; i += 4)
++    {
++      q2 -= 4;
++
++      // s += *q1 * c1 [i];  (lanes: q1 [0..3] * c1 [i..i+3])
++      w1 = _mm_loadu_ps (&c1 [i]);
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1), w1));
++
++      // s += *q2 * c2 [i];  (q2 runs backwards, so c2 [i..i+3] is lane-reversed to meet q2 [3..0])
++      w2 = _mm_loadu_ps (&c2 [i]);
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 1, 2, 3))));
++
++      q1 += 4;
++    }
++    s = _mm_sub_ps (s, denorm);
++
++    // Add all the elements of s together into one. Adapted from
++    // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
++    shuf = _mm_shuffle_ps (s, s, _MM_SHUFFLE (2, 3, 0, 1));
++    s = _mm_add_ps (s, shuf);
++    s = _mm_add_ss (s, _mm_movehl_ps (shuf, s));
++    return _mm_cvtss_f32 (s);
++}
++
++// Same computation as calc_mono_sample_sse, for two interleaved channels (hl must be a multiple of 4).
++// Note: This stores four floats, not two: out_data [0..1] are the results and [2..3] repeat them, so the caller needs room for all four.
++static inline void calc_stereo_sample_sse (unsigned int hl,
++                                           const float *c1,
++                                           const float *c2,
++                                           const float *q1,
++                                           const float *q2,
++                                           float *out_data)
++{
++    unsigned int   i;
++    __m128         denorm, s, w1, w2;
++
++    denorm = _mm_set1_ps (1e-20f);  // same 1e-20f bias as the scalar fallback in Resampler::process; subtracted again after the loop
++    s = denorm;
++    for (i = 0; i < hl; i += 4)
++    {
++      q2 -= 8;
++
++      // s += *q1 * c1 [i];  (q1 [0..7] is four frames; each tap is doubled so it multiplies both channels)
++      w1 = _mm_loadu_ps (&c1 [i]);
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1),     _mm_unpacklo_ps (w1, w1)));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 4), _mm_unpackhi_ps (w1, w1)));
++
++      // s += *q2 * c2 [i];  (q2 was just moved back four frames; taps are doubled in reverse order to match)
++      w2 = _mm_loadu_ps (&c2 [i]);
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 4), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 0, 1, 1))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2),     _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (2, 2, 3, 3))));
++
++      q1 += 8;
++    }
++    s = _mm_sub_ps (s, denorm);
++    s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));  // fold lanes 2/3 into lanes 0/1 (channel 0, channel 1)
++
++    _mm_storeu_ps (out_data, s);
++}
++// Four adjacent channels (of nchan interleaved) per call; stores 4 floats. NOTE(review): not called by the Resampler::process hunk below.
++static inline void calc_quad_sample_sse (int hl,
++                                         int nchan,
++                                         const float *c1,
++                                         const float *c2,
++                                         const float *q1,
++                                         const float *q2,
++                                         float *out_data)
++{
++    int            i;
++    __m128         denorm, s, w1, w2;
++
++    denorm = _mm_set1_ps (1e-20f);  // same 1e-20f bias as the scalar fallback in Resampler::process; subtracted again after the loop
++    s = denorm;
++    for (i = 0; i < hl; i += 4)
++    {
++      q2 -= 4 * nchan;
++
++      // s += *q1 * c1 [i];  (one frame per line below; tap i+k is broadcast across the four channels of frame k)
++      w1 = _mm_loadu_ps (&c1 [i]);
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1),             _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (0, 0, 0, 0))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + nchan),     _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (1, 1, 1, 1))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 2 * nchan), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (2, 2, 2, 2))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 3 * nchan), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (3, 3, 3, 3))));
++
++      // s += *q2 * c2 [i];  (frames walked backwards: q2 + 3 * nchan is the frame paired with c2 [i])
++      w2 = _mm_loadu_ps (&c2 [i]);
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 3 * nchan), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 0, 0, 0))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 2 * nchan), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (1, 1, 1, 1))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + nchan),     _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (2, 2, 2, 2))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2),             _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (3, 3, 3, 3))));
++
++      q1 += 4 * nchan;
++    }
++    s = _mm_sub_ps (s, denorm);
++
++    _mm_storeu_ps (out_data, s);
++}
++#endif
++
+ 
+ Resampler::Resampler (void) :
+     _table (0),
+@@ -213,18 +329,42 @@
+               {
+                   float *c1 = _table->_ctab + hl * ph;
+                   float *c2 = _table->_ctab + hl * (np - ph);
+-                  for (c = 0; c < _nchan; c++)
++#ifdef __SSE2__
++                  if ((hl % 4) == 0 && _nchan == 1)
++                    {
++                      *out_data++ = calc_mono_sample_sse (hl, c1, c2, p1, p2);
++                    }
++                  else if ((hl % 4) == 0 && _nchan == 2)
+                   {
+-                      float *q1 = p1 + c;
+-                      float *q2 = p2 + c;
+-                      float s = 1e-20f;
+-                      for (i = 0; i < hl; i++)
++                        if (out_count >= 2)
++                        {
++                          calc_stereo_sample_sse (hl, c1, c2, p1, p2, out_data);
++                        }
++                        else
++                        {
++                            float tmp[4];
++                          calc_stereo_sample_sse (hl, c1, c2, p1, p2, tmp);
++                            out_data[0] = tmp[0];
++                            out_data[1] = tmp[1];
++                        }
++                      out_data += 2;
++                  }
++                  else
++#endif
++                    {
++                      for (c = 0; c < _nchan; c++)
+                       {
+-                          q2 -= _nchan;
+-                          s += *q1 * c1 [i] + *q2 * c2 [i];
+-                          q1 += _nchan;
++                          float *q1 = p1 + c;
++                          float *q2 = p2 + c;
++                          float s = 1e-20f;
++                          for (i = 0; i < hl; i++)
++                          {
++                              q2 -= _nchan;
++                              s += *q1 * c1 [i] + *q2 * c2 [i];
++                              q1 += _nchan;
++                          }
++                          *out_data++ = s - 1e-20f;
+                       }
+-                      *out_data++ = s - 1e-20f;
+                   }
+               }
+               else
+diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/vresampler.cc
+--- orig/zita-resampler-1.3.0/libs/vresampler.cc       2012-10-26 22:58:55.000000000 +0200
++++ zita-resampler-1.3.0/libs/vresampler.cc    2016-09-05 00:33:53.907511211 +0200
+@@ -25,6 +25,152 @@
+ #include <zita-resampler/vresampler.h>
+ 
+ 
++#ifdef __SSE2__
++
++#include <xmmintrin.h>
++// Mono sample with interpolated taps: p1/p2 are samples, q1/q2 point into the coefficient table; hl must be a multiple of 4.
++static inline float calc_mono_sample_sse (int hl,
++                                          float b,
++                                          const float *p1,
++                                          const float *p2,
++                                          const float *q1,
++                                          const float *q2)
++{
++    int            i;
++    __m128         denorm, bs, s, c1, c2, w1, w2, shuf;
++
++    denorm = _mm_set1_ps (1e-25f);  // same 1e-25f bias as the scalar fallback in VResampler::process; subtracted again after the loop
++    bs = _mm_set1_ps (b);
++    s = denorm;
++    for (i = 0; i < hl; i += 4)
++    {
++      p2 -= 4;
++
++      // _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
++      w1 = _mm_loadu_ps (&q1 [i]);
++      w2 = _mm_loadu_ps (&q1 [i + hl]);
++      c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++      // _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
++      w1 = _mm_loadu_ps (&q2 [i]);
++      w2 = _mm_loadu_ps (&q2 [i - hl]);
++      c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++      // s += *p1 * _c1 [i];  (lanes: p1 [0..3] * c1, the four taps just interpolated)
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1), c1));
++
++      // s += *p2 * _c2 [i];  (p2 runs backwards, so c2 is lane-reversed to meet p2 [3..0])
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 1, 2, 3))));
++
++      p1 += 4;
++    }
++    s = _mm_sub_ps (s, denorm);
++
++    // Add all the elements of s together into one. Adapted from
++    // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
++    shuf = _mm_shuffle_ps (s, s, _MM_SHUFFLE (2, 3, 0, 1));
++    s = _mm_add_ps (s, shuf);
++    s = _mm_add_ss (s, _mm_movehl_ps (shuf, s));
++    return _mm_cvtss_f32 (s);
++}
++
++// Same computation as calc_mono_sample_sse, for two interleaved channels (hl must be a multiple of 4).
++// Note: This stores four floats, not two: out_data [0..1] are the results and [2..3] repeat them, so the caller needs room for all four.
++static inline void calc_stereo_sample_sse (int hl,
++                                           float b,
++                                           const float *p1,
++                                           const float *p2,
++                                           const float *q1,
++                                           const float *q2,
++                                           float *out_data)
++{
++    int            i;
++    __m128         denorm, bs, s, c1, c2, w1, w2;
++
++    denorm = _mm_set1_ps (1e-25f);  // same 1e-25f bias as the scalar fallback in VResampler::process; subtracted again after the loop
++    bs = _mm_set1_ps (b);
++    s = denorm;
++    for (i = 0; i < hl; i += 4)
++    {
++      p2 -= 8;
++
++      // _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
++      w1 = _mm_loadu_ps (&q1 [i]);
++      w2 = _mm_loadu_ps (&q1 [i + hl]);
++      c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++      // _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
++      w1 = _mm_loadu_ps (&q2 [i]);
++      w2 = _mm_loadu_ps (&q2 [i - hl]);
++      c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++      // s += *p1 * _c1 [i];  (p1 [0..7] is four frames; each tap is doubled so it multiplies both channels)
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1),     _mm_unpacklo_ps (c1, c1)));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 4), _mm_unpackhi_ps (c1, c1)));
++
++      // s += *p2 * _c2 [i];  (p2 was just moved back four frames; taps are doubled in reverse order to match)
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 4), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 1, 1))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2),     _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 3, 3))));
++
++      p1 += 8;
++    }
++    s = _mm_sub_ps (s, denorm);
++    s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));  // fold lanes 2/3 into lanes 0/1 (channel 0, channel 1)
++
++    _mm_storeu_ps (out_data, s);
++}
++// Four adjacent channels (of nchan interleaved) per call, with interpolated taps; stores 4 floats to out_data. hl must be a multiple of 4.
++static inline void calc_quad_sample_sse (int hl,
++                                         int nchan,
++                                         float b,
++                                         const float *p1,
++                                         const float *p2,
++                                         const float *q1,
++                                         const float *q2,
++                                         float *out_data)
++{
++    int            i;
++    __m128         denorm, bs, s, c1, c2, w1, w2;
++
++    denorm = _mm_set1_ps (1e-25f);  // same 1e-25f bias as the scalar fallback in VResampler::process; subtracted again after the loop
++    bs = _mm_set1_ps (b);
++    s = denorm;
++    for (i = 0; i < hl; i += 4)
++    {
++      p2 -= 4 * nchan;
++
++      // _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
++      w1 = _mm_loadu_ps (&q1 [i]);
++      w2 = _mm_loadu_ps (&q1 [i + hl]);
++      c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++      // _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
++      w1 = _mm_loadu_ps (&q2 [i]);
++      w2 = _mm_loadu_ps (&q2 [i - hl]);
++      c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++      // s += *p1 * _c1 [i];  (one frame per line below; tap i+k is broadcast across the four channels of frame k)
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1),             _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (0, 0, 0, 0))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + nchan),     _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (1, 1, 1, 1))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 2 * nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (2, 2, 2, 2))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 3 * nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (3, 3, 3, 3))));
++
++      // s += *p2 * _c2 [i];  (frames walked backwards: p2 + 3 * nchan is the frame paired with tap i)
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 3 * nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 0, 0))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 2 * nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (1, 1, 1, 1))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + nchan),     _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 2, 2))));
++      s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2),             _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (3, 3, 3, 3))));
++
++      p1 += 4 * nchan;
++    }
++    s = _mm_sub_ps (s, denorm);
++
++    _mm_storeu_ps (out_data, s);
++}
++
++#endif
++
++
+ VResampler::VResampler (void) :
+     _table (0),
+     _nchan (0),
+@@ -163,7 +309,7 @@
+ 
+ int VResampler::process (void)
+ {
+-    unsigned int   k, np, in, nr, n, c;
++    unsigned int   j, k, np, in, nr, n, c;
+     int            i, hl, nz;
+     double         ph, dp, dd; 
+     float          a, b, *p1, *p2, *q1, *q2;
+@@ -212,23 +358,55 @@
+                   a = 1.0f - b;
+                   q1 = _table->_ctab + hl * k;
+                   q2 = _table->_ctab + hl * (np - k);
+-                  for (i = 0; i < hl; i++)
++#ifdef __SSE2__
++                  if ((hl % 4) == 0 && _nchan == 1)
++                  {
++                      *out_data++ = calc_mono_sample_sse (hl, b, p1, p2, q1, q2);
++                  }
++                  else if ((hl % 4) == 0 && _nchan == 2)
+                   {
+-                        _c1 [i] = a * q1 [i] + b * q1 [i + hl];
+-                      _c2 [i] = a * q2 [i] + b * q2 [i - hl];
++                      if (out_count >= 2)
++                      {
++                          calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, out_data);
++                      }
++                      else
++                      {
++                          float tmp[4];
++                          calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, tmp);
++                          out_data[0] = tmp[0];
++                          out_data[1] = tmp[1];
++                      }
++                      out_data += 2;
++                  }
++                  else if ((hl % 4) == 0 && (_nchan % 4) == 0)
++                  {
++                      for (j = 0; j < _nchan; j += 4)
++                      {
++                          calc_quad_sample_sse (hl, _nchan, b, p1 + j, p2 + j, q1, q2, out_data + j);
++                      }
++                      out_data += _nchan;
+                   }
+-                  for (c = 0; c < _nchan; c++)
++                  else
++#endif
+                   {
+-                      q1 = p1 + c;
+-                      q2 = p2 + c;
+-                      a = 1e-25f;
+                       for (i = 0; i < hl; i++)
+                       {
+-                          q2 -= _nchan;
+-                          a += *q1 * _c1 [i] + *q2 * _c2 [i];
+-                          q1 += _nchan;
++                          _c1 [i] = a * q1 [i] + b * q1 [i + hl];
++                          _c2 [i] = a * q2 [i] + b * q2 [i - hl];
++                      }
++                      for (c = 0; c < _nchan; c++)
++                      {
++                          q1 = p1 + c;
++                          q2 = p2 + c;
++                          a = 1e-25f;
++                          for (i = 0; i < hl; i++)
++                          {
++                              q2 -= _nchan;
++                              a += *q1 * _c1 [i] + *q2 * _c2 [i];
++                              q1 += _nchan;
++                          }
++                          *out_data++ = a - 1e-25f;
+                       }
+-                      *out_data++ = a - 1e-25f;
+                   }
+               }
+               else