From: Steinar H. Gunderson <sgunderson@bigfoot.com>
Date: Sun, 4 Sep 2016 23:09:17 +0000 (+0200)
Subject: Update the zita-resampler SSE2 patch.
X-Git-Tag: 1.4.0~63
X-Git-Url: https://git.sesse.net/?p=nageru;a=commitdiff_plain;h=9bd4bf03ccf39a60aaa778b52ab9df7ae828a33e

Update the zita-resampler SSE2 patch.

This version supports not only two channels, but mono, stereo or
multiples of four. (In time, we might reorganize so that
e.g. 11 channels are split into 8-, 2- and 1-channel resamplers.)
---

diff --git a/patches/zita-resampler-sse.diff b/patches/zita-resampler-sse.diff
index 43cfef8..4954515 100644
--- a/patches/zita-resampler-sse.diff
+++ b/patches/zita-resampler-sse.diff
@@ -1,6 +1,6 @@
 diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/resampler.cc
 --- orig/zita-resampler-1.3.0/libs/resampler.cc	2012-10-26 22:58:55.000000000 +0200
-+++ zita-resampler-1.3.0/libs/resampler.cc	2015-11-15 12:27:42.764591015 +0100
++++ zita-resampler-1.3.0/libs/resampler.cc	2016-09-05 00:30:34.520191288 +0200
 @@ -24,6 +24,10 @@
  #include <math.h>
  #include <zita-resampler/resampler.h>
@@ -12,17 +12,54 @@ diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/r
  
  static unsigned int gcd (unsigned int a, unsigned int b)
  {
-@@ -47,6 +51,45 @@
+@@ -47,6 +51,118 @@
      return 1; 
  }
  
 +#ifdef __SSE2__
 +
++static inline float calc_mono_sample_sse (unsigned int hl,
++                                          const float *c1,
++                                          const float *c2,
++                                          const float *q1,
++                                          const float *q2)
++{
++    unsigned int   i;
++    __m128         denorm, s, w1, w2, shuf;
++
++    denorm = _mm_set1_ps (1e-20f);
++    s = denorm;
++    for (i = 0; i < hl; i += 4)
++    {
++	q2 -= 4;
++
++	// s += *q1 * c1 [i];
++	w1 = _mm_loadu_ps (&c1 [i]);
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1), w1));
++
++	// s += *q2 * c2 [i];
++	w2 = _mm_loadu_ps (&c2 [i]);
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 1, 2, 3))));
++
++	q1 += 4;
++    }
++    s = _mm_sub_ps (s, denorm);
++
++    // Add all the elements of s together into one. Adapted from
++    // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
++    shuf = _mm_shuffle_ps (s, s, _MM_SHUFFLE (2, 3, 0, 1));
++    s = _mm_add_ps (s, shuf);
++    s = _mm_add_ss (s, _mm_movehl_ps (shuf, s));
++    return _mm_cvtss_f32 (s);
++}
++
++// Note: This writes four floats instead of two (the last two are garbage).
++// The caller will need to make sure there is room for all four.
 +static inline void calc_stereo_sample_sse (unsigned int hl,
-+                                           float *c1,
-+                                           float *c2,
-+                                           float *q1,
-+                                           float *q2,
++                                           const float *c1,
++                                           const float *c2,
++                                           const float *q1,
++                                           const float *q2,
 +                                           float *out_data)
 +{
 +    unsigned int   i;
@@ -49,28 +86,78 @@ diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/r
 +    s = _mm_sub_ps (s, denorm);
 +    s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));
 +
-+    // Writes two bytes more than we want, but this is fine since out_count >= 2.
 +    _mm_storeu_ps (out_data, s);
 +}
 +
++static inline void calc_quad_sample_sse (int hl,
++                                         int nchan,
++                                         const float *c1,
++                                         const float *c2,
++                                         const float *q1,
++                                         const float *q2,
++                                         float *out_data)
++{
++    int            i;
++    __m128         denorm, s, w1, w2;
++
++    denorm = _mm_set1_ps (1e-20f);
++    s = denorm;
++    for (i = 0; i < hl; i += 4)
++    {
++	q2 -= 4 * nchan;
++
++	// s += *q1 * c1 [i];
++	w1 = _mm_loadu_ps (&c1 [i]);
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1),             _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (0, 0, 0, 0))));
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + nchan),     _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (1, 1, 1, 1))));
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 2 * nchan), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (2, 2, 2, 2))));
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 3 * nchan), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (3, 3, 3, 3))));
++
++	// s += *q2 * c2 [i];
++	w2 = _mm_loadu_ps (&c2 [i]);
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 3 * nchan), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 0, 0, 0))));
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 2 * nchan), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (1, 1, 1, 1))));
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + nchan),     _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (2, 2, 2, 2))));
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2),             _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (3, 3, 3, 3))));
++
++	q1 += 4 * nchan;
++    }
++    s = _mm_sub_ps (s, denorm);
++
++    _mm_storeu_ps (out_data, s);
++}
 +#endif
 +
  
  Resampler::Resampler (void) :
      _table (0),
-@@ -213,18 +256,28 @@
+@@ -213,18 +329,42 @@
  		{
  		    float *c1 = _table->_ctab + hl * ph;
  		    float *c2 = _table->_ctab + hl * (np - ph);
 -		    for (c = 0; c < _nchan; c++)
 +#ifdef __SSE2__
-+		    if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2)
++		    if ((hl % 4) == 0 && _nchan == 1)
++                    {
++			*out_data++ = calc_mono_sample_sse (hl, c1, c2, p1, p2);
++                    }
++		    else if ((hl % 4) == 0 && _nchan == 2)
  		    {
 -			float *q1 = p1 + c;
 -			float *q2 = p2 + c;
 -			float s = 1e-20f;
 -			for (i = 0; i < hl; i++)
-+			calc_stereo_sample_sse (hl, c1, c2, p1, p2, out_data);
++                        if (out_count >= 2)
++                        {
++			    calc_stereo_sample_sse (hl, c1, c2, p1, p2, out_data);
++                        }
++                        else
++                        {
++                            float tmp[4];
++			    calc_stereo_sample_sse (hl, c1, c2, p1, p2, tmp);
++                            out_data[0] = tmp[0];
++                            out_data[1] = tmp[1];
++                        }
 +			out_data += 2;
 +		    }
 +		    else
@@ -96,15 +183,10 @@ diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/r
  		    }
  		}
  		else
-@@ -260,4 +313,3 @@
-     return 0;
- }
- 
--
 diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/vresampler.cc
 --- orig/zita-resampler-1.3.0/libs/vresampler.cc	2012-10-26 22:58:55.000000000 +0200
-+++ zita-resampler-1.3.0/libs/vresampler.cc	2015-11-15 12:27:58.424544882 +0100
-@@ -25,6 +25,58 @@
++++ zita-resampler-1.3.0/libs/vresampler.cc	2016-09-05 00:33:53.907511211 +0200
+@@ -25,6 +25,152 @@
  #include <zita-resampler/vresampler.h>
  
  
@@ -112,12 +194,59 @@ diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/
 +
 +#include <xmmintrin.h>
 +
++static inline float calc_mono_sample_sse (int hl,
++                                          float b,
++                                          const float *p1,
++                                          const float *p2,
++                                          const float *q1,
++                                          const float *q2)
++{
++    int            i;
++    __m128         denorm, bs, s, c1, c2, w1, w2, shuf;
++
++    denorm = _mm_set1_ps (1e-25f);
++    bs = _mm_set1_ps (b);
++    s = denorm;
++    for (i = 0; i < hl; i += 4)
++    {
++	p2 -= 4;
++
++	// _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
++	w1 = _mm_loadu_ps (&q1 [i]);
++	w2 = _mm_loadu_ps (&q1 [i + hl]);
++	c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++	// _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
++	w1 = _mm_loadu_ps (&q2 [i]);
++	w2 = _mm_loadu_ps (&q2 [i - hl]);
++	c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++	// s += *p1 * _c1 [i];
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1), c1));
++
++	// s += *p2 * _c2 [i];
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 1, 2, 3))));
++
++	p1 += 4;
++    }
++    s = _mm_sub_ps (s, denorm);
++
++    // Add all the elements of s together into one. Adapted from
++    // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
++    shuf = _mm_shuffle_ps (s, s, _MM_SHUFFLE (2, 3, 0, 1));
++    s = _mm_add_ps (s, shuf);
++    s = _mm_add_ss (s, _mm_movehl_ps (shuf, s));
++    return _mm_cvtss_f32 (s);
++}
++
++// Note: This writes four floats instead of two (the last two are garbage).
++// The caller will need to make sure there is room for all four.
 +static inline void calc_stereo_sample_sse (int hl,
 +                                           float b,
-+                                           float *p1,
-+                                           float *p2,
-+                                           float *q1,
-+                                           float *q2,
++                                           const float *p1,
++                                           const float *p2,
++                                           const float *q1,
++                                           const float *q2,
 +                                           float *out_data)
 +{
 +    int            i;
@@ -153,7 +282,54 @@ diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/
 +    s = _mm_sub_ps (s, denorm);
 +    s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));
 +
-+    // Writes two bytes more than we want, but this is fine since out_count >= 2.
++    _mm_storeu_ps (out_data, s);
++}
++
++static inline void calc_quad_sample_sse (int hl,
++                                         int nchan,
++                                         float b,
++                                         const float *p1,
++                                         const float *p2,
++                                         const float *q1,
++                                         const float *q2,
++                                         float *out_data)
++{
++    int            i;
++    __m128         denorm, bs, s, c1, c2, w1, w2;
++
++    denorm = _mm_set1_ps (1e-25f);
++    bs = _mm_set1_ps (b);
++    s = denorm;
++    for (i = 0; i < hl; i += 4)
++    {
++	p2 -= 4 * nchan;
++
++	// _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
++	w1 = _mm_loadu_ps (&q1 [i]);
++	w2 = _mm_loadu_ps (&q1 [i + hl]);
++	c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++	// _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
++	w1 = _mm_loadu_ps (&q2 [i]);
++	w2 = _mm_loadu_ps (&q2 [i - hl]);
++	c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++	// s += *p1 * _c1 [i];
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1),             _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (0, 0, 0, 0))));
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + nchan),     _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (1, 1, 1, 1))));
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 2 * nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (2, 2, 2, 2))));
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 3 * nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (3, 3, 3, 3))));
++
++	// s += *p2 * _c2 [i];
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 3 * nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 0, 0))));
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 2 * nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (1, 1, 1, 1))));
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + nchan),     _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 2, 2))));
++	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2),             _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (3, 3, 3, 3))));
++
++	p1 += 4 * nchan;
++    }
++    s = _mm_sub_ps (s, denorm);
++
 +    _mm_storeu_ps (out_data, s);
 +}
 +
@@ -163,18 +339,49 @@ diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/
  VResampler::VResampler (void) :
      _table (0),
      _nchan (0),
-@@ -212,23 +264,33 @@
+@@ -163,7 +309,7 @@
+ 
+ int VResampler::process (void)
+ {
+-    unsigned int   k, np, in, nr, n, c;
++    unsigned int   j, k, np, in, nr, n, c;
+     int            i, hl, nz;
+     double         ph, dp, dd; 
+     float          a, b, *p1, *p2, *q1, *q2;
+@@ -212,23 +358,55 @@
  		    a = 1.0f - b;
  		    q1 = _table->_ctab + hl * k;
  		    q2 = _table->_ctab + hl * (np - k);
 -     		    for (i = 0; i < hl; i++)
 +#ifdef __SSE2__
-+		    if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2)
++		    if ((hl % 4) == 0 && _nchan == 1)
++		    {
++			*out_data++ = calc_mono_sample_sse (hl, b, p1, p2, q1, q2);
++		    }
++		    else if ((hl % 4) == 0 && _nchan == 2)
  		    {
 -                        _c1 [i] = a * q1 [i] + b * q1 [i + hl];
 -    		        _c2 [i] = a * q2 [i] + b * q2 [i - hl];
-+			calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, out_data);
++			if (out_count >= 2)
++			{
++			    calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, out_data);
++			}
++			else
++			{
++			    float tmp[4];
++			    calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, tmp);
++			    out_data[0] = tmp[0];
++			    out_data[1] = tmp[1];
++			}
 +			out_data += 2;
++		    }
++		    else if ((hl % 4) == 0 && (_nchan % 4) == 0)
++		    {
++			for (j = 0; j < _nchan; j += 4)
++			{
++			    calc_quad_sample_sse (hl, _nchan, b, p1 + j, p2 + j, q1, q2, out_data + j);
++			}
++			out_data += _nchan;
  		    }
 -		    for (c = 0; c < _nchan; c++)
 +		    else
@@ -183,31 +390,28 @@ diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/
 -			q1 = p1 + c;
 -			q2 = p2 + c;
 -			a = 1e-25f;
--			for (i = 0; i < hl; i++)
--			{
+ 			for (i = 0; i < hl; i++)
+ 			{
 -			    q2 -= _nchan;
 -			    a += *q1 * _c1 [i] + *q2 * _c2 [i];
 -			    q1 += _nchan;
--			}
++			    _c1 [i] = a * q1 [i] + b * q1 [i + hl];
++			    _c2 [i] = a * q2 [i] + b * q2 [i - hl];
++			}
++			for (c = 0; c < _nchan; c++)
++			{
++			    q1 = p1 + c;
++			    q2 = p2 + c;
++			    a = 1e-25f;
++			    for (i = 0; i < hl; i++)
++			    {
++				q2 -= _nchan;
++				a += *q1 * _c1 [i] + *q2 * _c2 [i];
++				q1 += _nchan;
++			    }
++			    *out_data++ = a - 1e-25f;
+ 			}
 -			*out_data++ = a - 1e-25f;
-+		        for (i = 0; i < hl; i++)
-+		        {
-+		            _c1 [i] = a * q1 [i] + b * q1 [i + hl];
-+		            _c2 [i] = a * q2 [i] + b * q2 [i - hl];
-+		        }
-+		        for (c = 0; c < _nchan; c++)
-+		        {
-+		            q1 = p1 + c;
-+		            q2 = p2 + c;
-+		            a = 1e-25f;
-+		            for (i = 0; i < hl; i++)
-+		            {
-+		                q2 -= _nchan;
-+		                a += *q1 * _c1 [i] + *q2 * _c2 [i];
-+		                q1 += _nchan;
-+		            }
-+		            *out_data++ = a - 1e-25f;
-+		        }
  		    }
  		}
  		else