From: Steinar H. Gunderson Date: Sun, 4 Sep 2016 23:09:17 +0000 (+0200) Subject: Update the zita-resampler SSE2 patch. X-Git-Tag: 1.4.0~63 X-Git-Url: https://git.sesse.net/?p=nageru;a=commitdiff_plain;h=9bd4bf03ccf39a60aaa778b52ab9df7ae828a33e Update the zita-resampler SSE2 patch. This version supports not only two channels, but mono, stereo or multiples of four. (In time, we might reorganize so that e.g. 11 channels are split into 8-, 2- and 1-channel resamplers.) --- diff --git a/patches/zita-resampler-sse.diff b/patches/zita-resampler-sse.diff index 43cfef8..4954515 100644 --- a/patches/zita-resampler-sse.diff +++ b/patches/zita-resampler-sse.diff @@ -1,6 +1,6 @@ diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/resampler.cc --- orig/zita-resampler-1.3.0/libs/resampler.cc 2012-10-26 22:58:55.000000000 +0200 -+++ zita-resampler-1.3.0/libs/resampler.cc 2015-11-15 12:27:42.764591015 +0100 ++++ zita-resampler-1.3.0/libs/resampler.cc 2016-09-05 00:30:34.520191288 +0200 @@ -24,6 +24,10 @@ #include #include @@ -12,17 +12,54 @@ diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/r static unsigned int gcd (unsigned int a, unsigned int b) { -@@ -47,6 +51,45 @@ +@@ -47,6 +51,118 @@ return 1; } +#ifdef __SSE2__ + ++static inline float calc_mono_sample_sse (unsigned int hl, ++ const float *c1, ++ const float *c2, ++ const float *q1, ++ const float *q2) ++{ ++ unsigned int i; ++ __m128 denorm, s, w1, w2, shuf; ++ ++ denorm = _mm_set1_ps (1e-20f); ++ s = denorm; ++ for (i = 0; i < hl; i += 4) ++ { ++ q2 -= 4; ++ ++ // s += *q1 * c1 [i]; ++ w1 = _mm_loadu_ps (&c1 [i]); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1), w1)); ++ ++ // s += *q2 * c2 [i]; ++ w2 = _mm_loadu_ps (&c2 [i]); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 1, 2, 3)))); ++ ++ q1 += 4; ++ } ++ s = _mm_sub_ps (s, denorm); ++ ++ // Add all the elements of s together into one. 
Adapted from ++ // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86 ++ shuf = _mm_shuffle_ps (s, s, _MM_SHUFFLE (2, 3, 0, 1)); ++ s = _mm_add_ps (s, shuf); ++ s = _mm_add_ss (s, _mm_movehl_ps (shuf, s)); ++ return _mm_cvtss_f32 (s); ++} ++ ++// Note: This writes four floats instead of two (the last two are garbage). ++// The caller will need to make sure there is room for all four. +static inline void calc_stereo_sample_sse (unsigned int hl, -+ float *c1, -+ float *c2, -+ float *q1, -+ float *q2, ++ const float *c1, ++ const float *c2, ++ const float *q1, ++ const float *q2, + float *out_data) +{ + unsigned int i; @@ -49,28 +86,78 @@ diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/r + s = _mm_sub_ps (s, denorm); + s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2))); + -+ // Writes two bytes more than we want, but this is fine since out_count >= 2. + _mm_storeu_ps (out_data, s); +} + ++static inline void calc_quad_sample_sse (int hl, ++ int nchan, ++ const float *c1, ++ const float *c2, ++ const float *q1, ++ const float *q2, ++ float *out_data) ++{ ++ int i; ++ __m128 denorm, s, w1, w2; ++ ++ denorm = _mm_set1_ps (1e-20f); ++ s = denorm; ++ for (i = 0; i < hl; i += 4) ++ { ++ q2 -= 4 * nchan; ++ ++ // s += *p1 * _c1 [i]; ++ w1 = _mm_loadu_ps (&c1 [i]); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (0, 0, 0, 0)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + nchan), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (1, 1, 1, 1)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 2 * nchan), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (2, 2, 2, 2)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 3 * nchan), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (3, 3, 3, 3)))); ++ ++ // s += *p2 * _c2 [i]; ++ w2 = _mm_loadu_ps (&c2 [i]); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 3 * nchan), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 0, 0, 
0)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 2 * nchan), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (1, 1, 1, 1)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + nchan), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (2, 2, 2, 2)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (3, 3, 3, 3)))); ++ ++ q1 += 4 * nchan; ++ } ++ s = _mm_sub_ps (s, denorm); ++ ++ _mm_storeu_ps (out_data, s); ++} +#endif + Resampler::Resampler (void) : _table (0), -@@ -213,18 +256,28 @@ +@@ -213,18 +329,42 @@ { float *c1 = _table->_ctab + hl * ph; float *c2 = _table->_ctab + hl * (np - ph); - for (c = 0; c < _nchan; c++) +#ifdef __SSE2__ -+ if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2) ++ if ((hl % 4) == 0 && _nchan == 1) ++ { ++ *out_data++ = calc_mono_sample_sse (hl, c1, c2, p1, p2); ++ } ++ else if ((hl % 4) == 0 && _nchan == 2) { - float *q1 = p1 + c; - float *q2 = p2 + c; - float s = 1e-20f; - for (i = 0; i < hl; i++) -+ calc_stereo_sample_sse (hl, c1, c2, p1, p2, out_data); ++ if (out_count >= 2) ++ { ++ calc_stereo_sample_sse (hl, c1, c2, p1, p2, out_data); ++ } ++ else ++ { ++ float tmp[4]; ++ calc_stereo_sample_sse (hl, c1, c2, p1, p2, tmp); ++ out_data[0] = tmp[0]; ++ out_data[1] = tmp[1]; ++ } + out_data += 2; + } + else @@ -96,15 +183,10 @@ diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/r } } else -@@ -260,4 +313,3 @@ - return 0; - } - -- diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/vresampler.cc --- orig/zita-resampler-1.3.0/libs/vresampler.cc 2012-10-26 22:58:55.000000000 +0200 -+++ zita-resampler-1.3.0/libs/vresampler.cc 2015-11-15 12:27:58.424544882 +0100 -@@ -25,6 +25,58 @@ ++++ zita-resampler-1.3.0/libs/vresampler.cc 2016-09-05 00:33:53.907511211 +0200 +@@ -25,6 +25,152 @@ #include @@ -112,12 +194,59 @@ diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/ + +#include + ++static inline float calc_mono_sample_sse (int hl, ++ 
float b, ++ const float *p1, ++ const float *p2, ++ const float *q1, ++ const float *q2) ++{ ++ int i; ++ __m128 denorm, bs, s, c1, c2, w1, w2, shuf; ++ ++ denorm = _mm_set1_ps (1e-25f); ++ bs = _mm_set1_ps (b); ++ s = denorm; ++ for (i = 0; i < hl; i += 4) ++ { ++ p2 -= 4; ++ ++ // _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]); ++ w1 = _mm_loadu_ps (&q1 [i]); ++ w2 = _mm_loadu_ps (&q1 [i + hl]); ++ c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1))); ++ ++ // _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]); ++ w1 = _mm_loadu_ps (&q2 [i]); ++ w2 = _mm_loadu_ps (&q2 [i - hl]); ++ c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1))); ++ ++ // s += *p1 * _c1 [i]; ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1), c1)); ++ ++ // s += *p2 * _c2 [i]; ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 1, 2, 3)))); ++ ++ p1 += 4; ++ } ++ s = _mm_sub_ps (s, denorm); ++ ++ // Add all the elements of s together into one. Adapted from ++ // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86 ++ shuf = _mm_shuffle_ps (s, s, _MM_SHUFFLE (2, 3, 0, 1)); ++ s = _mm_add_ps (s, shuf); ++ s = _mm_add_ss (s, _mm_movehl_ps (shuf, s)); ++ return _mm_cvtss_f32 (s); ++} ++ ++// Note: This writes four floats instead of two (the last two are garbage). ++// The caller will need to make sure there is room for all four. +static inline void calc_stereo_sample_sse (int hl, + float b, -+ float *p1, -+ float *p2, -+ float *q1, -+ float *q2, ++ const float *p1, ++ const float *p2, ++ const float *q1, ++ const float *q2, + float *out_data) +{ + int i; @@ -153,7 +282,54 @@ diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/ + s = _mm_sub_ps (s, denorm); + s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2))); + -+ // Writes two bytes more than we want, but this is fine since out_count >= 2. 
++ _mm_storeu_ps (out_data, s); ++} ++ ++static inline void calc_quad_sample_sse (int hl, ++ int nchan, ++ float b, ++ const float *p1, ++ const float *p2, ++ const float *q1, ++ const float *q2, ++ float *out_data) ++{ ++ int i; ++ __m128 denorm, bs, s, c1, c2, w1, w2; ++ ++ denorm = _mm_set1_ps (1e-25f); ++ bs = _mm_set1_ps (b); ++ s = denorm; ++ for (i = 0; i < hl; i += 4) ++ { ++ p2 -= 4 * nchan; ++ ++ // _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]); ++ w1 = _mm_loadu_ps (&q1 [i]); ++ w2 = _mm_loadu_ps (&q1 [i + hl]); ++ c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1))); ++ ++ // _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]); ++ w1 = _mm_loadu_ps (&q2 [i]); ++ w2 = _mm_loadu_ps (&q2 [i - hl]); ++ c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1))); ++ ++ // s += *p1 * _c1 [i]; ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (0, 0, 0, 0)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (1, 1, 1, 1)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 2 * nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (2, 2, 2, 2)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 3 * nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (3, 3, 3, 3)))); ++ ++ // s += *p2 * _c2 [i]; ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 3 * nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 0, 0)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 2 * nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (1, 1, 1, 1)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 2, 2)))); ++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (3, 3, 3, 3)))); ++ ++ p1 += 4 * nchan; ++ } ++ s = _mm_sub_ps (s, denorm); ++ + _mm_storeu_ps (out_data, s); +} + @@ -163,18 +339,49 @@ diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/ VResampler::VResampler (void) : _table (0), _nchan 
(0), -@@ -212,23 +264,33 @@ +@@ -163,7 +309,7 @@ + + int VResampler::process (void) + { +- unsigned int k, np, in, nr, n, c; ++ unsigned int j, k, np, in, nr, n, c; + int i, hl, nz; + double ph, dp, dd; + float a, b, *p1, *p2, *q1, *q2; +@@ -212,23 +358,55 @@ a = 1.0f - b; q1 = _table->_ctab + hl * k; q2 = _table->_ctab + hl * (np - k); - for (i = 0; i < hl; i++) +#ifdef __SSE2__ -+ if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2) ++ if ((hl % 4) == 0 && _nchan == 1) ++ { ++ *out_data++ = calc_mono_sample_sse (hl, b, p1, p2, q1, q2); ++ } ++ else if ((hl % 4) == 0 && _nchan == 2) { - _c1 [i] = a * q1 [i] + b * q1 [i + hl]; - _c2 [i] = a * q2 [i] + b * q2 [i - hl]; -+ calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, out_data); ++ if (out_count >= 2) ++ { ++ calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, out_data); ++ } ++ else ++ { ++ float tmp[4]; ++ calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, tmp); ++ out_data[0] = tmp[0]; ++ out_data[1] = tmp[1]; ++ } + out_data += 2; ++ } ++ else if ((hl % 4) == 0 && (_nchan % 4) == 0) ++ { ++ for (j = 0; j < _nchan; j += 4) ++ { ++ calc_quad_sample_sse (hl, _nchan, b, p1 + j, p2 + j, q1, q2, out_data + j); ++ } ++ out_data += _nchan; } - for (c = 0; c < _nchan; c++) + else @@ -183,31 +390,28 @@ diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/ - q1 = p1 + c; - q2 = p2 + c; - a = 1e-25f; -- for (i = 0; i < hl; i++) -- { + for (i = 0; i < hl; i++) + { - q2 -= _nchan; - a += *q1 * _c1 [i] + *q2 * _c2 [i]; - q1 += _nchan; -- } ++ _c1 [i] = a * q1 [i] + b * q1 [i + hl]; ++ _c2 [i] = a * q2 [i] + b * q2 [i - hl]; ++ } ++ for (c = 0; c < _nchan; c++) ++ { ++ q1 = p1 + c; ++ q2 = p2 + c; ++ a = 1e-25f; ++ for (i = 0; i < hl; i++) ++ { ++ q2 -= _nchan; ++ a += *q1 * _c1 [i] + *q2 * _c2 [i]; ++ q1 += _nchan; ++ } ++ *out_data++ = a - 1e-25f; + } - *out_data++ = a - 1e-25f; -+ for (i = 0; i < hl; i++) -+ { -+ _c1 [i] = a * q1 [i] + b * q1 [i + hl]; -+ _c2 [i] = a * q2 [i] + b * q2 [i - hl]; -+ } -+ for 
(c = 0; c < _nchan; c++) -+ { -+ q1 = p1 + c; -+ q2 = p2 + c; -+ a = 1e-25f; -+ for (i = 0; i < hl; i++) -+ { -+ q2 -= _nchan; -+ a += *q1 * _c1 [i] + *q2 * _c2 [i]; -+ q1 += _nchan; -+ } -+ *out_data++ = a - 1e-25f; -+ } } } else