1 diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/resampler.cc
2 --- orig/zita-resampler-1.3.0/libs/resampler.cc 2012-10-26 22:58:55.000000000 +0200
3 +++ zita-resampler-1.3.0/libs/resampler.cc 2015-11-15 12:27:42.764591015 +0100
6 #include <zita-resampler/resampler.h>
9 +#include <xmmintrin.h>
13 static unsigned int gcd (unsigned int a, unsigned int b)
21 +static inline void calc_stereo_sample_sse (unsigned int hl,
29 + __m128 denorm, s, w1, w2;
31 + denorm = _mm_set1_ps (1e-20f);
33 + for (i = 0; i < hl; i += 4)
37 + // s += *q1 * c1 [i];
38 + w1 = _mm_loadu_ps (&c1 [i]);
39 + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1), _mm_unpacklo_ps (w1, w1)));
40 + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 4), _mm_unpackhi_ps (w1, w1)));
42 + // s += *q2 * c2 [i];
43 + w2 = _mm_loadu_ps (&c2 [i]);
44 + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 4), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 0, 1, 1))));
45 + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (2, 2, 3, 3))));
49 + s = _mm_sub_ps (s, denorm);
50 + s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));
52 + // Writes two floats (8 bytes) more than we want, but this is fine since out_count >= 2.
53 + _mm_storeu_ps (out_data, s);
59 Resampler::Resampler (void) :
63 float *c1 = _table->_ctab + hl * ph;
64 float *c2 = _table->_ctab + hl * (np - ph);
65 - for (c = 0; c < _nchan; c++)
67 + if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2)
72 - for (i = 0; i < hl; i++)
73 + calc_stereo_sample_sse (hl, c1, c2, p1, p2, out_data);
79 + for (c = 0; c < _nchan; c++)
82 - s += *q1 * c1 [i] + *q2 * c2 [i];
87 + for (i = 0; i < hl; i++)
90 + s += *q1 * c1 [i] + *q2 * c2 [i];
93 + *out_data++ = s - 1e-20f;
95 - *out_data++ = s - 1e-20f;
104 diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/vresampler.cc
105 --- orig/zita-resampler-1.3.0/libs/vresampler.cc 2012-10-26 22:58:55.000000000 +0200
106 +++ zita-resampler-1.3.0/libs/vresampler.cc 2015-11-15 12:27:58.424544882 +0100
108 #include <zita-resampler/vresampler.h>
113 +#include <xmmintrin.h>
115 +static inline void calc_stereo_sample_sse (int hl,
124 + __m128 denorm, bs, s, c1, c2, w1, w2;
126 + denorm = _mm_set1_ps (1e-25f);
127 + bs = _mm_set1_ps (b);
129 + for (i = 0; i < hl; i += 4)
133 + // _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
134 + w1 = _mm_loadu_ps (&q1 [i]);
135 + w2 = _mm_loadu_ps (&q1 [i + hl]);
136 + c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
138 + // _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
139 + w1 = _mm_loadu_ps (&q2 [i]);
140 + w2 = _mm_loadu_ps (&q2 [i - hl]);
141 + c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
143 + // s += *p1 * _c1 [i];
144 + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1), _mm_unpacklo_ps (c1, c1)));
145 + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 4), _mm_unpackhi_ps (c1, c1)));
147 + // s += *p2 * _c2 [i];
148 + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 4), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 1, 1))));
149 + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 3, 3))));
153 + s = _mm_sub_ps (s, denorm);
154 + s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));
156 + // Writes two floats (8 bytes) more than we want, but this is fine since out_count >= 2.
157 + _mm_storeu_ps (out_data, s);
163 VResampler::VResampler (void) :
166 @@ -212,23 +264,33 @@
168 q1 = _table->_ctab + hl * k;
169 q2 = _table->_ctab + hl * (np - k);
170 - for (i = 0; i < hl; i++)
172 + if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2)
174 - _c1 [i] = a * q1 [i] + b * q1 [i + hl];
175 - _c2 [i] = a * q2 [i] + b * q2 [i - hl];
176 + calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, out_data);
179 - for (c = 0; c < _nchan; c++)
186 - for (i = 0; i < hl; i++)
189 - a += *q1 * _c1 [i] + *q2 * _c2 [i];
192 - *out_data++ = a - 1e-25f;
193 + for (i = 0; i < hl; i++)
195 + _c1 [i] = a * q1 [i] + b * q1 [i + hl];
196 + _c2 [i] = a * q2 [i] + b * q2 [i - hl];
198 + for (c = 0; c < _nchan; c++)
203 + for (i = 0; i < hl; i++)
206 + a += *q1 * _c1 [i] + *q2 * _c2 [i];
209 + *out_data++ = a - 1e-25f;