diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/resampler.cc
--- orig/zita-resampler-1.3.0/libs/resampler.cc 2012-10-26 22:58:55.000000000 +0200
+++ zita-resampler-1.3.0/libs/resampler.cc 2016-09-05 00:30:34.520191288 +0200
 #include <zita-resampler/resampler.h>
+#include <xmmintrin.h>
 static unsigned int gcd (unsigned int a, unsigned int b)
+static inline float calc_mono_sample_sse (unsigned int hl,
+    __m128 denorm, s, w1, w2, shuf;
+    denorm = _mm_set1_ps (1e-20f);
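+    // denorm is subtracted from the accumulated sum below, keeping the
+    // result out of the denormal range (the scalar path subtracts the same
+    // 1e-20f constant).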
+    for (i = 0; i < hl; i += 4)
+        // s += *q1 * c1 [i];
+        w1 = _mm_loadu_ps (&c1 [i]);
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1), w1));
+        // s += *q2 * c2 [i];
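+        // (the coefficient order is reversed below, since q2 walks
+        // backwards through the sample buffer)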
+        w2 = _mm_loadu_ps (&c2 [i]);
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 1, 2, 3))));
+    s = _mm_sub_ps (s, denorm);
+    // Add all the elements of s together into one. Adapted from
+    // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
+    shuf = _mm_shuffle_ps (s, s, _MM_SHUFFLE (2, 3, 0, 1));
+    s = _mm_add_ps (s, shuf);
+    s = _mm_add_ss (s, _mm_movehl_ps (shuf, s));
+    return _mm_cvtss_f32 (s);
+// Note: This writes four floats instead of two (the last two are garbage).
+// The caller will need to make sure there is room for all four.
+static inline void calc_stereo_sample_sse (unsigned int hl,
+    __m128 denorm, s, w1, w2;
+    denorm = _mm_set1_ps (1e-20f);
+    for (i = 0; i < hl; i += 4)
+        // s += *q1 * c1 [i];
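+        // (unpacklo/unpackhi duplicate each coefficient across an
+        // interleaved (L,R) sample pair)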
+        w1 = _mm_loadu_ps (&c1 [i]);
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1), _mm_unpacklo_ps (w1, w1)));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 4), _mm_unpackhi_ps (w1, w1)));
+        // s += *q2 * c2 [i];
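+        // (pairs are taken in reverse order here, matching q2's backward
+        // traversal)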
+        w2 = _mm_loadu_ps (&c2 [i]);
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 4), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 0, 1, 1))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (2, 2, 3, 3))));
+    s = _mm_sub_ps (s, denorm);
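+    // Sum the two (L,R) pairs; elements 0 and 1 then hold the final left
+    // and right samples.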
+    s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));
+    _mm_storeu_ps (out_data, s);
+static inline void calc_quad_sample_sse (int hl,
+    __m128 denorm, s, w1, w2;
+    denorm = _mm_set1_ps (1e-20f);
+    for (i = 0; i < hl; i += 4)
+        // s += *p1 * _c1 [i];
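+        // (each scalar coefficient is broadcast to all four channels of one
+        // frame; consecutive frames are nchan floats apart)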
+        w1 = _mm_loadu_ps (&c1 [i]);
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (0, 0, 0, 0))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + nchan), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (1, 1, 1, 1))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 2 * nchan), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (2, 2, 2, 2))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 3 * nchan), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (3, 3, 3, 3))));
+        // s += *p2 * _c2 [i];
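+        // (frames are taken in reverse order, again matching q2's backward
+        // traversal)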
+        w2 = _mm_loadu_ps (&c2 [i]);
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 3 * nchan), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 0, 0, 0))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 2 * nchan), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (1, 1, 1, 1))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + nchan), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (2, 2, 2, 2))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (3, 3, 3, 3))));
+    s = _mm_sub_ps (s, denorm);
+    _mm_storeu_ps (out_data, s);
 Resampler::Resampler (void) :
@@ -213,18 +329,42 @@
     float *c1 = _table->_ctab + hl * ph;
     float *c2 = _table->_ctab + hl * (np - ph);
-    for (c = 0; c < _nchan; c++)
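+    // The SSE paths require the filter half-length hl to be a multiple of
+    // four; anything else falls through to the scalar loop below.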
+    if ((hl % 4) == 0 && _nchan == 1)
+        *out_data++ = calc_mono_sample_sse (hl, c1, c2, p1, p2);
+    else if ((hl % 4) == 0 && _nchan == 2)
-        float *q1 = p1 + c;
-        float *q2 = p2 + c;
-        for (i = 0; i < hl; i++)
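+        // calc_stereo_sample_sse stores four floats; when fewer than two
+        // output frames remain, write to a scratch buffer and copy only the
+        // two valid samples.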
+        if (out_count >= 2)
+            calc_stereo_sample_sse (hl, c1, c2, p1, p2, out_data);
+            calc_stereo_sample_sse (hl, c1, c2, p1, p2, tmp);
+            out_data[0] = tmp[0];
+            out_data[1] = tmp[1];
+        for (c = 0; c < _nchan; c++)
-            s += *q1 * c1 [i] + *q2 * c2 [i];
+            float *q1 = p1 + c;
+            float *q2 = p2 + c;
+            for (i = 0; i < hl; i++)
+                s += *q1 * c1 [i] + *q2 * c2 [i];
+            *out_data++ = s - 1e-20f;
-        *out_data++ = s - 1e-20f;
diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/vresampler.cc
--- orig/zita-resampler-1.3.0/libs/vresampler.cc 2012-10-26 22:58:55.000000000 +0200
+++ zita-resampler-1.3.0/libs/vresampler.cc 2016-09-05 00:33:53.907511211 +0200
 #include <zita-resampler/vresampler.h>
+#include <xmmintrin.h>
+static inline float calc_mono_sample_sse (int hl,
+    __m128 denorm, bs, s, c1, c2, w1, w2, shuf;
+    denorm = _mm_set1_ps (1e-25f);
+    bs = _mm_set1_ps (b);
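+    // bs holds the interpolation fraction b in all four lanes; denorm again
+    // keeps the sum out of the denormal range (1e-25f, as in the scalar code).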
+    for (i = 0; i < hl; i += 4)
+        // _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
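+        // (equivalent to a * q1 [i] + b * q1 [i + hl] with a = 1 - b, as in
+        // the scalar code)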
+        w1 = _mm_loadu_ps (&q1 [i]);
+        w2 = _mm_loadu_ps (&q1 [i + hl]);
+        c1 = _mm_add_ps (w1, _mm_mul_ps (bs, _mm_sub_ps (w2, w1)));
+        // _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
+        w1 = _mm_loadu_ps (&q2 [i]);
+        w2 = _mm_loadu_ps (&q2 [i - hl]);
+        c2 = _mm_add_ps (w1, _mm_mul_ps (bs, _mm_sub_ps (w2, w1)));
+        // s += *p1 * _c1 [i];
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1), c1));
+        // s += *p2 * _c2 [i];
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 1, 2, 3))));
+    s = _mm_sub_ps (s, denorm);
+    // Add all the elements of s together into one. Adapted from
+    // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
+    shuf = _mm_shuffle_ps (s, s, _MM_SHUFFLE (2, 3, 0, 1));
+    s = _mm_add_ps (s, shuf);
+    s = _mm_add_ss (s, _mm_movehl_ps (shuf, s));
+    return _mm_cvtss_f32 (s);
+// Note: This writes four floats instead of two (the last two are garbage).
+// The caller will need to make sure there is room for all four.
+static inline void calc_stereo_sample_sse (int hl,
+    __m128 denorm, bs, s, c1, c2, w1, w2;
+    denorm = _mm_set1_ps (1e-25f);
+    bs = _mm_set1_ps (b);
+    for (i = 0; i < hl; i += 4)
+        // _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
+        w1 = _mm_loadu_ps (&q1 [i]);
+        w2 = _mm_loadu_ps (&q1 [i + hl]);
+        c1 = _mm_add_ps (w1, _mm_mul_ps (bs, _mm_sub_ps (w2, w1)));
+        // _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
+        w1 = _mm_loadu_ps (&q2 [i]);
+        w2 = _mm_loadu_ps (&q2 [i - hl]);
+        c2 = _mm_add_ps (w1, _mm_mul_ps (bs, _mm_sub_ps (w2, w1)));
+        // s += *p1 * _c1 [i];
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1), _mm_unpacklo_ps (c1, c1)));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 4), _mm_unpackhi_ps (c1, c1)));
+        // s += *p2 * _c2 [i];
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 4), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 1, 1))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 3, 3))));
+    s = _mm_sub_ps (s, denorm);
+    s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));
+    _mm_storeu_ps (out_data, s);
+static inline void calc_quad_sample_sse (int hl,
+    __m128 denorm, bs, s, c1, c2, w1, w2;
+    denorm = _mm_set1_ps (1e-25f);
+    bs = _mm_set1_ps (b);
+    for (i = 0; i < hl; i += 4)
+        // _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
+        w1 = _mm_loadu_ps (&q1 [i]);
+        w2 = _mm_loadu_ps (&q1 [i + hl]);
+        c1 = _mm_add_ps (w1, _mm_mul_ps (bs, _mm_sub_ps (w2, w1)));
+        // _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
+        w1 = _mm_loadu_ps (&q2 [i]);
+        w2 = _mm_loadu_ps (&q2 [i - hl]);
+        c2 = _mm_add_ps (w1, _mm_mul_ps (bs, _mm_sub_ps (w2, w1)));
+        // s += *p1 * _c1 [i];
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (0, 0, 0, 0))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (1, 1, 1, 1))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 2 * nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (2, 2, 2, 2))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 3 * nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (3, 3, 3, 3))));
+        // s += *p2 * _c2 [i];
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 3 * nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 0, 0))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 2 * nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (1, 1, 1, 1))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 2, 2))));
+        s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (3, 3, 3, 3))));
+    s = _mm_sub_ps (s, denorm);
+    _mm_storeu_ps (out_data, s);
 VResampler::VResampler (void) :
 int VResampler::process (void)
-    unsigned int k, np, in, nr, n, c;
+    unsigned int j, k, np, in, nr, n, c;
     float a, b, *p1, *p2, *q1, *q2;
@@ -212,23 +358,55 @@
     q1 = _table->_ctab + hl * k;
     q2 = _table->_ctab + hl * (np - k);
-    for (i = 0; i < hl; i++)
+    if ((hl % 4) == 0 && _nchan == 1)
+        *out_data++ = calc_mono_sample_sse (hl, b, p1, p2, q1, q2);
+    else if ((hl % 4) == 0 && _nchan == 2)
-        _c1 [i] = a * q1 [i] + b * q1 [i + hl];
-        _c2 [i] = a * q2 [i] + b * q2 [i - hl];
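+        // Same scratch-buffer fallback as in resampler.cc: the stereo
+        // helper stores four floats.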
+        if (out_count >= 2)
+            calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, out_data);
+            calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, tmp);
+            out_data[0] = tmp[0];
+            out_data[1] = tmp[1];
+    else if ((hl % 4) == 0 && (_nchan % 4) == 0)
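+        // Channels are processed four at a time; within a group, successive
+        // frames are _nchan floats apart.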
+        for (j = 0; j < _nchan; j += 4)
+            calc_quad_sample_sse (hl, _nchan, b, p1 + j, p2 + j, q1, q2, out_data + j);
+        out_data += _nchan;
-    for (c = 0; c < _nchan; c++)
-        for (i = 0; i < hl; i++)
-            a += *q1 * _c1 [i] + *q2 * _c2 [i];
+            _c1 [i] = a * q1 [i] + b * q1 [i + hl];
+            _c2 [i] = a * q2 [i] + b * q2 [i - hl];
+        for (c = 0; c < _nchan; c++)
+            for (i = 0; i < hl; i++)
+                a += *q1 * _c1 [i] + *q2 * _c2 [i];
+            *out_data++ = a - 1e-25f;
-        *out_data++ = a - 1e-25f;