++ _mm_storeu_ps (out_data, s);
++}
++
++static inline void calc_quad_sample_sse (int hl,
++ int nchan,
++ float b,
++ const float *p1,
++ const float *p2,
++ const float *q1,
++ const float *q2,
++ float *out_data)
++{
++ int i;
++ __m128 denorm, bs, s, c1, c2, w1, w2;
++
++ denorm = _mm_set1_ps (1e-25f);
++ bs = _mm_set1_ps (b);
++ s = denorm;
++ for (i = 0; i < hl; i += 4)
++ {
++ p2 -= 4 * nchan;
++
++ // _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
++ w1 = _mm_loadu_ps (&q1 [i]);
++ w2 = _mm_loadu_ps (&q1 [i + hl]);
++ c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++ // _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
++ w1 = _mm_loadu_ps (&q2 [i]);
++ w2 = _mm_loadu_ps (&q2 [i - hl]);
++ c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
++
++ // s += *p1 * _c1 [i];
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (0, 0, 0, 0))));
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (1, 1, 1, 1))));
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 2 * nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (2, 2, 2, 2))));
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 3 * nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (3, 3, 3, 3))));
++
++ // s += *p2 * _c2 [i];
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 3 * nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 0, 0))));
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 2 * nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (1, 1, 1, 1))));
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 2, 2))));
++ s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (3, 3, 3, 3))));
++
++ p1 += 4 * nchan;
++ }
++ s = _mm_sub_ps (s, denorm);
++