+ // Size the output to match the input, then start filtering at the first
+ // index for which pcm[i - j] is in range for every tap j.
+ filtered_pcm.resize(pcm.size());
+ unsigned i = NUM_FILTER_COEFF;
+#if defined(__AVX__) && defined(__FMA__)
+ // Vector path: eight outputs per iteration, where lane k accumulates
+ // sum over j of filter[j] * pcm[i + k - j]. _mm256_fmadd_ps is an FMA
+ // instruction, so AVX alone is not sufficient to build this path.
+ //
+ // If pcm is shorter than the filter, pcm.size() - i would wrap around
+ // (unsigned arithmetic), so leave avx_end == i and run zero iterations.
+ unsigned avx_end = i;
+ if (pcm.size() > i) {
+     avx_end += (pcm.size() - i) & ~size_t(7);
+ }
+ for ( ; i < avx_end; i += 8) {
+     __m256 acc = _mm256_setzero_ps();
+     for (int j = 0; j < NUM_FILTER_COEFF; ++j) {
+         __m256 coeff = _mm256_set1_ps(filter[j]);
+         // The source address moves by one float per tap, so it cannot be
+         // 32-byte aligned for every j; an unaligned load is required.
+         acc = _mm256_fmadd_ps(coeff, _mm256_loadu_ps(&pcm[i - j]), acc);
+     }
+     _mm256_storeu_ps(&filtered_pcm[i], acc);
+ }
+#endif
+ // Process whatever the vector loop above did not cover (which is everything
+ // when the vector path is compiled out) with scalar code.
+ for (; i < pcm.size(); ++i) {