#if __AVX2__
const __m256i needle = _mm256_set1_epi8(sync_char);
- const __m256i *in = (const __m256i *)aligned_start;
+ const __restrict __m256i *in = (const __m256i *)aligned_start;
if (current_frame->interleaved) {
- __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
- __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
+ __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
+ __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
if (current_frame->len % 2 == 1) {
swap(out1, out2);
}
15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
while (in < (const __m256i *)limit) {
// Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
- __m256i data1 = _mm256_load_si256(in); // AaBbCcDd EeFfGgHh
- __m256i data2 = _mm256_load_si256(in + 1); // IiJjKkLl MmNnOoPp
+ __m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh
+ __m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp
__m256i found1 = _mm256_cmpeq_epi8(data1, needle);
__m256i found2 = _mm256_cmpeq_epi8(data2, needle);
+ __m256i found = _mm256_or_si256(found1, found2);
data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh
data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop
_mm256_storeu_si256(out1, lo); // Store as early as possible, even if the data isn't used.
_mm256_storeu_si256(out2, hi);
- if (!_mm256_testz_si256(found1, found1) ||
- !_mm256_testz_si256(found2, found2)) {
+ if (!_mm256_testz_si256(found, found)) {
break;
}