- __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
- while (in < (const __m256i *)limit) {
- __m256i data = _mm256_load_si256(in);
- _mm256_storeu_si256(out, data); // Store as early as possible, even if the data isn't used.
- __m256i found = _mm256_cmpeq_epi8(data, needle);
- if (!_mm256_testz_si256(found, found)) {
- break;
+ if (current_frame->interleaved) {
+ __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
+ __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
+ if (current_frame->len % 2 == 1) {
+ swap(out1, out2);
+ }
+
+ __m256i mask_lower_byte = _mm256_set1_epi16(0x00ff);
+ while (in < (const __m256i *)limit) {
+ __m256i data1 = _mm256_load_si256(in);
+ __m256i data2 = _mm256_load_si256(in + 1);
+ __m256i data1_lo = _mm256_and_si256(data1, mask_lower_byte);
+ __m256i data2_lo = _mm256_and_si256(data2, mask_lower_byte);
+ __m256i data1_hi = _mm256_srli_epi16(data1, 8);
+ __m256i data2_hi = _mm256_srli_epi16(data2, 8);
+ __m256i lo = _mm256_packus_epi16(data1_lo, data2_lo);
+ lo = _mm256_permute4x64_epi64(lo, 0b11011000);
+ _mm256_storeu_si256(out1, lo); // Store as early as possible, even if the data isn't used.
+ __m256i hi = _mm256_packus_epi16(data1_hi, data2_hi);
+ hi = _mm256_permute4x64_epi64(hi, 0b11011000);
+ _mm256_storeu_si256(out2, hi);
+ __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
+ __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
+ if (!_mm256_testz_si256(found1, found1) ||
+ !_mm256_testz_si256(found2, found2)) {
+ break;
+ }
+
+ in += 2;
+ ++out1;
+ ++out2;