}
#endif
+void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+ assert(n % 2 == 0);
+ uint8_t *dptr1 = dest1;
+ uint8_t *dptr2 = dest2;
+
+ for (size_t i = 0; i < n; i += 2) {
+ *dptr1++ = *src++;
+ *dptr2++ = *src++;
+ }
+}
+
void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
{
if (current_frame->data == nullptr ||
int(current_frame->len + bytes - current_frame->size), frame_type_name);
//dump_frame();
} else {
- memcpy(current_frame->data + current_frame->len, start, bytes);
- current_frame->len += bytes;
+ if (current_frame->interleaved) {
+ uint8_t *data = current_frame->data + current_frame->len / 2;
+ uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
+ if (current_frame->len % 2 == 1) {
+ ++data;
+ swap(data, data2);
+ }
+ if (bytes % 2 == 1) {
+ *data++ = *start++;
+ swap(data, data2);
+ ++current_frame->len;
+ --bytes;
+ }
+ memcpy_interleaved(data, data2, start, bytes);
+ current_frame->len += bytes;
+ } else {
+ memcpy(current_frame->data + current_frame->len, start, bytes);
+ current_frame->len += bytes;
+ }
}
}
if (aligned_start != start) {
const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
if (sync_start == nullptr) {
- memcpy(current_frame->data, start, aligned_start - start);
- current_frame->len += aligned_start - start;
+ add_to_frame(current_frame, "", start, aligned_start);
} else {
- memcpy(current_frame->data, start, sync_start - start);
- current_frame->len += sync_start - start;
+ add_to_frame(current_frame, "", start, sync_start);
return sync_start;
}
}
+ // Make the length a multiple of 64.
+ if (current_frame->interleaved) {
+ if (((limit - aligned_start) % 64) != 0) {
+ limit -= 32;
+ }
+ assert(((limit - aligned_start) % 64) == 0);
+ }
+
#if __AVX2__
const __m256i needle = _mm256_set1_epi8(sync_char);
const __m256i *in = (const __m256i *)aligned_start;
- __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
- while (in < (const __m256i *)limit) {
- __m256i data = _mm256_load_si256(in);
- _mm256_storeu_si256(out, data); // Store as early as possible, even if the data isn't used.
- __m256i found = _mm256_cmpeq_epi8(data, needle);
- if (!_mm256_testz_si256(found, found)) {
- break;
+ if (current_frame->interleaved) {
+ __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
+ __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
+ if (current_frame->len % 2 == 1) {
+ swap(out1, out2);
+ }
+
+ __m256i mask_lower_byte = _mm256_set1_epi16(0x00ff);
+ while (in < (const __m256i *)limit) {
+ __m256i data1 = _mm256_load_si256(in);
+ __m256i data2 = _mm256_load_si256(in + 1);
+ __m256i data1_lo = _mm256_and_si256(data1, mask_lower_byte);
+ __m256i data2_lo = _mm256_and_si256(data2, mask_lower_byte);
+ __m256i data1_hi = _mm256_srli_epi16(data1, 8);
+ __m256i data2_hi = _mm256_srli_epi16(data2, 8);
+ __m256i lo = _mm256_packus_epi16(data1_lo, data2_lo);
+ lo = _mm256_permute4x64_epi64(lo, 0b11011000);
+ _mm256_storeu_si256(out1, lo); // Store as early as possible, even if the data isn't used.
+ __m256i hi = _mm256_packus_epi16(data1_hi, data2_hi);
+ hi = _mm256_permute4x64_epi64(hi, 0b11011000);
+ _mm256_storeu_si256(out2, hi);
+ __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
+ __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
+ if (!_mm256_testz_si256(found1, found1) ||
+ !_mm256_testz_si256(found2, found2)) {
+ break;
+ }
+
+ in += 2;
+ ++out1;
+ ++out2;
}
+ current_frame->len += (uint8_t *)in - aligned_start;
+ } else {
+ __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
+ while (in < (const __m256i *)limit) {
+ __m256i data = _mm256_load_si256(in);
+ _mm256_storeu_si256(out, data); // Store as early as possible, even if the data isn't used.
+ __m256i found = _mm256_cmpeq_epi8(data, needle);
+ if (!_mm256_testz_si256(found, found)) {
+ break;
+ }
- ++in;
- ++out;
+ ++in;
+ ++out;
+ }
+ current_frame->len = (uint8_t *)out - current_frame->data;
}
#else
const __m128i needle = _mm_set1_epi8(sync_char);
const __m128i *in = (const __m128i *)aligned_start;
- __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
- while (in < (const __m128i *)limit) {
- __m128i data = _mm_load_si128(in);
- _mm_storeu_si128(out, data); // Store as early as possible, even if the data isn't used.
- __m128i found = _mm_cmpeq_epi8(data, needle);
- if (!_mm_testz_si128(found, found)) {
- break;
+ if (current_frame->interleaved) {
+ __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
+ __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
+ if (current_frame->len % 2 == 1) {
+ swap(out1, out2);
+ }
+
+ __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
+ while (in < (const __m128i *)limit) {
+ __m128i data1 = _mm_load_si128(in);
+ __m128i data2 = _mm_load_si128(in + 1);
+ __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
+ __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
+ __m128i data1_hi = _mm_srli_epi16(data1, 8);
+ __m128i data2_hi = _mm_srli_epi16(data2, 8);
+ __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
+ _mm_storeu_si128(out1, lo); // Store as early as possible, even if the data isn't used.
+ __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
+ _mm_storeu_si128(out2, hi);
+ __m128i found1 = _mm_cmpeq_epi8(data1, needle);
+ __m128i found2 = _mm_cmpeq_epi8(data2, needle);
+ if (!_mm_testz_si128(found1, found1) ||
+ !_mm_testz_si128(found2, found2)) {
+ break;
+ }
+
+ in += 2;
+ ++out1;
+ ++out2;
}
+ current_frame->len += (uint8_t *)in - aligned_start;
+ } else {
+ __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
+ while (in < (const __m128i *)limit) {
+ __m128i data = _mm_load_si128(in);
+ _mm_storeu_si128(out, data); // Store as early as possible, even if the data isn't used.
+ __m128i found = _mm_cmpeq_epi8(data, needle);
+ if (!_mm_testz_si128(found, found)) {
+ break;
+ }
- ++in;
- ++out;
+ ++in;
+ ++out;
+ }
+ current_frame->len = (uint8_t *)out - current_frame->data;
}
#endif
//printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
- current_frame->len = (uint8_t *)out - current_frame->data;
return (const uint8_t *)in;
}
#endif