From 209ad7e3501591f6beda8d36d2baf5208bdc38e8 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Fri, 18 Sep 2015 21:27:13 +0200 Subject: [PATCH] Add an interleaved mode to split UYVY into YV and YY on-the-fly. --- bmusb.cpp | 154 +++++++++++++++++++++++++++++++++++++++++++++--------- bmusb.h | 6 +++ 2 files changed, 135 insertions(+), 25 deletions(-) diff --git a/bmusb.cpp b/bmusb.cpp index 0101a3f..e7d9fbe 100644 --- a/bmusb.cpp +++ b/bmusb.cpp @@ -255,6 +255,18 @@ static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_p } #endif +void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n) +{ + assert(n % 2 == 0); + uint8_t *dptr1 = dest1; + uint8_t *dptr2 = dest2; + + for (size_t i = 0; i < n; i += 2) { + *dptr1++ = *src++; + *dptr2++ = *src++; + } +} + void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end) { if (current_frame->data == nullptr || @@ -269,8 +281,25 @@ void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_n int(current_frame->len + bytes - current_frame->size), frame_type_name); //dump_frame(); } else { - memcpy(current_frame->data + current_frame->len, start, bytes); - current_frame->len += bytes; + if (current_frame->interleaved) { + uint8_t *data = current_frame->data + current_frame->len / 2; + uint8_t *data2 = current_frame->data2 + current_frame->len / 2; + if (current_frame->len % 2 == 1) { + ++data; + swap(data, data2); + } + if (bytes % 2 == 1) { + *data++ = *start++; + swap(data, data2); + ++current_frame->len; + --bytes; + } + memcpy_interleaved(data, data2, start, bytes); + current_frame->len += bytes; + } else { + memcpy(current_frame->data + current_frame->len, start, bytes); + current_frame->len += bytes; + } } } @@ -313,52 +342,127 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const if (aligned_start != start) { const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start); if (sync_start == nullptr) { - memcpy(current_frame->data, start, aligned_start - start); - current_frame->len += aligned_start - start; + add_to_frame(current_frame, "", start, aligned_start); } else { - memcpy(current_frame->data, start, sync_start - start); - current_frame->len += sync_start - start; + add_to_frame(current_frame, "", start, sync_start); return sync_start; } } + // Make the length a multiple of 64. + if (current_frame->interleaved) { + if (((limit - aligned_start) % 64) != 0) { + limit -= 32; + } + assert(((limit - aligned_start) % 64) == 0); + } + #if __AVX2__ const __m256i needle = _mm256_set1_epi8(sync_char); const __m256i *in = (const __m256i *)aligned_start; - __m256i *out = (__m256i *)(current_frame->data + current_frame->len); - while (in < (const __m256i *)limit) { - __m256i data = _mm256_load_si256(in); - _mm256_storeu_si256(out, data); // Store as early as possible, even if the data isn't used. - __m256i found = _mm256_cmpeq_epi8(data, needle); - if (!_mm256_testz_si256(found, found)) { - break; + if (current_frame->interleaved) { + __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2); + __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2); + if (current_frame->len % 2 == 1) { + swap(out1, out2); + } + + __m256i mask_lower_byte = _mm256_set1_epi16(0x00ff); + while (in < (const __m256i *)limit) { + __m256i data1 = _mm256_load_si256(in); + __m256i data2 = _mm256_load_si256(in + 1); + __m256i data1_lo = _mm256_and_si256(data1, mask_lower_byte); + __m256i data2_lo = _mm256_and_si256(data2, mask_lower_byte); + __m256i data1_hi = _mm256_srli_epi16(data1, 8); + __m256i data2_hi = _mm256_srli_epi16(data2, 8); + __m256i lo = _mm256_packus_epi16(data1_lo, data2_lo); + lo = _mm256_permute4x64_epi64(lo, 0b11011000); + _mm256_storeu_si256(out1, lo); // Store as early as possible, even if the data isn't used. + __m256i hi = _mm256_packus_epi16(data1_hi, data2_hi); + hi = _mm256_permute4x64_epi64(hi, 0b11011000); + _mm256_storeu_si256(out2, hi); + __m256i found1 = _mm256_cmpeq_epi8(data1, needle); + __m256i found2 = _mm256_cmpeq_epi8(data2, needle); + if (!_mm256_testz_si256(found1, found1) || + !_mm256_testz_si256(found2, found2)) { + break; + } + + in += 2; + ++out1; + ++out2; } + current_frame->len += (uint8_t *)in - aligned_start; + } else { + __m256i *out = (__m256i *)(current_frame->data + current_frame->len); + while (in < (const __m256i *)limit) { + __m256i data = _mm256_load_si256(in); + _mm256_storeu_si256(out, data); // Store as early as possible, even if the data isn't used. + __m256i found = _mm256_cmpeq_epi8(data, needle); + if (!_mm256_testz_si256(found, found)) { + break; + } - ++in; - ++out; + ++in; + ++out; + } + current_frame->len = (uint8_t *)out - current_frame->data; } #else const __m128i needle = _mm_set1_epi8(sync_char); const __m128i *in = (const __m128i *)aligned_start; - __m128i *out = (__m128i *)(current_frame->data + current_frame->len); - while (in < (const __m128i *)limit) { - __m128i data = _mm_load_si128(in); - _mm_storeu_si128(out, data); // Store as early as possible, even if the data isn't used. - __m128i found = _mm_cmpeq_epi8(data, needle); - if (!_mm_testz_si128(found, found)) { - break; + if (current_frame->interleaved) { + __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2); + __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2); + if (current_frame->len % 2 == 1) { + swap(out1, out2); + } + + __m128i mask_lower_byte = _mm_set1_epi16(0x00ff); + while (in < (const __m128i *)limit) { + __m128i data1 = _mm_load_si128(in); + __m128i data2 = _mm_load_si128(in + 1); + __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte); + __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte); + __m128i data1_hi = _mm_srli_epi16(data1, 8); + __m128i data2_hi = _mm_srli_epi16(data2, 8); + __m128i lo = _mm_packus_epi16(data1_lo, data2_lo); + _mm_storeu_si128(out1, lo); // Store as early as possible, even if the data isn't used. + __m128i hi = _mm_packus_epi16(data1_hi, data2_hi); + _mm_storeu_si128(out2, hi); + __m128i found1 = _mm_cmpeq_epi8(data1, needle); + __m128i found2 = _mm_cmpeq_epi8(data2, needle); + if (!_mm_testz_si128(found1, found1) || + !_mm_testz_si128(found2, found2)) { + break; + } + + in += 2; + ++out1; + ++out2; } + current_frame->len += (uint8_t *)in - aligned_start; + } else { + __m128i *out = (__m128i *)(current_frame->data + current_frame->len); + while (in < (const __m128i *)limit) { + __m128i data = _mm_load_si128(in); + _mm_storeu_si128(out, data); // Store as early as possible, even if the data isn't used. + __m128i found = _mm_cmpeq_epi8(data, needle); + if (!_mm_testz_si128(found, found)) { + break; + } - ++in; - ++out; + ++in; + ++out; + } + current_frame->len = (uint8_t *)out - current_frame->data; } #endif //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes); - current_frame->len = (uint8_t *)out - current_frame->data; return (const uint8_t *)in; } #endif diff --git a/bmusb.h b/bmusb.h index fb6f871..311248e 100644 --- a/bmusb.h +++ b/bmusb.h @@ -14,10 +14,16 @@ class FrameAllocator { public: struct Frame { uint8_t *data = nullptr; + uint8_t *data2 = nullptr; // Only if interleaved == true. size_t len = 0; // Number of bytes we actually have. size_t size = 0; // Number of bytes we have room for. void *userdata = nullptr; FrameAllocator *owner = nullptr; + + // If set to true, every other byte will go to data and to data2. + // If so, and are still about the number of total bytes + // so if size == 1024, there's 512 bytes in data and 512 in data2. + bool interleaved = false; }; virtual ~FrameAllocator(); -- 2.39.2