]> git.sesse.net Git - bmusb/commitdiff
Add an interleaved mode to split UYVY into YV and YY on-the-fly.
authorSteinar H. Gunderson <sgunderson@bigfoot.com>
Fri, 18 Sep 2015 19:27:13 +0000 (21:27 +0200)
committerSteinar H. Gunderson <sgunderson@bigfoot.com>
Fri, 18 Sep 2015 19:27:13 +0000 (21:27 +0200)
bmusb.cpp
bmusb.h

index 0101a3f6718e4c395a7f60962a277e3c52988b64..e7d9fbe55c2148d76b3a6e921522da62c3d4341a 100644 (file)
--- a/bmusb.cpp
+++ b/bmusb.cpp
@@ -255,6 +255,18 @@ static void dump_pack(const libusb_transfer *xfr, int offset, const libusb_iso_p
 }
 #endif
 
+void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+       assert(n % 2 == 0);
+       uint8_t *dptr1 = dest1;
+       uint8_t *dptr2 = dest2;
+
+       for (size_t i = 0; i < n; i += 2) {
+               *dptr1++ = *src++;
+               *dptr2++ = *src++;
+       }
+}
+
 void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_name, const uint8_t *start, const uint8_t *end)
 {
        if (current_frame->data == nullptr ||
@@ -269,8 +281,25 @@ void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_n
                        int(current_frame->len + bytes - current_frame->size), frame_type_name);
                //dump_frame();
        } else {
-               memcpy(current_frame->data + current_frame->len, start, bytes);
-               current_frame->len += bytes;
+               if (current_frame->interleaved) {
+                       uint8_t *data = current_frame->data + current_frame->len / 2;
+                       uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
+                       if (current_frame->len % 2 == 1) {
+                               ++data;
+                               swap(data, data2);
+                       }
+                       if (bytes % 2 == 1) {
+                               *data++ = *start++;
+                               swap(data, data2);
+                               ++current_frame->len;
+                               --bytes;
+                       }
+                       memcpy_interleaved(data, data2, start, bytes);
+                       current_frame->len += bytes;
+               } else {
+                       memcpy(current_frame->data + current_frame->len, start, bytes);
+                       current_frame->len += bytes;
+               }
        }
 }
 
@@ -313,52 +342,127 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const
        if (aligned_start != start) {
                const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
                if (sync_start == nullptr) {
-                       memcpy(current_frame->data, start, aligned_start - start);
-                       current_frame->len += aligned_start - start;
+                       add_to_frame(current_frame, "", start, aligned_start);
                } else {
-                       memcpy(current_frame->data, start, sync_start - start);
-                       current_frame->len += sync_start - start;
+                       add_to_frame(current_frame, "", start, sync_start);
                        return sync_start;
                }
        }
 
+       // Make the length a multiple of 64.
+       if (current_frame->interleaved) {
+               if (((limit - aligned_start) % 64) != 0) {
+                       limit -= 32;
+               }
+               assert(((limit - aligned_start) % 64) == 0);
+       }
+
 #if __AVX2__
        const __m256i needle = _mm256_set1_epi8(sync_char);
 
        const __m256i *in = (const __m256i *)aligned_start;
-       __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
-       while (in < (const __m256i *)limit) {
-               __m256i data = _mm256_load_si256(in);
-               _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
-               __m256i found = _mm256_cmpeq_epi8(data, needle);
-               if (!_mm256_testz_si256(found, found)) {
-                       break;
+       if (current_frame->interleaved) {
+               __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
+               __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
+               if (current_frame->len % 2 == 1) {
+                       swap(out1, out2);
+               }
+
+               __m256i mask_lower_byte = _mm256_set1_epi16(0x00ff);
+               while (in < (const __m256i *)limit) {
+                       __m256i data1 = _mm256_load_si256(in);
+                       __m256i data2 = _mm256_load_si256(in + 1);
+                       __m256i data1_lo = _mm256_and_si256(data1, mask_lower_byte);
+                       __m256i data2_lo = _mm256_and_si256(data2, mask_lower_byte);
+                       __m256i data1_hi = _mm256_srli_epi16(data1, 8);
+                       __m256i data2_hi = _mm256_srli_epi16(data2, 8);
+                       __m256i lo = _mm256_packus_epi16(data1_lo, data2_lo);
+                       lo = _mm256_permute4x64_epi64(lo, 0b11011000);
+                       _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
+                       __m256i hi = _mm256_packus_epi16(data1_hi, data2_hi);
+                       hi = _mm256_permute4x64_epi64(hi, 0b11011000);
+                       _mm256_storeu_si256(out2, hi);
+                       __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
+                       __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
+                       if (!_mm256_testz_si256(found1, found1) ||
+                           !_mm256_testz_si256(found2, found2)) {
+                               break;
+                       }
+
+                       in += 2;
+                       ++out1;
+                       ++out2;
                }
+               current_frame->len += (uint8_t *)in - aligned_start;
+       } else {
+               __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
+               while (in < (const __m256i *)limit) {
+                       __m256i data = _mm256_load_si256(in);
+                       _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
+                       __m256i found = _mm256_cmpeq_epi8(data, needle);
+                       if (!_mm256_testz_si256(found, found)) {
+                               break;
+                       }
 
-               ++in;
-               ++out;
+                       ++in;
+                       ++out;
+               }
+               current_frame->len = (uint8_t *)out - current_frame->data;
        }
 #else
        const __m128i needle = _mm_set1_epi8(sync_char);
 
        const __m128i *in = (const __m128i *)aligned_start;
-       __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
-       while (in < (const __m128i *)limit) {
-               __m128i data = _mm_load_si128(in);
-               _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
-               __m128i found = _mm_cmpeq_epi8(data, needle);
-               if (!_mm_testz_si128(found, found)) {
-                       break;
+       if (current_frame->interleaved) {
+               __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
+               __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
+               if (current_frame->len % 2 == 1) {
+                       swap(out1, out2);
+               }
+
+               __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
+               while (in < (const __m128i *)limit) {
+                       __m128i data1 = _mm_load_si128(in);
+                       __m128i data2 = _mm_load_si128(in + 1);
+                       __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
+                       __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
+                       __m128i data1_hi = _mm_srli_epi16(data1, 8);
+                       __m128i data2_hi = _mm_srli_epi16(data2, 8);
+                       __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
+                       _mm_storeu_si128(out1, lo);  // Store as early as possible, even if the data isn't used.
+                       __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
+                       _mm_storeu_si128(out2, hi);
+                       __m128i found1 = _mm_cmpeq_epi8(data1, needle);
+                       __m128i found2 = _mm_cmpeq_epi8(data2, needle);
+                       if (!_mm_testz_si128(found1, found1) ||
+                           !_mm_testz_si128(found2, found2)) {
+                               break;
+                       }
+
+                       in += 2;
+                       ++out1;
+                       ++out2;
                }
+               current_frame->len += (uint8_t *)in - aligned_start;
+       } else {
+               __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
+               while (in < (const __m128i *)limit) {
+                       __m128i data = _mm_load_si128(in);
+                       _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
+                       __m128i found = _mm_cmpeq_epi8(data, needle);
+                       if (!_mm_testz_si128(found, found)) {
+                               break;
+                       }
 
-               ++in;
-               ++out;
+                       ++in;
+                       ++out;
+               }
+               current_frame->len = (uint8_t *)out - current_frame->data;
        }
 #endif
 
        //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
 
-       current_frame->len = (uint8_t *)out - current_frame->data;
        return (const uint8_t *)in;
 }
 #endif
diff --git a/bmusb.h b/bmusb.h
index fb6f87135f514158f0fcdcd9bbbd4b5ab24655c5..311248e5fddb0864b6ce3dfbf01b775474413419 100644 (file)
--- a/bmusb.h
+++ b/bmusb.h
@@ -14,10 +14,16 @@ class FrameAllocator {
  public:
        struct Frame {
                uint8_t *data = nullptr;
+               uint8_t *data2 = nullptr;  // Only if interleaved == true.
                size_t len = 0;  // Number of bytes we actually have.
                size_t size = 0;  // Number of bytes we have room for.
                void *userdata = nullptr;
                FrameAllocator *owner = nullptr;
+
+               // If set to true, every other byte will go to data and to data2.
+               // If so, <len> and <size> are still about the number of total bytes
+               // so if size == 1024, there's 512 bytes in data and 512 in data2.
+               bool interleaved = false;
        };
 
        virtual ~FrameAllocator();