Add an SSE2/AVX2 fast path to fuse the memmem() into the memcpy.

author Steinar H. Gunderson <sgunderson@bigfoot.com>

Thu, 17 Sep 2015 22:14:45 +0000 (00:14 +0200)

committer Steinar H. Gunderson <sgunderson@bigfoot.com>

Thu, 17 Sep 2015 22:14:45 +0000 (00:14 +0200)
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 17 Sep 2015 22:14:45 +0000 (00:14 +0200)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 17 Sep 2015 22:14:45 +0000 (00:14 +0200)
diff --git a/bmusb.cpp b/bmusb.cpp

index e960425547d6c4d2f8d04d81b971ce10b0bda1fe..0101a3f6718e4c395a7f60962a277e3c52988b64 100644 (file)
--- a/bmusb.cpp
+++ b/bmusb.cpp
@@ -12,6 +12,10 @@
  #include <string.h>
  #include <fcntl.h>
  #include <stdint.h>
+#include <assert.h>
+#ifdef __SSE2__
+#include <immintrin.h>
+#endif
  #include <algorithm>
  #include <functional>
  #include <memory>
@@ -270,6 +274,95 @@ void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_n
         }
  }
  
+#ifdef __SSE2__
+
+// Does a memcpy and memchr in one to reduce processing time.
+// Note that the benefit is somewhat limited if your L3 cache is small,
+// as you'll (unfortunately) spend most of the time loading the data
+// from main memory.
+//
+// Complicated cases are left to the slow path; it basically stops copying
+// up until the first instance of "sync_char" (usually a bit before, actually).
+// This is fine, since 0x00 bytes shouldn't really show up in normal picture
+// data, and what we really need this for is the 00 00 ff ff marker in video data.
+const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
+{
+       if (current_frame->data == nullptr ||
+           current_frame->len > current_frame->size ||
+           start == limit) {
+               return start;
+       }
+       size_t orig_bytes = limit - start;
+       if (orig_bytes < 128) {
+               // Don't bother.
+               return start;
+       }
+
+       // Don't read more bytes than we can write.
+       limit = min(limit, start + (current_frame->size - current_frame->len));
+
+       // Align end to 32 bytes.
+       limit = (const uint8_t *)(intptr_t(limit) & ~31);
+
+       if (start >= limit) {
+               return start;
+       }
+
+       // Process [0,31] bytes, such that start gets aligned to 32 bytes.
+       const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
+       if (aligned_start != start) {
+               const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
+               if (sync_start == nullptr) {
+                       memcpy(current_frame->data, start, aligned_start - start);
+                       current_frame->len += aligned_start - start;
+               } else {
+                       memcpy(current_frame->data, start, sync_start - start);
+                       current_frame->len += sync_start - start;
+                       return sync_start;
+               }
+       }
+
+#if __AVX2__
+       const __m256i needle = _mm256_set1_epi8(sync_char);
+
+       const __m256i *in = (const __m256i *)aligned_start;
+       __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
+       while (in < (const __m256i *)limit) {
+               __m256i data = _mm256_load_si256(in);
+               _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
+               __m256i found = _mm256_cmpeq_epi8(data, needle);
+               if (!_mm256_testz_si256(found, found)) {
+                       break;
+               }
+
+               ++in;
+               ++out;
+       }
+#else
+       const __m128i needle = _mm_set1_epi8(sync_char);
+
+       const __m128i *in = (const __m128i *)aligned_start;
+       __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
+       while (in < (const __m128i *)limit) {
+               __m128i data = _mm_load_si128(in);
+               _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
+               __m128i found = _mm_cmpeq_epi8(data, needle);
+               if (!_mm_testz_si128(found, found)) {
+                       break;
+               }
+
+               ++in;
+               ++out;
+       }
+#endif
+
+       //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
+
+       current_frame->len = (uint8_t *)out - current_frame->data;
+       return (const uint8_t *)in;
+}
+#endif
+
  void decode_packs(const libusb_transfer *xfr,
                    const char *sync_pattern,
                    int sync_length,
@@ -290,6 +383,12 @@ void decode_packs(const libusb_transfer *xfr,
                 const uint8_t *start = xfr->buffer + offset;
                 const uint8_t *limit = start + pack->actual_length;
                 while (start < limit) {  // Usually runs only one iteration.
+#ifdef __SSE2__
+                       start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
+                       if (start == limit) break;
+                       assert(start < limit);
+#endif
+
                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
                         if (start_next_frame == nullptr) {
                                 // add the rest of the buffer
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Thu, 17 Sep 2015 22:14:45 +0000 (00:14 +0200)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Thu, 17 Sep 2015 22:14:45 +0000 (00:14 +0200)