From d145aa142ce2210f9b4588967712451d25ae41d3 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Fri, 18 Sep 2015 00:14:45 +0200 Subject: [PATCH] Add an SSE2/AVX2 fast path to fuse the memmem() into the memcpy. --- bmusb.cpp | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/bmusb.cpp b/bmusb.cpp index e960425..0101a3f 100644 --- a/bmusb.cpp +++ b/bmusb.cpp @@ -12,6 +12,10 @@ #include #include #include +#include +#ifdef __SSE2__ +#include +#endif #include #include #include @@ -270,6 +274,95 @@ void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_n } } +#ifdef __SSE2__ + +// Does a memcpy and memchr in one to reduce processing time. +// Note that the benefit is somewhat limited if your L3 cache is small, +// as you'll (unfortunately) spend most of the time loading the data +// from main memory. +// +// Complicated cases are left to the slow path; it basically stops copying +// up until the first instance of "sync_char" (usually a bit before, actually). +// This is fine, since 0x00 bytes shouldn't really show up in normal picture +// data, and what we really need this for is the 00 00 ff ff marker in video data. +const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char) +{ + if (current_frame->data == nullptr || + current_frame->len > current_frame->size || + start == limit) { + return start; + } + size_t orig_bytes = limit - start; + if (orig_bytes < 128) { + // Don't bother. + return start; + } + + // Don't read more bytes than we can write. + limit = min(limit, start + (current_frame->size - current_frame->len)); + + // Align end to 32 bytes. + limit = (const uint8_t *)(intptr_t(limit) & ~31); + + if (start >= limit) { + return start; + } + + // Process [0,31] bytes, such that start gets aligned to 32 bytes. + const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31); + if (aligned_start != start) { + const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start); + if (sync_start == nullptr) { + memcpy(current_frame->data, start, aligned_start - start); + current_frame->len += aligned_start - start; + } else { + memcpy(current_frame->data, start, sync_start - start); + current_frame->len += sync_start - start; + return sync_start; + } + } + +#if __AVX2__ + const __m256i needle = _mm256_set1_epi8(sync_char); + + const __m256i *in = (const __m256i *)aligned_start; + __m256i *out = (__m256i *)(current_frame->data + current_frame->len); + while (in < (const __m256i *)limit) { + __m256i data = _mm256_load_si256(in); + _mm256_storeu_si256(out, data); // Store as early as possible, even if the data isn't used. + __m256i found = _mm256_cmpeq_epi8(data, needle); + if (!_mm256_testz_si256(found, found)) { + break; + } + + ++in; + ++out; + } +#else + const __m128i needle = _mm_set1_epi8(sync_char); + + const __m128i *in = (const __m128i *)aligned_start; + __m128i *out = (__m128i *)(current_frame->data + current_frame->len); + while (in < (const __m128i *)limit) { + __m128i data = _mm_load_si128(in); + _mm_storeu_si128(out, data); // Store as early as possible, even if the data isn't used. + __m128i found = _mm_cmpeq_epi8(data, needle); + if (!_mm_testz_si128(found, found)) { + break; + } + + ++in; + ++out; + } +#endif + + //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes); + + current_frame->len = (uint8_t *)out - current_frame->data; + return (const uint8_t *)in; +} +#endif + void decode_packs(const libusb_transfer *xfr, const char *sync_pattern, int sync_length, @@ -290,6 +383,12 @@ void decode_packs(const libusb_transfer *xfr, const uint8_t *start = xfr->buffer + offset; const uint8_t *limit = start + pack->actual_length; while (start < limit) { // Usually runs only one iteration. +#ifdef __SSE2__ + start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]); + if (start == limit) break; + assert(start < limit); +#endif + const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length); if (start_next_frame == nullptr) { // add the rest of the buffer -- 2.39.2