#include <string.h>
#include <fcntl.h>
#include <stdint.h>
+#include <assert.h>
+#ifdef __SSE2__
+#include <immintrin.h>
+#endif
#include <algorithm>
#include <functional>
#include <memory>
}
}
+#ifdef __SSE2__
+
+// Does a memcpy and memchr in one to reduce processing time.
+// Note that the benefit is somewhat limited if your L3 cache is small,
+// as you'll (unfortunately) spend most of the time loading the data
+// from main memory.
+//
+// Complicated cases are left to the slow path; it basically stops copying
+// up until the first instance of "sync_char" (usually a bit before, actually).
+// This is fine, since 0x00 bytes shouldn't really show up in normal picture
+// data, and what we really need this for is the 00 00 ff ff marker in video data.
+const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
+{
+ if (current_frame->data == nullptr ||
+ current_frame->len > current_frame->size ||
+ start == limit) {
+ return start;
+ }
+ size_t orig_bytes = limit - start;
+ if (orig_bytes < 128) {
+ // Don't bother.
+ return start;
+ }
+
+ // Don't read more bytes than we can write.
+ limit = min(limit, start + (current_frame->size - current_frame->len));
+
+ // Align end to 32 bytes.
+ limit = (const uint8_t *)(intptr_t(limit) & ~31);
+
+ if (start >= limit) {
+ return start;
+ }
+
+ // Process [0,31] bytes, such that start gets aligned to 32 bytes.
+ const uint8_t *aligned_start = (const uint8_t *)(intptr_t(start + 31) & ~31);
+ if (aligned_start != start) {
+ const uint8_t *sync_start = (const uint8_t *)memchr(start, sync_char, aligned_start - start);
+ if (sync_start == nullptr) {
+ memcpy(current_frame->data, start, aligned_start - start);
+ current_frame->len += aligned_start - start;
+ } else {
+ memcpy(current_frame->data, start, sync_start - start);
+ current_frame->len += sync_start - start;
+ return sync_start;
+ }
+ }
+
+#if __AVX2__
+ const __m256i needle = _mm256_set1_epi8(sync_char);
+
+ const __m256i *in = (const __m256i *)aligned_start;
+ __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
+ while (in < (const __m256i *)limit) {
+ __m256i data = _mm256_load_si256(in);
+ _mm256_storeu_si256(out, data); // Store as early as possible, even if the data isn't used.
+ __m256i found = _mm256_cmpeq_epi8(data, needle);
+ if (!_mm256_testz_si256(found, found)) {
+ break;
+ }
+
+ ++in;
+ ++out;
+ }
+#else
+ const __m128i needle = _mm_set1_epi8(sync_char);
+
+ const __m128i *in = (const __m128i *)aligned_start;
+ __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
+ while (in < (const __m128i *)limit) {
+ __m128i data = _mm_load_si128(in);
+ _mm_storeu_si128(out, data); // Store as early as possible, even if the data isn't used.
+ __m128i found = _mm_cmpeq_epi8(data, needle);
+ if (!_mm_testz_si128(found, found)) {
+ break;
+ }
+
+ ++in;
+ ++out;
+ }
+#endif
+
+ //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
+
+ current_frame->len = (uint8_t *)out - current_frame->data;
+ return (const uint8_t *)in;
+}
+#endif
+
void decode_packs(const libusb_transfer *xfr,
const char *sync_pattern,
int sync_length,
const uint8_t *start = xfr->buffer + offset;
const uint8_t *limit = start + pack->actual_length;
while (start < limit) { // Usually runs only one iteration.
+#ifdef __SSE2__
+ start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
+ if (start == limit) break;
+ assert(start < limit);
+#endif
+
const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
if (start_next_frame == nullptr) {
// add the rest of the buffer