// 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
// Audio comes out as 8-channel 24-bit raw audio.
+#if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
+#define HAS_MULTIVERSIONING 1
+#endif
+
#include <assert.h>
#include <errno.h>
#include <libusb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#ifdef __SSE4_1__
+#if HAS_MULTIVERSIONING
#include <immintrin.h>
#endif
#include "bmusb.h"
}
}
-#ifdef __SSE4_1__
-
#if 0
void avx2_dump(const char *name, __m256i n)
{
}
#endif
+#ifndef HAS_MULTIVERSIONING
+
+const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
+{
+ // No fast path possible unless we have multiversioning.
+ return start;
+}
+
+#else // defined(HAS_MULTIVERSIONING)
+
+const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
+
// Does a memcpy and memchr in one to reduce processing time.
// Note that the benefit is somewhat limited if your L3 cache is small,
// as you'll (unfortunately) spend most of the time loading the data
// up until the first instance of "sync_char" (usually a bit before, actually).
// This is fine, since 0x00 bytes shouldn't really show up in normal picture
// data, and what we really need this for is the 00 00 ff ff marker in video data.
+__attribute__((target("default")))
+const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
+{
+ // No fast path possible unless we have SSE 4.1 or higher.
+ return start;
+}
+
+__attribute__((target("sse4.1", "avx2")))
const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
{
if (current_frame->data == nullptr ||
assert(((limit - aligned_start) % 64) == 0);
}
-#if __AVX2__
+ return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
+}
+
+__attribute__((target("avx2")))
+const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
+{
const __m256i needle = _mm256_set1_epi8(sync_char);
const __restrict __m256i *in = (const __m256i *)aligned_start;
}
current_frame->len = (uint8_t *)out - current_frame->data;
}
-#else
+
+ //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
+ return (const uint8_t *)in;
+}
+
+__attribute__((target("sse4.1")))
+const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
+{
const __m128i needle = _mm_set1_epi8(sync_char);
const __m128i *in = (const __m128i *)aligned_start;
}
current_frame->len = (uint8_t *)out - current_frame->data;
}
-#endif
//printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
-
return (const uint8_t *)in;
}
-#endif
+
+#endif // defined(HAS_MULTIVERSIONING)
void decode_packs(const libusb_transfer *xfr,
const char *sync_pattern,
const uint8_t *start = xfr->buffer + offset;
const uint8_t *limit = start + pack->actual_length;
while (start < limit) { // Usually runs only one iteration.
-#ifdef __SSE4_1__
start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
if (start == limit) break;
assert(start < limit);
-#endif
const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
if (start_next_frame == nullptr) {