From: Steinar H. Gunderson Date: Mon, 25 Jul 2016 11:37:26 +0000 (+0200) Subject: Use GCC multiversionining instead of #ifdefs and -march=native. X-Git-Tag: 0.4~7 X-Git-Url: https://git.sesse.net/?p=bmusb;a=commitdiff_plain;h=e614cf402e0a7f754a0b88a68b11c766ab9baebe Use GCC multiversionining instead of #ifdefs and -march=native. --- diff --git a/Makefile b/Makefile index aaca93d..3489104 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -CXXFLAGS := -std=gnu++14 -O2 -march=native -Wall -g $(shell pkg-config libusb-1.0 --cflags) -pthread +CXXFLAGS := -std=gnu++14 -O2 -Wall -g $(shell pkg-config libusb-1.0 --cflags) -pthread LDFLAGS := $(shell pkg-config libusb-1.0 --libs) -pthread main: bmusb.o main.o diff --git a/bmusb.cpp b/bmusb.cpp index e89610e..f8ab2e6 100644 --- a/bmusb.cpp +++ b/bmusb.cpp @@ -4,6 +4,10 @@ // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation) // Audio comes out as 8-channel 24-bit raw audio. +#if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__) +#define HAS_MULTIVERSIONING 1 +#endif + #include #include #include @@ -14,7 +18,7 @@ #include #include #include -#ifdef __SSE4_1__ +#if HAS_MULTIVERSIONING #include #endif #include "bmusb.h" @@ -360,8 +364,6 @@ void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_n } } -#ifdef __SSE4_1__ - #if 0 void avx2_dump(const char *name, __m256i n) { @@ -405,6 +407,18 @@ void avx2_dump(const char *name, __m256i n) } #endif +#ifndef HAS_MULTIVERSIONING + +const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char) +{ + // No fast path possible unless we have multiversioning. + return start; +} + +#else // defined(HAS_MULTIVERSIONING) + +const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char); + // Does a memcpy and memchr in one to reduce processing time. // Note that the benefit is somewhat limited if your L3 cache is small, // as you'll (unfortunately) spend most of the time loading the data @@ -414,6 +428,14 @@ void avx2_dump(const char *name, __m256i n) // up until the first instance of "sync_char" (usually a bit before, actually). // This is fine, since 0x00 bytes shouldn't really show up in normal picture // data, and what we really need this for is the 00 00 ff ff marker in video data. +__attribute__((target("default"))) +const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char) +{ + // No fast path possible unless we have SSE 4.1 or higher. + return start; +} + +__attribute__((target("sse4.1", "avx2"))) const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char) { if (current_frame->data == nullptr || @@ -457,7 +479,12 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const assert(((limit - aligned_start) % 64) == 0); } -#if __AVX2__ + return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char); +} + +__attribute__((target("avx2"))) +const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char) +{ const __m256i needle = _mm256_set1_epi8(sync_char); const __restrict __m256i *in = (const __m256i *)aligned_start; @@ -516,7 +543,14 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const } current_frame->len = (uint8_t *)out - current_frame->data; } -#else + + //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes); + return (const uint8_t *)in; +} + +__attribute__((target("sse4.1"))) +const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char) +{ const __m128i needle = _mm_set1_epi8(sync_char); const __m128i *in = (const __m128i *)aligned_start; @@ -566,13 +600,12 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const } current_frame->len = (uint8_t *)out - current_frame->data; } -#endif //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes); - return (const uint8_t *)in; } -#endif + +#endif // defined(HAS_MULTIVERSIONING) void decode_packs(const libusb_transfer *xfr, const char *sync_pattern, @@ -594,11 +627,9 @@ void decode_packs(const libusb_transfer *xfr, const uint8_t *start = xfr->buffer + offset; const uint8_t *limit = start + pack->actual_length; while (start < limit) { // Usually runs only one iteration. -#ifdef __SSE4_1__ start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]); if (start == limit) break; assert(start < limit); -#endif const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length); if (start_next_frame == nullptr) {