Use GCC multiversioning instead of #ifdefs and -march=native.

author Steinar H. Gunderson <sgunderson@bigfoot.com>

Mon, 25 Jul 2016 11:37:26 +0000 (13:37 +0200)

committer Steinar H. Gunderson <sgunderson@bigfoot.com>

Mon, 25 Jul 2016 11:37:26 +0000 (13:37 +0200)
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Mon, 25 Jul 2016 11:37:26 +0000 (13:37 +0200)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Mon, 25 Jul 2016 11:37:26 +0000 (13:37 +0200)
diff --git a/Makefile b/Makefile

index aaca93d7680a7cc99ff1490f47028634a55bfa9c..3489104e9ed806d67373e79ea0e4e7289b17f4a2 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-CXXFLAGS := -std=gnu++14 -O2 -march=native -Wall -g $(shell pkg-config libusb-1.0 --cflags) -pthread
+CXXFLAGS := -std=gnu++14 -O2 -Wall -g $(shell pkg-config libusb-1.0 --cflags) -pthread
  LDFLAGS := $(shell pkg-config libusb-1.0 --libs) -pthread
  
  main: bmusb.o main.o
diff --git a/bmusb.cpp b/bmusb.cpp

index e89610e7a5f2d7cc28a34ed1a22f326a5221bca5..f8ab2e61af3f8af8e66830db911199f5041338f5 100644 (file)
--- a/bmusb.cpp
+++ b/bmusb.cpp
@@ -4,6 +4,10 @@
  // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
  // Audio comes out as 8-channel 24-bit raw audio.
  
+#if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
+#define HAS_MULTIVERSIONING 1
+#endif
+
  #include <assert.h>
  #include <errno.h>
  #include <libusb.h>
@@ -14,7 +18,7 @@
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
-#ifdef __SSE4_1__
+#if HAS_MULTIVERSIONING
  #include <immintrin.h>
  #endif
  #include "bmusb.h"
@@ -360,8 +364,6 @@ void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_n
         }
  }
  
-#ifdef __SSE4_1__
-
  #if 0
  void avx2_dump(const char *name, __m256i n)
  {
@@ -405,6 +407,18 @@ void avx2_dump(const char *name, __m256i n)
  }
  #endif
  
+#ifndef HAS_MULTIVERSIONING
+
+const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
+{
+       // No fast path possible unless we have multiversioning.
+       return start;
+}
+
+#else  // defined(HAS_MULTIVERSIONING)
+
+const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char);
+
  // Does a memcpy and memchr in one to reduce processing time.
  // Note that the benefit is somewhat limited if your L3 cache is small,
  // as you'll (unfortunately) spend most of the time loading the data
@@ -414,6 +428,14 @@ void avx2_dump(const char *name, __m256i n)
  // up until the first instance of "sync_char" (usually a bit before, actually).
  // This is fine, since 0x00 bytes shouldn't really show up in normal picture
  // data, and what we really need this for is the 00 00 ff ff marker in video data.
+__attribute__((target("default")))
+const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
+{
+       // No fast path possible unless we have SSE 4.1 or higher.
+       return start;
+}
+
+__attribute__((target("sse4.1", "avx2")))
  const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const uint8_t *start, const uint8_t *limit, const char sync_char)
  {
         if (current_frame->data == nullptr ||
@@ -457,7 +479,12 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const
                 assert(((limit - aligned_start) % 64) == 0);
         }
  
-#if __AVX2__
+       return add_to_frame_fastpath_core(current_frame, aligned_start, limit, sync_char);
+}
+
+__attribute__((target("avx2")))
+const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
+{
         const __m256i needle = _mm256_set1_epi8(sync_char);
  
         const __restrict __m256i *in = (const __m256i *)aligned_start;
@@ -516,7 +543,14 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const
                 }
                 current_frame->len = (uint8_t *)out - current_frame->data;
         }
-#else
+
+       //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
+       return (const uint8_t *)in;
+}
+
+__attribute__((target("sse4.1")))
+const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const uint8_t *aligned_start, const uint8_t *limit, const char sync_char)
+{
         const __m128i needle = _mm_set1_epi8(sync_char);
  
         const __m128i *in = (const __m128i *)aligned_start;
@@ -566,13 +600,12 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const
                 }
                 current_frame->len = (uint8_t *)out - current_frame->data;
         }
-#endif
  
         //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
-
         return (const uint8_t *)in;
  }
-#endif
+
+#endif  // defined(HAS_MULTIVERSIONING)
  
  void decode_packs(const libusb_transfer *xfr,
                    const char *sync_pattern,
@@ -594,11 +627,9 @@ void decode_packs(const libusb_transfer *xfr,
                 const uint8_t *start = xfr->buffer + offset;
                 const uint8_t *limit = start + pack->actual_length;
                 while (start < limit) {  // Usually runs only one iteration.
-#ifdef __SSE4_1__
                         start = add_to_frame_fastpath(current_frame, start, limit, sync_pattern[0]);
                         if (start == limit) break;
                         assert(start < limit);
-#endif
  
                         const unsigned char* start_next_frame = (const unsigned char *)memmem(start, limit - start, sync_pattern, sync_length);
                         if (start_next_frame == nullptr) {
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Mon, 25 Jul 2016 11:37:26 +0000 (13:37 +0200)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Mon, 25 Jul 2016 11:37:26 +0000 (13:37 +0200)
Makefile		patch \| blob \| history
bmusb.cpp		patch \| blob \| history