]> git.sesse.net Git - nageru/commitdiff
Enable function multiversioning for memcpy_interleaved().
authorSteinar H. Gunderson <sgunderson@bigfoot.com>
Fri, 15 Feb 2019 21:58:02 +0000 (22:58 +0100)
committerSteinar H. Gunderson <sgunderson@bigfoot.com>
Fri, 15 Feb 2019 21:58:02 +0000 (22:58 +0100)
This makes the AVX2 version actually be used in practice (nobody really
compiles with -march=native).

shared/memcpy_interleaved.cpp

index 9634fd26b88ddcb95ca4d5c5afcedb79cb21747c..8b70d4cb4d30566da0fdcd67f5cd9a3a9c01a260 100644 (file)
@@ -1,7 +1,11 @@
+#if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
+#define HAS_MULTIVERSIONING 1
+#endif
+
 #include <algorithm>
 #include <assert.h>
 #include <cstdint>
-#if __SSE2__
+#if HAS_MULTIVERSIONING
 #include <immintrin.h>
 #endif
 
@@ -20,42 +24,48 @@ void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src,
        }
 }
 
-#ifdef __SSE2__
+#if HAS_MULTIVERSIONING
 
-// Returns the number of bytes consumed.
-size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+__attribute__((target("sse2")))
+size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit);
+
+__attribute__((target("avx2")))
+size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit);
+
+__attribute__((target("sse2")))
+size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit)
 {
-       const uint8_t *limit = src + n;
        size_t consumed = 0;
+       const __m128i * __restrict in = (const __m128i *)src;
+       __m128i * __restrict out1 = (__m128i *)dest1;
+       __m128i * __restrict out2 = (__m128i *)dest2;
 
-       // Align end to 32 bytes.
-       limit = (const uint8_t *)(intptr_t(limit) & ~31);
-
-       if (src >= limit) {
-               return 0;
-       }
+       __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
+       while (in < (const __m128i *)limit) {
+               __m128i data1 = _mm_load_si128(in);
+               __m128i data2 = _mm_load_si128(in + 1);
+               __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
+               __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
+               __m128i data1_hi = _mm_srli_epi16(data1, 8);
+               __m128i data2_hi = _mm_srli_epi16(data2, 8);
+               __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
+               _mm_storeu_si128(out1, lo);
+               __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
+               _mm_storeu_si128(out2, hi);
 
-       // Process [0,31] bytes, such that start gets aligned to 32 bytes.
-       const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31);
-       if (aligned_src != src) {
-               size_t n2 = aligned_src - src;
-               memcpy_interleaved_slow(dest1, dest2, src, n2);
-               dest1 += n2 / 2;
-               dest2 += n2 / 2;
-               if (n2 % 2) {
-                       swap(dest1, dest2);
-               }
-               src = aligned_src;
-               consumed += n2;
+               in += 2;
+               ++out1;
+               ++out2;
+               consumed += 32;
        }
 
-       // Make the length a multiple of 64.
-       if (((limit - src) % 64) != 0) {
-               limit -= 32;
-       }
-       assert(((limit - src) % 64) == 0);
+       return consumed;
+}
 
-#if __AVX2__
+__attribute__((target("avx2")))
+size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit)
+{
+       size_t consumed = 0;
        const __m256i *__restrict in = (const __m256i *)src;
        __m256i *__restrict out1 = (__m256i *)dest1;
        __m256i *__restrict out2 = (__m256i *)dest2;
@@ -85,39 +95,52 @@ size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t
                ++out2;
                consumed += 64;
        }
-#else
-       const __m128i * __restrict in = (const __m128i *)src;
-       __m128i * __restrict out1 = (__m128i *)dest1;
-       __m128i * __restrict out2 = (__m128i *)dest2;
 
-       __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
-       while (in < (const __m128i *)limit) {
-               __m128i data1 = _mm_load_si128(in);
-               __m128i data2 = _mm_load_si128(in + 1);
-               __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
-               __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
-               __m128i data1_hi = _mm_srli_epi16(data1, 8);
-               __m128i data2_hi = _mm_srli_epi16(data2, 8);
-               __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
-               _mm_storeu_si128(out1, lo);
-               __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
-               _mm_storeu_si128(out2, hi);
+       return consumed;
+}
 
-               in += 2;
-               ++out1;
-               ++out2;
-               consumed += 32;
+// Returns the number of bytes consumed.
+__attribute__((target("sse2", "avx2")))
+size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+       const uint8_t *limit = src + n;
+       size_t consumed = 0;
+
+       // Align end to 32 bytes.
+       limit = (const uint8_t *)(intptr_t(limit) & ~31);
+
+       if (src >= limit) {
+               return 0;
        }
-#endif
 
-       return consumed;
+       // Process [0,31] bytes, such that start gets aligned to 32 bytes.
+       const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31);
+       if (aligned_src != src) {
+               size_t n2 = aligned_src - src;
+               memcpy_interleaved_slow(dest1, dest2, src, n2);
+               dest1 += n2 / 2;
+               dest2 += n2 / 2;
+               if (n2 % 2) {
+                       swap(dest1, dest2);
+               }
+               src = aligned_src;
+               consumed += n2;
+       }
+
+       // Make the length a multiple of 64.
+       if (((limit - src) % 64) != 0) {
+               limit -= 32;
+       }
+       assert(((limit - src) % 64) == 0);
+
+       return consumed + memcpy_interleaved_fastpath_core(dest1, dest2, src, limit);
 }
 
-#endif  // defined(__SSE2__)
+#endif  // defined(HAS_MULTIVERSIONING)
 
 void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
 {
-#ifdef __SSE2__
+#if HAS_MULTIVERSIONING
        size_t consumed = memcpy_interleaved_fastpath(dest1, dest2, src, n);
        src += consumed;
        dest1 += consumed / 2;
@@ -126,11 +149,9 @@ void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size
                swap(dest1, dest2);
        }
        n -= consumed;
+#endif
 
        if (n > 0) {
                memcpy_interleaved_slow(dest1, dest2, src, n);
        }
-#else
-       memcpy_interleaved_slow(dest1, dest2, src, n);
-#endif
 }