From 66c4fc7370e321fa5725e56685e5c5bb84bd6990 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Fri, 15 Feb 2019 22:58:02 +0100 Subject: [PATCH] Enable function multiversioning for memcpy_interleaved(). This makes the AVX2 version actually be used in practice (nobody really compiles with -march=native). --- shared/memcpy_interleaved.cpp | 133 ++++++++++++++++++++-------------- 1 file changed, 77 insertions(+), 56 deletions(-) diff --git a/shared/memcpy_interleaved.cpp b/shared/memcpy_interleaved.cpp index 9634fd2..8b70d4c 100644 --- a/shared/memcpy_interleaved.cpp +++ b/shared/memcpy_interleaved.cpp @@ -1,7 +1,11 @@ +#if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__) +#define HAS_MULTIVERSIONING 1 +#endif + #include #include #include -#if __SSE2__ +#if HAS_MULTIVERSIONING #include #endif @@ -20,42 +24,48 @@ void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, } } -#ifdef __SSE2__ +#if HAS_MULTIVERSIONING -// Returns the number of bytes consumed. -size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n) +__attribute__((target("sse2"))) +size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit); + +__attribute__((target("avx2"))) +size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit); + +__attribute__((target("sse2"))) +size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit) { - const uint8_t *limit = src + n; size_t consumed = 0; + const __m128i * __restrict in = (const __m128i *)src; + __m128i * __restrict out1 = (__m128i *)dest1; + __m128i * __restrict out2 = (__m128i *)dest2; - // Align end to 32 bytes. - limit = (const uint8_t *)(intptr_t(limit) & ~31); - - if (src >= limit) { - return 0; - } + __m128i mask_lower_byte = _mm_set1_epi16(0x00ff); + while (in < (const __m128i *)limit) { + __m128i data1 = _mm_load_si128(in); + __m128i data2 = _mm_load_si128(in + 1); + __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte); + __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte); + __m128i data1_hi = _mm_srli_epi16(data1, 8); + __m128i data2_hi = _mm_srli_epi16(data2, 8); + __m128i lo = _mm_packus_epi16(data1_lo, data2_lo); + _mm_storeu_si128(out1, lo); + __m128i hi = _mm_packus_epi16(data1_hi, data2_hi); + _mm_storeu_si128(out2, hi); - // Process [0,31] bytes, such that start gets aligned to 32 bytes. - const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31); - if (aligned_src != src) { - size_t n2 = aligned_src - src; - memcpy_interleaved_slow(dest1, dest2, src, n2); - dest1 += n2 / 2; - dest2 += n2 / 2; - if (n2 % 2) { - swap(dest1, dest2); - } - src = aligned_src; - consumed += n2; + in += 2; + ++out1; + ++out2; + consumed += 32; } - // Make the length a multiple of 64. - if (((limit - src) % 64) != 0) { - limit -= 32; - } - assert(((limit - src) % 64) == 0); + return consumed; +} -#if __AVX2__ +__attribute__((target("avx2"))) +size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit) +{ + size_t consumed = 0; const __m256i *__restrict in = (const __m256i *)src; __m256i *__restrict out1 = (__m256i *)dest1; __m256i *__restrict out2 = (__m256i *)dest2; @@ -85,39 +95,52 @@ size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t ++out2; consumed += 64; } -#else - const __m128i * __restrict in = (const __m128i *)src; - __m128i * __restrict out1 = (__m128i *)dest1; - __m128i * __restrict out2 = (__m128i *)dest2; - __m128i mask_lower_byte = _mm_set1_epi16(0x00ff); - while (in < (const __m128i *)limit) { - __m128i data1 = _mm_load_si128(in); - __m128i data2 = _mm_load_si128(in + 1); - __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte); - __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte); - __m128i data1_hi = _mm_srli_epi16(data1, 8); - __m128i data2_hi = _mm_srli_epi16(data2, 8); - __m128i lo = _mm_packus_epi16(data1_lo, data2_lo); - _mm_storeu_si128(out1, lo); - __m128i hi = _mm_packus_epi16(data1_hi, data2_hi); - _mm_storeu_si128(out2, hi); + return consumed; +} - in += 2; - ++out1; - ++out2; - consumed += 32; +// Returns the number of bytes consumed. +__attribute__((target("sse2", "avx2"))) +size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n) +{ + const uint8_t *limit = src + n; + size_t consumed = 0; + + // Align end to 32 bytes. + limit = (const uint8_t *)(intptr_t(limit) & ~31); + + if (src >= limit) { + return 0; } -#endif - return consumed; + // Process [0,31] bytes, such that start gets aligned to 32 bytes. + const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31); + if (aligned_src != src) { + size_t n2 = aligned_src - src; + memcpy_interleaved_slow(dest1, dest2, src, n2); + dest1 += n2 / 2; + dest2 += n2 / 2; + if (n2 % 2) { + swap(dest1, dest2); + } + src = aligned_src; + consumed += n2; + } + + // Make the length a multiple of 64. + if (((limit - src) % 64) != 0) { + limit -= 32; + } + assert(((limit - src) % 64) == 0); + + return consumed + memcpy_interleaved_fastpath_core(dest1, dest2, src, limit); } -#endif // defined(__SSE2__) +#endif // defined(HAS_MULTIVERSIONING) void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n) { -#ifdef __SSE2__ +#if HAS_MULTIVERSIONING size_t consumed = memcpy_interleaved_fastpath(dest1, dest2, src, n); src += consumed; dest1 += consumed / 2; @@ -126,11 +149,9 @@ void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size swap(dest1, dest2); } n -= consumed; +#endif if (n > 0) { memcpy_interleaved_slow(dest1, dest2, src, n); } -#else - memcpy_interleaved_slow(dest1, dest2, src, n); -#endif } -- 2.39.2