X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=shared%2Fmemcpy_interleaved.cpp;h=dd7a59656608098ed451998838fcb7a713c5a6af;hb=fb65325fa7865b45d2799f3a916fcbae517e38cf;hp=9634fd26b88ddcb95ca4d5c5afcedb79cb21747c;hpb=b563b8903fa84bb7fd62d7d0b84b70cb26843dbf;p=nageru diff --git a/shared/memcpy_interleaved.cpp b/shared/memcpy_interleaved.cpp index 9634fd2..dd7a596 100644 --- a/shared/memcpy_interleaved.cpp +++ b/shared/memcpy_interleaved.cpp @@ -1,7 +1,11 @@ +#if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__) +#define HAS_MULTIVERSIONING 1 +#endif + #include #include #include -#if __SSE2__ +#if HAS_MULTIVERSIONING #include #endif @@ -20,7 +24,71 @@ void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, } } -#ifdef __SSE2__ +void memcpy_interleaved_word_slow(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, size_t n) +{ + assert(n % 2 == 0); + uint16_t *dptr1 = dest1; + uint16_t *dptr2 = dest2; + + for (size_t i = 0; i < n; i += 2) { + *dptr1++ = *src++; + *dptr2++ = *src++; + } +} + +#if HAS_MULTIVERSIONING + +// uint8_t version. + +__attribute__((target("default"))) +size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit); + +__attribute__((target("avx2"))) +size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit); + +__attribute__((target("default"))) +size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit) +{ + // No fast path supported unless we have AVX2. + return 0; +} + +__attribute__((target("avx2"))) +size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit) +{ + size_t consumed = 0; + const __m256i *__restrict in = (const __m256i *)src; + __m256i *__restrict out1 = (__m256i *)dest1; + __m256i *__restrict out2 = (__m256i *)dest2; + + __m256i shuffle_cw = _mm256_set_epi8( + 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, + 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); + while (in < (const __m256i *)limit) { + // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128). + __m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh + __m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp + + data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh + data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop + + data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh + data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop + + __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000); + __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001); + + _mm256_storeu_si256(out1, lo); + _mm256_storeu_si256(out2, hi); + + in += 2; + ++out1; + ++out2; + consumed += 64; + } + + return consumed; +} // Returns the number of bytes consumed. size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n) @@ -55,22 +123,43 @@ size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t } assert(((limit - src) % 64) == 0); -#if __AVX2__ + return consumed + memcpy_interleaved_fastpath_core(dest1, dest2, src, limit); +} + +// uint16_t version. + +__attribute__((target("default"))) +size_t memcpy_interleaved_word_fastpath_core(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, const uint16_t *limit); + +__attribute__((target("avx2"))) +size_t memcpy_interleaved_word_fastpath_core(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, const uint16_t *limit); + +__attribute__((target("default"))) +size_t memcpy_interleaved_word_fastpath_core(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, const uint16_t *limit) +{ + // No fast path supported unless we have AVX2. + return 0; +} + +__attribute__((target("avx2"))) +size_t memcpy_interleaved_word_fastpath_core(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, const uint16_t *limit) +{ + size_t consumed = 0; const __m256i *__restrict in = (const __m256i *)src; __m256i *__restrict out1 = (__m256i *)dest1; __m256i *__restrict out2 = (__m256i *)dest2; __m256i shuffle_cw = _mm256_set_epi8( - 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, - 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0); while (in < (const __m256i *)limit) { - // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128). + // Note: Each element in these comments is 16 bits long (lanes are 2x128 bits). __m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh __m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop - + data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop @@ -80,44 +169,62 @@ size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t _mm256_storeu_si256(out1, lo); _mm256_storeu_si256(out2, hi); - in += 2; - ++out1; - ++out2; - consumed += 64; - } -#else - const __m128i * __restrict in = (const __m128i *)src; - __m128i * __restrict out1 = (__m128i *)dest1; - __m128i * __restrict out2 = (__m128i *)dest2; - - __m128i mask_lower_byte = _mm_set1_epi16(0x00ff); - while (in < (const __m128i *)limit) { - __m128i data1 = _mm_load_si128(in); - __m128i data2 = _mm_load_si128(in + 1); - __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte); - __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte); - __m128i data1_hi = _mm_srli_epi16(data1, 8); - __m128i data2_hi = _mm_srli_epi16(data2, 8); - __m128i lo = _mm_packus_epi16(data1_lo, data2_lo); - _mm_storeu_si128(out1, lo); - __m128i hi = _mm_packus_epi16(data1_hi, data2_hi); - _mm_storeu_si128(out2, hi); - in += 2; ++out1; ++out2; consumed += 32; } -#endif return consumed; } -#endif // defined(__SSE2__) +// Returns the number of bytes consumed. +size_t memcpy_interleaved_word_fastpath(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, size_t n) +{ + // We assume this to generally be the case, but just to be sure, + // drop down to the slow path. + if (intptr_t(dest1) % 2 != 0 || intptr_t(dest2) % 2 != 0 || intptr_t(src) % 2 != 0) { + return 0; + } + + const uint16_t *limit = src + n; + size_t consumed = 0; + + // Align end to 32 bytes. + limit = (const uint16_t *)(intptr_t(limit) & ~31); + + if (src >= limit) { + return 0; + } + + // Process [0,15] words, such that start gets aligned to 32 bytes (16 words). + const uint16_t *aligned_src = (const uint16_t *)(intptr_t(src + 31) & ~31); + if (aligned_src != src) { + size_t n2 = aligned_src - src; + memcpy_interleaved_word_slow(dest1, dest2, src, n2); + dest1 += n2 / 2; + dest2 += n2 / 2; + if (n2 % 2) { + swap(dest1, dest2); + } + src = aligned_src; + consumed += n2; + } + + // Make the length a multiple of 32 words (64 bytes). + if (((limit - src) % 32) != 0) { + limit -= 16; + } + assert(((limit - src) % 32) == 0); + + return consumed + memcpy_interleaved_word_fastpath_core(dest1, dest2, src, limit); +} + +#endif // defined(HAS_MULTIVERSIONING) void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n) { -#ifdef __SSE2__ +#if HAS_MULTIVERSIONING size_t consumed = memcpy_interleaved_fastpath(dest1, dest2, src, n); src += consumed; dest1 += consumed / 2; @@ -126,11 +233,27 @@ void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size swap(dest1, dest2); } n -= consumed; +#endif if (n > 0) { memcpy_interleaved_slow(dest1, dest2, src, n); } -#else - memcpy_interleaved_slow(dest1, dest2, src, n); +} + +void memcpy_interleaved_word(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, size_t n) +{ +#if HAS_MULTIVERSIONING + size_t consumed = memcpy_interleaved_word_fastpath(dest1, dest2, src, n); + src += consumed; + dest1 += consumed / 2; + dest2 += consumed / 2; + if (consumed % 2) { + swap(dest1, dest2); + } + n -= consumed; #endif + + if (n > 0) { + memcpy_interleaved_word_slow(dest1, dest2, src, n); + } }