X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=shared%2Fmemcpy_interleaved.cpp;h=dd7a59656608098ed451998838fcb7a713c5a6af;hb=d92973cb0206e84529011bc8edd644e1a25374bd;hp=2de1ecec3b725498ebc6d92d298e33b332723bd3;hpb=ee6ece72bc12e5527e114e0e1973f0c0b2dc2138;p=nageru diff --git a/shared/memcpy_interleaved.cpp b/shared/memcpy_interleaved.cpp index 2de1ece..dd7a596 100644 --- a/shared/memcpy_interleaved.cpp +++ b/shared/memcpy_interleaved.cpp @@ -24,8 +24,22 @@ void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, } } +void memcpy_interleaved_word_slow(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, size_t n) +{ + assert(n % 2 == 0); + uint16_t *dptr1 = dest1; + uint16_t *dptr2 = dest2; + + for (size_t i = 0; i < n; i += 2) { + *dptr1++ = *src++; + *dptr2++ = *src++; + } +} + #if HAS_MULTIVERSIONING +// uint8_t version. + __attribute__((target("default"))) size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit); @@ -112,6 +126,100 @@ size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t return consumed + memcpy_interleaved_fastpath_core(dest1, dest2, src, limit); } +// uint16_t version. + +__attribute__((target("default"))) +size_t memcpy_interleaved_word_fastpath_core(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, const uint16_t *limit); + +__attribute__((target("avx2"))) +size_t memcpy_interleaved_word_fastpath_core(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, const uint16_t *limit); + +__attribute__((target("default"))) +size_t memcpy_interleaved_word_fastpath_core(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, const uint16_t *limit) +{ + // No fast path supported unless we have AVX2. + return 0; +} + +__attribute__((target("avx2"))) +size_t memcpy_interleaved_word_fastpath_core(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, const uint16_t *limit) +{ + size_t consumed = 0; + const __m256i *__restrict in = (const __m256i *)src; + __m256i *__restrict out1 = (__m256i *)dest1; + __m256i *__restrict out2 = (__m256i *)dest2; + + __m256i shuffle_cw = _mm256_set_epi8( + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0, + 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0); + while (in < (const __m256i *)limit) { + // Note: Each element in these comments is 16 bits long (lanes are 2x128 bits). + __m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh + __m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp + + data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh + data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop + + data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh + data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop + + __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000); + __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001); + + _mm256_storeu_si256(out1, lo); + _mm256_storeu_si256(out2, hi); + + in += 2; + ++out1; + ++out2; + consumed += 32; + } + + return consumed; +} + +// Returns the number of bytes consumed. +size_t memcpy_interleaved_word_fastpath(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, size_t n) +{ + // We assume this to generally be the case, but just to be sure, + // drop down to the slow path. + if (intptr_t(dest1) % 2 != 0 || intptr_t(dest2) % 2 != 0 || intptr_t(src) % 2 != 0) { + return 0; + } + + const uint16_t *limit = src + n; + size_t consumed = 0; + + // Align end to 32 bytes. + limit = (const uint16_t *)(intptr_t(limit) & ~31); + + if (src >= limit) { + return 0; + } + + // Process [0,15] words, such that start gets aligned to 32 bytes (16 words). + const uint16_t *aligned_src = (const uint16_t *)(intptr_t(src + 31) & ~31); + if (aligned_src != src) { + size_t n2 = aligned_src - src; + memcpy_interleaved_word_slow(dest1, dest2, src, n2); + dest1 += n2 / 2; + dest2 += n2 / 2; + if (n2 % 2) { + swap(dest1, dest2); + } + src = aligned_src; + consumed += n2; + } + + // Make the length a multiple of 32 words (64 bytes). + if (((limit - src) % 32) != 0) { + limit -= 16; + } + assert(((limit - src) % 32) == 0); + + return consumed + memcpy_interleaved_word_fastpath_core(dest1, dest2, src, limit); +} + #endif // defined(HAS_MULTIVERSIONING) void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n) @@ -131,3 +239,21 @@ void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size memcpy_interleaved_slow(dest1, dest2, src, n); } } + +void memcpy_interleaved_word(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, size_t n) +{ +#if HAS_MULTIVERSIONING + size_t consumed = memcpy_interleaved_word_fastpath(dest1, dest2, src, n); + src += consumed; + dest1 += consumed / 2; + dest2 += consumed / 2; + if (consumed % 2) { + swap(dest1, dest2); + } + n -= consumed; +#endif + + if (n > 0) { + memcpy_interleaved_word_slow(dest1, dest2, src, n); + } +}