From 306e7142adb5ee950cdda4b9588b3120655870aa Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson"
Date: Thu, 21 Jul 2022 17:40:58 +0200
Subject: [PATCH] Drop the SSE2 fastpath.

Pre-Haswell CPUs are not really worth optimizing for anymore.
---
 shared/memcpy_interleaved.cpp | 35 +----------------------------------
 1 file changed, 1 insertion(+), 34 deletions(-)

diff --git a/shared/memcpy_interleaved.cpp b/shared/memcpy_interleaved.cpp
index 4aba3c9..2de1ece 100644
--- a/shared/memcpy_interleaved.cpp
+++ b/shared/memcpy_interleaved.cpp
@@ -29,49 +29,16 @@ void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src,
 __attribute__((target("default")))
 size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit);
 
-__attribute__((target("sse2")))
-size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit);
-
 __attribute__((target("avx2")))
 size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit);
 
 __attribute__((target("default")))
 size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit)
 {
-	// No fast path possible unless we have SSE2 or higher.
+	// No fast path supported unless we have AVX2.
 	return 0;
 }
 
-__attribute__((target("sse2")))
-size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit)
-{
-	size_t consumed = 0;
-	const __m128i * __restrict in = (const __m128i *)src;
-	__m128i * __restrict out1 = (__m128i *)dest1;
-	__m128i * __restrict out2 = (__m128i *)dest2;
-
-	__m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
-	while (in < (const __m128i *)limit) {
-		__m128i data1 = _mm_load_si128(in);
-		__m128i data2 = _mm_load_si128(in + 1);
-		__m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
-		__m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
-		__m128i data1_hi = _mm_srli_epi16(data1, 8);
-		__m128i data2_hi = _mm_srli_epi16(data2, 8);
-		__m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
-		_mm_storeu_si128(out1, lo);
-		__m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
-		_mm_storeu_si128(out2, hi);
-
-		in += 2;
-		++out1;
-		++out2;
-		consumed += 32;
-	}
-
-	return consumed;
-}
-
 __attribute__((target("avx2")))
 size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit)
 {
-- 
2.39.2