git.sesse.net Git - nageru/commitdiff
Drop the SSE2 fastpath.
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 21 Jul 2022 15:40:58 +0000 (17:40 +0200)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 21 Jul 2022 15:40:58 +0000 (17:40 +0200)
Pre-Haswell CPUs are not really worth optimizing for anymore.

shared/memcpy_interleaved.cpp

index 4aba3c936a00effd9fb68ae99042db85375b4d24..2de1ecec3b725498ebc6d92d298e33b332723bd3 100644 (file)
@@ -29,49 +29,16 @@ void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src,
 __attribute__((target("default")))
 size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit);
 
-__attribute__((target("sse2")))
-size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit);
-
 __attribute__((target("avx2")))
 size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit);
 
 __attribute__((target("default")))
 size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit)
 {
-       // No fast path possible unless we have SSE2 or higher.
+       // No fast path supported unless we have AVX2.
        return 0;
 }
 
-__attribute__((target("sse2")))
-size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit)
-{
-       size_t consumed = 0;
-       const __m128i * __restrict in = (const __m128i *)src;
-       __m128i * __restrict out1 = (__m128i *)dest1;
-       __m128i * __restrict out2 = (__m128i *)dest2;
-
-       __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
-       while (in < (const __m128i *)limit) {
-               __m128i data1 = _mm_load_si128(in);
-               __m128i data2 = _mm_load_si128(in + 1);
-               __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
-               __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
-               __m128i data1_hi = _mm_srli_epi16(data1, 8);
-               __m128i data2_hi = _mm_srli_epi16(data2, 8);
-               __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
-               _mm_storeu_si128(out1, lo);
-               __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
-               _mm_storeu_si128(out2, hi);
-
-               in += 2;
-               ++out1;
-               ++out2;
-               consumed += 32;
-       }
-
-       return consumed;
-}
-
 __attribute__((target("avx2")))
 size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit)
 {