// Fast path for deinterleaving the byte stream [src, limit): low (even)
// bytes go to dest1 and high (odd) bytes go to dest2. Returns the number
// of source bytes actually consumed; the caller must handle any remainder
// itself. Multiversioned via the "target" attribute — the runtime
// dispatcher selects the best variant for the executing CPU.
__attribute__((target("default")))
size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit);

__attribute__((target("avx2")))
size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit);
// Generic fallback, selected when the CPU supports none of the specialized
// variants. Consumes nothing (returns 0), signaling the caller to perform
// the whole deinterleave with scalar code.
__attribute__((target("default")))
size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit)
{
	// No fast path supported unless we have AVX2.
	return 0;
}
// NOTE: The SSE2 variant of memcpy_interleaved_fastpath_core was removed;
// only the AVX2 fast path remains, and the "default" version above falls
// back by returning 0 (no bytes consumed) so the caller deinterleaves
// with scalar code on pre-AVX2 CPUs.
__attribute__((target("avx2")))
size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit)
{