10 // TODO: Support stride.
// Scalar fallback: de-interleaves <n> bytes from <src>, sending even-indexed
// bytes to <dest1> and odd-indexed bytes to <dest2>, one pair per iteration.
// Used by the fast path to handle unaligned head/tail remainders, so <n> may
// be any size (including 0; the loop simply does not execute).
// NOTE(review): the loop body is not visible in this excerpt; presumably it is
// *dptr1++ = src[i]; *dptr2++ = src[i + 1]; — confirm against the full source.
11 void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
14 uint8_t *dptr1 = dest1;
15 uint8_t *dptr2 = dest2;
17 for (size_t i = 0; i < n; i += 2) {
25 // Returns the number of bytes consumed.
// SIMD fast path for byte de-interleaving: handles the bulk of <src> in large
// aligned chunks (AVX2: 64 bytes per iteration; SSE2: 32 bytes), writing even
// bytes to <dest1> and odd bytes to <dest2>. Any head bytes needed to reach
// 32-byte source alignment are delegated to memcpy_interleaved_slow(); tail
// bytes past <limit> are left for the caller. Returns how many bytes of <src>
// were fully consumed (head + vectorized portion).
// NOTE(review): this excerpt is sampled — the #if/#ifdef lines selecting the
// AVX2 vs. SSE2 path, the loop-pointer increments, and the final return are
// not visible here; only the matching #endif at the bottom is. Confirm the
// preprocessor structure against the full source.
26 size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
28 const uint8_t *limit = src + n;
// Round the end of the vectorized region down to a 32-byte boundary; the
// caller cleans up whatever is left between <limit> and src + n.
31 // Align end to 32 bytes.
32 limit = (const uint8_t *)(intptr_t(limit) & ~31);
// Round <src> up to the next 32-byte boundary and use the scalar path for the
// skipped bytes, so the vector loads below can be aligned (stream loads
// require alignment).
38 // Process [0,31] bytes, such that start gets aligned to 32 bytes.
39 const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31);
40 if (aligned_src != src) {
41 size_t n2 = aligned_src - src;
42 memcpy_interleaved_slow(dest1, dest2, src, n2);
// The AVX2 loop below eats 64 bytes per iteration, so shave the region down
// to a multiple of 64 first (the trimmed bytes fall to the caller's cleanup).
52 // Make the length a multiple of 64.
53 if (((limit - src) % 64) != 0) {
56 assert(((limit - src) % 64) == 0);
59 const __m256i *__restrict in = (const __m256i *)src;
60 __m256i *__restrict out1 = (__m256i *)dest1;
61 __m256i *__restrict out2 = (__m256i *)dest2;
// Shuffle control: within each 128-bit lane, gather the even-indexed bytes
// into the lower 8 bytes and the odd-indexed bytes into the upper 8 bytes.
// (_mm256_shuffle_epi8 operates per-lane, hence the repeated pattern.)
63 __m256i shuffle_cw = _mm256_set_epi8(
64 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
65 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
66 while (in < (const __m256i *)limit) {
67 // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
// Non-temporal (streaming) aligned loads — valid because <src> was aligned
// to 32 bytes above.
68 __m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh
69 __m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp
// Separate even/odd bytes within each lane...
71 data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh
72 data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop
// ...then swap the two middle 64-bit quarters so each 128-bit lane holds
// purely even bytes (low lane) or purely odd bytes (high lane).
74 data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh
75 data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop
// Recombine across the two registers: 0x20 takes both low lanes (all even
// bytes), 0x31 takes both high lanes (all odd bytes).
77 __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
78 __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
// Destinations may be unaligned, so use unaligned stores.
80 _mm256_storeu_si256(out1, lo);
81 _mm256_storeu_si256(out2, hi);
// SSE2 variant (presumably the #else branch, selected when AVX2 is
// unavailable): 32 bytes per iteration. NOTE(review): preprocessor lines not
// visible in this excerpt — confirm.
89 const __m128i * __restrict in = (const __m128i *)src;
90 __m128i * __restrict out1 = (__m128i *)dest1;
91 __m128i * __restrict out2 = (__m128i *)dest2;
// 0x00ff per 16-bit element: isolates the even (low) byte of each pair.
93 __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
94 while (in < (const __m128i *)limit) {
// Aligned loads — <src> was 32-byte (hence also 16-byte) aligned above.
95 __m128i data1 = _mm_load_si128(in);
96 __m128i data2 = _mm_load_si128(in + 1);
// Even bytes: mask out the high byte of every 16-bit pair.
97 __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
98 __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
// Odd bytes: shift the high byte of every pair down into the low byte.
99 __m128i data1_hi = _mm_srli_epi16(data1, 8);
100 __m128i data2_hi = _mm_srli_epi16(data2, 8);
// Each 16-bit element is now in [0, 255], so the unsigned-saturating pack
// narrows losslessly, concatenating the two registers' bytes.
101 __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
102 _mm_storeu_si128(out1, lo);
103 __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
104 _mm_storeu_si128(out2, hi);
116 #endif // defined(__SSE2__)
118 void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
// Public entry point: de-interleave <n> bytes from <src>, even-indexed bytes
// to <dest1>, odd-indexed bytes to <dest2>. Lets the SIMD fast path consume
// as much as it can, then finishes the remainder with the scalar loop.
121 size_t consumed = memcpy_interleaved_fastpath(dest1, dest2, src, n);
// Bytes alternate between the two destinations, so each output pointer
// advances by half of what was consumed from <src>.
123 dest1 += consumed / 2;
124 dest2 += consumed / 2;
// NOTE(review): <src> presumably advances by <consumed> and <n> shrinks by
// <consumed> on lines not visible in this excerpt, and the two identical
// slow-path calls below are presumably alternate preprocessor branches
// (fast path compiled in vs. not) — confirm against the full source.
131 memcpy_interleaved_slow(dest1, dest2, src, n);
134 memcpy_interleaved_slow(dest1, dest2, src, n);