1 #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
2 #define HAS_MULTIVERSIONING 1
8 #if HAS_MULTIVERSIONING
// TODO: Support stride.
// Scalar fallback: de-interleaves n bytes of src into dest1 and dest2.
// NOTE(review): the loop body is elided from this view; presumably each
// iteration writes src[i] through dptr1 and src[i + 1] through dptr2 —
// confirm against the AVX2 core, whose shuffle does exactly that split.
void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
	uint8_t *dptr1 = dest1;  // write cursor for even-indexed source bytes
	uint8_t *dptr2 = dest2;  // write cursor for odd-indexed source bytes
	for (size_t i = 0; i < n; i += 2) {  // consumes two interleaved bytes per iteration
// Scalar fallback for 16-bit data: de-interleaves n words of src into
// dest1 and dest2 (n counts uint16_t elements, not bytes).
// NOTE(review): loop body elided; presumably src[i] -> dptr1, src[i+1] -> dptr2,
// mirroring the byte variant — confirm.
void memcpy_interleaved_word_slow(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, size_t n)
	uint16_t *dptr1 = dest1;  // write cursor for even-indexed source words
	uint16_t *dptr2 = dest2;  // write cursor for odd-indexed source words
	for (size_t i = 0; i < n; i += 2) {  // consumes two interleaved words per iteration
39 #if HAS_MULTIVERSIONING
// GCC/Clang function multiversioning: both a generic ("default") and an AVX2
// variant of the core are compiled, and the matching one is selected at
// run/load time based on the executing CPU.
__attribute__((target("default")))
size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit);

__attribute__((target("avx2")))
size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit);
// Generic fallback variant of the core, chosen when the CPU lacks AVX2.
// NOTE(review): body elided here; presumably it consumes nothing and
// returns 0 so the caller routes everything through the slow path — confirm.
__attribute__((target("default")))
size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit)
	// No fast path supported unless we have AVX2.
// AVX2 kernel: de-interleaves [src, limit) in 64-byte chunks.
// Caller contract (established by memcpy_interleaved_fastpath): src is
// 32-byte aligned (stream loads require it) and limit - src is a multiple of 64.
__attribute__((target("avx2")))
size_t memcpy_interleaved_fastpath_core(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, const uint8_t *limit)
	const __m256i *__restrict in = (const __m256i *)src;
	__m256i *__restrict out1 = (__m256i *)dest1;
	__m256i *__restrict out2 = (__m256i *)dest2;
	// Per 128-bit lane: gather even-indexed bytes into the low 8 bytes and
	// odd-indexed bytes into the high 8. (_mm256_set_epi8 lists arguments
	// from most-significant byte down to least.)
	__m256i shuffle_cw = _mm256_set_epi8(
		15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
		15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
	while (in < (const __m256i *)limit) {
		// Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
		__m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh
		__m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp
		data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh
		data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop
		// 0b11011000 selects qwords 0,2,1,3: swaps the two middle 64-bit quarters.
		data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh
		data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop
		// Recombine the low halves (all even bytes) and high halves (all odd bytes).
		__m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
		__m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
		_mm256_storeu_si256(out1, lo);
		_mm256_storeu_si256(out2, hi);
// Dispatcher for the byte fast path: peels an unaligned prefix off via the
// slow path, trims the range to the core's requirements, and runs the core.
// Returns the number of bytes consumed.
size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
	const uint8_t *limit = src + n;
	// Align end to 32 bytes.
	limit = (const uint8_t *)(intptr_t(limit) & ~31);
	// Process [0,31] bytes, such that start gets aligned to 32 bytes.
	const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31);
	if (aligned_src != src) {
		size_t n2 = aligned_src - src;
		// Unaligned prefix goes through the scalar fallback.
		memcpy_interleaved_slow(dest1, dest2, src, n2);
	// Make the length a multiple of 64.
	if (((limit - src) % 64) != 0) {
	assert(((limit - src) % 64) == 0);
	// NOTE(review): 'consumed' is declared/updated in lines elided from this
	// view; presumably it accumulates the bytes handled by the slow path.
	return consumed + memcpy_interleaved_fastpath_core(dest1, dest2, src, limit);
// Multiversioned word (16-bit) core: generic and AVX2 variants, selected
// at run/load time based on the executing CPU.
__attribute__((target("default")))
size_t memcpy_interleaved_word_fastpath_core(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, const uint16_t *limit);

__attribute__((target("avx2")))
size_t memcpy_interleaved_word_fastpath_core(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, const uint16_t *limit);
// Generic fallback variant of the word core, chosen when the CPU lacks AVX2.
// NOTE(review): body elided; presumably consumes nothing and returns 0,
// matching the byte variant — confirm.
__attribute__((target("default")))
size_t memcpy_interleaved_word_fastpath_core(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, const uint16_t *limit)
	// No fast path supported unless we have AVX2.
// AVX2 kernel for 16-bit data: de-interleaves [src, limit) in 64-byte chunks.
// Caller contract (established by memcpy_interleaved_word_fastpath): src is
// 32-byte aligned (stream loads require it) and the range is a multiple of
// 32 words (64 bytes).
__attribute__((target("avx2")))
size_t memcpy_interleaved_word_fastpath_core(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, const uint16_t *limit)
	const __m256i *__restrict in = (const __m256i *)src;
	__m256i *__restrict out1 = (__m256i *)dest1;
	__m256i *__restrict out2 = (__m256i *)dest2;
	// Per 128-bit lane: gather even-indexed 16-bit words (byte pairs kept
	// intact and in order) into the low 8 bytes, odd-indexed words into the
	// high 8. (_mm256_set_epi8 lists arguments from most-significant byte down.)
	__m256i shuffle_cw = _mm256_set_epi8(
		15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0,
		15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
	while (in < (const __m256i *)limit) {
		// Note: Each element in these comments is 16 bits long (lanes are 2x128 bits).
		__m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh
		__m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp
		data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh
		data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop
		// 0b11011000 selects qwords 0,2,1,3: swaps the two middle 64-bit quarters.
		data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh
		data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop
		// Recombine the low halves (all even words) and high halves (all odd words).
		__m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
		__m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
		_mm256_storeu_si256(out1, lo);
		_mm256_storeu_si256(out2, hi);
181 // Returns the number of bytes consumed.
182 size_t memcpy_interleaved_word_fastpath(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, size_t n)
184 // We assume this to generally be the case, but just to be sure,
185 // drop down to the slow path.
186 if (intptr_t(dest1) % 2 != 0 || intptr_t(dest2) % 2 != 0 || intptr_t(src) % 2 != 0) {
190 const uint16_t *limit = src + n;
193 // Align end to 32 bytes.
194 limit = (const uint16_t *)(intptr_t(limit) & ~31);
200 // Process [0,15] words, such that start gets aligned to 32 bytes (16 words).
201 const uint16_t *aligned_src = (const uint16_t *)(intptr_t(src + 31) & ~31);
202 if (aligned_src != src) {
203 size_t n2 = aligned_src - src;
204 memcpy_interleaved_word_slow(dest1, dest2, src, n2);
214 // Make the length a multiple of 32 words (64 bytes).
215 if (((limit - src) % 32) != 0) {
218 assert(((limit - src) % 32) == 0);
220 return consumed + memcpy_interleaved_word_fastpath_core(dest1, dest2, src, limit);
223 #endif // defined(HAS_MULTIVERSIONING)
// Public entry point: splits interleaved byte pairs in src across dest1/dest2.
// Uses the AVX2 fast path for the bulk when compiled with multiversioning,
// then lets the scalar fallback handle whatever remains.
void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
#if HAS_MULTIVERSIONING
	size_t consumed = memcpy_interleaved_fastpath(dest1, dest2, src, n);
	// Each destination received half of the consumed bytes.
	dest1 += consumed / 2;
	dest2 += consumed / 2;
	// NOTE(review): src/n are presumably advanced by 'consumed' in elided
	// lines before this call — confirm.
	memcpy_interleaved_slow(dest1, dest2, src, n);
// Public entry point for 16-bit data: splits interleaved word pairs in src
// across dest1/dest2 (n counts uint16_t elements). Mirrors the byte variant:
// AVX2 fast path for the bulk, scalar fallback for the rest.
void memcpy_interleaved_word(uint16_t *dest1, uint16_t *dest2, const uint16_t *src, size_t n)
#if HAS_MULTIVERSIONING
	size_t consumed = memcpy_interleaved_word_fastpath(dest1, dest2, src, n);
	// 'consumed' is in words; each destination received half of them.
	dest1 += consumed / 2;
	dest2 += consumed / 2;
	// NOTE(review): src/n are presumably advanced by 'consumed' in elided
	// lines before this call — confirm.
	memcpy_interleaved_word_slow(dest1, dest2, src, n);