From: Steinar H. Gunderson Date: Tue, 22 Sep 2015 23:16:11 +0000 (+0200) Subject: Make AVX2 code a bit cleaner. X-Git-Tag: 0.4~69 X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;ds=sidebyside;h=eb9dd19040997ee812bac870807275749bb99c6e;p=bmusb Make AVX2 code a bit cleaner. Possibly also a cycle shorter, but hard to say without a benchmark. --- diff --git a/bmusb.cpp b/bmusb.cpp index fb196f3..45d1d66 100644 --- a/bmusb.cpp +++ b/bmusb.cpp @@ -343,22 +343,29 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const swap(out1, out2); } - __m256i mask_lower_byte = _mm256_set1_epi16(0x00ff); + __m256i shuffle_cw = _mm256_set_epi8( + 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, + 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); while (in < (const __m256i *)limit) { - __m256i data1 = _mm256_load_si256(in); - __m256i data2 = _mm256_load_si256(in + 1); - __m256i data1_lo = _mm256_and_si256(data1, mask_lower_byte); - __m256i data2_lo = _mm256_and_si256(data2, mask_lower_byte); - __m256i data1_hi = _mm256_srli_epi16(data1, 8); - __m256i data2_hi = _mm256_srli_epi16(data2, 8); - __m256i lo = _mm256_packus_epi16(data1_lo, data2_lo); - lo = _mm256_permute4x64_epi64(lo, 0b11011000); - _mm256_storeu_si256(out1, lo); // Store as early as possible, even if the data isn't used. - __m256i hi = _mm256_packus_epi16(data1_hi, data2_hi); - hi = _mm256_permute4x64_epi64(hi, 0b11011000); - _mm256_storeu_si256(out2, hi); + // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128). + __m256i data1 = _mm256_load_si256(in); // AaBbCcDd EeFfGgHh + __m256i data2 = _mm256_load_si256(in + 1); // IiJjKkLl MmNnOoPp + __m256i found1 = _mm256_cmpeq_epi8(data1, needle); __m256i found2 = _mm256_cmpeq_epi8(data2, needle); + + data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh + data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop + + data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh + data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop + + __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000); + __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001); + + _mm256_storeu_si256(out1, lo); // Store as early as possible, even if the data isn't used. + _mm256_storeu_si256(out2, hi); + if (!_mm256_testz_si256(found1, found1) || !_mm256_testz_si256(found2, found2)) { break;