From: Steinar H. Gunderson Date: Tue, 22 Sep 2015 23:31:36 +0000 (+0200) Subject: Yet more small AVX2 tweaks. X-Git-Tag: 0.4~68 X-Git-Url: https://git.sesse.net/?p=bmusb;a=commitdiff_plain;h=03d475289549453b606c1c04d21e2929abef1e90 Yet more small AVX2 tweaks. --- diff --git a/bmusb.cpp b/bmusb.cpp index 45d1d66..1b4628e 100644 --- a/bmusb.cpp +++ b/bmusb.cpp @@ -335,10 +335,10 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const #if __AVX2__ const __m256i needle = _mm256_set1_epi8(sync_char); - const __m256i *in = (const __m256i *)aligned_start; + const __restrict __m256i *in = (const __m256i *)aligned_start; if (current_frame->interleaved) { - __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2); - __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2); + __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2); + __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2); if (current_frame->len % 2 == 1) { swap(out1, out2); } @@ -348,11 +348,12 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); while (in < (const __m256i *)limit) { // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128). - __m256i data1 = _mm256_load_si256(in); // AaBbCcDd EeFfGgHh - __m256i data2 = _mm256_load_si256(in + 1); // IiJjKkLl MmNnOoPp + __m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh + __m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp __m256i found1 = _mm256_cmpeq_epi8(data1, needle); __m256i found2 = _mm256_cmpeq_epi8(data2, needle); + __m256i found = _mm256_or_si256(found1, found2); data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop @@ -366,8 +367,7 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const _mm256_storeu_si256(out1, lo); // Store as early as possible, even if the data isn't used. _mm256_storeu_si256(out2, hi); - if (!_mm256_testz_si256(found1, found1) || - !_mm256_testz_si256(found2, found2)) { + if (!_mm256_testz_si256(found, found)) { break; }