git.sesse.net Git - bmusb/commitdiff
Yet more small AVX2 tweaks.
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Tue, 22 Sep 2015 23:31:36 +0000 (01:31 +0200)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Tue, 22 Sep 2015 23:32:15 +0000 (01:32 +0200)
bmusb.cpp

index 45d1d669dd2fe0771f362aa9eed4b24df8482b6c..1b4628e378c18796e192db4cd260ab92584e0b8f 100644 (file)
--- a/bmusb.cpp
+++ b/bmusb.cpp
@@ -335,10 +335,10 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const
 #if __AVX2__
        const __m256i needle = _mm256_set1_epi8(sync_char);
 
-       const __m256i *in = (const __m256i *)aligned_start;
+       const __restrict __m256i *in = (const __m256i *)aligned_start;
        if (current_frame->interleaved) {
-               __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
-               __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
+               __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
+               __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
                if (current_frame->len % 2 == 1) {
                        swap(out1, out2);
                }
@@ -348,11 +348,12 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const
                        15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
                while (in < (const __m256i *)limit) {
                        // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
-                       __m256i data1 = _mm256_load_si256(in);                // AaBbCcDd EeFfGgHh
-                       __m256i data2 = _mm256_load_si256(in + 1);            // IiJjKkLl MmNnOoPp
+                       __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
+                       __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
 
                        __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
                        __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
+                       __m256i found = _mm256_or_si256(found1, found2);
 
                        data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
                        data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
@@ -366,8 +367,7 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const
                        _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
                        _mm256_storeu_si256(out2, hi);
 
-                       if (!_mm256_testz_si256(found1, found1) ||
-                           !_mm256_testz_si256(found2, found2)) {
+                       if (!_mm256_testz_si256(found, found)) {
                                break;
                        }