Make AVX2 code a bit cleaner.

author Steinar H. Gunderson <sgunderson@bigfoot.com>

Tue, 22 Sep 2015 23:16:11 +0000 (01:16 +0200)

committer Steinar H. Gunderson <sgunderson@bigfoot.com>

Tue, 22 Sep 2015 23:16:11 +0000 (01:16 +0200)
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Tue, 22 Sep 2015 23:16:11 +0000 (01:16 +0200)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Tue, 22 Sep 2015 23:16:11 +0000 (01:16 +0200)
diff --git a/bmusb.cpp b/bmusb.cpp

index fb196f38aede4cd8fbdab3c35263c32059dde35b..45d1d669dd2fe0771f362aa9eed4b24df8482b6c 100644 (file)
--- a/bmusb.cpp
+++ b/bmusb.cpp
@@ -343,22 +343,29 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const
                         swap(out1, out2);
                 }
  
-               __m256i mask_lower_byte = _mm256_set1_epi16(0x00ff);
+               __m256i shuffle_cw = _mm256_set_epi8(
+                       15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
+                       15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
                 while (in < (const __m256i *)limit) {
-                       __m256i data1 = _mm256_load_si256(in);
-                       __m256i data2 = _mm256_load_si256(in + 1);
-                       __m256i data1_lo = _mm256_and_si256(data1, mask_lower_byte);
-                       __m256i data2_lo = _mm256_and_si256(data2, mask_lower_byte);
-                       __m256i data1_hi = _mm256_srli_epi16(data1, 8);
-                       __m256i data2_hi = _mm256_srli_epi16(data2, 8);
-                       __m256i lo = _mm256_packus_epi16(data1_lo, data2_lo);
-                       lo = _mm256_permute4x64_epi64(lo, 0b11011000);
-                       _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
-                       __m256i hi = _mm256_packus_epi16(data1_hi, data2_hi);
-                       hi = _mm256_permute4x64_epi64(hi, 0b11011000);
-                       _mm256_storeu_si256(out2, hi);
+                       // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
+                       __m256i data1 = _mm256_load_si256(in);                // AaBbCcDd EeFfGgHh
+                       __m256i data2 = _mm256_load_si256(in + 1);            // IiJjKkLl MmNnOoPp
+
                         __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
                         __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
+
+                       data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
+                       data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
+               
+                       data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
+                       data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
+
+                       __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
+                       __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
+
+                       _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
+                       _mm256_storeu_si256(out2, hi);
+
                         if (!_mm256_testz_si256(found1, found1) ||
                             !_mm256_testz_si256(found2, found2)) {
                                 break;
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Tue, 22 Sep 2015 23:16:11 +0000 (01:16 +0200)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Tue, 22 Sep 2015 23:16:11 +0000 (01:16 +0200)