- __m256i data1 = _mm256_load_si256(in);
- __m256i data2 = _mm256_load_si256(in + 1);
- __m256i data1_lo = _mm256_and_si256(data1, mask_lower_byte);
- __m256i data2_lo = _mm256_and_si256(data2, mask_lower_byte);
- __m256i data1_hi = _mm256_srli_epi16(data1, 8);
- __m256i data2_hi = _mm256_srli_epi16(data2, 8);
- __m256i lo = _mm256_packus_epi16(data1_lo, data2_lo);
- lo = _mm256_permute4x64_epi64(lo, 0b11011000);
- _mm256_storeu_si256(out1, lo); // Store as early as possible, even if the data isn't used.
- __m256i hi = _mm256_packus_epi16(data1_hi, data2_hi);
- hi = _mm256_permute4x64_epi64(hi, 0b11011000);
- _mm256_storeu_si256(out2, hi);
+ // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
+ __m256i data1 = _mm256_load_si256(in); // AaBbCcDd EeFfGgHh
+ __m256i data2 = _mm256_load_si256(in + 1); // IiJjKkLl MmNnOoPp
+