#ifdef __SSE2__
+#if 0 // Debug-only helper, compiled out by this `#if 0`.
+void avx2_dump(const char *name, __m256i n) // Prints `name` and the 32 bytes of `n` as hex on a single stdout line.
+{
+ printf("%-10s:", name); // Label: left-justified, padded to 10 columns, followed by ':'.
+ printf(" %02x", _mm256_extract_epi8(n, 0)); // Byte indices 0-7, each as " %02x" (space + zero-padded hex, min. two digits).
+ printf(" %02x", _mm256_extract_epi8(n, 1)); // NOTE(review): unrolled by hand, presumably because the intrinsic needs a constant index -- confirm.
+ printf(" %02x", _mm256_extract_epi8(n, 2));
+ printf(" %02x", _mm256_extract_epi8(n, 3));
+ printf(" %02x", _mm256_extract_epi8(n, 4));
+ printf(" %02x", _mm256_extract_epi8(n, 5));
+ printf(" %02x", _mm256_extract_epi8(n, 6));
+ printf(" %02x", _mm256_extract_epi8(n, 7));
+ printf(" "); // Extra space so each group of 8 bytes is visually separated.
+ printf(" %02x", _mm256_extract_epi8(n, 8)); // Byte indices 8-15.
+ printf(" %02x", _mm256_extract_epi8(n, 9));
+ printf(" %02x", _mm256_extract_epi8(n, 10));
+ printf(" %02x", _mm256_extract_epi8(n, 11));
+ printf(" %02x", _mm256_extract_epi8(n, 12));
+ printf(" %02x", _mm256_extract_epi8(n, 13));
+ printf(" %02x", _mm256_extract_epi8(n, 14));
+ printf(" %02x", _mm256_extract_epi8(n, 15));
+ printf(" "); // Group separator.
+ printf(" %02x", _mm256_extract_epi8(n, 16)); // Byte indices 16-23.
+ printf(" %02x", _mm256_extract_epi8(n, 17));
+ printf(" %02x", _mm256_extract_epi8(n, 18));
+ printf(" %02x", _mm256_extract_epi8(n, 19));
+ printf(" %02x", _mm256_extract_epi8(n, 20));
+ printf(" %02x", _mm256_extract_epi8(n, 21));
+ printf(" %02x", _mm256_extract_epi8(n, 22));
+ printf(" %02x", _mm256_extract_epi8(n, 23));
+ printf(" "); // Group separator.
+ printf(" %02x", _mm256_extract_epi8(n, 24)); // Byte indices 24-31.
+ printf(" %02x", _mm256_extract_epi8(n, 25));
+ printf(" %02x", _mm256_extract_epi8(n, 26));
+ printf(" %02x", _mm256_extract_epi8(n, 27));
+ printf(" %02x", _mm256_extract_epi8(n, 28));
+ printf(" %02x", _mm256_extract_epi8(n, 29));
+ printf(" %02x", _mm256_extract_epi8(n, 30));
+ printf(" %02x", _mm256_extract_epi8(n, 31));
+ printf("\n"); // Terminates the dump line.
+}
+#endif // 0 (end of disabled avx2_dump debug helper)
+
// Does a memcpy and memchr in one to reduce processing time.
// Note that the benefit is somewhat limited if your L3 cache is small,
// as you'll (unfortunately) spend most of the time loading the data