+ __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
+ while (in < (const __m128i *)limit) {
+ __m128i data1 = _mm_load_si128(in);
+ __m128i data2 = _mm_load_si128(in + 1);
+ __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
+ __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
+ __m128i data1_hi = _mm_srli_epi16(data1, 8);
+ __m128i data2_hi = _mm_srli_epi16(data2, 8);
+ __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
+ _mm_storeu_si128(out1, lo);
+ __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
+ _mm_storeu_si128(out2, hi);