swap(out1, out2);
}
- __m256i mask_lower_byte = _mm256_set1_epi16(0x00ff);
+ __m256i shuffle_cw = _mm256_set_epi8(
+ 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
+ 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
while (in < (const __m256i *)limit) {
- __m256i data1 = _mm256_load_si256(in);
- __m256i data2 = _mm256_load_si256(in + 1);
- __m256i data1_lo = _mm256_and_si256(data1, mask_lower_byte);
- __m256i data2_lo = _mm256_and_si256(data2, mask_lower_byte);
- __m256i data1_hi = _mm256_srli_epi16(data1, 8);
- __m256i data2_hi = _mm256_srli_epi16(data2, 8);
- __m256i lo = _mm256_packus_epi16(data1_lo, data2_lo);
- lo = _mm256_permute4x64_epi64(lo, 0b11011000);
- _mm256_storeu_si256(out1, lo); // Store as early as possible, even if the data isn't used.
- __m256i hi = _mm256_packus_epi16(data1_hi, data2_hi);
- hi = _mm256_permute4x64_epi64(hi, 0b11011000);
- _mm256_storeu_si256(out2, hi);
+ // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
+ __m256i data1 = _mm256_load_si256(in); // AaBbCcDd EeFfGgHh
+ __m256i data2 = _mm256_load_si256(in + 1); // IiJjKkLl MmNnOoPp
+
__m256i found1 = _mm256_cmpeq_epi8(data1, needle);
__m256i found2 = _mm256_cmpeq_epi8(data2, needle);
+
+ data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh
+ data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop
+
+ data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh
+ data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop
+
+ __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
+ __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
+
+ _mm256_storeu_si256(out1, lo); // Store as early as possible, even if the data isn't used.
+ _mm256_storeu_si256(out2, hi);
+
if (!_mm256_testz_si256(found1, found1) ||
!_mm256_testz_si256(found2, found2)) {
break;
printf(" interface %d\n", interface_number);
const libusb_interface *interface = &config->interface[interface_number];
for (int altsetting = 0; altsetting < interface->num_altsetting; ++altsetting) {
- printf(" alternate setting %d\n", altsetting);
const libusb_interface_descriptor *interface_desc = &interface->altsetting[altsetting];
+ printf(" alternate setting %d\n", interface_desc->bAlternateSetting);
for (int endpoint_number = 0; endpoint_number < interface_desc->bNumEndpoints; ++endpoint_number) {
const libusb_endpoint_descriptor *endpoint = &interface_desc->endpoint[endpoint_number];
printf(" endpoint address 0x%02x\n", endpoint->bEndpointAddress);
//
// so only first 16 bits count, and 0x0100 is a mask for ok/stable signal?
//
+ // Bottom 16 bits of this register seem to be firmware version number (possibly not all all of them).
+ //
// 28 and 32 seems to be analog audio input levels (one byte for each of the eight channels).
// however, if setting 32 with HDMI embedded audio, it is immediately overwritten back (to 0xe137002a).
//
// 36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
// but the driver sets it to 0x8036802a at some point.
//
+ // all of this is on request 214/215. other requests (192, 219,
+ // 222, 223, 224) are used for firmware upgrade. Probably best to
+ // stay out of it unless you know what you're doing.
+ //
+ //
// register 16:
// first byte is 0x39 for a stable 576p60 signal, 0x2d for a stable 720p60 signal, 0x20 for no signal
//