X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=bmusb.cpp;h=00f06e443f387f1b0218ae8284a53b00a7b173da;hb=f9218bdaa19f2c286ca3e2b05c156960edfa0780;hp=c3914ac83ca29482a3e2ec503c8266daa3337a95;hpb=67bbba40bacadfaa53ed6aa0a58e56c4153e46d5;p=bmusb diff --git a/bmusb.cpp b/bmusb.cpp index c3914ac..00f06e4 100644 --- a/bmusb.cpp +++ b/bmusb.cpp @@ -335,32 +335,39 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const #if __AVX2__ const __m256i needle = _mm256_set1_epi8(sync_char); - const __m256i *in = (const __m256i *)aligned_start; + const __restrict __m256i *in = (const __m256i *)aligned_start; if (current_frame->interleaved) { - __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2); - __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2); + __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2); + __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2); if (current_frame->len % 2 == 1) { swap(out1, out2); } - __m256i mask_lower_byte = _mm256_set1_epi16(0x00ff); + __m256i shuffle_cw = _mm256_set_epi8( + 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, + 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); while (in < (const __m256i *)limit) { - __m256i data1 = _mm256_load_si256(in); - __m256i data2 = _mm256_load_si256(in + 1); - __m256i data1_lo = _mm256_and_si256(data1, mask_lower_byte); - __m256i data2_lo = _mm256_and_si256(data2, mask_lower_byte); - __m256i data1_hi = _mm256_srli_epi16(data1, 8); - __m256i data2_hi = _mm256_srli_epi16(data2, 8); - __m256i lo = _mm256_packus_epi16(data1_lo, data2_lo); - lo = _mm256_permute4x64_epi64(lo, 0b11011000); - _mm256_storeu_si256(out1, lo); // Store as early as possible, even if the data isn't used. - __m256i hi = _mm256_packus_epi16(data1_hi, data2_hi); - hi = _mm256_permute4x64_epi64(hi, 0b11011000); - _mm256_storeu_si256(out2, hi); + // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128). + __m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh + __m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp + __m256i found1 = _mm256_cmpeq_epi8(data1, needle); __m256i found2 = _mm256_cmpeq_epi8(data2, needle); - if (!_mm256_testz_si256(found1, found1) || - !_mm256_testz_si256(found2, found2)) { + __m256i found = _mm256_or_si256(found1, found2); + + data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh + data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop + + data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh + data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop + + __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000); + __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001); + + _mm256_storeu_si256(out1, lo); // Store as early as possible, even if the data isn't used. + _mm256_storeu_si256(out2, hi); + + if (!_mm256_testz_si256(found, found)) { break; } @@ -686,8 +693,9 @@ void BMUSBCapture::start_bm_capture() // 36 can be set to 0 with no apparent effect (all of this tested on both video and audio), // but the driver sets it to 0x8036802a at some point. // - // 0 is written to during firmware upgrade. Probably best to stay out of it unless you know - // what you're doing. + // all of this is on request 214/215. other requests (192, 219, + // 222, 223, 224) are used for firmware upgrade. Probably best to + // stay out of it unless you know what you're doing. // // // register 16: @@ -709,59 +717,6 @@ void BMUSBCapture::start_bm_capture() static const ctrl ctrls[] = { { LIBUSB_ENDPOINT_IN, 214, 16, 0 }, { LIBUSB_ENDPOINT_IN, 214, 0, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 0, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 4, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 16, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 20, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 24, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 28, 0 }, - { LIBUSB_ENDPOINT_IN, 215, 32, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 36, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 40, 0 }, - { LIBUSB_ENDPOINT_IN, 216, 44, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 48, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 52, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 40, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 40, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 40, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 24, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 40, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 40, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 40, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 24, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 40, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 40, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, // packet 354 - { LIBUSB_ENDPOINT_IN, 214, 24, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 12, 0 }, - { LIBUSB_ENDPOINT_IN, 214, 40, 0 }, - // more... - //{ LIBUSB_ENDPOINT_OUT, 215, 0, 0x80000100 }, - //{ LIBUSB_ENDPOINT_OUT, 215, 0, 0x09000000 }, // wow, some kind of mode // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit // capture (v210). @@ -771,46 +726,10 @@ void BMUSBCapture::start_bm_capture() // 0x3c000000 = composite video? (analog audio) // 0x3e000000 = s-video? (analog audio) { LIBUSB_ENDPOINT_OUT, 215, 0, 0x29000000 }, + //{ LIBUSB_ENDPOINT_OUT, 215, 0, 0x80000100 }, //{ LIBUSB_ENDPOINT_OUT, 215, 0, 0x09000000 }, - - //{ LIBUSB_ENDPOINT_OUT, 215, 28, 0xffffffff }, - //{ LIBUSB_ENDPOINT_OUT, 215, 32, 0xffffffff }, - //{ LIBUSB_ENDPOINT_OUT, 215, 28, 0x40404040 }, - //{ LIBUSB_ENDPOINT_OUT, 215, 32, 0x40404040 }, - //{ LIBUSB_ENDPOINT_OUT, 215, 36, 0x8036802a }, { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 }, // latch for frame start? - //{ LIBUSB_ENDPOINT_OUT, 215, 24, 0x13370001 }, // latch for frame start? { LIBUSB_ENDPOINT_IN, 214, 24, 0 }, // - //{ LIBUSB_ENDPOINT_OUT, 215, 4, 0x00000000 }, // appears to have no e fect - //{ LIBUSB_ENDPOINT_OUT, 215, 8, 0x00000000 }, // appears to have no effect - //{ LIBUSB_ENDPOINT_OUT, 215, 20, 0x00000000 }, // appears to have no effect - //{ LIBUSB_ENDPOINT_OUT, 215, 28, 0x00000000 }, // appears to have no effect - //{ LIBUSB_ENDPOINT_OUT, 215, 32, 0x00000000 }, // appears to have no effect - //{ LIBUSB_ENDPOINT_OUT, 215, 36, 0x00000000 }, // appears to have no effect -#if 0 - { LIBUSB_ENDPOINT_OUT, 215, 0 }, - { LIBUSB_ENDPOINT_OUT, 215, 0 }, - { LIBUSB_ENDPOINT_OUT, 215, 28 }, - { LIBUSB_ENDPOINT_OUT, 215, 32 }, - { LIBUSB_ENDPOINT_OUT, 215, 36 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 0 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, - { LIBUSB_ENDPOINT_OUT, 215, 24 }, -#endif }; for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {