]> git.sesse.net Git - bmusb/blobdiff - bmusb.cpp
Remove a lot of control transfers (they were from the day when I hardly understood...
[bmusb] / bmusb.cpp
index c3914ac83ca29482a3e2ec503c8266daa3337a95..00f06e443f387f1b0218ae8284a53b00a7b173da 100644 (file)
--- a/bmusb.cpp
+++ b/bmusb.cpp
@@ -335,32 +335,39 @@ const uint8_t *add_to_frame_fastpath(FrameAllocator::Frame *current_frame, const
 #if __AVX2__
        const __m256i needle = _mm256_set1_epi8(sync_char);
 
-       const __m256i *in = (const __m256i *)aligned_start;
+       const __restrict __m256i *in = (const __m256i *)aligned_start;
        if (current_frame->interleaved) {
-               __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
-               __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
+               __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
+               __restrict __m256i *out2 = (__m256i *)(current_frame->data2 + current_frame->len / 2);
                if (current_frame->len % 2 == 1) {
                        swap(out1, out2);
                }
 
-               __m256i mask_lower_byte = _mm256_set1_epi16(0x00ff);
+               __m256i shuffle_cw = _mm256_set_epi8(
+                       15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
+                       15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
                while (in < (const __m256i *)limit) {
-                       __m256i data1 = _mm256_load_si256(in);
-                       __m256i data2 = _mm256_load_si256(in + 1);
-                       __m256i data1_lo = _mm256_and_si256(data1, mask_lower_byte);
-                       __m256i data2_lo = _mm256_and_si256(data2, mask_lower_byte);
-                       __m256i data1_hi = _mm256_srli_epi16(data1, 8);
-                       __m256i data2_hi = _mm256_srli_epi16(data2, 8);
-                       __m256i lo = _mm256_packus_epi16(data1_lo, data2_lo);
-                       lo = _mm256_permute4x64_epi64(lo, 0b11011000);
-                       _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
-                       __m256i hi = _mm256_packus_epi16(data1_hi, data2_hi);
-                       hi = _mm256_permute4x64_epi64(hi, 0b11011000);
-                       _mm256_storeu_si256(out2, hi);
+                       // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
+                       __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
+                       __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
+
                        __m256i found1 = _mm256_cmpeq_epi8(data1, needle);
                        __m256i found2 = _mm256_cmpeq_epi8(data2, needle);
-                       if (!_mm256_testz_si256(found1, found1) ||
-                           !_mm256_testz_si256(found2, found2)) {
+                       __m256i found = _mm256_or_si256(found1, found2);
+
+                       data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
+                       data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
+               
+                       data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
+                       data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
+
+                       __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
+                       __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
+
+                       _mm256_storeu_si256(out1, lo);  // Store as early as possible, even if the data isn't used.
+                       _mm256_storeu_si256(out2, hi);
+
+                       if (!_mm256_testz_si256(found, found)) {
                                break;
                        }
 
@@ -686,8 +693,9 @@ void BMUSBCapture::start_bm_capture()
        //    36 can be set to 0 with no apparent effect (all of this tested on both video and audio),
        //    but the driver sets it to 0x8036802a at some point.
        //
-       //    0 is written to during firmware upgrade. Probably best to stay out of it unless you know
-       //    what you're doing.
+       //    all of this is on request 214/215. other requests (192, 219,
+       //    222, 223, 224) are used for firmware upgrade. Probably best to
+       //    stay out of it unless you know what you're doing.
        //
        //
        // register 16:
@@ -709,59 +717,6 @@ void BMUSBCapture::start_bm_capture()
        static const ctrl ctrls[] = {
                { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
                { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
-               { LIBUSB_ENDPOINT_IN,  214,  0, 0 },
-               { LIBUSB_ENDPOINT_IN,  214,  4, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 16, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 20, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 24, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 28, 0 },
-               { LIBUSB_ENDPOINT_IN,  215, 32, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 36, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  216, 44, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 48, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 52, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 24, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 24, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },  // packet 354
-               { LIBUSB_ENDPOINT_IN,  214, 24, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 12, 0 },
-               { LIBUSB_ENDPOINT_IN,  214, 40, 0 },
-               // more...
-               //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
-               //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },  // wow, some kind of mode
 
                // seems to capture on HDMI, clearing the 0x20000000 bit seems to activate 10-bit
                // capture (v210).
@@ -771,46 +726,10 @@ void BMUSBCapture::start_bm_capture()
                // 0x3c000000 = composite video? (analog audio)
                // 0x3e000000 = s-video? (analog audio)
                { LIBUSB_ENDPOINT_OUT, 215,  0, 0x29000000 },
+               //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x80000100 },
                //{ LIBUSB_ENDPOINT_OUT, 215,  0, 0x09000000 },
-
-               //{ LIBUSB_ENDPOINT_OUT, 215, 28, 0xffffffff },
-               //{ LIBUSB_ENDPOINT_OUT, 215, 32, 0xffffffff },
-               //{ LIBUSB_ENDPOINT_OUT, 215, 28, 0x40404040 },
-               //{ LIBUSB_ENDPOINT_OUT, 215, 32, 0x40404040 },
-               //{ LIBUSB_ENDPOINT_OUT, 215, 36, 0x8036802a },
                { LIBUSB_ENDPOINT_OUT, 215, 24, 0x73c60001 },  // latch for frame start?
-               //{ LIBUSB_ENDPOINT_OUT, 215, 24, 0x13370001 },  // latch for frame start?
                { LIBUSB_ENDPOINT_IN,  214, 24, 0 },  // 
-               //{ LIBUSB_ENDPOINT_OUT, 215,  4, 0x00000000 },  // appears to have no e fect
-               //{ LIBUSB_ENDPOINT_OUT, 215,  8, 0x00000000 },  // appears to have no effect
-               //{ LIBUSB_ENDPOINT_OUT, 215, 20, 0x00000000 },  // appears to have no effect
-               //{ LIBUSB_ENDPOINT_OUT, 215, 28, 0x00000000 },  // appears to have no effect
-               //{ LIBUSB_ENDPOINT_OUT, 215, 32, 0x00000000 },  // appears to have no effect
-               //{ LIBUSB_ENDPOINT_OUT, 215, 36, 0x00000000 },  // appears to have no effect
-#if 0
-               { LIBUSB_ENDPOINT_OUT, 215,  0 },
-               { LIBUSB_ENDPOINT_OUT, 215,  0 },
-               { LIBUSB_ENDPOINT_OUT, 215, 28 },
-               { LIBUSB_ENDPOINT_OUT, 215, 32 },
-               { LIBUSB_ENDPOINT_OUT, 215, 36 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215,  0 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-               { LIBUSB_ENDPOINT_OUT, 215, 24 },
-#endif
        };
 
        for (unsigned req = 0; req < sizeof(ctrls) / sizeof(ctrls[0]); ++req) {