git.sesse.net Git - bmusb/blobdiff - bmusb.cpp
Release 0.7.8.
[bmusb] / bmusb.cpp
index 3d090d746d44effbdf25fc2ee75f528f3c4e919f..19a9da1aa2c52ca1ed22c980b3c244db405970e6 100644 (file)
--- a/bmusb.cpp
+++ b/bmusb.cpp
@@ -1,5 +1,5 @@
-// Intensity Shuttle USB3 capture driver, v0.5.4
-// Can download 8-bit and 10-bit UYVY/v210 frames from HDMI, quite stable
+// Intensity Shuttle USB3 capture driver, v0.7.8
+// Can download 8-bit and 10-bit UYVY/v210-ish frames from HDMI, quite stable
 // (can do captures for hours at a time with no drops), except during startup
 // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation)
 // Audio comes out as 8-channel 24-bit raw audio.
@@ -63,14 +63,24 @@ FILE *audiofp;
 thread usb_thread;
 atomic<bool> should_quit;
 
-int find_xfer_size_for_width(int width)
+int v210_stride(int width)
+{
+       return (width + 5) / 6 * 4 * sizeof(uint32_t);  // Round width up to whole groups of six; each group takes four 32-bit words.
+}
+
+int find_xfer_size_for_width(PixelFormat pixel_format, int width)
 {
        // Video seems to require isochronous packets scaled with the width;
        // seemingly six lines is about right, rounded up to the required 1kB
        // multiple.
-       int size = width * 2 * 6;
        // Note that for 10-bit input, you'll need to increase size accordingly.
-       //size = size * 4 / 3;
+       int stride;
+       if (pixel_format == PixelFormat_10BitYCbCr) {
+               stride = v210_stride(width);
+       } else {
+               stride = width * sizeof(uint16_t);
+       }
+       int size = stride * 6;
        if (size % 1024 != 0) {
                size &= ~1023;
                size += 1024;
@@ -78,10 +88,10 @@ int find_xfer_size_for_width(int width)
        return size;
 }
 
-void change_xfer_size_for_width(int width, libusb_transfer *xfr)
+void change_xfer_size_for_width(PixelFormat pixel_format, int width, libusb_transfer *xfr)
 {
        assert(width >= MIN_WIDTH);
-       size_t size = find_xfer_size_for_width(width);
+       size_t size = find_xfer_size_for_width(pixel_format, width);
        int num_iso_pack = xfr->length / size;
        if (num_iso_pack != xfr->num_iso_packets ||
            size != xfr->iso_packet_desc[0].length) {
@@ -120,7 +130,7 @@ bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_forma
                decoded_video_format->has_signal = false;
                return true;
        }
-       if ((video_format & 0xe800) != 0xe800) {
+       if ((video_format & 0xe000) != 0xe000) {
                printf("Video format 0x%04x does not appear to be a video format. Assuming 60 Hz.\n",
                        video_format);
                decoded_video_format->width = 0;
@@ -137,10 +147,16 @@ bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_forma
        decoded_video_format->has_signal = true;
 
        // NTSC (480i59.94, I suppose). A special case, see below.
-       if (video_format == 0xe901 || video_format == 0xe9c1 || video_format == 0xe801) {
+       if ((video_format & ~0x0800) == 0xe101 ||
+           (video_format & ~0x0800) == 0xe1c1 ||
+           (video_format & ~0x0800) == 0xe001) {
                decoded_video_format->width = 720;
                decoded_video_format->height = 480;
-               decoded_video_format->stride = 720 * 2;
+               if (video_format & 0x0800) {
+                       decoded_video_format->stride = 720 * 2;
+               } else {
+                       decoded_video_format->stride = v210_stride(720);
+               }
                decoded_video_format->extra_lines_top = 17;
                decoded_video_format->extra_lines_bottom = 28;
                decoded_video_format->frame_rate_nom = 30000;
@@ -151,10 +167,18 @@ bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_forma
        }
 
        // PAL (576i50, I suppose). A special case, see below.
-       if (video_format == 0xe909 || video_format == 0xe9c9 || video_format == 0xe809 || video_format == 0xebe9 || video_format == 0xebe1) {
+       if ((video_format & ~0x0800) == 0xe109 ||
+           (video_format & ~0x0800) == 0xe1c9 ||
+           (video_format & ~0x0800) == 0xe009 ||
+           (video_format & ~0x0800) == 0xe3e9 ||
+           (video_format & ~0x0800) == 0xe3e1) {
                decoded_video_format->width = 720;
                decoded_video_format->height = 576;
-               decoded_video_format->stride = 720 * 2;
+               if (video_format & 0x0800) {
+                       decoded_video_format->stride = 720 * 2;
+               } else {
+                       decoded_video_format->stride = v210_stride(720);
+               }
                decoded_video_format->extra_lines_top = 22;
                decoded_video_format->extra_lines_bottom = 27;
                decoded_video_format->frame_rate_nom = 25;
@@ -167,15 +191,18 @@ bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_forma
        // 0x8 seems to be a flag about availability of deep color on the input,
        // except when it's not (e.g. it's the only difference between NTSC
        // and PAL). Rather confusing. But we clear it here nevertheless, because
-       // usually it doesn't mean anything.
+       // usually it doesn't mean anything. 0x0800 appears to be 8-bit input
+       // (as opposed to 10-bit).
        //
        // 0x4 is a flag I've only seen from the D4. I don't know what it is.
        uint16_t normalized_video_format = video_format & ~0xe80c;
        constexpr VideoFormatEntry entries[] = {
                { 0x01f1,  720,  480,   0, 40,  5, 60000, 1001, false },  // 480p59.94 (believed).
                { 0x0131,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
+               { 0x0151,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50.
                { 0x0011,  720,  576,   0, 44,  5,    50,    1, false },  // 576p50 (5:4).
                { 0x0143, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
+               { 0x0161, 1280,  720,   0, 25,  5,    50,    1, false },  // 720p50.
                { 0x0103, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
                { 0x0125, 1280,  720,   0, 25,  5,    60,    1, false },  // 720p60.
                { 0x0121, 1280,  720,   0, 25,  5, 60000, 1001, false },  // 720p59.94.
@@ -192,7 +219,11 @@ bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_forma
                if (normalized_video_format == entry.normalized_video_format) {
                        decoded_video_format->width = entry.width;
                        decoded_video_format->height = entry.height;
-                       decoded_video_format->stride = entry.width * 2;
+                       if (video_format & 0x0800) {
+                               decoded_video_format->stride = entry.width * 2;
+                       } else {
+                               decoded_video_format->stride = v210_stride(entry.width);
+                       }
                        decoded_video_format->second_field_start = entry.second_field_start;
                        decoded_video_format->extra_lines_top = entry.extra_lines_top;
                        decoded_video_format->extra_lines_bottom = entry.extra_lines_bottom;
@@ -212,6 +243,26 @@ bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_forma
        return false;
 }
 
+// There are seemingly no direct indicators of sample rate; you just get one frame's
+// worth (len bytes) and have to guess from that. Returns default_rate if nothing matches.
+int guess_sample_rate(const VideoFormat &video_format, size_t len, int default_rate)
+{
+       size_t num_samples = len / 3 / 8;  // 3 bytes (24 bits) per sample, 8 channels.
+       size_t num_samples_per_second = num_samples * video_format.frame_rate_nom / video_format.frame_rate_den;
+
+       // See if we match or are very close to any of the mandatory HDMI sample rates.
+       const int candidate_sample_rates[] = { 32000, 44100, 48000 };
+       for (int rate : candidate_sample_rates) {
+               if (abs(int(num_samples_per_second) - rate) <= 100) {
+                       return rate;
+               }
+       }
+
+       fprintf(stderr, "%zu samples at %d/%d fps (%zu Hz) matches no known sample rate, keeping capture at %d Hz\n",
+               num_samples, video_format.frame_rate_nom, video_format.frame_rate_den, num_samples_per_second, default_rate);
+       return default_rate;
+}
+
 }  // namespace
 
 FrameAllocator::~FrameAllocator() {}
@@ -291,7 +342,9 @@ void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len)
 
 void dump_audio_block(uint8_t *audio_start, size_t audio_len)
 {
-       fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
+       if (audiofp != nullptr) {
+               fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp);
+       }
 }
 
 void BMUSBCapture::dequeue_thread_func()
@@ -303,6 +356,7 @@ void BMUSBCapture::dequeue_thread_func()
        if (has_dequeue_callbacks) {
                dequeue_init_callback();
        }
+       size_t last_sample_rate = 48000;
        while (!dequeue_thread_should_quit) {
                unique_lock<mutex> lock(queue_lock);
                queues_not_empty.wait(lock, [this]{ return dequeue_thread_should_quit || (!pending_video_frames.empty() && !pending_audio_frames.empty()); });
@@ -314,6 +368,7 @@ void BMUSBCapture::dequeue_thread_func()
                AudioFormat audio_format;
                audio_format.bits_per_sample = 24;
                audio_format.num_channels = 8;
+               audio_format.sample_rate = last_sample_rate;
                if (uint16_less_than_with_wraparound(video_timecode, audio_timecode)) {
                        printf("Video block 0x%04x without corresponding audio block, dropping.\n",
                                video_timecode);
@@ -354,10 +409,16 @@ void BMUSBCapture::dequeue_thread_func()
                        VideoFormat video_format;
                        audio_format.id = audio_frame.format;
                        if (decode_video_format(video_frame.format, &video_format)) {
+                               if (audio_frame.frame.len != 0) {
+                                       audio_format.sample_rate = guess_sample_rate(video_format, audio_frame.frame.len, last_sample_rate);
+                                       last_sample_rate = audio_format.sample_rate;
+                               }
                                frame_callback(video_timecode,
                                               video_frame.frame, HEADER_SIZE, video_format,
                                               audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
                        } else {
+                               video_frame_allocator->release_frame(video_frame.frame);
+                               audio_format.sample_rate = last_sample_rate;
                                frame_callback(video_timecode,
                                               FrameAllocator::Frame(), 0, video_format,
                                               audio_frame.frame, AUDIO_HEADER_SIZE, audio_format);
@@ -424,8 +485,8 @@ void BMUSBCapture::start_new_audio_block(const uint8_t *start)
                //dump_audio_block();
                queue_frame(format, timecode, current_audio_frame, &pending_audio_frames);
        }
-       //printf("Found audio block start, format 0x%04x timecode 0x%04x, previous block length was %d\n",
-       //      format, timecode, read_current_audio_block);
+       //printf("Found audio block start, format 0x%04x timecode 0x%04x\n",
+       //      format, timecode);
        current_audio_frame = audio_frame_allocator->alloc_frame();
 }
 
@@ -477,6 +538,9 @@ void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_n
                }
                //dump_frame();
        } else {
+               if (current_frame->data_copy != nullptr) {
+                       memcpy(current_frame->data_copy + current_frame->len, start, bytes);
+               }
                if (current_frame->interleaved) {
                        uint8_t *data = current_frame->data + current_frame->len / 2;
                        uint8_t *data2 = current_frame->data2 + current_frame->len / 2;
@@ -626,6 +690,7 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame,
 {
        const __m256i needle = _mm256_set1_epi8(sync_char);
 
+       size_t bytes_copied;
        const __restrict __m256i *in = (const __m256i *)aligned_start;
        if (current_frame->interleaved) {
                __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2);
@@ -666,9 +731,10 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame,
                        ++out1;
                        ++out2;
                }
-               current_frame->len += (uint8_t *)in - aligned_start;
+               bytes_copied = (uint8_t *)in - aligned_start;
        } else {
-               __m256i *out = (__m256i *)(current_frame->data + current_frame->len);
+               uint8_t *old_end = current_frame->data + current_frame->len;
+               __m256i *out = (__m256i *)old_end;
                while (in < (const __m256i *)limit) {
                        __m256i data = _mm256_load_si256(in);
                        _mm256_storeu_si256(out, data);  // Store as early as possible, even if the data isn't used.
@@ -680,8 +746,14 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame,
                        ++in;
                        ++out;
                }
-               current_frame->len = (uint8_t *)out - current_frame->data;
+               bytes_copied = (uint8_t *)out - old_end;
+       }
+       if (current_frame->data_copy != nullptr) {
+               // TODO: It would be somewhat more cache-efficient to write this in the
+               // same loop as above. However, it might not be worth the extra complexity.
+               memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied);
        }
+       current_frame->len += bytes_copied;
 
        //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
        return (const uint8_t *)in;
@@ -693,6 +765,7 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame,
        const __m128i needle = _mm_set1_epi8(sync_char);
 
        const __m128i *in = (const __m128i *)aligned_start;
+       size_t bytes_copied;
        if (current_frame->interleaved) {
                __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2);
                __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2);
@@ -723,9 +796,10 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame,
                        ++out1;
                        ++out2;
                }
-               current_frame->len += (uint8_t *)in - aligned_start;
+               bytes_copied = (uint8_t *)in - aligned_start;
        } else {
-               __m128i *out = (__m128i *)(current_frame->data + current_frame->len);
+               uint8_t *old_end = current_frame->data + current_frame->len;
+               __m128i *out = (__m128i *)old_end;
                while (in < (const __m128i *)limit) {
                        __m128i data = _mm_load_si128(in);
                        _mm_storeu_si128(out, data);  // Store as early as possible, even if the data isn't used.
@@ -737,8 +811,14 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame,
                        ++in;
                        ++out;
                }
-               current_frame->len = (uint8_t *)out - current_frame->data;
+               bytes_copied = (uint8_t *)out - old_end;
        }
+       if (current_frame->data_copy != nullptr) {
+               // TODO: It would be somewhat more cache-efficient to write this in the
+               // same loop as above. However, it might not be worth the extra complexity.
+               memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied);
+       }
+       current_frame->len += bytes_copied;
 
        //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes);
        return (const uint8_t *)in;
@@ -819,7 +899,7 @@ void BMUSBCapture::cb_xfr(struct libusb_transfer *xfr)
                        decode_packs(xfr, "\x00\x00\xff\xff", 4, &usb->current_video_frame, "video", bind(&BMUSBCapture::start_new_frame, usb, _1));
 
                        // Update the transfer with the new assumed width, if we're in the process of changing formats.
-                       change_xfer_size_for_width(usb->assumed_frame_width, xfr);
+                       change_xfer_size_for_width(usb->current_pixel_format, usb->assumed_frame_width, xfr);
                }
        }
        if (xfr->type == LIBUSB_TRANSFER_TYPE_CONTROL) {
@@ -1049,6 +1129,12 @@ unsigned BMUSBCapture::num_cards()
        return ret;
 }
 
+void BMUSBCapture::set_pixel_format(PixelFormat pixel_format)
+{
+       current_pixel_format = pixel_format;  // Remembered so later transfer sizing and mode updates use it.
+       update_capture_mode();  // Sends the new mode to the card; returns early if devh is still nullptr.
+}
+
 void BMUSBCapture::configure_card()
 {
        if (video_frame_allocator == nullptr) {
@@ -1307,16 +1393,15 @@ void BMUSBCapture::configure_card()
 
        // set up isochronous transfers for audio and video
        for (int e = 3; e <= 4; ++e) {
-               //int num_transfers = (e == 3) ? 6 : 6;
                int num_transfers = 6;
                for (int i = 0; i < num_transfers; ++i) {
                        size_t buf_size;
                        int num_iso_pack, size;
                        if (e == 3) {
                                // Allocate for minimum width (because that will give us the most
-                               // number of packets, so we don't need to reallocated, but we'll
+                               // number of packets, so we don't need to reallocate), but we'll
                                // default to 720p for the first frame.
-                               size = find_xfer_size_for_width(MIN_WIDTH);
+                               size = find_xfer_size_for_width(PixelFormat_8BitYCbCr, MIN_WIDTH);
                                num_iso_pack = USB_VIDEO_TRANSFER_SIZE / size;
                                buf_size = USB_VIDEO_TRANSFER_SIZE;
                        } else {
@@ -1355,7 +1440,7 @@ void BMUSBCapture::configure_card()
                        xfr->user_data = this;
 
                        if (e == 3) {
-                               change_xfer_size_for_width(assumed_frame_width, xfr);
+                               change_xfer_size_for_width(current_pixel_format, assumed_frame_width, xfr);
                        }
 
                        iso_xfrs.push_back(xfr);
@@ -1416,6 +1501,7 @@ void BMUSBCapture::start_bm_thread()
 void BMUSBCapture::stop_bm_thread()
 {
        should_quit = true;
+       libusb_interrupt_event_handler(nullptr);  // Interrupts event handling on the default context; presumably so the USB thread rechecks should_quit before the join (its loop is not visible here).
        usb_thread.join();
 }
 
@@ -1472,9 +1558,17 @@ void BMUSBCapture::set_audio_input(uint32_t audio_input_id)
 
 void BMUSBCapture::update_capture_mode()
 {
-       // clearing the 0x20000000 bit seems to activate 10-bit capture (v210).
-       // clearing the 0x08000000 bit seems to change the capture format (other source?)
-       uint32_t mode = htonl(0x29000000 | current_video_input | current_audio_input);
+       if (devh == nullptr) {
+               return;
+       }
+
+       // Clearing the 0x08000000 bit seems to change the capture format (other source?).
+       uint32_t mode = htonl(0x09000000 | current_video_input | current_audio_input);
+       if (current_pixel_format == PixelFormat_8BitYCbCr) {
+               mode |= htonl(0x20000000);
+       } else {
+               assert(current_pixel_format == PixelFormat_10BitYCbCr);
+       }
 
        int rc = libusb_control_transfer(devh, LIBUSB_REQUEST_TYPE_VENDOR | LIBUSB_ENDPOINT_OUT,
                /*request=*/215, /*value=*/0, /*index=*/0, (unsigned char *)&mode, sizeof(mode), /*timeout=*/0);