X-Git-Url: https://git.sesse.net/?p=bmusb;a=blobdiff_plain;f=bmusb.cpp;h=0169d062dad8d1823ed441e29cc3938eee2c2e09;hp=2e0f5c3e4c83264c25f780bf22be984891269f5d;hb=HEAD;hpb=32043c95d3b9f8cb97d6d28b9996fa1bec2ce11b diff --git a/bmusb.cpp b/bmusb.cpp index 2e0f5c3..19a9da1 100644 --- a/bmusb.cpp +++ b/bmusb.cpp @@ -1,4 +1,4 @@ -// Intensity Shuttle USB3 capture driver, v0.7.0 +// Intensity Shuttle USB3 capture driver, v0.7.8 // Can download 8-bit and 10-bit UYVY/v210-ish frames from HDMI, quite stable // (can do captures for hours at a time with no drops), except during startup // 576p60/720p60/1080i60 works, 1080p60 does not work (firmware limitation) @@ -202,6 +202,7 @@ bool decode_video_format(uint16_t video_format, VideoFormat *decoded_video_forma { 0x0151, 720, 576, 0, 44, 5, 50, 1, false }, // 576p50. { 0x0011, 720, 576, 0, 44, 5, 50, 1, false }, // 576p50 (5:4). { 0x0143, 1280, 720, 0, 25, 5, 50, 1, false }, // 720p50. + { 0x0161, 1280, 720, 0, 25, 5, 50, 1, false }, // 720p50. { 0x0103, 1280, 720, 0, 25, 5, 60, 1, false }, // 720p60. { 0x0125, 1280, 720, 0, 25, 5, 60, 1, false }, // 720p60. { 0x0121, 1280, 720, 0, 25, 5, 60000, 1001, false }, // 720p59.94. @@ -252,7 +253,7 @@ int guess_sample_rate(const VideoFormat &video_format, size_t len, int default_r // See if we match or are very close to any of the mandatory HDMI sample rates. const int candidate_sample_rates[] = { 32000, 44100, 48000 }; for (int rate : candidate_sample_rates) { - if (abs(int(num_samples_per_second) - rate) < 50) { + if (abs(int(num_samples_per_second) - rate) <= 100) { return rate; } } @@ -341,7 +342,9 @@ void dump_frame(const char *filename, uint8_t *frame_start, size_t frame_len) void dump_audio_block(uint8_t *audio_start, size_t audio_len) { - fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp); + if (audiofp != nullptr) { + fwrite(audio_start + AUDIO_HEADER_SIZE, 1, audio_len - AUDIO_HEADER_SIZE, audiofp); + } } void BMUSBCapture::dequeue_thread_func() @@ -535,6 +538,9 @@ void add_to_frame(FrameAllocator::Frame *current_frame, const char *frame_type_n } //dump_frame(); } else { + if (current_frame->data_copy != nullptr) { + memcpy(current_frame->data_copy + current_frame->len, start, bytes); + } if (current_frame->interleaved) { uint8_t *data = current_frame->data + current_frame->len / 2; uint8_t *data2 = current_frame->data2 + current_frame->len / 2; @@ -684,6 +690,7 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, { const __m256i needle = _mm256_set1_epi8(sync_char); + size_t bytes_copied; const __restrict __m256i *in = (const __m256i *)aligned_start; if (current_frame->interleaved) { __restrict __m256i *out1 = (__m256i *)(current_frame->data + (current_frame->len + 1) / 2); @@ -724,9 +731,10 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, ++out1; ++out2; } - current_frame->len += (uint8_t *)in - aligned_start; + bytes_copied = (uint8_t *)in - aligned_start; } else { - __m256i *out = (__m256i *)(current_frame->data + current_frame->len); + uint8_t *old_end = current_frame->data + current_frame->len; + __m256i *out = (__m256i *)old_end; while (in < (const __m256i *)limit) { __m256i data = _mm256_load_si256(in); _mm256_storeu_si256(out, data); // Store as early as possible, even if the data isn't used. @@ -738,8 +746,14 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, ++in; ++out; } - current_frame->len = (uint8_t *)out - current_frame->data; + bytes_copied = (uint8_t *)out - old_end; } + if (current_frame->data_copy != nullptr) { + // TODO: It would be somewhat more cache-efficient to write this in the + // same loop as above. However, it might not be worth the extra complexity. + memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied); + } + current_frame->len += bytes_copied; //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes); return (const uint8_t *)in; @@ -751,6 +765,7 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, const __m128i needle = _mm_set1_epi8(sync_char); const __m128i *in = (const __m128i *)aligned_start; + size_t bytes_copied; if (current_frame->interleaved) { __m128i *out1 = (__m128i *)(current_frame->data + (current_frame->len + 1) / 2); __m128i *out2 = (__m128i *)(current_frame->data2 + current_frame->len / 2); @@ -781,9 +796,10 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, ++out1; ++out2; } - current_frame->len += (uint8_t *)in - aligned_start; + bytes_copied = (uint8_t *)in - aligned_start; } else { - __m128i *out = (__m128i *)(current_frame->data + current_frame->len); + uint8_t *old_end = current_frame->data + current_frame->len; + __m128i *out = (__m128i *)old_end; while (in < (const __m128i *)limit) { __m128i data = _mm_load_si128(in); _mm_storeu_si128(out, data); // Store as early as possible, even if the data isn't used. @@ -795,8 +811,14 @@ const uint8_t *add_to_frame_fastpath_core(FrameAllocator::Frame *current_frame, ++in; ++out; } - current_frame->len = (uint8_t *)out - current_frame->data; + bytes_copied = (uint8_t *)out - old_end; + } + if (current_frame->data_copy != nullptr) { + // TODO: It would be somewhat more cache-efficient to write this in the + // same loop as above. However, it might not be worth the extra complexity. + memcpy(current_frame->data_copy + current_frame->len, aligned_start, bytes_copied); } + current_frame->len += bytes_copied; //printf("managed to fastpath %ld/%ld bytes\n", (const uint8_t *)in - (const uint8_t *)aligned_start, orig_bytes); return (const uint8_t *)in; @@ -1479,6 +1501,7 @@ void BMUSBCapture::start_bm_thread() void BMUSBCapture::stop_bm_thread() { should_quit = true; + libusb_interrupt_event_handler(nullptr); usb_thread.join(); }