From 63a912898083c06cc75f336f8d9a367a707e378a Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sat, 10 Dec 2016 15:58:35 +0100 Subject: [PATCH] Add a switch to print video latency. This ignores mux latency and audio latency, but it's a very good start for ongoing low-latency debugging. Requires bmusb 0.5.3. --- Makefile | 2 +- README | 5 ++--- bmusb | 2 +- decklink_capture.cpp | 6 ++++++ flags.cpp | 8 +++++++- flags.h | 1 + mixer.cpp | 1 + mixer.h | 1 + print_latency.cpp | 30 ++++++++++++++++++++++++++++++ print_latency.h | 21 +++++++++++++++++++++ quicksync_encoder.cpp | 40 ++++++++++++++++++++++++++++++++-------- x264_encoder.cpp | 20 +++++++++++++++++++- x264_encoder.h | 10 +++++++++- 13 files changed, 131 insertions(+), 16 deletions(-) create mode 100644 print_latency.cpp create mode 100644 print_latency.h diff --git a/Makefile b/Makefile index 6b79254..0380d0d 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ OBJS += midi_mapper.o midi_mapping.pb.o # Mixer objects AUDIO_MIXER_OBJS = audio_mixer.o alsa_input.o alsa_pool.o ebu_r128_proc.o stereocompressor.o resampling_queue.o flags.o correlation_measurer.o filter.o input_mapping.o state.pb.o -OBJS += mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o $(AUDIO_MIXER_OBJS) +OBJS += mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o print_latency.o $(AUDIO_MIXER_OBJS) # Streaming and encoding objects OBJS += quicksync_encoder.o x264_encoder.o x264_speed_control.o video_encoder.o metacube2.o mux.o audio_encoder.o ffmpeg_raii.o diff --git a/README b/README index 664ba9d..c1862af 100644 --- a/README +++ b/README @@ -76,10 +76,9 @@ with: libmovit-dev libegl1-mesa-dev libasound2-dev libx264-dev libbmusb-dev \ protobuf-compiler libprotobuf-dev libpci-dev -Exceptions as of November 2016: +Exceptions as of December 2016: - - libusb 1.0.21 is not yet in stretch; you need to fetch it from sid. - Same with a bmusb version built against it. + - bmusb 0.5.3 is not yet in sid. The patches/ directory contains a patch that helps zita-resampler performance. diff --git a/bmusb b/bmusb index a765e06..227166d 160000 --- a/bmusb +++ b/bmusb @@ -1 +1 @@ -Subproject commit a765e066b74ac52ff0abf239d430d6f8d83f792e +Subproject commit 227166d4398263e1191bdfe244f62bc38aaf4c1f diff --git a/decklink_capture.cpp b/decklink_capture.cpp index 505083f..5e41da4 100644 --- a/decklink_capture.cpp +++ b/decklink_capture.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,7 @@ #define FRAME_SIZE (8 << 20) // 8 MB. using namespace std; +using namespace std::chrono; using namespace std::placeholders; using namespace bmusb; @@ -353,6 +355,8 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived( video_format.width = width; video_format.height = height; + + current_video_frame.received_timestamp = steady_clock::now(); } } @@ -369,6 +373,8 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived( audio_format.bits_per_sample = 32; audio_format.num_channels = 2; + + current_audio_frame.received_timestamp = steady_clock::now(); } } diff --git a/flags.cpp b/flags.cpp index 279ea42..239a688 100644 --- a/flags.cpp +++ b/flags.cpp @@ -44,7 +44,8 @@ enum LongOption { OPTION_DISABLE_MAKEUP_GAIN_AUTO, OPTION_ENABLE_MAKEUP_GAIN_AUTO, OPTION_DISABLE_ALSA_OUTPUT, - OPTION_NO_FLUSH_PBOS + OPTION_NO_FLUSH_PBOS, + OPTION_PRINT_VIDEO_LATENCY }; void usage() @@ -98,6 +99,7 @@ void usage() fprintf(stderr, " --no-flush-pbos do not explicitly signal texture data uploads\n"); fprintf(stderr, " (will give display corruption, but makes it\n"); fprintf(stderr, " possible to run with apitrace in real time)\n"); + fprintf(stderr, " --print-video-latency print out measurements of video latency on stdout\n"); } void parse_flags(int argc, char * const argv[]) @@ -143,6 +145,7 @@ void parse_flags(int argc, char * const argv[]) { "enable-makeup-gain-auto", no_argument, 0, OPTION_ENABLE_MAKEUP_GAIN_AUTO }, { "disable-alsa-output", no_argument, 0, OPTION_DISABLE_ALSA_OUTPUT }, { "no-flush-pbos", no_argument, 0, OPTION_NO_FLUSH_PBOS }, + { "print-video-latency", no_argument, 0, OPTION_PRINT_VIDEO_LATENCY }, { 0, 0, 0, 0 } }; vector theme_dirs; @@ -292,6 +295,9 @@ void parse_flags(int argc, char * const argv[]) case OPTION_NO_FLUSH_PBOS: global_flags.flush_pbos = false; break; + case OPTION_PRINT_VIDEO_LATENCY: + global_flags.print_video_latency = true; + break; case OPTION_HELP: usage(); exit(0); diff --git a/flags.h b/flags.h index 5abc953..7149581 100644 --- a/flags.h +++ b/flags.h @@ -40,6 +40,7 @@ struct Flags { bool multichannel_mapping_mode = false; // Implicitly true if input_mapping_filename is nonempty. std::string input_mapping_filename; // Empty for none. std::string midi_mapping_filename; // Empty for none. + bool print_video_latency = false; }; extern Flags global_flags; diff --git a/mixer.cpp b/mixer.cpp index ff9c77f..0a7ecc2 100644 --- a/mixer.cpp +++ b/mixer.cpp @@ -511,6 +511,7 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode, new_frame.interlaced = video_format.interlaced; new_frame.upload_func = upload_func; new_frame.dropped_frames = dropped_frames; + new_frame.received_timestamp = video_frame.received_timestamp; // Ignore the audio timestamp. card->new_frames.push(move(new_frame)); card->new_frames_changed.notify_all(); } diff --git a/mixer.h b/mixer.h index f5aa4fb..551e217 100644 --- a/mixer.h +++ b/mixer.h @@ -336,6 +336,7 @@ private: unsigned field; // Which field (0 or 1) of the frame to use. Always 0 for progressive. std::function upload_func; // Needs to be called to actually upload the texture to OpenGL. unsigned dropped_frames = 0; // Number of dropped frames before this one. + std::chrono::steady_clock::time_point received_timestamp = std::chrono::steady_clock::time_point::min(); }; std::queue new_frames; bool should_quit = false; diff --git a/print_latency.cpp b/print_latency.cpp new file mode 100644 index 0000000..0b0a1ae --- /dev/null +++ b/print_latency.cpp @@ -0,0 +1,30 @@ +#include "print_latency.h" +#include "flags.h" + +#include +#include +#include + +using namespace std; +using namespace std::chrono; + +void print_latency(const string &header, const ReceivedTimestamps &received_ts, bool is_b_frame, int *frameno) +{ + if (received_ts.max_ts == steady_clock::time_point::min()) + return; + + // 101 is chosen so that it's prime, which is unlikely to get the same frame type every time. + if (global_flags.print_video_latency && (++*frameno % 101) == 0) { + const steady_clock::time_point now = steady_clock::now(); + printf("%-60s %4.0f ms (lowest-latency input), %4.0f ms (highest-latency input)", + header.c_str(), + 1e3 * std::chrono::duration(now - received_ts.max_ts).count(), + 1e3 * std::chrono::duration(now - received_ts.min_ts).count()); + + if (is_b_frame) { + printf(" [on B-frame; potential extra latency]\n"); + } else { + printf("\n"); + } + } +} diff --git a/print_latency.h b/print_latency.h new file mode 100644 index 0000000..9d8a93b --- /dev/null +++ b/print_latency.h @@ -0,0 +1,21 @@ +#ifndef _PRINT_LATENCY_H +#define _PRINT_LATENCY_H 1 + +// A small utility function to print the latency between two end points +// (typically when the frame was received from the video card, and some +// point when the frame is ready to be output in some form). + +#include +#include + +// Since every output frame is based on multiple input frames, we need +// more than one start timestamp. For now, we keep just the smallest +// and largest timestamp, so that we can print out a range. +// For both of these, steady_clock::time_point::min() is used for “not set”. +struct ReceivedTimestamps { + std::chrono::steady_clock::time_point min_ts, max_ts; +}; + +void print_latency(const std::string &header, const ReceivedTimestamps &received_ts, bool is_b_frame, int *frameno); + +#endif // !defined(_PRINT_LATENCY_H) diff --git a/quicksync_encoder.cpp b/quicksync_encoder.cpp index 4e97422..2ab1f10 100644 --- a/quicksync_encoder.cpp +++ b/quicksync_encoder.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -47,11 +48,13 @@ extern "C" { #include "ffmpeg_raii.h" #include "flags.h" #include "mux.h" +#include "print_latency.h" #include "ref_counted_frame.h" #include "timebase.h" #include "x264_encoder.h" using namespace std; +using namespace std::chrono; using namespace std::placeholders; class QOpenGLContext; @@ -142,6 +145,7 @@ public: struct Frame { int64_t pts, duration; uint8_t *data; + ReceivedTimestamps received_ts; // Invert to get the smallest pts first. bool operator< (const Frame &other) const { return pts > other.pts; } @@ -151,7 +155,7 @@ public: // Does _not_ take ownership of data; a copy is taken if needed. // The returned pointer is valid until the next call to reorder_frame, or destruction. // As a special case, if queue_length == 0, will just return pts and data (no reordering needed). - Frame reorder_frame(int64_t pts, int64_t duration, uint8_t *data); + Frame reorder_frame(int64_t pts, int64_t duration, uint8_t *data, const ReceivedTimestamps &received_ts); // The same as reorder_frame, but without inserting anything. Used to empty the queue. Frame get_first_frame(); @@ -179,22 +183,22 @@ FrameReorderer::FrameReorderer(unsigned queue_length, int width, int height) } } -FrameReorderer::Frame FrameReorderer::reorder_frame(int64_t pts, int64_t duration, uint8_t *data) +FrameReorderer::Frame FrameReorderer::reorder_frame(int64_t pts, int64_t duration, uint8_t *data, const ReceivedTimestamps &received_ts) { if (queue_length == 0) { - return Frame{pts, duration, data}; + return Frame{pts, duration, data, received_ts}; } assert(!freelist.empty()); uint8_t *storage = freelist.top(); freelist.pop(); memcpy(storage, data, width * height * 2); - frames.push(Frame{pts, duration, storage}); + frames.push(Frame{pts, duration, storage, received_ts}); if (frames.size() >= queue_length) { return get_first_frame(); } else { - return Frame{-1, -1, nullptr}; + return Frame{-1, -1, nullptr, steady_clock::time_point::min(), steady_clock::time_point::min()}; } } @@ -232,6 +236,7 @@ private: int frame_type; vector audio; int64_t pts, dts, duration; + ReceivedTimestamps received_ts; }; struct PendingFrame { RefCountedGLsync fence; @@ -1630,6 +1635,10 @@ void QuickSyncEncoderImpl::save_codeddata(storage_task task) } vaUnmapBuffer(va_dpy, gl_surfaces[task.display_order % SURFACE_NUM].coded_buf); + static int frameno = 0; + print_latency("Current QuickSync latency (video inputs → disk mux):", + task.received_ts, (task.frame_type == FRAME_B), &frameno); + { // Add video. AVPacket pkt; @@ -2034,7 +2043,7 @@ void QuickSyncEncoderImpl::encode_remaining_frames_as_p(int encoding_frame_num, add_packet_for_uncompressed_frame(output_frame.pts, output_frame.duration, output_frame.data); } else { assert(global_flags.x264_video_to_http); - x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data); + x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data, output_frame.received_ts); } } } @@ -2081,6 +2090,20 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame } while (sync_status == GL_TIMEOUT_EXPIRED); assert(sync_status != GL_WAIT_FAILED); + // Find min and max timestamp of all input frames that have a timestamp. + steady_clock::time_point min_ts = steady_clock::time_point::max(), max_ts = steady_clock::time_point::min(); + for (const RefCountedFrame &input_frame : frame.input_frames) { + if (input_frame && input_frame->received_timestamp > steady_clock::time_point::min()) { + min_ts = min(min_ts, input_frame->received_timestamp); + max_ts = max(max_ts, input_frame->received_timestamp); + } + } + const ReceivedTimestamps received_ts{ min_ts, max_ts }; + + static int frameno = 0; + print_latency("Current mixer latency (video inputs → ready for encode):", + received_ts, (frame_type == FRAME_B), &frameno); + // Release back any input frames we needed to render this frame. frame.input_frames.clear(); @@ -2109,13 +2132,13 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame global_flags.x264_video_to_http) { // Add uncompressed video. (Note that pts == dts here.) // Delay needs to match audio. - FrameReorderer::Frame output_frame = reorderer->reorder_frame(pts + global_delay(), duration, reinterpret_cast(surf->y_ptr)); + FrameReorderer::Frame output_frame = reorderer->reorder_frame(pts + global_delay(), duration, reinterpret_cast(surf->y_ptr), received_ts); if (output_frame.data != nullptr) { if (global_flags.uncompressed_video_to_http) { add_packet_for_uncompressed_frame(output_frame.pts, output_frame.duration, output_frame.data); } else { assert(global_flags.x264_video_to_http); - x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data); + x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data, output_frame.received_ts); } } } @@ -2156,6 +2179,7 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame tmp.pts = pts; tmp.dts = dts; tmp.duration = duration; + tmp.received_ts = received_ts; storage_task_enqueue(move(tmp)); update_ReferenceFrames(frame_type); diff --git a/x264_encoder.cpp b/x264_encoder.cpp index 95d3625..fcd20e7 100644 --- a/x264_encoder.cpp +++ b/x264_encoder.cpp @@ -1,5 +1,6 @@ #include "x264_encoder.h" +#include #include #include #include @@ -10,6 +11,7 @@ #include "defs.h" #include "flags.h" #include "mux.h" +#include "print_latency.h" #include "timebase.h" #include "x264_speed_control.h" @@ -19,6 +21,7 @@ extern "C" { } using namespace std; +using namespace std::chrono; namespace { @@ -55,11 +58,12 @@ X264Encoder::~X264Encoder() encoder_thread.join(); } -void X264Encoder::add_frame(int64_t pts, int64_t duration, const uint8_t *data) +void X264Encoder::add_frame(int64_t pts, int64_t duration, const uint8_t *data, const ReceivedTimestamps &received_ts) { QueuedFrame qf; qf.pts = pts; qf.duration = duration; + qf.received_ts = received_ts; { lock_guard lock(mu); @@ -251,6 +255,8 @@ void X264Encoder::encode_frame(X264Encoder::QueuedFrame qf) pic.opaque = reinterpret_cast(intptr_t(qf.duration)); input_pic = &pic; + + frames_being_encoded[qf.pts] = qf.received_ts; } // See if we have a new bitrate to change to. @@ -280,6 +286,18 @@ void X264Encoder::encode_frame(X264Encoder::QueuedFrame qf) if (num_nal == 0) return; + if (frames_being_encoded.count(pic.i_pts)) { + ReceivedTimestamps received_ts = frames_being_encoded[pic.i_pts]; + frames_being_encoded.erase(pic.i_pts); + + static int frameno = 0; + print_latency("Current x264 latency (video inputs → network mux):", + received_ts, (pic.i_type == X264_TYPE_B || pic.i_type == X264_TYPE_BREF), + &frameno); + } else { + assert(false); + } + // We really need one AVPacket for the entire frame, it seems, // so combine it all. size_t num_bytes = buffered_sei.size(); diff --git a/x264_encoder.h b/x264_encoder.h index f607a91..8adb42a 100644 --- a/x264_encoder.h +++ b/x264_encoder.h @@ -20,17 +20,21 @@ #include #include #include +#include #include #include #include #include #include #include +#include extern "C" { #include } +#include "print_latency.h" + class Mux; class X264SpeedControl; @@ -47,7 +51,7 @@ public: // is taken to be raw NV12 data of WIDTHxHEIGHT resolution. // Does not block. - void add_frame(int64_t pts, int64_t duration, const uint8_t *data); + void add_frame(int64_t pts, int64_t duration, const uint8_t *data, const ReceivedTimestamps &received_ts); std::string get_global_headers() const { while (!x264_init_done) { @@ -64,6 +68,7 @@ private: struct QueuedFrame { int64_t pts, duration; uint8_t *data; + ReceivedTimestamps received_ts; }; void encoder_thread_func(); void init_x264(); @@ -101,6 +106,9 @@ private: // Whenever the state of changes. std::condition_variable queued_frames_nonempty; + + // Key is the pts of the frame. + std::unordered_map frames_being_encoded; }; #endif // !defined(_X264ENCODE_H) -- 2.39.2