From 63a912898083c06cc75f336f8d9a367a707e378a Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Sat, 10 Dec 2016 15:58:35 +0100
Subject: [PATCH] Add a switch to print video latency.

This ignores mux latency and audio latency, but it's a very good start
for ongoing low-latency debugging.

Requires bmusb 0.5.3.
---
 Makefile              |  2 +-
 README                |  5 ++---
 bmusb                 |  2 +-
 decklink_capture.cpp  |  6 ++++++
 flags.cpp             |  8 +++++++-
 flags.h               |  1 +
 mixer.cpp             |  1 +
 mixer.h               |  1 +
 print_latency.cpp     | 30 ++++++++++++++++++++++++++++++
 print_latency.h       | 21 +++++++++++++++++++++
 quicksync_encoder.cpp | 40 ++++++++++++++++++++++++++++++++--------
 x264_encoder.cpp      | 20 +++++++++++++++++++-
 x264_encoder.h        | 10 +++++++++-
 13 files changed, 131 insertions(+), 16 deletions(-)
 create mode 100644 print_latency.cpp
 create mode 100644 print_latency.h
diff --git a/Makefile b/Makefile
index 6b79254..0380d0d 100644
--- a/Makefile
+++ b/Makefile
@@ -22,7 +22,7 @@ OBJS += midi_mapper.o midi_mapping.pb.o
 
 # Mixer objects
 AUDIO_MIXER_OBJS = audio_mixer.o alsa_input.o alsa_pool.o ebu_r128_proc.o stereocompressor.o resampling_queue.o flags.o correlation_measurer.o filter.o input_mapping.o state.pb.o
-OBJS += mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o $(AUDIO_MIXER_OBJS)
+OBJS += mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o print_latency.o $(AUDIO_MIXER_OBJS)
 
 # Streaming and encoding objects
 OBJS += quicksync_encoder.o x264_encoder.o x264_speed_control.o video_encoder.o metacube2.o mux.o audio_encoder.o ffmpeg_raii.o
diff --git a/README b/README
index 664ba9d..c1862af 100644
--- a/README
+++ b/README
@@ -76,10 +76,9 @@ with:
     libmovit-dev libegl1-mesa-dev libasound2-dev libx264-dev libbmusb-dev \
     protobuf-compiler libprotobuf-dev libpci-dev
 
-Exceptions as of November 2016:
+Exceptions as of December 2016:
 
-  - libusb 1.0.21 is not yet in stretch; you need to fetch it from sid.
-    Same with a bmusb version built against it.
+  - bmusb 0.5.3 is not yet in sid.
 
 
 The patches/ directory contains a patch that helps zita-resampler performance.
diff --git a/bmusb b/bmusb
index a765e06..227166d 160000
--- a/bmusb
+++ b/bmusb
@@ -1 +1 @@
-Subproject commit a765e066b74ac52ff0abf239d430d6f8d83f792e
+Subproject commit 227166d4398263e1191bdfe244f62bc38aaf4c1f
diff --git a/decklink_capture.cpp b/decklink_capture.cpp
index 505083f..5e41da4 100644
--- a/decklink_capture.cpp
+++ b/decklink_capture.cpp
@@ -12,6 +12,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <chrono>
 #include <cstdint>
 #include <utility>
 #include <vector>
@@ -21,6 +22,7 @@
 #define FRAME_SIZE (8 << 20)  // 8 MB.
 
 using namespace std;
+using namespace std::chrono;
 using namespace std::placeholders;
 using namespace bmusb;
 
@@ -353,6 +355,8 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
 
 			video_format.width = width;
 			video_format.height = height;
+
+			current_video_frame.received_timestamp = steady_clock::now();
 		}
 	}
 
@@ -369,6 +373,8 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
 
 			audio_format.bits_per_sample = 32;
 			audio_format.num_channels = 2;
+
+			current_audio_frame.received_timestamp = steady_clock::now();
 		}
 	}
 
diff --git a/flags.cpp b/flags.cpp
index 279ea42..239a688 100644
--- a/flags.cpp
+++ b/flags.cpp
@@ -44,7 +44,8 @@ enum LongOption {
 	OPTION_DISABLE_MAKEUP_GAIN_AUTO,
 	OPTION_ENABLE_MAKEUP_GAIN_AUTO,
 	OPTION_DISABLE_ALSA_OUTPUT,
-	OPTION_NO_FLUSH_PBOS
+	OPTION_NO_FLUSH_PBOS,
+	OPTION_PRINT_VIDEO_LATENCY
 };
 
 void usage()
@@ -98,6 +99,7 @@ void usage()
 	fprintf(stderr, "      --no-flush-pbos             do not explicitly signal texture data uploads\n");
 	fprintf(stderr, "                                    (will give display corruption, but makes it\n");
 	fprintf(stderr, "                                    possible to run with apitrace in real time)\n");
+	fprintf(stderr, "      --print-video-latency       print out measurements of video latency on stdout\n");
 }
 
 void parse_flags(int argc, char * const argv[])
@@ -143,6 +145,7 @@ void parse_flags(int argc, char * const argv[])
 		{ "enable-makeup-gain-auto", no_argument, 0, OPTION_ENABLE_MAKEUP_GAIN_AUTO },
 		{ "disable-alsa-output", no_argument, 0, OPTION_DISABLE_ALSA_OUTPUT },
 		{ "no-flush-pbos", no_argument, 0, OPTION_NO_FLUSH_PBOS },
+		{ "print-video-latency", no_argument, 0, OPTION_PRINT_VIDEO_LATENCY },
 		{ 0, 0, 0, 0 }
 	};
 	vector<string> theme_dirs;
@@ -292,6 +295,9 @@ void parse_flags(int argc, char * const argv[])
 		case OPTION_NO_FLUSH_PBOS:
 			global_flags.flush_pbos = false;
 			break;
+		case OPTION_PRINT_VIDEO_LATENCY:
+			global_flags.print_video_latency = true;
+			break;
 		case OPTION_HELP:
 			usage();
 			exit(0);
diff --git a/flags.h b/flags.h
index 5abc953..7149581 100644
--- a/flags.h
+++ b/flags.h
@@ -40,6 +40,7 @@ struct Flags {
 	bool multichannel_mapping_mode = false;  // Implicitly true if input_mapping_filename is nonempty.
 	std::string input_mapping_filename;  // Empty for none.
 	std::string midi_mapping_filename;  // Empty for none.
+	bool print_video_latency = false;
 };
 extern Flags global_flags;
 
diff --git a/mixer.cpp b/mixer.cpp
index ff9c77f..0a7ecc2 100644
--- a/mixer.cpp
+++ b/mixer.cpp
@@ -511,6 +511,7 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode,
 			new_frame.interlaced = video_format.interlaced;
 			new_frame.upload_func = upload_func;
 			new_frame.dropped_frames = dropped_frames;
+			new_frame.received_timestamp = video_frame.received_timestamp;  // Ignore the audio timestamp.
 			card->new_frames.push(move(new_frame));
 			card->new_frames_changed.notify_all();
 		}
diff --git a/mixer.h b/mixer.h
index f5aa4fb..551e217 100644
--- a/mixer.h
+++ b/mixer.h
@@ -336,6 +336,7 @@ private:
 			unsigned field;  // Which field (0 or 1) of the frame to use. Always 0 for progressive.
 			std::function<void()> upload_func;  // Needs to be called to actually upload the texture to OpenGL.
 			unsigned dropped_frames = 0;  // Number of dropped frames before this one.
+			std::chrono::steady_clock::time_point received_timestamp = std::chrono::steady_clock::time_point::min();
 		};
 		std::queue<NewFrame> new_frames;
 		bool should_quit = false;
diff --git a/print_latency.cpp b/print_latency.cpp
new file mode 100644
index 0000000..0b0a1ae
--- /dev/null
+++ b/print_latency.cpp
@@ -0,0 +1,30 @@
+#include "print_latency.h"
+#include "flags.h"
+
+#include <stdio.h>
+#include <chrono>
+#include <string>
+
+using namespace std;
+using namespace std::chrono;
+
+void print_latency(const string &header, const ReceivedTimestamps &received_ts, bool is_b_frame, int *frameno)
+{
+	if (received_ts.max_ts == steady_clock::time_point::min())
+		return;
+
+	// 101 is chosen so that it's prime, which is unlikely to get the same frame type every time.
+	if (global_flags.print_video_latency && (++*frameno % 101) == 0) {
+		const steady_clock::time_point now = steady_clock::now();
+		printf("%-60s %4.0f ms (lowest-latency input), %4.0f ms (highest-latency input)",
+			header.c_str(),
+			1e3 * std::chrono::duration<double>(now - received_ts.max_ts).count(),
+			1e3 * std::chrono::duration<double>(now - received_ts.min_ts).count());
+
+		if (is_b_frame) {
+			printf("  [on B-frame; potential extra latency]\n");
+		} else {
+			printf("\n");
+		}
+	}
+}
diff --git a/print_latency.h b/print_latency.h
new file mode 100644
index 0000000..9d8a93b
--- /dev/null
+++ b/print_latency.h
@@ -0,0 +1,21 @@
+#ifndef _PRINT_LATENCY_H
+#define _PRINT_LATENCY_H 1
+
+// A small utility function to print the latency between two end points
+// (typically when the frame was received from the video card, and some
+// point when the frame is ready to be output in some form).
+
+#include <chrono>
+#include <string>
+
+// Since every output frame is based on multiple input frames, we need
+// more than one start timestamp. For now, we keep just the smallest
+// and largest timestamp, so that we can print out a range.
+// For both of these, steady_clock::time_point::min() is used for ânot setâ.
+struct ReceivedTimestamps {
+	std::chrono::steady_clock::time_point min_ts, max_ts;
+};
+
+void print_latency(const std::string &header, const ReceivedTimestamps &received_ts, bool is_b_frame, int *frameno);
+
+#endif  // !defined(_PRINT_LATENCY_H)
diff --git a/quicksync_encoder.cpp b/quicksync_encoder.cpp
index 4e97422..2ab1f10 100644
--- a/quicksync_encoder.cpp
+++ b/quicksync_encoder.cpp
@@ -18,6 +18,7 @@
 #include <va/va_enc_h264.h>
 #include <va/va_x11.h>
 #include <algorithm>
+#include <chrono>
 #include <condition_variable>
 #include <cstddef>
 #include <cstdint>
@@ -47,11 +48,13 @@ extern "C" {
 #include "ffmpeg_raii.h"
 #include "flags.h"
 #include "mux.h"
+#include "print_latency.h"
 #include "ref_counted_frame.h"
 #include "timebase.h"
 #include "x264_encoder.h"
 
 using namespace std;
+using namespace std::chrono;
 using namespace std::placeholders;
 
 class QOpenGLContext;
@@ -142,6 +145,7 @@ public:
 	struct Frame {
 		int64_t pts, duration;
 		uint8_t *data;
+		ReceivedTimestamps received_ts;
 
 		// Invert to get the smallest pts first.
 		bool operator< (const Frame &other) const { return pts > other.pts; }
@@ -151,7 +155,7 @@ public:
 	// Does _not_ take ownership of data; a copy is taken if needed.
 	// The returned pointer is valid until the next call to reorder_frame, or destruction.
 	// As a special case, if queue_length == 0, will just return pts and data (no reordering needed).
-	Frame reorder_frame(int64_t pts, int64_t duration, uint8_t *data);
+	Frame reorder_frame(int64_t pts, int64_t duration, uint8_t *data, const ReceivedTimestamps &received_ts);
 
 	// The same as reorder_frame, but without inserting anything. Used to empty the queue.
 	Frame get_first_frame();
@@ -179,22 +183,22 @@ FrameReorderer::FrameReorderer(unsigned queue_length, int width, int height)
 	}
 }
 
-FrameReorderer::Frame FrameReorderer::reorder_frame(int64_t pts, int64_t duration, uint8_t *data)
+FrameReorderer::Frame FrameReorderer::reorder_frame(int64_t pts, int64_t duration, uint8_t *data, const ReceivedTimestamps &received_ts)
 {
 	if (queue_length == 0) {
-		return Frame{pts, duration, data};
+		return Frame{pts, duration, data, received_ts};
 	}
 
 	assert(!freelist.empty());
 	uint8_t *storage = freelist.top();
 	freelist.pop();
 	memcpy(storage, data, width * height * 2);
-	frames.push(Frame{pts, duration, storage});
+	frames.push(Frame{pts, duration, storage, received_ts});
 
 	if (frames.size() >= queue_length) {
 		return get_first_frame();
 	} else {
-		return Frame{-1, -1, nullptr};
+		return Frame{-1, -1, nullptr, steady_clock::time_point::min(), steady_clock::time_point::min()};
 	}
 }
 
@@ -232,6 +236,7 @@ private:
 		int frame_type;
 		vector<float> audio;
 		int64_t pts, dts, duration;
+		ReceivedTimestamps received_ts;
 	};
 	struct PendingFrame {
 		RefCountedGLsync fence;
@@ -1630,6 +1635,10 @@ void QuickSyncEncoderImpl::save_codeddata(storage_task task)
 	}
 	vaUnmapBuffer(va_dpy, gl_surfaces[task.display_order % SURFACE_NUM].coded_buf);
 
+	static int frameno = 0;
+	print_latency("Current QuickSync latency (video inputs â disk mux):",
+		task.received_ts, (task.frame_type == FRAME_B), &frameno);
+
 	{
 		// Add video.
 		AVPacket pkt;
@@ -2034,7 +2043,7 @@ void QuickSyncEncoderImpl::encode_remaining_frames_as_p(int encoding_frame_num,
 				add_packet_for_uncompressed_frame(output_frame.pts, output_frame.duration, output_frame.data);
 			} else {
 				assert(global_flags.x264_video_to_http);
-				x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data);
+				x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data, output_frame.received_ts);
 			}
 		}
 	}
@@ -2081,6 +2090,20 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
 	} while (sync_status == GL_TIMEOUT_EXPIRED);
 	assert(sync_status != GL_WAIT_FAILED);
 
+	// Find min and max timestamp of all input frames that have a timestamp.
+	steady_clock::time_point min_ts = steady_clock::time_point::max(), max_ts = steady_clock::time_point::min();
+	for (const RefCountedFrame &input_frame : frame.input_frames) {
+		if (input_frame && input_frame->received_timestamp > steady_clock::time_point::min()) {
+			min_ts = min(min_ts, input_frame->received_timestamp);
+			max_ts = max(max_ts, input_frame->received_timestamp);
+		}
+	}
+	const ReceivedTimestamps received_ts{ min_ts, max_ts };
+
+	static int frameno = 0;
+	print_latency("Current mixer latency (video inputs â ready for encode):",
+		received_ts, (frame_type == FRAME_B), &frameno);
+
 	// Release back any input frames we needed to render this frame.
 	frame.input_frames.clear();
 
@@ -2109,13 +2132,13 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
 		    global_flags.x264_video_to_http) {
 			// Add uncompressed video. (Note that pts == dts here.)
 			// Delay needs to match audio.
-			FrameReorderer::Frame output_frame = reorderer->reorder_frame(pts + global_delay(), duration, reinterpret_cast<uint8_t *>(surf->y_ptr));
+			FrameReorderer::Frame output_frame = reorderer->reorder_frame(pts + global_delay(), duration, reinterpret_cast<uint8_t *>(surf->y_ptr), received_ts);
 			if (output_frame.data != nullptr) {
 				if (global_flags.uncompressed_video_to_http) {
 					add_packet_for_uncompressed_frame(output_frame.pts, output_frame.duration, output_frame.data);
 				} else {
 					assert(global_flags.x264_video_to_http);
-					x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data);
+					x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data, output_frame.received_ts);
 				}
 			}
 		}
@@ -2156,6 +2179,7 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
 	tmp.pts = pts;
 	tmp.dts = dts;
 	tmp.duration = duration;
+	tmp.received_ts = received_ts;
 	storage_task_enqueue(move(tmp));
 
 	update_ReferenceFrames(frame_type);
diff --git a/x264_encoder.cpp b/x264_encoder.cpp
index 95d3625..fcd20e7 100644
--- a/x264_encoder.cpp
+++ b/x264_encoder.cpp
@@ -1,5 +1,6 @@
 #include "x264_encoder.h"
 
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -10,6 +11,7 @@
 #include "defs.h"
 #include "flags.h"
 #include "mux.h"
+#include "print_latency.h"
 #include "timebase.h"
 #include "x264_speed_control.h"
 
@@ -19,6 +21,7 @@ extern "C" {
 }
 
 using namespace std;
+using namespace std::chrono;
 
 namespace {
 
@@ -55,11 +58,12 @@ X264Encoder::~X264Encoder()
 	encoder_thread.join();
 }
 
-void X264Encoder::add_frame(int64_t pts, int64_t duration, const uint8_t *data)
+void X264Encoder::add_frame(int64_t pts, int64_t duration, const uint8_t *data, const ReceivedTimestamps &received_ts)
 {
 	QueuedFrame qf;
 	qf.pts = pts;
 	qf.duration = duration;
+	qf.received_ts = received_ts;
 
 	{
 		lock_guard<mutex> lock(mu);
@@ -251,6 +255,8 @@ void X264Encoder::encode_frame(X264Encoder::QueuedFrame qf)
 		pic.opaque = reinterpret_cast<void *>(intptr_t(qf.duration));
 
 		input_pic = &pic;
+
+		frames_being_encoded[qf.pts] = qf.received_ts;
 	}
 
 	// See if we have a new bitrate to change to.
@@ -280,6 +286,18 @@ void X264Encoder::encode_frame(X264Encoder::QueuedFrame qf)
 
 	if (num_nal == 0) return;
 
+	if (frames_being_encoded.count(pic.i_pts)) {
+		ReceivedTimestamps received_ts = frames_being_encoded[pic.i_pts];
+		frames_being_encoded.erase(pic.i_pts);
+
+		static int frameno = 0;
+		print_latency("Current x264 latency (video inputs â network mux):",
+			received_ts, (pic.i_type == X264_TYPE_B || pic.i_type == X264_TYPE_BREF),
+			&frameno);
+	} else {
+		assert(false);
+	}
+
 	// We really need one AVPacket for the entire frame, it seems,
 	// so combine it all.
 	size_t num_bytes = buffered_sei.size();
diff --git a/x264_encoder.h b/x264_encoder.h
index f607a91..8adb42a 100644
--- a/x264_encoder.h
+++ b/x264_encoder.h
@@ -20,17 +20,21 @@
 #include <stdint.h>
 #include <x264.h>
 #include <atomic>
+#include <chrono>
 #include <condition_variable>
 #include <memory>
 #include <mutex>
 #include <queue>
 #include <string>
 #include <thread>
+#include <unordered_map>
 
 extern "C" {
 #include <libavformat/avformat.h>
 }
 
+#include "print_latency.h"
+
 class Mux;
 class X264SpeedControl;
 
@@ -47,7 +51,7 @@ public:
 
 	// <data> is taken to be raw NV12 data of WIDTHxHEIGHT resolution.
 	// Does not block.
-	void add_frame(int64_t pts, int64_t duration, const uint8_t *data);
+	void add_frame(int64_t pts, int64_t duration, const uint8_t *data, const ReceivedTimestamps &received_ts);
 
 	std::string get_global_headers() const {
 		while (!x264_init_done) {
@@ -64,6 +68,7 @@ private:
 	struct QueuedFrame {
 		int64_t pts, duration;
 		uint8_t *data;
+		ReceivedTimestamps received_ts;
 	};
 	void encoder_thread_func();
 	void init_x264();
@@ -101,6 +106,9 @@ private:
 
 	// Whenever the state of <queued_frames> changes.
 	std::condition_variable queued_frames_nonempty;
+
+	// Key is the pts of the frame.
+	std::unordered_map<int64_t, ReceivedTimestamps> frames_being_encoded;
 };
 
 #endif  // !defined(_X264ENCODE_H)
-- 
2.39.2