git.sesse.net Git - nageru/commitdiff
Add a switch to print video latency.
author: Steinar H. Gunderson <sgunderson@bigfoot.com>
Sat, 10 Dec 2016 14:58:35 +0000 (15:58 +0100)
committer: Steinar H. Gunderson <sgunderson@bigfoot.com>
Sat, 10 Dec 2016 14:58:35 +0000 (15:58 +0100)
This ignores mux latency and audio latency, but it's a very good start
for ongoing low-latency debugging.

Requires bmusb 0.5.3.

13 files changed:
Makefile
README
bmusb
decklink_capture.cpp
flags.cpp
flags.h
mixer.cpp
mixer.h
print_latency.cpp [new file with mode: 0644]
print_latency.h [new file with mode: 0644]
quicksync_encoder.cpp
x264_encoder.cpp
x264_encoder.h

index 6b7925402c2c785fbf6c07f6dbc1a40bd57748f8..0380d0d53076531fe6322c6129c47bb14bd3f4da 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -22,7 +22,7 @@ OBJS += midi_mapper.o midi_mapping.pb.o
 
 # Mixer objects
 AUDIO_MIXER_OBJS = audio_mixer.o alsa_input.o alsa_pool.o ebu_r128_proc.o stereocompressor.o resampling_queue.o flags.o correlation_measurer.o filter.o input_mapping.o state.pb.o
-OBJS += mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o $(AUDIO_MIXER_OBJS)
+OBJS += mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o print_latency.o $(AUDIO_MIXER_OBJS)
 
 # Streaming and encoding objects
 OBJS += quicksync_encoder.o x264_encoder.o x264_speed_control.o video_encoder.o metacube2.o mux.o audio_encoder.o ffmpeg_raii.o
diff --git a/README b/README
index 664ba9d58dcc9f28866f7026b858bf6130223f83..c1862af77d2876166614b137bfda0c7463efc2f4 100644 (file)
--- a/README
+++ b/README
@@ -76,10 +76,9 @@ with:
     libmovit-dev libegl1-mesa-dev libasound2-dev libx264-dev libbmusb-dev \
     protobuf-compiler libprotobuf-dev libpci-dev
 
-Exceptions as of November 2016:
+Exceptions as of December 2016:
 
-  - libusb 1.0.21 is not yet in stretch; you need to fetch it from sid.
-    Same with a bmusb version built against it.
+  - bmusb 0.5.3 is not yet in sid.
 
 
 The patches/ directory contains a patch that helps zita-resampler performance.
diff --git a/bmusb b/bmusb
index a765e066b74ac52ff0abf239d430d6f8d83f792e..227166d4398263e1191bdfe244f62bc38aaf4c1f 160000 (submodule)
--- a/bmusb
+++ b/bmusb
@@ -1 +1 @@
-Subproject commit a765e066b74ac52ff0abf239d430d6f8d83f792e
+Subproject commit 227166d4398263e1191bdfe244f62bc38aaf4c1f
index 505083ff056a72018069ec2e3ae9e2573e5ab51a..5e41da436ef71b0920495e38137390661f4b09fb 100644 (file)
@@ -12,6 +12,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <chrono>
 #include <cstdint>
 #include <utility>
 #include <vector>
@@ -21,6 +22,7 @@
 #define FRAME_SIZE (8 << 20)  // 8 MB.
 
 using namespace std;
+using namespace std::chrono;
 using namespace std::placeholders;
 using namespace bmusb;
 
@@ -353,6 +355,8 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
 
                        video_format.width = width;
                        video_format.height = height;
+
+                       current_video_frame.received_timestamp = steady_clock::now();
                }
        }
 
@@ -369,6 +373,8 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
 
                        audio_format.bits_per_sample = 32;
                        audio_format.num_channels = 2;
+
+                       current_audio_frame.received_timestamp = steady_clock::now();
                }
        }
 
index 279ea426e92d0644a4a1eb82cd00edd1fc6df3e9..239a688e428e4871296c8ed9644e8ddda817ffa7 100644 (file)
--- a/flags.cpp
+++ b/flags.cpp
@@ -44,7 +44,8 @@ enum LongOption {
        OPTION_DISABLE_MAKEUP_GAIN_AUTO,
        OPTION_ENABLE_MAKEUP_GAIN_AUTO,
        OPTION_DISABLE_ALSA_OUTPUT,
-       OPTION_NO_FLUSH_PBOS
+       OPTION_NO_FLUSH_PBOS,
+       OPTION_PRINT_VIDEO_LATENCY
 };
 
 void usage()
@@ -98,6 +99,7 @@ void usage()
        fprintf(stderr, "      --no-flush-pbos             do not explicitly signal texture data uploads\n");
        fprintf(stderr, "                                    (will give display corruption, but makes it\n");
        fprintf(stderr, "                                    possible to run with apitrace in real time)\n");
+       fprintf(stderr, "      --print-video-latency       print out measurements of video latency on stdout\n");
 }
 
 void parse_flags(int argc, char * const argv[])
@@ -143,6 +145,7 @@ void parse_flags(int argc, char * const argv[])
                { "enable-makeup-gain-auto", no_argument, 0, OPTION_ENABLE_MAKEUP_GAIN_AUTO },
                { "disable-alsa-output", no_argument, 0, OPTION_DISABLE_ALSA_OUTPUT },
                { "no-flush-pbos", no_argument, 0, OPTION_NO_FLUSH_PBOS },
+               { "print-video-latency", no_argument, 0, OPTION_PRINT_VIDEO_LATENCY },
                { 0, 0, 0, 0 }
        };
        vector<string> theme_dirs;
@@ -292,6 +295,9 @@ void parse_flags(int argc, char * const argv[])
                case OPTION_NO_FLUSH_PBOS:
                        global_flags.flush_pbos = false;
                        break;
+               case OPTION_PRINT_VIDEO_LATENCY:
+                       global_flags.print_video_latency = true;
+                       break;
                case OPTION_HELP:
                        usage();
                        exit(0);
diff --git a/flags.h b/flags.h
index 5abc9530470cc0ac9b086c21661f22d2a1c75df6..71495816d0d22734f8560c2d1e38aa79389e3a20 100644 (file)
--- a/flags.h
+++ b/flags.h
@@ -40,6 +40,7 @@ struct Flags {
        bool multichannel_mapping_mode = false;  // Implicitly true if input_mapping_filename is nonempty.
        std::string input_mapping_filename;  // Empty for none.
        std::string midi_mapping_filename;  // Empty for none.
+       bool print_video_latency = false;
 };
 extern Flags global_flags;
 
index ff9c77f49f213fd80ef268a4590b2deac89e9528..0a7ecc21a070012e65a55c8e9e51f6f436641244 100644 (file)
--- a/mixer.cpp
+++ b/mixer.cpp
@@ -511,6 +511,7 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode,
                        new_frame.interlaced = video_format.interlaced;
                        new_frame.upload_func = upload_func;
                        new_frame.dropped_frames = dropped_frames;
+                       new_frame.received_timestamp = video_frame.received_timestamp;  // Ignore the audio timestamp.
                        card->new_frames.push(move(new_frame));
                        card->new_frames_changed.notify_all();
                }
diff --git a/mixer.h b/mixer.h
index f5aa4fbdd014758faa047f21bc4dfdf701467a81..551e21790d9f71daacd89d1fc6a740baca0723a1 100644 (file)
--- a/mixer.h
+++ b/mixer.h
@@ -336,6 +336,7 @@ private:
                        unsigned field;  // Which field (0 or 1) of the frame to use. Always 0 for progressive.
                        std::function<void()> upload_func;  // Needs to be called to actually upload the texture to OpenGL.
                        unsigned dropped_frames = 0;  // Number of dropped frames before this one.
+                       std::chrono::steady_clock::time_point received_timestamp = std::chrono::steady_clock::time_point::min();
                };
                std::queue<NewFrame> new_frames;
                bool should_quit = false;
diff --git a/print_latency.cpp b/print_latency.cpp
new file mode 100644 (file)
index 0000000..0b0a1ae
--- /dev/null
@@ -0,0 +1,30 @@
+#include "print_latency.h"
+#include "flags.h"
+
+#include <stdio.h>
+#include <chrono>
+#include <string>
+
+using namespace std;
+using namespace std::chrono;
+
+void print_latency(const string &header, const ReceivedTimestamps &received_ts, bool is_b_frame, int *frameno)
+{
+       if (received_ts.max_ts == steady_clock::time_point::min())
+               return;
+
+       // 101 is chosen so that it's prime, which is unlikely to get the same frame type every time.
+       if (global_flags.print_video_latency && (++*frameno % 101) == 0) {
+               const steady_clock::time_point now = steady_clock::now();
+               printf("%-60s %4.0f ms (lowest-latency input), %4.0f ms (highest-latency input)",
+                       header.c_str(),
+                       1e3 * std::chrono::duration<double>(now - received_ts.max_ts).count(),
+                       1e3 * std::chrono::duration<double>(now - received_ts.min_ts).count());
+
+               if (is_b_frame) {
+                       printf("  [on B-frame; potential extra latency]\n");
+               } else {
+                       printf("\n");
+               }
+       }
+}
diff --git a/print_latency.h b/print_latency.h
new file mode 100644 (file)
index 0000000..9d8a93b
--- /dev/null
@@ -0,0 +1,21 @@
+#ifndef _PRINT_LATENCY_H
+#define _PRINT_LATENCY_H 1
+
+// A small utility function to print the latency between two end points
+// (typically when the frame was received from the video card, and some
+// point when the frame is ready to be output in some form).
+
+#include <chrono>
+#include <string>
+
+// Since every output frame is based on multiple input frames, we need
+// more than one start timestamp. For now, we keep just the smallest
+// and largest timestamp, so that we can print out a range.
+// For both of these, steady_clock::time_point::min() is used for “not set”.
+struct ReceivedTimestamps {
+       std::chrono::steady_clock::time_point min_ts, max_ts;
+};
+
+void print_latency(const std::string &header, const ReceivedTimestamps &received_ts, bool is_b_frame, int *frameno);
+
+#endif  // !defined(_PRINT_LATENCY_H)
index 4e974223d15def72a9d348a2a4eb94039077d0e2..2ab1f10f7c5f50d73073276094e242c584ea3f48 100644 (file)
@@ -18,6 +18,7 @@
 #include <va/va_enc_h264.h>
 #include <va/va_x11.h>
 #include <algorithm>
+#include <chrono>
 #include <condition_variable>
 #include <cstddef>
 #include <cstdint>
@@ -47,11 +48,13 @@ extern "C" {
 #include "ffmpeg_raii.h"
 #include "flags.h"
 #include "mux.h"
+#include "print_latency.h"
 #include "ref_counted_frame.h"
 #include "timebase.h"
 #include "x264_encoder.h"
 
 using namespace std;
+using namespace std::chrono;
 using namespace std::placeholders;
 
 class QOpenGLContext;
@@ -142,6 +145,7 @@ public:
        struct Frame {
                int64_t pts, duration;
                uint8_t *data;
+               ReceivedTimestamps received_ts;
 
                // Invert to get the smallest pts first.
                bool operator< (const Frame &other) const { return pts > other.pts; }
@@ -151,7 +155,7 @@ public:
        // Does _not_ take ownership of data; a copy is taken if needed.
        // The returned pointer is valid until the next call to reorder_frame, or destruction.
        // As a special case, if queue_length == 0, will just return pts and data (no reordering needed).
-       Frame reorder_frame(int64_t pts, int64_t duration, uint8_t *data);
+       Frame reorder_frame(int64_t pts, int64_t duration, uint8_t *data, const ReceivedTimestamps &received_ts);
 
        // The same as reorder_frame, but without inserting anything. Used to empty the queue.
        Frame get_first_frame();
@@ -179,22 +183,22 @@ FrameReorderer::FrameReorderer(unsigned queue_length, int width, int height)
        }
 }
 
-FrameReorderer::Frame FrameReorderer::reorder_frame(int64_t pts, int64_t duration, uint8_t *data)
+FrameReorderer::Frame FrameReorderer::reorder_frame(int64_t pts, int64_t duration, uint8_t *data, const ReceivedTimestamps &received_ts)
 {
        if (queue_length == 0) {
-               return Frame{pts, duration, data};
+               return Frame{pts, duration, data, received_ts};
        }
 
        assert(!freelist.empty());
        uint8_t *storage = freelist.top();
        freelist.pop();
        memcpy(storage, data, width * height * 2);
-       frames.push(Frame{pts, duration, storage});
+       frames.push(Frame{pts, duration, storage, received_ts});
 
        if (frames.size() >= queue_length) {
                return get_first_frame();
        } else {
-               return Frame{-1, -1, nullptr};
+               return Frame{-1, -1, nullptr, steady_clock::time_point::min(), steady_clock::time_point::min()};
        }
 }
 
@@ -232,6 +236,7 @@ private:
                int frame_type;
                vector<float> audio;
                int64_t pts, dts, duration;
+               ReceivedTimestamps received_ts;
        };
        struct PendingFrame {
                RefCountedGLsync fence;
@@ -1630,6 +1635,10 @@ void QuickSyncEncoderImpl::save_codeddata(storage_task task)
        }
        vaUnmapBuffer(va_dpy, gl_surfaces[task.display_order % SURFACE_NUM].coded_buf);
 
+       static int frameno = 0;
+       print_latency("Current QuickSync latency (video inputs → disk mux):",
+               task.received_ts, (task.frame_type == FRAME_B), &frameno);
+
        {
                // Add video.
                AVPacket pkt;
@@ -2034,7 +2043,7 @@ void QuickSyncEncoderImpl::encode_remaining_frames_as_p(int encoding_frame_num,
                                add_packet_for_uncompressed_frame(output_frame.pts, output_frame.duration, output_frame.data);
                        } else {
                                assert(global_flags.x264_video_to_http);
-                               x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data);
+                               x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data, output_frame.received_ts);
                        }
                }
        }
@@ -2081,6 +2090,20 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
        } while (sync_status == GL_TIMEOUT_EXPIRED);
        assert(sync_status != GL_WAIT_FAILED);
 
+       // Find min and max timestamp of all input frames that have a timestamp.
+       steady_clock::time_point min_ts = steady_clock::time_point::max(), max_ts = steady_clock::time_point::min();
+       for (const RefCountedFrame &input_frame : frame.input_frames) {
+               if (input_frame && input_frame->received_timestamp > steady_clock::time_point::min()) {
+                       min_ts = min(min_ts, input_frame->received_timestamp);
+                       max_ts = max(max_ts, input_frame->received_timestamp);
+               }
+       }
+       const ReceivedTimestamps received_ts{ min_ts, max_ts };
+
+       static int frameno = 0;
+       print_latency("Current mixer latency (video inputs → ready for encode):",
+               received_ts, (frame_type == FRAME_B), &frameno);
+
        // Release back any input frames we needed to render this frame.
        frame.input_frames.clear();
 
@@ -2109,13 +2132,13 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
                    global_flags.x264_video_to_http) {
                        // Add uncompressed video. (Note that pts == dts here.)
                        // Delay needs to match audio.
-                       FrameReorderer::Frame output_frame = reorderer->reorder_frame(pts + global_delay(), duration, reinterpret_cast<uint8_t *>(surf->y_ptr));
+                       FrameReorderer::Frame output_frame = reorderer->reorder_frame(pts + global_delay(), duration, reinterpret_cast<uint8_t *>(surf->y_ptr), received_ts);
                        if (output_frame.data != nullptr) {
                                if (global_flags.uncompressed_video_to_http) {
                                        add_packet_for_uncompressed_frame(output_frame.pts, output_frame.duration, output_frame.data);
                                } else {
                                        assert(global_flags.x264_video_to_http);
-                                       x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data);
+                                       x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data, output_frame.received_ts);
                                }
                        }
                }
@@ -2156,6 +2179,7 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
        tmp.pts = pts;
        tmp.dts = dts;
        tmp.duration = duration;
+       tmp.received_ts = received_ts;
        storage_task_enqueue(move(tmp));
 
        update_ReferenceFrames(frame_type);
index 95d3625354c23ac539cb42917befcb5145c42e98..fcd20e7aa519b33345a86b08728177bddf515b8b 100644 (file)
@@ -1,5 +1,6 @@
 #include "x264_encoder.h"
 
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -10,6 +11,7 @@
 #include "defs.h"
 #include "flags.h"
 #include "mux.h"
+#include "print_latency.h"
 #include "timebase.h"
 #include "x264_speed_control.h"
 
@@ -19,6 +21,7 @@ extern "C" {
 }
 
 using namespace std;
+using namespace std::chrono;
 
 namespace {
 
@@ -55,11 +58,12 @@ X264Encoder::~X264Encoder()
        encoder_thread.join();
 }
 
-void X264Encoder::add_frame(int64_t pts, int64_t duration, const uint8_t *data)
+void X264Encoder::add_frame(int64_t pts, int64_t duration, const uint8_t *data, const ReceivedTimestamps &received_ts)
 {
        QueuedFrame qf;
        qf.pts = pts;
        qf.duration = duration;
+       qf.received_ts = received_ts;
 
        {
                lock_guard<mutex> lock(mu);
@@ -251,6 +255,8 @@ void X264Encoder::encode_frame(X264Encoder::QueuedFrame qf)
                pic.opaque = reinterpret_cast<void *>(intptr_t(qf.duration));
 
                input_pic = &pic;
+
+               frames_being_encoded[qf.pts] = qf.received_ts;
        }
 
        // See if we have a new bitrate to change to.
@@ -280,6 +286,18 @@ void X264Encoder::encode_frame(X264Encoder::QueuedFrame qf)
 
        if (num_nal == 0) return;
 
+       if (frames_being_encoded.count(pic.i_pts)) {
+               ReceivedTimestamps received_ts = frames_being_encoded[pic.i_pts];
+               frames_being_encoded.erase(pic.i_pts);
+
+               static int frameno = 0;
+               print_latency("Current x264 latency (video inputs → network mux):",
+                       received_ts, (pic.i_type == X264_TYPE_B || pic.i_type == X264_TYPE_BREF),
+                       &frameno);
+       } else {
+               assert(false);
+       }
+
        // We really need one AVPacket for the entire frame, it seems,
        // so combine it all.
        size_t num_bytes = buffered_sei.size();
index f607a91df8ab4786f1d27a7df25910ceffe6795f..8adb42ae19102808b2f35d3395aeb3a83e50d9e0 100644 (file)
 #include <stdint.h>
 #include <x264.h>
 #include <atomic>
+#include <chrono>
 #include <condition_variable>
 #include <memory>
 #include <mutex>
 #include <queue>
 #include <string>
 #include <thread>
+#include <unordered_map>
 
 extern "C" {
 #include <libavformat/avformat.h>
 }
 
+#include "print_latency.h"
+
 class Mux;
 class X264SpeedControl;
 
@@ -47,7 +51,7 @@ public:
 
        // <data> is taken to be raw NV12 data of WIDTHxHEIGHT resolution.
        // Does not block.
-       void add_frame(int64_t pts, int64_t duration, const uint8_t *data);
+       void add_frame(int64_t pts, int64_t duration, const uint8_t *data, const ReceivedTimestamps &received_ts);
 
        std::string get_global_headers() const {
                while (!x264_init_done) {
@@ -64,6 +68,7 @@ private:
        struct QueuedFrame {
                int64_t pts, duration;
                uint8_t *data;
+               ReceivedTimestamps received_ts;
        };
        void encoder_thread_func();
        void init_x264();
@@ -101,6 +106,9 @@ private:
 
        // Whenever the state of <queued_frames> changes.
        std::condition_variable queued_frames_nonempty;
+
+       // Key is the pts of the frame.
+       std::unordered_map<int64_t, ReceivedTimestamps> frames_being_encoded;
 };
 
 #endif  // !defined(_X264ENCODE_H)