]> git.sesse.net Git - nageru/commitdiff
Wait for frames in render order, not QuickSync order.
authorSteinar H. Gunderson <sgunderson@bigfoot.com>
Mon, 12 Dec 2016 22:15:19 +0000 (23:15 +0100)
committerSteinar H. Gunderson <sgunderson@bigfoot.com>
Mon, 12 Dec 2016 22:15:19 +0000 (23:15 +0100)
Now that we have x264 and uncompressed outputs, there's no need
to add extra latency for these cases, and the gain for going in
QuickSync order is rather marginal anyway, as long as GPUs don't
render out-of-order.

quicksync_encoder.cpp
quicksync_encoder_impl.h

index 7edb3cc03fccc67258f5ecc4066f7f4ab14a2d33..65373808884a1d50d4a4bd48633bd597a8187708 100644 (file)
@@ -118,44 +118,6 @@ static constexpr int rc_default_modes[] = {  // Priority list of modes.
     
 using namespace std;
 
-FrameReorderer::FrameReorderer(unsigned queue_length, int width, int height)
-    : queue_length(queue_length), width(width), height(height)
-{
-       for (unsigned i = 0; i < queue_length; ++i) {
-               owner.emplace_back(new uint8_t[width * height * 2]);
-               freelist.push(owner.back().get());
-       }
-}
-
-FrameReorderer::Frame FrameReorderer::reorder_frame(int64_t pts, int64_t duration, uint8_t *data, const ReceivedTimestamps &received_ts)
-{
-       if (queue_length == 0) {
-               return Frame{pts, duration, data, received_ts};
-       }
-
-       assert(!freelist.empty());
-       uint8_t *storage = freelist.top();
-       freelist.pop();
-       memcpy(storage, data, width * height * 2);
-       frames.push(Frame{pts, duration, storage, received_ts});
-
-       if (frames.size() >= queue_length) {
-               return get_first_frame();
-       } else {
-               return Frame{-1, -1, nullptr, steady_clock::time_point::min(), steady_clock::time_point::min()};
-       }
-}
-
-FrameReorderer::Frame FrameReorderer::get_first_frame()
-{
-       assert(!frames.empty());
-       Frame storage = frames.top();
-       frames.pop();
-       freelist.push(storage.data);
-       return storage;
-}
-
-
 // Supposedly vaRenderPicture() is supposed to destroy the buffer implicitly,
 // but if we don't delete it here, we get leaks. The GStreamer implementation
 // does the same.
@@ -1543,10 +1505,6 @@ QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, movit::R
 
        //print_input();
 
-       if (global_flags.uncompressed_video_to_http ||
-           global_flags.x264_video_to_http) {
-               reorderer.reset(new FrameReorderer(ip_period - 1, frame_width, frame_height));
-       }
        if (global_flags.x264_video_to_http) {
                assert(x264_encoder != nullptr);
        } else {
@@ -1701,7 +1659,7 @@ RefCountedGLsync QuickSyncEncoderImpl::end_frame(int64_t pts, int64_t duration,
 
        {
                unique_lock<mutex> lock(frame_queue_mutex);
-               pending_video_frames[current_storage_frame] = PendingFrame{ fence, input_frames, pts, duration };
+               pending_video_frames.push(PendingFrame{ fence, input_frames, pts, duration });
                ++current_storage_frame;
        }
        frame_queue_nonempty.notify_all();
@@ -1762,58 +1720,76 @@ void QuickSyncEncoderImpl::encode_thread_func()
 {
        int64_t last_dts = -1;
        int gop_start_display_frame_num = 0;
-       for (int encoding_frame_num = 0; ; ++encoding_frame_num) {
+       for (int display_frame_num = 0; ; ++display_frame_num) {
+               // Wait for the frame to be in the queue. Note that this only means
+               // we started rendering it.
                PendingFrame frame;
-               int pts_lag;
-               int frame_type, display_frame_num;
-               encoding2display_order(encoding_frame_num, intra_period, intra_idr_period, ip_period,
-                                      &display_frame_num, &frame_type, &pts_lag);
-               if (frame_type == FRAME_IDR) {
-                       numShortTerm = 0;
-                       current_frame_num = 0;
-                       gop_start_display_frame_num = display_frame_num;
-               }
-
                {
                        unique_lock<mutex> lock(frame_queue_mutex);
-                       frame_queue_nonempty.wait(lock, [this, display_frame_num]{
-                               return encode_thread_should_quit || pending_video_frames.count(display_frame_num) != 0;
+                       frame_queue_nonempty.wait(lock, [this]{
+                               return encode_thread_should_quit || !pending_video_frames.empty();
                        });
-                       if (encode_thread_should_quit && pending_video_frames.count(display_frame_num) == 0) {
-                               // We have queued frames that were supposed to be B-frames,
-                               // but will be no P-frame to encode them against. Encode them all
-                               // as P-frames instead. Note that this happens under the mutex,
+                       if (encode_thread_should_quit && pending_video_frames.empty()) {
+                               // We may have queued frames left in the reorder buffer
+                               // that were supposed to be B-frames, but have no P-frame
+                               // to be encoded against. If so, encode them all as
+                               // P-frames instead. Note that this happens under the mutex,
                                // but nobody else uses it at this point, since we're shutting down,
                                // so there's no contention.
-                               encode_remaining_frames_as_p(encoding_frame_num, gop_start_display_frame_num, last_dts);
+                               encode_remaining_frames_as_p(quicksync_encoding_frame_num, gop_start_display_frame_num, last_dts);
                                return;
                        } else {
-                               frame = move(pending_video_frames[display_frame_num]);
-                               pending_video_frames.erase(display_frame_num);
+                               frame = move(pending_video_frames.front());
+                               pending_video_frames.pop();
                        }
                }
 
-               // Determine the dts of this frame.
-               int64_t dts;
-               if (pts_lag == -1) {
-                       assert(last_dts != -1);
-                       dts = last_dts + (TIMEBASE / MAX_FPS);
-               } else {
-                       dts = frame.pts - pts_lag;
-               }
-               last_dts = dts;
+               // Pass the frame on to x264 (or uncompressed to HTTP) as needed.
+               // Note that this implicitly waits for the frame to be done rendering.
+               pass_frame(frame, display_frame_num, frame.pts, frame.duration);
+               reorder_buffer[display_frame_num] = move(frame);
+
+               // Now encode as many QuickSync frames as we can using the frames we have available.
+               // (It could be zero, or it could be multiple.) FIXME: make a function.
+               for ( ;; ) {
+                       int pts_lag;
+                       int frame_type, quicksync_display_frame_num;
+                       encoding2display_order(quicksync_encoding_frame_num, intra_period, intra_idr_period, ip_period,
+                                              &quicksync_display_frame_num, &frame_type, &pts_lag);
+                       if (!reorder_buffer.count(quicksync_display_frame_num)) {
+                               break;
+                       }
 
-               encode_frame(frame, encoding_frame_num, display_frame_num, gop_start_display_frame_num, frame_type, frame.pts, dts, frame.duration);
+                       if (frame_type == FRAME_IDR) {
+                               numShortTerm = 0;
+                               current_frame_num = 0;
+                               gop_start_display_frame_num = quicksync_display_frame_num;
+                       }
+
+                       // Determine the dts of this frame.
+                       int64_t dts;
+                       if (pts_lag == -1) {
+                               assert(last_dts != -1);
+                               dts = last_dts + (TIMEBASE / MAX_FPS);
+                       } else {
+                               dts = frame.pts - pts_lag;
+                       }
+                       last_dts = dts;
+
+                       encode_frame(reorder_buffer[quicksync_display_frame_num], quicksync_encoding_frame_num, quicksync_display_frame_num, gop_start_display_frame_num, frame_type, frame.pts, dts, frame.duration);
+                       reorder_buffer.erase(quicksync_display_frame_num);
+                       ++quicksync_encoding_frame_num;
+               }
        }
 }
 
 void QuickSyncEncoderImpl::encode_remaining_frames_as_p(int encoding_frame_num, int gop_start_display_frame_num, int64_t last_dts)
 {
-       if (pending_video_frames.empty()) {
+       if (reorder_buffer.empty()) {
                return;
        }
 
-       for (auto &pending_frame : pending_video_frames) {
+       for (auto &pending_frame : reorder_buffer) {
                int display_frame_num = pending_frame.first;
                assert(display_frame_num > 0);
                PendingFrame frame = move(pending_frame.second);
@@ -1822,20 +1798,6 @@ void QuickSyncEncoderImpl::encode_remaining_frames_as_p(int encoding_frame_num,
                encode_frame(frame, encoding_frame_num++, display_frame_num, gop_start_display_frame_num, FRAME_P, frame.pts, dts, frame.duration);
                last_dts = dts;
        }
-
-       if (global_flags.uncompressed_video_to_http ||
-           global_flags.x264_video_to_http) {
-               // Add frames left in reorderer.
-               while (!reorderer->empty()) {
-                       FrameReorderer::Frame output_frame = reorderer->get_first_frame();
-                       if (global_flags.uncompressed_video_to_http) {
-                               add_packet_for_uncompressed_frame(output_frame.pts, output_frame.duration, output_frame.data);
-                       } else {
-                               assert(global_flags.x264_video_to_http);
-                               x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data, output_frame.received_ts);
-                       }
-               }
-       }
 }
 
 void QuickSyncEncoderImpl::add_packet_for_uncompressed_frame(int64_t pts, int64_t duration, const uint8_t *data)
@@ -1866,10 +1828,22 @@ void memcpy_with_pitch(uint8_t *dst, const uint8_t *src, size_t src_width, size_
        }
 }
 
+ReceivedTimestamps find_received_timestamp(const vector<RefCountedFrame> &input_frames)
+{
+       // Find min and max timestamp of all input frames that have a timestamp.
+       steady_clock::time_point min_ts = steady_clock::time_point::max(), max_ts = steady_clock::time_point::min();
+       for (const RefCountedFrame &input_frame : input_frames) {
+               if (input_frame && input_frame->received_timestamp > steady_clock::time_point::min()) {
+                       min_ts = min(min_ts, input_frame->received_timestamp);
+                       max_ts = max(max_ts, input_frame->received_timestamp);
+               }
+       }
+       return { min_ts, max_ts };
+}
+
 }  // namespace
 
-void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame, int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num,
-                                   int frame_type, int64_t pts, int64_t dts, int64_t duration)
+void QuickSyncEncoderImpl::pass_frame(QuickSyncEncoderImpl::PendingFrame frame, int display_frame_num, int64_t pts, int64_t duration)
 {
        // Wait for the GPU to be done with the frame.
        GLenum sync_status;
@@ -1879,19 +1853,28 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
        } while (sync_status == GL_TIMEOUT_EXPIRED);
        assert(sync_status != GL_WAIT_FAILED);
 
-       // Find min and max timestamp of all input frames that have a timestamp.
-       steady_clock::time_point min_ts = steady_clock::time_point::max(), max_ts = steady_clock::time_point::min();
-       for (const RefCountedFrame &input_frame : frame.input_frames) {
-               if (input_frame && input_frame->received_timestamp > steady_clock::time_point::min()) {
-                       min_ts = min(min_ts, input_frame->received_timestamp);
-                       max_ts = max(max_ts, input_frame->received_timestamp);
-               }
-       }
-       const ReceivedTimestamps received_ts{ min_ts, max_ts };
+       ReceivedTimestamps received_ts = find_received_timestamp(frame.input_frames);
+       static int frameno = 0;
+       print_latency("Current mixer latency (video inputs → ready for encode):",
+               received_ts, false, &frameno);
 
        // Release back any input frames we needed to render this frame.
        frame.input_frames.clear();
 
+       GLSurface *surf = &gl_surfaces[display_frame_num % SURFACE_NUM];
+       uint8_t *data = reinterpret_cast<uint8_t *>(surf->y_ptr);
+       if (global_flags.uncompressed_video_to_http) {
+               add_packet_for_uncompressed_frame(pts, duration, data);
+       } else if (global_flags.x264_video_to_http) {
+               x264_encoder->add_frame(pts, duration, data, received_ts);
+       }
+}
+
+void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame, int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num,
+                                        int frame_type, int64_t pts, int64_t dts, int64_t duration)
+{
+       const ReceivedTimestamps received_ts = find_received_timestamp(frame.input_frames);
+
        GLSurface *surf = &gl_surfaces[display_frame_num % SURFACE_NUM];
        VAStatus va_status;
 
@@ -1901,6 +1884,7 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
                va_status = vaReleaseBufferHandle(va_dpy, surf->surface_image.buf);
                CHECK_VASTATUS(va_status, "vaReleaseBufferHandle");
        } else {
+               // Upload the frame to VA-API.
                unsigned char *surface_p = nullptr;
                vaMapBuffer(va_dpy, surf->surface_image.buf, (void **)&surface_p);
 
@@ -1912,27 +1896,8 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
 
                va_status = vaUnmapBuffer(va_dpy, surf->surface_image.buf);
                CHECK_VASTATUS(va_status, "vaUnmapBuffer");
-
-               if (global_flags.uncompressed_video_to_http ||
-                   global_flags.x264_video_to_http) {
-                       // Add uncompressed video. (Note that pts == dts here.)
-                       // Delay needs to match audio.
-                       FrameReorderer::Frame output_frame = reorderer->reorder_frame(pts + global_delay(), duration, reinterpret_cast<uint8_t *>(surf->y_ptr), received_ts);
-                       if (output_frame.data != nullptr) {
-                               if (global_flags.uncompressed_video_to_http) {
-                                       add_packet_for_uncompressed_frame(output_frame.pts, output_frame.duration, output_frame.data);
-                               } else {
-                                       assert(global_flags.x264_video_to_http);
-                                       x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data, output_frame.received_ts);
-                               }
-                       }
-               }
        }
 
-       static int frameno = 0;
-       print_latency("Current mixer latency (video inputs → ready for encode):",
-               received_ts, (frame_type == FRAME_B), &frameno);
-
        va_status = vaDestroyImage(va_dpy, surf->surface_image.image_id);
        CHECK_VASTATUS(va_status, "vaDestroyImage");
 
index 648169276cccb725e5cd641b4372dc443f44a477..72eb3b68d97bc4b5168082db9e0c0a8d6640f1cc 100644 (file)
@@ -29,54 +29,6 @@ struct __bitstream {
 };
 typedef struct __bitstream bitstream;
 
-// H.264 video comes out in encoding order (e.g. with two B-frames:
-// 0, 3, 1, 2, 6, 4, 5, etc.), but uncompressed video needs to
-// come in the right order. Since we do everything, including waiting
-// for the frames to come out of OpenGL, in encoding order, we need
-// a reordering buffer for uncompressed frames so that they come out
-// correctly. We go the super-lazy way of not making it understand
-// anything about the true order (which introduces some extra latency,
-// though); we know that for N B-frames we need at most (N-1) frames
-// in the reorder buffer, and can just sort on that.
-//
-// The class also deals with keeping a freelist as needed.
-class FrameReorderer {
-public:
-       FrameReorderer(unsigned queue_length, int width, int height);
-
-       struct Frame {
-               int64_t pts, duration;
-               uint8_t *data;
-               ReceivedTimestamps received_ts;
-
-               // Invert to get the smallest pts first.
-               bool operator< (const Frame &other) const { return pts > other.pts; }
-       };
-
-       // Returns the next frame to insert with its pts, if any. Otherwise -1 and nullptr.
-       // Does _not_ take ownership of data; a copy is taken if needed.
-       // The returned pointer is valid until the next call to reorder_frame, or destruction.
-       // As a special case, if queue_length == 0, will just return pts and data (no reordering needed).
-       Frame reorder_frame(int64_t pts, int64_t duration, uint8_t *data, const ReceivedTimestamps &received_ts);
-
-       // The same as reorder_frame, but without inserting anything. Used to empty the queue.
-       Frame get_first_frame();
-
-       bool empty() const { return frames.empty(); }
-
-private:
-       unsigned queue_length;
-       int width, height;
-
-       std::priority_queue<Frame> frames;
-       std::stack<uint8_t *> freelist;  // Includes the last value returned from reorder_frame.
-
-       // Owns all the pointers. Normally, freelist and frames could do this themselves,
-       // except priority_queue doesn't work well with movable-only types.
-       std::vector<std::unique_ptr<uint8_t[]>> owner;
-};
-
-
 class QuickSyncEncoderImpl {
 public:
        QuickSyncEncoderImpl(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const std::string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator);
@@ -114,6 +66,7 @@ private:
        void encode_thread_func();
        void encode_remaining_frames_as_p(int encoding_frame_num, int gop_start_display_frame_num, int64_t last_dts);
        void add_packet_for_uncompressed_frame(int64_t pts, int64_t duration, const uint8_t *data);
+       void pass_frame(PendingFrame frame, int display_frame_num, int64_t pts, int64_t duration);
        void encode_frame(PendingFrame frame, int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num,
                          int frame_type, int64_t pts, int64_t dts, int64_t duration);
        void storage_task_thread();
@@ -160,13 +113,18 @@ private:
 
        int current_storage_frame;
 
-       std::map<int, PendingFrame> pending_video_frames;  // under frame_queue_mutex
+       std::queue<PendingFrame> pending_video_frames;  // under frame_queue_mutex
        movit::ResourcePool *resource_pool;
        QSurface *surface;
 
+       // Frames that are done rendering and passed on to x264 (if enabled),
+       // but have not been encoded by Quick Sync yet, and thus also not freed.
+       // The key is the display frame number.
+       std::map<int, PendingFrame> reorder_buffer;
+       int quicksync_encoding_frame_num = 0;
+
        std::unique_ptr<AudioEncoder> file_audio_encoder;
 
-       std::unique_ptr<FrameReorderer> reorderer;
        X264Encoder *x264_encoder;  // nullptr if not using x264.
 
        Mux* stream_mux = nullptr;  // To HTTP.