git.sesse.net Git - nageru/blobdiff - quicksync_encoder.cpp
Do not link kaeru against CEF.
[nageru] / quicksync_encoder.cpp
index 7edb3cc03fccc67258f5ecc4066f7f4ab14a2d33..67e9668faaecbac253e6d37877664651ba47a927 100644 (file)
@@ -1,5 +1,6 @@
 #include "quicksync_encoder.h"
 
+#include <movit/image_format.h>
 #include <movit/resource_pool.h>  // Must be above the Xlib includes.
 #include <movit/util.h>
 
@@ -8,6 +9,7 @@
 #include <assert.h>
 #include <epoxy/egl.h>
 #include <fcntl.h>
+#include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -54,6 +56,7 @@ extern "C" {
 #include "timebase.h"
 #include "x264_encoder.h"
 
+using namespace movit;
 using namespace std;
 using namespace std::chrono;
 using namespace std::placeholders;
@@ -61,6 +64,18 @@ using namespace std::placeholders;
 class QOpenGLContext;
 class QSurface;
 
+namespace {
+
+// These need to survive several QuickSyncEncoderImpl instances,
+// so they are outside.
+once_flag quick_sync_metrics_inited;
+LatencyHistogram mixer_latency_histogram, qs_latency_histogram;
+MuxMetrics current_file_mux_metrics, total_mux_metrics;
+std::atomic<double> metric_current_file_start_time_seconds{0.0 / 0.0};
+std::atomic<int64_t> metric_quick_sync_stalled_frames{0};
+
+}  // namespace
+
 #define CHECK_VASTATUS(va_status, func)                                 \
     if (va_status != VA_STATUS_SUCCESS) {                               \
         fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
@@ -103,58 +118,8 @@ static constexpr unsigned int MaxFrameNum = (2<<16);
 static constexpr unsigned int MaxPicOrderCntLsb = (2<<8);
 static constexpr unsigned int Log2MaxFrameNum = 16;
 static constexpr unsigned int Log2MaxPicOrderCntLsb = 8;
-static constexpr int rc_default_modes[] = {  // Priority list of modes.
-    VA_RC_VBR,
-    VA_RC_CQP,
-    VA_RC_VBR_CONSTRAINED,
-    VA_RC_CBR,
-    VA_RC_VCM,
-    VA_RC_NONE,
-};
-
-/* thread to save coded data */
-#define SRC_SURFACE_FREE        0
-#define SRC_SURFACE_IN_ENCODING 1
-    
-using namespace std;
-
-FrameReorderer::FrameReorderer(unsigned queue_length, int width, int height)
-    : queue_length(queue_length), width(width), height(height)
-{
-       for (unsigned i = 0; i < queue_length; ++i) {
-               owner.emplace_back(new uint8_t[width * height * 2]);
-               freelist.push(owner.back().get());
-       }
-}
-
-FrameReorderer::Frame FrameReorderer::reorder_frame(int64_t pts, int64_t duration, uint8_t *data, const ReceivedTimestamps &received_ts)
-{
-       if (queue_length == 0) {
-               return Frame{pts, duration, data, received_ts};
-       }
-
-       assert(!freelist.empty());
-       uint8_t *storage = freelist.top();
-       freelist.pop();
-       memcpy(storage, data, width * height * 2);
-       frames.push(Frame{pts, duration, storage, received_ts});
-
-       if (frames.size() >= queue_length) {
-               return get_first_frame();
-       } else {
-               return Frame{-1, -1, nullptr, steady_clock::time_point::min(), steady_clock::time_point::min()};
-       }
-}
-
-FrameReorderer::Frame FrameReorderer::get_first_frame()
-{
-       assert(!frames.empty());
-       Frame storage = frames.top();
-       frames.pop();
-       freelist.push(storage.data);
-       return storage;
-}
 
+using namespace std;
 
 // Supposedly, vaRenderPicture() destroys the buffer implicitly,
 // but if we don't delete it here, we get leaks. The GStreamer implementation
@@ -300,7 +265,7 @@ static void nal_header(bitstream *bs, int nal_ref_idc, int nal_unit_type)
     bitstream_put_ui(bs, nal_unit_type, 5);
 }
 
-void QuickSyncEncoderImpl::sps_rbsp(bitstream *bs)
+void QuickSyncEncoderImpl::sps_rbsp(YCbCrLumaCoefficients ycbcr_coefficients, bitstream *bs)
 {
     int profile_idc = PROFILE_IDC_BASELINE;
 
@@ -360,6 +325,7 @@ void QuickSyncEncoderImpl::sps_rbsp(bitstream *bs)
     if ( false ) {
         bitstream_put_ui(bs, 0, 1); /* vui_parameters_present_flag */
     } else {
+        // See H.264 annex E for the definition of this header.
         bitstream_put_ui(bs, 1, 1); /* vui_parameters_present_flag */
         bitstream_put_ui(bs, 0, 1); /* aspect_ratio_info_present_flag */
         bitstream_put_ui(bs, 0, 1); /* overscan_info_present_flag */
@@ -370,8 +336,13 @@ void QuickSyncEncoderImpl::sps_rbsp(bitstream *bs)
             bitstream_put_ui(bs, 1, 1);  /* colour_description_present_flag */
             {
                 bitstream_put_ui(bs, 1, 8);  /* colour_primaries (1 = BT.709) */
-                bitstream_put_ui(bs, 2, 8);  /* transfer_characteristics (2 = unspecified, since we use sRGB) */
-                bitstream_put_ui(bs, 6, 8);  /* matrix_coefficients (6 = BT.601/SMPTE 170M) */
+                bitstream_put_ui(bs, 13, 8);  /* transfer_characteristics (13 = sRGB) */
+                if (ycbcr_coefficients == YCBCR_REC_709) {
+                    bitstream_put_ui(bs, 1, 8);  /* matrix_coefficients (1 = BT.709) */
+                } else {
+                    assert(ycbcr_coefficients == YCBCR_REC_601);
+                    bitstream_put_ui(bs, 6, 8);  /* matrix_coefficients (6 = BT.601/SMPTE 170M) */
+                }
             }
         }
         bitstream_put_ui(bs, 0, 1); /* chroma_loc_info_present_flag */
@@ -551,14 +522,14 @@ int QuickSyncEncoderImpl::build_packed_pic_buffer(unsigned char **header_buffer)
 }
 
 int
-QuickSyncEncoderImpl::build_packed_seq_buffer(unsigned char **header_buffer)
+QuickSyncEncoderImpl::build_packed_seq_buffer(YCbCrLumaCoefficients ycbcr_coefficients, unsigned char **header_buffer)
 {
     bitstream bs;
 
     bitstream_start(&bs);
     nal_start_code_prefix(&bs);
     nal_header(&bs, NAL_REF_IDC_HIGH, NAL_SPS);
-    sps_rbsp(&bs);
+    sps_rbsp(ycbcr_coefficients, &bs);
     bitstream_end(&bs);
 
     *header_buffer = (unsigned char *)bs.buffer;
@@ -735,29 +706,12 @@ void encoding2display_order(
 }
 
 
-static const char *rc_to_string(int rc_mode)
-{
-    switch (rc_mode) {
-    case VA_RC_NONE:
-        return "NONE";
-    case VA_RC_CBR:
-        return "CBR";
-    case VA_RC_VBR:
-        return "VBR";
-    case VA_RC_VCM:
-        return "VCM";
-    case VA_RC_CQP:
-        return "CQP";
-    case VA_RC_VBR_CONSTRAINED:
-        return "VBR_CONSTRAINED";
-    default:
-        return "Unknown";
-    }
-}
-
 void QuickSyncEncoderImpl::enable_zerocopy_if_possible()
 {
-       if (global_flags.uncompressed_video_to_http) {
+       if (global_flags.x264_video_to_disk) {
+               // Quick Sync is entirely disabled.
+               use_zerocopy = false;
+       } else if (global_flags.uncompressed_video_to_http) {
                fprintf(stderr, "Disabling zerocopy H.264 encoding due to --http-uncompressed-video.\n");
                use_zerocopy = false;
        } else if (global_flags.x264_video_to_http) {
@@ -766,6 +720,7 @@ void QuickSyncEncoderImpl::enable_zerocopy_if_possible()
        } else {
                use_zerocopy = true;
        }
+       global_flags.use_zerocopy = use_zerocopy;
 }
 
 VADisplay QuickSyncEncoderImpl::va_open_display(const string &va_display)
@@ -776,7 +731,6 @@ VADisplay QuickSyncEncoderImpl::va_open_display(const string &va_display)
                        fprintf(stderr, "error: can't connect to X server!\n");
                        return NULL;
                }
-               enable_zerocopy_if_possible();
                return vaGetDisplay(x11_display);
        } else if (va_display[0] != '/') {
                x11_display = XOpenDisplay(va_display.c_str());
@@ -784,7 +738,6 @@ VADisplay QuickSyncEncoderImpl::va_open_display(const string &va_display)
                        fprintf(stderr, "error: can't connect to X server!\n");
                        return NULL;
                }
-               enable_zerocopy_if_possible();
                return vaGetDisplay(x11_display);
        } else {
                drm_fd = open(va_display.c_str(), O_RDWR);
@@ -898,23 +851,13 @@ int QuickSyncEncoderImpl::init_va(const string &va_display)
     }
     
     if (attrib[VAConfigAttribRateControl].value != VA_ATTRIB_NOT_SUPPORTED) {
-        int tmp = attrib[VAConfigAttribRateControl].value;
-
-        if (rc_mode == -1 || !(rc_mode & tmp))  {
-            if (rc_mode != -1) {
-                printf("Warning: Don't support the specified RateControl mode: %s!!!, switch to ", rc_to_string(rc_mode));
-            }
-
-            for (i = 0; i < sizeof(rc_default_modes) / sizeof(rc_default_modes[0]); i++) {
-                if (rc_default_modes[i] & tmp) {
-                    rc_mode = rc_default_modes[i];
-                    break;
-                }
-            }
+        if (!(attrib[VAConfigAttribRateControl].value & VA_RC_CQP)) {
+            fprintf(stderr, "ERROR: VA-API encoder does not support CQP mode.\n");
+            exit(1);
         }
 
         config_attrib[config_attrib_num].type = VAConfigAttribRateControl;
-        config_attrib[config_attrib_num].value = rc_mode;
+        config_attrib[config_attrib_num].value = VA_RC_CQP;
         config_attrib_num++;
     }
     
@@ -962,88 +905,87 @@ int QuickSyncEncoderImpl::init_va(const string &va_display)
 
 int QuickSyncEncoderImpl::setup_encode()
 {
-    VAStatus va_status;
-    VASurfaceID *tmp_surfaceid;
-    int codedbuf_size, i;
-    VASurfaceID src_surface[SURFACE_NUM];
-    VASurfaceID ref_surface[SURFACE_NUM];
-    
-    va_status = vaCreateConfig(va_dpy, h264_profile, VAEntrypointEncSlice,
-            &config_attrib[0], config_attrib_num, &config_id);
-    CHECK_VASTATUS(va_status, "vaCreateConfig");
-
-    /* create source surfaces */
-    va_status = vaCreateSurfaces(va_dpy,
-                                 VA_RT_FORMAT_YUV420, frame_width_mbaligned, frame_height_mbaligned,
-                                 &src_surface[0], SURFACE_NUM,
-                                 NULL, 0);
-    CHECK_VASTATUS(va_status, "vaCreateSurfaces");
-
-    /* create reference surfaces */
-    va_status = vaCreateSurfaces(va_dpy,
-                                 VA_RT_FORMAT_YUV420, frame_width_mbaligned, frame_height_mbaligned,
-                                &ref_surface[0], SURFACE_NUM,
-                                NULL, 0);
-    CHECK_VASTATUS(va_status, "vaCreateSurfaces");
-
-    tmp_surfaceid = (VASurfaceID *)calloc(2 * SURFACE_NUM, sizeof(VASurfaceID));
-    memcpy(tmp_surfaceid, src_surface, SURFACE_NUM * sizeof(VASurfaceID));
-    memcpy(tmp_surfaceid + SURFACE_NUM, ref_surface, SURFACE_NUM * sizeof(VASurfaceID));
-    
-    /* Create a context for this encode pipe */
-    va_status = vaCreateContext(va_dpy, config_id,
-                                frame_width_mbaligned, frame_height_mbaligned,
-                                VA_PROGRESSIVE,
-                                tmp_surfaceid, 2 * SURFACE_NUM,
-                                &context_id);
-    CHECK_VASTATUS(va_status, "vaCreateContext");
-    free(tmp_surfaceid);
-
-    codedbuf_size = (frame_width_mbaligned * frame_height_mbaligned * 400) / (16*16);
-
-    for (i = 0; i < SURFACE_NUM; i++) {
-        /* create coded buffer once for all
-         * other VA buffers which won't be used again after vaRenderPicture.
-         * so APP can always vaCreateBuffer for every frame
-         * but coded buffer need to be mapped and accessed after vaRenderPicture/vaEndPicture
-         * so VA won't maintain the coded buffer
-         */
-        va_status = vaCreateBuffer(va_dpy, context_id, VAEncCodedBufferType,
-                codedbuf_size, 1, NULL, &gl_surfaces[i].coded_buf);
-        CHECK_VASTATUS(va_status, "vaCreateBuffer");
-    }
+       if (!global_flags.x264_video_to_disk) {
+               VAStatus va_status;
+               VASurfaceID *tmp_surfaceid;
+               int codedbuf_size;
+               VASurfaceID src_surface[SURFACE_NUM];
+               VASurfaceID ref_surface[SURFACE_NUM];
+
+               va_status = vaCreateConfig(va_dpy, h264_profile, VAEntrypointEncSlice,
+                               &config_attrib[0], config_attrib_num, &config_id);
+               CHECK_VASTATUS(va_status, "vaCreateConfig");
+
+               /* create source surfaces */
+               va_status = vaCreateSurfaces(va_dpy,
+                               VA_RT_FORMAT_YUV420, frame_width_mbaligned, frame_height_mbaligned,
+                               &src_surface[0], SURFACE_NUM,
+                               NULL, 0);
+               CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+               /* create reference surfaces */
+               va_status = vaCreateSurfaces(va_dpy,
+                               VA_RT_FORMAT_YUV420, frame_width_mbaligned, frame_height_mbaligned,
+                               &ref_surface[0], SURFACE_NUM,
+                               NULL, 0);
+               CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+               tmp_surfaceid = (VASurfaceID *)calloc(2 * SURFACE_NUM, sizeof(VASurfaceID));
+               memcpy(tmp_surfaceid, src_surface, SURFACE_NUM * sizeof(VASurfaceID));
+               memcpy(tmp_surfaceid + SURFACE_NUM, ref_surface, SURFACE_NUM * sizeof(VASurfaceID));
+
+               for (int i = 0; i < SURFACE_NUM; i++) {
+                       gl_surfaces[i].src_surface = src_surface[i];
+                       gl_surfaces[i].ref_surface = ref_surface[i];
+               }
 
-    /* create OpenGL objects */
-    //glGenFramebuffers(SURFACE_NUM, fbos);
-    
-    for (i = 0; i < SURFACE_NUM; i++) {
-        if (use_zerocopy) {
-            gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R8, 1, 1);
-            gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG8, 1, 1);
-        } else {
-            gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R8, frame_width, frame_height);
-            gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG8, frame_width / 2, frame_height / 2);
-
-            // Generate a PBO to read into. It doesn't necessarily fit 1:1 with the VA-API
-            // buffers, due to potentially differing pitch.
-            glGenBuffers(1, &gl_surfaces[i].pbo);
-            glBindBuffer(GL_PIXEL_PACK_BUFFER, gl_surfaces[i].pbo);
-            glBufferStorage(GL_PIXEL_PACK_BUFFER, frame_width * frame_height * 2, nullptr, GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT);
-            uint8_t *ptr = (uint8_t *)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, frame_width * frame_height * 2, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
-            gl_surfaces[i].y_offset = 0;
-            gl_surfaces[i].cbcr_offset = frame_width * frame_height;
-            gl_surfaces[i].y_ptr = ptr + gl_surfaces[i].y_offset;
-            gl_surfaces[i].cbcr_ptr = ptr + gl_surfaces[i].cbcr_offset;
-            glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
-        }
-    }
+               /* Create a context for this encode pipe */
+               va_status = vaCreateContext(va_dpy, config_id,
+                               frame_width_mbaligned, frame_height_mbaligned,
+                               VA_PROGRESSIVE,
+                               tmp_surfaceid, 2 * SURFACE_NUM,
+                               &context_id);
+               CHECK_VASTATUS(va_status, "vaCreateContext");
+               free(tmp_surfaceid);
+
+               codedbuf_size = (frame_width_mbaligned * frame_height_mbaligned * 400) / (16*16);
+
+               for (int i = 0; i < SURFACE_NUM; i++) {
+                       /* Create the coded buffers once, up front. Other VA buffers
+                        * are not used again after vaRenderPicture(), so the application
+                        * can simply call vaCreateBuffer() for them on every frame.
+                        * The coded buffer, however, needs to be mapped and accessed after
+                        * vaRenderPicture()/vaEndPicture(), so VA won't maintain it for us.
+                        */
+                       va_status = vaCreateBuffer(va_dpy, context_id, VAEncCodedBufferType,
+                                       codedbuf_size, 1, NULL, &gl_surfaces[i].coded_buf);
+                       CHECK_VASTATUS(va_status, "vaCreateBuffer");
+               }
+       }
 
-    for (i = 0; i < SURFACE_NUM; i++) {
-        gl_surfaces[i].src_surface = src_surface[i];
-        gl_surfaces[i].ref_surface = ref_surface[i];
-    }
-    
-    return 0;
+       /* create OpenGL objects */
+       for (int i = 0; i < SURFACE_NUM; i++) {
+               if (use_zerocopy) {
+                       gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R8, 1, 1);
+                       gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG8, 1, 1);
+               } else {
+                       size_t bytes_per_pixel = (global_flags.x264_bit_depth > 8) ? 2 : 1;
+
+                       // Generate a PBO to read into. It doesn't necessarily fit 1:1 with the VA-API
+                       // buffers, due to potentially differing pitch.
+                       glGenBuffers(1, &gl_surfaces[i].pbo);
+                       glBindBuffer(GL_PIXEL_PACK_BUFFER, gl_surfaces[i].pbo);
+                       glBufferStorage(GL_PIXEL_PACK_BUFFER, frame_width * frame_height * 2 * bytes_per_pixel, nullptr, GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT);
+                       uint8_t *ptr = (uint8_t *)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, frame_width * frame_height * 2 * bytes_per_pixel, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+                       gl_surfaces[i].y_offset = 0;
+                       gl_surfaces[i].cbcr_offset = frame_width * frame_height * bytes_per_pixel;
+                       gl_surfaces[i].y_ptr = ptr + gl_surfaces[i].y_offset;
+                       gl_surfaces[i].cbcr_ptr = ptr + gl_surfaces[i].cbcr_offset;
+                       glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+               }
+       }
+
+       return 0;
 }
 
 // Given a list like 1 9 3 0 2 8 4 and a pivot element 3, will produce
@@ -1057,51 +999,65 @@ static void sort_two(T *begin, T *end, const T &pivot, const C &less_than)
        sort(middle, end, less_than);
 }
 
-void QuickSyncEncoderImpl::update_ReferenceFrames(int frame_type)
+void QuickSyncEncoderImpl::update_ReferenceFrames(int current_display_frame, int frame_type)
 {
-    int i;
-    
     if (frame_type == FRAME_B)
         return;
 
+    pic_param.CurrPic.frame_idx = current_ref_frame_num;
+
     CurrentCurrPic.flags = VA_PICTURE_H264_SHORT_TERM_REFERENCE;
-    numShortTerm++;
-    if (numShortTerm > num_ref_frames)
-        numShortTerm = num_ref_frames;
-    for (i=numShortTerm-1; i>0; i--)
-        ReferenceFrames[i] = ReferenceFrames[i-1];
-    ReferenceFrames[0] = CurrentCurrPic;
+    unique_lock<mutex> lock(storage_task_queue_mutex);
+
+    // Insert the new frame at the start of the reference queue.
+    reference_frames.push_front(ReferenceFrame{ CurrentCurrPic, current_display_frame });
+
+    if (reference_frames.size() > num_ref_frames)
+    {
+        // The back frame is no longer in use as a reference.
+        int display_frame_num = reference_frames.back().display_number;
+        assert(surface_for_frame.count(display_frame_num));
+        release_gl_surface(display_frame_num);
+        reference_frames.pop_back();
+    }
+
+    // Mark this frame in use as a reference.
+    assert(surface_for_frame.count(current_display_frame));
+    ++surface_for_frame[current_display_frame]->refcount;
     
-    current_frame_num++;
-    if (current_frame_num > MaxFrameNum)
-        current_frame_num = 0;
+    current_ref_frame_num++;
+    if (current_ref_frame_num > MaxFrameNum)
+        current_ref_frame_num = 0;
 }
 
 
-int QuickSyncEncoderImpl::update_RefPicList(int frame_type)
+void QuickSyncEncoderImpl::update_RefPicList_P(VAPictureH264 RefPicList0_P[MAX_NUM_REF2])
 {
     const auto descending_by_frame_idx = [](const VAPictureH264 &a, const VAPictureH264 &b) {
         return a.frame_idx > b.frame_idx;
     };
+
+    for (size_t i = 0; i < reference_frames.size(); ++i) {
+        RefPicList0_P[i] = reference_frames[i].pic;
+    }
+    sort(&RefPicList0_P[0], &RefPicList0_P[reference_frames.size()], descending_by_frame_idx);
+}
+
+void QuickSyncEncoderImpl::update_RefPicList_B(VAPictureH264 RefPicList0_B[MAX_NUM_REF2], VAPictureH264 RefPicList1_B[MAX_NUM_REF2])
+{
     const auto ascending_by_top_field_order_cnt = [](const VAPictureH264 &a, const VAPictureH264 &b) {
         return a.TopFieldOrderCnt < b.TopFieldOrderCnt;
     };
     const auto descending_by_top_field_order_cnt = [](const VAPictureH264 &a, const VAPictureH264 &b) {
         return a.TopFieldOrderCnt > b.TopFieldOrderCnt;
     };
-    
-    if (frame_type == FRAME_P) {
-        memcpy(RefPicList0_P, ReferenceFrames, numShortTerm * sizeof(VAPictureH264));
-        sort(&RefPicList0_P[0], &RefPicList0_P[numShortTerm], descending_by_frame_idx);
-    } else if (frame_type == FRAME_B) {
-        memcpy(RefPicList0_B, ReferenceFrames, numShortTerm * sizeof(VAPictureH264));
-        sort_two(&RefPicList0_B[0], &RefPicList0_B[numShortTerm], CurrentCurrPic, ascending_by_top_field_order_cnt);
 
-        memcpy(RefPicList1_B, ReferenceFrames, numShortTerm * sizeof(VAPictureH264));
-        sort_two(&RefPicList1_B[0], &RefPicList1_B[numShortTerm], CurrentCurrPic, descending_by_top_field_order_cnt);
+    for (size_t i = 0; i < reference_frames.size(); ++i) {
+        RefPicList0_B[i] = reference_frames[i].pic;
+        RefPicList1_B[i] = reference_frames[i].pic;
     }
-    
-    return 0;
+    sort_two(&RefPicList0_B[0], &RefPicList0_B[reference_frames.size()], CurrentCurrPic, ascending_by_top_field_order_cnt);
+    sort_two(&RefPicList1_B[0], &RefPicList1_B[reference_frames.size()], CurrentCurrPic, descending_by_top_field_order_cnt);
 }
 
 
@@ -1203,21 +1159,23 @@ static int calc_poc(int pic_order_cnt_lsb, int frame_type)
     return TopFieldOrderCnt;
 }
 
-int QuickSyncEncoderImpl::render_picture(int frame_type, int display_frame_num, int gop_start_display_frame_num)
+int QuickSyncEncoderImpl::render_picture(GLSurface *surf, int frame_type, int display_frame_num, int gop_start_display_frame_num)
 {
     VABufferID pic_param_buf;
     VAStatus va_status;
-    int i = 0;
+    size_t i = 0;
 
-    pic_param.CurrPic.picture_id = gl_surfaces[display_frame_num % SURFACE_NUM].ref_surface;
-    pic_param.CurrPic.frame_idx = current_frame_num;
+    pic_param.CurrPic.picture_id = surf->ref_surface;
+    pic_param.CurrPic.frame_idx = current_ref_frame_num;
     pic_param.CurrPic.flags = 0;
     pic_param.CurrPic.TopFieldOrderCnt = calc_poc((display_frame_num - gop_start_display_frame_num) % MaxPicOrderCntLsb, frame_type);
     pic_param.CurrPic.BottomFieldOrderCnt = pic_param.CurrPic.TopFieldOrderCnt;
     CurrentCurrPic = pic_param.CurrPic;
 
-    memcpy(pic_param.ReferenceFrames, ReferenceFrames, numShortTerm*sizeof(VAPictureH264));
-    for (i = numShortTerm; i < MAX_NUM_REF1; i++) {
+    for (i = 0; i < reference_frames.size(); i++) {
+        pic_param.ReferenceFrames[i] = reference_frames[i].pic;
+    }
+    for (i = reference_frames.size(); i < MAX_NUM_REF1; i++) {
         pic_param.ReferenceFrames[i].picture_id = VA_INVALID_SURFACE;
         pic_param.ReferenceFrames[i].flags = VA_PICTURE_H264_INVALID;
     }
@@ -1226,8 +1184,8 @@ int QuickSyncEncoderImpl::render_picture(int frame_type, int display_frame_num,
     pic_param.pic_fields.bits.reference_pic_flag = (frame_type != FRAME_B);
     pic_param.pic_fields.bits.entropy_coding_mode_flag = h264_entropy_mode;
     pic_param.pic_fields.bits.deblocking_filter_control_present_flag = 1;
-    pic_param.frame_num = current_frame_num;
-    pic_param.coded_buf = gl_surfaces[display_frame_num % SURFACE_NUM].coded_buf;
+    pic_param.frame_num = current_ref_frame_num;  // FIXME: is this correct?
+    pic_param.coded_buf = surf->coded_buf;
     pic_param.last_picture = false;  // FIXME
     pic_param.pic_init_qp = initial_qp;
 
@@ -1240,7 +1198,7 @@ int QuickSyncEncoderImpl::render_picture(int frame_type, int display_frame_num,
     return 0;
 }
 
-int QuickSyncEncoderImpl::render_packedsequence()
+int QuickSyncEncoderImpl::render_packedsequence(YCbCrLumaCoefficients ycbcr_coefficients)
 {
     VAEncPackedHeaderParameterBuffer packedheader_param_buffer;
     VABufferID packedseq_para_bufid, packedseq_data_bufid, render_id[2];
@@ -1248,7 +1206,7 @@ int QuickSyncEncoderImpl::render_packedsequence()
     unsigned char *packedseq_buffer = NULL;
     VAStatus va_status;
 
-    length_in_bits = build_packed_seq_buffer(&packedseq_buffer); 
+    length_in_bits = build_packed_seq_buffer(ycbcr_coefficients, &packedseq_buffer); 
     
     packedheader_param_buffer.type = VAEncPackedHeaderSequence;
     
@@ -1354,8 +1312,6 @@ int QuickSyncEncoderImpl::render_slice(int encoding_frame_num, int display_frame
     VAStatus va_status;
     int i;
 
-    update_RefPicList(frame_type);
-    
     /* one frame, one slice */
     slice_param.macroblock_address = 0;
     slice_param.num_macroblocks = frame_width_mbaligned * frame_height_mbaligned/(16*16); /* Measured by MB */
@@ -1364,6 +1320,9 @@ int QuickSyncEncoderImpl::render_slice(int encoding_frame_num, int display_frame
         if (encoding_frame_num != 0)
             ++slice_param.idr_pic_id;
     } else if (frame_type == FRAME_P) {
+        VAPictureH264 RefPicList0_P[MAX_NUM_REF2];
+        update_RefPicList_P(RefPicList0_P);
+
         int refpiclist0_max = h264_maxref & 0xffff;
         memcpy(slice_param.RefPicList0, RefPicList0_P, refpiclist0_max*sizeof(VAPictureH264));
 
@@ -1372,6 +1331,9 @@ int QuickSyncEncoderImpl::render_slice(int encoding_frame_num, int display_frame
             slice_param.RefPicList0[i].flags = VA_PICTURE_H264_INVALID;
         }
     } else if (frame_type == FRAME_B) {
+        VAPictureH264 RefPicList0_B[MAX_NUM_REF2], RefPicList1_B[MAX_NUM_REF2];
+        update_RefPicList_B(RefPicList0_B, RefPicList1_B);
+
         int refpiclist0_max = h264_maxref & 0xffff;
         int refpiclist1_max = (h264_maxref >> 16) & 0xffff;
 
@@ -1409,24 +1371,24 @@ int QuickSyncEncoderImpl::render_slice(int encoding_frame_num, int display_frame
 
 
 
-void QuickSyncEncoderImpl::save_codeddata(storage_task task)
+void QuickSyncEncoderImpl::save_codeddata(GLSurface *surf, storage_task task)
 {    
        VACodedBufferSegment *buf_list = NULL;
        VAStatus va_status;
 
        string data;
 
-       va_status = vaMapBuffer(va_dpy, gl_surfaces[task.display_order % SURFACE_NUM].coded_buf, (void **)(&buf_list));
+       va_status = vaMapBuffer(va_dpy, surf->coded_buf, (void **)(&buf_list));
        CHECK_VASTATUS(va_status, "vaMapBuffer");
        while (buf_list != NULL) {
                data.append(reinterpret_cast<const char *>(buf_list->buf), buf_list->size);
                buf_list = (VACodedBufferSegment *) buf_list->next;
        }
-       vaUnmapBuffer(va_dpy, gl_surfaces[task.display_order % SURFACE_NUM].coded_buf);
+       vaUnmapBuffer(va_dpy, surf->coded_buf);
 
        static int frameno = 0;
-       print_latency("Current QuickSync latency (video inputs → disk mux):",
-               task.received_ts, (task.frame_type == FRAME_B), &frameno);
+       print_latency("Current Quick Sync latency (video inputs → disk mux):",
+               task.received_ts, (task.frame_type == FRAME_B), &frameno, &qs_latency_histogram);
 
        {
                // Add video.
@@ -1463,8 +1425,10 @@ void QuickSyncEncoderImpl::storage_task_enqueue(storage_task task)
 
 void QuickSyncEncoderImpl::storage_task_thread()
 {
+       pthread_setname_np(pthread_self(), "QS_Storage");
        for ( ;; ) {
                storage_task current;
+               GLSurface *surf;
                {
                        // wait until there's an encoded frame  
                        unique_lock<mutex> lock(storage_task_queue_mutex);
@@ -1472,19 +1436,28 @@ void QuickSyncEncoderImpl::storage_task_thread()
                        if (storage_thread_should_quit && storage_task_queue.empty()) return;
                        current = move(storage_task_queue.front());
                        storage_task_queue.pop();
+                       surf = surface_for_frame[current.display_order];
+                       assert(surf != nullptr);
                }
 
                VAStatus va_status;
+
+               size_t display_order = current.display_order;
+               vector<size_t> ref_display_frame_numbers = move(current.ref_display_frame_numbers);
           
                // waits for data, then saves it to disk.
-               va_status = vaSyncSurface(va_dpy, gl_surfaces[current.display_order % SURFACE_NUM].src_surface);
+               va_status = vaSyncSurface(va_dpy, surf->src_surface);
                CHECK_VASTATUS(va_status, "vaSyncSurface");
-               save_codeddata(move(current));
+               save_codeddata(surf, move(current));
 
+               // Unlock the frame, and all its references.
                {
                        unique_lock<mutex> lock(storage_task_queue_mutex);
-                       srcsurface_status[current.display_order % SURFACE_NUM] = SRC_SURFACE_FREE;
-                       storage_task_queue_changed.notify_all();
+                       release_gl_surface(display_order);
+
+                       for (size_t frame_num : ref_display_frame_numbers) {
+                               release_gl_surface(frame_num);
+                       }
                }
        }
 }
@@ -1509,14 +1482,15 @@ void QuickSyncEncoderImpl::release_gl_resources()
        }
 
        for (unsigned i = 0; i < SURFACE_NUM; i++) {
-               if (!use_zerocopy) {
+               if (use_zerocopy) {
+                       resource_pool->release_2d_texture(gl_surfaces[i].y_tex);
+                       resource_pool->release_2d_texture(gl_surfaces[i].cbcr_tex);
+               } else {
                        glBindBuffer(GL_PIXEL_PACK_BUFFER, gl_surfaces[i].pbo);
                        glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
                        glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
                        glDeleteBuffers(1, &gl_surfaces[i].pbo);
                }
-               resource_pool->release_2d_texture(gl_surfaces[i].y_tex);
-               resource_pool->release_2d_texture(gl_surfaces[i].cbcr_tex);
        }
 
        has_released_gl_resources = true;
@@ -1531,7 +1505,7 @@ int QuickSyncEncoderImpl::deinit_va()
     return 0;
 }
 
-QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator)
+QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator)
        : current_storage_frame(0), resource_pool(resource_pool), surface(surface), x264_encoder(x264_encoder), frame_width(width), frame_height(height), disk_space_estimator(disk_space_estimator)
 {
        file_audio_encoder.reset(new AudioEncoder(AUDIO_OUTPUT_CODEC_NAME, DEFAULT_AUDIO_OUTPUT_BIT_RATE, oformat));
@@ -1543,30 +1517,36 @@ QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, movit::R
 
        //print_input();
 
-       if (global_flags.uncompressed_video_to_http ||
-           global_flags.x264_video_to_http) {
-               reorderer.reset(new FrameReorderer(ip_period - 1, frame_width, frame_height));
-       }
-       if (global_flags.x264_video_to_http) {
+       if (global_flags.x264_video_to_http || global_flags.x264_video_to_disk) {
                assert(x264_encoder != nullptr);
        } else {
                assert(x264_encoder == nullptr);
        }
 
-       init_va(va_display);
+       enable_zerocopy_if_possible();
+       if (!global_flags.x264_video_to_disk) {
+               init_va(va_display);
+       }
        setup_encode();
 
-       // No frames are ready yet.
-       memset(srcsurface_status, SRC_SURFACE_FREE, sizeof(srcsurface_status));
-           
-       memset(&seq_param, 0, sizeof(seq_param));
-       memset(&pic_param, 0, sizeof(pic_param));
-       memset(&slice_param, 0, sizeof(slice_param));
+       if (!global_flags.x264_video_to_disk) {
+               memset(&seq_param, 0, sizeof(seq_param));
+               memset(&pic_param, 0, sizeof(pic_param));
+               memset(&slice_param, 0, sizeof(slice_param));
+       }
+
+       call_once(quick_sync_metrics_inited, [](){
+               mixer_latency_histogram.init("mixer");
+               qs_latency_histogram.init("quick_sync");
+               current_file_mux_metrics.init({{ "destination", "current_file" }});
+               total_mux_metrics.init({{ "destination", "files_total" }});
+               global_metrics.add("current_file_start_time_seconds", &metric_current_file_start_time_seconds, Metrics::TYPE_GAUGE);
+               global_metrics.add("quick_sync_stalled_frames", &metric_quick_sync_stalled_frames);
+       });
 
        storage_thread = thread(&QuickSyncEncoderImpl::storage_task_thread, this);
 
        encode_thread = thread([this]{
-               //SDL_GL_MakeCurrent(window, context);
                QOpenGLContext *context = create_context(this->surface);
                eglBindAPI(EGL_OPENGL_API);
                if (!make_current(context, this->surface)) {
@@ -1585,89 +1565,139 @@ QuickSyncEncoderImpl::~QuickSyncEncoderImpl()
        release_gl_resources();
 }
 
-bool QuickSyncEncoderImpl::begin_frame(GLuint *y_tex, GLuint *cbcr_tex)
+QuickSyncEncoderImpl::GLSurface *QuickSyncEncoderImpl::allocate_gl_surface()
+{
+       for (unsigned i = 0; i < SURFACE_NUM; ++i) {
+               if (gl_surfaces[i].refcount == 0) {
+                       ++gl_surfaces[i].refcount;
+                       return &gl_surfaces[i];
+               }
+       }
+       return nullptr;
+}
+
+void QuickSyncEncoderImpl::release_gl_surface(size_t display_frame_num)
+{
+       assert(surface_for_frame.count(display_frame_num));
+       QuickSyncEncoderImpl::GLSurface *surf = surface_for_frame[display_frame_num];
+       if (--surf->refcount == 0) {
+               assert(surface_for_frame.count(display_frame_num));
+               surface_for_frame.erase(display_frame_num);
+               storage_task_queue_changed.notify_all();
+       }
+}
+
+bool QuickSyncEncoderImpl::is_zerocopy() const
+{
+       return use_zerocopy;
+}
+
+bool QuickSyncEncoderImpl::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
        assert(!is_shutdown);
+       GLSurface *surf = nullptr;
        {
                // Wait until this frame slot is done encoding.
                unique_lock<mutex> lock(storage_task_queue_mutex);
-               if (srcsurface_status[current_storage_frame % SURFACE_NUM] != SRC_SURFACE_FREE) {
-                       fprintf(stderr, "Warning: Slot %d (for frame %d) is still encoding, rendering has to wait for H.264 encoder\n",
-                               current_storage_frame % SURFACE_NUM, current_storage_frame);
+               surf = allocate_gl_surface();
+               if (surf == nullptr) {
+                       fprintf(stderr, "Warning: No free slots for frame %d, rendering has to wait for H.264 encoder\n",
+                               current_storage_frame);
+                       ++metric_quick_sync_stalled_frames;
+                       storage_task_queue_changed.wait(lock, [this, &surf]{
+                               if (storage_thread_should_quit)
+                                       return true;
+                               surf = allocate_gl_surface();
+                               return surf != nullptr;
+                       });
                }
-               storage_task_queue_changed.wait(lock, [this]{ return storage_thread_should_quit || (srcsurface_status[current_storage_frame % SURFACE_NUM] == SRC_SURFACE_FREE); });
-               srcsurface_status[current_storage_frame % SURFACE_NUM] = SRC_SURFACE_IN_ENCODING;
                if (storage_thread_should_quit) return false;
+               assert(surf != nullptr);
+               surface_for_frame[current_storage_frame] = surf;
        }
 
-       //*fbo = fbos[current_storage_frame % SURFACE_NUM];
-       GLSurface *surf = &gl_surfaces[current_storage_frame % SURFACE_NUM];
-       *y_tex = surf->y_tex;
-       *cbcr_tex = surf->cbcr_tex;
-
-       VAStatus va_status = vaDeriveImage(va_dpy, surf->src_surface, &surf->surface_image);
-       CHECK_VASTATUS(va_status, "vaDeriveImage");
-
        if (use_zerocopy) {
-               VABufferInfo buf_info;
-               buf_info.mem_type = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;  // or VA_SURFACE_ATTRIB_MEM_TYPE_KERNEL_DRM?
-               va_status = vaAcquireBufferHandle(va_dpy, surf->surface_image.buf, &buf_info);
-               CHECK_VASTATUS(va_status, "vaAcquireBufferHandle");
-
-               // Create Y image.
-               surf->y_egl_image = EGL_NO_IMAGE_KHR;
-               EGLint y_attribs[] = {
-                       EGL_WIDTH, frame_width,
-                       EGL_HEIGHT, frame_height,
-                       EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('R', '8', ' ', ' '),
-                       EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle),
-                       EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[0]),
-                       EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[0]),
-                       EGL_NONE
-               };
-
-               surf->y_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, y_attribs);
-               assert(surf->y_egl_image != EGL_NO_IMAGE_KHR);
-
-               // Associate Y image to a texture.
-               glBindTexture(GL_TEXTURE_2D, *y_tex);
-               glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->y_egl_image);
-
-               // Create CbCr image.
-               surf->cbcr_egl_image = EGL_NO_IMAGE_KHR;
-               EGLint cbcr_attribs[] = {
-                       EGL_WIDTH, frame_width,
-                       EGL_HEIGHT, frame_height,
-                       EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('G', 'R', '8', '8'),
-                       EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle),
-                       EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[1]),
-                       EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[1]),
-                       EGL_NONE
-               };
-
-               surf->cbcr_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, cbcr_attribs);
-               assert(surf->cbcr_egl_image != EGL_NO_IMAGE_KHR);
-
-               // Associate CbCr image to a texture.
-               glBindTexture(GL_TEXTURE_2D, *cbcr_tex);
-               glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->cbcr_egl_image);
+               *y_tex = surf->y_tex;
+               *cbcr_tex = surf->cbcr_tex;
+       } else {
+               surf->y_tex = *y_tex;
+               surf->cbcr_tex = *cbcr_tex;
+       }
+
+       if (!global_flags.x264_video_to_disk) {
+               VAStatus va_status = vaDeriveImage(va_dpy, surf->src_surface, &surf->surface_image);
+               CHECK_VASTATUS(va_status, "vaDeriveImage");
+
+               if (use_zerocopy) {
+                       VABufferInfo buf_info;
+                       buf_info.mem_type = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;  // or VA_SURFACE_ATTRIB_MEM_TYPE_KERNEL_DRM?
+                       va_status = vaAcquireBufferHandle(va_dpy, surf->surface_image.buf, &buf_info);
+                       CHECK_VASTATUS(va_status, "vaAcquireBufferHandle");
+
+                       // Create Y image.
+                       surf->y_egl_image = EGL_NO_IMAGE_KHR;
+                       EGLint y_attribs[] = {
+                               EGL_WIDTH, frame_width,
+                               EGL_HEIGHT, frame_height,
+                               EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('R', '8', ' ', ' '),
+                               EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle),
+                               EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[0]),
+                               EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[0]),
+                               EGL_NONE
+                       };
+
+                       surf->y_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, y_attribs);
+                       assert(surf->y_egl_image != EGL_NO_IMAGE_KHR);
+
+                       // Associate Y image to a texture.
+                       glBindTexture(GL_TEXTURE_2D, *y_tex);
+                       glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->y_egl_image);
+
+                       // Create CbCr image.
+                       surf->cbcr_egl_image = EGL_NO_IMAGE_KHR;
+                       EGLint cbcr_attribs[] = {
+                               EGL_WIDTH, frame_width / 2,
+                               EGL_HEIGHT, frame_height / 2,
+                               EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('G', 'R', '8', '8'),
+                               EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle),
+                               EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[1]),
+                               EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[1]),
+                               EGL_NONE
+                       };
+
+                       surf->cbcr_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, cbcr_attribs);
+                       assert(surf->cbcr_egl_image != EGL_NO_IMAGE_KHR);
+
+                       // Associate CbCr image to a texture.
+                       glBindTexture(GL_TEXTURE_2D, *cbcr_tex);
+                       glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->cbcr_egl_image);
+               }
        }
 
+       current_video_frame = PendingFrame{ {}, input_frames, pts, duration, ycbcr_coefficients };
+
        return true;
 }
 
 void QuickSyncEncoderImpl::add_audio(int64_t pts, vector<float> audio)
 {
+       lock_guard<mutex> lock(file_audio_encoder_mutex);
        assert(!is_shutdown);
        file_audio_encoder->encode_audio(audio, pts + global_delay());
 }
 
-RefCountedGLsync QuickSyncEncoderImpl::end_frame(int64_t pts, int64_t duration, const vector<RefCountedFrame> &input_frames)
+RefCountedGLsync QuickSyncEncoderImpl::end_frame()
 {
        assert(!is_shutdown);
 
        if (!use_zerocopy) {
-               GLSurface *surf = &gl_surfaces[current_storage_frame % SURFACE_NUM];
+               GLenum type = global_flags.x264_bit_depth > 8 ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE;
+               GLSurface *surf;
+               {
+                       unique_lock<mutex> lock(storage_task_queue_mutex);
+                       surf = surface_for_frame[current_storage_frame];
+                       assert(surf != nullptr);
+               }
 
                glPixelStorei(GL_PACK_ROW_LENGTH, 0);
                check_error();
@@ -1677,14 +1707,17 @@ RefCountedGLsync QuickSyncEncoderImpl::end_frame(int64_t pts, int64_t duration,
 
                glBindTexture(GL_TEXTURE_2D, surf->y_tex);
                check_error();
-               glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, BUFFER_OFFSET(surf->y_offset));
+               glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, type, BUFFER_OFFSET(surf->y_offset));
                check_error();
 
                glBindTexture(GL_TEXTURE_2D, surf->cbcr_tex);
                check_error();
-               glGetTexImage(GL_TEXTURE_2D, 0, GL_RG, GL_UNSIGNED_BYTE, BUFFER_OFFSET(surf->cbcr_offset));
+               glGetTexImage(GL_TEXTURE_2D, 0, GL_RG, type, BUFFER_OFFSET(surf->cbcr_offset));
                check_error();
 
+               // We don't own these; the caller does.
+               surf->y_tex = surf->cbcr_tex = 0;
+
                glBindTexture(GL_TEXTURE_2D, 0);
                check_error();
                glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
@@ -1701,7 +1734,8 @@ RefCountedGLsync QuickSyncEncoderImpl::end_frame(int64_t pts, int64_t duration,
 
        {
                unique_lock<mutex> lock(frame_queue_mutex);
-               pending_video_frames[current_storage_frame] = PendingFrame{ fence, input_frames, pts, duration };
+               current_video_frame.fence = fence;
+               pending_video_frames.push(move(current_video_frame));
                ++current_storage_frame;
        }
        frame_queue_nonempty.notify_all();
@@ -1729,14 +1763,24 @@ void QuickSyncEncoderImpl::shutdown()
        storage_thread.join();
 
        // Encode any leftover audio in the queues, and also any delayed frames.
-       file_audio_encoder->encode_last_audio();
+       {
+               lock_guard<mutex> lock(file_audio_encoder_mutex);
+               file_audio_encoder->encode_last_audio();
+       }
 
-       release_encode();
-       deinit_va();
-       file_mux.reset();
+       if (!global_flags.x264_video_to_disk) {
+               release_encode();
+               deinit_va();
+       }
        is_shutdown = true;
 }
 
+void QuickSyncEncoderImpl::close_file()
+{
+       file_mux.reset();
+       metric_current_file_start_time_seconds = 0.0 / 0.0;
+}
+
 void QuickSyncEncoderImpl::open_output_file(const std::string &filename)
 {
        AVFormatContext *avctx = avformat_alloc_context();
@@ -1752,90 +1796,124 @@ void QuickSyncEncoderImpl::open_output_file(const std::string &filename)
                exit(1);
        }
 
-       string video_extradata = "";  // FIXME: See other comment about global headers.
-       AVCodecParametersWithDeleter audio_codecpar = file_audio_encoder->get_codec_parameters();
-       file_mux.reset(new Mux(avctx, frame_width, frame_height, Mux::CODEC_H264, video_extradata, audio_codecpar.get(), TIMEBASE,
-               std::bind(&DiskSpaceEstimator::report_write, disk_space_estimator, filename, _1)));
+       string video_extradata;  // FIXME: See other comment about global headers.
+       if (global_flags.x264_video_to_disk) {
+               video_extradata = x264_encoder->get_global_headers();
+       }
+
+       current_file_mux_metrics.reset();
+
+       {
+               lock_guard<mutex> lock(file_audio_encoder_mutex);
+               AVCodecParametersWithDeleter audio_codecpar = file_audio_encoder->get_codec_parameters();
+               file_mux.reset(new Mux(avctx, frame_width, frame_height, Mux::CODEC_H264, video_extradata, audio_codecpar.get(), TIMEBASE,
+                       std::bind(&DiskSpaceEstimator::report_write, disk_space_estimator, filename, _1),
+                       Mux::WRITE_BACKGROUND,
+                       { &current_file_mux_metrics, &total_mux_metrics }));
+       }
+       metric_current_file_start_time_seconds = get_timestamp_for_metrics();
+
+       if (global_flags.x264_video_to_disk) {
+               x264_encoder->add_mux(file_mux.get());
+       }
 }
 
 void QuickSyncEncoderImpl::encode_thread_func()
 {
+       pthread_setname_np(pthread_self(), "QS_Encode");
+
        int64_t last_dts = -1;
        int gop_start_display_frame_num = 0;
-       for (int encoding_frame_num = 0; ; ++encoding_frame_num) {
+       for (int display_frame_num = 0; ; ++display_frame_num) {
+               // Wait for the frame to be in the queue. Note that this only means
+               // we started rendering it.
                PendingFrame frame;
-               int pts_lag;
-               int frame_type, display_frame_num;
-               encoding2display_order(encoding_frame_num, intra_period, intra_idr_period, ip_period,
-                                      &display_frame_num, &frame_type, &pts_lag);
-               if (frame_type == FRAME_IDR) {
-                       numShortTerm = 0;
-                       current_frame_num = 0;
-                       gop_start_display_frame_num = display_frame_num;
-               }
-
                {
                        unique_lock<mutex> lock(frame_queue_mutex);
-                       frame_queue_nonempty.wait(lock, [this, display_frame_num]{
-                               return encode_thread_should_quit || pending_video_frames.count(display_frame_num) != 0;
+                       frame_queue_nonempty.wait(lock, [this]{
+                               return encode_thread_should_quit || !pending_video_frames.empty();
                        });
-                       if (encode_thread_should_quit && pending_video_frames.count(display_frame_num) == 0) {
-                               // We have queued frames that were supposed to be B-frames,
-                               // but will be no P-frame to encode them against. Encode them all
-                               // as P-frames instead. Note that this happens under the mutex,
+                       if (encode_thread_should_quit && pending_video_frames.empty()) {
+                               // We may have queued frames left in the reorder buffer
+                               // that were supposed to be B-frames, but have no P-frame
+                               // to be encoded against. If so, encode them all as
+                               // P-frames instead. Note that this happens under the mutex,
                                // but nobody else uses it at this point, since we're shutting down,
                                // so there's no contention.
-                               encode_remaining_frames_as_p(encoding_frame_num, gop_start_display_frame_num, last_dts);
+                               encode_remaining_frames_as_p(quicksync_encoding_frame_num, gop_start_display_frame_num, last_dts);
                                return;
                        } else {
-                               frame = move(pending_video_frames[display_frame_num]);
-                               pending_video_frames.erase(display_frame_num);
+                               frame = move(pending_video_frames.front());
+                               pending_video_frames.pop();
                        }
                }
 
-               // Determine the dts of this frame.
-               int64_t dts;
-               if (pts_lag == -1) {
-                       assert(last_dts != -1);
-                       dts = last_dts + (TIMEBASE / MAX_FPS);
-               } else {
-                       dts = frame.pts - pts_lag;
+               // Pass the frame on to x264 (or uncompressed to HTTP) as needed.
+               // Note that this implicitly waits for the frame to be done rendering.
+               pass_frame(frame, display_frame_num, frame.pts, frame.duration);
+
+               if (global_flags.x264_video_to_disk) {
+                       unique_lock<mutex> lock(storage_task_queue_mutex);
+                       release_gl_surface(display_frame_num);
+                       continue;
                }
-               last_dts = dts;
 
-               encode_frame(frame, encoding_frame_num, display_frame_num, gop_start_display_frame_num, frame_type, frame.pts, dts, frame.duration);
+               reorder_buffer[display_frame_num] = move(frame);
+
+               // Now encode as many QuickSync frames as we can using the frames we have available.
+               // (It could be zero, or it could be multiple.) FIXME: make a function.
+               for ( ;; ) {
+                       int pts_lag;
+                       int frame_type, quicksync_display_frame_num;
+                       encoding2display_order(quicksync_encoding_frame_num, intra_period, intra_idr_period, ip_period,
+                                              &quicksync_display_frame_num, &frame_type, &pts_lag);
+                       if (!reorder_buffer.count(quicksync_display_frame_num)) {
+                               break;
+                       }
+                       frame = move(reorder_buffer[quicksync_display_frame_num]);
+                       reorder_buffer.erase(quicksync_display_frame_num);
+
+                       if (frame_type == FRAME_IDR) {
+                               // Release any reference frames from the previous GOP.
+                               for (const ReferenceFrame &frame : reference_frames) {
+                                       release_gl_surface(frame.display_number);
+                               }
+                               reference_frames.clear();
+                               current_ref_frame_num = 0;
+                               gop_start_display_frame_num = quicksync_display_frame_num;
+                       }
+
+                       // Determine the dts of this frame.
+                       int64_t dts;
+                       if (pts_lag == -1) {
+                               assert(last_dts != -1);
+                               dts = last_dts + (TIMEBASE / MAX_FPS);
+                       } else {
+                               dts = frame.pts - pts_lag;
+                       }
+                       last_dts = dts;
+
+                       encode_frame(frame, quicksync_encoding_frame_num, quicksync_display_frame_num, gop_start_display_frame_num, frame_type, frame.pts, dts, frame.duration, frame.ycbcr_coefficients);
+                       ++quicksync_encoding_frame_num;
+               }
        }
 }
 
 void QuickSyncEncoderImpl::encode_remaining_frames_as_p(int encoding_frame_num, int gop_start_display_frame_num, int64_t last_dts)
 {
-       if (pending_video_frames.empty()) {
+       if (reorder_buffer.empty()) {
                return;
        }
 
-       for (auto &pending_frame : pending_video_frames) {
+       for (auto &pending_frame : reorder_buffer) {
                int display_frame_num = pending_frame.first;
                assert(display_frame_num > 0);
                PendingFrame frame = move(pending_frame.second);
                int64_t dts = last_dts + (TIMEBASE / MAX_FPS);
                printf("Finalizing encode: Encoding leftover frame %d as P-frame instead of B-frame.\n", display_frame_num);
-               encode_frame(frame, encoding_frame_num++, display_frame_num, gop_start_display_frame_num, FRAME_P, frame.pts, dts, frame.duration);
+               encode_frame(frame, encoding_frame_num++, display_frame_num, gop_start_display_frame_num, FRAME_P, frame.pts, dts, frame.duration, frame.ycbcr_coefficients);
                last_dts = dts;
        }
-
-       if (global_flags.uncompressed_video_to_http ||
-           global_flags.x264_video_to_http) {
-               // Add frames left in reorderer.
-               while (!reorderer->empty()) {
-                       FrameReorderer::Frame output_frame = reorderer->get_first_frame();
-                       if (global_flags.uncompressed_video_to_http) {
-                               add_packet_for_uncompressed_frame(output_frame.pts, output_frame.duration, output_frame.data);
-                       } else {
-                               assert(global_flags.x264_video_to_http);
-                               x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data, output_frame.received_ts);
-                       }
-               }
-       }
 }
 
 void QuickSyncEncoderImpl::add_packet_for_uncompressed_frame(int64_t pts, int64_t duration, const uint8_t *data)
@@ -1868,31 +1946,53 @@ void memcpy_with_pitch(uint8_t *dst, const uint8_t *src, size_t src_width, size_
 
 }  // namespace
 
-void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame, int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num,
-                                   int frame_type, int64_t pts, int64_t dts, int64_t duration)
+void QuickSyncEncoderImpl::pass_frame(QuickSyncEncoderImpl::PendingFrame frame, int display_frame_num, int64_t pts, int64_t duration)
 {
        // Wait for the GPU to be done with the frame.
        GLenum sync_status;
        do {
-               sync_status = glClientWaitSync(frame.fence.get(), 0, 1000000000);
+               sync_status = glClientWaitSync(frame.fence.get(), 0, 0);
                check_error();
+               if (sync_status == GL_TIMEOUT_EXPIRED) {
+                       // NVIDIA likes to busy-wait; yield instead.
+                       this_thread::sleep_for(milliseconds(1));
+               }
        } while (sync_status == GL_TIMEOUT_EXPIRED);
        assert(sync_status != GL_WAIT_FAILED);
 
-       // Find min and max timestamp of all input frames that have a timestamp.
-       steady_clock::time_point min_ts = steady_clock::time_point::max(), max_ts = steady_clock::time_point::min();
-       for (const RefCountedFrame &input_frame : frame.input_frames) {
-               if (input_frame && input_frame->received_timestamp > steady_clock::time_point::min()) {
-                       min_ts = min(min_ts, input_frame->received_timestamp);
-                       max_ts = max(max_ts, input_frame->received_timestamp);
-               }
-       }
-       const ReceivedTimestamps received_ts{ min_ts, max_ts };
+       ReceivedTimestamps received_ts = find_received_timestamp(frame.input_frames);
+       static int frameno = 0;
+       print_latency("Current mixer latency (video inputs → ready for encode):",
+               received_ts, false, &frameno, &mixer_latency_histogram);
 
        // Release back any input frames we needed to render this frame.
        frame.input_frames.clear();
 
-       GLSurface *surf = &gl_surfaces[display_frame_num % SURFACE_NUM];
+       GLSurface *surf;
+       {
+               unique_lock<mutex> lock(storage_task_queue_mutex);
+               surf = surface_for_frame[display_frame_num];
+               assert(surf != nullptr);
+       }
+       uint8_t *data = reinterpret_cast<uint8_t *>(surf->y_ptr);
+       if (global_flags.uncompressed_video_to_http) {
+               add_packet_for_uncompressed_frame(pts, duration, data);
+       } else if (global_flags.x264_video_to_http || global_flags.x264_video_to_disk) {
+               x264_encoder->add_frame(pts, duration, frame.ycbcr_coefficients, data, received_ts);
+       }
+}
+
+void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame, int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num,
+                                        int frame_type, int64_t pts, int64_t dts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients)
+{
+       const ReceivedTimestamps received_ts = find_received_timestamp(frame.input_frames);
+
+       GLSurface *surf;
+       {
+               unique_lock<mutex> lock(storage_task_queue_mutex);
+               surf = surface_for_frame[display_frame_num];
+               assert(surf != nullptr);
+       }
        VAStatus va_status;
 
        if (use_zerocopy) {
@@ -1901,6 +2001,7 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
                va_status = vaReleaseBufferHandle(va_dpy, surf->surface_image.buf);
                CHECK_VASTATUS(va_status, "vaReleaseBufferHandle");
        } else {
+               // Upload the frame to VA-API.
                unsigned char *surface_p = nullptr;
                vaMapBuffer(va_dpy, surf->surface_image.buf, (void **)&surface_p);
 
@@ -1912,27 +2013,8 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
 
                va_status = vaUnmapBuffer(va_dpy, surf->surface_image.buf);
                CHECK_VASTATUS(va_status, "vaUnmapBuffer");
-
-               if (global_flags.uncompressed_video_to_http ||
-                   global_flags.x264_video_to_http) {
-                       // Add uncompressed video. (Note that pts == dts here.)
-                       // Delay needs to match audio.
-                       FrameReorderer::Frame output_frame = reorderer->reorder_frame(pts + global_delay(), duration, reinterpret_cast<uint8_t *>(surf->y_ptr), received_ts);
-                       if (output_frame.data != nullptr) {
-                               if (global_flags.uncompressed_video_to_http) {
-                                       add_packet_for_uncompressed_frame(output_frame.pts, output_frame.duration, output_frame.data);
-                               } else {
-                                       assert(global_flags.x264_video_to_http);
-                                       x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data, output_frame.received_ts);
-                               }
-                       }
-               }
        }
 
-       static int frameno = 0;
-       print_latency("Current mixer latency (video inputs → ready for encode):",
-               received_ts, (frame_type == FRAME_B), &frameno);
-
        va_status = vaDestroyImage(va_dpy, surf->surface_image.image_id);
        CHECK_VASTATUS(va_status, "vaDestroyImage");
 
@@ -1945,21 +2027,40 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
                // FIXME: If the mux wants global headers, we should not put the
                // SPS/PPS before each IDR frame, but rather put it into the
                // codec extradata (formatted differently?).
+               //
+               // NOTE: If we change ycbcr_coefficients, it will not take effect
+               // before the next IDR frame. This is acceptable, as it should only
+               // happen on a mode change, which is rare.
                render_sequence();
-               render_picture(frame_type, display_frame_num, gop_start_display_frame_num);
+               render_picture(surf, frame_type, display_frame_num, gop_start_display_frame_num);
                if (h264_packedheader) {
-                       render_packedsequence();
+                       render_packedsequence(ycbcr_coefficients);
                        render_packedpicture();
                }
        } else {
                //render_sequence();
-               render_picture(frame_type, display_frame_num, gop_start_display_frame_num);
+               render_picture(surf, frame_type, display_frame_num, gop_start_display_frame_num);
        }
        render_slice(encoding_frame_num, display_frame_num, gop_start_display_frame_num, frame_type);
 
        va_status = vaEndPicture(va_dpy, context_id);
        CHECK_VASTATUS(va_status, "vaEndPicture");
 
+       update_ReferenceFrames(display_frame_num, frame_type);
+
+       vector<size_t> ref_display_frame_numbers;
+
+       // Lock the references for this frame; otherwise, they could be
+       // rendered to before this frame is done encoding.
+       {
+               unique_lock<mutex> lock(storage_task_queue_mutex);
+               for (const ReferenceFrame &frame : reference_frames) {
+                       assert(surface_for_frame.count(frame.display_number));
+                       ++surface_for_frame[frame.display_number]->refcount;
+                       ref_display_frame_numbers.push_back(frame.display_number);
+               }
+       }
+
        // so now the data is done encoding (well, async job kicked off)...
        // we send that to the storage thread
        storage_task tmp;
@@ -1968,14 +2069,14 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
        tmp.pts = pts;
        tmp.dts = dts;
        tmp.duration = duration;
+       tmp.ycbcr_coefficients = ycbcr_coefficients;
        tmp.received_ts = received_ts;
+       tmp.ref_display_frame_numbers = move(ref_display_frame_numbers);
        storage_task_enqueue(move(tmp));
-
-       update_ReferenceFrames(frame_type);
 }
 
 // Proxy object.
-QuickSyncEncoder::QuickSyncEncoder(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator)
+QuickSyncEncoder::QuickSyncEncoder(const std::string &filename, ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator)
        : impl(new QuickSyncEncoderImpl(filename, resource_pool, surface, va_display, width, height, oformat, x264_encoder, disk_space_estimator)) {}
 
 // Must be defined here because unique_ptr<> destructor needs to know the impl.
@@ -1986,14 +2087,19 @@ void QuickSyncEncoder::add_audio(int64_t pts, vector<float> audio)
        impl->add_audio(pts, audio);
 }
 
-bool QuickSyncEncoder::begin_frame(GLuint *y_tex, GLuint *cbcr_tex)
+bool QuickSyncEncoder::is_zerocopy() const
 {
-       return impl->begin_frame(y_tex, cbcr_tex);
+       return impl->is_zerocopy();
 }
 
-RefCountedGLsync QuickSyncEncoder::end_frame(int64_t pts, int64_t duration, const vector<RefCountedFrame> &input_frames)
+bool QuickSyncEncoder::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
-       return impl->end_frame(pts, duration, input_frames);
+       return impl->begin_frame(pts, duration, ycbcr_coefficients, input_frames, y_tex, cbcr_tex);
+}
+
+RefCountedGLsync QuickSyncEncoder::end_frame()
+{
+       return impl->end_frame();
 }
 
 void QuickSyncEncoder::shutdown()
@@ -2001,6 +2107,11 @@ void QuickSyncEncoder::shutdown()
        impl->shutdown();
 }
 
+void QuickSyncEncoder::close_file()
+{
+       impl->close_file();  // Forward to the same-named impl method, like the other proxies. NOTE(review): confirm QuickSyncEncoderImpl::close_file() exists.
+}
+
 void QuickSyncEncoder::set_stream_mux(Mux *mux)
 {
        impl->set_stream_mux(mux);