]> git.sesse.net Git - nageru/blobdiff - quicksync_encoder.cpp
Support switching Y'CbCr coefficients midway, which will allow doing the Right Thing...
[nageru] / quicksync_encoder.cpp
index 749eb63d7056e638f8c8f4ac1d640706532b3d3e..d49a48333ecc967acf2b703809989b52baa061f1 100644 (file)
@@ -1,5 +1,6 @@
 #include "quicksync_encoder.h"
 
+#include <movit/image_format.h>
 #include <movit/resource_pool.h>  // Must be above the Xlib includes.
 #include <movit/util.h>
 
@@ -8,6 +9,7 @@
 #include <assert.h>
 #include <epoxy/egl.h>
 #include <fcntl.h>
+#include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -54,6 +56,7 @@ extern "C" {
 #include "timebase.h"
 #include "x264_encoder.h"
 
+using namespace movit;
 using namespace std;
 using namespace std::chrono;
 using namespace std::placeholders;
@@ -112,10 +115,6 @@ static constexpr int rc_default_modes[] = {  // Priority list of modes.
     VA_RC_NONE,
 };
 
-/* thread to save coded data */
-#define SRC_SURFACE_FREE        0
-#define SRC_SURFACE_IN_ENCODING 1
-    
 using namespace std;
 
 // Supposedly vaRenderPicture() is supposed to destroy the buffer implicitly,
@@ -262,7 +261,7 @@ static void nal_header(bitstream *bs, int nal_ref_idc, int nal_unit_type)
     bitstream_put_ui(bs, nal_unit_type, 5);
 }
 
-void QuickSyncEncoderImpl::sps_rbsp(bitstream *bs)
+void QuickSyncEncoderImpl::sps_rbsp(YCbCrLumaCoefficients ycbcr_coefficients, bitstream *bs)
 {
     int profile_idc = PROFILE_IDC_BASELINE;
 
@@ -322,6 +321,7 @@ void QuickSyncEncoderImpl::sps_rbsp(bitstream *bs)
     if ( false ) {
         bitstream_put_ui(bs, 0, 1); /* vui_parameters_present_flag */
     } else {
+        // See H.264 annex E for the definition of this header.
         bitstream_put_ui(bs, 1, 1); /* vui_parameters_present_flag */
         bitstream_put_ui(bs, 0, 1); /* aspect_ratio_info_present_flag */
         bitstream_put_ui(bs, 0, 1); /* overscan_info_present_flag */
@@ -333,7 +333,12 @@ void QuickSyncEncoderImpl::sps_rbsp(bitstream *bs)
             {
                 bitstream_put_ui(bs, 1, 8);  /* colour_primaries (1 = BT.709) */
                 bitstream_put_ui(bs, 2, 8);  /* transfer_characteristics (2 = unspecified, since we use sRGB) */
-                bitstream_put_ui(bs, 6, 8);  /* matrix_coefficients (6 = BT.601/SMPTE 170M) */
+                if (ycbcr_coefficients == YCBCR_REC_709) {
+                    bitstream_put_ui(bs, 1, 8);  /* matrix_coefficients (1 = BT.709) */
+                } else {
+                    assert(ycbcr_coefficients == YCBCR_REC_601);
+                    bitstream_put_ui(bs, 6, 8);  /* matrix_coefficients (6 = BT.601/SMPTE 170M) */
+                }
             }
         }
         bitstream_put_ui(bs, 0, 1); /* chroma_loc_info_present_flag */
@@ -513,14 +518,14 @@ int QuickSyncEncoderImpl::build_packed_pic_buffer(unsigned char **header_buffer)
 }
 
 int
-QuickSyncEncoderImpl::build_packed_seq_buffer(unsigned char **header_buffer)
+QuickSyncEncoderImpl::build_packed_seq_buffer(YCbCrLumaCoefficients ycbcr_coefficients, unsigned char **header_buffer)
 {
     bitstream bs;
 
     bitstream_start(&bs);
     nal_start_code_prefix(&bs);
     nal_header(&bs, NAL_REF_IDC_HIGH, NAL_SPS);
-    sps_rbsp(&bs);
+    sps_rbsp(ycbcr_coefficients, &bs);
     bitstream_end(&bs);
 
     *header_buffer = (unsigned char *)bs.buffer;
@@ -1019,51 +1024,65 @@ static void sort_two(T *begin, T *end, const T &pivot, const C &less_than)
        sort(middle, end, less_than);
 }
 
-void QuickSyncEncoderImpl::update_ReferenceFrames(int frame_type)
+void QuickSyncEncoderImpl::update_ReferenceFrames(int current_display_frame, int frame_type)
 {
-    int i;
-    
     if (frame_type == FRAME_B)
         return;
 
+    pic_param.CurrPic.frame_idx = current_ref_frame_num;
+
     CurrentCurrPic.flags = VA_PICTURE_H264_SHORT_TERM_REFERENCE;
-    numShortTerm++;
-    if (numShortTerm > num_ref_frames)
-        numShortTerm = num_ref_frames;
-    for (i=numShortTerm-1; i>0; i--)
-        ReferenceFrames[i] = ReferenceFrames[i-1];
-    ReferenceFrames[0] = CurrentCurrPic;
+    unique_lock<mutex> lock(storage_task_queue_mutex);
+
+    // Insert the new frame at the start of the reference queue.
+    reference_frames.push_front(ReferenceFrame{ CurrentCurrPic, current_display_frame });
+
+    if (reference_frames.size() > num_ref_frames)
+    {
+        // The back frame is no longer in use as a reference.
+        int display_frame_num = reference_frames.back().display_number;
+        assert(surface_for_frame.count(display_frame_num));
+        release_gl_surface(display_frame_num);
+        reference_frames.pop_back();
+    }
+
+    // Mark this frame in use as a reference.
+    assert(surface_for_frame.count(current_display_frame));
+    ++surface_for_frame[current_display_frame]->refcount;
     
-    current_frame_num++;
-    if (current_frame_num > MaxFrameNum)
-        current_frame_num = 0;
+    current_ref_frame_num++;
+    if (current_ref_frame_num > MaxFrameNum)
+        current_ref_frame_num = 0;
 }
 
 
-int QuickSyncEncoderImpl::update_RefPicList(int frame_type)
+void QuickSyncEncoderImpl::update_RefPicList_P(VAPictureH264 RefPicList0_P[MAX_NUM_REF2])
 {
     const auto descending_by_frame_idx = [](const VAPictureH264 &a, const VAPictureH264 &b) {
         return a.frame_idx > b.frame_idx;
     };
+
+    for (size_t i = 0; i < reference_frames.size(); ++i) {
+        RefPicList0_P[i] = reference_frames[i].pic;
+    }
+    sort(&RefPicList0_P[0], &RefPicList0_P[reference_frames.size()], descending_by_frame_idx);
+}
+
+void QuickSyncEncoderImpl::update_RefPicList_B(VAPictureH264 RefPicList0_B[MAX_NUM_REF2], VAPictureH264 RefPicList1_B[MAX_NUM_REF2])
+{
     const auto ascending_by_top_field_order_cnt = [](const VAPictureH264 &a, const VAPictureH264 &b) {
         return a.TopFieldOrderCnt < b.TopFieldOrderCnt;
     };
     const auto descending_by_top_field_order_cnt = [](const VAPictureH264 &a, const VAPictureH264 &b) {
         return a.TopFieldOrderCnt > b.TopFieldOrderCnt;
     };
-    
-    if (frame_type == FRAME_P) {
-        memcpy(RefPicList0_P, ReferenceFrames, numShortTerm * sizeof(VAPictureH264));
-        sort(&RefPicList0_P[0], &RefPicList0_P[numShortTerm], descending_by_frame_idx);
-    } else if (frame_type == FRAME_B) {
-        memcpy(RefPicList0_B, ReferenceFrames, numShortTerm * sizeof(VAPictureH264));
-        sort_two(&RefPicList0_B[0], &RefPicList0_B[numShortTerm], CurrentCurrPic, ascending_by_top_field_order_cnt);
 
-        memcpy(RefPicList1_B, ReferenceFrames, numShortTerm * sizeof(VAPictureH264));
-        sort_two(&RefPicList1_B[0], &RefPicList1_B[numShortTerm], CurrentCurrPic, descending_by_top_field_order_cnt);
+    for (size_t i = 0; i < reference_frames.size(); ++i) {
+        RefPicList0_B[i] = reference_frames[i].pic;
+        RefPicList1_B[i] = reference_frames[i].pic;
     }
-    
-    return 0;
+    sort_two(&RefPicList0_B[0], &RefPicList0_B[reference_frames.size()], CurrentCurrPic, ascending_by_top_field_order_cnt);
+    sort_two(&RefPicList1_B[0], &RefPicList1_B[reference_frames.size()], CurrentCurrPic, descending_by_top_field_order_cnt);
 }
 
 
@@ -1165,21 +1184,23 @@ static int calc_poc(int pic_order_cnt_lsb, int frame_type)
     return TopFieldOrderCnt;
 }
 
-int QuickSyncEncoderImpl::render_picture(int frame_type, int display_frame_num, int gop_start_display_frame_num)
+int QuickSyncEncoderImpl::render_picture(GLSurface *surf, int frame_type, int display_frame_num, int gop_start_display_frame_num)
 {
     VABufferID pic_param_buf;
     VAStatus va_status;
-    int i = 0;
+    size_t i = 0;
 
-    pic_param.CurrPic.picture_id = gl_surfaces[display_frame_num % SURFACE_NUM].ref_surface;
-    pic_param.CurrPic.frame_idx = current_frame_num;
+    pic_param.CurrPic.picture_id = surf->ref_surface;
+    pic_param.CurrPic.frame_idx = current_ref_frame_num;
     pic_param.CurrPic.flags = 0;
     pic_param.CurrPic.TopFieldOrderCnt = calc_poc((display_frame_num - gop_start_display_frame_num) % MaxPicOrderCntLsb, frame_type);
     pic_param.CurrPic.BottomFieldOrderCnt = pic_param.CurrPic.TopFieldOrderCnt;
     CurrentCurrPic = pic_param.CurrPic;
 
-    memcpy(pic_param.ReferenceFrames, ReferenceFrames, numShortTerm*sizeof(VAPictureH264));
-    for (i = numShortTerm; i < MAX_NUM_REF1; i++) {
+    for (i = 0; i < reference_frames.size(); i++) {
+        pic_param.ReferenceFrames[i] = reference_frames[i].pic;
+    }
+    for (i = reference_frames.size(); i < MAX_NUM_REF1; i++) {
         pic_param.ReferenceFrames[i].picture_id = VA_INVALID_SURFACE;
         pic_param.ReferenceFrames[i].flags = VA_PICTURE_H264_INVALID;
     }
@@ -1188,8 +1209,8 @@ int QuickSyncEncoderImpl::render_picture(int frame_type, int display_frame_num,
     pic_param.pic_fields.bits.reference_pic_flag = (frame_type != FRAME_B);
     pic_param.pic_fields.bits.entropy_coding_mode_flag = h264_entropy_mode;
     pic_param.pic_fields.bits.deblocking_filter_control_present_flag = 1;
-    pic_param.frame_num = current_frame_num;
-    pic_param.coded_buf = gl_surfaces[display_frame_num % SURFACE_NUM].coded_buf;
+    pic_param.frame_num = current_ref_frame_num;  // FIXME: is this correct?
+    pic_param.coded_buf = surf->coded_buf;
     pic_param.last_picture = false;  // FIXME
     pic_param.pic_init_qp = initial_qp;
 
@@ -1202,7 +1223,7 @@ int QuickSyncEncoderImpl::render_picture(int frame_type, int display_frame_num,
     return 0;
 }
 
-int QuickSyncEncoderImpl::render_packedsequence()
+int QuickSyncEncoderImpl::render_packedsequence(YCbCrLumaCoefficients ycbcr_coefficients)
 {
     VAEncPackedHeaderParameterBuffer packedheader_param_buffer;
     VABufferID packedseq_para_bufid, packedseq_data_bufid, render_id[2];
@@ -1210,7 +1231,7 @@ int QuickSyncEncoderImpl::render_packedsequence()
     unsigned char *packedseq_buffer = NULL;
     VAStatus va_status;
 
-    length_in_bits = build_packed_seq_buffer(&packedseq_buffer); 
+    length_in_bits = build_packed_seq_buffer(ycbcr_coefficients, &packedseq_buffer); 
     
     packedheader_param_buffer.type = VAEncPackedHeaderSequence;
     
@@ -1316,8 +1337,6 @@ int QuickSyncEncoderImpl::render_slice(int encoding_frame_num, int display_frame
     VAStatus va_status;
     int i;
 
-    update_RefPicList(frame_type);
-    
     /* one frame, one slice */
     slice_param.macroblock_address = 0;
     slice_param.num_macroblocks = frame_width_mbaligned * frame_height_mbaligned/(16*16); /* Measured by MB */
@@ -1326,6 +1345,9 @@ int QuickSyncEncoderImpl::render_slice(int encoding_frame_num, int display_frame
         if (encoding_frame_num != 0)
             ++slice_param.idr_pic_id;
     } else if (frame_type == FRAME_P) {
+        VAPictureH264 RefPicList0_P[MAX_NUM_REF2];
+        update_RefPicList_P(RefPicList0_P);
+
         int refpiclist0_max = h264_maxref & 0xffff;
         memcpy(slice_param.RefPicList0, RefPicList0_P, refpiclist0_max*sizeof(VAPictureH264));
 
@@ -1334,6 +1356,9 @@ int QuickSyncEncoderImpl::render_slice(int encoding_frame_num, int display_frame
             slice_param.RefPicList0[i].flags = VA_PICTURE_H264_INVALID;
         }
     } else if (frame_type == FRAME_B) {
+        VAPictureH264 RefPicList0_B[MAX_NUM_REF2], RefPicList1_B[MAX_NUM_REF2];
+        update_RefPicList_B(RefPicList0_B, RefPicList1_B);
+
         int refpiclist0_max = h264_maxref & 0xffff;
         int refpiclist1_max = (h264_maxref >> 16) & 0xffff;
 
@@ -1371,20 +1396,20 @@ int QuickSyncEncoderImpl::render_slice(int encoding_frame_num, int display_frame
 
 
 
-void QuickSyncEncoderImpl::save_codeddata(storage_task task)
+void QuickSyncEncoderImpl::save_codeddata(GLSurface *surf, storage_task task)
 {    
        VACodedBufferSegment *buf_list = NULL;
        VAStatus va_status;
 
        string data;
 
-       va_status = vaMapBuffer(va_dpy, gl_surfaces[task.display_order % SURFACE_NUM].coded_buf, (void **)(&buf_list));
+       va_status = vaMapBuffer(va_dpy, surf->coded_buf, (void **)(&buf_list));
        CHECK_VASTATUS(va_status, "vaMapBuffer");
        while (buf_list != NULL) {
                data.append(reinterpret_cast<const char *>(buf_list->buf), buf_list->size);
                buf_list = (VACodedBufferSegment *) buf_list->next;
        }
-       vaUnmapBuffer(va_dpy, gl_surfaces[task.display_order % SURFACE_NUM].coded_buf);
+       vaUnmapBuffer(va_dpy, surf->coded_buf);
 
        static int frameno = 0;
        print_latency("Current QuickSync latency (video inputs → disk mux):",
@@ -1425,8 +1450,10 @@ void QuickSyncEncoderImpl::storage_task_enqueue(storage_task task)
 
 void QuickSyncEncoderImpl::storage_task_thread()
 {
+       pthread_setname_np(pthread_self(), "QS_Storage");
        for ( ;; ) {
                storage_task current;
+               GLSurface *surf;
                {
                        // wait until there's an encoded frame  
                        unique_lock<mutex> lock(storage_task_queue_mutex);
@@ -1434,19 +1461,28 @@ void QuickSyncEncoderImpl::storage_task_thread()
                        if (storage_thread_should_quit && storage_task_queue.empty()) return;
                        current = move(storage_task_queue.front());
                        storage_task_queue.pop();
+                       surf = surface_for_frame[current.display_order];
+                       assert(surf != nullptr);
                }
 
                VAStatus va_status;
+
+               size_t display_order = current.display_order;
+               vector<size_t> ref_display_frame_numbers = move(current.ref_display_frame_numbers);
           
                // waits for data, then saves it to disk.
-               va_status = vaSyncSurface(va_dpy, gl_surfaces[current.display_order % SURFACE_NUM].src_surface);
+               va_status = vaSyncSurface(va_dpy, surf->src_surface);
                CHECK_VASTATUS(va_status, "vaSyncSurface");
-               save_codeddata(move(current));
+               save_codeddata(surf, move(current));
 
+               // Unlock the frame, and all its references.
                {
                        unique_lock<mutex> lock(storage_task_queue_mutex);
-                       srcsurface_status[current.display_order % SURFACE_NUM] = SRC_SURFACE_FREE;
-                       storage_task_queue_changed.notify_all();
+                       release_gl_surface(display_order);
+
+                       for (size_t frame_num : ref_display_frame_numbers) {
+                               release_gl_surface(frame_num);
+                       }
                }
        }
 }
@@ -1493,7 +1529,7 @@ int QuickSyncEncoderImpl::deinit_va()
     return 0;
 }
 
-QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator)
+QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator)
        : current_storage_frame(0), resource_pool(resource_pool), surface(surface), x264_encoder(x264_encoder), frame_width(width), frame_height(height), disk_space_estimator(disk_space_estimator)
 {
        file_audio_encoder.reset(new AudioEncoder(AUDIO_OUTPUT_CODEC_NAME, DEFAULT_AUDIO_OUTPUT_BIT_RATE, oformat));
@@ -1514,9 +1550,6 @@ QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, movit::R
        init_va(va_display);
        setup_encode();
 
-       // No frames are ready yet.
-       memset(srcsurface_status, SRC_SURFACE_FREE, sizeof(srcsurface_status));
-           
        memset(&seq_param, 0, sizeof(seq_param));
        memset(&pic_param, 0, sizeof(pic_param));
        memset(&slice_param, 0, sizeof(slice_param));
@@ -1543,23 +1576,51 @@ QuickSyncEncoderImpl::~QuickSyncEncoderImpl()
        release_gl_resources();
 }
 
-bool QuickSyncEncoderImpl::begin_frame(GLuint *y_tex, GLuint *cbcr_tex)
+QuickSyncEncoderImpl::GLSurface *QuickSyncEncoderImpl::allocate_gl_surface()
+{
+       for (unsigned i = 0; i < SURFACE_NUM; ++i) {
+               if (gl_surfaces[i].refcount == 0) {
+                       ++gl_surfaces[i].refcount;
+                       return &gl_surfaces[i];
+               }
+       }
+       return nullptr;
+}
+
+void QuickSyncEncoderImpl::release_gl_surface(size_t display_frame_num)
+{
+       assert(surface_for_frame.count(display_frame_num));
+       QuickSyncEncoderImpl::GLSurface *surf = surface_for_frame[display_frame_num];
+       if (--surf->refcount == 0) {
+               assert(surface_for_frame.count(display_frame_num));
+               surface_for_frame.erase(display_frame_num);
+               storage_task_queue_changed.notify_all();
+       }
+}
+
+bool QuickSyncEncoderImpl::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
        assert(!is_shutdown);
+       GLSurface *surf = nullptr;
        {
                // Wait until this frame slot is done encoding.
                unique_lock<mutex> lock(storage_task_queue_mutex);
-               if (srcsurface_status[current_storage_frame % SURFACE_NUM] != SRC_SURFACE_FREE) {
-                       fprintf(stderr, "Warning: Slot %d (for frame %d) is still encoding, rendering has to wait for H.264 encoder\n",
-                               current_storage_frame % SURFACE_NUM, current_storage_frame);
+               surf = allocate_gl_surface();
+               if (surf == nullptr) {
+                       fprintf(stderr, "Warning: No free slots for frame %d, rendering has to wait for H.264 encoder\n",
+                               current_storage_frame);
+                       storage_task_queue_changed.wait(lock, [this, &surf]{
+                               if (storage_thread_should_quit)
+                                       return true;
+                               surf = allocate_gl_surface();
+                               return surf != nullptr;
+                       });
                }
-               storage_task_queue_changed.wait(lock, [this]{ return storage_thread_should_quit || (srcsurface_status[current_storage_frame % SURFACE_NUM] == SRC_SURFACE_FREE); });
-               srcsurface_status[current_storage_frame % SURFACE_NUM] = SRC_SURFACE_IN_ENCODING;
                if (storage_thread_should_quit) return false;
+               assert(surf != nullptr);
+               surface_for_frame[current_storage_frame] = surf;
        }
 
-       //*fbo = fbos[current_storage_frame % SURFACE_NUM];
-       GLSurface *surf = &gl_surfaces[current_storage_frame % SURFACE_NUM];
        *y_tex = surf->y_tex;
        *cbcr_tex = surf->cbcr_tex;
 
@@ -1611,6 +1672,8 @@ bool QuickSyncEncoderImpl::begin_frame(GLuint *y_tex, GLuint *cbcr_tex)
                glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->cbcr_egl_image);
        }
 
+       current_video_frame = PendingFrame{ {}, input_frames, pts, duration, ycbcr_coefficients };
+
        return true;
 }
 
@@ -1620,12 +1683,17 @@ void QuickSyncEncoderImpl::add_audio(int64_t pts, vector<float> audio)
        file_audio_encoder->encode_audio(audio, pts + global_delay());
 }
 
-RefCountedGLsync QuickSyncEncoderImpl::end_frame(int64_t pts, int64_t duration, const vector<RefCountedFrame> &input_frames)
+RefCountedGLsync QuickSyncEncoderImpl::end_frame()
 {
        assert(!is_shutdown);
 
        if (!use_zerocopy) {
-               GLSurface *surf = &gl_surfaces[current_storage_frame % SURFACE_NUM];
+               GLSurface *surf;
+               {
+                       unique_lock<mutex> lock(storage_task_queue_mutex);
+                       surf = surface_for_frame[current_storage_frame];
+                       assert(surf != nullptr);
+               }
 
                glPixelStorei(GL_PACK_ROW_LENGTH, 0);
                check_error();
@@ -1659,7 +1727,8 @@ RefCountedGLsync QuickSyncEncoderImpl::end_frame(int64_t pts, int64_t duration,
 
        {
                unique_lock<mutex> lock(frame_queue_mutex);
-               pending_video_frames.push(PendingFrame{ fence, input_frames, pts, duration });
+               current_video_frame.fence = fence;
+               pending_video_frames.push(move(current_video_frame));
                ++current_storage_frame;
        }
        frame_queue_nonempty.notify_all();
@@ -1718,6 +1787,8 @@ void QuickSyncEncoderImpl::open_output_file(const std::string &filename)
 
 void QuickSyncEncoderImpl::encode_thread_func()
 {
+       pthread_setname_np(pthread_self(), "QS_Encode");
+
        int64_t last_dts = -1;
        int gop_start_display_frame_num = 0;
        for (int display_frame_num = 0; ; ++display_frame_num) {
@@ -1763,8 +1834,12 @@ void QuickSyncEncoderImpl::encode_thread_func()
                        reorder_buffer.erase(quicksync_display_frame_num);
 
                        if (frame_type == FRAME_IDR) {
-                               numShortTerm = 0;
-                               current_frame_num = 0;
+                               // Release any reference frames from the previous GOP.
+                               for (const ReferenceFrame &frame : reference_frames) {
+                                       release_gl_surface(frame.display_number);
+                               }
+                               reference_frames.clear();
+                               current_ref_frame_num = 0;
                                gop_start_display_frame_num = quicksync_display_frame_num;
                        }
 
@@ -1778,7 +1853,7 @@ void QuickSyncEncoderImpl::encode_thread_func()
                        }
                        last_dts = dts;
 
-                       encode_frame(frame, quicksync_encoding_frame_num, quicksync_display_frame_num, gop_start_display_frame_num, frame_type, frame.pts, dts, frame.duration);
+                       encode_frame(frame, quicksync_encoding_frame_num, quicksync_display_frame_num, gop_start_display_frame_num, frame_type, frame.pts, dts, frame.duration, frame.ycbcr_coefficients);
                        ++quicksync_encoding_frame_num;
                }
        }
@@ -1796,7 +1871,7 @@ void QuickSyncEncoderImpl::encode_remaining_frames_as_p(int encoding_frame_num,
                PendingFrame frame = move(pending_frame.second);
                int64_t dts = last_dts + (TIMEBASE / MAX_FPS);
                printf("Finalizing encode: Encoding leftover frame %d as P-frame instead of B-frame.\n", display_frame_num);
-               encode_frame(frame, encoding_frame_num++, display_frame_num, gop_start_display_frame_num, FRAME_P, frame.pts, dts, frame.duration);
+               encode_frame(frame, encoding_frame_num++, display_frame_num, gop_start_display_frame_num, FRAME_P, frame.pts, dts, frame.duration, frame.ycbcr_coefficients);
                last_dts = dts;
        }
 }
@@ -1849,21 +1924,31 @@ void QuickSyncEncoderImpl::pass_frame(QuickSyncEncoderImpl::PendingFrame frame,
        // Release back any input frames we needed to render this frame.
        frame.input_frames.clear();
 
-       GLSurface *surf = &gl_surfaces[display_frame_num % SURFACE_NUM];
+       GLSurface *surf;
+       {
+               unique_lock<mutex> lock(storage_task_queue_mutex);
+               surf = surface_for_frame[display_frame_num];
+               assert(surf != nullptr);
+       }
        uint8_t *data = reinterpret_cast<uint8_t *>(surf->y_ptr);
        if (global_flags.uncompressed_video_to_http) {
                add_packet_for_uncompressed_frame(pts, duration, data);
        } else if (global_flags.x264_video_to_http) {
-               x264_encoder->add_frame(pts, duration, data, received_ts);
+               x264_encoder->add_frame(pts, duration, frame.ycbcr_coefficients, data, received_ts);
        }
 }
 
 void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame, int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num,
-                                        int frame_type, int64_t pts, int64_t dts, int64_t duration)
+                                        int frame_type, int64_t pts, int64_t dts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients)
 {
        const ReceivedTimestamps received_ts = find_received_timestamp(frame.input_frames);
 
-       GLSurface *surf = &gl_surfaces[display_frame_num % SURFACE_NUM];
+       GLSurface *surf;
+       {
+               unique_lock<mutex> lock(storage_task_queue_mutex);
+               surf = surface_for_frame[display_frame_num];
+               assert(surf != nullptr);
+       }
        VAStatus va_status;
 
        if (use_zerocopy) {
@@ -1898,21 +1983,40 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
                // FIXME: If the mux wants global headers, we should not put the
                // SPS/PPS before each IDR frame, but rather put it into the
                // codec extradata (formatted differently?).
+               //
+               // NOTE: If we change ycbcr_coefficients, it will not take effect
+               // before the next IDR frame. This is acceptable, as it should only
+               // happen on a mode change, which is rare.
                render_sequence();
-               render_picture(frame_type, display_frame_num, gop_start_display_frame_num);
+               render_picture(surf, frame_type, display_frame_num, gop_start_display_frame_num);
                if (h264_packedheader) {
-                       render_packedsequence();
+                       render_packedsequence(ycbcr_coefficients);
                        render_packedpicture();
                }
        } else {
                //render_sequence();
-               render_picture(frame_type, display_frame_num, gop_start_display_frame_num);
+               render_picture(surf, frame_type, display_frame_num, gop_start_display_frame_num);
        }
        render_slice(encoding_frame_num, display_frame_num, gop_start_display_frame_num, frame_type);
 
        va_status = vaEndPicture(va_dpy, context_id);
        CHECK_VASTATUS(va_status, "vaEndPicture");
 
+       update_ReferenceFrames(display_frame_num, frame_type);
+
+       vector<size_t> ref_display_frame_numbers;
+
+       // Lock the references for this frame; otherwise, they could be
+       // rendered to before this frame is done encoding.
+       {
+               unique_lock<mutex> lock(storage_task_queue_mutex);
+               for (const ReferenceFrame &frame : reference_frames) {
+                       assert(surface_for_frame.count(frame.display_number));
+                       ++surface_for_frame[frame.display_number]->refcount;
+                       ref_display_frame_numbers.push_back(frame.display_number);
+               }
+       }
+
        // so now the data is done encoding (well, async job kicked off)...
        // we send that to the storage thread
        storage_task tmp;
@@ -1921,14 +2025,14 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
        tmp.pts = pts;
        tmp.dts = dts;
        tmp.duration = duration;
+       tmp.ycbcr_coefficients = ycbcr_coefficients;
        tmp.received_ts = received_ts;
+       tmp.ref_display_frame_numbers = move(ref_display_frame_numbers);
        storage_task_enqueue(move(tmp));
-
-       update_ReferenceFrames(frame_type);
 }
 
 // Proxy object.
-QuickSyncEncoder::QuickSyncEncoder(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator)
+QuickSyncEncoder::QuickSyncEncoder(const std::string &filename, ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator)
        : impl(new QuickSyncEncoderImpl(filename, resource_pool, surface, va_display, width, height, oformat, x264_encoder, disk_space_estimator)) {}
 
 // Must be defined here because unique_ptr<> destructor needs to know the impl.
@@ -1939,14 +2043,14 @@ void QuickSyncEncoder::add_audio(int64_t pts, vector<float> audio)
        impl->add_audio(pts, audio);
 }
 
-bool QuickSyncEncoder::begin_frame(GLuint *y_tex, GLuint *cbcr_tex)
+bool QuickSyncEncoder::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
-       return impl->begin_frame(y_tex, cbcr_tex);
+       return impl->begin_frame(pts, duration, ycbcr_coefficients, input_frames, y_tex, cbcr_tex);
 }
 
-RefCountedGLsync QuickSyncEncoder::end_frame(int64_t pts, int64_t duration, const vector<RefCountedFrame> &input_frames)
+RefCountedGLsync QuickSyncEncoder::end_frame()
 {
-       return impl->end_frame(pts, duration, input_frames);
+       return impl->end_frame();
 }
 
 void QuickSyncEncoder::shutdown()