From ee7da87b4aa284b7babd59dc21db925f7c384ce7 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Wed, 8 Mar 2017 23:54:46 +0100
Subject: [PATCH] Support switching Y'CbCr coefficients midway, which will
 allow doing the Right Thing(TM) (BT.601 when you can for greater stream
 compatibility, BT.709 when you must for HDMI/SDI output) automatically.

---
 decklink_output.cpp      | 38 +++++++++++++++++++++++++++---------
 decklink_output.h        |  8 +++++++-
 flags.cpp                | 16 +++++++--------
 flags.h                  |  3 ++-
 mixer.cpp                | 22 +++++++++++++++++++--
 mux.cpp                  | 10 +++++++++-
 quicksync_encoder.cpp    | 42 ++++++++++++++++++++++++----------------
 quicksync_encoder.h      |  3 ++-
 quicksync_encoder_impl.h | 13 ++++++++-----
 theme.cpp                |  3 +++
 video_encoder.cpp        |  4 ++--
 video_encoder.h          |  3 ++-
 x264_encoder.cpp         | 42 +++++++++++++++++++++++++++++-----------
 x264_encoder.h           |  7 ++++++-
 14 files changed, 154 insertions(+), 60 deletions(-)
diff --git a/decklink_output.cpp b/decklink_output.cpp
index 4d6b1e1..8c6672b 100644
--- a/decklink_output.cpp
+++ b/decklink_output.cpp
@@ -107,14 +107,7 @@ void DeckLinkOutput::start_output(uint32_t mode, int64_t base_pts)
 		exit(1);
 	}
 
-	BMDDisplayModeFlags flags = display_mode->GetFlags();
-	if ((flags & bmdDisplayModeColorspaceRec601) && global_flags.ycbcr_rec709_coefficients) {
-		fprintf(stderr, "WARNING: Chosen output mode expects Rec. 601 Y'CbCr coefficients.\n");
-		fprintf(stderr, "         Consider --output-ycbcr-coefficients=rec601 (or =auto).\n");
-	} else if ((flags & bmdDisplayModeColorspaceRec709) && !global_flags.ycbcr_rec709_coefficients) {
-		fprintf(stderr, "WARNING: Chosen output mode expects Rec. 709 Y'CbCr coefficients.\n");
-		fprintf(stderr, "         Consider --output-ycbcr-coefficients=rec709 (or =auto).\n");
-	}
+	current_mode_flags = display_mode->GetFlags();
 
 	BMDTimeValue time_value;
 	BMDTimeScale time_scale;
@@ -184,10 +177,26 @@ void DeckLinkOutput::end_output()
 	}
 }
 
-void DeckLinkOutput::send_frame(GLuint y_tex, GLuint cbcr_tex, const vector<RefCountedFrame> &input_frames, int64_t pts, int64_t duration)
+void DeckLinkOutput::send_frame(GLuint y_tex, GLuint cbcr_tex, YCbCrLumaCoefficients output_ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, int64_t pts, int64_t duration)
 {
 	assert(!should_quit);
 
+	if ((current_mode_flags & bmdDisplayModeColorspaceRec601) && output_ycbcr_coefficients == YCBCR_REC_709) {
+		if (!last_frame_had_mode_mismatch) {
+			fprintf(stderr, "WARNING: Chosen output mode expects Rec. 601 Y'CbCr coefficients.\n");
+			fprintf(stderr, "         Consider --output-ycbcr-coefficients=rec601 (or =auto).\n");
+		}
+		last_frame_had_mode_mismatch = true;
+	} else if ((current_mode_flags & bmdDisplayModeColorspaceRec709) && output_ycbcr_coefficients == YCBCR_REC_601) {
+		if (!last_frame_had_mode_mismatch) {
+			fprintf(stderr, "WARNING: Chosen output mode expects Rec. 709 Y'CbCr coefficients.\n");
+			fprintf(stderr, "         Consider --output-ycbcr-coefficients=rec709 (or =auto).\n");
+		}
+		last_frame_had_mode_mismatch = true;
+	} else {
+		last_frame_had_mode_mismatch = false;
+	}
+
 	unique_ptr<Frame> frame = move(get_frame());
 	chroma_subsampler->create_uyvy(y_tex, cbcr_tex, width, height, frame->uyvy_tex);
 
@@ -334,6 +343,17 @@ uint32_t DeckLinkOutput::pick_video_mode(uint32_t mode) const
 	return best_mode;
 }
 
+YCbCrLumaCoefficients DeckLinkOutput::preferred_ycbcr_coefficients() const
+{
+	if (current_mode_flags & bmdDisplayModeColorspaceRec601) {
+		return YCBCR_REC_601;
+	} else {
+		// Don't bother checking bmdDisplayModeColorspaceRec709;
+		// if none is set, 709 is a good default anyway.
+		return YCBCR_REC_709;
+	}
+}
+
 HRESULT DeckLinkOutput::ScheduledFrameCompleted(/* in */ IDeckLinkVideoFrame *completedFrame, /* in */ BMDOutputFrameCompletionResult result)
 {
 	Frame *frame = static_cast<Frame *>(completedFrame);
diff --git a/decklink_output.h b/decklink_output.h
index d5e743a..5581c39 100644
--- a/decklink_output.h
+++ b/decklink_output.h
@@ -2,6 +2,7 @@
 #define _DECKLINK_OUTPUT_H 1
 
 #include <epoxy/gl.h>
+#include <movit/image_format.h>
 #include <stdint.h>
 #include <atomic>
 #include <chrono>
@@ -40,7 +41,7 @@ public:
 	void start_output(uint32_t mode, int64_t base_pts);  // Mode comes from get_available_video_modes().
 	void end_output();
 
-	void send_frame(GLuint y_tex, GLuint cbcr_tex, const std::vector<RefCountedFrame> &input_frames, int64_t pts, int64_t duration);
+	void send_frame(GLuint y_tex, GLuint cbcr_tex, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, int64_t pts, int64_t duration);
 	void send_audio(int64_t pts, const std::vector<float> &samples);
 
 	// NOTE: The returned timestamp is undefined for preroll.
@@ -59,6 +60,9 @@ public:
 	// If the given mode is supported, return it. If not, pick some “best” valid mode.
 	uint32_t pick_video_mode(uint32_t mode) const;
 
+	// Desired Y'CbCr coefficients for the current mode. Undefined before start_output().
+	movit::YCbCrLumaCoefficients preferred_ycbcr_coefficients() const;
+
 	// IUnknown.
 	HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, LPVOID *ppv) override;
 	ULONG STDMETHODCALLTYPE AddRef() override;
@@ -132,6 +136,8 @@ private:
 	std::condition_variable frame_queues_changed;
 	bool playback_initiated = false, playback_started = false;
 	int64_t base_pts, frame_duration;
+	BMDDisplayModeFlags current_mode_flags = 0;
+	bool last_frame_had_mode_mismatch = false;
 
 	movit::ResourcePool *resource_pool;
 	IDeckLinkOutput *output = nullptr;
diff --git a/flags.cpp b/flags.cpp
index 025b6d9..88a2c45 100644
--- a/flags.cpp
+++ b/flags.cpp
@@ -114,8 +114,8 @@ void usage()
 	fprintf(stderr, "      --audio-queue-length-ms=MS  length of audio resampling queue (default 100.0)\n");
 	fprintf(stderr, "      --output-ycbcr-coefficients={rec601,rec709,auto}\n");
 	fprintf(stderr, "                                  Y'CbCr coefficient standard of output (default auto)\n");
-	fprintf(stderr, "                                    auto is rec709 if and only if --output-card is used\n");
-	fprintf(stderr, "                                    and a HD resolution is set\n");
+	fprintf(stderr, "                                    auto is rec601, unless --output-card is used\n");
+	fprintf(stderr, "                                    and a Rec. 709 mode (typically HD modes) is in use\n");
 	fprintf(stderr, "      --output-buffer-frames=NUM  number of frames in output buffer for --output-card,\n");
 	fprintf(stderr, "                                    can be fractional (default 6.0); note also\n");
 	fprintf(stderr, "                                    the audio queue can't be much longer than this\n");
@@ -430,17 +430,17 @@ void parse_flags(int argc, char * const argv[])
 	// On the other hand, HDMI/SDI output typically requires Rec. 709 for
 	// HD resolutions (with no way of signaling anything else), which is
 	// a conflicting demand. In this case, we typically let the HDMI/SDI
-	// output win, but the user can override this.
+	// output win if it is active, but the user can override this.
 	if (output_ycbcr_coefficients == "auto") {
-		if (global_flags.output_card >= 0 && global_flags.width >= 1280) {
-			global_flags.ycbcr_rec709_coefficients = true;
-		} else {
-			global_flags.ycbcr_rec709_coefficients = false;
-		}
+		// Essentially: BT.709 if HDMI/SDI output is on, otherwise BT.601.
+		global_flags.ycbcr_rec709_coefficients = false;
+		global_flags.ycbcr_auto_coefficients = true;
 	} else if (output_ycbcr_coefficients == "rec709") {
 		global_flags.ycbcr_rec709_coefficients = true;
+		global_flags.ycbcr_auto_coefficients = false;
 	} else if (output_ycbcr_coefficients == "rec601") {
 		global_flags.ycbcr_rec709_coefficients = false;
+		global_flags.ycbcr_auto_coefficients = false;
 	} else {
 		fprintf(stderr, "ERROR: --output-ycbcr-coefficients must be “rec601”, “rec709” or “auto”\n");
 		exit(1);
diff --git a/flags.h b/flags.h
index 12bc3d4..78b1f1f 100644
--- a/flags.h
+++ b/flags.h
@@ -42,7 +42,8 @@ struct Flags {
 	std::string midi_mapping_filename;  // Empty for none.
 	bool print_video_latency = false;
 	double audio_queue_length_ms = 100.0;
-	bool ycbcr_rec709_coefficients = false;
+	bool ycbcr_rec709_coefficients = false;  // Will be overridden by HDMI/SDI output if ycbcr_auto_coefficients == true.
+	bool ycbcr_auto_coefficients = true;
 	int output_card = -1;
 	double output_buffer_frames = 6.0;
 	double output_slop_frames = 0.5;
diff --git a/mixer.cpp b/mixer.cpp
index e286370..898e7c0 100644
--- a/mixer.cpp
+++ b/mixer.cpp
@@ -1012,9 +1012,27 @@ void Mixer::render_one_frame(int64_t duration)
 	theme_main_chain.setup_chain();
 	//theme_main_chain.chain->enable_phase_timing(true);
 
+	// If HDMI/SDI output is active and the user has requested auto mode,
+	// its mode overrides the existing Y'CbCr setting for the chain.
+	YCbCrLumaCoefficients ycbcr_output_coefficients;
+	if (global_flags.ycbcr_auto_coefficients && output_card_index != -1) {
+		ycbcr_output_coefficients = cards[output_card_index].output->preferred_ycbcr_coefficients();
+	} else {
+		ycbcr_output_coefficients = global_flags.ycbcr_rec709_coefficients ? YCBCR_REC_709 : YCBCR_REC_601;
+	}
+
+	// TODO: Reduce the duplication against theme.cpp.
+	YCbCrFormat output_ycbcr_format;
+	output_ycbcr_format.chroma_subsampling_x = 1;
+	output_ycbcr_format.chroma_subsampling_y = 1;
+	output_ycbcr_format.luma_coefficients = ycbcr_output_coefficients;
+	output_ycbcr_format.full_range = false;
+	output_ycbcr_format.num_levels = 256;
+	chain->change_ycbcr_output_format(output_ycbcr_format);
+
 	const int64_t av_delay = lrint(global_flags.audio_queue_length_ms * 0.001 * TIMEBASE);  // Corresponds to the delay in ResamplingQueue.
 	GLuint y_tex, cbcr_tex;
-	bool got_frame = video_encoder->begin_frame(pts_int + av_delay, duration, theme_main_chain.input_frames, &y_tex, &cbcr_tex);
+	bool got_frame = video_encoder->begin_frame(pts_int + av_delay, duration, ycbcr_output_coefficients, theme_main_chain.input_frames, &y_tex, &cbcr_tex);
 	assert(got_frame);
 
 	// Render main chain. We take an extra copy of the created outputs,
@@ -1040,7 +1058,7 @@ void Mixer::render_one_frame(int64_t duration)
 	GLuint cbcr_copy_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width / 2, global_flags.height / 2);
 	chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex, cbcr_copy_tex);
 	if (output_card_index != -1) {
-		cards[output_card_index].output->send_frame(y_tex, cbcr_full_tex, theme_main_chain.input_frames, pts_int, duration);
+		cards[output_card_index].output->send_frame(y_tex, cbcr_full_tex, ycbcr_output_coefficients, theme_main_chain.input_frames, pts_int, duration);
 	}
 	resource_pool->release_2d_texture(cbcr_full_tex);
 
diff --git a/mux.cpp b/mux.cpp
index a7b3a7f..1fd8e30 100644
--- a/mux.cpp
+++ b/mux.cpp
@@ -22,6 +22,7 @@ extern "C" {
 }
 
 #include "defs.h"
+#include "flags.h"
 #include "timebase.h"
 
 using namespace std;
@@ -68,9 +69,16 @@ Mux::Mux(AVFormatContext *avctx, int width, int height, Codec video_codec, const
 	// as noted in each comment.
 	// Note that the H.264 stream also contains this information and depending on the
 	// mux, this might simply get ignored. See sps_rbsp().
+	// Note that there's no way to change this per-frame as the H.264 stream
+	// would like to be able to.
 	avstream_video->codecpar->color_primaries = AVCOL_PRI_BT709;  // RGB colorspace (inout_format.color_space).
 	avstream_video->codecpar->color_trc = AVCOL_TRC_UNSPECIFIED;  // Gamma curve (inout_format.gamma_curve).
-	avstream_video->codecpar->color_space = AVCOL_SPC_SMPTE170M;  // YUV colorspace (output_ycbcr_format.luma_coefficients).
+	// YUV colorspace (output_ycbcr_format.luma_coefficients).
+	if (global_flags.ycbcr_rec709_coefficients) {
+		avstream_video->codecpar->color_space = AVCOL_SPC_BT709;
+	} else {
+		avstream_video->codecpar->color_space = AVCOL_SPC_SMPTE170M;
+	}
 	avstream_video->codecpar->color_range = AVCOL_RANGE_MPEG;  // Full vs. limited range (output_ycbcr_format.full_range).
 	avstream_video->codecpar->chroma_location = AVCHROMA_LOC_LEFT;  // Chroma sample location. See chroma_offset_0[] in Mixer::subsample_chroma().
 	avstream_video->codecpar->field_order = AV_FIELD_PROGRESSIVE;
diff --git a/quicksync_encoder.cpp b/quicksync_encoder.cpp
index 2e8633d..d49a483 100644
--- a/quicksync_encoder.cpp
+++ b/quicksync_encoder.cpp
@@ -1,5 +1,6 @@
 #include "quicksync_encoder.h"
 
+#include <movit/image_format.h>
 #include <movit/resource_pool.h>  // Must be above the Xlib includes.
 #include <movit/util.h>
 
@@ -55,6 +56,7 @@ extern "C" {
 #include "timebase.h"
 #include "x264_encoder.h"
 
+using namespace movit;
 using namespace std;
 using namespace std::chrono;
 using namespace std::placeholders;
@@ -259,7 +261,7 @@ static void nal_header(bitstream *bs, int nal_ref_idc, int nal_unit_type)
     bitstream_put_ui(bs, nal_unit_type, 5);
 }
 
-void QuickSyncEncoderImpl::sps_rbsp(bitstream *bs)
+void QuickSyncEncoderImpl::sps_rbsp(YCbCrLumaCoefficients ycbcr_coefficients, bitstream *bs)
 {
     int profile_idc = PROFILE_IDC_BASELINE;
 
@@ -331,9 +333,10 @@ void QuickSyncEncoderImpl::sps_rbsp(bitstream *bs)
             {
                 bitstream_put_ui(bs, 1, 8);  /* colour_primaries (1 = BT.709) */
                 bitstream_put_ui(bs, 2, 8);  /* transfer_characteristics (2 = unspecified, since we use sRGB) */
-                if (global_flags.ycbcr_rec709_coefficients) {
+                if (ycbcr_coefficients == YCBCR_REC_709) {
                     bitstream_put_ui(bs, 1, 8);  /* matrix_coefficients (1 = BT.709) */
                 } else {
+                    assert(ycbcr_coefficients == YCBCR_REC_601);
                     bitstream_put_ui(bs, 6, 8);  /* matrix_coefficients (6 = BT.601/SMPTE 170M) */
                 }
             }
@@ -515,14 +518,14 @@ int QuickSyncEncoderImpl::build_packed_pic_buffer(unsigned char **header_buffer)
 }
 
 int
-QuickSyncEncoderImpl::build_packed_seq_buffer(unsigned char **header_buffer)
+QuickSyncEncoderImpl::build_packed_seq_buffer(YCbCrLumaCoefficients ycbcr_coefficients, unsigned char **header_buffer)
 {
     bitstream bs;
 
     bitstream_start(&bs);
     nal_start_code_prefix(&bs);
     nal_header(&bs, NAL_REF_IDC_HIGH, NAL_SPS);
-    sps_rbsp(&bs);
+    sps_rbsp(ycbcr_coefficients, &bs);
     bitstream_end(&bs);
 
     *header_buffer = (unsigned char *)bs.buffer;
@@ -1220,7 +1223,7 @@ int QuickSyncEncoderImpl::render_picture(GLSurface *surf, int frame_type, int di
     return 0;
 }
 
-int QuickSyncEncoderImpl::render_packedsequence()
+int QuickSyncEncoderImpl::render_packedsequence(YCbCrLumaCoefficients ycbcr_coefficients)
 {
     VAEncPackedHeaderParameterBuffer packedheader_param_buffer;
     VABufferID packedseq_para_bufid, packedseq_data_bufid, render_id[2];
@@ -1228,7 +1231,7 @@ int QuickSyncEncoderImpl::render_packedsequence()
     unsigned char *packedseq_buffer = NULL;
     VAStatus va_status;
 
-    length_in_bits = build_packed_seq_buffer(&packedseq_buffer); 
+    length_in_bits = build_packed_seq_buffer(ycbcr_coefficients, &packedseq_buffer); 
     
     packedheader_param_buffer.type = VAEncPackedHeaderSequence;
     
@@ -1526,7 +1529,7 @@ int QuickSyncEncoderImpl::deinit_va()
     return 0;
 }
 
-QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator)
+QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator)
 	: current_storage_frame(0), resource_pool(resource_pool), surface(surface), x264_encoder(x264_encoder), frame_width(width), frame_height(height), disk_space_estimator(disk_space_estimator)
 {
 	file_audio_encoder.reset(new AudioEncoder(AUDIO_OUTPUT_CODEC_NAME, DEFAULT_AUDIO_OUTPUT_BIT_RATE, oformat));
@@ -1595,7 +1598,7 @@ void QuickSyncEncoderImpl::release_gl_surface(size_t display_frame_num)
 	}
 }
 
-bool QuickSyncEncoderImpl::begin_frame(int64_t pts, int64_t duration, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
+bool QuickSyncEncoderImpl::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
 	assert(!is_shutdown);
 	GLSurface *surf = nullptr;
@@ -1669,7 +1672,7 @@ bool QuickSyncEncoderImpl::begin_frame(int64_t pts, int64_t duration, const vect
 		glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->cbcr_egl_image);
 	}
 
-	current_video_frame = PendingFrame{ {}, input_frames, pts, duration };
+	current_video_frame = PendingFrame{ {}, input_frames, pts, duration, ycbcr_coefficients };
 
 	return true;
 }
@@ -1850,7 +1853,7 @@ void QuickSyncEncoderImpl::encode_thread_func()
 			}
 			last_dts = dts;
 
-			encode_frame(frame, quicksync_encoding_frame_num, quicksync_display_frame_num, gop_start_display_frame_num, frame_type, frame.pts, dts, frame.duration);
+			encode_frame(frame, quicksync_encoding_frame_num, quicksync_display_frame_num, gop_start_display_frame_num, frame_type, frame.pts, dts, frame.duration, frame.ycbcr_coefficients);
 			++quicksync_encoding_frame_num;
 		}
 	}
@@ -1868,7 +1871,7 @@ void QuickSyncEncoderImpl::encode_remaining_frames_as_p(int encoding_frame_num,
 		PendingFrame frame = move(pending_frame.second);
 		int64_t dts = last_dts + (TIMEBASE / MAX_FPS);
 		printf("Finalizing encode: Encoding leftover frame %d as P-frame instead of B-frame.\n", display_frame_num);
-		encode_frame(frame, encoding_frame_num++, display_frame_num, gop_start_display_frame_num, FRAME_P, frame.pts, dts, frame.duration);
+		encode_frame(frame, encoding_frame_num++, display_frame_num, gop_start_display_frame_num, FRAME_P, frame.pts, dts, frame.duration, frame.ycbcr_coefficients);
 		last_dts = dts;
 	}
 }
@@ -1931,12 +1934,12 @@ void QuickSyncEncoderImpl::pass_frame(QuickSyncEncoderImpl::PendingFrame frame,
 	if (global_flags.uncompressed_video_to_http) {
 		add_packet_for_uncompressed_frame(pts, duration, data);
 	} else if (global_flags.x264_video_to_http) {
-		x264_encoder->add_frame(pts, duration, data, received_ts);
+		x264_encoder->add_frame(pts, duration, frame.ycbcr_coefficients, data, received_ts);
 	}
 }
 
 void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame, int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num,
-                                        int frame_type, int64_t pts, int64_t dts, int64_t duration)
+                                        int frame_type, int64_t pts, int64_t dts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients)
 {
 	const ReceivedTimestamps received_ts = find_received_timestamp(frame.input_frames);
 
@@ -1980,10 +1983,14 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
 		// FIXME: If the mux wants global headers, we should not put the
 		// SPS/PPS before each IDR frame, but rather put it into the
 		// codec extradata (formatted differently?).
+		//
+		// NOTE: If we change ycbcr_coefficients, it will not take effect
+		// before the next IDR frame. This is acceptable, as it should only
+		// happen on a mode change, which is rare.
 		render_sequence();
 		render_picture(surf, frame_type, display_frame_num, gop_start_display_frame_num);
 		if (h264_packedheader) {
-			render_packedsequence();
+			render_packedsequence(ycbcr_coefficients);
 			render_packedpicture();
 		}
 	} else {
@@ -2018,13 +2025,14 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
 	tmp.pts = pts;
 	tmp.dts = dts;
 	tmp.duration = duration;
+	tmp.ycbcr_coefficients = ycbcr_coefficients;
 	tmp.received_ts = received_ts;
 	tmp.ref_display_frame_numbers = move(ref_display_frame_numbers);
 	storage_task_enqueue(move(tmp));
 }
 
 // Proxy object.
-QuickSyncEncoder::QuickSyncEncoder(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator)
+QuickSyncEncoder::QuickSyncEncoder(const std::string &filename, ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator)
 	: impl(new QuickSyncEncoderImpl(filename, resource_pool, surface, va_display, width, height, oformat, x264_encoder, disk_space_estimator)) {}
 
 // Must be defined here because unique_ptr<> destructor needs to know the impl.
@@ -2035,9 +2043,9 @@ void QuickSyncEncoder::add_audio(int64_t pts, vector<float> audio)
 	impl->add_audio(pts, audio);
 }
 
-bool QuickSyncEncoder::begin_frame(int64_t pts, int64_t duration, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
+bool QuickSyncEncoder::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
-	return impl->begin_frame(pts, duration, input_frames, y_tex, cbcr_tex);
+	return impl->begin_frame(pts, duration, ycbcr_coefficients, input_frames, y_tex, cbcr_tex);
 }
 
 RefCountedGLsync QuickSyncEncoder::end_frame()
diff --git a/quicksync_encoder.h b/quicksync_encoder.h
index caa6586..a247ee8 100644
--- a/quicksync_encoder.h
+++ b/quicksync_encoder.h
@@ -27,6 +27,7 @@
 #define _H264ENCODE_H
 
 #include <epoxy/gl.h>
+#include <movit/image_format.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <memory>
@@ -60,7 +61,7 @@ public:
 
 	void set_stream_mux(Mux *mux);  // Does not take ownership. Must be called unless x264 is used for the stream.
 	void add_audio(int64_t pts, std::vector<float> audio);
-	bool begin_frame(int64_t pts, int64_t duration, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
+	bool begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
 	RefCountedGLsync end_frame();
 	void shutdown();  // Blocking. Does not require an OpenGL context.
 	void release_gl_resources();  // Requires an OpenGL context. Must be run after shutdown.
diff --git a/quicksync_encoder_impl.h b/quicksync_encoder_impl.h
index b55edbb..679f2a2 100644
--- a/quicksync_encoder_impl.h
+++ b/quicksync_encoder_impl.h
@@ -2,6 +2,7 @@
 #define _QUICKSYNC_ENCODER_IMPL_H 1
 
 #include <epoxy/egl.h>
+#include <movit/image_format.h>
 #include <va/va.h>
 
 #include <condition_variable>
@@ -35,7 +36,7 @@ public:
 	QuickSyncEncoderImpl(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const std::string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator);
 	~QuickSyncEncoderImpl();
 	void add_audio(int64_t pts, std::vector<float> audio);
-	bool begin_frame(int64_t pts, int64_t duration, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
+	bool begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
 	RefCountedGLsync end_frame();
 	void shutdown();
 	void release_gl_resources();
@@ -55,6 +56,7 @@ private:
 		int frame_type;
 		std::vector<float> audio;
 		int64_t pts, dts, duration;
+		movit::YCbCrLumaCoefficients ycbcr_coefficients;
 		ReceivedTimestamps received_ts;
 		std::vector<size_t> ref_display_frame_numbers;
 	};
@@ -62,6 +64,7 @@ private:
 		RefCountedGLsync fence;
 		std::vector<RefCountedFrame> input_frames;
 		int64_t pts, duration;
+		movit::YCbCrLumaCoefficients ycbcr_coefficients;
 	};
 	struct GLSurface {
 		VASurfaceID src_surface, ref_surface;
@@ -99,21 +102,21 @@ private:
 	void add_packet_for_uncompressed_frame(int64_t pts, int64_t duration, const uint8_t *data);
 	void pass_frame(PendingFrame frame, int display_frame_num, int64_t pts, int64_t duration);
 	void encode_frame(PendingFrame frame, int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num,
-	                  int frame_type, int64_t pts, int64_t dts, int64_t duration);
+	                  int frame_type, int64_t pts, int64_t dts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients);
 	void storage_task_thread();
 	void storage_task_enqueue(storage_task task);
 	void save_codeddata(GLSurface *surf, storage_task task);
-	int render_packedsequence();
+	int render_packedsequence(movit::YCbCrLumaCoefficients ycbcr_coefficients);
 	int render_packedpicture();
 	void render_packedslice();
 	int render_sequence();
 	int render_picture(GLSurface *surf, int frame_type, int display_frame_num, int gop_start_display_frame_num);
-	void sps_rbsp(bitstream *bs);
+	void sps_rbsp(movit::YCbCrLumaCoefficients ycbcr_coefficients, bitstream *bs);
 	void pps_rbsp(bitstream *bs);
 	int build_packed_pic_buffer(unsigned char **header_buffer);
 	int render_slice(int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num, int frame_type);
 	void slice_header(bitstream *bs);
-	int build_packed_seq_buffer(unsigned char **header_buffer);
+	int build_packed_seq_buffer(movit::YCbCrLumaCoefficients ycbcr_coefficients, unsigned char **header_buffer);
 	int build_packed_slice_buffer(unsigned char **header_buffer);
 	int init_va(const std::string &va_display);
 	int deinit_va();
diff --git a/theme.cpp b/theme.cpp
index c5f335e..7bb1877 100644
--- a/theme.cpp
+++ b/theme.cpp
@@ -265,11 +265,14 @@ int EffectChain_finalize(lua_State* L)
 		// happens in a pass not run by Movit (see ChromaSubsampler::subsample_chroma()).
 		output_ycbcr_format.chroma_subsampling_x = 1;
 		output_ycbcr_format.chroma_subsampling_y = 1;
+
+		// This will be overridden if HDMI/SDI output is in force.
 		if (global_flags.ycbcr_rec709_coefficients) {
 			output_ycbcr_format.luma_coefficients = YCBCR_REC_709;
 		} else {
 			output_ycbcr_format.luma_coefficients = YCBCR_REC_601;
 		}
+
 		output_ycbcr_format.full_range = false;
 		output_ycbcr_format.num_levels = 256;
 
diff --git a/video_encoder.cpp b/video_encoder.cpp
index e00465c..fe0ecb1 100644
--- a/video_encoder.cpp
+++ b/video_encoder.cpp
@@ -120,11 +120,11 @@ void VideoEncoder::add_audio(int64_t pts, std::vector<float> audio)
 	stream_audio_encoder->encode_audio(audio, pts + quicksync_encoder->global_delay());
 }
 
-bool VideoEncoder::begin_frame(int64_t pts, int64_t duration, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
+bool VideoEncoder::begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
 	lock_guard<mutex> lock(qs_mu);
 	qs_needing_cleanup.clear();  // Since we have an OpenGL context here, and are called regularly.
-	return quicksync_encoder->begin_frame(pts, duration, input_frames, y_tex, cbcr_tex);
+	return quicksync_encoder->begin_frame(pts, duration, ycbcr_coefficients, input_frames, y_tex, cbcr_tex);
 }
 
 RefCountedGLsync VideoEncoder::end_frame()
diff --git a/video_encoder.h b/video_encoder.h
index 368037d..8578462 100644
--- a/video_encoder.h
+++ b/video_encoder.h
@@ -6,6 +6,7 @@
 #define _VIDEO_ENCODER_H
 
 #include <epoxy/gl.h>
+#include <movit/image_format.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <atomic>
@@ -44,7 +45,7 @@ public:
 	// Allocate a frame to render into. The returned two textures
 	// are yours to render into (build them into an FBO).
 	// Call end_frame() when you're done.
-	bool begin_frame(int64_t pts, int64_t duration, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
+	bool begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
 
 	// Call after you are done rendering into the frame; at this point,
 	// y_tex and cbcr_tex will be assumed done, and handed over to the
diff --git a/x264_encoder.cpp b/x264_encoder.cpp
index f9b5624..7d81d55 100644
--- a/x264_encoder.cpp
+++ b/x264_encoder.cpp
@@ -20,6 +20,7 @@ extern "C" {
 #include <libavformat/avformat.h>
 }
 
+using namespace movit;
 using namespace std;
 using namespace std::chrono;
 
@@ -58,11 +59,12 @@ X264Encoder::~X264Encoder()
 	encoder_thread.join();
 }
 
-void X264Encoder::add_frame(int64_t pts, int64_t duration, const uint8_t *data, const ReceivedTimestamps &received_ts)
+void X264Encoder::add_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const uint8_t *data, const ReceivedTimestamps &received_ts)
 {
 	QueuedFrame qf;
 	qf.pts = pts;
 	qf.duration = duration;
+	qf.ycbcr_coefficients = ycbcr_coefficients;
 	qf.received_ts = received_ts;
 
 	{
@@ -265,18 +267,36 @@ void X264Encoder::encode_frame(X264Encoder::QueuedFrame qf)
 	// See if we have a new bitrate to change to.
 	unsigned new_rate = new_bitrate_kbit.exchange(0);  // Read and clear.
 	if (new_rate != 0) {
-		if (speed_control) {
-			speed_control->set_config_override_function([new_rate](x264_param_t *param) {
-				param->rc.i_bitrate = new_rate;
-				update_vbv_settings(param);
-			});
+		bitrate_override_func = [new_rate](x264_param_t *param) {
+			param->rc.i_bitrate = new_rate;
+			update_vbv_settings(param);
+		};
+	}
+
+	auto ycbcr_coefficients_override_func = [qf](x264_param_t *param) {
+		if (qf.ycbcr_coefficients == YCBCR_REC_709) {
+			param->vui.i_colmatrix = 1;  // BT.709.
 		} else {
-			x264_param_t param;
-			x264_encoder_parameters(x264, &param);
-			param.rc.i_bitrate = new_rate;
-			update_vbv_settings(&param);
-			x264_encoder_reconfig(x264, &param);
+			assert(qf.ycbcr_coefficients == YCBCR_REC_601);
+			param->vui.i_colmatrix = 6;  // BT.601/SMPTE 170M.
+		}
+	};
+
+	if (speed_control) {
+		speed_control->set_config_override_function([this, ycbcr_coefficients_override_func](x264_param_t *param) {
+			if (bitrate_override_func) {
+				bitrate_override_func(param);
+			}
+			ycbcr_coefficients_override_func(param);
+		});
+	} else {
+		x264_param_t param;
+		x264_encoder_parameters(x264, &param);
+		if (bitrate_override_func) {
+			bitrate_override_func(&param);
 		}
+		ycbcr_coefficients_override_func(&param);
+		x264_encoder_reconfig(x264, &param);
 	}
 
 	if (speed_control) {
diff --git a/x264_encoder.h b/x264_encoder.h
index 8adb42a..2e64e66 100644
--- a/x264_encoder.h
+++ b/x264_encoder.h
@@ -33,6 +33,8 @@ extern "C" {
 #include <libavformat/avformat.h>
 }
 
+#include <movit/image_format.h>
+
 #include "print_latency.h"
 
 class Mux;
@@ -51,7 +53,7 @@ public:
 
 	// <data> is taken to be raw NV12 data of WIDTHxHEIGHT resolution.
 	// Does not block.
-	void add_frame(int64_t pts, int64_t duration, const uint8_t *data, const ReceivedTimestamps &received_ts);
+	void add_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const uint8_t *data, const ReceivedTimestamps &received_ts);
 
 	std::string get_global_headers() const {
 		while (!x264_init_done) {
@@ -67,6 +69,7 @@ public:
 private:
 	struct QueuedFrame {
 		int64_t pts, duration;
+		movit::YCbCrLumaCoefficients ycbcr_coefficients;
 		uint8_t *data;
 		ReceivedTimestamps received_ts;
 	};
@@ -91,6 +94,8 @@ private:
 	x264_t *x264;
 	std::unique_ptr<X264SpeedControl> speed_control;
 
+	std::function<void(x264_param_t *)> bitrate_override_func;
+
 	std::atomic<unsigned> new_bitrate_kbit{0};  // 0 for no change.
 
 	// Protects everything below it.
-- 
2.39.2