From: Steinar H. Gunderson <sgunderson@bigfoot.com>
Date: Wed, 15 Mar 2017 22:07:24 +0000 (+0100)
Subject: Support 4:2:2 v210 (10-bit) output for DeckLink.
X-Git-Tag: 1.5.0~9
X-Git-Url: https://git.sesse.net/?p=nageru;a=commitdiff_plain;h=aa472f3f2fcf37701198deb330d3169636151060

Support 4:2:2 v210 (10-bit) output for DeckLink.

This again requires compute shaders; my GTX 950 needs a bit under 0.1 ms
to convert a 720p frame from the 16-bit planar representation. It replaces the
flag for 10-bit x264.

v210 is, as far as I understand, pretty much the native format for the DeckLink
cards, but I believe the conversion happens in hardware, so there shouldn't
be any significant speed gains to be had.
---

diff --git a/chroma_subsampler.cpp b/chroma_subsampler.cpp
index a9e5355..cf46883 100644
--- a/chroma_subsampler.cpp
+++ b/chroma_subsampler.cpp
@@ -1,4 +1,5 @@
 #include "chroma_subsampler.h"
+#include "v210_converter.h"
 
 #include <vector>
 
@@ -170,6 +171,78 @@ ChromaSubsampler::ChromaSubsampler(ResourcePool *resource_pool)
 	};
 	vbo = generate_vbo(2, GL_FLOAT, sizeof(vertices), vertices);
 	check_error();
+
+	// v210 compute shader.
+	if (v210Converter::has_hardware_support()) {
+		string v210_shader_src = R"(#version 150
+#extension GL_ARB_compute_shader : enable
+#extension GL_ARB_shader_image_load_store : enable
+layout(local_size_x=2, local_size_y=16) in;
+layout(r16) uniform restrict readonly image2D in_y;
+uniform sampler2D in_cbcr;  // Of type RG16.
+layout(rgb10_a2) uniform restrict writeonly image2D outbuf;
+uniform float inv_width, inv_height;
+
+void main()
+{
+	int xb = int(gl_GlobalInvocationID.x);  // X block number.
+	int y = int(gl_GlobalInvocationID.y);  // Y (actual line).
+	float yf = (gl_GlobalInvocationID.y + 0.5f) * inv_height;  // Y float coordinate.
+
+	// Load and scale CbCr values, sampling in-between the texels to get
+	// to (left/4 + center/2 + right/4).
+	vec2 pix_cbcr[3];
+	for (int i = 0; i < 3; ++i) {
+		vec2 a = texture(in_cbcr, vec2((xb * 6 + i * 2) * inv_width, yf)).xy;
+		vec2 b = texture(in_cbcr, vec2((xb * 6 + i * 2 + 1) * inv_width, yf)).xy;
+		pix_cbcr[i] = (a + b) * (0.5 * 65535.0 / 1023.0);
+	}
+
+	// Load and scale the Y values. Note that we use integer coordinates here,
+	// so we don't need to offset by 0.5.
+	float pix_y[6];
+	for (int i = 0; i < 6; ++i) {
+		pix_y[i] = imageLoad(in_y, ivec2(xb * 6 + i, y)).x * (65535.0 / 1023.0);
+	}
+
+	imageStore(outbuf, ivec2(xb * 4 + 0, y), vec4(pix_cbcr[0].x, pix_y[0],      pix_cbcr[0].y, 1.0));
+	imageStore(outbuf, ivec2(xb * 4 + 1, y), vec4(pix_y[1],      pix_cbcr[1].x, pix_y[2],      1.0));
+	imageStore(outbuf, ivec2(xb * 4 + 2, y), vec4(pix_cbcr[1].y, pix_y[3],      pix_cbcr[2].x, 1.0));
+	imageStore(outbuf, ivec2(xb * 4 + 3, y), vec4(pix_y[4],      pix_cbcr[2].y, pix_y[5],      1.0));
+}
+)";
+		GLuint shader_num = movit::compile_shader(v210_shader_src, GL_COMPUTE_SHADER);
+		check_error();
+		v210_program_num = glCreateProgram();
+		check_error();
+		glAttachShader(v210_program_num, shader_num);
+		check_error();
+		glLinkProgram(v210_program_num);
+		check_error();
+
+		GLint success;
+		glGetProgramiv(v210_program_num, GL_LINK_STATUS, &success);
+		check_error();
+		if (success == GL_FALSE) {
+			GLchar error_log[1024] = {0};
+			glGetProgramInfoLog(v210_program_num, 1024, NULL, error_log);
+			fprintf(stderr, "Error linking program: %s\n", error_log);
+			exit(1);
+		}
+
+		v210_in_y_pos = glGetUniformLocation(v210_program_num, "in_y");
+		check_error();
+		v210_in_cbcr_pos = glGetUniformLocation(v210_program_num, "in_cbcr");
+		check_error();
+		v210_outbuf_pos = glGetUniformLocation(v210_program_num, "outbuf");
+		check_error();
+		v210_inv_width_pos = glGetUniformLocation(v210_program_num, "inv_width");
+		check_error();
+		v210_inv_height_pos = glGetUniformLocation(v210_program_num, "inv_height");
+		check_error();
+	} else {
+		v210_program_num = 0;
+	}
 }
 
 ChromaSubsampler::~ChromaSubsampler()
@@ -180,6 +253,10 @@ ChromaSubsampler::~ChromaSubsampler()
 	check_error();
 	glDeleteBuffers(1, &vbo);
 	check_error();
+	if (v210_program_num != 0) {
+		glDeleteProgram(v210_program_num);
+		check_error();
+	}
 }
 
 void ChromaSubsampler::subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex, GLuint dst2_tex)
@@ -334,3 +411,60 @@ void ChromaSubsampler::create_uyvy(GLuint y_tex, GLuint cbcr_tex, unsigned width
 	resource_pool->release_fbo(fbo);
 	glDeleteVertexArrays(1, &vao);
 }
+
+void ChromaSubsampler::create_v210(GLuint y_tex, GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex)
+{
+	assert(v210_program_num != 0);
+
+	glUseProgram(v210_program_num);
+	check_error();
+
+	glUniform1i(v210_in_y_pos, 0);
+	check_error();
+	glUniform1i(v210_in_cbcr_pos, 1);
+	check_error();
+	glUniform1i(v210_outbuf_pos, 2);
+	check_error();
+	glUniform1f(v210_inv_width_pos, 1.0 / width);
+	check_error();
+	glUniform1f(v210_inv_height_pos, 1.0 / height);
+	check_error();
+
+	glActiveTexture(GL_TEXTURE0);
+	check_error();
+	glBindTexture(GL_TEXTURE_2D, y_tex);  // We don't actually need to bind it, but we need to set the state.
+	check_error();
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+	check_error();
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+	check_error();
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+	check_error();
+	glBindImageTexture(0, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16);  // This is the real bind.
+	check_error();
+
+	glActiveTexture(GL_TEXTURE1);
+	check_error();
+	glBindTexture(GL_TEXTURE_2D, cbcr_tex);
+	check_error();
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+	check_error();
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+	check_error();
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+	check_error();
+
+	glBindImageTexture(2, dst_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGB10_A2);
+	check_error();
+
+	// Actually run the shader. We use workgroups of size 2x16 threads, and each thread
+	// processes 6x1 input pixels, so round up to number of 12x16 pixel blocks.
+	glDispatchCompute((width + 11) / 12, (height + 15) / 16, 1);
+
+	glBindTexture(GL_TEXTURE_2D, 0);
+	check_error();
+	glActiveTexture(GL_TEXTURE0);
+	check_error();
+	glUseProgram(0);
+	check_error();
+}
diff --git a/chroma_subsampler.h b/chroma_subsampler.h
index 1bed433..d4c1c1e 100644
--- a/chroma_subsampler.h
+++ b/chroma_subsampler.h
@@ -27,6 +27,14 @@ public:
 	// width and height are the dimensions (in pixels) of the input textures.
 	void create_uyvy(GLuint y_tex, GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex);
 
+	// Subsamples and interleaves luma and chroma to give 10-bit 4:2:2
+	// packed Y'CbCr (v210); see v210_converter.h for more information on
+	// the format. Luma and chroma are assumed to be 10-bit data packed
+	// into 16-bit textures. Chroma positioning is left (H.264 convention).
+	// width and height are the dimensions (in pixels) of the input textures.
+	// Requires compute shaders; check v210Converter::has_hardware_support().
+	void create_v210(GLuint y_tex, GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex);
+
 private:
 	movit::ResourcePool *resource_pool;
 
@@ -39,6 +47,10 @@ private:
 	GLuint uyvy_program_num;  // Owned by <resource_pool>.
 	GLuint uyvy_y_texture_sampler_uniform, uyvy_cbcr_texture_sampler_uniform;
 	GLuint uyvy_position_attribute_index, uyvy_texcoord_attribute_index;
+
+	GLuint v210_program_num;  // Compute shader, so owned by ourselves. Can be 0.
+	GLuint v210_in_y_pos, v210_in_cbcr_pos, v210_outbuf_pos;
+	GLuint v210_inv_width_pos, v210_inv_height_pos;
 };
 
 #endif  // !defined(_CHROMA_SUBSAMPLER_H)
diff --git a/decklink_output.cpp b/decklink_output.cpp
index 3b00d28..3ce692b 100644
--- a/decklink_output.cpp
+++ b/decklink_output.cpp
@@ -12,6 +12,7 @@
 #include "print_latency.h"
 #include "resource_pool.h"
 #include "timebase.h"
+#include "v210_converter.h"
 
 using namespace movit;
 using namespace std;
@@ -96,7 +97,8 @@ void DeckLinkOutput::start_output(uint32_t mode, int64_t base_pts)
 
 	BMDDisplayModeSupport support;
 	IDeckLinkDisplayMode *display_mode;
-	if (output->DoesSupportVideoMode(mode, bmdFormat8BitYUV, bmdVideoOutputFlagDefault,
+	BMDPixelFormat pixel_format = global_flags.ten_bit_output ? bmdFormat10BitYUV : bmdFormat8BitYUV;
+	if (output->DoesSupportVideoMode(mode, pixel_format, bmdVideoOutputFlagDefault,
 	                                 &support, &display_mode) != S_OK) {
 		fprintf(stderr, "Couldn't ask for format support\n");
 		exit(1);
@@ -198,7 +200,11 @@ void DeckLinkOutput::send_frame(GLuint y_tex, GLuint cbcr_tex, YCbCrLumaCoeffici
 	}
 
 	unique_ptr<Frame> frame = move(get_frame());
-	chroma_subsampler->create_uyvy(y_tex, cbcr_tex, width, height, frame->uyvy_tex);
+	if (global_flags.ten_bit_output) {
+		chroma_subsampler->create_v210(y_tex, cbcr_tex, width, height, frame->uyvy_tex);
+	} else {
+		chroma_subsampler->create_uyvy(y_tex, cbcr_tex, width, height, frame->uyvy_tex);
+	}
 
 	// Download the UYVY texture to the PBO.
 	glPixelStorei(GL_PACK_ROW_LENGTH, 0);
@@ -207,10 +213,17 @@ void DeckLinkOutput::send_frame(GLuint y_tex, GLuint cbcr_tex, YCbCrLumaCoeffici
 	glBindBuffer(GL_PIXEL_PACK_BUFFER, frame->pbo);
 	check_error();
 
-	glBindTexture(GL_TEXTURE_2D, frame->uyvy_tex);
-	check_error();
-	glGetTexImage(GL_TEXTURE_2D, 0, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, BUFFER_OFFSET(0));
-	check_error();
+	if (global_flags.ten_bit_output) {
+		glBindTexture(GL_TEXTURE_2D, frame->uyvy_tex);
+		check_error();
+		glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, BUFFER_OFFSET(0));
+		check_error();
+	} else {
+		glBindTexture(GL_TEXTURE_2D, frame->uyvy_tex);
+		check_error();
+		glGetTexImage(GL_TEXTURE_2D, 0, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, BUFFER_OFFSET(0));
+		check_error();
+	}
 
 	glBindTexture(GL_TEXTURE_2D, 0);
 	check_error();
@@ -406,17 +419,31 @@ unique_ptr<DeckLinkOutput::Frame> DeckLinkOutput::get_frame()
 
 	unique_ptr<Frame> frame(new Frame);
 
-	frame->uyvy_tex = resource_pool->create_2d_texture(GL_RGBA8, width / 2, height);
+	size_t stride;
+	if (global_flags.ten_bit_output) {
+		stride = v210Converter::get_v210_stride(width);
+		GLint v210_width = stride / sizeof(uint32_t);
+		frame->uyvy_tex = resource_pool->create_2d_texture(GL_RGB10_A2, v210_width, height);
+
+		// We need valid texture state, or NVIDIA won't allow us to write to the texture.
+		glBindTexture(GL_TEXTURE_2D, frame->uyvy_tex);
+		check_error();
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+		check_error();
+	} else {
+		stride = width * 2;
+		frame->uyvy_tex = resource_pool->create_2d_texture(GL_RGBA8, width / 2, height);
+	}
 
 	glGenBuffers(1, &frame->pbo);
 	check_error();
 	glBindBuffer(GL_PIXEL_PACK_BUFFER, frame->pbo);
 	check_error();
-	glBufferStorage(GL_PIXEL_PACK_BUFFER, width * height * 2, NULL, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+	glBufferStorage(GL_PIXEL_PACK_BUFFER, stride * height, NULL, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
 	check_error();
-	frame->uyvy_ptr = (uint8_t *)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, width * height * 2, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+	frame->uyvy_ptr = (uint8_t *)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, stride * height, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
 	check_error();
-	frame->uyvy_ptr_local.reset(new uint8_t[width * height * 2]);
+	frame->uyvy_ptr_local.reset(new uint8_t[stride * height]);
 	frame->resource_pool = resource_pool;
 
 	return frame;
@@ -444,7 +471,11 @@ void DeckLinkOutput::present_thread_func()
 		check_error();
 		frame->fence.reset();
 
-		memcpy(frame->uyvy_ptr_local.get(), frame->uyvy_ptr, width * height * 2);
+		if (global_flags.ten_bit_output) {
+			memcpy(frame->uyvy_ptr_local.get(), frame->uyvy_ptr, v210Converter::get_v210_stride(width) * height);
+		} else {
+			memcpy(frame->uyvy_ptr_local.get(), frame->uyvy_ptr, width * height * 2);
+		}
 
 		// Release any input frames we needed to render this frame.
 		frame->input_frames.clear();
@@ -526,12 +557,20 @@ long DeckLinkOutput::Frame::GetHeight()
 
 long DeckLinkOutput::Frame::GetRowBytes()
 {
-	return global_flags.width * 2;
+	if (global_flags.ten_bit_output) {
+		return v210Converter::get_v210_stride(global_flags.width);
+	} else {
+		return global_flags.width * 2;
+	}
 }
 
 BMDPixelFormat DeckLinkOutput::Frame::GetPixelFormat()
 {
-	return bmdFormat8BitYUV;
+	if (global_flags.ten_bit_output) {
+		return bmdFormat10BitYUV;
+	} else {
+		return bmdFormat8BitYUV;
+	}
 }
 
 BMDFrameFlags DeckLinkOutput::Frame::GetFlags()
diff --git a/decklink_output.h b/decklink_output.h
index 5581c39..7c0a17f 100644
--- a/decklink_output.h
+++ b/decklink_output.h
@@ -102,7 +102,7 @@ private:
 		movit::ResourcePool *resource_pool;
 
 		// These members are persistently allocated, and reused when the frame object is.
-		GLuint uyvy_tex;  // Owned by <resource_pool>.
+		GLuint uyvy_tex;  // Owned by <resource_pool>. Can also hold v210 data.
 		GLuint pbo;
 		uint8_t *uyvy_ptr;  // Persistent mapping into the PBO.
 
diff --git a/flags.cpp b/flags.cpp
index e6c306b..ee575c5 100644
--- a/flags.cpp
+++ b/flags.cpp
@@ -27,7 +27,6 @@ enum LongOption {
 	OPTION_X264_BITRATE,
 	OPTION_X264_VBV_BUFSIZE,
 	OPTION_X264_VBV_MAX_BITRATE,
-	OPTION_X264_10_BIT,
 	OPTION_X264_PARAM,
 	OPTION_HTTP_MUX,
 	OPTION_HTTP_COARSE_TIMEBASE,
@@ -56,6 +55,7 @@ enum LongOption {
 	OPTION_TIMECODE_STREAM,
 	OPTION_TIMECODE_STDOUT,
 	OPTION_10_BIT_INPUT,
+	OPTION_10_BIT_OUTPUT,
 };
 
 void usage()
@@ -90,7 +90,6 @@ void usage()
 	fprintf(stderr, "                                  default: same as --x264-bitrate, that is, one-second VBV)\n");
 	fprintf(stderr, "      --x264-vbv-max-bitrate      x264 local max bitrate (in kilobit/sec per --vbv-bufsize,\n");
 	fprintf(stderr, "                                  0 = no limit, default: same as --x264-bitrate, i.e., CBR)\n");
-	fprintf(stderr, "      --x264-10-bit               enable 10-bit x264 encoding\n");
 	fprintf(stderr, "      --x264-param=NAME[,VALUE]   set any x264 parameter, for fine tuning\n");
 	fprintf(stderr, "      --http-mux=NAME             mux to use for HTTP streams (default " DEFAULT_STREAM_MUX_NAME ")\n");
 	fprintf(stderr, "      --http-audio-codec=NAME     audio codec to use for HTTP streams\n");
@@ -130,6 +129,8 @@ void usage()
 	fprintf(stderr, "      --timecode-stream           show timestamp and timecode in stream\n");
 	fprintf(stderr, "      --timecode-stdout           show timestamp and timecode on standard output\n");
 	fprintf(stderr, "      --10-bit-input              use 10-bit video input (requires compute shaders)\n");
+	fprintf(stderr, "      --10-bit-output             use 10-bit video output (requires compute shaders,\n");
+	fprintf(stderr, "                                    implies --record-x264-video)\n");
 }
 
 void parse_flags(int argc, char * const argv[])
@@ -158,7 +159,6 @@ void parse_flags(int argc, char * const argv[])
 		{ "x264-bitrate", required_argument, 0, OPTION_X264_BITRATE },
 		{ "x264-vbv-bufsize", required_argument, 0, OPTION_X264_VBV_BUFSIZE },
 		{ "x264-vbv-max-bitrate", required_argument, 0, OPTION_X264_VBV_MAX_BITRATE },
-		{ "x264-10-bit", no_argument, 0, OPTION_X264_10_BIT },
 		{ "x264-param", required_argument, 0, OPTION_X264_PARAM },
 		{ "http-mux", required_argument, 0, OPTION_HTTP_MUX },
 		{ "http-coarse-timebase", no_argument, 0, OPTION_HTTP_COARSE_TIMEBASE },
@@ -187,6 +187,7 @@ void parse_flags(int argc, char * const argv[])
 		{ "timecode-stream", no_argument, 0, OPTION_TIMECODE_STREAM },
 		{ "timecode-stdout", no_argument, 0, OPTION_TIMECODE_STDOUT },
 		{ "10-bit-input", no_argument, 0, OPTION_10_BIT_INPUT },
+		{ "10-bit-output", no_argument, 0, OPTION_10_BIT_OUTPUT },
 		{ 0, 0, 0, 0 }
 	};
 	vector<string> theme_dirs;
@@ -290,9 +291,6 @@ void parse_flags(int argc, char * const argv[])
 		case OPTION_X264_VBV_BUFSIZE:
 			global_flags.x264_vbv_buffer_size = atoi(optarg);
 			break;
-		case OPTION_X264_10_BIT:
-			global_flags.x264_bit_depth = 10;
-			break;
 		case OPTION_X264_VBV_MAX_BITRATE:
 			global_flags.x264_vbv_max_bitrate = atoi(optarg);
 			break;
@@ -374,6 +372,12 @@ void parse_flags(int argc, char * const argv[])
 		case OPTION_10_BIT_INPUT:
 			global_flags.ten_bit_input = true;
 			break;
+		case OPTION_10_BIT_OUTPUT:
+			global_flags.ten_bit_output = true;
+			global_flags.x264_video_to_disk = true;
+			global_flags.x264_video_to_http = true;
+			global_flags.x264_bit_depth = 10;
+			break;
 		case OPTION_HELP:
 			usage();
 			exit(0);
diff --git a/flags.h b/flags.h
index b840c15..6ca9794 100644
--- a/flags.h
+++ b/flags.h
@@ -35,7 +35,6 @@ struct Flags {
 	int x264_bitrate = DEFAULT_X264_OUTPUT_BIT_RATE;  // In kilobit/sec.
 	int x264_vbv_max_bitrate = -1;  // In kilobits. 0 = no limit, -1 = same as <x264_bitrate> (CBR).
 	int x264_vbv_buffer_size = -1;  // In kilobits. 0 = one-frame VBV, -1 = same as <x264_bitrate> (one-second VBV).
-	int x264_bit_depth = 8;
 	std::vector<std::string> x264_extra_param;  // In “key[,value]” format.
 	bool enable_alsa_output = true;
 	std::map<int, int> default_stream_mapping;
@@ -53,6 +52,8 @@ struct Flags {
 	bool display_timecode_in_stream = false;
 	bool display_timecode_on_stdout = false;
 	bool ten_bit_input = false;
+	bool ten_bit_output = false;  // Implies x264_video_to_disk == true, x264_video_to_http == true and x264_bit_depth == 10.
+	int x264_bit_depth = 8;  // Not user-settable.
 };
 extern Flags global_flags;
 
diff --git a/mixer.cpp b/mixer.cpp
index 8e5b259..25fa3e4 100644
--- a/mixer.cpp
+++ b/mixer.cpp
@@ -305,6 +305,13 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards)
 		v210_converter->precompile_shader(3840);
 		v210_converter->precompile_shader(4096);
 	}
+	if (global_flags.ten_bit_output) {  // v210 output is generated by a compute shader; bail out early if unsupported.
+		if (!v210Converter::has_hardware_support()) {
+			fprintf(stderr, "ERROR: --10-bit-output requires support for OpenGL compute shaders\n");
+			fprintf(stderr, "       (OpenGL 4.3, or GL_ARB_compute_shader + GL_ARB_shader_image_load_store).\n");
+			exit(1);
+		}
+	}
 
 	timecode_renderer.reset(new TimecodeRenderer(resource_pool.get(), global_flags.width, global_flags.height));
 	display_timecode_in_stream = global_flags.display_timecode_in_stream;