From 471db5155f58c3bf7a98c446575cfa0c483da765 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Tue, 21 Feb 2017 18:42:26 +0100 Subject: [PATCH] Support 10-bit capture, both on bmusb and on DeckLink drivers. --- Makefile | 2 +- README | 7 +- bmusb | 2 +- decklink_capture.cpp | 35 +++++++-- decklink_capture.h | 10 +++ flags.cpp | 6 ++ flags.h | 1 + mixer.cpp | 93 ++++++++++++++++++------ mixer.h | 4 +- pbo_frame_allocator.cpp | 105 +++++++++++++++++++-------- pbo_frame_allocator.h | 3 +- theme.cpp | 18 +++-- v210_converter.cpp | 156 ++++++++++++++++++++++++++++++++++++++++ v210_converter.h | 103 ++++++++++++++++++++++++++ 14 files changed, 477 insertions(+), 68 deletions(-) create mode 100644 v210_converter.cpp create mode 100644 v210_converter.h diff --git a/Makefile b/Makefile index 970450e..9991424 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ OBJS += midi_mapper.o midi_mapping.pb.o # Mixer objects AUDIO_MIXER_OBJS = audio_mixer.o alsa_input.o alsa_pool.o ebu_r128_proc.o stereocompressor.o resampling_queue.o flags.o correlation_measurer.o filter.o input_mapping.o state.pb.o -OBJS += chroma_subsampler.o mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o print_latency.o timecode_renderer.o $(AUDIO_MIXER_OBJS) +OBJS += chroma_subsampler.o v210_converter.o mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o print_latency.o timecode_renderer.o $(AUDIO_MIXER_OBJS) # Streaming and encoding objects OBJS += quicksync_encoder.o x264_encoder.o x264_speed_control.o video_encoder.o metacube2.o mux.o audio_encoder.o ffmpeg_raii.o diff --git a/README b/README index d858177..f4fc9ab 100644 --- a/README +++ b/README @@ -42,7 +42,7 @@ Nageru is in beta stage. It currently needs: but also for stability. - Movit, my GPU-based video filter library (https://movit.sesse.net). 
- You will need at least version 1.3.1, but at least 1.4.0 is recommended. + You will need at least version 1.5.0 (unreleased; get it from git). - Qt 5.5 or newer for the GUI. @@ -76,6 +76,11 @@ with: libmovit-dev libegl1-mesa-dev libasound2-dev libx264-dev libbmusb-dev \ protobuf-compiler libprotobuf-dev libpci-dev +Exceptions as of February 2017: + + - You will need Movit from git; stretch only has 1.4.0. + + - You will need bmusb from git; stretch only has 0.5.4. The patches/ directory contains a patch that helps zita-resampler performance. It is meant for upstream, but was not in at the time Nageru was released. diff --git a/bmusb b/bmusb index aac1510..01ddb8f 160000 --- a/bmusb +++ b/bmusb @@ -1 +1 @@ -Subproject commit aac15101d9cc85681eee3e02c960d57e32414db6 +Subproject commit 01ddb8f836114c07cff3ca040d9ed2c946b2fdbf diff --git a/decklink_capture.cpp b/decklink_capture.cpp index 21a4e79..33bb73d 100644 --- a/decklink_capture.cpp +++ b/decklink_capture.cpp @@ -20,6 +20,8 @@ #include "bmusb/bmusb.h" #include "decklink_util.h" +#include "flags.h" +#include "v210_converter.h" #define FRAME_SIZE (8 << 20) // 8 MB. 
@@ -138,6 +140,18 @@ size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t #endif // __SSE2__ +BMDPixelFormat pixel_format_to_bmd(PixelFormat pixel_format) +{ + switch (pixel_format) { + case PixelFormat_8BitYCbCr: + return bmdFormat8BitYUV; + case PixelFormat_10BitYCbCr: + return bmdFormat10BitYUV; + default: + assert(false); + } +} + } // namespace DeckLinkCapture::DeckLinkCapture(IDeckLink *card, int card_index) @@ -329,7 +343,13 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived( int width = video_frame->GetWidth(); int height = video_frame->GetHeight(); const int stride = video_frame->GetRowBytes(); - assert(stride == width * 2); + const BMDPixelFormat format = video_frame->GetPixelFormat(); + assert(format == pixel_format_to_bmd(current_pixel_format)); + if (global_flags.ten_bit_input) { + assert(stride == int(v210Converter::get_v210_stride(width))); + } else { + assert(stride == width * 2); + } current_video_frame = video_frame_allocator->alloc_frame(); if (current_video_frame.data != nullptr) { @@ -362,6 +382,7 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived( video_format.width = width; video_format.height = height; + video_format.stride = stride; } } @@ -413,7 +434,7 @@ void DeckLinkCapture::start_bm_capture() if (running) { return; } - if (input->EnableVideoInput(current_video_mode, bmdFormat8BitYUV, supports_autodetect ? bmdVideoInputEnableFormatDetection : 0) != S_OK) { + if (input->EnableVideoInput(current_video_mode, pixel_format_to_bmd(current_pixel_format), supports_autodetect ? 
bmdVideoInputEnableFormatDetection : 0) != S_OK) { fprintf(stderr, "Failed to set video mode 0x%04x for card %d\n", current_video_mode, card_index); exit(1); } @@ -469,11 +490,17 @@ void DeckLinkCapture::set_video_mode(uint32_t video_mode_id) } } +void DeckLinkCapture::set_pixel_format(PixelFormat pixel_format) +{ + current_pixel_format = pixel_format; + set_video_mode(current_video_mode); +} + void DeckLinkCapture::set_video_mode_no_restart(uint32_t video_mode_id) { BMDDisplayModeSupport support; IDeckLinkDisplayMode *display_mode; - if (input->DoesSupportVideoMode(video_mode_id, bmdFormat8BitYUV, /*flags=*/0, &support, &display_mode)) { + if (input->DoesSupportVideoMode(video_mode_id, pixel_format_to_bmd(current_pixel_format), /*flags=*/0, &support, &display_mode)) { fprintf(stderr, "Failed to query display mode for card %d\n", card_index); exit(1); } @@ -491,7 +518,7 @@ void DeckLinkCapture::set_video_mode_no_restart(uint32_t video_mode_id) field_dominance = display_mode->GetFieldDominance(); if (running) { - if (input->EnableVideoInput(video_mode_id, bmdFormat8BitYUV, supports_autodetect ? bmdVideoInputEnableFormatDetection : 0) != S_OK) { + if (input->EnableVideoInput(video_mode_id, pixel_format_to_bmd(current_pixel_format), supports_autodetect ? 
bmdVideoInputEnableFormatDetection : 0) != S_OK) { fprintf(stderr, "Failed to set video mode 0x%04x for card %d\n", video_mode_id, card_index); exit(1); } diff --git a/decklink_capture.h b/decklink_capture.h index 1bdf9ad..f940241 100644 --- a/decklink_capture.h +++ b/decklink_capture.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "DeckLinkAPIModes.h" @@ -93,6 +94,14 @@ public: void set_video_mode(uint32_t video_mode_id) override; uint32_t get_current_video_mode() const override { return current_video_mode; } + std::set get_available_pixel_formats() const override { + return std::set{ bmusb::PixelFormat_8BitYCbCr, bmusb::PixelFormat_10BitYCbCr }; + } + void set_pixel_format(bmusb::PixelFormat pixel_format) override; + bmusb::PixelFormat get_current_pixel_format() const override { + return current_pixel_format; + } + std::map get_available_video_inputs() const override { return video_inputs; } void set_video_input(uint32_t video_input_id) override; uint32_t get_current_video_input() const override { return current_video_input; } @@ -132,6 +141,7 @@ private: std::map video_modes; BMDDisplayMode current_video_mode; + bmusb::PixelFormat current_pixel_format = bmusb::PixelFormat_8BitYCbCr; std::map video_inputs; BMDVideoConnection current_video_input; diff --git a/flags.cpp b/flags.cpp index 9c91bf7..025b6d9 100644 --- a/flags.cpp +++ b/flags.cpp @@ -53,6 +53,7 @@ enum LongOption { OPTION_OUTPUT_SLOP_FRAMES, OPTION_TIMECODE_STREAM, OPTION_TIMECODE_STDOUT, + OPTION_10_BIT_INPUT, }; void usage() @@ -123,6 +124,7 @@ void usage() fprintf(stderr, " dropping the frame (default 0.5)\n"); fprintf(stderr, " --timecode-stream show timestamp and timecode in stream\n"); fprintf(stderr, " --timecode-stdout show timestamp and timecode on standard output\n"); + fprintf(stderr, " --10-bit-input use 10-bit video input (requires compute shaders)\n"); } void parse_flags(int argc, char * const argv[]) @@ -177,6 +179,7 @@ void parse_flags(int argc, char * const 
argv[]) { "output-slop-frames", required_argument, 0, OPTION_OUTPUT_SLOP_FRAMES }, { "timecode-stream", no_argument, 0, OPTION_TIMECODE_STREAM }, { "timecode-stdout", no_argument, 0, OPTION_TIMECODE_STDOUT }, + { "10-bit-input", no_argument, 0, OPTION_10_BIT_INPUT }, { 0, 0, 0, 0 } }; vector theme_dirs; @@ -354,6 +357,9 @@ void parse_flags(int argc, char * const argv[]) case OPTION_TIMECODE_STDOUT: global_flags.display_timecode_on_stdout = true; break; + case OPTION_10_BIT_INPUT: + global_flags.ten_bit_input = true; + break; case OPTION_HELP: usage(); exit(0); diff --git a/flags.h b/flags.h index 91b2221..12bc3d4 100644 --- a/flags.h +++ b/flags.h @@ -49,6 +49,7 @@ struct Flags { int max_input_queue_frames = 6; bool display_timecode_in_stream = false; bool display_timecode_on_stdout = false; + bool ten_bit_input = false; }; extern Flags global_flags; diff --git a/mixer.cpp b/mixer.cpp index 81d4b73..a5ceeaf 100644 --- a/mixer.cpp +++ b/mixer.cpp @@ -46,6 +46,7 @@ #include "resampling_queue.h" #include "timebase.h" #include "timecode_renderer.h" +#include "v210_converter.h" #include "video_encoder.h" class IDeckLink; @@ -79,29 +80,48 @@ void insert_new_frame(RefCountedFrame frame, unsigned field_num, bool interlaced void ensure_texture_resolution(PBOFrameAllocator::Userdata *userdata, unsigned field, unsigned width, unsigned height) { - if (userdata->tex_y[field] == 0 || - userdata->tex_cbcr[field] == 0 || + bool first; + if (global_flags.ten_bit_input) { + first = userdata->tex_v210[field] == 0 || userdata->tex_444[field] == 0; + } else { + first = userdata->tex_y[field] == 0 || userdata->tex_cbcr[field] == 0; + } + + if (first || width != userdata->last_width[field] || height != userdata->last_height[field]) { - size_t cbcr_width = width / 2; - // We changed resolution since last use of this texture, so we need to create // a new object. Note that this each card has its own PBOFrameAllocator, // we don't need to worry about these flip-flopping between resolutions. 
- glBindTexture(GL_TEXTURE_2D, userdata->tex_cbcr[field]); - check_error(); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, cbcr_width, height, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr); - check_error(); - glBindTexture(GL_TEXTURE_2D, userdata->tex_y[field]); - check_error(); - glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr); - check_error(); + if (global_flags.ten_bit_input) { + const size_t v210_width = v210Converter::get_minimum_v210_texture_width(width); + + glBindTexture(GL_TEXTURE_2D, userdata->tex_v210[field]); + check_error(); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, v210_width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, nullptr); + check_error(); + glBindTexture(GL_TEXTURE_2D, userdata->tex_444[field]); + check_error(); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, nullptr); + check_error(); + } else { + size_t cbcr_width = width / 2; + + glBindTexture(GL_TEXTURE_2D, userdata->tex_cbcr[field]); + check_error(); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, cbcr_width, height, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr); + check_error(); + glBindTexture(GL_TEXTURE_2D, userdata->tex_y[field]); + check_error(); + glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr); + check_error(); + } userdata->last_width[field] = width; userdata->last_height[field] = height; } } -void upload_texture(GLuint tex, GLuint width, GLuint height, GLuint stride, bool interlaced_stride, GLenum format, GLintptr offset) +void upload_texture(GLuint tex, GLuint width, GLuint height, GLuint stride, bool interlaced_stride, GLenum format, GLenum type, GLintptr offset) { if (interlaced_stride) { stride *= 2; @@ -121,7 +141,7 @@ void upload_texture(GLuint tex, GLuint width, GLuint height, GLuint stride, bool check_error(); } - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, format, GL_UNSIGNED_BYTE, BUFFER_OFFSET(offset)); + glTexSubImage2D(GL_TEXTURE_2D, 
0, 0, 0, width, height, format, type, BUFFER_OFFSET(offset)); check_error(); glBindTexture(GL_TEXTURE_2D, 0); check_error(); @@ -252,6 +272,24 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards) chroma_subsampler.reset(new ChromaSubsampler(resource_pool.get())); + if (global_flags.ten_bit_input) { + if (!v210Converter::has_hardware_support()) { + fprintf(stderr, "ERROR: --10-bit-input requires support for OpenGL compute shaders\n"); + fprintf(stderr, " (OpenGL 4.3, or GL_ARB_compute_shader + GL_ARB_shader_image_load_store).\n"); + exit(1); + } + v210_converter.reset(new v210Converter()); + + // These are all the widths listed in the Blackmagic SDK documentation + // (section 2.7.3, “Display Modes”). + v210_converter->precompile_shader(720); + v210_converter->precompile_shader(1280); + v210_converter->precompile_shader(1920); + v210_converter->precompile_shader(2048); + v210_converter->precompile_shader(3840); + v210_converter->precompile_shader(4096); + } + timecode_renderer.reset(new TimecodeRenderer(resource_pool.get(), global_flags.width, global_flags.height)); display_timecode_in_stream = global_flags.display_timecode_in_stream; display_timecode_on_stdout = global_flags.display_timecode_on_stdout; @@ -308,6 +346,7 @@ void Mixer::configure_card(unsigned card_index, CaptureInterface *capture, bool } while (!card->new_frames.empty()) card->new_frames.pop_front(); card->last_timecode = -1; + card->capture->set_pixel_format(global_flags.ten_bit_input ? PixelFormat_10BitYCbCr : PixelFormat_8BitYCbCr); card->capture->configure_card(); // NOTE: start_bm_capture() happens in thread_func(). 
@@ -450,7 +489,7 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode, card->last_timecode = timecode; - size_t expected_length = video_format.width * (video_format.height + video_format.extra_lines_top + video_format.extra_lines_bottom) * 2; + size_t expected_length = video_format.stride * (video_format.height + video_format.extra_lines_top + video_format.extra_lines_bottom); if (video_frame.len - video_offset == 0 || video_frame.len - video_offset != expected_length) { if (video_frame.len != 0) { @@ -503,9 +542,9 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode, RefCountedFrame frame(video_frame); // Upload the textures. - size_t cbcr_width = video_format.width / 2; - size_t cbcr_offset = video_offset / 2; - size_t y_offset = video_frame.size / 2 + video_offset / 2; + const size_t cbcr_width = video_format.width / 2; + const size_t cbcr_offset = video_offset / 2; + const size_t y_offset = video_frame.size / 2 + video_offset / 2; for (unsigned field = 0; field < num_fields; ++field) { // Put the actual texture upload in a lambda that is executed in the main thread. @@ -516,23 +555,31 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode, // Note that this means we must hold on to the actual frame data in // until the upload command is run, but we hold on to much longer than that // (in fact, all the way until we no longer use the texture in rendering). 
- auto upload_func = [field, video_format, y_offset, cbcr_offset, cbcr_width, interlaced_stride, userdata]() { + auto upload_func = [this, field, video_format, y_offset, video_offset, cbcr_offset, cbcr_width, interlaced_stride, userdata]() { unsigned field_start_line; if (field == 1) { field_start_line = video_format.second_field_start; } else { field_start_line = video_format.extra_lines_top; } - size_t field_y_start = y_offset + video_format.width * field_start_line; - size_t field_cbcr_start = cbcr_offset + cbcr_width * field_start_line * sizeof(uint16_t); ensure_texture_resolution(userdata, field, video_format.width, video_format.height); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, userdata->pbo); check_error(); - upload_texture(userdata->tex_y[field], video_format.width, video_format.height, video_format.width, interlaced_stride, GL_RED, field_y_start); - upload_texture(userdata->tex_cbcr[field], cbcr_width, video_format.height, cbcr_width * sizeof(uint16_t), interlaced_stride, GL_RG, field_cbcr_start); + if (global_flags.ten_bit_input) { + size_t field_start = video_offset + video_format.stride * field_start_line; + upload_texture(userdata->tex_v210[field], video_format.stride / sizeof(uint32_t), video_format.height, video_format.stride, interlaced_stride, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, field_start); + v210_converter->convert(userdata->tex_v210[field], userdata->tex_444[field], video_format.width, video_format.height); + } else { + size_t field_y_start = y_offset + video_format.width * field_start_line; + size_t field_cbcr_start = cbcr_offset + cbcr_width * field_start_line * sizeof(uint16_t); + + // Make up our own strides, since we are interleaving. 
+ upload_texture(userdata->tex_y[field], video_format.width, video_format.height, video_format.width, interlaced_stride, GL_RED, GL_UNSIGNED_BYTE, field_y_start); + upload_texture(userdata->tex_cbcr[field], cbcr_width, video_format.height, cbcr_width * sizeof(uint16_t), interlaced_stride, GL_RG, GL_UNSIGNED_BYTE, field_cbcr_start); + } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); check_error(); diff --git a/mixer.h b/mixer.h index 2543c24..8f5342a 100644 --- a/mixer.h +++ b/mixer.h @@ -39,9 +39,10 @@ class ALSAOutput; class ChromaSubsampler; class DeckLinkOutput; -class TimecodeRenderer; class QSurface; class QSurfaceFormat; +class TimecodeRenderer; +class v210Converter; namespace movit { class Effect; @@ -374,6 +375,7 @@ private: std::unique_ptr display_chain; std::unique_ptr chroma_subsampler; + std::unique_ptr v210_converter; std::unique_ptr video_encoder; std::unique_ptr timecode_renderer; diff --git a/pbo_frame_allocator.cpp b/pbo_frame_allocator.cpp index d18358a..5aa4705 100644 --- a/pbo_frame_allocator.cpp +++ b/pbo_frame_allocator.cpp @@ -7,6 +7,8 @@ #include #include +#include "flags.h" + using namespace std; PBOFrameAllocator::PBOFrameAllocator(size_t frame_size, GLuint width, GLuint height, size_t num_queued_frames, GLenum buffer, GLenum permissions, GLenum map_bits) @@ -30,15 +32,27 @@ PBOFrameAllocator::PBOFrameAllocator(size_t frame_size, GLuint width, GLuint hei frame.userdata = &userdata[i]; userdata[i].pbo = pbo; frame.owner = this; - frame.interleaved = true; + + // For 8-bit Y'CbCr, we ask the driver to split Y' and Cb/Cr + // into separate textures. For 10-bit, the input format (v210) + // is complicated enough that we need to interpolate up to 4:4:4, + // which we do in a compute shader ourselves. + frame.interleaved = !global_flags.ten_bit_input; // Create textures. We don't allocate any data for the second field at this point // (just create the texture state with the samplers), since our default assumed // resolution is progressive. 
- glGenTextures(2, userdata[i].tex_y); - check_error(); - glGenTextures(2, userdata[i].tex_cbcr); - check_error(); + if (global_flags.ten_bit_input) { + glGenTextures(2, userdata[i].tex_v210); + check_error(); + glGenTextures(2, userdata[i].tex_444); + check_error(); + } else { + glGenTextures(2, userdata[i].tex_y); + check_error(); + glGenTextures(2, userdata[i].tex_cbcr); + check_error(); + } userdata[i].last_width[0] = width; userdata[i].last_height[0] = height; userdata[i].last_width[1] = 0; @@ -47,30 +61,54 @@ PBOFrameAllocator::PBOFrameAllocator(size_t frame_size, GLuint width, GLuint hei userdata[i].last_has_signal = false; userdata[i].last_is_connected = false; for (unsigned field = 0; field < 2; ++field) { - glBindTexture(GL_TEXTURE_2D, userdata[i].tex_y[field]); - check_error(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - check_error(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - check_error(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - check_error(); - if (field == 0) { - glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, NULL); + if (global_flags.ten_bit_input) { + glBindTexture(GL_TEXTURE_2D, userdata[i].tex_v210[field]); check_error(); - } + // Don't care about texture parameters, we're only going to read it + // from the compute shader anyway. 
+ if (field == 0) { + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, NULL); + check_error(); + } - glBindTexture(GL_TEXTURE_2D, userdata[i].tex_cbcr[field]); - check_error(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - check_error(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - check_error(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - check_error(); - if (field == 0) { - glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, width / 2, height, 0, GL_RG, GL_UNSIGNED_BYTE, NULL); + glBindTexture(GL_TEXTURE_2D, userdata[i].tex_444[field]); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + check_error(); + if (field == 0) { + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, NULL); + check_error(); + } + } else { + glBindTexture(GL_TEXTURE_2D, userdata[i].tex_y[field]); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + check_error(); + if (field == 0) { + glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, NULL); + check_error(); + } + + glBindTexture(GL_TEXTURE_2D, userdata[i].tex_cbcr[field]); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + check_error(); + if (field == 0) { + glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, width / 2, height, 0, 
GL_RG, GL_UNSIGNED_BYTE, NULL); + check_error(); + } } } @@ -96,10 +134,17 @@ PBOFrameAllocator::~PBOFrameAllocator() check_error(); glDeleteBuffers(1, &pbo); check_error(); - glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_y); - check_error(); - glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_cbcr); - check_error(); + if (global_flags.ten_bit_input) { + glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_v210); + check_error(); + glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_444); + check_error(); + } else { + glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_y); + check_error(); + glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_cbcr); + check_error(); + } } } //static int sumsum = 0; diff --git a/pbo_frame_allocator.h b/pbo_frame_allocator.h index 024e57a..43310ae 100644 --- a/pbo_frame_allocator.h +++ b/pbo_frame_allocator.h @@ -31,7 +31,8 @@ public: GLuint pbo; // The second set is only used for the second field of interlaced inputs. - GLuint tex_y[2], tex_cbcr[2]; + GLuint tex_y[2], tex_cbcr[2]; // For 8-bit. + GLuint tex_v210[2], tex_444[2]; // For 10-bit. GLuint last_width[2], last_height[2]; bool last_interlaced, last_has_signal, last_is_connected; unsigned last_frame_rate_nom, last_frame_rate_den; diff --git a/theme.cpp b/theme.cpp index 1c0d8c9..14560c2 100644 --- a/theme.cpp +++ b/theme.cpp @@ -603,9 +603,9 @@ LiveInputWrapper::LiveInputWrapper(Theme *theme, EffectChain *chain, bool overri // Perhaps 601 was only to indicate the subsampling positions, not the // colorspace itself? Tested with a Lenovo X1 gen 3 as input. YCbCrFormat input_ycbcr_format; - input_ycbcr_format.chroma_subsampling_x = 2; + input_ycbcr_format.chroma_subsampling_x = global_flags.ten_bit_input ? 1 : 2; input_ycbcr_format.chroma_subsampling_y = 1; - input_ycbcr_format.num_levels = 256; + input_ycbcr_format.num_levels = global_flags.ten_bit_input ? 
1024 : 256; input_ycbcr_format.cb_x_position = 0.0; input_ycbcr_format.cr_x_position = 0.0; input_ycbcr_format.cb_y_position = 0.5; @@ -629,10 +629,12 @@ LiveInputWrapper::LiveInputWrapper(Theme *theme, EffectChain *chain, bool overri num_inputs = 1; } for (unsigned i = 0; i < num_inputs; ++i) { + // When using 10-bit input, we're converting to interleaved through v210Converter. + YCbCrInputSplitting splitting = global_flags.ten_bit_input ? YCBCR_INPUT_INTERLEAVED : YCBCR_INPUT_SPLIT_Y_AND_CBCR; if (override_bounce) { - inputs.push_back(new NonBouncingYCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, YCBCR_INPUT_SPLIT_Y_AND_CBCR)); + inputs.push_back(new NonBouncingYCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, splitting)); } else { - inputs.push_back(new YCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, YCBCR_INPUT_SPLIT_Y_AND_CBCR)); + inputs.push_back(new YCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, splitting)); } chain->add_input(inputs.back()); } @@ -681,8 +683,12 @@ void LiveInputWrapper::connect_signal(int signal_num) userdata = (const PBOFrameAllocator::Userdata *)frame.frame->userdata; } - inputs[i]->set_texture_num(0, userdata->tex_y[frame.field_number]); - inputs[i]->set_texture_num(1, userdata->tex_cbcr[frame.field_number]); + if (global_flags.ten_bit_input) { + inputs[i]->set_texture_num(0, userdata->tex_444[frame.field_number]); + } else { + inputs[i]->set_texture_num(0, userdata->tex_y[frame.field_number]); + inputs[i]->set_texture_num(1, userdata->tex_cbcr[frame.field_number]); + } inputs[i]->set_width(userdata->last_width[frame.field_number]); inputs[i]->set_height(userdata->last_height[frame.field_number]); diff --git a/v210_converter.cpp b/v210_converter.cpp new file mode 100644 index 0000000..715dd5f --- /dev/null +++ b/v210_converter.cpp @@ -0,0 +1,156 @@ +#include "v210_converter.h" + +#include 
+#include + +using namespace std; + +v210Converter::~v210Converter() +{ + for (const auto &shader : shaders) { + glDeleteProgram(shader.second.glsl_program_num); + check_error(); + } +} + +bool v210Converter::has_hardware_support() +{ + // We don't have a GLES version of this, although GLSL ES 3.1 supports + // compute shaders. Note that GLSL ES has some extra restrictions, + // like requiring that the images are allocated with glTexStorage*(), + // or that binding= is effectively mandatory. + if (!epoxy_is_desktop_gl()) { + return false; + } + if (epoxy_gl_version() >= 43) { + // Supports compute shaders natively. + return true; + } + return epoxy_has_gl_extension("GL_ARB_compute_shader") && + epoxy_has_gl_extension("GL_ARB_shader_image_load_store"); +} + +void v210Converter::precompile_shader(unsigned width) +{ + unsigned num_local_work_groups = (width + 5) / 6; + if (shaders.count(num_local_work_groups)) { + // Already exists. + return; + } + + char buf[16]; + snprintf(buf, sizeof(buf), "%u", num_local_work_groups); + string shader_src = R"(#version 150 +#extension GL_ARB_compute_shader : enable +#extension GL_ARB_shader_image_load_store : enable +layout(local_size_x = )" + string(buf) + R"() in; +layout(rgb10_a2) uniform restrict readonly image2D inbuf; +layout(rgb10_a2) uniform restrict writeonly image2D outbuf; +uniform int max_cbcr_x; +shared vec2 cbcr[gl_WorkGroupSize.x * 3u]; + +void main() +{ + int xb = int(gl_LocalInvocationID.x); // X block. + int y = int(gl_GlobalInvocationID.y); // Y (actual line). + + // Load our pixel group, containing data for six pixels. + vec3 indata[4]; + for (int i = 0; i < 4; ++i) { + indata[i] = imageLoad(inbuf, ivec2(xb * 4 + i, y)).xyz; + } + + // Decode Cb and Cr to shared memory, because neighboring blocks need it for interpolation. 
+ cbcr[xb * 3 + 0] = indata[0].xz; + cbcr[xb * 3 + 1] = vec2(indata[1].y, indata[2].x); + cbcr[xb * 3 + 2] = vec2(indata[2].z, indata[3].y); + memoryBarrierShared(); + + float pix_y[6]; + pix_y[0] = indata[0].y; + pix_y[1] = indata[1].x; + pix_y[2] = indata[1].z; + pix_y[3] = indata[2].y; + pix_y[4] = indata[3].x; + pix_y[5] = indata[3].z; + + barrier(); + + // Interpolate the missing Cb/Cr pixels, taking care not to read past the end of the screen + // for pixels that we use for interpolation. + vec2 pix_cbcr[7]; + pix_cbcr[0] = indata[0].xz; + pix_cbcr[2] = cbcr[min(xb * 3 + 1, max_cbcr_x)]; + pix_cbcr[4] = cbcr[min(xb * 3 + 2, max_cbcr_x)]; + pix_cbcr[6] = cbcr[min(xb * 3 + 3, max_cbcr_x)]; + pix_cbcr[1] = 0.5 * (pix_cbcr[0] + pix_cbcr[2]); + pix_cbcr[3] = 0.5 * (pix_cbcr[2] + pix_cbcr[4]); + pix_cbcr[5] = 0.5 * (pix_cbcr[4] + pix_cbcr[6]); + + // Write the decoded pixels to the destination texture. + for (int i = 0; i < 6; ++i) { + vec4 outdata = vec4(pix_y[i], pix_cbcr[i].x, pix_cbcr[i].y, 1.0f); + imageStore(outbuf, ivec2(xb * 6 + i, y), outdata); + } +} +)"; + + Shader shader; + + GLuint shader_num = movit::compile_shader(shader_src, GL_COMPUTE_SHADER); + check_error(); + shader.glsl_program_num = glCreateProgram(); + check_error(); + glAttachShader(shader.glsl_program_num, shader_num); + check_error(); + glLinkProgram(shader.glsl_program_num); + check_error(); + + GLint success; + glGetProgramiv(shader.glsl_program_num, GL_LINK_STATUS, &success); + check_error(); + if (success == GL_FALSE) { + GLchar error_log[1024] = {0}; + glGetProgramInfoLog(shader.glsl_program_num, 1024, NULL, error_log); + fprintf(stderr, "Error linking program: %s\n", error_log); + exit(1); + } + + shader.max_cbcr_x_pos = glGetUniformLocation(shader.glsl_program_num, "max_cbcr_x"); + check_error(); + shader.inbuf_pos = glGetUniformLocation(shader.glsl_program_num, "inbuf"); + check_error(); + shader.outbuf_pos = glGetUniformLocation(shader.glsl_program_num, "outbuf"); + check_error(); 
+ + shaders.emplace(num_local_work_groups, shader); +} + +void v210Converter::convert(GLuint tex_src, GLuint tex_dst, unsigned width, unsigned height) +{ + precompile_shader(width); + unsigned num_local_work_groups = (width + 5) / 6; + const Shader &shader = shaders[num_local_work_groups]; + + glUseProgram(shader.glsl_program_num); + check_error(); + glUniform1i(shader.max_cbcr_x_pos, width / 2 - 1); + check_error(); + + // Bind the textures. + glUniform1i(shader.inbuf_pos, 0); + check_error(); + glUniform1i(shader.outbuf_pos, 1); + check_error(); + glBindImageTexture(0, tex_src, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGB10_A2); + check_error(); + glBindImageTexture(1, tex_dst, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGB10_A2); + check_error(); + + // Actually run the shader. + glDispatchCompute(1, height, 1); + check_error(); + + glUseProgram(0); + check_error(); +} diff --git a/v210_converter.h b/v210_converter.h new file mode 100644 index 0000000..39c456f --- /dev/null +++ b/v210_converter.h @@ -0,0 +1,103 @@ +#ifndef _V210CONVERTER_H +#define _V210CONVERTER_H 1 + +// v210 is a 10-bit 4:2:2 interleaved Y'CbCr format, packing three values +// into a 32-bit int (leaving two unused bits at the top) with chroma being +// sub-sited with the left luma sample. Even though this 2:10:10:10-arrangement +// can be sampled from using the GL_RGB10_A2/GL_UNSIGNED_2_10_10_10_REV format, +// the placement of the Y', Cb and Cr parts within these ints is rather +// complicated, and thus hard to get a single Y'CbCr pixel from efficiently, +// especially on a GPU. Six pixels (six Y', three Cb, three Cr) are packed into +// four such ints in the following pattern (see e.g. 
the DeckLink documentation +// for reference): +// +// A B G R +// ----------------- +// X Cr0 Y0 Cb0 +// X Y2 Cb2 Y1 +// X Cb4 Y3 Cr2 +// X Y5 Cr4 Y4 +// +// This pattern repeats for as long as needed, with the additional constraint +// that stride must be divisible by 128 (or equivalently, 32 four-byte ints, +// or eight pixel groups representing 48 pixels in all). +// +// Thus, v210Converter allows you to convert from v210 to a more regular +// 4:4:4 format (upsampling Cb/Cr on the way, using linear interpolation) +// that the GPU supports natively, again in the form of GL_RGB10_A2 +// (with Y', Cb, Cr packed as R, G and B, respectively -- the “alpha” channel +// is always 1). +// +// It does this fairly efficiently using a compute shader, which means you'll +// need compute shader support (GL_ARB_compute_shader + GL_ARB_shader_image_load_store, +// or equivalently, OpenGL 4.3 or newer) to use it. There are many possible +// strategies for doing this in a compute shader, but I ended up settling on +// a fairly simple one after some benchmarking; each work unit takes in +// a single four-int group and writes six samples, but as the interpolation +// needs the leftmost chroma samples from the work unit at the right, each line +// is put into a local work group. Cb/Cr is first decoded into shared memory +// (OpenGL guarantees at least 32 kB shared memory for the work group, which is +// enough for up to 6K video or so), and then the rest of the shuffling and +// writing happens. Each line can of course be converted entirely +// independently, so we can fire up as many such work groups as we have lines. +// +// On the Haswell GPU where I developed it (with single-channel memory), +// conversion takes about 1.4 ms for a 720p frame, so it should be possible to +// keep up multiple inputs at 720p60, although probably a faster machine is +// needed if we want to run e.g. heavy scaling filters in the same pipeline. 
+// (1.4 ms equates to about 35% of the theoretical memory bandwidth of +// 12.8 GB/sec, which is pretty good.) + +#include + +#include + +class v210Converter { +public: + ~v210Converter(); + + // Whether the current hardware and driver supports the compute shader + // necessary to do this conversion. + static bool has_hardware_support(); + + // Given an image width, returns the minimum number of 32-bit groups + // needed for each line. This can be used to size the input texture properly. + static GLuint get_minimum_v210_texture_width(unsigned width) + { + unsigned num_local_groups = (width + 5) / 6; + return 4 * num_local_groups; + } + + // Given an image width, returns the stride (in bytes) for each line. + static size_t get_v210_stride(unsigned width) + { + return (width + 47) / 48 * 128; + } + + // Since work groups need to be determined at shader compile time, + // each width needs potentially a different shader. You can call this + // function at startup to make sure a shader for the given width + // has been compiled, making sure you don't need to start an expensive + // compilation job while video is running if a new resolution comes along. + // This is not required, but generally recommended. + void precompile_shader(unsigned width); + + // Do the actual conversion. tex_src is assumed to be a GL_RGB10_A2 + // texture of at least [get_minimum_v210_texture_width(width), height]. + // tex_dst is assumed to be a GL_RGB10_A2 texture of exactly [width, height] + // (actually, other sizes will work fine, but be nonsensical). + // No textures will be allocated or deleted. + void convert(GLuint tex_src, GLuint tex_dst, unsigned width, unsigned height); + +private: + // Key is number of local groups, ie., ceil(width / 6). + struct Shader { + GLuint glsl_program_num = -1; + + // Uniform locations. + GLuint max_cbcr_x_pos = -1, inbuf_pos = -1, outbuf_pos = -1; + }; + std::map shaders; +}; + +#endif // !defined(_V210CONVERTER_H) -- 2.39.2