From 76d3f4f3c75111cc8c59a08396c5ad60c712c9a5 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Tue, 14 Feb 2017 22:26:28 +0100
Subject: [PATCH] Add support for 10- and 12-bit planar Y'CbCr inputs.

This is mostly for completeness; at least for 10-bit, 10:10:10:2
should be a faster format. However, it's nice to allow direct
subsampled inputs _somehow_.
---
 resource_pool.cpp    | 22 +++++++++++++++
 ycbcr_input.cpp      | 32 +++++++++++++++++++---
 ycbcr_input.h        | 20 +++++++++++---
 ycbcr_input_test.cpp | 65 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 131 insertions(+), 8 deletions(-)

diff --git a/resource_pool.cpp b/resource_pool.cpp
index b560f15..5b04999 100644
--- a/resource_pool.cpp
+++ b/resource_pool.cpp
@@ -307,6 +307,7 @@ GLuint ResourcePool::create_2d_texture(GLint internal_format, GLsizei width, GLs
 	switch (internal_format) {
 	case GL_RGBA32F_ARB:
 	case GL_RGBA16F_ARB:
+	case GL_RGBA16:
 	case GL_RGBA8:
 	case GL_RGB10_A2:
 	case GL_SRGB8_ALPHA8:
@@ -314,6 +315,7 @@ GLuint ResourcePool::create_2d_texture(GLint internal_format, GLsizei width, GLs
 		break;
 	case GL_RGB32F:
 	case GL_RGB16F:
+	case GL_RGB16:
 	case GL_R11F_G11F_B10F:
 	case GL_RGB8:
 	case GL_RGB10:
@@ -324,11 +326,13 @@ GLuint ResourcePool::create_2d_texture(GLint internal_format, GLsizei width, GLs
 		break;
 	case GL_RG32F:
 	case GL_RG16F:
+	case GL_RG16:
 	case GL_RG8:
 		format = GL_RG;
 		break;
 	case GL_R32F:
 	case GL_R16F:
+	case GL_R16:
 	case GL_R8:
 		format = GL_RED;
 		break;
@@ -352,6 +356,12 @@ GLuint ResourcePool::create_2d_texture(GLint internal_format, GLsizei width, GLs
 	case GL_R16F:
 		type = GL_FLOAT;
 		break;
+	case GL_RGBA16:
+	case GL_RGB16:
+	case GL_RG16:
+	case GL_R16:
+		type = GL_UNSIGNED_SHORT;
+		break;
 	case GL_SRGB8_ALPHA8:
 	case GL_SRGB8:
 	case GL_RGBA8:
@@ -618,6 +628,18 @@ size_t ResourcePool::estimate_texture_size(const Texture2D &texture_format)
 	case GL_RGB565:
 		bytes_per_pixel = 2;
 		break;
+	case GL_RGBA16:
+		bytes_per_pixel = 8;
+		break;
+	case GL_RGB16:
+		bytes_per_pixel = 6;
+		break;
+	case GL_RG16:
+		bytes_per_pixel = 4;
+		break;
+	case GL_R16:
+		bytes_per_pixel = 2;
+		break;
 	default:
 		// TODO: Add more here as needed.
 		assert(false);
diff --git a/ycbcr_input.cpp b/ycbcr_input.cpp
index f8df0c1..f748f1c 100644
--- a/ycbcr_input.cpp
+++ b/ycbcr_input.cpp
@@ -74,19 +74,30 @@ void YCbCrInput::set_gl_state(GLuint glsl_program_num, const string& prefix, uns
 				if (type == GL_UNSIGNED_INT_2_10_10_10_REV) {
 					format = GL_RGBA;
 					internal_format = GL_RGB10_A2;
+				} else if (type == GL_UNSIGNED_SHORT) {
+					format = GL_RGB;
+					internal_format = GL_RGB16;
 				} else {
 					assert(type == GL_UNSIGNED_BYTE);
 					format = GL_RGB;
 					internal_format = GL_RGB8;
 				}
 			} else if (channel == 1 && ycbcr_input_splitting == YCBCR_INPUT_SPLIT_Y_AND_CBCR) {
-				assert(type == GL_UNSIGNED_BYTE);
 				format = GL_RG;
-				internal_format = GL_RG8;
+				if (type == GL_UNSIGNED_SHORT) {
+					internal_format = GL_RG16;
+				} else {
+					assert(type == GL_UNSIGNED_BYTE);
+					internal_format = GL_RG8;
+				}
 			} else {
-				assert(type == GL_UNSIGNED_BYTE);
 				format = GL_RED;
-				internal_format = GL_R8;
+				if (type == GL_UNSIGNED_SHORT) {
+					internal_format = GL_R16;
+				} else {
+					assert(type == GL_UNSIGNED_BYTE);
+					internal_format = GL_R8;
+				}
 			}
 
 			// (Re-)upload the texture.
@@ -135,6 +146,19 @@ string YCbCrInput::output_fragment_shader()
 	Matrix3d ycbcr_to_rgb;
 	compute_ycbcr_matrix(ycbcr_format, offset, &ycbcr_to_rgb);
 
+	if (type == GL_UNSIGNED_SHORT) {
+		// For 10-bit or 12-bit packed into 16-bit, we need to scale the values
+		// so that the max value goes from 1023 (or 4095) to 65535. We do this
+		// by folding the scaling into the conversion matrix, so it comes essentially
+		// for free. However, the offset is applied before the scaling (and thus
+		// assumes correctly scaled values), so we divide it by the same factor.
+		double scale = 65535.0 / (ycbcr_format.num_levels - 1);
+		offset[0] /= scale;
+		offset[1] /= scale;
+		offset[2] /= scale;
+		ycbcr_to_rgb *= scale;
+	}
+
 	string frag_shader;
 
 	frag_shader = output_glsl_mat3("PREFIX(inv_ycbcr_matrix)", ycbcr_to_rgb);
diff --git a/ycbcr_input.h b/ycbcr_input.h
index 31ba42b..4e8f194 100644
--- a/ycbcr_input.h
+++ b/ycbcr_input.h
@@ -9,12 +9,14 @@
 //   * 8-bit semiplanar Y'CbCr (Y' in one plane, CbCr in another),
 //     possibly subsampled.
 //   * 8-bit interleaved (chunked) Y'CbCr, no subsampling (4:4:4 only).
+//   * All of the above in 10- and 12-bit versions, where each sample is
+//     stored in a 16-bit int (so the 6 or 4 top bits are wasted).
 //   * 10-bit interleaved (chunked) Y'CbCr packed into 32-bit words
 //     (10:10:10:2), no subsampling (4:4:4 only).
 //
-// For the former case, it upsamples planes as needed, using the default linear
-// upsampling OpenGL gives you. Note that YCbCr422InterleavedInput supports the
-// important special case of 8-bit 4:2:2 interleaved.
+// For the planar and semiplanar cases, it upsamples planes as needed, using
+// the default linear upsampling OpenGL gives you. Note that YCbCr422InterleavedInput
+// supports the important special case of 8-bit 4:2:2 interleaved.
 
 #include <epoxy/gl.h>
 #include <assert.h>
@@ -49,7 +51,8 @@ enum YCbCrInputSplitting {
 
 class YCbCrInput : public Input {
 public:
-	// Type can be GL_UNSIGNED_BYTE for 8-bit, or GL_UNSIGNED_INT_2_10_10_10_REV
+	// Type can be GL_UNSIGNED_BYTE for 8-bit, GL_UNSIGNED_SHORT for 10- or 12-bit
+	// (or 8-bit, although that's a bit useless), or GL_UNSIGNED_INT_2_10_10_10_REV
 	// for 10-bit (YCBCR_INPUT_INTERLEAVED only).
 	YCbCrInput(const ImageFormat &image_format,
 	           const YCbCrFormat &ycbcr_format,
@@ -93,6 +96,15 @@ public:
 		invalidate_pixel_data();
 	}
 
+	void set_pixel_data(unsigned channel, const uint16_t *pixel_data, GLuint pbo = 0)
+	{
+		assert(type == GL_UNSIGNED_SHORT);  // 16-bit samples require a GL_UNSIGNED_SHORT input.
+		assert(channel < num_channels);  // channel is unsigned, so a ">= 0" check would always be true.
+		this->pixel_data[channel] = reinterpret_cast<const unsigned char *>(pixel_data);  // Only the pointer is stored.
+		this->pbos[channel] = pbo;
+		invalidate_pixel_data();
+	}
+
 	void set_pixel_data(unsigned channel, const uint32_t *pixel_data, GLuint pbo = 0)
 	{
 		assert(type == GL_UNSIGNED_INT_2_10_10_10_REV);
diff --git a/ycbcr_input_test.cpp b/ycbcr_input_test.cpp
index 7792b28..1d1e23e 100644
--- a/ycbcr_input_test.cpp
+++ b/ycbcr_input_test.cpp
@@ -858,4 +858,69 @@ TEST(YCbCrInputTest, TenBitInterleaved) {
 	expect_equal(expected_data, out_data, 4 * width, height, 0.002, 0.0003);
 }
 
+TEST(YCbCrInputTest, TenBitPlanar) {
+	const int width = 1;
+	const int height = 5;
+
+	// Same five samples as TenBitInterleaved, but with one array per plane.
+	uint16_t luma[width * height] = {
+		64,   // Expect black.
+		940,  // Expect white.
+		250,  // Expect red.
+		691,  // Expect green.
+		127,  // Expect blue.
+	};
+	uint16_t chroma_b[width * height] = {
+		512,
+		512,
+		409,
+		167,
+		960,
+	};
+	uint16_t chroma_r[width * height] = {
+		512,
+		512,
+		960,
+		105,
+		471,
+	};
+	float expected_data[4 * width * height] = {
+		0.0, 0.0, 0.0, 1.0,  // Black.
+		1.0, 1.0, 1.0, 1.0,  // White.
+		1.0, 0.0, 0.0, 1.0,  // Red.
+		0.0, 1.0, 0.0, 1.0,  // Green.
+		0.0, 0.0, 1.0, 1.0,  // Blue.
+	};
+	float out_data[4 * width * height];
+
+	EffectChainTester tester(NULL, width, height);
+
+	ImageFormat format;
+	format.gamma_curve = GAMMA_sRGB;
+	format.color_space = COLORSPACE_sRGB;
+
+	YCbCrFormat ycbcr_format;
+	ycbcr_format.num_levels = 1024;  // 10-bit.
+	ycbcr_format.full_range = false;
+	ycbcr_format.luma_coefficients = YCBCR_REC_709;
+	ycbcr_format.chroma_subsampling_x = 1;
+	ycbcr_format.chroma_subsampling_y = 1;
+	ycbcr_format.cb_x_position = 0.5f;
+	ycbcr_format.cr_x_position = 0.5f;
+	ycbcr_format.cb_y_position = 0.5f;
+	ycbcr_format.cr_y_position = 0.5f;
+
+	YCbCrInput *planar_input = new YCbCrInput(format, ycbcr_format, width, height, YCBCR_INPUT_PLANAR, GL_UNSIGNED_SHORT);
+	planar_input->set_pixel_data(0, luma);
+	planar_input->set_pixel_data(1, chroma_b);
+	planar_input->set_pixel_data(2, chroma_r);
+	tester.get_chain()->add_input(planar_input);
+
+	tester.run(out_data, GL_RGBA, COLORSPACE_sRGB, GAMMA_sRGB);
+
+	// The tolerances here can be much tighter than for 8-bit Y'CbCr;
+	// tighter even than the default limits.
+	expect_equal(expected_data, out_data, 4 * width, height, 0.002, 0.0003);
+}
+
 }  // namespace movit
-- 
2.39.2