From f44c81569a268efea44f1f6df03a000711b18ffc Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Mon, 13 Mar 2017 20:12:50 +0100
Subject: [PATCH] Support 10-/12-bit Y'CbCr output packed in 16-bit.

This mirrors our existing input support, and makes planar and
semiplanar output possible for 10-/12-bit Y'CbCr.

ABI break. API stays the same.
---
 effect_chain.cpp                 |  7 +++--
 effect_chain.h                   | 11 ++++++--
 test_util.cpp                    | 34 ++++++++++++++++++++---
 test_util.h                      |  4 ++-
 version.h                        |  2 +-
 ycbcr.cpp                        | 20 +++++++++++++-
 ycbcr.h                          |  9 ++++++-
 ycbcr_conversion_effect.cpp      | 10 ++++---
 ycbcr_conversion_effect.h        |  3 ++-
 ycbcr_conversion_effect_test.cpp | 46 ++++++++++++++++++++++++++++++++
 ycbcr_input.cpp                  | 15 +----------
 11 files changed, 132 insertions(+), 29 deletions(-)
diff --git a/effect_chain.cpp b/effect_chain.cpp
index 90fadb7..efd4abc 100644
--- a/effect_chain.cpp
+++ b/effect_chain.cpp
@@ -97,7 +97,8 @@ void EffectChain::add_output(const ImageFormat &format, OutputAlphaFormat alpha_
 }
 
 void EffectChain::add_ycbcr_output(const ImageFormat &format, OutputAlphaFormat alpha_format,
-                                   const YCbCrFormat &ycbcr_format, YCbCrOutputSplitting output_splitting)
+                                   const YCbCrFormat &ycbcr_format, YCbCrOutputSplitting output_splitting,
+                                   GLenum output_type)
 {
 	assert(!finalized);
 	assert(num_output_color_ycbcr < 2);
@@ -111,8 +112,10 @@ void EffectChain::add_ycbcr_output(const ImageFormat &format, OutputAlphaFormat
 		assert(output_ycbcr_format.num_levels == ycbcr_format.num_levels);
 		assert(output_ycbcr_format.chroma_subsampling_x == 1);
 		assert(output_ycbcr_format.chroma_subsampling_y == 1);
+		assert(output_ycbcr_type == output_type);
 	} else {
 		output_ycbcr_format = ycbcr_format;
+		output_ycbcr_type = output_type;
 	}
 	output_ycbcr_splitting[num_output_color_ycbcr++] = output_splitting;
 
@@ -1644,7 +1647,7 @@ void EffectChain::add_ycbcr_conversion_if_needed()
 		return;
 	}
 	Node *output = find_output_node();
-	ycbcr_conversion_effect_node = add_node(new YCbCrConversionEffect(output_ycbcr_format));
+	ycbcr_conversion_effect_node = add_node(new YCbCrConversionEffect(output_ycbcr_format, output_ycbcr_type));
 	connect_nodes(output, ycbcr_conversion_effect_node);
 }
 	
diff --git a/effect_chain.h b/effect_chain.h
index ffa9389..b753ad4 100644
--- a/effect_chain.h
+++ b/effect_chain.h
@@ -264,7 +264,7 @@ public:
 	void add_output(const ImageFormat &format, OutputAlphaFormat alpha_format);
 
 	// Adds an YCbCr output. Note that you can only have at most two Y'CbCr
-	// outputs, and they must have the same <ycbcr_format>.
+	// outputs, and they must have the same <ycbcr_format> and <output_type>.
 	// (This limitation may be lifted in the future, to allow e.g. simultaneous
 	// 8- and 10-bit output. Currently, multiple Y'CbCr outputs are only
 	// useful in some very limited circumstances, like if one texture goes
@@ -272,13 +272,19 @@ public:
 	//
 	// Only 4:4:4 output is supported due to fragment shader limitations,
 	// so chroma_subsampling_x and chroma_subsampling_y must both be 1.
+	// <output_type> should match the data type of the FBO you are rendering to,
+	// so that if you use 16-bit output (GL_UNSIGNED_SHORT), you will get
+	// 8-, 10- or 12-bit output correctly as determined by <ycbcr_format.num_levels>.
+	// Using e.g. ycbcr_format.num_levels == 1024 with GL_UNSIGNED_BYTE is
+	// nonsensical and invokes undefined behavior.
 	//
 	// If you have both RGBA and Y'CbCr output(s), the RGBA output will come
 	// in the last draw buffer. Also, <format> and <alpha_format> must be
 	// identical between the two.
 	void add_ycbcr_output(const ImageFormat &format, OutputAlphaFormat alpha_format,
 	                      const YCbCrFormat &ycbcr_format,
-			      YCbCrOutputSplitting output_splitting = YCBCR_OUTPUT_INTERLEAVED);
+			      YCbCrOutputSplitting output_splitting = YCBCR_OUTPUT_INTERLEAVED,
+	                      GLenum output_type = GL_UNSIGNED_BYTE);
 
 	// Change Y'CbCr output format. (This can be done also after finalize()).
 	// Note that you are not allowed to change subsampling parameters;
@@ -494,6 +500,7 @@ private:
 	bool output_color_rgba;
 	int num_output_color_ycbcr;                      // Max 2.
 	YCbCrFormat output_ycbcr_format;                 // If num_output_color_ycbcr is > 0.
+	GLenum output_ycbcr_type;                        // If num_output_color_ycbcr is > 0.
 	YCbCrOutputSplitting output_ycbcr_splitting[2];  // If num_output_color_ycbcr is > N.
 
 	std::vector<Node *> nodes;
diff --git a/test_util.cpp b/test_util.cpp
index ae95847..4543ee3 100644
--- a/test_util.cpp
+++ b/test_util.cpp
@@ -143,6 +143,11 @@ void EffectChainTester::run(unsigned char *out_data, unsigned char *out_data2, u
 	internal_run(out_data, out_data2, out_data3, out_data4, GL_UNSIGNED_BYTE, format, color_space, gamma_curve, alpha_format);
 }
 
+void EffectChainTester::run(uint16_t *out_data, GLenum format, Colorspace color_space, GammaCurve gamma_curve, OutputAlphaFormat alpha_format)
+{
+	internal_run<uint16_t>(out_data, NULL, NULL, NULL, GL_UNSIGNED_SHORT, format, color_space, gamma_curve, alpha_format);
+}
+
 void EffectChainTester::run_10_10_10_2(uint32_t *out_data, GLenum format, Colorspace color_space, GammaCurve gamma_curve, OutputAlphaFormat alpha_format)
 {
 	internal_run<uint32_t>(out_data, NULL, NULL, NULL, GL_UNSIGNED_INT_2_10_10_10_REV, format, color_space, gamma_curve, alpha_format);
@@ -158,6 +163,8 @@ void EffectChainTester::internal_run(T *out_data, T *out_data2, T *out_data3, T
 	GLuint type;
 	if (framebuffer_format == GL_RGBA8) {
 		type = GL_UNSIGNED_BYTE;
+	} else if (framebuffer_format == GL_RGBA16) {
+		type = GL_UNSIGNED_SHORT;
 	} else if (framebuffer_format == GL_RGBA16F || framebuffer_format == GL_RGBA32F) {
 		type = GL_FLOAT;
 	} else if (framebuffer_format == GL_RGB10_A2) {
@@ -239,7 +246,7 @@ void EffectChainTester::internal_run(T *out_data, T *out_data2, T *out_data3, T
 			check_error();
 		}
 
-		if (format == GL_RGBA && (type == GL_UNSIGNED_BYTE || type == GL_FLOAT)) {
+		if (format == GL_RGBA && (type == GL_UNSIGNED_BYTE || type == GL_UNSIGNED_SHORT || type == GL_FLOAT)) {
 			vertical_flip(ptr, width * 4, height);
 		} else {
 			vertical_flip(ptr, width, height);
@@ -258,9 +265,9 @@ void EffectChainTester::add_output(const ImageFormat &format, OutputAlphaFormat
 	output_added = true;
 }
 
-void EffectChainTester::add_ycbcr_output(const ImageFormat &format, OutputAlphaFormat alpha_format, const YCbCrFormat &ycbcr_format, YCbCrOutputSplitting output_splitting)
+void EffectChainTester::add_ycbcr_output(const ImageFormat &format, OutputAlphaFormat alpha_format, const YCbCrFormat &ycbcr_format, YCbCrOutputSplitting output_splitting, GLenum type)
 {
-	chain.add_ycbcr_output(format, alpha_format, ycbcr_format, output_splitting);
+	chain.add_ycbcr_output(format, alpha_format, ycbcr_format, output_splitting, type);
 	output_added = true;
 }
 
@@ -346,6 +353,27 @@ void expect_equal(const unsigned char *ref, const unsigned char *result, unsigne
 	delete[] result_float;
 }
 
+void expect_equal(const uint16_t *ref, const uint16_t *result, unsigned width, unsigned height, unsigned largest_difference_limit, float rms_limit)
+{
+	assert(width > 0);
+	assert(height > 0);
+
+	float *ref_float = new float[width * height];
+	float *result_float = new float[width * height];
+
+	for (unsigned y = 0; y < height; ++y) {
+		for (unsigned x = 0; x < width; ++x) {
+			ref_float[y * width + x] = ref[y * width + x];
+			result_float[y * width + x] = result[y * width + x];
+		}
+	}
+
+	expect_equal(ref_float, result_float, width, height, largest_difference_limit, rms_limit);
+
+	delete[] ref_float;
+	delete[] result_float;
+}
+
 void expect_equal(const int *ref, const int *result, unsigned width, unsigned height, unsigned largest_difference_limit, float rms_limit)
 {
 	assert(width > 0);
diff --git a/test_util.h b/test_util.h
index e0195be..2f76dea 100644
--- a/test_util.h
+++ b/test_util.h
@@ -29,9 +29,10 @@ public:
 	void run(unsigned char *out_data, unsigned char *out_data2, GLenum format, Colorspace color_space, GammaCurve gamma_curve, OutputAlphaFormat alpha_format = OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
 	void run(unsigned char *out_data, unsigned char *out_data2, unsigned char *out_data3, GLenum format, Colorspace color_space, GammaCurve gamma_curve, OutputAlphaFormat alpha_format = OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
 	void run(unsigned char *out_data, unsigned char *out_data2, unsigned char *out_data3, unsigned char *out_data4, GLenum format, Colorspace color_space, GammaCurve gamma_curve, OutputAlphaFormat alpha_format = OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
+	void run(uint16_t *out_data, GLenum format, Colorspace color_space, GammaCurve gamma_curve, OutputAlphaFormat alpha_format = OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
 	void run_10_10_10_2(uint32_t *out_data, GLenum format, Colorspace color_space, GammaCurve gamma_curve, OutputAlphaFormat alpha_format = OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
 	void add_output(const ImageFormat &format, OutputAlphaFormat alpha_format);
-	void add_ycbcr_output(const ImageFormat &format, OutputAlphaFormat alpha_format, const YCbCrFormat &ycbcr_format, YCbCrOutputSplitting output_splitting = YCBCR_OUTPUT_INTERLEAVED);
+	void add_ycbcr_output(const ImageFormat &format, OutputAlphaFormat alpha_format, const YCbCrFormat &ycbcr_format, YCbCrOutputSplitting output_splitting = YCBCR_OUTPUT_INTERLEAVED, GLenum output_type = GL_UNSIGNED_BYTE);
 
 private:
 	void finalize_chain(Colorspace color_space, GammaCurve gamma_curve, OutputAlphaFormat alpha_format);
@@ -48,6 +49,7 @@ private:
 
 void expect_equal(const float *ref, const float *result, unsigned width, unsigned height, float largest_difference_limit = 1.5 / 255.0, float rms_limit = 0.2 / 255.0);
 void expect_equal(const unsigned char *ref, const unsigned char *result, unsigned width, unsigned height, unsigned largest_difference_limit = 1, float rms_limit = 0.2);
+void expect_equal(const uint16_t *ref, const uint16_t *result, unsigned width, unsigned height, unsigned largest_difference_limit = 1, float rms_limit = 0.2);
 void expect_equal(const int *ref, const int *result, unsigned width, unsigned height, unsigned largest_difference_limit = 1, float rms_limit = 0.2);
 void test_accuracy(const float *expected, const float *result, unsigned num_values, double absolute_error_limit, double relative_error_limit, double local_relative_error_limit, double rms_limit);
 
diff --git a/version.h b/version.h
index 165efc5..cc87124 100644
--- a/version.h
+++ b/version.h
@@ -5,6 +5,6 @@
 // changes, even within git versions. There is no specific version
 // documentation outside the regular changelogs, though.
 
-#define MOVIT_VERSION 27
+#define MOVIT_VERSION 28
 
 #endif // !defined(_MOVIT_VERSION_H)
diff --git a/ycbcr.cpp b/ycbcr.cpp
index 8ae8d34..8c4f780 100644
--- a/ycbcr.cpp
+++ b/ycbcr.cpp
@@ -59,7 +59,7 @@ float compute_chroma_offset(float pos, unsigned subsampling_factor, unsigned res
 // Given <ycbcr_format>, compute the values needed to turn Y'CbCr into R'G'B';
 // first subtract the returned offset, then left-multiply the returned matrix
 // (the scaling is already folded into it).
-void compute_ycbcr_matrix(YCbCrFormat ycbcr_format, float* offset, Matrix3d* ycbcr_to_rgb)
+void compute_ycbcr_matrix(YCbCrFormat ycbcr_format, float* offset, Matrix3d* ycbcr_to_rgb, GLenum type, double *scale_factor)
 {
 	double coeff[3], scale[3];
 
@@ -136,6 +136,24 @@ void compute_ycbcr_matrix(YCbCrFormat ycbcr_format, float* offset, Matrix3d* ycb
 
 	// Fold in the scaling.
 	*ycbcr_to_rgb *= Map<const Vector3d>(scale).asDiagonal();
+
+	if (type == GL_UNSIGNED_SHORT) {
+		// For 10-bit or 12-bit packed into 16-bit, we need to scale the values
+		// so that the max value goes from 1023 (or 4095) to 65535. We do this
+		// by folding the scaling into the conversion matrix, so it comes essentially
+		// for free. However, the offset is before the scaling (and thus assumes
+		// correctly scaled values), so we need to adjust that the other way.
+		double scale = 65535.0 / (ycbcr_format.num_levels - 1);
+		offset[0] /= scale;
+		offset[1] /= scale;
+		offset[2] /= scale;
+		*ycbcr_to_rgb *= scale;
+		if (scale_factor != NULL) {
+			*scale_factor = scale;
+		}
+	} else if (scale_factor != NULL) {
+		*scale_factor = 1.0;
+	}
 }
 
 }  // namespace movit
diff --git a/ycbcr.h b/ycbcr.h
index 6f9f4c9..1c55c39 100644
--- a/ycbcr.h
+++ b/ycbcr.h
@@ -39,6 +39,7 @@
 
 #include "image_format.h"
 
+#include <epoxy/gl.h>
 #include <Eigen/Core>
 
 namespace movit {
@@ -75,7 +76,13 @@ float compute_chroma_offset(float pos, unsigned subsampling_factor, unsigned res
 // Given <ycbcr_format>, compute the values needed to turn Y'CbCr into R'G'B';
 // first subtract the returned offset, then left-multiply the returned matrix
 // (the scaling is already folded into it).
-void compute_ycbcr_matrix(YCbCrFormat ycbcr_format, float *offset, Eigen::Matrix3d *ycbcr_to_rgb);
+//
+// <type> is the data type you're rendering from; normally, it should match
+// <ycbcr_format.num_levels>, but for the special case of 10- and 12-bit Y'CbCr,
+// we support storing it in 16-bit formats, which incurs extra scaling factors.
+// You can get that scaling factor in <scale_factor> if you want.
+void compute_ycbcr_matrix(YCbCrFormat ycbcr_format, float *offset, Eigen::Matrix3d *ycbcr_to_rgb,
+                          GLenum type = GL_UNSIGNED_BYTE, double *scale_factor = NULL);
 
 }  // namespace movit
 
diff --git a/ycbcr_conversion_effect.cpp b/ycbcr_conversion_effect.cpp
index 6d3e909..64d23cd 100644
--- a/ycbcr_conversion_effect.cpp
+++ b/ycbcr_conversion_effect.cpp
@@ -15,8 +15,8 @@ using namespace Eigen;
 
 namespace movit {
 
-YCbCrConversionEffect::YCbCrConversionEffect(const YCbCrFormat &ycbcr_format)
-	: ycbcr_format(ycbcr_format)
+YCbCrConversionEffect::YCbCrConversionEffect(const YCbCrFormat &ycbcr_format, GLenum type)
+	: ycbcr_format(ycbcr_format), type(type)
 {
 	register_uniform_mat3("ycbcr_matrix", &uniform_ycbcr_matrix);
 	register_uniform_vec3("offset", uniform_offset);
@@ -37,7 +37,8 @@ void YCbCrConversionEffect::set_gl_state(GLuint glsl_program_num, const string &
 	Effect::set_gl_state(glsl_program_num, prefix, sampler_num);
 
 	Matrix3d ycbcr_to_rgb;
-	compute_ycbcr_matrix(ycbcr_format, uniform_offset, &ycbcr_to_rgb);
+	double scale_factor;
+	compute_ycbcr_matrix(ycbcr_format, uniform_offset, &ycbcr_to_rgb, type, &scale_factor);
 
 	uniform_ycbcr_matrix = ycbcr_to_rgb.inverse();
 
@@ -74,6 +75,9 @@ void YCbCrConversionEffect::set_gl_state(GLuint glsl_program_num, const string &
 		} else {
 			assert(false);
 		}
+		uniform_ycbcr_min[0] /= scale_factor;
+		uniform_ycbcr_min[1] /= scale_factor;
+		uniform_ycbcr_min[2] /= scale_factor;
 	}
 }
 
diff --git a/ycbcr_conversion_effect.h b/ycbcr_conversion_effect.h
index ab31fd6..f57e5fa 100644
--- a/ycbcr_conversion_effect.h
+++ b/ycbcr_conversion_effect.h
@@ -18,7 +18,7 @@ class YCbCrConversionEffect : public Effect {
 private:
 	// Should not be instantiated by end users;
 	// call EffectChain::add_ycbcr_output() instead.
-	YCbCrConversionEffect(const YCbCrFormat &ycbcr_format);
+	YCbCrConversionEffect(const YCbCrFormat &ycbcr_format, GLenum type);
 	friend class EffectChain;
 
 public:
@@ -36,6 +36,7 @@ public:
 
 private:
 	YCbCrFormat ycbcr_format;
+	GLenum type;
 
 	Eigen::Matrix3d uniform_ycbcr_matrix;
 	float uniform_offset[3];
diff --git a/ycbcr_conversion_effect_test.cpp b/ycbcr_conversion_effect_test.cpp
index a35b9f1..27a10e3 100644
--- a/ycbcr_conversion_effect_test.cpp
+++ b/ycbcr_conversion_effect_test.cpp
@@ -576,4 +576,50 @@ TEST(YCbCrConversionEffectTest, TenBitOutput) {
 	expect_equal(expected_data, expanded_out_data, 4 * width, height);
 }
 
+TEST(YCbCrConversionEffectTest, TenBitOutputInSixteen) {
+	const int width = 1;
+	const int height = 5;
+
+	// Same test inputs and outputs as TenBitOutput, except that alpha
+	// is 16 bits instead of two.
+	float data[width * height * 4] = {
+		0.0f, 0.0f, 0.0f, 1.0f,
+		1.0f, 1.0f, 1.0f, 1.0f,
+		1.0f, 0.0f, 0.0f, 1.0f,
+		0.0f, 1.0f, 0.0f, 1.0f,
+		0.0f, 0.0f, 1.0f, 1.0f,
+	};
+	uint16_t out_data[width * height * 4];
+	uint16_t expected_data[width * height * 4] = {
+		 64, 512, 512, 65535,
+		940, 512, 512, 65535,
+		250, 409, 960, 65535,
+		691, 167, 105, 65535,
+		127, 960, 471, 65535,
+	};
+
+	EffectChainTester tester(NULL, width, height, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, GL_RGBA16);
+	tester.add_input(data, FORMAT_RGBA_POSTMULTIPLIED_ALPHA, COLORSPACE_sRGB, GAMMA_sRGB);
+
+	ImageFormat format;
+	format.color_space = COLORSPACE_sRGB;
+	format.gamma_curve = GAMMA_sRGB;
+
+	YCbCrFormat ycbcr_format;
+	ycbcr_format.luma_coefficients = YCBCR_REC_709;
+	ycbcr_format.full_range = false;
+	ycbcr_format.num_levels = 1024;
+	ycbcr_format.chroma_subsampling_x = 1;
+	ycbcr_format.chroma_subsampling_y = 1;
+	ycbcr_format.cb_x_position = 0.5f;
+	ycbcr_format.cb_y_position = 0.5f;
+	ycbcr_format.cr_x_position = 0.5f;
+	ycbcr_format.cr_y_position = 0.5f;
+
+	tester.add_ycbcr_output(format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_format, YCBCR_OUTPUT_INTERLEAVED, GL_UNSIGNED_SHORT);
+	tester.run(out_data, GL_RGBA, COLORSPACE_sRGB, GAMMA_sRGB);
+
+	expect_equal(expected_data, out_data, 4 * width, height);
+}
+
 }  // namespace movit
diff --git a/ycbcr_input.cpp b/ycbcr_input.cpp
index 3e7cad9..e725750 100644
--- a/ycbcr_input.cpp
+++ b/ycbcr_input.cpp
@@ -153,20 +153,7 @@ string YCbCrInput::output_fragment_shader()
 {
 	float offset[3];
 	Matrix3d ycbcr_to_rgb;
-	compute_ycbcr_matrix(ycbcr_format, offset, &ycbcr_to_rgb);
-
-	if (type == GL_UNSIGNED_SHORT) {
-		// For 10-bit or 12-bit packed into 16-bit, we need to scale the values
-		// so that the max value goes from 1023 (or 4095) to 65535. We do this
-		// by folding the scaling into the conversion matrix, so it comes essentially
-		// for free. However, the offset is before the scaling (and thus assumes
-		// correctly scaled values), so we need to adjust that the other way.
-		double scale = 65535.0 / (ycbcr_format.num_levels - 1);
-		offset[0] /= scale;
-		offset[1] /= scale;
-		offset[2] /= scale;
-		ycbcr_to_rgb *= scale;
-	}
+	compute_ycbcr_matrix(ycbcr_format, offset, &ycbcr_to_rgb, type);
 
 	string frag_shader;
 
-- 
2.39.2