From 9ab1efe8dbd389f309a8a3e7f68c48f114a2b901 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Fri, 10 Jan 2014 00:51:52 +0100
Subject: [PATCH] Implement GammaCompressionEffect using ALU ops instead of
 texture lookups.

This is the same as GammaExpansionEffect; however, the numerical issues
are much much bigger, and it took forever to actually get the coefficients
right for proper accuracy.

Added accuracy tests to GammaCompressionEffect to make sure things are
fine; unsurprisingly, just as with GammaExpansionEffect, the old texture
code wasn't all that accurate. The effects were not as bad, though;
it primarily affects 10- and 12-bit processing, and Movit isn't ready
for 12-bit accuracy-wise.

The benchmark wins are huge; in a chain with effectively only a
GammaExpansionEffect and a GammaCompressionEffect, FPS on Sandy Bridge
(HD3000) goes up ~80%. Seemingly sampling that 4096x1 texture was really
expensive.
---
 gamma_compression_effect.cpp      | 122 ++++++++++++++++++-------
 gamma_compression_effect.frag     |  21 ++++-
 gamma_compression_effect.h        |   4 +-
 gamma_compression_effect_test.cpp | 144 +++++++++++++++++++++++++++---
 4 files changed, 240 insertions(+), 51 deletions(-)

diff --git a/gamma_compression_effect.cpp b/gamma_compression_effect.cpp
index 5581fca..b1a16a6 100644
--- a/gamma_compression_effect.cpp
+++ b/gamma_compression_effect.cpp
@@ -3,14 +3,13 @@
 #include <assert.h>
 
 #include "gamma_compression_effect.h"
+#include "effect_util.h"
 #include "util.h"
 
 GammaCompressionEffect::GammaCompressionEffect()
 	: destination_curve(GAMMA_LINEAR)
 {
 	register_int("destination_curve", (int *)&destination_curve);
-	memset(compression_curve, 0, sizeof(compression_curve));
-	register_1d_texture("compression_curve_tex", compression_curve, COMPRESSION_CURVE_SIZE);
 }
 
 std::string GammaCompressionEffect::output_fragment_shader()
@@ -18,39 +17,96 @@ std::string GammaCompressionEffect::output_fragment_shader()
 	if (destination_curve == GAMMA_LINEAR) {
 		return read_file("identity.frag");
 	}
-	if (destination_curve == GAMMA_sRGB) {
-		for (unsigned i = 0; i < COMPRESSION_CURVE_SIZE; ++i) {
-			float x = i / (float)(COMPRESSION_CURVE_SIZE - 1);
-			if (x < 0.0031308f) {
-				compression_curve[i] = 12.92f * x;
-			} else {
-				compression_curve[i] = 1.055f * pow(x, 1.0f / 2.4f) - 0.055f;
-			}
-		}
-		invalidate_1d_texture("compression_curve_tex");
-		return read_file("gamma_compression_effect.frag");
-	}
-	if (destination_curve == GAMMA_REC_709 ||  // And Rec. 601, and 10-bit Rec. 2020.
+	if (destination_curve == GAMMA_sRGB ||
+	    destination_curve == GAMMA_REC_709 ||  // Also includes Rec. 601, and 10-bit Rec. 2020.
 	    destination_curve == GAMMA_REC_2020_12_BIT) {
-		// Rec. 2020, page 3.
-		float alpha, beta;
-		if (destination_curve == GAMMA_REC_2020_12_BIT) {
-			alpha = 1.0993f;
-			beta = 0.0181f;
-		} else {
-			alpha = 1.099f;
-			beta = 0.018f;
-		}
-		for (unsigned i = 0; i < COMPRESSION_CURVE_SIZE; ++i) {
-			float x = i / (float)(COMPRESSION_CURVE_SIZE - 1);
-			if (x < beta) {
-				compression_curve[i] = 4.5f * x;
-			} else {
-				compression_curve[i] = alpha * pow(x, 0.45f) - (alpha - 1.0f);
-			}
-		}
-		invalidate_1d_texture("compression_curve_tex");
 		return read_file("gamma_compression_effect.frag");
 	}
 	assert(false);
 }
+
+void GammaCompressionEffect::set_gl_state(GLuint glsl_program_num, const std::string &prefix, unsigned *sampler_num)
+{
+	Effect::set_gl_state(glsl_program_num, prefix, sampler_num);
+
+	// See GammaExpansionEffect for more details about the approximations in use;
+	// we will primarily deal with the differences here.
+	//
+	// Like in expansion, we have a piecewise curve that for very low values
+	// (up to some Î²) are linear. Above Î², we have a power curve that looks
+	// like this:
+	//
+	//   y = É x^É£ - (É - 1)
+	//
+	// Like in expansion, we want to approximate this by some minimax polynomial
+	// in the range Î²..1. However, in this case, É£ is typically around 0.4, and
+	// x^0.4 is actually very hard to approximate accurately in this range.
+	// We do a little trick by instead asking for a polynomial of s=sqrt(x),
+	// which means we instead need something like s^0.8, which is much easier.
+	// This warps the input space a bit as seen by the minimax algorithm,
+	// but since we are optimizing for _maximum_ error and not _average_,
+	// we should not add any extra weighting factors.
+	//
+	// However, since we have problems reaching the desired accuracy (~25%
+	// of a pixel level), especially for sRGB, we modify w(x) from
+	// GammaExpansionEffect to remove the special handling of the area
+	// around Î²; it is not really as useful when the next step is just a
+	// dither and round anyway. We keep it around 1, though, since that
+	// seems to hurt less.
+	//
+	// The Maple commands this time around become (again using sRGB as an example):
+	//
+	// > alpha := 1.055;
+	// > beta := 0.0031308;
+	// > gamma_ := 1.0/2.4;
+	// > w := x -> piecewise(x > 0.999, 10, 1);
+	// > numapprox[minimax](alpha * (x^2)^gamma_ - (alpha - 1), x=sqrt(beta)..1, [4,0], w(x^2), 'maxerror');
+	//
+	// Since the error here is possible to interpret on a uniform scale,
+	// we also show it as a value relative to a 8-, 10- or 12-bit pixel level,
+	// as appropriate.
+
+	if (destination_curve == GAMMA_sRGB) {
+		// From the Wikipedia article on sRGB; É (called a+1 there) = 1.055,
+		// Î² = 0.0031308, É£ = 1/2.4.
+		// maxerror      = 0.000785 = 0.200 * 255
+		// error at 1.0  = 0.000078 = 0.020 * 255
+		set_uniform_float(glsl_program_num, prefix, "linear_scale", 12.92);
+		set_uniform_float(glsl_program_num, prefix, "c0", -0.03679675939);
+		set_uniform_float(glsl_program_num, prefix, "c1", 1.443803073);
+		set_uniform_float(glsl_program_num, prefix, "c2", -0.9239780987);
+		set_uniform_float(glsl_program_num, prefix, "c3", 0.8060491596);
+		set_uniform_float(glsl_program_num, prefix, "c4", -0.2891558568);
+		set_uniform_float(glsl_program_num, prefix, "beta", 0.0031308);
+	}
+	if (destination_curve == GAMMA_REC_709) {  // Also includes Rec. 601, and 10-bit Rec. 2020.
+		// Rec. 2020, page 3; É = 1.099, Î² = 0.018, É£ = 0.45.
+		// maxerror      = 0.000131 = 0.033 * 255 = 0.134 * 1023
+		// error at 1.0  = 0.000013 = 0.003 * 255 = 0.013 * 1023
+		set_uniform_float(glsl_program_num, prefix, "linear_scale", 4.5);
+		set_uniform_float(glsl_program_num, prefix, "c0", -0.08541688528);
+		set_uniform_float(glsl_program_num, prefix, "c1", 1.292793370);
+		set_uniform_float(glsl_program_num, prefix, "c2", -0.4070417645);
+		set_uniform_float(glsl_program_num, prefix, "c3", 0.2923891828);
+		set_uniform_float(glsl_program_num, prefix, "c4", -0.09273699351);
+		set_uniform_float(glsl_program_num, prefix, "beta", 0.018);
+	}
+	if (destination_curve == GAMMA_REC_2020_12_BIT) {
+		// Rec. 2020, page 3; É = 1.0993, Î² = 0.0181, É£ = 0.45.
+		// maxerror      = 0.000130 = 0.533 * 4095
+		// error at 1.0  = 0.000013 = 0.053 * 4095
+		//
+		// Note that this error is above one half of a pixel level,
+		// which means that a few values will actually be off in the lowest
+		// bit. (Removing the constraint for x=1 will only take this down
+		// from 0.553 to 0.501; adding a fifth order can get it down to
+		// 0.167, although this assumes working in fp64 and not fp32.)
+		set_uniform_float(glsl_program_num, prefix, "linear_scale", 4.5);
+		set_uniform_float(glsl_program_num, prefix, "c0", -0.08569685663);
+		set_uniform_float(glsl_program_num, prefix, "c1", 1.293000900);
+		set_uniform_float(glsl_program_num, prefix, "c2", -0.4067291321);
+		set_uniform_float(glsl_program_num, prefix, "c3", 0.2919741179);
+		set_uniform_float(glsl_program_num, prefix, "c4", -0.09256205770);
+		set_uniform_float(glsl_program_num, prefix, "beta", 0.0181);
+	}
+}
diff --git a/gamma_compression_effect.frag b/gamma_compression_effect.frag
index 27e014e..e1e786c 100644
--- a/gamma_compression_effect.frag
+++ b/gamma_compression_effect.frag
@@ -1,11 +1,24 @@
-// Compress to sRGB gamma curve.
+// Compress gamma curve.
+
+uniform float PREFIX(linear_scale);
+uniform float PREFIX(c0), PREFIX(c1), PREFIX(c2), PREFIX(c3), PREFIX(c4);
+uniform float PREFIX(beta);
 
 vec4 FUNCNAME(vec2 tc) {
 	vec4 x = INPUT(tc);
 
-	x.r = texture1D(PREFIX(compression_curve_tex), x.r).x;
-	x.g = texture1D(PREFIX(compression_curve_tex), x.g).x;
-	x.b = texture1D(PREFIX(compression_curve_tex), x.b).x;
+	// We could reasonably get values outside (0.0, 1.0), but the formulas below
+	// are not valid outside that range, so clamp before we do anything else.
+	x.rgb = clamp(x.rgb, 0.0, 1.0);
+
+	vec3 a = x.rgb * PREFIX(linear_scale);
+
+	// Fourth-order polynomial approximation to pow(). See the .cpp file for details.
+	vec3 s = sqrt(x.rgb);
+	vec3 b = PREFIX(c0) + (PREFIX(c1) + (PREFIX(c2) + (PREFIX(c3) + PREFIX(c4) * s) * s) * s) * s;
+
+	vec3 f = vec3(greaterThan(x.rgb, vec3(PREFIX(beta))));
+	x = vec4(mix(a, b, f), x.a);
 
 	return x;
 }
diff --git a/gamma_compression_effect.h b/gamma_compression_effect.h
index 235850a..342e026 100644
--- a/gamma_compression_effect.h
+++ b/gamma_compression_effect.h
@@ -14,8 +14,6 @@
 #include "effect.h"
 #include "image_format.h"
 
-#define COMPRESSION_CURVE_SIZE 4096
-
 class GammaCompressionEffect : public Effect {
 private:
 	// Should not be instantiated by end users.
@@ -25,6 +23,7 @@ private:
 public:
 	virtual std::string effect_type_id() const { return "GammaCompressionEffect"; }
 	std::string output_fragment_shader();
+	virtual void set_gl_state(GLuint glsl_program_num, const std::string &prefix, unsigned *sampler_num);
 
 	virtual bool needs_srgb_primaries() const { return false; }
 
@@ -34,7 +33,6 @@ public:
 
 private:
 	GammaCurve destination_curve;
-	float compression_curve[COMPRESSION_CURVE_SIZE];
 };
 
 #endif // !defined(_MOVIT_GAMMA_COMPRESSION_EFFECT_H)
diff --git a/gamma_compression_effect_test.cpp b/gamma_compression_effect_test.cpp
index e65acee..465d239 100644
--- a/gamma_compression_effect_test.cpp
+++ b/gamma_compression_effect_test.cpp
@@ -2,7 +2,10 @@
 //
 // Pretty much the inverse of the GammaExpansionEffect tests;
 // EffectChainTest tests that they are actually inverses.
+// However, the accuracy tests are somewhat simpler, since we
+// only need to care about absolute errors and not relative.
 
+#include <math.h>
 #include <GL/glew.h>
 #include "gtest/gtest.h"
 #include "image_format.h"
@@ -12,16 +15,18 @@ TEST(GammaCompressionEffectTest, sRGB_KeyValues) {
 	float data[] = {
 		0.0f, 1.0f,
 		0.00309f, 0.00317f,   // On either side of the discontinuity.
+		-0.5f, 1.5f,          // To check clamping.
 	};
 	float expected_data[] = {
 		0.0f, 1.0f,
 		0.040f, 0.041f,
+		0.0f, 1.0f,
 	};
 	float out_data[4];
-	EffectChainTester tester(data, 2, 2, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+	EffectChainTester tester(data, 2, 3, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
 	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_sRGB);
 
-	expect_equal(expected_data, out_data, 2, 2);
+	expect_equal(expected_data, out_data, 2, 3);
 }
 
 TEST(GammaCompressionEffectTest, sRGB_RampAlwaysIncreases) {
@@ -38,6 +43,33 @@ TEST(GammaCompressionEffectTest, sRGB_RampAlwaysIncreases) {
 	}
 }
 
+TEST(GammaCompressionEffectTest, sRGB_Accuracy) {
+	float data[256], expected_data[256], out_data[256];
+
+	for (int i = 0; i < 256; ++i) {
+		double x = i / 255.0;
+
+		expected_data[i] = x;
+
+		// From the Wikipedia article on sRGB.
+		if (x < 0.04045) {
+			data[i] = x / 12.92;
+		} else {
+			data[i] = pow((x + 0.055) / 1.055, 2.4);
+		}
+	}
+
+	EffectChainTester tester(data, 256, 1, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, GL_RGBA32F);
+	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_sRGB);
+
+	// Maximum absolute error is 25% of one pixel level. For comparison,
+	// a straightforward ALU solution (using a branch and pow()), used as a
+	// âhigh anchorâ to indicate limitations of float arithmetic etc.,
+	// reaches maximum absolute error of 3.7% of one pixel level
+	// and rms of 3.2e-6.
+	expect_equal(expected_data, out_data, 256, 1, 0.25 / 255.0, 1e-4);
+}
+
 TEST(GammaCompressionEffectTest, Rec709_KeyValues) {
 	float data[] = {
 		0.0f, 1.0f,
@@ -68,23 +100,113 @@ TEST(GammaCompressionEffectTest, Rec709_RampAlwaysIncreases) {
 	}
 }
 
+TEST(GammaCompressionEffectTest, Rec709_Accuracy) {
+	float data[256], expected_data[256], out_data[256];
+
+	for (int i = 0; i < 256; ++i) {
+		double x = i / 255.0;
+
+		expected_data[i] = x;
+
+		// Rec. 2020, page 3.
+		if (x < 0.018 * 4.5) {
+			data[i] = x / 4.5;
+		} else {
+			data[i] = pow((x + 0.099) / 1.099, 1.0 / 0.45);
+		}
+	}
+
+	EffectChainTester tester(data, 256, 1, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, GL_RGBA32F);
+	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_REC_709);
+
+	// Maximum absolute error is 25% of one pixel level. For comparison,
+	// a straightforward ALU solution (using a branch and pow()), used as a
+	// âhigh anchorâ to indicate limitations of float arithmetic etc.,
+	// reaches maximum absolute error of 3.7% of one pixel level
+	// and rms of 3.5e-6.
+	expect_equal(expected_data, out_data, 256, 1, 0.25 / 255.0, 1e-5);
+}
+
+// This test tests the same gamma ramp as Rec709_Accuracy, but with 10-bit
+// input range and somewhat looser error bounds. (One could claim that this is
+// already on the limit of what we can reasonably do with fp16 input, if you
+// look at the local relative error.)
+TEST(GammaCompressionEffectTest, Rec2020_10Bit_Accuracy) {
+	float data[1024], expected_data[1024], out_data[1024];
+
+	for (int i = 0; i < 1024; ++i) {
+		double x = i / 1023.0;
+
+		expected_data[i] = x;
+
+		// Rec. 2020, page 3.
+		if (x < 0.018 * 4.5) {
+			data[i] = x / 4.5;
+		} else {
+			data[i] = pow((x + 0.099) / 1.099, 1.0 / 0.45);
+		}
+	}
+
+	EffectChainTester tester(data, 1024, 1, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, GL_RGBA32F);
+	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_REC_2020_10_BIT);
+
+	// Maximum absolute error is 30% of one pixel level. For comparison,
+	// a straightforward ALU solution (using a branch and pow()), used as a
+	// âhigh anchorâ to indicate limitations of float arithmetic etc.,
+	// reaches maximum absolute error of 25.2% of one pixel level
+	// and rms of 1.8e-6, so this is probably mostly related to input precision.
+	expect_equal(expected_data, out_data, 1024, 1, 0.30 / 1023.0, 1e-5);
+}
+
 TEST(GammaCompressionEffectTest, Rec2020_12BitIsVeryCloseToRec709) {
-	float data[256];
-	for (unsigned i = 0; i < 256; ++i) {
-		data[i] = i / 255.0f;
+	float data[4096];
+	for (unsigned i = 0; i < 4096; ++i) {
+		data[i] = i / 4095.0f;
 	}
-	float out_data_709[256];
-	float out_data_2020[256];
+	float out_data_709[4096];
+	float out_data_2020[4096];
 
-	EffectChainTester tester(data, 256, 1, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+	EffectChainTester tester(data, 4096, 1, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
 	tester.run(out_data_709, GL_RED, COLORSPACE_sRGB, GAMMA_REC_709);
-	EffectChainTester tester2(data, 256, 1, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+	EffectChainTester tester2(data, 4096, 1, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
 	tester2.run(out_data_2020, GL_RED, COLORSPACE_sRGB, GAMMA_REC_2020_12_BIT);
 
 	double sqdiff = 0.0;
-	for (unsigned i = 0; i < 256; ++i) {
-		EXPECT_NEAR(out_data_709[i], out_data_2020[i], 1e-3);
+	for (unsigned i = 0; i < 4096; ++i) {
+		EXPECT_NEAR(out_data_709[i], out_data_2020[i], 0.001);
 		sqdiff += (out_data_709[i] - out_data_2020[i]) * (out_data_709[i] - out_data_2020[i]);
 	}
 	EXPECT_GT(sqdiff, 1e-6);
 }
+
+// The fp16 _input_ provided by FlatInput is not enough to distinguish between
+// all of the possible 12-bit input values (every other level translates to the
+// same value). Thus, this test has extremely loose bounds; if we ever decide
+// to start supporting fp32, we should re-run this and tighten them a lot.
+TEST(GammaCompressionEffectTest, Rec2020_12Bit_Inaccuracy) {
+	float data[4096], expected_data[4096], out_data[4096];
+
+	for (int i = 0; i < 4096; ++i) {
+		double x = i / 4095.0;
+
+		expected_data[i] = x;
+
+		// Rec. 2020, page 3.
+		if (x < 0.0181 * 4.5) {
+			data[i] = x / 4.5;
+		} else {
+			data[i] = pow((x + 0.0993) / 1.0993, 1.0 / 0.45);
+		}
+	}
+
+	EffectChainTester tester(data, 4096, 1, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, GL_RGBA32F);
+	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_REC_2020_12_BIT);
+
+	// Maximum absolute error is 120% of one pixel level. For comparison,
+	// a straightforward ALU solution (using a branch and pow()), used as a
+	// âhigh anchorâ to indicate limitations of float arithmetic etc.,
+	// reaches maximum absolute error of 71.1% of one pixel level
+	// and rms of 0.9e-6, so this is probably a combination of input
+	// precision and inaccuracies in the polynomial approximation.
+	expect_equal(expected_data, out_data, 4096, 1, 1.2 / 4095.0, 1e-5);
+}
-- 
2.39.2