From 9a00101dbb6f98d21c6b8ce4d33200af840ea908 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Wed, 8 Jan 2014 22:14:42 +0100
Subject: [PATCH 1/1] Implement GammaExpansionEffect using ALU ops instead of
 texture lookups.

In a standalone benchmark (on a Sandy Bridge laptop), this is pretty much
a no-op performance-wise, but when more ops are put into the mix, it's
a ~20% FPS win, and in a more realistic situation with multiple inputs
etc., it's subjectively also a pretty clear win. The reason is probably
that we generally are way overloaded on texture operations.

Note that we had similar code like this before (before we started using
the texture for lookup), but it used pow(), which is markedly slower than our
fourth-degree polynomial approximation.

We should probably do the same for GammaCompressionEffect.
---
 gamma_expansion_effect.cpp  | 130 +++++++++++++++++++++++++++---------
 gamma_expansion_effect.frag |  14 +++-
 gamma_expansion_effect.h    |   4 +-
 3 files changed, 109 insertions(+), 39 deletions(-)

diff --git a/gamma_expansion_effect.cpp b/gamma_expansion_effect.cpp
index 5b5f44d..efec553 100644
--- a/gamma_expansion_effect.cpp
+++ b/gamma_expansion_effect.cpp
@@ -1,6 +1,7 @@
 #include <math.h>
 #include <assert.h>
 
+#include "effect_util.h"
 #include "gamma_expansion_effect.h"
 #include "util.h"
 
@@ -8,8 +9,6 @@ GammaExpansionEffect::GammaExpansionEffect()
 	: source_curve(GAMMA_LINEAR)
 {
 	register_int("source_curve", (int *)&source_curve);
-	memset(expansion_curve, 0, sizeof(expansion_curve));
-	register_1d_texture("expansion_curve_tex", expansion_curve, EXPANSION_CURVE_SIZE);
 }
 
 std::string GammaExpansionEffect::output_fragment_shader()
@@ -17,39 +16,104 @@ std::string GammaExpansionEffect::output_fragment_shader()
 	if (source_curve == GAMMA_LINEAR) {
 		return read_file("identity.frag");
 	}
-	if (source_curve == GAMMA_sRGB) {
-		for (unsigned i = 0; i < EXPANSION_CURVE_SIZE; ++i) {
-			float x = i / (float)(EXPANSION_CURVE_SIZE - 1);
-			if (x < 0.04045f) {
-				expansion_curve[i] = (1.0/12.92f) * x;
-			} else {
-				expansion_curve[i] = pow((x + 0.055) * (1.0/1.055f), 2.4);
-			}
-		}
-		invalidate_1d_texture("expansion_curve_tex");
-		return read_file("gamma_expansion_effect.frag");
-	}
-	if (source_curve == GAMMA_REC_709 ||  // Also includes Rec. 601, and 10-bit Rec. 2020.
+	if (source_curve == GAMMA_sRGB ||
+	    source_curve == GAMMA_REC_709 ||  // Also includes Rec. 601, and 10-bit Rec. 2020.
 	    source_curve == GAMMA_REC_2020_12_BIT) {
-		// Rec. 2020, page 3.
-		float alpha, beta;
-		if (source_curve == GAMMA_REC_2020_12_BIT) {
-			alpha = 1.0993f;
-			beta = 0.0181f;
-		} else {
-			alpha = 1.099f;
-			beta = 0.018f;
-		}
-		for (unsigned i = 0; i < EXPANSION_CURVE_SIZE; ++i) {
-			float x = i / (float)(EXPANSION_CURVE_SIZE - 1);
-			if (x < beta * 4.5f) {
-				expansion_curve[i] = (1.0/4.5f) * x;
-			} else {
-				expansion_curve[i] = pow((x + (alpha - 1.0f)) / alpha, 1.0f/0.45f);
-			}
-		}
-		invalidate_1d_texture("expansion_curve_tex");
 		return read_file("gamma_expansion_effect.frag");
 	}
 	assert(false);
 }
+
+void GammaExpansionEffect::set_gl_state(GLuint glsl_program_num, const std::string &prefix, unsigned *sampler_num)
+{
+	Effect::set_gl_state(glsl_program_num, prefix, sampler_num);
+
+	// All of these curves follow a continuous curve that's piecewise defined;
+	// very low values (up to some β) are linear. Above β, we have a power curve
+	// that looks like this:
+	//
+	//   y = ((x + α - 1) / α)^γ
+	//
+	// However, pow() is relatively slow in GLSL, so we approximate this
+	// part by a minimax polynomial, whose coefficients are precalculated
+	// in Maple. (It is very hard to accurately model the curve as a whole
+	// using minimax polynomials; both Maple and Mathematica generally
+	// just error out if you ask them to optimize over 0..1 with a higher-degree
+	// polynomial.)
+	//
+	// We put some extra weight on areas near β to keep a continuous curve,
+	// and near 1.0, since we'd really like f(1.0) = 1.0, or approximately so.
+	// The following Maple commands, using sRGB below as an example, will
+	// compute the coefficients:
+	//
+	// > alpha := 1.055;
+	// > beta := 0.04045;
+	// > gamma_ := 2.4;
+	// > w := x -> piecewise(x < beta + 0.001, 10, x > 0.999, 10, 1);
+	// > numapprox[minimax](((x + alpha - 1) / alpha)^gamma_, x=beta..1, [4,0], w(x), 'maxerror');
+	//
+	// The variable 'maxerror' will then contain the maximum absolute error
+	// at any point of the curve, and we report this along with the absolute
+	// error at beta and at 1.0. Keep in mind that along this curve,
+	// the smallest difference between any two 8-bit sRGB pixel levels
+	// (in the exponential part of the curve) in linear light is that
+	// between 11/255 and 12/255, which is about 0.00033 (or three to four
+	// times the sRGB maxerror below). The choice of a fourth-degree
+	// polynomial was made with this in mind; we have not cared equally
+	// much about 10- and 12-bit Rec. 2020.
+	//
+	// NOTE: The error at beta is compared to the _linear_ part of the curve.
+	// Since the standards give these with only a few decimals, it means that
+	// the linear and exponential parts will not match up exactly, and even
+	// a perfect approximation will have error > 0 here; sometimes, even larger
+	// than maxerror for the curve itself.
+
+	if (source_curve == GAMMA_sRGB) {
+		// From the Wikipedia article on sRGB; α (called a+1 there) = 1.055,
+		// β = 0.04045, γ = 2.4.
+		// maxerror      = 0.000094
+		// error at beta = 0.000094
+		// error at 1.0  = 0.000094
+		set_uniform_float(glsl_program_num, prefix, "linear_scale", 1.0 / 12.92);
+		set_uniform_float(glsl_program_num, prefix, "c0", 0.001324469581);
+		set_uniform_float(glsl_program_num, prefix, "c1", 0.02227416690);
+		set_uniform_float(glsl_program_num, prefix, "c2", 0.5917615253);
+		set_uniform_float(glsl_program_num, prefix, "c3", 0.4733532353);
+		set_uniform_float(glsl_program_num, prefix, "c4", -0.08880738120);
+		set_uniform_float(glsl_program_num, prefix, "beta", 0.04045);
+	}
+	if (source_curve == GAMMA_REC_709) {  // Also includes Rec. 601, and 10-bit Rec. 2020.
+		// Rec. 2020, page 3; α = 1.099, β = 0.018 * 4.5, γ = 1/0.45.
+		// maxerror      = 0.000043
+		// error at beta = 0.000051 (see note above!)
+		// error at 1.0  = 0.000004
+		//
+		// Note that Rec. 2020 only gives the other direction, which is why
+		// our beta and gamma are different from the numbers mentioned
+		// (we've inverted the formula).
+		set_uniform_float(glsl_program_num, prefix, "linear_scale", 1.0 / 4.5);
+		set_uniform_float(glsl_program_num, prefix, "c0", 0.005137028744);
+		set_uniform_float(glsl_program_num, prefix, "c1", 0.09802596889);
+		set_uniform_float(glsl_program_num, prefix, "c2", 0.7255768864);
+		set_uniform_float(glsl_program_num, prefix, "c3", 0.2135067966);
+		set_uniform_float(glsl_program_num, prefix, "c4", -0.04225094667);
+		set_uniform_float(glsl_program_num, prefix, "beta", 0.018 * 4.5);
+	}
+	if (source_curve == GAMMA_REC_2020_12_BIT) {
+		// Rec. 2020, page 3; α = 1.0993, β = 0.0181 * 4.5, γ = 1/0.45.
+		// maxerror      = 0.000042
+		// error at beta = 0.000005
+		// error at 1.0  = 0.000004
+		//
+		// Note that Rec. 2020 only gives the other direction, which is why
+		// our beta and gamma are different from the numbers mentioned
+		// (we've inverted the formula).
+		set_uniform_float(glsl_program_num, prefix, "linear_scale", 1.0 / 4.5);
+		set_uniform_float(glsl_program_num, prefix, "c0", 0.005167545928);
+		set_uniform_float(glsl_program_num, prefix, "c1", 0.09835585809);
+		set_uniform_float(glsl_program_num, prefix, "c2", 0.7254820139);
+		set_uniform_float(glsl_program_num, prefix, "c3", 0.2131291155);
+		set_uniform_float(glsl_program_num, prefix, "c4", -0.04213877222);
+		set_uniform_float(glsl_program_num, prefix, "beta", 0.0181 * 4.5);
+	}
+}
diff --git a/gamma_expansion_effect.frag b/gamma_expansion_effect.frag
index 4733656..359b026 100644
--- a/gamma_expansion_effect.frag
+++ b/gamma_expansion_effect.frag
@@ -1,11 +1,19 @@
 // Expand gamma curve.
 
+uniform float PREFIX(linear_scale);
+uniform float PREFIX(c0), PREFIX(c1), PREFIX(c2), PREFIX(c3), PREFIX(c4);
+uniform float PREFIX(beta);
+
 vec4 FUNCNAME(vec2 tc) {
 	vec4 x = INPUT(tc);
 
-	x.r = texture1D(PREFIX(expansion_curve_tex), x.r).x;
-	x.g = texture1D(PREFIX(expansion_curve_tex), x.g).x;
-	x.b = texture1D(PREFIX(expansion_curve_tex), x.b).x;
+	vec3 a = x.rgb * PREFIX(linear_scale);
+
+	// Fourth-order polynomial approximation to pow(). See the .cpp file for details.
+	vec3 b = PREFIX(c0) + (PREFIX(c1) + (PREFIX(c2) + (PREFIX(c3) + PREFIX(c4) * x.rgb) * x.rgb) * x.rgb) * x.rgb;
+
+	vec3 f = vec3(greaterThan(x.rgb, vec3(PREFIX(beta))));
+	x = vec4(mix(a, b, f), x.a);
 
 	return x;
 }
diff --git a/gamma_expansion_effect.h b/gamma_expansion_effect.h
index e4a94b3..679c16d 100644
--- a/gamma_expansion_effect.h
+++ b/gamma_expansion_effect.h
@@ -12,8 +12,6 @@
 #include "effect.h"
 #include "image_format.h"
 
-#define EXPANSION_CURVE_SIZE 256
-
 class GammaExpansionEffect : public Effect {
 private:
 	// Should not be instantiated by end users.
@@ -23,6 +21,7 @@ private:
 public:
 	virtual std::string effect_type_id() const { return "GammaExpansionEffect"; }
 	std::string output_fragment_shader();
+	virtual void set_gl_state(GLuint glsl_program_num, const std::string &prefix, unsigned *sampler_num);
 
 	virtual bool needs_linear_light() const { return false; }
 	virtual bool needs_srgb_primaries() const { return false; }
@@ -33,7 +32,6 @@ public:
 
 private:
 	GammaCurve source_curve;
-	float expansion_curve[EXPANSION_CURVE_SIZE];
 };
 
 #endif // !defined(_MOVIT_GAMMA_EXPANSION_EFFECT_H)
-- 
2.39.2