From: Steinar H. Gunderson <sgunderson@bigfoot.com>
Date: Sun, 28 Oct 2012 16:40:56 +0000 (+0100)
Subject: Make combine_two_pixels() handle the fact that the GPU has limited subpixel interpola... 
X-Git-Tag: 1.0~220
X-Git-Url: https://git.sesse.net/?p=movit;a=commitdiff_plain;h=34f5f331bd3b643949ccbb0c3488af2823634d59

Make combine_two_pixels() handle the fact that the GPU has limited subpixel interpolation precision.

Currently we only use it to somewhat improve the total_weight estimate
to reduce the error slightly. However, the function can now also return
the total error estimate, which will be useful for knowing when we can
and cannot combine weights with reasonable error.
---

diff --git a/Makefile b/Makefile
index edbe09b..59a89e7 100644
--- a/Makefile
+++ b/Makefile
@@ -31,7 +31,7 @@ TESTS += flat_input_test
 TESTS += ycbcr_input_test
 
 # Core.
-LIB_OBJS=util.o widgets.o effect.o effect_chain.o
+LIB_OBJS=util.o widgets.o effect.o effect_chain.o init.o
 
 # Inputs.
 LIB_OBJS += flat_input.o
diff --git a/blur_effect.cpp b/blur_effect.cpp
index 90d1b68..ec61aea 100644
--- a/blur_effect.cpp
+++ b/blur_effect.cpp
@@ -162,7 +162,7 @@ void SingleBlurPassEffect::set_gl_state(GLuint glsl_program_num, const std::stri
 		float w2 = weight[base_pos + 1];
 
 		float offset, total_weight;
-		combine_two_samples(w1, w2, &offset, &total_weight);
+		combine_two_samples(w1, w2, &offset, &total_weight, NULL);
 
 		float x = 0.0f, y = 0.0f;
 
diff --git a/demo.cpp b/demo.cpp
index 552380a..2243f2d 100644
--- a/demo.cpp
+++ b/demo.cpp
@@ -18,6 +18,7 @@
 #include <SDL/SDL_opengl.h>
 #include <SDL/SDL_image.h>
 
+#include "init.h"
 #include "effect.h"
 #include "effect_chain.h"
 #include "util.h"
@@ -152,14 +153,16 @@ int main(int argc, char **argv)
 	SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
 	SDL_SetVideoMode(WIDTH, HEIGHT, 0, SDL_OPENGL);
 	SDL_WM_SetCaption("OpenGL window", NULL);
-	
-	// geez	
-	glPixelStorei(GL_PACK_ALIGNMENT, 1);
 
+	init_movit();
+	printf("GPU texture subpixel precision: about %.1f bits\n",
+		log2(1.0f / movit_texel_subpixel_precision));
+	
 	unsigned img_w, img_h;
 	unsigned char *src_img = load_image("blg_wheels_woman_1.jpg", &img_w, &img_h);
 
 	EffectChain chain(WIDTH, HEIGHT);
+	glViewport(0, 0, WIDTH, HEIGHT);
 
 	ImageFormat inout_format;
 	inout_format.color_space = COLORSPACE_sRGB;
diff --git a/init.cpp b/init.cpp
new file mode 100644
index 0000000..129ee4c
--- /dev/null
+++ b/init.cpp
@@ -0,0 +1,137 @@
+#include "init.h"
+#include "opengl.h"
+#include "util.h"
+
+bool movit_initialized = false;
+float movit_texel_subpixel_precision;
+
+namespace {
+
+void measure_texel_subpixel_precision()
+{
+	static const unsigned width = 1024;
+
+	// Generate a destination texture to render to, and an FBO.
+	GLuint dst_texnum, fbo;
+
+	glGenTextures(1, &dst_texnum);
+	check_error();
+	glBindTexture(GL_TEXTURE_2D, dst_texnum);
+	check_error();
+	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, width, 1, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+	check_error();
+
+	glGenFramebuffers(1, &fbo);
+	check_error();
+	glBindFramebuffer(GL_FRAMEBUFFER, fbo);
+	check_error();
+	glFramebufferTexture2D(
+		GL_FRAMEBUFFER,
+		GL_COLOR_ATTACHMENT0,
+		GL_TEXTURE_2D,
+		dst_texnum,
+		0);
+	check_error();
+
+	// Now generate a simple texture that's just [0,1].
+	GLuint src_texnum;
+	float texdata[] = { 0, 1 };
+	glGenTextures(1, &dst_texnum);
+	check_error();
+	glBindTexture(GL_TEXTURE_1D, dst_texnum);
+	check_error();
+	glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+	check_error();
+	glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+	check_error();
+	glTexImage1D(GL_TEXTURE_1D, 0, GL_LUMINANCE16F_ARB, 2, 0, GL_LUMINANCE, GL_FLOAT, texdata);
+	check_error();
+	glEnable(GL_TEXTURE_1D);
+	check_error();
+
+	// Basic state.
+	glDisable(GL_BLEND);
+	check_error();
+	glDisable(GL_DEPTH_TEST);
+	check_error();
+	glDepthMask(GL_FALSE);
+	check_error();
+
+	glViewport(0, 0, width, 1);
+
+	glMatrixMode(GL_PROJECTION);
+	glLoadIdentity();
+	glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
+
+	glMatrixMode(GL_MODELVIEW);
+	glLoadIdentity();
+	check_error();
+
+	// Draw the texture stretched over a long quad, interpolating it out.
+	// Note that since the texel center is in (0.5), we need to adjust the
+	// texture coordinates in order not to get long stretches of (1,1,1,...)
+	// at the start and (...,0,0,0) at the end.
+	glBegin(GL_QUADS);
+
+	glTexCoord1f(0.25f);
+	glVertex2f(0.0f, 0.0f);
+
+	glTexCoord1f(0.75f);
+	glVertex2f(1.0f, 0.0f);
+
+	glTexCoord1f(0.75f);
+	glVertex2f(1.0f, 1.0f);
+
+	glTexCoord1f(0.25f);
+	glVertex2f(0.0f, 1.0f);
+
+	glEnd();
+	check_error();
+
+	glDisable(GL_TEXTURE_1D);
+	check_error();
+
+	// Now read the data back and see what the card did.
+	// (We only look at the red channel; the others will surely be the same.)
+	// We assume a linear ramp; anything else will give sort of odd results here.
+	float out_data[width];
+	glReadPixels(0, 0, width, 1, GL_RED, GL_FLOAT, out_data);
+	check_error();
+
+	float biggest_jump = 0.0f;
+	for (unsigned i = 1; i < width; ++i) {
+		assert(out_data[i] >= out_data[i - 1]);
+		biggest_jump = std::max(biggest_jump, out_data[i] - out_data[i - 1]);
+	}
+
+	movit_texel_subpixel_precision = biggest_jump;
+
+	// Clean up.
+	glBindTexture(GL_TEXTURE_1D, 0);
+	check_error();
+	glBindFramebuffer(GL_FRAMEBUFFER, 0);
+	check_error();
+	glDeleteFramebuffers(1, &fbo);
+	check_error();
+	glDeleteTextures(1, &dst_texnum);
+	check_error();
+	glDeleteTextures(1, &src_texnum);
+	check_error();
+}
+
+}  // namespace
+
+void init_movit()
+{
+	if (movit_initialized) {
+		return;
+	}
+
+	// geez	
+	glPixelStorei(GL_PACK_ALIGNMENT, 1);
+	glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+
+	measure_texel_subpixel_precision();
+
+	movit_initialized = true;
+}
diff --git a/init.h b/init.h
new file mode 100644
index 0000000..e0dbe63
--- /dev/null
+++ b/init.h
@@ -0,0 +1,21 @@
+#ifndef _INIT_H
+#define _INIT_H
+
+// Initialize the library; in particular, will query the GPU for information
+// that is needed by various components. (In time, for instance, we will query
+// about extensions here.)
+void init_movit();
+
+// GPU features. These are not intended for end-user use.
+
+// Whether init_movit() has been called.
+extern bool movit_initialized;
+
+// An estimate on the number of different levels the linear texture interpolation
+// of the GPU can deliver. My Intel card seems to be limited to 2^6 levels here,
+// while a modern nVidia card (GTX 550 Ti) seem to use 2^8.
+//
+// We currently don't bother to test above 2^10.
+extern float movit_texel_subpixel_precision;
+
+#endif  // !defined(_INIT_H)
diff --git a/test_util.cpp b/test_util.cpp
index 6beaba1..afeb848 100644
--- a/test_util.cpp
+++ b/test_util.cpp
@@ -1,3 +1,4 @@
+#include "init.h"
 #include "test_util.h"
 #include "flat_input.h"
 #include "gtest/gtest.h"
@@ -11,6 +12,8 @@ EffectChainTester::EffectChainTester(const float *data, unsigned width, unsigned
                                      MovitPixelFormat pixel_format, Colorspace color_space, GammaCurve gamma_curve)
 	: chain(width, height), width(width), height(height), finalized(false)
 {
+	init_movit();
+
 	if (data != NULL) {
 		add_input(data, pixel_format, color_space, gamma_curve);
 	}
diff --git a/util.cpp b/util.cpp
index c6517c9..eeea198 100644
--- a/util.cpp
+++ b/util.cpp
@@ -5,6 +5,7 @@
 #include <math.h>
 #include "util.h"
 #include "opengl.h"
+#include "init.h"
 
 void hsv2rgb(float h, float s, float v, float *r, float *g, float *b)
 {
@@ -125,16 +126,44 @@ std::string output_glsl_mat3(const std::string &name, const Eigen::Matrix3d &m)
 	return buf;
 }
 
-void combine_two_samples(float w1, float w2, float *offset, float *total_weight)
+void combine_two_samples(float w1, float w2, float *offset, float *total_weight, float *sum_sq_error)
 {
+	assert(movit_initialized);
 	assert(w1 * w2 >= 0.0f);  // Should not have differing signs.
+	float z;  // Just a shorter name for offset.
 	if (fabs(w1 + w2) < 1e-6) {
-		*offset = 0.5f;
-		*total_weight = 0.0f;
+		z = 0.5f;
 	} else {
-		*offset = w2 / (w1 + w2);
-		*total_weight = w1 + w2;
+		z = w2 / (w1 + w2);
 	}
+
+	// Round to the minimum number of bits we have measured earlier.
+	// The card will do this for us anyway, but if we know what the real z
+	// is, we can pick a better total_weight below.
+	z = lrintf(z / movit_texel_subpixel_precision) * movit_texel_subpixel_precision;
+	
+	// Choose total weight w so that we minimize total squared error
+	// for the effective weights:
+	//
+	//   e = (w(1-z) - a)Â² + (wz - b)Â²
+	//
+	// Differentiating by w and setting equal to zero:
+	//
+	//   2(w(1-z) - a)(1-z) + 2(wz - b)z = 0
+	//   w(1-z)Â² - a(1-z) + wzÂ² - bz = 0
+	//   w((1-z)Â² + zÂ²) = a(1-z) + bz
+	//   w = (a(1-z) + bz) / ((1-z)Â² + zÂ²)
+	//
+	// If z had infinite precision, this would simply reduce to w = w1 + w2.
+	*total_weight = (w1 * (1 - z) + w2 * z) / (z * z + (1 - z) * (1 - z));
+	*offset = z;
+
+	if (sum_sq_error != NULL) {
+		float err1 = *total_weight * (1 - z) - w1;
+		float err2 = *total_weight * z - w2;
+		*sum_sq_error = err1 * err1 + err2 * err2;
+	}
+
 	assert(*offset >= 0.0f);
 	assert(*offset <= 1.0f);
 }
diff --git a/util.h b/util.h
index 414c208..082d114 100644
--- a/util.h
+++ b/util.h
@@ -36,7 +36,12 @@ std::string output_glsl_mat3(const std::string &name, const Eigen::Matrix3d &m);
 
 // Calculate where to sample, and with what weight, if one wants to use
 // the GPU's bilinear hardware to sample w1 * x[0] + w2 * x[1].
-void combine_two_samples(float w1, float w2, float *offset, float *total_weight);
+//
+// Note that since the GPU might have limited precision in its linear
+// interpolation, the effective weights might be different from the ones you
+// asked for. sum_sq_error, if not NULL, will contain the sum of the
+// (estimated) squared errors of the two weights.
+void combine_two_samples(float w1, float w2, float *offset, float *total_weight, float *sum_sq_error);
 
 #ifdef NDEBUG
 #define check_error()