From: Steinar H. Gunderson <sgunderson@bigfoot.com>
Date: Sat, 21 Feb 2015 01:27:14 +0000 (+0100)
Subject: When combining samples, take fp16 rounding into account.
X-Git-Tag: 1.1.3~15
X-Git-Url: https://git.sesse.net/?p=movit;a=commitdiff_plain;h=42f0fd5ccbb3560a76d55f3e725416a5e0f93523

When combining samples, take fp16 rounding into account.

This makes us somewhat more conservative in combining samples;
when we are near the lower/right edges of the image, we are starting
to get close to 1.0, and fp16 just doesn't have enough precision
to give us the 6 or 8 bits of subpixel precision we want (it is
hardly enough to address individual pixels!). In particular, this
can affect zooming with ResampleEffect, as reported by Christophe
Thommeret.

This does not fix all cases (especially not non-power-of-two cases);
for that, we will probably need to be able to fall back to fp32
when we detect fp16 doesn't work well.
---

diff --git a/blur_effect.cpp b/blur_effect.cpp
index 027d25b..6e5e843 100644
--- a/blur_effect.cpp
+++ b/blur_effect.cpp
@@ -187,18 +187,21 @@ void SingleBlurPassEffect::set_gl_state(GLuint glsl_program_num, const string &p
 		unsigned base_pos = i * 2 - 1;
 		float w1 = weight[base_pos];
 		float w2 = weight[base_pos + 1];
-
-		float offset, total_weight;
-		combine_two_samples(w1, w2, &offset, &total_weight, NULL);
-
+		int size;
 		if (direction == HORIZONTAL) {
-			samples[2 * i + 0] = (base_pos + offset) / (float)width;
+			size = width;
 		} else if (direction == VERTICAL) {
-			samples[2 * i + 0] = (base_pos + offset) / (float)height;
+			size = height;
 		} else {
 			assert(false);
 		}
 
+		float pos1 = base_pos / (float)size;
+		float pos2 = (base_pos + 1) / (float)size;
+		float pos, total_weight;
+		combine_two_samples(w1, w2, pos1, pos2, size, COMBINE_DO_NOT_ROUND, &pos, &total_weight, NULL);
+
+		samples[2 * i + 0] = pos;
 		samples[2 * i + 1] = total_weight;
 	}
 
diff --git a/resample_effect.cpp b/resample_effect.cpp
index 20c2f9b..3ccb2fd 100644
--- a/resample_effect.cpp
+++ b/resample_effect.cpp
@@ -55,7 +55,7 @@ unsigned gcd(unsigned a, unsigned b)
 	return a;
 }
 
-unsigned combine_samples(Tap<float> *src, Tap<float> *dst, unsigned num_src_samples, unsigned max_samples_saved)
+unsigned combine_samples(Tap<float> *src, Tap<float> *dst, unsigned src_size, unsigned num_src_samples, unsigned max_samples_saved)
 {
 	unsigned num_samples_saved = 0;
 	for (unsigned i = 0, j = 0; i < num_src_samples; ++i, ++j) {
@@ -85,8 +85,8 @@ unsigned combine_samples(Tap<float> *src, Tap<float> *dst, unsigned num_src_samp
 		float pos2 = src[i + 1].pos;
 		assert(pos2 > pos1);
 
-		float offset, total_weight, sum_sq_error;
-		combine_two_samples(w1, w2, &offset, &total_weight, &sum_sq_error);
+		float pos, total_weight, sum_sq_error;
+		combine_two_samples(w1, w2, pos1, pos2, src_size, COMBINE_ROUND_TO_FP16, &pos, &total_weight, &sum_sq_error);
 
 		// If the interpolation error is larger than that of about sqrt(2) of
 		// a level at 8-bit precision, don't combine. (You'd think 1.0 was enough,
@@ -100,7 +100,7 @@ unsigned combine_samples(Tap<float> *src, Tap<float> *dst, unsigned num_src_samp
 		// OK, we can combine this and the next sample.
 		if (dst != NULL) {
 			dst[j].weight = total_weight;
-			dst[j].pos = pos1 + offset * (pos2 - pos1);
+			dst[j].pos = pos;
 		}
 
 		++i;  // Skip the next sample.
@@ -403,7 +403,7 @@ void SingleResamplePassEffect::update_texture(GLuint glsl_program_num, const str
 	// The greedy strategy for combining samples is optimal.
 	src_bilinear_samples = 0;
 	for (unsigned y = 0; y < dst_samples; ++y) {
-		unsigned num_samples_saved = combine_samples(weights + y * src_samples, NULL, src_samples, UINT_MAX);
+		unsigned num_samples_saved = combine_samples(weights + y * src_samples, NULL, src_size, src_samples, UINT_MAX);
 		src_bilinear_samples = max<int>(src_bilinear_samples, src_samples - num_samples_saved);
 	}
 
@@ -416,6 +416,7 @@ void SingleResamplePassEffect::update_texture(GLuint glsl_program_num, const str
 		unsigned num_samples_saved = combine_samples(
 			weights + y * src_samples,
 			bilinear_weights_ptr,
+			src_size,
 			src_samples,
 			src_samples - src_bilinear_samples);
 		assert(int(src_samples) - int(num_samples_saved) == src_bilinear_samples);
diff --git a/resample_effect_test.cpp b/resample_effect_test.cpp
index e90f19e..9082b26 100644
--- a/resample_effect_test.cpp
+++ b/resample_effect_test.cpp
@@ -96,11 +96,11 @@ TEST(ResampleEffectTest, DownscaleByTwoGetsCorrectPixelCenters) {
 	// the texel center right (everything is nicely symmetric).
 	// The approximate magnitudes have been checked against ImageMagick.
 	float expected_data[size * size] = {
-		 0.0045, -0.0067, -0.0598, -0.0067,  0.0045, 
-		-0.0067,  0.0099,  0.0886,  0.0099, -0.0067, 
-		-0.0598,  0.0886,  0.7930,  0.0886, -0.0598, 
-		-0.0067,  0.0099,  0.0886,  0.0099, -0.0067, 
-		 0.0045, -0.0067, -0.0598, -0.0067,  0.0045, 
+		 0.0046, -0.0068, -0.0611, -0.0068,  0.0047,
+		-0.0068,  0.0100,  0.0895,  0.0100, -0.0068,
+		-0.0603,  0.0892,  0.7993,  0.0895, -0.0611,
+		-0.0067,  0.0100,  0.0892,  0.0100, -0.0068,
+		 0.0045, -0.0067, -0.0603, -0.0068,  0.0046,
 	};
 	float data[size * size * 4], out_data[size * size];
 
@@ -401,4 +401,28 @@ TEST(ResampleEffectTest, VerticalZoomFromTop) {
 	expect_equal(expected_data, out_data, width, height);
 }
 
+TEST(ResampleEffectTest, Precision) {
+	const int size = 2048;
+	const int offset = 5;
+
+	// Deliberately put the data of interest very close to the right,
+	// where texture coordinates are farther from 0 and thus less precise.
+	float data[size] = {0};
+	data[size - offset] = 1.0f;
+	float expected_data[size * 2] = {0};
+	for (int x = 0; x < size * 2; ++x) {
+		expected_data[x] = lanczos((x - (size - 2 * offset + 1) + 0.5f) * 0.5f, 3.0f);
+	}
+	float out_data[size * 2];
+
+	EffectChainTester tester(data, size * 2, 1, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+	Effect *resample_effect = tester.get_chain()->add_effect(new ResampleEffect());
+	ASSERT_TRUE(resample_effect->set_int("width", size * 2));
+	ASSERT_TRUE(resample_effect->set_int("height", 1));
+	ASSERT_TRUE(resample_effect->set_float("zoom_x", 2.0f));
+	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR);
+
+	expect_equal(expected_data, out_data, size, 1);
+}
+
 }  // namespace movit
diff --git a/util.cpp b/util.cpp
index 579efb3..61f3e4d 100644
--- a/util.cpp
+++ b/util.cpp
@@ -6,6 +6,7 @@
 #include <string.h>
 #include <Eigen/Core>
 
+#include "fp16.h"
 #include "init.h"
 #include "util.h"
 
@@ -157,21 +158,33 @@ string output_glsl_mat3(const string &name, const Eigen::Matrix3d &m)
 	return buf;
 }
 
-void combine_two_samples(float w1, float w2, float *offset, float *total_weight, float *sum_sq_error)
+void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned size, CombineRoundingBehavior rounding_behavior,
+                         float *offset, float *total_weight, float *sum_sq_error)
 {
 	assert(movit_initialized);
 	assert(w1 * w2 >= 0.0f);  // Should not have differing signs.
-	float z;  // Just a shorter name for offset.
+	float z;  // Normalized 0..1 between pos1 and pos2.
 	if (fabs(w1 + w2) < 1e-6) {
 		z = 0.5f;
 	} else {
 		z = w2 / (w1 + w2);
 	}
 
+	*offset = pos1 + z * (pos2 - pos1);
+	if (rounding_behavior == COMBINE_ROUND_TO_FP16) {	
+		// Round to fp16. Note that this might take z outside the 0..1 range.
+		*offset = fp16_to_fp64(fp64_to_fp16(*offset));
+		z = (z - pos1) / (pos2 - pos1);
+	} else {
+		assert(rounding_behavior == COMBINE_DO_NOT_ROUND);
+	}
+
 	// Round to the minimum number of bits we have measured earlier.
 	// The card will do this for us anyway, but if we know what the real z
 	// is, we can pick a better total_weight below.
-	z = lrintf(z / movit_texel_subpixel_precision) * movit_texel_subpixel_precision;
+	z *= size;  // Move to pixel coordinates,
+	z = lrintf(z / movit_texel_subpixel_precision) * movit_texel_subpixel_precision;  // Round.
+	z /= size;  // Move back to normalized coordinates.
 	
 	// Choose total weight w so that we minimize total squared error
 	// for the effective weights:
@@ -187,16 +200,12 @@ void combine_two_samples(float w1, float w2, float *offset, float *total_weight,
 	//
 	// If z had infinite precision, this would simply reduce to w = w1 + w2.
 	*total_weight = (w1 * (1 - z) + w2 * z) / (z * z + (1 - z) * (1 - z));
-	*offset = z;
 
 	if (sum_sq_error != NULL) {
 		float err1 = *total_weight * (1 - z) - w1;
 		float err2 = *total_weight * z - w2;
 		*sum_sq_error = err1 * err1 + err2 * err2;
 	}
-
-	assert(*offset >= 0.0f);
-	assert(*offset <= 1.0f);
 }
 
 GLuint fill_vertex_attribute(GLuint glsl_program_num, const string &attribute_name, GLint size, GLenum type, GLsizeiptr data_size, const GLvoid *data)
diff --git a/util.h b/util.h
index 1fa4e78..a89d3a2 100644
--- a/util.h
+++ b/util.h
@@ -41,14 +41,29 @@ std::string output_glsl_mat3(const std::string &name, const Eigen::Matrix3d &m);
 // Calculate a / b, rounding up. Does not handle overflow correctly.
 unsigned div_round_up(unsigned a, unsigned b);
 
+enum CombineRoundingBehavior {
+	COMBINE_DO_NOT_ROUND = 0,
+	COMBINE_ROUND_TO_FP16 = 1,
+};
+
 // Calculate where to sample, and with what weight, if one wants to use
-// the GPU's bilinear hardware to sample w1 * x[0] + w2 * x[1].
+// the GPU's bilinear hardware to sample w1 * x[pos1] + w2 * x[pos2],
+// where pos1 and pos2 must be normalized coordinates describing neighboring
+// pixels in the mipmap level at which you sample, and the total number of
+// pixels (in given mipmap level) is <size>.
 //
 // Note that since the GPU might have limited precision in its linear
 // interpolation, the effective weights might be different from the ones you
 // asked for. sum_sq_error, if not NULL, will contain the sum of the
 // (estimated) squared errors of the two weights.
-void combine_two_samples(float w1, float w2, float *offset, float *total_weight, float *sum_sq_error);
+//
+// The answer, in "offset", comes as a normalized coordinate,
+// so if e.g. w2 = 0, you have simply offset = pos1. If <rounding_behavior>
+// is COMBINE_ROUND_TO_FP16, the coordinate is assumed to be stored as a
+// rounded fp16 value. This enables more precise calculation of total_weight
+// and sum_sq_error.
+void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned size, CombineRoundingBehavior rounding_behavior,
+                         float *offset, float *total_weight, float *sum_sq_error);
 
 // Create a VBO with the given data, and bind it to the vertex attribute
 // with name <attribute_name>. Returns the VBO number.