From f62661c1bef3acac7b98631970700dfa5ac94768 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Sun, 9 Mar 2014 18:22:18 +0100
Subject: [PATCH] Do our own fp16 conversion in ResampleEffect.

This not only fixes issues with poor downconversion on ATI, but also
allows us to normalize while being aware of fp16 roundoff issues.
This seems to roughly halve the error in the HeavyResampleGetsSumRight
test, which as far as I can see would take us up to 10-bit accuracy.
---
 resample_effect.cpp      | 26 +++++++++++++++++++-------
 resample_effect_test.cpp | 10 ++++------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/resample_effect.cpp b/resample_effect.cpp
index dbc5788..4af60de 100644
--- a/resample_effect.cpp
+++ b/resample_effect.cpp
@@ -10,6 +10,7 @@
 
 #include "effect_chain.h"
 #include "effect_util.h"
+#include "fp16.h"
 #include "resample_effect.h"
 #include "util.h"
 
@@ -142,7 +143,7 @@ void ResampleEffect::inform_input_size(unsigned input_num, unsigned width, unsig
 	input_height = height;
 	update_size();
 }
-		
+
 void ResampleEffect::update_size()
 {
 	bool ok = true;
@@ -325,26 +326,36 @@ void SingleResamplePassEffect::update_texture(GLuint glsl_program_num, const str
 
 	// Now that we know the right width, actually combine the samples.
 	float *bilinear_weights = new float[dst_samples * src_bilinear_samples * 2];
+	fp16_int_t *bilinear_weights_fp16 = new fp16_int_t[dst_samples * src_bilinear_samples * 2];
 	for (unsigned y = 0; y < dst_samples; ++y) {
+		float *bilinear_weights_ptr = bilinear_weights + (y * src_bilinear_samples) * 2;
+		fp16_int_t *bilinear_weights_fp16_ptr = bilinear_weights_fp16 + (y * src_bilinear_samples) * 2;
 		unsigned num_samples_saved = combine_samples(
 			weights + (y * src_samples) * 2,
-			bilinear_weights + (y * src_bilinear_samples) * 2,
+			bilinear_weights_ptr,
 			src_samples,
 			src_samples - src_bilinear_samples);
 		assert(int(src_samples) - int(num_samples_saved) == src_bilinear_samples);
 
+		// Convert to fp16.
+		for (int i = 0; i < src_bilinear_samples; ++i) {
+			bilinear_weights_fp16_ptr[i * 2 + 0] = fp64_to_fp16(bilinear_weights_ptr[i * 2 + 0]);
+			bilinear_weights_fp16_ptr[i * 2 + 1] = fp64_to_fp16(bilinear_weights_ptr[i * 2 + 1]);
+		}
+
 		// Normalize so that the sum becomes one. Note that we do it twice;
 		// this sometimes helps a tiny little bit when we have many samples.
 		for (int normalize_pass = 0; normalize_pass < 2; ++normalize_pass) {
-			float sum = 0.0;
+			double sum = 0.0;
 			for (int i = 0; i < src_bilinear_samples; ++i) {
-				sum += bilinear_weights[(y * src_bilinear_samples + i) * 2 + 0];
+				sum += fp16_to_fp64(bilinear_weights_fp16_ptr[i * 2 + 0]);
 			}
 			for (int i = 0; i < src_bilinear_samples; ++i) {
-				bilinear_weights[(y * src_bilinear_samples + i) * 2 + 0] /= sum;
+				bilinear_weights_fp16_ptr[i * 2 + 0] = fp64_to_fp16(
+					fp16_to_fp64(bilinear_weights_fp16_ptr[i * 2 + 0]) / sum);
 			}
 		}
-	}	
+	}
 
 	// Encode as a two-component texture. Note the GL_REPEAT.
 	glActiveTexture(GL_TEXTURE0 + *sampler_num);
@@ -357,11 +368,12 @@ void SingleResamplePassEffect::update_texture(GLuint glsl_program_num, const str
 	check_error();
 	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
 	check_error();
-	glTexImage2D(GL_TEXTURE_2D, 0, GL_RG16F, src_bilinear_samples, dst_samples, 0, GL_RG, GL_FLOAT, bilinear_weights);
+	glTexImage2D(GL_TEXTURE_2D, 0, GL_RG16F, src_bilinear_samples, dst_samples, 0, GL_RG, GL_HALF_FLOAT, bilinear_weights_fp16);
 	check_error();
 
 	delete[] weights;
 	delete[] bilinear_weights;
+	delete[] bilinear_weights_fp16;
 }
 
 void SingleResamplePassEffect::set_gl_state(GLuint glsl_program_num, const string &prefix, unsigned *sampler_num)
diff --git a/resample_effect_test.cpp b/resample_effect_test.cpp
index 971e694..211e6d1 100644
--- a/resample_effect_test.cpp
+++ b/resample_effect_test.cpp
@@ -203,12 +203,10 @@ TEST(ResampleEffectTest, HeavyResampleGetsSumRight) {
 	ASSERT_TRUE(resample_effect->set_int("height", dheight));
 	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR);
 
-	// Require that we are within 10-bit accuracy. Note that this is for
-	// one pass only; some cards that don't have correct fp32 -> fp16
-	// rounding in the intermediate framebuffers will go outside this after
-	// a 2D resize. This limit is tight enough that it will be good enough
-	// for 8-bit accuracy, though.
-	expect_equal(expected_data, out_data, dwidth, dheight, 0.5 / 1023.0);
+	// Require that we are within 10-bit accuracy. Note that this limit is for
+	// one pass only, but the limit is tight enough that it should be good enough
+	// for 10-bit accuracy even after two passes.
+	expect_equal(expected_data, out_data, dwidth, dheight, 0.1 / 1023.0);
 }
 
 }  // namespace movit
-- 
2.39.2