Fix a bug where combined fp16 weights would be horribly wrong.
authorSteinar H. Gunderson <sgunderson@bigfoot.com>
Wed, 23 Sep 2015 23:59:47 +0000 (01:59 +0200)
committerSteinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 24 Sep 2015 00:04:33 +0000 (02:04 +0200)
Seemingly weights were always returned as float, and then cast
to fp16_int_t -- without proper conversion! And sum_sq_error
would be calculated based on the correct value, not the broken-
casted one.

It's a small miracle the unit tests didn't catch this; they didn't
until I started introducing small errors for another reason.
Most real-world testing seems to have hit fp32, and thus this
wasn't caught there either.

Also make fp16_int_t a struct so that it is not implicitly
convertible to/from numeric types, so this never ever can happen again.

fp16.cpp
fp16.h
fp16_test.cpp
resample_effect.cpp
util.cpp

index fc5800e..e8993f9 100644 (file)
--- a/fp16.cpp
+++ b/fp16.cpp
@@ -13,9 +13,9 @@ template<class FP16_INT_T,
          int FP64_BIAS, int FP64_MANTISSA_BITS, int FP64_EXPONENT_BITS, int FP64_MAX_EXPONENT>
 inline double fp_upconvert(FP16_INT_T x)
 {
-       int sign = x >> (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS);
-       int exponent = (x & ((1ULL << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS)) - 1)) >> FP16_MANTISSA_BITS;
-       unsigned long long mantissa = x & ((1ULL << FP16_MANTISSA_BITS) - 1);
+       int sign = x.val >> (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS);
+       int exponent = (x.val & ((1ULL << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS)) - 1)) >> FP16_MANTISSA_BITS;
+       unsigned long long mantissa = x.val & ((1ULL << FP16_MANTISSA_BITS) - 1);
 
        int sign64;
        int exponent64;
@@ -187,9 +187,11 @@ inline FP16_INT_T fp_downconvert(double x)
                }
        }
 
-       return (sign16 << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS))
+       FP16_INT_T ret;
+       ret.val = (sign16 << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS))
            | (exponent16 << FP16_MANTISSA_BITS)
            | mantissa16;
+       return ret;
 }
 
 const int FP64_BIAS = 1023;
diff --git a/fp16.h b/fp16.h
index 5417e02..c21153b 100644 (file)
--- a/fp16.h
+++ b/fp16.h
 
 namespace movit {
 
-typedef unsigned int fp32_int_t;
-typedef unsigned short fp16_int_t;
+// structs instead of ints, so that they are not implicitly convertible.
+struct fp32_int_t {
+       unsigned int val;
+};
+struct fp16_int_t {
+       unsigned short val;
+};
 
 #ifdef __F16C__
 
@@ -23,14 +28,16 @@ typedef unsigned short fp16_int_t;
 // are at compile time).
 static inline double fp16_to_fp64(fp16_int_t x)
 {
-       return _cvtsh_ss(x);
+       return _cvtsh_ss(x.val);
 }
 
 static inline fp16_int_t fp64_to_fp16(double x)
 {
        // NOTE: Strictly speaking, there are some select values where this isn't correct,
        // since we first round to fp32 and then to fp16.
-       return _cvtss_sh(x, 0);
+       fp16_int_t ret;
+       ret.val = _cvtss_sh(x, 0);
+       return ret;
 }
 
 #else
index bb8b182..e0920e9 100644 (file)
@@ -4,31 +4,48 @@
 #include <gtest/gtest.h>
 
 namespace movit {
+namespace {
+
+fp16_int_t make_fp16(unsigned short x)
+{
+       fp16_int_t ret;
+       ret.val = x;
+       return ret;
+}
+
+fp32_int_t make_fp32(unsigned int x)
+{
+       fp32_int_t ret;
+       ret.val = x;
+       return ret;
+}
+
+}  // namespace
 
 TEST(FP16Test, Simple) {
-       EXPECT_EQ(0x0000, fp64_to_fp16(0.0));
-       EXPECT_DOUBLE_EQ(0.0, fp16_to_fp64(0x0000));
+       EXPECT_EQ(0x0000, fp64_to_fp16(0.0).val);
+       EXPECT_DOUBLE_EQ(0.0, fp16_to_fp64(make_fp16(0x0000)));
 
-       EXPECT_EQ(0x3c00, fp64_to_fp16(1.0));
-       EXPECT_DOUBLE_EQ(1.0, fp16_to_fp64(0x3c00));
+       EXPECT_EQ(0x3c00, fp64_to_fp16(1.0).val);
+       EXPECT_DOUBLE_EQ(1.0, fp16_to_fp64(make_fp16(0x3c00)));
 
-       EXPECT_EQ(0x3555, fp64_to_fp16(1.0 / 3.0));
-       EXPECT_DOUBLE_EQ(0.333251953125, fp16_to_fp64(0x3555));
+       EXPECT_EQ(0x3555, fp64_to_fp16(1.0 / 3.0).val);
+       EXPECT_DOUBLE_EQ(0.333251953125, fp16_to_fp64(make_fp16(0x3555)));
 }
 
 TEST(FP16Test, RoundToNearestEven) {
-       ASSERT_DOUBLE_EQ(1.0, fp16_to_fp64(0x3c00));
-
-       double x0 = fp16_to_fp64(0x3c00);
-       double x1 = fp16_to_fp64(0x3c01);
-       double x2 = fp16_to_fp64(0x3c02);
-       double x3 = fp16_to_fp64(0x3c03);
-       double x4 = fp16_to_fp64(0x3c04);
-
-       EXPECT_EQ(0x3c00, fp64_to_fp16(0.5 * (x0 + x1)));
-       EXPECT_EQ(0x3c02, fp64_to_fp16(0.5 * (x1 + x2)));
-       EXPECT_EQ(0x3c02, fp64_to_fp16(0.5 * (x2 + x3)));
-       EXPECT_EQ(0x3c04, fp64_to_fp16(0.5 * (x3 + x4)));
+       ASSERT_DOUBLE_EQ(1.0, fp16_to_fp64(make_fp16(0x3c00)));
+
+       double x0 = fp16_to_fp64(make_fp16(0x3c00));
+       double x1 = fp16_to_fp64(make_fp16(0x3c01));
+       double x2 = fp16_to_fp64(make_fp16(0x3c02));
+       double x3 = fp16_to_fp64(make_fp16(0x3c03));
+       double x4 = fp16_to_fp64(make_fp16(0x3c04));
+
+       EXPECT_EQ(0x3c00, fp64_to_fp16(0.5 * (x0 + x1)).val);
+       EXPECT_EQ(0x3c02, fp64_to_fp16(0.5 * (x1 + x2)).val);
+       EXPECT_EQ(0x3c02, fp64_to_fp16(0.5 * (x2 + x3)).val);
+       EXPECT_EQ(0x3c04, fp64_to_fp16(0.5 * (x3 + x4)).val);
 }
 
 union fp64 {
@@ -42,8 +59,8 @@ union fp32 {
 
 TEST(FP16Test, NaN) {
        // Ignore the sign bit.
-       EXPECT_EQ(0x7e00, fp64_to_fp16(0.0 / 0.0) & 0x7fff);
-       EXPECT_TRUE(isnan(fp16_to_fp64(0xfe00)));
+       EXPECT_EQ(0x7e00, fp64_to_fp16(0.0 / 0.0).val & 0x7fff);
+       EXPECT_TRUE(isnan(fp16_to_fp64(make_fp16(0xfe00))));
 
        fp64 borderline_inf;
        borderline_inf.ll = 0x7ff0000000000000ull;
@@ -68,15 +85,15 @@ TEST(FP16Test, NaN) {
 
 TEST(FP16Test, Denormals) {
        const double smallest_fp16_denormal = 5.9604644775390625e-08;
-       EXPECT_EQ(0x0001, fp64_to_fp16(smallest_fp16_denormal));
-       EXPECT_EQ(0x0000, fp64_to_fp16(0.5 * smallest_fp16_denormal));  // Round-to-even.
-       EXPECT_EQ(0x0001, fp64_to_fp16(0.51 * smallest_fp16_denormal));
-       EXPECT_EQ(0x0002, fp64_to_fp16(1.5 * smallest_fp16_denormal));
+       EXPECT_EQ(0x0001, fp64_to_fp16(smallest_fp16_denormal).val);
+       EXPECT_EQ(0x0000, fp64_to_fp16(0.5 * smallest_fp16_denormal).val);  // Round-to-even.
+       EXPECT_EQ(0x0001, fp64_to_fp16(0.51 * smallest_fp16_denormal).val);
+       EXPECT_EQ(0x0002, fp64_to_fp16(1.5 * smallest_fp16_denormal).val);
 
        const double smallest_fp16_non_denormal = 6.103515625e-05;
-       EXPECT_EQ(0x0400, fp64_to_fp16(smallest_fp16_non_denormal));
-       EXPECT_EQ(0x0400, fp64_to_fp16(smallest_fp16_non_denormal - 0.5 * smallest_fp16_denormal));  // Round-to-even.
-       EXPECT_EQ(0x03ff, fp64_to_fp16(smallest_fp16_non_denormal - smallest_fp16_denormal));
+       EXPECT_EQ(0x0400, fp64_to_fp16(smallest_fp16_non_denormal).val);
+       EXPECT_EQ(0x0400, fp64_to_fp16(smallest_fp16_non_denormal - 0.5 * smallest_fp16_denormal).val);  // Round-to-even.
+       EXPECT_EQ(0x03ff, fp64_to_fp16(smallest_fp16_non_denormal - smallest_fp16_denormal).val);
 }
 
 // Randomly test a large number of fp64 -> fp32 conversions, comparing
@@ -93,7 +110,7 @@ TEST(FP16Test, FP32ReferenceDownconvert) {
 
                src.ll = (((unsigned long long)r1) << 33) ^ ((unsigned long long)r2 << 16) ^ r3;
                reference.f = float(src.f);
-               result.u = fp64_to_fp32(src.f);
+               result.u = fp64_to_fp32(src.f).val;
 
                EXPECT_EQ(isnan(result.f), isnan(reference.f));
                if (!isnan(result.f)) {
@@ -116,7 +133,7 @@ TEST(FP16Test, FP32ReferenceUpconvert) {
 
                src.u = ((unsigned long long)r1 << 16) ^ r2;
                reference.f = double(src.f);
-               result.f = fp32_to_fp64(src.u);
+               result.f = fp32_to_fp64(make_fp32(src.u));
 
                EXPECT_EQ(isnan(result.f), isnan(reference.f));
                if (!isnan(result.f)) {
index 156098e..244a3e2 100644 (file)
@@ -107,7 +107,7 @@ unsigned combine_samples(const Tap<float> *src, Tap<DestFloat> *dst, float num_s
                float pos2 = src[i + 1].pos;
                assert(pos2 > pos1);
 
-               fp16_int_t pos, total_weight;
+               DestFloat pos, total_weight;
                float sum_sq_error;
                combine_two_samples(w1, w2, pos1, pos2, num_subtexels, inv_num_subtexels, &pos, &total_weight, &sum_sq_error);
 
index da6057e..59f1dcd 100644 (file)
--- a/util.cpp
+++ b/util.cpp
@@ -253,11 +253,11 @@ void combine_two_samples(float w1, float w2, float pos1, float pos2, float num_s
        //   w = (a(1-z) + bz) / ((1-z)² + z²)
        //
        // If z had infinite precision, this would simply reduce to w = w1 + w2.
-       *total_weight = (w1 + z * (w2 - w1)) / (z * z + (1 - z) * (1 - z));
+       *total_weight = from_fp64<DestFloat>((w1 + z * (w2 - w1)) / (z * z + (1 - z) * (1 - z)));
 
        if (sum_sq_error != NULL) {
-               float err1 = *total_weight * (1 - z) - w1;
-               float err2 = *total_weight * z - w2;
+               float err1 = to_fp64(*total_weight) * (1 - z) - w1;
+               float err2 = to_fp64(*total_weight) * z - w2;
                *sum_sq_error = err1 * err1 + err2 * err2;
        }
 }