Fix a bug where combined fp16 weights would be horribly wrong.

author Steinar H. Gunderson <sgunderson@bigfoot.com>
Wed, 23 Sep 2015 23:59:47 +0000 (01:59 +0200)

committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 24 Sep 2015 00:04:33 +0000 (02:04 +0200)

diff --git a/fp16.cpp b/fp16.cpp

index fc5800e4a964855c0cfb00efa29c0c7d96d94a31..e8993f9eb153f508516dec4cbd6a2642965d9597 100644 (file)
--- a/fp16.cpp
+++ b/fp16.cpp
@@ -13,9 +13,9 @@ template<class FP16_INT_T,
           int FP64_BIAS, int FP64_MANTISSA_BITS, int FP64_EXPONENT_BITS, int FP64_MAX_EXPONENT>
  inline double fp_upconvert(FP16_INT_T x)
  {
-       int sign = x >> (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS);
-       int exponent = (x & ((1ULL << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS)) - 1)) >> FP16_MANTISSA_BITS;
-       unsigned long long mantissa = x & ((1ULL << FP16_MANTISSA_BITS) - 1);
+       int sign = x.val >> (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS);
+       int exponent = (x.val & ((1ULL << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS)) - 1)) >> FP16_MANTISSA_BITS;
+       unsigned long long mantissa = x.val & ((1ULL << FP16_MANTISSA_BITS) - 1);
  
         int sign64;
         int exponent64;
@@ -187,9 +187,11 @@ inline FP16_INT_T fp_downconvert(double x)
                 }
         }
  
-       return (sign16 << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS))
+       FP16_INT_T ret;
+       ret.val = (sign16 << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS))
             | (exponent16 << FP16_MANTISSA_BITS)
             | mantissa16;
+       return ret;
  }
  
  const int FP64_BIAS = 1023;
diff --git a/fp16.h b/fp16.h

index 5417e020b82cdfc039e78a1124b3c98833604e77..c21153b3466fe71424283063cb7cf9b775a89153 100644 (file)
--- a/fp16.h
+++ b/fp16.h
@@ -14,8 +14,13 @@
  
  namespace movit {
  
-typedef unsigned int fp32_int_t;
-typedef unsigned short fp16_int_t;
+// structs instead of ints, so that they are not implicitly convertible.
+struct fp32_int_t {
+       unsigned int val;
+};
+struct fp16_int_t {
+       unsigned short val;
+};
  
  #ifdef __F16C__
  
@@ -23,14 +28,16 @@ typedef unsigned short fp16_int_t;
  // are at compile time).
  static inline double fp16_to_fp64(fp16_int_t x)
  {
-       return _cvtsh_ss(x);
+       return _cvtsh_ss(x.val);
  }
  
  static inline fp16_int_t fp64_to_fp16(double x)
  {
         // NOTE: Strictly speaking, there are some select values where this isn't correct,
         // since we first round to fp32 and then to fp16.
-       return _cvtss_sh(x, 0);
+       fp16_int_t ret;
+       ret.val = _cvtss_sh(x, 0);
+       return ret;
  }
  
  #else
diff --git a/fp16_test.cpp b/fp16_test.cpp

index bb8b18280d1f487889680d307286c476403cb5f8..e0920e9ce60bf499bb636ac4c7ab8eac9b2b1b5c 100644 (file)
--- a/fp16_test.cpp
+++ b/fp16_test.cpp
@@ -4,31 +4,48 @@
  #include <gtest/gtest.h>
  
  namespace movit {
+namespace {
+
+fp16_int_t make_fp16(unsigned short x)
+{
+       fp16_int_t ret;
+       ret.val = x;
+       return ret;
+}
+
+fp32_int_t make_fp32(unsigned int x)
+{
+       fp32_int_t ret;
+       ret.val = x;
+       return ret;
+}
+
+}  // namespace
  
  TEST(FP16Test, Simple) {
-       EXPECT_EQ(0x0000, fp64_to_fp16(0.0));
-       EXPECT_DOUBLE_EQ(0.0, fp16_to_fp64(0x0000));
+       EXPECT_EQ(0x0000, fp64_to_fp16(0.0).val);
+       EXPECT_DOUBLE_EQ(0.0, fp16_to_fp64(make_fp16(0x0000)));
  
-       EXPECT_EQ(0x3c00, fp64_to_fp16(1.0));
-       EXPECT_DOUBLE_EQ(1.0, fp16_to_fp64(0x3c00));
+       EXPECT_EQ(0x3c00, fp64_to_fp16(1.0).val);
+       EXPECT_DOUBLE_EQ(1.0, fp16_to_fp64(make_fp16(0x3c00)));
  
-       EXPECT_EQ(0x3555, fp64_to_fp16(1.0 / 3.0));
-       EXPECT_DOUBLE_EQ(0.333251953125, fp16_to_fp64(0x3555));
+       EXPECT_EQ(0x3555, fp64_to_fp16(1.0 / 3.0).val);
+       EXPECT_DOUBLE_EQ(0.333251953125, fp16_to_fp64(make_fp16(0x3555)));
  }
  
  TEST(FP16Test, RoundToNearestEven) {
-       ASSERT_DOUBLE_EQ(1.0, fp16_to_fp64(0x3c00));
-
-       double x0 = fp16_to_fp64(0x3c00);
-       double x1 = fp16_to_fp64(0x3c01);
-       double x2 = fp16_to_fp64(0x3c02);
-       double x3 = fp16_to_fp64(0x3c03);
-       double x4 = fp16_to_fp64(0x3c04);
-
-       EXPECT_EQ(0x3c00, fp64_to_fp16(0.5 * (x0 + x1)));
-       EXPECT_EQ(0x3c02, fp64_to_fp16(0.5 * (x1 + x2)));
-       EXPECT_EQ(0x3c02, fp64_to_fp16(0.5 * (x2 + x3)));
-       EXPECT_EQ(0x3c04, fp64_to_fp16(0.5 * (x3 + x4)));
+       ASSERT_DOUBLE_EQ(1.0, fp16_to_fp64(make_fp16(0x3c00)));
+
+       double x0 = fp16_to_fp64(make_fp16(0x3c00));
+       double x1 = fp16_to_fp64(make_fp16(0x3c01));
+       double x2 = fp16_to_fp64(make_fp16(0x3c02));
+       double x3 = fp16_to_fp64(make_fp16(0x3c03));
+       double x4 = fp16_to_fp64(make_fp16(0x3c04));
+
+       EXPECT_EQ(0x3c00, fp64_to_fp16(0.5 * (x0 + x1)).val);
+       EXPECT_EQ(0x3c02, fp64_to_fp16(0.5 * (x1 + x2)).val);
+       EXPECT_EQ(0x3c02, fp64_to_fp16(0.5 * (x2 + x3)).val);
+       EXPECT_EQ(0x3c04, fp64_to_fp16(0.5 * (x3 + x4)).val);
  }
  
  union fp64 {
@@ -42,8 +59,8 @@ union fp32 {
  
  TEST(FP16Test, NaN) {
         // Ignore the sign bit.
-       EXPECT_EQ(0x7e00, fp64_to_fp16(0.0 / 0.0) & 0x7fff);
-       EXPECT_TRUE(isnan(fp16_to_fp64(0xfe00)));
+       EXPECT_EQ(0x7e00, fp64_to_fp16(0.0 / 0.0).val & 0x7fff);
+       EXPECT_TRUE(isnan(fp16_to_fp64(make_fp16(0xfe00))));
  
         fp64 borderline_inf;
         borderline_inf.ll = 0x7ff0000000000000ull;
@@ -68,15 +85,15 @@ TEST(FP16Test, NaN) {
  
  TEST(FP16Test, Denormals) {
         const double smallest_fp16_denormal = 5.9604644775390625e-08;
-       EXPECT_EQ(0x0001, fp64_to_fp16(smallest_fp16_denormal));
-       EXPECT_EQ(0x0000, fp64_to_fp16(0.5 * smallest_fp16_denormal));  // Round-to-even.
-       EXPECT_EQ(0x0001, fp64_to_fp16(0.51 * smallest_fp16_denormal));
-       EXPECT_EQ(0x0002, fp64_to_fp16(1.5 * smallest_fp16_denormal));
+       EXPECT_EQ(0x0001, fp64_to_fp16(smallest_fp16_denormal).val);
+       EXPECT_EQ(0x0000, fp64_to_fp16(0.5 * smallest_fp16_denormal).val);  // Round-to-even.
+       EXPECT_EQ(0x0001, fp64_to_fp16(0.51 * smallest_fp16_denormal).val);
+       EXPECT_EQ(0x0002, fp64_to_fp16(1.5 * smallest_fp16_denormal).val);
  
         const double smallest_fp16_non_denormal = 6.103515625e-05;
-       EXPECT_EQ(0x0400, fp64_to_fp16(smallest_fp16_non_denormal));
-       EXPECT_EQ(0x0400, fp64_to_fp16(smallest_fp16_non_denormal - 0.5 * smallest_fp16_denormal));  // Round-to-even.
-       EXPECT_EQ(0x03ff, fp64_to_fp16(smallest_fp16_non_denormal - smallest_fp16_denormal));
+       EXPECT_EQ(0x0400, fp64_to_fp16(smallest_fp16_non_denormal).val);
+       EXPECT_EQ(0x0400, fp64_to_fp16(smallest_fp16_non_denormal - 0.5 * smallest_fp16_denormal).val);  // Round-to-even.
+       EXPECT_EQ(0x03ff, fp64_to_fp16(smallest_fp16_non_denormal - smallest_fp16_denormal).val);
  }
  
  // Randomly test a large number of fp64 -> fp32 conversions, comparing
@@ -93,7 +110,7 @@ TEST(FP16Test, FP32ReferenceDownconvert) {
  
                 src.ll = (((unsigned long long)r1) << 33) ^ ((unsigned long long)r2 << 16) ^ r3;
                 reference.f = float(src.f);
-               result.u = fp64_to_fp32(src.f);
+               result.u = fp64_to_fp32(src.f).val;
  
                 EXPECT_EQ(isnan(result.f), isnan(reference.f));
                 if (!isnan(result.f)) {
@@ -116,7 +133,7 @@ TEST(FP16Test, FP32ReferenceUpconvert) {
  
                 src.u = ((unsigned long long)r1 << 16) ^ r2;
                 reference.f = double(src.f);
-               result.f = fp32_to_fp64(src.u);
+               result.f = fp32_to_fp64(make_fp32(src.u));
  
                 EXPECT_EQ(isnan(result.f), isnan(reference.f));
                 if (!isnan(result.f)) {
diff --git a/resample_effect.cpp b/resample_effect.cpp

index 156098e993dbac47c2132c46dc7d1ed9348bb9c9..244a3e2a6081187f19962b531c7fa86dde46203b 100644 (file)
--- a/resample_effect.cpp
+++ b/resample_effect.cpp
@@ -107,7 +107,7 @@ unsigned combine_samples(const Tap<float> *src, Tap<DestFloat> *dst, float num_s
                 float pos2 = src[i + 1].pos;
                 assert(pos2 > pos1);
  
-               fp16_int_t pos, total_weight;
+               DestFloat pos, total_weight;
                 float sum_sq_error;
                 combine_two_samples(w1, w2, pos1, pos2, num_subtexels, inv_num_subtexels, &pos, &total_weight, &sum_sq_error);
  
diff --git a/util.cpp b/util.cpp

index da6057e52f13b4ad4af9ca82a4c83acab6d23388..59f1dcdd67529ef9b4fd54f959fdc49bdc41d633 100644 (file)
--- a/util.cpp
+++ b/util.cpp
@@ -253,11 +253,11 @@ void combine_two_samples(float w1, float w2, float pos1, float pos2, float num_s
         //   w = (a(1-z) + bz) / ((1-z)² + z²)
         //
         // If z had infinite precision, this would simply reduce to w = w1 + w2.
-       *total_weight = (w1 + z * (w2 - w1)) / (z * z + (1 - z) * (1 - z));
+       *total_weight = from_fp64<DestFloat>((w1 + z * (w2 - w1)) / (z * z + (1 - z) * (1 - z)));
  
         if (sum_sq_error != NULL) {
-               float err1 = *total_weight * (1 - z) - w1;
-               float err2 = *total_weight * z - w2;
+               float err1 = to_fp64(*total_weight) * (1 - z) - w1;
+               float err2 = to_fp64(*total_weight) * z - w2;
                 *sum_sq_error = err1 * err1 + err2 * err2;
         }
  }
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Wed, 23 Sep 2015 23:59:47 +0000 (01:59 +0200)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Thu, 24 Sep 2015 00:04:33 +0000 (02:04 +0200)
fp16.cpp		patch | blob | history
fp16.h		patch | blob | history
fp16_test.cpp		patch | blob | history
resample_effect.cpp		patch | blob | history
util.cpp		patch | blob | history