Seemingly weights were always returned as float, and then cast
to fp16_int_t -- without proper conversion! And sum_sq_error
would be calculated based on the correct value, not the broken-
casted one.
It's a small miracle the unit tests didn't catch this; they didn't
until I started introducing small errors for another reason.
Most real-world testing seems to have hit fp32, and thus this
wasn't caught there either.
Also make fp16_int_t a struct so that it is not implicitly
convertible to/from numeric types, so this never ever can happen again.
int FP64_BIAS, int FP64_MANTISSA_BITS, int FP64_EXPONENT_BITS, int FP64_MAX_EXPONENT>
inline double fp_upconvert(FP16_INT_T x)
{
int FP64_BIAS, int FP64_MANTISSA_BITS, int FP64_EXPONENT_BITS, int FP64_MAX_EXPONENT>
inline double fp_upconvert(FP16_INT_T x)
{
- int sign = x >> (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS);
- int exponent = (x & ((1ULL << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS)) - 1)) >> FP16_MANTISSA_BITS;
- unsigned long long mantissa = x & ((1ULL << FP16_MANTISSA_BITS) - 1);
+ int sign = x.val >> (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS);
+ int exponent = (x.val & ((1ULL << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS)) - 1)) >> FP16_MANTISSA_BITS;
+ unsigned long long mantissa = x.val & ((1ULL << FP16_MANTISSA_BITS) - 1);
int sign64;
int exponent64;
int sign64;
int exponent64;
- return (sign16 << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS))
+ FP16_INT_T ret;
+ ret.val = (sign16 << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS))
| (exponent16 << FP16_MANTISSA_BITS)
| mantissa16;
| (exponent16 << FP16_MANTISSA_BITS)
| mantissa16;
}
const int FP64_BIAS = 1023;
}
const int FP64_BIAS = 1023;
-typedef unsigned int fp32_int_t;
-typedef unsigned short fp16_int_t;
+// structs instead of ints, so that they are not implicitly convertible.
+struct fp32_int_t {
+ unsigned int val;
+};
+struct fp16_int_t {
+ unsigned short val;
+};
// are at compile time).
static inline double fp16_to_fp64(fp16_int_t x)
{
// are at compile time).
static inline double fp16_to_fp64(fp16_int_t x)
{
+ return _cvtsh_ss(x.val);
}
static inline fp16_int_t fp64_to_fp16(double x)
{
// NOTE: Strictly speaking, there are some select values where this isn't correct,
// since we first round to fp32 and then to fp16.
}
static inline fp16_int_t fp64_to_fp16(double x)
{
// NOTE: Strictly speaking, there are some select values where this isn't correct,
// since we first round to fp32 and then to fp16.
- return _cvtss_sh(x, 0);
+ fp16_int_t ret;
+ ret.val = _cvtss_sh(x, 0);
+ return ret;
#include <gtest/gtest.h>
namespace movit {
#include <gtest/gtest.h>
namespace movit {
+namespace {
+
+fp16_int_t make_fp16(unsigned short x)
+{
+ fp16_int_t ret;
+ ret.val = x;
+ return ret;
+}
+
+fp32_int_t make_fp32(unsigned int x)
+{
+ fp32_int_t ret;
+ ret.val = x;
+ return ret;
+}
+
+} // namespace
- EXPECT_EQ(0x0000, fp64_to_fp16(0.0));
- EXPECT_DOUBLE_EQ(0.0, fp16_to_fp64(0x0000));
+ EXPECT_EQ(0x0000, fp64_to_fp16(0.0).val);
+ EXPECT_DOUBLE_EQ(0.0, fp16_to_fp64(make_fp16(0x0000)));
- EXPECT_EQ(0x3c00, fp64_to_fp16(1.0));
- EXPECT_DOUBLE_EQ(1.0, fp16_to_fp64(0x3c00));
+ EXPECT_EQ(0x3c00, fp64_to_fp16(1.0).val);
+ EXPECT_DOUBLE_EQ(1.0, fp16_to_fp64(make_fp16(0x3c00)));
- EXPECT_EQ(0x3555, fp64_to_fp16(1.0 / 3.0));
- EXPECT_DOUBLE_EQ(0.333251953125, fp16_to_fp64(0x3555));
+ EXPECT_EQ(0x3555, fp64_to_fp16(1.0 / 3.0).val);
+ EXPECT_DOUBLE_EQ(0.333251953125, fp16_to_fp64(make_fp16(0x3555)));
}
TEST(FP16Test, RoundToNearestEven) {
}
TEST(FP16Test, RoundToNearestEven) {
- ASSERT_DOUBLE_EQ(1.0, fp16_to_fp64(0x3c00));
-
- double x0 = fp16_to_fp64(0x3c00);
- double x1 = fp16_to_fp64(0x3c01);
- double x2 = fp16_to_fp64(0x3c02);
- double x3 = fp16_to_fp64(0x3c03);
- double x4 = fp16_to_fp64(0x3c04);
-
- EXPECT_EQ(0x3c00, fp64_to_fp16(0.5 * (x0 + x1)));
- EXPECT_EQ(0x3c02, fp64_to_fp16(0.5 * (x1 + x2)));
- EXPECT_EQ(0x3c02, fp64_to_fp16(0.5 * (x2 + x3)));
- EXPECT_EQ(0x3c04, fp64_to_fp16(0.5 * (x3 + x4)));
+ ASSERT_DOUBLE_EQ(1.0, fp16_to_fp64(make_fp16(0x3c00)));
+
+ double x0 = fp16_to_fp64(make_fp16(0x3c00));
+ double x1 = fp16_to_fp64(make_fp16(0x3c01));
+ double x2 = fp16_to_fp64(make_fp16(0x3c02));
+ double x3 = fp16_to_fp64(make_fp16(0x3c03));
+ double x4 = fp16_to_fp64(make_fp16(0x3c04));
+
+ EXPECT_EQ(0x3c00, fp64_to_fp16(0.5 * (x0 + x1)).val);
+ EXPECT_EQ(0x3c02, fp64_to_fp16(0.5 * (x1 + x2)).val);
+ EXPECT_EQ(0x3c02, fp64_to_fp16(0.5 * (x2 + x3)).val);
+ EXPECT_EQ(0x3c04, fp64_to_fp16(0.5 * (x3 + x4)).val);
TEST(FP16Test, NaN) {
// Ignore the sign bit.
TEST(FP16Test, NaN) {
// Ignore the sign bit.
- EXPECT_EQ(0x7e00, fp64_to_fp16(0.0 / 0.0) & 0x7fff);
- EXPECT_TRUE(isnan(fp16_to_fp64(0xfe00)));
+ EXPECT_EQ(0x7e00, fp64_to_fp16(0.0 / 0.0).val & 0x7fff);
+ EXPECT_TRUE(isnan(fp16_to_fp64(make_fp16(0xfe00))));
fp64 borderline_inf;
borderline_inf.ll = 0x7ff0000000000000ull;
fp64 borderline_inf;
borderline_inf.ll = 0x7ff0000000000000ull;
TEST(FP16Test, Denormals) {
const double smallest_fp16_denormal = 5.9604644775390625e-08;
TEST(FP16Test, Denormals) {
const double smallest_fp16_denormal = 5.9604644775390625e-08;
- EXPECT_EQ(0x0001, fp64_to_fp16(smallest_fp16_denormal));
- EXPECT_EQ(0x0000, fp64_to_fp16(0.5 * smallest_fp16_denormal)); // Round-to-even.
- EXPECT_EQ(0x0001, fp64_to_fp16(0.51 * smallest_fp16_denormal));
- EXPECT_EQ(0x0002, fp64_to_fp16(1.5 * smallest_fp16_denormal));
+ EXPECT_EQ(0x0001, fp64_to_fp16(smallest_fp16_denormal).val);
+ EXPECT_EQ(0x0000, fp64_to_fp16(0.5 * smallest_fp16_denormal).val); // Round-to-even.
+ EXPECT_EQ(0x0001, fp64_to_fp16(0.51 * smallest_fp16_denormal).val);
+ EXPECT_EQ(0x0002, fp64_to_fp16(1.5 * smallest_fp16_denormal).val);
const double smallest_fp16_non_denormal = 6.103515625e-05;
const double smallest_fp16_non_denormal = 6.103515625e-05;
- EXPECT_EQ(0x0400, fp64_to_fp16(smallest_fp16_non_denormal));
- EXPECT_EQ(0x0400, fp64_to_fp16(smallest_fp16_non_denormal - 0.5 * smallest_fp16_denormal)); // Round-to-even.
- EXPECT_EQ(0x03ff, fp64_to_fp16(smallest_fp16_non_denormal - smallest_fp16_denormal));
+ EXPECT_EQ(0x0400, fp64_to_fp16(smallest_fp16_non_denormal).val);
+ EXPECT_EQ(0x0400, fp64_to_fp16(smallest_fp16_non_denormal - 0.5 * smallest_fp16_denormal).val); // Round-to-even.
+ EXPECT_EQ(0x03ff, fp64_to_fp16(smallest_fp16_non_denormal - smallest_fp16_denormal).val);
}
// Randomly test a large number of fp64 -> fp32 conversions, comparing
}
// Randomly test a large number of fp64 -> fp32 conversions, comparing
src.ll = (((unsigned long long)r1) << 33) ^ ((unsigned long long)r2 << 16) ^ r3;
reference.f = float(src.f);
src.ll = (((unsigned long long)r1) << 33) ^ ((unsigned long long)r2 << 16) ^ r3;
reference.f = float(src.f);
- result.u = fp64_to_fp32(src.f);
+ result.u = fp64_to_fp32(src.f).val;
EXPECT_EQ(isnan(result.f), isnan(reference.f));
if (!isnan(result.f)) {
EXPECT_EQ(isnan(result.f), isnan(reference.f));
if (!isnan(result.f)) {
src.u = ((unsigned long long)r1 << 16) ^ r2;
reference.f = double(src.f);
src.u = ((unsigned long long)r1 << 16) ^ r2;
reference.f = double(src.f);
- result.f = fp32_to_fp64(src.u);
+ result.f = fp32_to_fp64(make_fp32(src.u));
EXPECT_EQ(isnan(result.f), isnan(reference.f));
if (!isnan(result.f)) {
EXPECT_EQ(isnan(result.f), isnan(reference.f));
if (!isnan(result.f)) {
float pos2 = src[i + 1].pos;
assert(pos2 > pos1);
float pos2 = src[i + 1].pos;
assert(pos2 > pos1);
- fp16_int_t pos, total_weight;
+ DestFloat pos, total_weight;
float sum_sq_error;
combine_two_samples(w1, w2, pos1, pos2, num_subtexels, inv_num_subtexels, &pos, &total_weight, &sum_sq_error);
float sum_sq_error;
combine_two_samples(w1, w2, pos1, pos2, num_subtexels, inv_num_subtexels, &pos, &total_weight, &sum_sq_error);
// w = (a(1-z) + bz) / ((1-z)² + z²)
//
// If z had infinite precision, this would simply reduce to w = w1 + w2.
// w = (a(1-z) + bz) / ((1-z)² + z²)
//
// If z had infinite precision, this would simply reduce to w = w1 + w2.
- *total_weight = (w1 + z * (w2 - w1)) / (z * z + (1 - z) * (1 - z));
+ *total_weight = from_fp64<DestFloat>((w1 + z * (w2 - w1)) / (z * z + (1 - z) * (1 - z)));
if (sum_sq_error != NULL) {
if (sum_sq_error != NULL) {
- float err1 = *total_weight * (1 - z) - w1;
- float err2 = *total_weight * z - w2;
+ float err1 = to_fp64(*total_weight) * (1 - z) - w1;
+ float err2 = to_fp64(*total_weight) * z - w2;
*sum_sq_error = err1 * err1 + err2 * err2;
}
}
*sum_sq_error = err1 * err1 + err2 * err2;
}
}