From: Steinar H. Gunderson Date: Tue, 22 Jan 2013 16:17:48 +0000 (+0100) Subject: Initial checkin for move to Git (no prior version history available). X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=0ffad9a9708014bb66203d3ab94cbe214b096b30;p=fp-downconvert Initial checkin for move to Git (no prior version history available). --- 0ffad9a9708014bb66203d3ab94cbe214b096b30 diff --git a/fp16.c b/fp16.c new file mode 100644 index 0000000..d1fdb92 --- /dev/null +++ b/fp16.c @@ -0,0 +1,261 @@ +/* + * C89 (+ long long) code to convert 64-bit IEEE 754 floating-point numbers to + * 16-bit floating-point numbers (OpenEXR et al), without any special hardware + * support for either format. Written from scratch by Steinar H. Gunderson + * , put in the public domain. + */ +#include +#include +#include + +const int FP64_BIAS = 1023; +const int FP64_MANTISSA_BITS = 52; +const int FP64_EXPONENT_BITS = 11; +const int FP64_MAX_EXPONENT = 0x7FF; + +#if 1 +const int FP16_BIAS = 15; +const int FP16_MANTISSA_BITS = 10; +const int FP16_EXPONENT_BITS = 5; +const int FP16_MAX_EXPONENT = 0x1F; +typedef unsigned short fp16_int_t; +#else +/* test using fp32, since we can verify against the FPU */ +const int FP16_BIAS = 127; +const int FP16_MANTISSA_BITS = 23; +const int FP16_EXPONENT_BITS = 8; +typedef unsigned int fp16_int_t; +#endif + +union fp64 { + double f; + unsigned long long ll; +}; +union fp32 { + float f; + unsigned u; +}; + +double fromfp16(fp16_int_t x) +{ + int sign = x >> (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS); + int exponent = (x & ((1ULL << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS)) - 1)) >> FP16_MANTISSA_BITS; + unsigned long long mantissa = x & ((1ULL << FP16_MANTISSA_BITS) - 1); + + int sign64; + int exponent64; + unsigned long long mantissa64; + + if (exponent == 0) { + /* + * Denormals, or zero. Zero is still zero, denormals become + * ordinary numbers. + */ + if (mantissa == 0) { + sign64 = sign; + exponent64 = 0; + mantissa64 = 0; + } else { + sign64 = sign; + exponent64 = FP64_BIAS - FP16_BIAS; + mantissa64 = mantissa << (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS + 1); + + /* Normalize the number. */ + while ((mantissa64 & (1ULL << FP64_MANTISSA_BITS)) == 0) { + --exponent64; + mantissa64 <<= 1; + } + + /* Clear the now-implicit one-bit. */ + mantissa64 &= ~(1ULL << FP64_MANTISSA_BITS); + } + } else if (exponent == FP16_MAX_EXPONENT) { + /* + * Infinities or NaN (mantissa=0 => infinity, otherwise NaN). + * We don't care much about NaNs, so let us just make sure we + * keep the first bit (which signals signalling/non-signalling + * in many implementations). + */ + sign64 = sign; + exponent64 = FP64_MAX_EXPONENT; + mantissa64 = mantissa << (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS); + } else { + sign64 = sign; + + /* Up-conversion is simple. Just re-bias the exponent... */ + exponent64 = exponent + FP64_BIAS - FP16_BIAS; + + /* ...and convert the mantissa. */ + mantissa64 = mantissa << (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS); + } + + union fp64 nx; + nx.ll = ((unsigned long long)sign64 << (FP64_MANTISSA_BITS + FP64_EXPONENT_BITS)) + | ((unsigned long long)exponent64 << FP64_MANTISSA_BITS) + | mantissa64; + return nx.f; +} + +unsigned long long shift_right_with_round(unsigned long long x, unsigned shift) +{ + /* shifts >= 64 need to be special-cased */ + if (shift > 64) { + return 0; + } else if (shift == 64) { + if (x > (1ULL << 63)) { + return 1; + } else { + return 0; + } + } + + unsigned long long round_part = x & ((1ULL << shift) - 1); + if (round_part < (1ULL << (shift - 1))) { + /* round down */ + x >>= shift; + } else if (round_part > (1ULL << (shift - 1))) { + /* round up */ + x >>= shift; + ++x; + } else { + /* round to nearest even number */ + x >>= shift; + if ((x & 1) != 0) { + ++x; + } + } + return x; +} + +fp16_int_t tofp16(double x) +{ + union fp64 nx; + nx.f = x; + unsigned long long f = nx.ll; + int sign = f >> (FP64_MANTISSA_BITS + FP64_EXPONENT_BITS); + int exponent = (f & ((1ULL << (FP64_MANTISSA_BITS + FP64_EXPONENT_BITS)) - 1)) >> FP64_MANTISSA_BITS; + unsigned long long mantissa = f & ((1ULL << FP64_MANTISSA_BITS) - 1); + + int sign16; + int exponent16; + unsigned long long mantissa16; + + if (exponent == 0) { + /* + * Denormals, or zero. The largest possible 64-bit + * denormal is about +- 2^-1022, and the smallest possible + * 16-bit denormal is +- 2^-24. Thus, we can safely + * just set all of these to zero (but keep the sign bit). + */ + sign16 = sign; + exponent16 = 0; + mantissa16 = 0; + } else if (exponent == FP64_MAX_EXPONENT) { + /* + * Infinities or NaN (mantissa=0 => infinity, otherwise NaN). + * We don't care much about NaNs, so let us just keep the first + * bit (which signals signalling/ non-signalling) and make sure + * that we don't coerce NaNs down to infinities. + */ + if (mantissa == 0) { + sign16 = sign; + exponent16 = FP16_MAX_EXPONENT; + mantissa16 = 0; + } else { + sign16 = sign; /* undefined */ + exponent16 = FP16_MAX_EXPONENT; + mantissa16 = mantissa >> (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS); + if (mantissa16 == 0) { + mantissa16 = 1; + } + } + } else { + /* Re-bias the exponent, and check if we will create a denormal. */ + exponent16 = exponent + FP16_BIAS - FP64_BIAS; + if (exponent16 <= 0) { + int shift_amount = FP64_MANTISSA_BITS - FP16_MANTISSA_BITS - exponent16 + 1; + sign16 = sign; + exponent16 = 0; + mantissa16 = shift_right_with_round(mantissa | (1ULL << FP64_MANTISSA_BITS), shift_amount); + + /* + * We could actually have rounded back into the lowest possible non-denormal + * here, so check for that. + */ + if (mantissa16 == (1ULL << FP16_MANTISSA_BITS)) { + exponent16 = 1; + mantissa16 = 0; + } + } else { + /* + * First, round off the mantissa, since that could change + * the exponent. We use standard IEEE 754r roundTiesToEven + * mode. + */ + sign16 = sign; + mantissa16 = shift_right_with_round(mantissa, FP64_MANTISSA_BITS - FP16_MANTISSA_BITS); + + /* Check if we overflowed and need to increase the exponent. */ + if (mantissa16 == (1ULL << FP16_MANTISSA_BITS)) { + ++exponent16; + mantissa16 = 0; + } + + /* Finally, check for overflow, and create +- inf if we need to. */ + if (exponent16 >= FP16_MAX_EXPONENT) { + exponent16 = FP16_MAX_EXPONENT; + mantissa16 = 0; + } + } + } + + return (sign16 << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS)) + | (exponent16 << FP16_MANTISSA_BITS) + | mantissa16; +} + +int main(void) +{ +#if 1 + printf("%.9f\n", fromfp16(0x34aa)); + printf("%.9f = %04x => %.9f\n\n", 1.0, tofp16(1.0), fromfp16(tofp16(1.0))); + printf("%.9f = %04x => %.9f\n\n", 1.0/3.0, tofp16(1.0/3.0), fromfp16(tofp16(1.0/3.0))); + printf("%.9f = %04x => %.9f\n\n", 0.0, tofp16(0.0), fromfp16(tofp16(0.0))); + + { + int i; + double t = 7.999; + for (i = 0; i < 2800; ++i) { + t *= 3.0; + printf("%.9f = %04x => %.9f\n\n", t, tofp16(t), fromfp16(tofp16(t))); + } + } +#else + /* + * Randomly test a large number of fp64 -> fp32 conversions, comparing + * against the FPU. + */ + unsigned long long num = 0; + srand((time_t)time(NULL)); + + for ( ;; ) { + unsigned r1 = rand(); + unsigned r2 = rand(); + unsigned r3 = rand(); + union fp64 f; + union fp32 fs; + + f.ll = (((unsigned long long)r1) << 33) ^ ((unsigned long long)r2 << 16) ^ r3; + fs.f = (float)f.f; + + if (tofp16(f.f) != fs.u) { + printf("%llx (%.15f): FPU says %x, our code says %x\n", f.ll, f.f, fs.u, tofp16(f.f)); + } + + if (++num % 1048576 == 0) { + printf("%lluM checked, last: %llx -> %x\n", num / 1048576, f.ll, fs.u); + } + } +#endif + exit(0); +}