2 * C89 (+ long long) code to convert 64-bit IEEE 754 floating-point numbers to
3 * 16-bit floating-point numbers (OpenEXR et al), without any special hardware
4 * support for either format. Written from scratch by Steinar H. Gunderson
5 * <sgunderson@bigfoot.com>, put in the public domain.
11 const int FP64_BIAS = 1023;
12 const int FP64_MANTISSA_BITS = 52;
13 const int FP64_EXPONENT_BITS = 11;
14 const int FP64_MAX_EXPONENT = 0x7FF;
17 const int FP16_BIAS = 15;
18 const int FP16_MANTISSA_BITS = 10;
19 const int FP16_EXPONENT_BITS = 5;
20 const int FP16_MAX_EXPONENT = 0x1F;
21 typedef unsigned short fp16_int_t;
23 /* test using fp32, since we can verify against the FPU */
24 const int FP16_BIAS = 127;
25 const int FP16_MANTISSA_BITS = 23;
26 const int FP16_EXPONENT_BITS = 8;
27 typedef unsigned int fp16_int_t;
32 unsigned long long ll;
39 double fromfp16(fp16_int_t x)
41 int sign = x >> (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS);
42 int exponent = (x & ((1ULL << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS)) - 1)) >> FP16_MANTISSA_BITS;
43 unsigned long long mantissa = x & ((1ULL << FP16_MANTISSA_BITS) - 1);
47 unsigned long long mantissa64;
51 * Denormals, or zero. Zero is still zero, denormals become
60 exponent64 = FP64_BIAS - FP16_BIAS;
61 mantissa64 = mantissa << (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS + 1);
63 /* Normalize the number. */
64 while ((mantissa64 & (1ULL << FP64_MANTISSA_BITS)) == 0) {
69 /* Clear the now-implicit one-bit. */
70 mantissa64 &= ~(1ULL << FP64_MANTISSA_BITS);
72 } else if (exponent == FP16_MAX_EXPONENT) {
74 * Infinities or NaN (mantissa=0 => infinity, otherwise NaN).
75 * We don't care much about NaNs, so let us just make sure we
76 * keep the first bit (which signals signalling/non-signalling
77 * in many implementations).
80 exponent64 = FP64_MAX_EXPONENT;
81 mantissa64 = mantissa << (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS);
85 /* Up-conversion is simple. Just re-bias the exponent... */
86 exponent64 = exponent + FP64_BIAS - FP16_BIAS;
88 /* ...and convert the mantissa. */
89 mantissa64 = mantissa << (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS);
93 nx.ll = ((unsigned long long)sign64 << (FP64_MANTISSA_BITS + FP64_EXPONENT_BITS))
94 | ((unsigned long long)exponent64 << FP64_MANTISSA_BITS)
99 unsigned long long shift_right_with_round(unsigned long long x, unsigned shift)
101 /* shifts >= 64 need to be special-cased */
104 } else if (shift == 64) {
105 if (x > (1ULL << 63)) {
112 unsigned long long round_part = x & ((1ULL << shift) - 1);
113 if (round_part < (1ULL << (shift - 1))) {
116 } else if (round_part > (1ULL << (shift - 1))) {
121 /* round to nearest even number */
130 fp16_int_t tofp16(double x)
134 unsigned long long f = nx.ll;
135 int sign = f >> (FP64_MANTISSA_BITS + FP64_EXPONENT_BITS);
136 int exponent = (f & ((1ULL << (FP64_MANTISSA_BITS + FP64_EXPONENT_BITS)) - 1)) >> FP64_MANTISSA_BITS;
137 unsigned long long mantissa = f & ((1ULL << FP64_MANTISSA_BITS) - 1);
141 unsigned long long mantissa16;
145 * Denormals, or zero. The largest possible 64-bit
146 * denormal is about +- 2^-1022, and the smallest possible
147 * 16-bit denormal is +- 2^-24. Thus, we can safely
148 * just set all of these to zero (but keep the sign bit).
153 } else if (exponent == FP64_MAX_EXPONENT) {
155 * Infinities or NaN (mantissa=0 => infinity, otherwise NaN).
156 * We don't care much about NaNs, so let us just keep the first
157 * bit (which signals signalling/ non-signalling) and make sure
158 * that we don't coerce NaNs down to infinities.
162 exponent16 = FP16_MAX_EXPONENT;
165 sign16 = sign; /* undefined */
166 exponent16 = FP16_MAX_EXPONENT;
167 mantissa16 = mantissa >> (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS);
168 if (mantissa16 == 0) {
173 /* Re-bias the exponent, and check if we will create a denormal. */
174 exponent16 = exponent + FP16_BIAS - FP64_BIAS;
175 if (exponent16 <= 0) {
176 int shift_amount = FP64_MANTISSA_BITS - FP16_MANTISSA_BITS - exponent16 + 1;
179 mantissa16 = shift_right_with_round(mantissa | (1ULL << FP64_MANTISSA_BITS), shift_amount);
182 * We could actually have rounded back into the lowest possible non-denormal
183 * here, so check for that.
185 if (mantissa16 == (1ULL << FP16_MANTISSA_BITS)) {
191 * First, round off the mantissa, since that could change
192 * the exponent. We use standard IEEE 754r roundTiesToEven
196 mantissa16 = shift_right_with_round(mantissa, FP64_MANTISSA_BITS - FP16_MANTISSA_BITS);
198 /* Check if we overflowed and need to increase the exponent. */
199 if (mantissa16 == (1ULL << FP16_MANTISSA_BITS)) {
204 /* Finally, check for overflow, and create +- inf if we need to. */
205 if (exponent16 >= FP16_MAX_EXPONENT) {
206 exponent16 = FP16_MAX_EXPONENT;
212 return (sign16 << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS))
213 | (exponent16 << FP16_MANTISSA_BITS)
220 printf("%.9f\n", fromfp16(0x34aa));
221 printf("%.9f = %04x => %.9f\n\n", 1.0, tofp16(1.0), fromfp16(tofp16(1.0)));
222 printf("%.9f = %04x => %.9f\n\n", 1.0/3.0, tofp16(1.0/3.0), fromfp16(tofp16(1.0/3.0)));
223 printf("%.9f = %04x => %.9f\n\n", 0.0, tofp16(0.0), fromfp16(tofp16(0.0)));
228 for (i = 0; i < 2800; ++i) {
230 printf("%.9f = %04x => %.9f\n\n", t, tofp16(t), fromfp16(tofp16(t)));
235 * Randomly test a large number of fp64 -> fp32 conversions, comparing
238 unsigned long long num = 0;
239 srand((time_t)time(NULL));
242 unsigned r1 = rand();
243 unsigned r2 = rand();
244 unsigned r3 = rand();
248 f.ll = (((unsigned long long)r1) << 33) ^ ((unsigned long long)r2 << 16) ^ r3;
251 if (tofp16(f.f) != fs.u) {
252 printf("%llx (%.15f): FPU says %x, our code says %x\n", f.ll, f.f, fs.u, tofp16(f.f));
255 if (++num % 1048576 == 0) {
256 printf("%lluM checked, last: %llx -> %x\n", num / 1048576, f.ll, fs.u);