11 template<class FP16_INT_T,
12 int FP16_BIAS, int FP16_MANTISSA_BITS, int FP16_EXPONENT_BITS, int FP16_MAX_EXPONENT,
13 int FP64_BIAS, int FP64_MANTISSA_BITS, int FP64_EXPONENT_BITS, int FP64_MAX_EXPONENT>
14 inline double fp_upconvert(FP16_INT_T x)
16 int sign = x >> (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS);
17 int exponent = (x & ((1ULL << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS)) - 1)) >> FP16_MANTISSA_BITS;
18 unsigned long long mantissa = x & ((1ULL << FP16_MANTISSA_BITS) - 1);
22 unsigned long long mantissa64;
26 * Denormals, or zero. Zero is still zero, denormals become
35 exponent64 = FP64_BIAS - FP16_BIAS;
36 mantissa64 = mantissa << (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS + 1);
38 /* Normalize the number. */
39 while ((mantissa64 & (1ULL << FP64_MANTISSA_BITS)) == 0) {
44 /* Clear the now-implicit one-bit. */
45 mantissa64 &= ~(1ULL << FP64_MANTISSA_BITS);
47 } else if (exponent == FP16_MAX_EXPONENT) {
49 * Infinities or NaN (mantissa=0 => infinity, otherwise NaN).
50 * We don't care much about NaNs, so let us just make sure we
51 * keep the first bit (which signals signalling/non-signalling
52 * in many implementations).
55 exponent64 = FP64_MAX_EXPONENT;
56 mantissa64 = mantissa << (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS);
60 /* Up-conversion is simple. Just re-bias the exponent... */
61 exponent64 = exponent + FP64_BIAS - FP16_BIAS;
63 /* ...and convert the mantissa. */
64 mantissa64 = mantissa << (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS);
68 nx.ll = ((unsigned long long)sign64 << (FP64_MANTISSA_BITS + FP64_EXPONENT_BITS))
69 | ((unsigned long long)exponent64 << FP64_MANTISSA_BITS)
74 unsigned long long shift_right_with_round(unsigned long long x, unsigned shift)
76 /* shifts >= 64 need to be special-cased */
79 } else if (shift == 64) {
80 if (x > (1ULL << 63)) {
87 unsigned long long round_part = x & ((1ULL << shift) - 1);
88 if (round_part < (1ULL << (shift - 1))) {
91 } else if (round_part > (1ULL << (shift - 1))) {
96 /* round to nearest even number */
105 template<class FP16_INT_T,
106 int FP16_BIAS, int FP16_MANTISSA_BITS, int FP16_EXPONENT_BITS, int FP16_MAX_EXPONENT,
107 int FP64_BIAS, int FP64_MANTISSA_BITS, int FP64_EXPONENT_BITS, int FP64_MAX_EXPONENT>
108 inline FP16_INT_T fp_downconvert(double x)
112 unsigned long long f = nx.ll;
113 int sign = f >> (FP64_MANTISSA_BITS + FP64_EXPONENT_BITS);
114 int exponent = (f & ((1ULL << (FP64_MANTISSA_BITS + FP64_EXPONENT_BITS)) - 1)) >> FP64_MANTISSA_BITS;
115 unsigned long long mantissa = f & ((1ULL << FP64_MANTISSA_BITS) - 1);
119 unsigned long long mantissa16;
123 * Denormals, or zero. The largest possible 64-bit
124 * denormal is about +- 2^-1022, and the smallest possible
125 * 16-bit denormal is +- 2^-24. Thus, we can safely
126 * just set all of these to zero (but keep the sign bit).
131 } else if (exponent == FP64_MAX_EXPONENT) {
133 * Infinities or NaN (mantissa=0 => infinity, otherwise NaN).
134 * We don't care much about NaNs, so let us just keep the first
135 * bit (which signals signalling/ non-signalling) and make sure
136 * that we don't coerce NaNs down to infinities.
140 exponent16 = FP16_MAX_EXPONENT;
143 sign16 = sign; /* undefined */
144 exponent16 = FP16_MAX_EXPONENT;
145 mantissa16 = mantissa >> (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS);
146 if (mantissa16 == 0) {
151 /* Re-bias the exponent, and check if we will create a denormal. */
152 exponent16 = exponent + FP16_BIAS - FP64_BIAS;
153 if (exponent16 <= 0) {
154 int shift_amount = FP64_MANTISSA_BITS - FP16_MANTISSA_BITS - exponent16 + 1;
157 mantissa16 = shift_right_with_round(mantissa | (1ULL << FP64_MANTISSA_BITS), shift_amount);
160 * We could actually have rounded back into the lowest possible non-denormal
161 * here, so check for that.
163 if (mantissa16 == (1ULL << FP16_MANTISSA_BITS)) {
169 * First, round off the mantissa, since that could change
170 * the exponent. We use standard IEEE 754r roundTiesToEven
174 mantissa16 = shift_right_with_round(mantissa, FP64_MANTISSA_BITS - FP16_MANTISSA_BITS);
176 /* Check if we overflowed and need to increase the exponent. */
177 if (mantissa16 == (1ULL << FP16_MANTISSA_BITS)) {
182 /* Finally, check for overflow, and create +- inf if we need to. */
183 if (exponent16 >= FP16_MAX_EXPONENT) {
184 exponent16 = FP16_MAX_EXPONENT;
190 return (sign16 << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS))
191 | (exponent16 << FP16_MANTISSA_BITS)
195 const int FP64_BIAS = 1023;
196 const int FP64_MANTISSA_BITS = 52;
197 const int FP64_EXPONENT_BITS = 11;
198 const int FP64_MAX_EXPONENT = (1 << FP64_EXPONENT_BITS) - 1;
200 const int FP32_BIAS = 127;
201 const int FP32_MANTISSA_BITS = 23;
202 const int FP32_EXPONENT_BITS = 8;
203 const int FP32_MAX_EXPONENT = (1 << FP32_EXPONENT_BITS) - 1;
205 const int FP16_BIAS = 15;
206 const int FP16_MANTISSA_BITS = 10;
207 const int FP16_EXPONENT_BITS = 5;
208 const int FP16_MAX_EXPONENT = (1 << FP16_EXPONENT_BITS) - 1;
212 double fp16_to_fp64(fp16_int_t x)
214 return fp_upconvert<fp16_int_t,
215 FP16_BIAS, FP16_MANTISSA_BITS, FP16_EXPONENT_BITS, FP16_MAX_EXPONENT,
216 FP64_BIAS, FP64_MANTISSA_BITS, FP64_EXPONENT_BITS, FP64_MAX_EXPONENT>(x);
219 fp16_int_t fp64_to_fp16(double x)
221 return fp_downconvert<fp16_int_t,
222 FP16_BIAS, FP16_MANTISSA_BITS, FP16_EXPONENT_BITS, FP16_MAX_EXPONENT,
223 FP64_BIAS, FP64_MANTISSA_BITS, FP64_EXPONENT_BITS, FP64_MAX_EXPONENT>(x);
226 double fp32_to_fp64(fp32_int_t x)
228 return fp_upconvert<fp32_int_t,
229 FP32_BIAS, FP32_MANTISSA_BITS, FP32_EXPONENT_BITS, FP32_MAX_EXPONENT,
230 FP64_BIAS, FP64_MANTISSA_BITS, FP64_EXPONENT_BITS, FP64_MAX_EXPONENT>(x);
233 fp32_int_t fp64_to_fp32(double x)
235 return fp_downconvert<fp32_int_t,
236 FP32_BIAS, FP32_MANTISSA_BITS, FP32_EXPONENT_BITS, FP32_MAX_EXPONENT,
237 FP64_BIAS, FP64_MANTISSA_BITS, FP64_EXPONENT_BITS, FP64_MAX_EXPONENT>(x);