git.sesse.net Git - fp-downconvert/blob - fp16.c

   1 /*
   2  * C89 (+ long long) code to convert 64-bit IEEE 754 floating-point numbers to
   3  * 16-bit floating-point numbers (OpenEXR et al), without any special hardware
   4  * support for either format. Written from scratch by Steinar H. Gunderson
   5  * <sgunderson@bigfoot.com>, put in the public domain.
   6  */
   7 #include <stdio.h>
   8 #include <math.h>
   9 #include <stdlib.h>
  10
  11 const int FP64_BIAS = 1023;
  12 const int FP64_MANTISSA_BITS = 52;
  13 const int FP64_EXPONENT_BITS = 11;
  14 const int FP64_MAX_EXPONENT = 0x7FF;
  15
  16 #if 1
  17 const int FP16_BIAS = 15;
  18 const int FP16_MANTISSA_BITS = 10;
  19 const int FP16_EXPONENT_BITS = 5;
  20 const int FP16_MAX_EXPONENT = 0x1F;
  21 typedef unsigned short fp16_int_t;
  22 #else
  23 /* test using fp32, since we can verify against the FPU */
  24 const int FP16_BIAS = 127;
  25 const int FP16_MANTISSA_BITS = 23;
  26 const int FP16_EXPONENT_BITS = 8;
  27 typedef unsigned int fp16_int_t;
  28 #endif
  29
  30 union fp64 {
  31         double f;
  32         unsigned long long ll;
  33 };
  34 union fp32 {
  35         float f;
  36         unsigned u;
  37 };
  38
  39 double fromfp16(fp16_int_t x)
  40 {
  41         int sign = x >> (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS);
  42         int exponent = (x & ((1ULL << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS)) - 1)) >> FP16_MANTISSA_BITS;
  43         unsigned long long mantissa = x & ((1ULL << FP16_MANTISSA_BITS) - 1);
  44
  45         int sign64;
  46         int exponent64;
  47         unsigned long long mantissa64;
  48
  49         if (exponent == 0) {
  50                 /*
  51                  * Denormals, or zero. Zero is still zero, denormals become
  52                  * ordinary numbers.
  53                  */
  54                 if (mantissa == 0) {
  55                         sign64 = sign;
  56                         exponent64 = 0;
  57                         mantissa64 = 0;
  58                 } else {
  59                         sign64 = sign;
  60                         exponent64 = FP64_BIAS - FP16_BIAS;
  61                         mantissa64 = mantissa << (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS + 1);
  62
  63                         /* Normalize the number. */
  64                         while ((mantissa64 & (1ULL << FP64_MANTISSA_BITS)) == 0) {
  65                                 --exponent64;
  66                                 mantissa64 <<= 1;
  67                         }
  68
  69                         /* Clear the now-implicit one-bit. */
  70                         mantissa64 &= ~(1ULL << FP64_MANTISSA_BITS);
  71                 }
  72         } else if (exponent == FP16_MAX_EXPONENT) {
  73                 /*
  74                  * Infinities or NaN (mantissa=0 => infinity, otherwise NaN).
  75                  * We don't care much about NaNs, so let us just make sure we
  76                  * keep the first bit (which signals signalling/non-signalling
  77                  * in many implementations).
  78                  */
  79                 sign64 = sign;
  80                 exponent64 = FP64_MAX_EXPONENT;
  81                 mantissa64 = mantissa << (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS);
  82         } else {
  83                 sign64 = sign;
  84
  85                 /* Up-conversion is simple. Just re-bias the exponent... */
  86                 exponent64 = exponent + FP64_BIAS - FP16_BIAS;
  87
  88                 /* ...and convert the mantissa. */
  89                 mantissa64 = mantissa << (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS);
  90         }
  91
  92         union fp64 nx;
  93         nx.ll = ((unsigned long long)sign64 << (FP64_MANTISSA_BITS + FP64_EXPONENT_BITS))
  94             | ((unsigned long long)exponent64 << FP64_MANTISSA_BITS)
  95             | mantissa64;
  96         return nx.f;
  97 }
  98
  99 unsigned long long shift_right_with_round(unsigned long long x, unsigned shift)
 100 {
 101         /* shifts >= 64 need to be special-cased */
 102         if (shift > 64) {
 103                 return 0;
 104         } else if (shift == 64) {
 105                 if (x > (1ULL << 63)) {
 106                         return 1;
 107                 } else {
 108                         return 0;
 109                 }
 110         }
 111
 112         unsigned long long round_part = x & ((1ULL << shift) - 1);
 113         if (round_part < (1ULL << (shift - 1))) {
 114                 /* round down */
 115                 x >>= shift;
 116         } else if (round_part > (1ULL << (shift - 1))) {
 117                 /* round up */
 118                 x >>= shift;
 119                 ++x;
 120         } else {
 121                 /* round to nearest even number */
 122                 x >>= shift;
 123                 if ((x & 1) != 0) {
 124                         ++x;
 125                 }
 126         }
 127         return x;
 128 }
 129
 130 fp16_int_t tofp16(double x)
 131 {
 132         union fp64 nx;
 133         nx.f = x;
 134         unsigned long long f = nx.ll;
 135         int sign = f >> (FP64_MANTISSA_BITS + FP64_EXPONENT_BITS);
 136         int exponent = (f & ((1ULL << (FP64_MANTISSA_BITS + FP64_EXPONENT_BITS)) - 1)) >> FP64_MANTISSA_BITS;
 137         unsigned long long mantissa = f & ((1ULL << FP64_MANTISSA_BITS) - 1);
 138
 139         int sign16;
 140         int exponent16;
 141         unsigned long long mantissa16;
 142
 143         if (exponent == 0) {
 144                 /*
 145                  * Denormals, or zero. The largest possible 64-bit
 146                  * denormal is about +- 2^-1022, and the smallest possible
 147                  * 16-bit denormal is +- 2^-24. Thus, we can safely
 148                  * just set all of these to zero (but keep the sign bit).
 149                  */
 150                 sign16 = sign;
 151                 exponent16 = 0;
 152                 mantissa16 = 0;
 153         } else if (exponent == FP64_MAX_EXPONENT) {
 154                 /*
 155                  * Infinities or NaN (mantissa=0 => infinity, otherwise NaN).
 156                  * We don't care much about NaNs, so let us just keep the first
 157                  * bit (which signals signalling/ non-signalling) and make sure
 158                  * that we don't coerce NaNs down to infinities.
 159                  */
 160                 if (mantissa == 0) {
 161                         sign16 = sign;
 162                         exponent16 = FP16_MAX_EXPONENT;
 163                         mantissa16 = 0;
 164                 } else {
 165                         sign16 = sign;  /* undefined */
 166                         exponent16 = FP16_MAX_EXPONENT;
 167                         mantissa16 = mantissa >> (FP64_MANTISSA_BITS - FP16_MANTISSA_BITS);
 168                         if (mantissa16 == 0) {
 169                                 mantissa16 = 1;
 170                         }
 171                 }
 172         } else {
 173                 /* Re-bias the exponent, and check if we will create a denormal. */
 174                 exponent16 = exponent + FP16_BIAS - FP64_BIAS;
 175                 if (exponent16 <= 0) {
 176                         int shift_amount = FP64_MANTISSA_BITS - FP16_MANTISSA_BITS - exponent16 + 1;
 177                         sign16 = sign;
 178                         exponent16 = 0;
 179                         mantissa16 = shift_right_with_round(mantissa | (1ULL << FP64_MANTISSA_BITS), shift_amount);
 180
 181                         /*
 182                          * We could actually have rounded back into the lowest possible non-denormal
 183                          * here, so check for that.
 184                          */
 185                         if (mantissa16 == (1ULL << FP16_MANTISSA_BITS)) {
 186                                 exponent16 = 1;
 187                                 mantissa16 = 0;
 188                         }
 189                 } else {
 190                         /*
 191                          * First, round off the mantissa, since that could change
 192                          * the exponent. We use standard IEEE 754r roundTiesToEven
 193                          * mode.
 194                          */
 195                         sign16 = sign;
 196                         mantissa16 = shift_right_with_round(mantissa, FP64_MANTISSA_BITS - FP16_MANTISSA_BITS);
 197
 198                         /* Check if we overflowed and need to increase the exponent. */
 199                         if (mantissa16 == (1ULL << FP16_MANTISSA_BITS)) {
 200                                 ++exponent16;
 201                                 mantissa16 = 0;
 202                         }
 203
 204                         /* Finally, check for overflow, and create +- inf if we need to. */
 205                         if (exponent16 >= FP16_MAX_EXPONENT) {
 206                                 exponent16 = FP16_MAX_EXPONENT;
 207                                 mantissa16 = 0;
 208                         }
 209                 }
 210         }
 211
 212         return (sign16 << (FP16_MANTISSA_BITS + FP16_EXPONENT_BITS))
 213             | (exponent16 << FP16_MANTISSA_BITS)
 214             | mantissa16;
 215 }
 216
 217 int main(void)
 218 {
 219 #if 1
 220         printf("%.9f\n", fromfp16(0x34aa));
 221         printf("%.9f = %04x => %.9f\n\n", 1.0, tofp16(1.0), fromfp16(tofp16(1.0)));
 222         printf("%.9f = %04x => %.9f\n\n", 1.0/3.0, tofp16(1.0/3.0), fromfp16(tofp16(1.0/3.0)));
 223         printf("%.9f = %04x => %.9f\n\n", 0.0, tofp16(0.0), fromfp16(tofp16(0.0)));
 224
 225         {
 226                 int i;
 227                 double t = 7.999;
 228                 for (i = 0; i < 2800; ++i) {
 229                         t *= 3.0;
 230                         printf("%.9f = %04x => %.9f\n\n", t, tofp16(t), fromfp16(tofp16(t)));
 231                 }
 232         }
 233 #else
 234         /*
 235          * Randomly test a large number of fp64 -> fp32 conversions, comparing
 236          * against the FPU.
 237          */
 238         unsigned long long num = 0;
 239         srand((time_t)time(NULL));
 240
 241         for ( ;; ) {
 242                 unsigned r1 = rand();
 243                 unsigned r2 = rand();
 244                 unsigned r3 = rand();
 245                 union fp64 f;
 246                 union fp32 fs;
 247
 248                 f.ll = (((unsigned long long)r1) << 33) ^ ((unsigned long long)r2 << 16) ^ r3;
 249                 fs.f = (float)f.f;
 250
 251                 if (tofp16(f.f) != fs.u) {
 252                         printf("%llx (%.15f): FPU says %x, our code says %x\n", f.ll, f.f, fs.u, tofp16(f.f));
 253                 }
 254
 255                 if (++num % 1048576 == 0) {
 256                         printf("%lluM checked, last: %llx -> %x\n", num / 1048576, f.ll, fs.u);
 257                 }
 258         }
 259 #endif
 260         exit(0);
 261 }