git.sesse.net Git - pistorm/blob - softfloat/softfloat.h

   1
   2 /*============================================================================
   3
   4 This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 /*----------------------------------------------------------------------------
  34 | The macro `FLOATX80' must be defined to enable the extended double-precision
  35 | floating-point format `floatx80'.  If this macro is not defined, the
  36 | `floatx80' type will not be defined, and none of the functions that either
  37 | input or output the `floatx80' type will be defined.  The same applies to
  38 | the `FLOAT128' macro and the quadruple-precision format `float128'.
  39 *----------------------------------------------------------------------------*/
  40 #define FLOATX80
  41 #define FLOAT128
  42
  43 /*----------------------------------------------------------------------------
  44 | Software IEC/IEEE floating-point types.
  45 *----------------------------------------------------------------------------*/
  46 typedef bits32 float32;
  47 typedef bits64 float64;
  48 #ifdef FLOATX80
  49 typedef struct {
  50         bits16 high;
  51         bits64 low;
  52 } floatx80;
  53 #endif
  54 #ifdef FLOAT128
  55 typedef struct {
  56         bits64 high, low;
  57 } float128;
  58 #endif
  59
  60 /*----------------------------------------------------------------------------
  61 | Primitive arithmetic functions, including multi-word arithmetic, and
  62 | division and square root approximations.  (Can be specialized to target if
  63 | desired.)
  64 *----------------------------------------------------------------------------*/
  65 #include "softfloat-macros"
  66
  67 /*----------------------------------------------------------------------------
  68 | Software IEC/IEEE floating-point underflow tininess-detection mode.
  69 *----------------------------------------------------------------------------*/
  70 extern int8 float_detect_tininess;
  71 enum {
  72         float_tininess_after_rounding  = 0,
  73         float_tininess_before_rounding = 1
  74 };
  75
  76 /*----------------------------------------------------------------------------
  77 | Software IEC/IEEE floating-point rounding mode.
  78 *----------------------------------------------------------------------------*/
  79 extern int8 float_rounding_mode;
  80 enum {
  81         float_round_nearest_even = 0,
  82         float_round_to_zero      = 1,
  83         float_round_down         = 2,
  84         float_round_up           = 3
  85 };
  86
  87 /*----------------------------------------------------------------------------
  88 | Software IEC/IEEE floating-point exception flags.
  89 *----------------------------------------------------------------------------*/
  90 extern int8 float_exception_flags;
  91 enum {
  92         float_flag_invalid = 0x01, float_flag_denormal = 0x02, float_flag_divbyzero = 0x04, float_flag_overflow = 0x08,
  93         float_flag_underflow = 0x10, float_flag_inexact = 0x20
  94 };
  95
  96 /*----------------------------------------------------------------------------
  97 | Routine to raise any or all of the software IEC/IEEE floating-point
  98 | exception flags.
  99 *----------------------------------------------------------------------------*/
 100 void float_raise( int8 );
 101
 102 /*----------------------------------------------------------------------------
 103 | Software IEC/IEEE integer-to-floating-point conversion routines.
 104 *----------------------------------------------------------------------------*/
 105 float32 int32_to_float32( int32 );
 106 float64 int32_to_float64( int32 );
 107 #ifdef FLOATX80
 108 floatx80 int32_to_floatx80( int32 );
 109 #endif
 110 #ifdef FLOAT128
 111 float128 int32_to_float128( int32 );
 112 #endif
 113 float32 int64_to_float32( int64 );
 114 float64 int64_to_float64( int64 );
 115 #ifdef FLOATX80
 116 floatx80 int64_to_floatx80( int64 );
 117 #endif
 118 #ifdef FLOAT128
 119 float128 int64_to_float128( int64 );
 120 #endif
 121
 122 /*----------------------------------------------------------------------------
 123 | Software IEC/IEEE single-precision conversion routines.
 124 *----------------------------------------------------------------------------*/
 125 int32 float32_to_int32( float32 );
 126 int32 float32_to_int32_round_to_zero( float32 );
 127 int64 float32_to_int64( float32 );
 128 int64 float32_to_int64_round_to_zero( float32 );
 129 float64 float32_to_float64( float32 );
 130 #ifdef FLOATX80
 131 floatx80 float32_to_floatx80( float32 );
 132 #endif
 133 #ifdef FLOAT128
 134 float128 float32_to_float128( float32 );
 135 #endif
 136
 137 /*----------------------------------------------------------------------------
 138 | Software IEC/IEEE single-precision operations.
 139 *----------------------------------------------------------------------------*/
 140 float32 float32_round_to_int( float32 );
 141 float32 float32_add( float32, float32 );
 142 float32 float32_sub( float32, float32 );
 143 float32 float32_mul( float32, float32 );
 144 float32 float32_div( float32, float32 );
 145 float32 float32_rem( float32, float32 );
 146 float32 float32_sqrt( float32 );
 147 flag float32_eq( float32, float32 );
 148 flag float32_le( float32, float32 );
 149 flag float32_lt( float32, float32 );
 150 flag float32_eq_signaling( float32, float32 );
 151 flag float32_le_quiet( float32, float32 );
 152 flag float32_lt_quiet( float32, float32 );
 153 flag float32_is_signaling_nan( float32 );
 154
 155 /*----------------------------------------------------------------------------
 156 | Software IEC/IEEE double-precision conversion routines.
 157 *----------------------------------------------------------------------------*/
 158 int32 float64_to_int32( float64 );
 159 int32 float64_to_int32_round_to_zero( float64 );
 160 int64 float64_to_int64( float64 );
 161 int64 float64_to_int64_round_to_zero( float64 );
 162 float32 float64_to_float32( float64 );
 163 #ifdef FLOATX80
 164 floatx80 float64_to_floatx80( float64 );
 165 #endif
 166 #ifdef FLOAT128
 167 float128 float64_to_float128( float64 );
 168 #endif
 169
 170 /*----------------------------------------------------------------------------
 171 | Software IEC/IEEE double-precision operations.
 172 *----------------------------------------------------------------------------*/
 173 float64 float64_round_to_int( float64 );
 174 float64 float64_add( float64, float64 );
 175 float64 float64_sub( float64, float64 );
 176 float64 float64_mul( float64, float64 );
 177 float64 float64_div( float64, float64 );
 178 float64 float64_rem( float64, float64 );
 179 float64 float64_sqrt( float64 );
 180 flag float64_eq( float64, float64 );
 181 flag float64_le( float64, float64 );
 182 flag float64_lt( float64, float64 );
 183 flag float64_eq_signaling( float64, float64 );
 184 flag float64_le_quiet( float64, float64 );
 185 flag float64_lt_quiet( float64, float64 );
 186 flag float64_is_signaling_nan( float64 );
 187
 188 #ifdef FLOATX80
 189
 190 /*----------------------------------------------------------------------------
 191 | Software IEC/IEEE extended double-precision conversion routines.
 192 *----------------------------------------------------------------------------*/
 193 int32 floatx80_to_int32( floatx80 );
 194 int32 floatx80_to_int32_round_to_zero( floatx80 );
 195 int64 floatx80_to_int64( floatx80 );
 196 int64 floatx80_to_int64_round_to_zero( floatx80 );
 197 float32 floatx80_to_float32( floatx80 );
 198 float64 floatx80_to_float64( floatx80 );
 199 #ifdef FLOAT128
 200 float128 floatx80_to_float128( floatx80 );
 201 #endif
 202 floatx80 floatx80_scale(floatx80 a, floatx80 b);
 203
 204 /*----------------------------------------------------------------------------
 205 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 206 | extended double-precision floating-point value, returning the result.
 207 *----------------------------------------------------------------------------*/
 208
 209 static inline floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 210 {
 211         floatx80 z;
 212
 213         z.low = zSig;
 214         z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 215         return z;
 216
 217 }
 218
 219 /*----------------------------------------------------------------------------
 220 | Software IEC/IEEE extended double-precision rounding precision.  Valid
 221 | values are 32, 64, and 80.
 222 *----------------------------------------------------------------------------*/
 223 extern int8 floatx80_rounding_precision;
 224
 225 /*----------------------------------------------------------------------------
 226 | Software IEC/IEEE extended double-precision operations.
 227 *----------------------------------------------------------------------------*/
 228 floatx80 floatx80_round_to_int( floatx80 );
 229 floatx80 floatx80_add( floatx80, floatx80 );
 230 floatx80 floatx80_sub( floatx80, floatx80 );
 231 floatx80 floatx80_mul( floatx80, floatx80 );
 232 floatx80 floatx80_div( floatx80, floatx80 );
 233 floatx80 floatx80_rem( floatx80, floatx80 );
 234 floatx80 floatx80_sqrt( floatx80 );
 235 flag floatx80_eq( floatx80, floatx80 );
 236 flag floatx80_le( floatx80, floatx80 );
 237 flag floatx80_lt( floatx80, floatx80 );
 238 flag floatx80_eq_signaling( floatx80, floatx80 );
 239 flag floatx80_le_quiet( floatx80, floatx80 );
 240 flag floatx80_lt_quiet( floatx80, floatx80 );
 241 flag floatx80_is_signaling_nan( floatx80 );
 242
 243 /* int floatx80_fsin(floatx80 &a);
 244 int floatx80_fcos(floatx80 &a);
 245 int floatx80_ftan(floatx80 &a); */
 246
 247 floatx80 floatx80_flognp1(floatx80 a);
 248 floatx80 floatx80_flogn(floatx80 a);
 249 floatx80 floatx80_flog2(floatx80 a);
 250 floatx80 floatx80_flog10(floatx80 a);
 251
 252 // roundAndPackFloatx80 used to be in softfloat-round-pack, is now in softfloat.c
 253 floatx80 roundAndPackFloatx80(int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1);
 254
 255 #endif
 256
 257 #ifdef FLOAT128
 258
 259 /*----------------------------------------------------------------------------
 260 | Software IEC/IEEE quadruple-precision conversion routines.
 261 *----------------------------------------------------------------------------*/
 262 int32 float128_to_int32( float128 );
 263 int32 float128_to_int32_round_to_zero( float128 );
 264 int64 float128_to_int64( float128 );
 265 int64 float128_to_int64_round_to_zero( float128 );
 266 float32 float128_to_float32( float128 );
 267 float64 float128_to_float64( float128 );
 268 #ifdef FLOATX80
 269 floatx80 float128_to_floatx80( float128 );
 270 #endif
 271
 272 /*----------------------------------------------------------------------------
 273 | Software IEC/IEEE quadruple-precision operations.
 274 *----------------------------------------------------------------------------*/
 275 float128 float128_round_to_int( float128 );
 276 float128 float128_add( float128, float128 );
 277 float128 float128_sub( float128, float128 );
 278 float128 float128_mul( float128, float128 );
 279 float128 float128_div( float128, float128 );
 280 float128 float128_rem( float128, float128 );
 281 float128 float128_sqrt( float128 );
 282 flag float128_eq( float128, float128 );
 283 flag float128_le( float128, float128 );
 284 flag float128_lt( float128, float128 );
 285 flag float128_eq_signaling( float128, float128 );
 286 flag float128_le_quiet( float128, float128 );
 287 flag float128_lt_quiet( float128, float128 );
 288 flag float128_is_signaling_nan( float128 );
 289
 290 /*----------------------------------------------------------------------------
 291 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 292 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 293 | floating-point value, returning the result.  After being shifted into the
 294 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 295 | added together to form the most significant 32 bits of the result.  This
 296 | means that any integer portion of `zSig0' will be added into the exponent.
 297 | Since a properly normalized significand will have an integer portion equal
 298 | to 1, the `zExp' input should be 1 less than the desired result exponent
 299 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 300 | significand.
 301 *----------------------------------------------------------------------------*/
 302
 303 static inline float128
 304         packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 305 {
 306         float128 z;
 307
 308         z.low = zSig1;
 309         z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 310         return z;
 311
 312 }
 313
 314 /*----------------------------------------------------------------------------
 315 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 316 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 317 | and `zSig2', and returns the proper quadruple-precision floating-point value
 318 | corresponding to the abstract input.  Ordinarily, the abstract value is
 319 | simply rounded and packed into the quadruple-precision format, with the
 320 | inexact exception raised if the abstract input cannot be represented
 321 | exactly.  However, if the abstract value is too large, the overflow and
 322 | inexact exceptions are raised and an infinity or maximal finite value is
 323 | returned.  If the abstract value is too small, the input value is rounded to
 324 | a subnormal number, and the underflow and inexact exceptions are raised if
 325 | the abstract input cannot be represented exactly as a subnormal quadruple-
 326 | precision floating-point number.
 327 |     The input significand must be normalized or smaller.  If the input
 328 | significand is not normalized, `zExp' must be 0; in that case, the result
 329 | returned is a subnormal number, and it must not require rounding.  In the
 330 | usual case that the input significand is normalized, `zExp' must be 1 less
 331 | than the ``true'' floating-point exponent.  The handling of underflow and
 332 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 333 *----------------------------------------------------------------------------*/
 334
 335 static inline float128
 336         roundAndPackFloat128(
 337                 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
 338 {
 339         int8 roundingMode;
 340         flag roundNearestEven, increment, isTiny;
 341
 342         roundingMode = float_rounding_mode;
 343         roundNearestEven = ( roundingMode == float_round_nearest_even );
 344         increment = ( (sbits64) zSig2 < 0 );
 345         if ( ! roundNearestEven ) {
 346                 if ( roundingMode == float_round_to_zero ) {
 347                         increment = 0;
 348                 }
 349                 else {
 350                         if ( zSign ) {
 351                                 increment = ( roundingMode == float_round_down ) && zSig2;
 352                         }
 353                         else {
 354                                 increment = ( roundingMode == float_round_up ) && zSig2;
 355                         }
 356                 }
 357         }
 358         if ( 0x7FFD <= (bits32) zExp ) {
 359                 if (    ( 0x7FFD < zExp )
 360                                 || (    ( zExp == 0x7FFD )
 361                                         && eq128(
 362                                                         LIT64( 0x0001FFFFFFFFFFFF ),
 363                                                         LIT64( 0xFFFFFFFFFFFFFFFF ),
 364                                                         zSig0,
 365                                                         zSig1
 366                                                 )
 367                                         && increment
 368                                 )
 369                         ) {
 370                         float_raise( float_flag_overflow | float_flag_inexact );
 371                         if (    ( roundingMode == float_round_to_zero )
 372                                         || ( zSign && ( roundingMode == float_round_up ) )
 373                                         || ( ! zSign && ( roundingMode == float_round_down ) )
 374                                 ) {
 375                                 return
 376                                         packFloat128(
 377                                                 zSign,
 378                                                 0x7FFE,
 379                                                 LIT64( 0x0000FFFFFFFFFFFF ),
 380                                                 LIT64( 0xFFFFFFFFFFFFFFFF )
 381                                         );
 382                         }
 383                         return packFloat128( zSign, 0x7FFF, 0, 0 );
 384                 }
 385                 if ( zExp < 0 ) {
 386                         isTiny =
 387                                         ( float_detect_tininess == float_tininess_before_rounding )
 388                                 || ( zExp < -1 )
 389                                 || ! increment
 390                                 || lt128(
 391                                                 zSig0,
 392                                                 zSig1,
 393                                                 LIT64( 0x0001FFFFFFFFFFFF ),
 394                                                 LIT64( 0xFFFFFFFFFFFFFFFF )
 395                                         );
 396                         shift128ExtraRightJamming(
 397                                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 398                         zExp = 0;
 399                         if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
 400                         if ( roundNearestEven ) {
 401                                 increment = ( (sbits64) zSig2 < 0 );
 402                         }
 403                         else {
 404                                 if ( zSign ) {
 405                                         increment = ( roundingMode == float_round_down ) && zSig2;
 406                                 }
 407                                 else {
 408                                         increment = ( roundingMode == float_round_up ) && zSig2;
 409                                 }
 410                         }
 411                 }
 412         }
 413         if ( zSig2 ) float_exception_flags |= float_flag_inexact;
 414         if ( increment ) {
 415                 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
 416                 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
 417         }
 418         else {
 419                 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
 420         }
 421         return packFloat128( zSign, zExp, zSig0, zSig1 );
 422
 423 }
 424
 425 /*----------------------------------------------------------------------------
 426 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 427 | and significand formed by the concatenation of `zSig0' and `zSig1', and
 428 | returns the proper quadruple-precision floating-point value corresponding
 429 | to the abstract input.  This routine is just like `roundAndPackFloat128'
 430 | except that the input significand has fewer bits and does not have to be
 431 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 432 | point exponent.
 433 *----------------------------------------------------------------------------*/
 434
 435 static inline float128
 436         normalizeRoundAndPackFloat128(
 437                 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 438 {
 439         int8 shiftCount;
 440         bits64 zSig2;
 441
 442         if ( zSig0 == 0 ) {
 443                 zSig0 = zSig1;
 444                 zSig1 = 0;
 445                 zExp -= 64;
 446         }
 447         shiftCount = countLeadingZeros64( zSig0 ) - 15;
 448         if ( 0 <= shiftCount ) {
 449                 zSig2 = 0;
 450                 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 451         }
 452         else {
 453                 shift128ExtraRightJamming(
 454                         zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
 455         }
 456         zExp -= shiftCount;
 457         return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
 458
 459 }
 460 #endif