git.sesse.net Git - pistorm/blob - softfloat/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 #include "m68kcpu.h" // which includes softfloat.h after defining the basic types
  34
  35 /*----------------------------------------------------------------------------
  36 | Floating-point rounding mode, extended double-precision rounding precision,
  37 | and exception flags.
  38 *----------------------------------------------------------------------------*/
  39 int8 float_exception_flags = 0;
  40 #ifdef FLOATX80
  41 int8 floatx80_rounding_precision = 80;
  42 #endif
  43
  44 int8 float_rounding_mode = float_round_nearest_even;
  45
  46 /*----------------------------------------------------------------------------
  47 | Functions and definitions to determine:  (1) whether tininess for underflow
  48 | is detected before or after rounding by default, (2) what (if anything)
  49 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  50 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  51 | are propagated from function inputs to output.  These details are target-
  52 | specific.
  53 *----------------------------------------------------------------------------*/
  54 #include "softfloat-specialize"
  55
  56 /*----------------------------------------------------------------------------
  57 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  58 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  59 | input.  If `zSign' is 1, the input is negated before being converted to an
  60 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  61 | is simply rounded to an integer, with the inexact exception raised if the
  62 | input cannot be represented exactly as an integer.  However, if the fixed-
  63 | point input is too large, the invalid exception is raised and the largest
  64 | positive or negative integer is returned.
  65 *----------------------------------------------------------------------------*/
  66
  67 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
  68 {
  69         int8 roundingMode;
  70         flag roundNearestEven;
  71         int8 roundIncrement, roundBits;
  72         int32 z;
  73
  74         roundingMode = float_rounding_mode;
  75         roundNearestEven = ( roundingMode == float_round_nearest_even );
  76         roundIncrement = 0x40;
  77         if ( ! roundNearestEven ) {
  78                 if ( roundingMode == float_round_to_zero ) {
  79                         roundIncrement = 0;
  80                 }
  81                 else {
  82                         roundIncrement = 0x7F;
  83                         if ( zSign ) {
  84                                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  85                         }
  86                         else {
  87                                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  88                         }
  89                 }
  90         }
  91         roundBits = absZ & 0x7F;
  92         absZ = ( absZ + roundIncrement )>>7;
  93         absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
  94         z = absZ;
  95         if ( zSign ) z = - z;
  96         if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
  97                 float_raise( float_flag_invalid );
  98                 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
  99         }
 100         if ( roundBits ) float_exception_flags |= float_flag_inexact;
 101         return z;
 102
 103 }
 104
 105 /*----------------------------------------------------------------------------
 106 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 107 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 108 | and returns the properly rounded 64-bit integer corresponding to the input.
 109 | If `zSign' is 1, the input is negated before being converted to an integer.
 110 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 111 | the inexact exception raised if the input cannot be represented exactly as
 112 | an integer.  However, if the fixed-point input is too large, the invalid
 113 | exception is raised and the largest positive or negative integer is
 114 | returned.
 115 *----------------------------------------------------------------------------*/
 116
 117 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
 118 {
 119         int8 roundingMode;
 120         flag roundNearestEven, increment;
 121         int64 z;
 122
 123         roundingMode = float_rounding_mode;
 124         roundNearestEven = ( roundingMode == float_round_nearest_even );
 125         increment = ( (sbits64) absZ1 < 0 );
 126         if ( ! roundNearestEven ) {
 127                 if ( roundingMode == float_round_to_zero ) {
 128                         increment = 0;
 129                 }
 130                 else {
 131                         if ( zSign ) {
 132                                 increment = ( roundingMode == float_round_down ) && absZ1;
 133                         }
 134                         else {
 135                                 increment = ( roundingMode == float_round_up ) && absZ1;
 136                         }
 137                 }
 138         }
 139         if ( increment ) {
 140                 ++absZ0;
 141                 if ( absZ0 == 0 ) goto overflow;
 142                 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 143         }
 144         z = absZ0;
 145         if ( zSign ) z = - z;
 146         if ( z && ( ( z < 0 ) ^ zSign ) ) {
 147         overflow:
 148                 float_raise( float_flag_invalid );
 149                 return
 150                                 zSign ? (sbits64) LIT64( 0x8000000000000000 )
 151                         : (sbits64) LIT64( 0x7FFFFFFFFFFFFFFF );
 152         }
 153         if ( absZ1 ) float_exception_flags |= float_flag_inexact;
 154         return z;
 155
 156 }
 157
 158 /*----------------------------------------------------------------------------
 159 | Returns the fraction bits of the single-precision floating-point value `a'.
 160 *----------------------------------------------------------------------------*/
 161
 162 static inline bits32 extractFloat32Frac( float32 a )
 163 {
 164         return a & 0x007FFFFF;
 165
 166 }
 167
 168 /*----------------------------------------------------------------------------
 169 | Returns the exponent bits of the single-precision floating-point value `a'.
 170 *----------------------------------------------------------------------------*/
 171
 172 static inline int16 extractFloat32Exp( float32 a )
 173 {
 174         return ( a>>23 ) & 0xFF;
 175
 176 }
 177
 178 /*----------------------------------------------------------------------------
 179 | Returns the sign bit of the single-precision floating-point value `a'.
 180 *----------------------------------------------------------------------------*/
 181
 182 static inline flag extractFloat32Sign( float32 a )
 183 {
 184         return a>>31;
 185
 186 }
 187
 188 /*----------------------------------------------------------------------------
 189 | Normalizes the subnormal single-precision floating-point value represented
 190 | by the denormalized significand `aSig'.  The normalized exponent and
 191 | significand are stored at the locations pointed to by `zExpPtr' and
 192 | `zSigPtr', respectively.
 193 *----------------------------------------------------------------------------*/
 194
 195 static void
 196         normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 197 {
 198         int8 shiftCount;
 199
 200         shiftCount = countLeadingZeros32( aSig ) - 8;
 201         *zSigPtr = aSig<<shiftCount;
 202         *zExpPtr = 1 - shiftCount;
 203
 204 }
 205
 206 /*----------------------------------------------------------------------------
 207 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 208 | single-precision floating-point value, returning the result.  After being
 209 | shifted into the proper positions, the three fields are simply added
 210 | together to form the result.  This means that any integer portion of `zSig'
 211 | will be added into the exponent.  Since a properly normalized significand
 212 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 213 | than the desired result exponent whenever `zSig' is a complete, normalized
 214 | significand.
 215 *----------------------------------------------------------------------------*/
 216
 217 static inline float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 218 {
 219         return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 220
 221 }
 222
 223 /*----------------------------------------------------------------------------
 224 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 225 | and significand `zSig', and returns the proper single-precision floating-
 226 | point value corresponding to the abstract input.  Ordinarily, the abstract
 227 | value is simply rounded and packed into the single-precision format, with
 228 | the inexact exception raised if the abstract input cannot be represented
 229 | exactly.  However, if the abstract value is too large, the overflow and
 230 | inexact exceptions are raised and an infinity or maximal finite value is
 231 | returned.  If the abstract value is too small, the input value is rounded to
 232 | a subnormal number, and the underflow and inexact exceptions are raised if
 233 | the abstract input cannot be represented exactly as a subnormal single-
 234 | precision floating-point number.
 235 |     The input significand `zSig' has its binary point between bits 30
 236 | and 29, which is 7 bits to the left of the usual location.  This shifted
 237 | significand must be normalized or smaller.  If `zSig' is not normalized,
 238 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 239 | and it must not require rounding.  In the usual case that `zSig' is
 240 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 241 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 242 | Binary Floating-Point Arithmetic.
 243 *----------------------------------------------------------------------------*/
 244
 245 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 246 {
 247         int8 roundingMode;
 248         flag roundNearestEven;
 249         int8 roundIncrement, roundBits;
 250         flag isTiny;
 251
 252         roundingMode = float_rounding_mode;
 253         roundNearestEven = ( roundingMode == float_round_nearest_even );
 254         roundIncrement = 0x40;
 255         if ( ! roundNearestEven ) {
 256                 if ( roundingMode == float_round_to_zero ) {
 257                         roundIncrement = 0;
 258                 }
 259                 else {
 260                         roundIncrement = 0x7F;
 261                         if ( zSign ) {
 262                                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 263                         }
 264                         else {
 265                                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 266                         }
 267                 }
 268         }
 269         roundBits = zSig & 0x7F;
 270         if ( 0xFD <= (bits16) zExp ) {
 271                 if (    ( 0xFD < zExp )
 272                                 || (    ( zExp == 0xFD )
 273                                         && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 274                         ) {
 275                         float_raise( float_flag_overflow | float_flag_inexact );
 276                         return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 277                 }
 278                 if ( zExp < 0 ) {
 279                         isTiny =
 280                                         ( float_detect_tininess == float_tininess_before_rounding )
 281                                 || ( zExp < -1 )
 282                                 || ( zSig + roundIncrement < 0x80000000 );
 283                         shift32RightJamming( zSig, - zExp, &zSig );
 284                         zExp = 0;
 285                         roundBits = zSig & 0x7F;
 286                         if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 287                 }
 288         }
 289         if ( roundBits ) float_exception_flags |= float_flag_inexact;
 290         zSig = ( zSig + roundIncrement )>>7;
 291         zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 292         if ( zSig == 0 ) zExp = 0;
 293         return packFloat32( zSign, zExp, zSig );
 294
 295 }
 296
 297 /*----------------------------------------------------------------------------
 298 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 299 | and significand `zSig', and returns the proper single-precision floating-
 300 | point value corresponding to the abstract input.  This routine is just like
 301 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 302 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 303 | floating-point exponent.
 304 *----------------------------------------------------------------------------*/
 305
 306 static float32
 307         normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 308 {
 309         int8 shiftCount;
 310
 311         shiftCount = countLeadingZeros32( zSig ) - 1;
 312         return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
 313
 314 }
 315
 316 /*----------------------------------------------------------------------------
 317 | Returns the fraction bits of the double-precision floating-point value `a'.
 318 *----------------------------------------------------------------------------*/
 319
 320 static inline bits64 extractFloat64Frac( float64 a )
 321 {
 322         return a & LIT64( 0x000FFFFFFFFFFFFF );
 323
 324 }
 325
 326 /*----------------------------------------------------------------------------
 327 | Returns the exponent bits of the double-precision floating-point value `a'.
 328 *----------------------------------------------------------------------------*/
 329
 330 static inline int16 extractFloat64Exp( float64 a )
 331 {
 332         return ( a>>52 ) & 0x7FF;
 333
 334 }
 335
 336 /*----------------------------------------------------------------------------
 337 | Returns the sign bit of the double-precision floating-point value `a'.
 338 *----------------------------------------------------------------------------*/
 339
 340 static inline flag extractFloat64Sign( float64 a )
 341 {
 342         return a>>63;
 343
 344 }
 345
 346 /*----------------------------------------------------------------------------
 347 | Normalizes the subnormal double-precision floating-point value represented
 348 | by the denormalized significand `aSig'.  The normalized exponent and
 349 | significand are stored at the locations pointed to by `zExpPtr' and
 350 | `zSigPtr', respectively.
 351 *----------------------------------------------------------------------------*/
 352
 353 static void
 354         normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 355 {
 356         int8 shiftCount;
 357
 358         shiftCount = countLeadingZeros64( aSig ) - 11;
 359         *zSigPtr = aSig<<shiftCount;
 360         *zExpPtr = 1 - shiftCount;
 361
 362 }
 363
 364 /*----------------------------------------------------------------------------
 365 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 366 | double-precision floating-point value, returning the result.  After being
 367 | shifted into the proper positions, the three fields are simply added
 368 | together to form the result.  This means that any integer portion of `zSig'
 369 | will be added into the exponent.  Since a properly normalized significand
 370 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 371 | than the desired result exponent whenever `zSig' is a complete, normalized
 372 | significand.
 373 *----------------------------------------------------------------------------*/
 374
 375 static inline float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 376 {
 377         return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
 378
 379 }
 380
 381 /*----------------------------------------------------------------------------
 382 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 383 | and significand `zSig', and returns the proper double-precision floating-
 384 | point value corresponding to the abstract input.  Ordinarily, the abstract
 385 | value is simply rounded and packed into the double-precision format, with
 386 | the inexact exception raised if the abstract input cannot be represented
 387 | exactly.  However, if the abstract value is too large, the overflow and
 388 | inexact exceptions are raised and an infinity or maximal finite value is
 389 | returned.  If the abstract value is too small, the input value is rounded
 390 | to a subnormal number, and the underflow and inexact exceptions are raised
 391 | if the abstract input cannot be represented exactly as a subnormal double-
 392 | precision floating-point number.
 393 |     The input significand `zSig' has its binary point between bits 62
 394 | and 61, which is 10 bits to the left of the usual location.  This shifted
 395 | significand must be normalized or smaller.  If `zSig' is not normalized,
 396 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 397 | and it must not require rounding.  In the usual case that `zSig' is
 398 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 399 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 400 | Binary Floating-Point Arithmetic.
 401 *----------------------------------------------------------------------------*/
 402
 403 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 404 {
 405         int8 roundingMode;
 406         flag roundNearestEven;
 407         int16 roundIncrement, roundBits;
 408         flag isTiny;
 409
 410         roundingMode = float_rounding_mode;
 411         roundNearestEven = ( roundingMode == float_round_nearest_even );
 412         roundIncrement = 0x200;
 413         if ( ! roundNearestEven ) {
 414                 if ( roundingMode == float_round_to_zero ) {
 415                         roundIncrement = 0;
 416                 }
 417                 else {
 418                         roundIncrement = 0x3FF;
 419                         if ( zSign ) {
 420                                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 421                         }
 422                         else {
 423                                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 424                         }
 425                 }
 426         }
 427         roundBits = zSig & 0x3FF;
 428         if ( 0x7FD <= (bits16) zExp ) {
 429                 if (    ( 0x7FD < zExp )
 430                                 || (    ( zExp == 0x7FD )
 431                                         && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 432                         ) {
 433                         float_raise( float_flag_overflow | float_flag_inexact );
 434                         return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
 435                 }
 436                 if ( zExp < 0 ) {
 437                         isTiny =
 438                                         ( float_detect_tininess == float_tininess_before_rounding )
 439                                 || ( zExp < -1 )
 440                                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 441                         shift64RightJamming( zSig, - zExp, &zSig );
 442                         zExp = 0;
 443                         roundBits = zSig & 0x3FF;
 444                         if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 445                 }
 446         }
 447         if ( roundBits ) float_exception_flags |= float_flag_inexact;
 448         zSig = ( zSig + roundIncrement )>>10;
 449         zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 450         if ( zSig == 0 ) zExp = 0;
 451         return packFloat64( zSign, zExp, zSig );
 452
 453 }
 454
 455 /*----------------------------------------------------------------------------
 456 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 457 | and significand `zSig', and returns the proper double-precision floating-
 458 | point value corresponding to the abstract input.  This routine is just like
 459 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 460 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 461 | floating-point exponent.
 462 *----------------------------------------------------------------------------*/
 463
 464 static float64
 465         normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 466 {
 467         int8 shiftCount;
 468
 469         shiftCount = countLeadingZeros64( zSig ) - 1;
 470         return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
 471
 472 }
 473
 474 #ifdef FLOATX80
 475
 476 /*----------------------------------------------------------------------------
 477 | Normalizes the subnormal extended double-precision floating-point value
 478 | represented by the denormalized significand `aSig'.  The normalized exponent
 479 | and significand are stored at the locations pointed to by `zExpPtr' and
 480 | `zSigPtr', respectively.
 481 *----------------------------------------------------------------------------*/
 482
 483 static void
 484         normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 485 {
 486         int8 shiftCount;
 487
 488         shiftCount = countLeadingZeros64( aSig );
 489         *zSigPtr = aSig<<shiftCount;
 490         *zExpPtr = 1 - shiftCount;
 491
 492 }
 493
 494 /*----------------------------------------------------------------------------
 495 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 496 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 497 | and returns the proper extended double-precision floating-point value
 498 | corresponding to the abstract input.  Ordinarily, the abstract value is
 499 | rounded and packed into the extended double-precision format, with the
 500 | inexact exception raised if the abstract input cannot be represented
 501 | exactly.  However, if the abstract value is too large, the overflow and
 502 | inexact exceptions are raised and an infinity or maximal finite value is
 503 | returned.  If the abstract value is too small, the input value is rounded to
 504 | a subnormal number, and the underflow and inexact exceptions are raised if
 505 | the abstract input cannot be represented exactly as a subnormal extended
 506 | double-precision floating-point number.
 507 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 508 | number of bits as single or double precision, respectively.  Otherwise, the
 509 | result is rounded to the full precision of the extended double-precision
 510 | format.
 511 |     The input significand must be normalized or smaller.  If the input
 512 | significand is not normalized, `zExp' must be 0; in that case, the result
 513 | returned is a subnormal number, and it must not require rounding.  The
 514 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 515 | Floating-Point Arithmetic.
 516 *----------------------------------------------------------------------------*/
 517
 518 // roundAndPackFloatx80 is now also used in fyl2x.c
 519
 520 /* static */ floatx80
 521         roundAndPackFloatx80(
 522                 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 523         )
 524 {
 525         int8 roundingMode;
 526         flag roundNearestEven, increment, isTiny;
 527         int64 roundIncrement, roundMask, roundBits;
 528
 529         roundingMode = float_rounding_mode;
 530         roundNearestEven = ( roundingMode == float_round_nearest_even );
 531         if ( roundingPrecision == 80 ) goto precision80;
 532         if ( roundingPrecision == 64 ) {
 533                 roundIncrement = LIT64( 0x0000000000000400 );
 534                 roundMask = LIT64( 0x00000000000007FF );
 535         }
 536         else if ( roundingPrecision == 32 ) {
 537                 roundIncrement = LIT64( 0x0000008000000000 );
 538                 roundMask = LIT64( 0x000000FFFFFFFFFF );
 539         }
 540         else {
 541                 goto precision80;
 542         }
 543         zSig0 |= ( zSig1 != 0 );
 544         if ( ! roundNearestEven ) {
 545                 if ( roundingMode == float_round_to_zero ) {
 546                         roundIncrement = 0;
 547                 }
 548                 else {
 549                         roundIncrement = roundMask;
 550                         if ( zSign ) {
 551                                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 552                         }
 553                         else {
 554                                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 555                         }
 556                 }
 557         }
 558         roundBits = zSig0 & roundMask;
 559         if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 560                 if (    ( 0x7FFE < zExp )
 561                                 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 562                         ) {
 563                         goto overflow;
 564                 }
 565                 if ( zExp <= 0 ) {
 566                         isTiny =
 567                                         ( float_detect_tininess == float_tininess_before_rounding )
 568                                 || ( zExp < 0 )
 569                                 || ( zSig0 <= zSig0 + roundIncrement );
 570                         shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 571                         zExp = 0;
 572                         roundBits = zSig0 & roundMask;
 573                         if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 574                         if ( roundBits ) float_exception_flags |= float_flag_inexact;
 575                         zSig0 += roundIncrement;
 576                         if ( (sbits64) zSig0 < 0 ) zExp = 1;
 577                         roundIncrement = roundMask + 1;
 578                         if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 579                                 roundMask |= roundIncrement;
 580                         }
 581                         zSig0 &= ~ roundMask;
 582                         return packFloatx80( zSign, zExp, zSig0 );
 583                 }
 584         }
 585         if ( roundBits ) float_exception_flags |= float_flag_inexact;
 586         zSig0 += roundIncrement;
 587         if ( zSig0 < (bits64)roundIncrement ) {
 588                 ++zExp;
 589                 zSig0 = LIT64( 0x8000000000000000 );
 590         }
 591         roundIncrement = roundMask + 1;
 592         if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 593                 roundMask |= roundIncrement;
 594         }
 595         zSig0 &= ~ roundMask;
 596         if ( zSig0 == 0 ) zExp = 0;
 597         return packFloatx80( zSign, zExp, zSig0 );
 598         precision80:
 599         increment = ( (sbits64) zSig1 < 0 );
 600         if ( ! roundNearestEven ) {
 601                 if ( roundingMode == float_round_to_zero ) {
 602                         increment = 0;
 603                 }
 604                 else {
 605                         if ( zSign ) {
 606                                 increment = ( roundingMode == float_round_down ) && zSig1;
 607                         }
 608                         else {
 609                                 increment = ( roundingMode == float_round_up ) && zSig1;
 610                         }
 611                 }
 612         }
 613         if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 614                 if (    ( 0x7FFE < zExp )
 615                                 || (    ( zExp == 0x7FFE )
 616                                         && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 617                                         && increment
 618                                 )
 619                         ) {
 620                         roundMask = 0;
 621         overflow:
 622                         float_raise( float_flag_overflow | float_flag_inexact );
 623                         if (    ( roundingMode == float_round_to_zero )
 624                                         || ( zSign && ( roundingMode == float_round_up ) )
 625                                         || ( ! zSign && ( roundingMode == float_round_down ) )
 626                                 ) {
 627                                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 628                         }
 629                         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 630                 }
 631                 if ( zExp <= 0 ) {
 632                         isTiny =
 633                                         ( float_detect_tininess == float_tininess_before_rounding )
 634                                 || ( zExp < 0 )
 635                                 || ! increment
 636                                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 637                         shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 638                         zExp = 0;
 639                         if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
 640                         if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 641                         if ( roundNearestEven ) {
 642                                 increment = ( (sbits64) zSig1 < 0 );
 643                         }
 644                         else {
 645                                 if ( zSign ) {
 646                                         increment = ( roundingMode == float_round_down ) && zSig1;
 647                                 }
 648                                 else {
 649                                         increment = ( roundingMode == float_round_up ) && zSig1;
 650                                 }
 651                         }
 652                         if ( increment ) {
 653                                 ++zSig0;
 654                                 zSig0 &=
 655                                         ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 656                                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 657                         }
 658                         return packFloatx80( zSign, zExp, zSig0 );
 659                 }
 660         }
 661         if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 662         if ( increment ) {
 663                 ++zSig0;
 664                 if ( zSig0 == 0 ) {
 665                         ++zExp;
 666                         zSig0 = LIT64( 0x8000000000000000 );
 667                 }
 668                 else {
 669                         zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 670                 }
 671         }
 672         else {
 673                 if ( zSig0 == 0 ) zExp = 0;
 674         }
 675         return packFloatx80( zSign, zExp, zSig0 );
 676
 677 }
 678
 679 /*----------------------------------------------------------------------------
 680 | Takes an abstract floating-point value having sign `zSign', exponent
 681 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 682 | and returns the proper extended double-precision floating-point value
 683 | corresponding to the abstract input.  This routine is just like
 684 | `roundAndPackFloatx80' except that the input significand does not have to be
 685 | normalized.
 686 *----------------------------------------------------------------------------*/
 687
 688 static floatx80
 689         normalizeRoundAndPackFloatx80(
 690                 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 691         )
 692 {
 693         int8 shiftCount;
 694
 695         if ( zSig0 == 0 ) {
 696                 zSig0 = zSig1;
 697                 zSig1 = 0;
 698                 zExp -= 64;
 699         }
 700         shiftCount = countLeadingZeros64( zSig0 );
 701         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 702         zExp -= shiftCount;
 703         return
 704                 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
 705
 706 }
 707
 708 #endif
 709
 710 #ifdef FLOAT128
 711
 712 /*----------------------------------------------------------------------------
 713 | Returns the least-significant 64 fraction bits of the quadruple-precision
 714 | floating-point value `a'.
 715 *----------------------------------------------------------------------------*/
 716
 717 static inline bits64 extractFloat128Frac1( float128 a )
 718 {
 719         return a.low;
 720
 721 }
 722
 723 /*----------------------------------------------------------------------------
 724 | Returns the most-significant 48 fraction bits of the quadruple-precision
 725 | floating-point value `a'.
 726 *----------------------------------------------------------------------------*/
 727
 728 static inline bits64 extractFloat128Frac0( float128 a )
 729 {
 730         return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 731
 732 }
 733
 734 /*----------------------------------------------------------------------------
 735 | Returns the exponent bits of the quadruple-precision floating-point value
 736 | `a'.
 737 *----------------------------------------------------------------------------*/
 738
 739 static inline int32 extractFloat128Exp( float128 a )
 740 {
 741         return ( a.high>>48 ) & 0x7FFF;
 742
 743 }
 744
 745 /*----------------------------------------------------------------------------
 746 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 747 *----------------------------------------------------------------------------*/
 748
 749 static inline flag extractFloat128Sign( float128 a )
 750 {
 751         return a.high>>63;
 752
 753 }
 754
 755 /*----------------------------------------------------------------------------
 756 | Normalizes the subnormal quadruple-precision floating-point value
 757 | represented by the denormalized significand formed by the concatenation of
 758 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 759 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 760 | significand are stored at the location pointed to by `zSig0Ptr', and the
 761 | least significant 64 bits of the normalized significand are stored at the
 762 | location pointed to by `zSig1Ptr'.
 763 *----------------------------------------------------------------------------*/
 764
 765 static void
 766         normalizeFloat128Subnormal(
 767                 bits64 aSig0,
 768                 bits64 aSig1,
 769                 int32 *zExpPtr,
 770                 bits64 *zSig0Ptr,
 771                 bits64 *zSig1Ptr
 772         )
 773 {
 774         int8 shiftCount;
 775
 776         if ( aSig0 == 0 ) {
 777                 shiftCount = countLeadingZeros64( aSig1 ) - 15;
 778                 if ( shiftCount < 0 ) {
 779                         *zSig0Ptr = aSig1>>( - shiftCount );
 780                         *zSig1Ptr = aSig1<<( shiftCount & 63 );
 781                 }
 782                 else {
 783                         *zSig0Ptr = aSig1<<shiftCount;
 784                         *zSig1Ptr = 0;
 785                 }
 786                 *zExpPtr = - shiftCount - 63;
 787         }
 788         else {
 789                 shiftCount = countLeadingZeros64( aSig0 ) - 15;
 790                 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 791                 *zExpPtr = 1 - shiftCount;
 792         }
 793
 794 }
 795
 796 #endif
 797
 798 /*----------------------------------------------------------------------------
 799 | Returns the result of converting the 32-bit two's complement integer `a'
 800 | to the single-precision floating-point format.  The conversion is performed
 801 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 802 *----------------------------------------------------------------------------*/
 803
 804 float32 int32_to_float32( int32 a )
 805 {
 806         flag zSign;
 807
 808         if ( a == 0 ) return 0;
 809         if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
 810         zSign = ( a < 0 );
 811         return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
 812
 813 }
 814
 815 /*----------------------------------------------------------------------------
 816 | Returns the result of converting the 32-bit two's complement integer `a'
 817 | to the double-precision floating-point format.  The conversion is performed
 818 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 819 *----------------------------------------------------------------------------*/
 820
 821 float64 int32_to_float64( int32 a )
 822 {
 823         flag zSign;
 824         uint32 absA;
 825         int8 shiftCount;
 826         bits64 zSig;
 827
 828         if ( a == 0 ) return 0;
 829         zSign = ( a < 0 );
 830         absA = zSign ? - a : a;
 831         shiftCount = countLeadingZeros32( absA ) + 21;
 832         zSig = absA;
 833         return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
 834
 835 }
 836
 837 #ifdef FLOATX80
 838
 839 /*----------------------------------------------------------------------------
 840 | Returns the result of converting the 32-bit two's complement integer `a'
 841 | to the extended double-precision floating-point format.  The conversion
 842 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
 843 | Arithmetic.
 844 *----------------------------------------------------------------------------*/
 845
 846 floatx80 int32_to_floatx80( int32 a )
 847 {
 848         flag zSign;
 849         uint32 absA;
 850         int8 shiftCount;
 851         bits64 zSig;
 852
 853         if ( a == 0 ) return packFloatx80( 0, 0, 0 );
 854         zSign = ( a < 0 );
 855         absA = zSign ? - a : a;
 856         shiftCount = countLeadingZeros32( absA ) + 32;
 857         zSig = absA;
 858         return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
 859
 860 }
 861
 862 #endif
 863
 864 #ifdef FLOAT128
 865
 866 /*----------------------------------------------------------------------------
 867 | Returns the result of converting the 32-bit two's complement integer `a' to
 868 | the quadruple-precision floating-point format.  The conversion is performed
 869 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 870 *----------------------------------------------------------------------------*/
 871
 872 float128 int32_to_float128( int32 a )
 873 {
 874         flag zSign;
 875         uint32 absA;
 876         int8 shiftCount;
 877         bits64 zSig0;
 878
 879         if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
 880         zSign = ( a < 0 );
 881         absA = zSign ? - a : a;
 882         shiftCount = countLeadingZeros32( absA ) + 17;
 883         zSig0 = absA;
 884         return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
 885
 886 }
 887
 888 #endif
 889
 890 /*----------------------------------------------------------------------------
 891 | Returns the result of converting the 64-bit two's complement integer `a'
 892 | to the single-precision floating-point format.  The conversion is performed
 893 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 894 *----------------------------------------------------------------------------*/
 895
 896 float32 int64_to_float32( int64 a )
 897 {
 898         flag zSign;
 899         uint64 absA;
 900         int8 shiftCount;
 901 //    bits32 zSig;
 902
 903         if ( a == 0 ) return 0;
 904         zSign = ( a < 0 );
 905         absA = zSign ? - a : a;
 906         shiftCount = countLeadingZeros64( absA ) - 40;
 907         if ( 0 <= shiftCount ) {
 908                 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
 909         }
 910         else {
 911                 shiftCount += 7;
 912                 if ( shiftCount < 0 ) {
 913                         shift64RightJamming( absA, - shiftCount, &absA );
 914                 }
 915                 else {
 916                         absA <<= shiftCount;
 917                 }
 918                 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
 919         }
 920
 921 }
 922
 923 /*----------------------------------------------------------------------------
 924 | Returns the result of converting the 64-bit two's complement integer `a'
 925 | to the double-precision floating-point format.  The conversion is performed
 926 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 927 *----------------------------------------------------------------------------*/
 928
 929 float64 int64_to_float64( int64 a )
 930 {
 931         flag zSign;
 932
 933         if ( a == 0 ) return 0;
 934         if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
 935                 return packFloat64( 1, 0x43E, 0 );
 936         }
 937         zSign = ( a < 0 );
 938         return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
 939
 940 }
 941
 942 #ifdef FLOATX80
 943
 944 /*----------------------------------------------------------------------------
 945 | Returns the result of converting the 64-bit two's complement integer `a'
 946 | to the extended double-precision floating-point format.  The conversion
 947 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
 948 | Arithmetic.
 949 *----------------------------------------------------------------------------*/
 950
 951 floatx80 int64_to_floatx80( int64 a )
 952 {
 953         flag zSign;
 954         uint64 absA;
 955         int8 shiftCount;
 956
 957         if ( a == 0 ) return packFloatx80( 0, 0, 0 );
 958         zSign = ( a < 0 );
 959         absA = zSign ? - a : a;
 960         shiftCount = countLeadingZeros64( absA );
 961         return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
 962
 963 }
 964
 965 #endif
 966
 967 #ifdef FLOAT128
 968
 969 /*----------------------------------------------------------------------------
 970 | Returns the result of converting the 64-bit two's complement integer `a' to
 971 | the quadruple-precision floating-point format.  The conversion is performed
 972 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 973 *----------------------------------------------------------------------------*/
 974
 975 float128 int64_to_float128( int64 a )
 976 {
 977         flag zSign;
 978         uint64 absA;
 979         int8 shiftCount;
 980         int32 zExp;
 981         bits64 zSig0, zSig1;
 982
 983         if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
 984         zSign = ( a < 0 );
 985         absA = zSign ? - a : a;
 986         shiftCount = countLeadingZeros64( absA ) + 49;
 987         zExp = 0x406E - shiftCount;
 988         if ( 64 <= shiftCount ) {
 989                 zSig1 = 0;
 990                 zSig0 = absA;
 991                 shiftCount -= 64;
 992         }
 993         else {
 994                 zSig1 = absA;
 995                 zSig0 = 0;
 996         }
 997         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 998         return packFloat128( zSign, zExp, zSig0, zSig1 );
 999
1000 }
1001
1002 #endif
1003
1004 /*----------------------------------------------------------------------------
1005 | Returns the result of converting the single-precision floating-point value
1006 | `a' to the 32-bit two's complement integer format.  The conversion is
1007 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1008 | Arithmetic---which means in particular that the conversion is rounded
1009 | according to the current rounding mode.  If `a' is a NaN, the largest
1010 | positive integer is returned.  Otherwise, if the conversion overflows, the
1011 | largest integer with the same sign as `a' is returned.
1012 *----------------------------------------------------------------------------*/
1013
1014 int32 float32_to_int32( float32 a )
1015 {
1016         flag aSign;
1017         int16 aExp, shiftCount;
1018         bits32 aSig;
1019         bits64 aSig64;
1020
1021         aSig = extractFloat32Frac( a );
1022         aExp = extractFloat32Exp( a );
1023         aSign = extractFloat32Sign( a );
1024         if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1025         if ( aExp ) aSig |= 0x00800000;
1026         shiftCount = 0xAF - aExp;
1027         aSig64 = aSig;
1028         aSig64 <<= 32;
1029         if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1030         return roundAndPackInt32( aSign, aSig64 );
1031
1032 }
1033
1034 /*----------------------------------------------------------------------------
1035 | Returns the result of converting the single-precision floating-point value
1036 | `a' to the 32-bit two's complement integer format.  The conversion is
1037 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1038 | Arithmetic, except that the conversion is always rounded toward zero.
1039 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1040 | the conversion overflows, the largest integer with the same sign as `a' is
1041 | returned.
1042 *----------------------------------------------------------------------------*/
1043
1044 int32 float32_to_int32_round_to_zero( float32 a )
1045 {
1046         flag aSign;
1047         int16 aExp, shiftCount;
1048         bits32 aSig;
1049         int32 z;
1050
1051         aSig = extractFloat32Frac( a );
1052         aExp = extractFloat32Exp( a );
1053         aSign = extractFloat32Sign( a );
1054         shiftCount = aExp - 0x9E;
1055         if ( 0 <= shiftCount ) {
1056                 if ( a != 0xCF000000 ) {
1057                         float_raise( float_flag_invalid );
1058                         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1059                 }
1060                 return (sbits32) 0x80000000;
1061         }
1062         else if ( aExp <= 0x7E ) {
1063                 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1064                 return 0;
1065         }
1066         aSig = ( aSig | 0x00800000 )<<8;
1067         z = aSig>>( - shiftCount );
1068         if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1069                 float_exception_flags |= float_flag_inexact;
1070         }
1071         if ( aSign ) z = - z;
1072         return z;
1073
1074 }
1075
1076 /*----------------------------------------------------------------------------
1077 | Returns the result of converting the single-precision floating-point value
1078 | `a' to the 64-bit two's complement integer format.  The conversion is
1079 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1080 | Arithmetic---which means in particular that the conversion is rounded
1081 | according to the current rounding mode.  If `a' is a NaN, the largest
1082 | positive integer is returned.  Otherwise, if the conversion overflows, the
1083 | largest integer with the same sign as `a' is returned.
1084 *----------------------------------------------------------------------------*/
1085
1086 int64 float32_to_int64( float32 a )
1087 {
1088         flag aSign;
1089         int16 aExp, shiftCount;
1090         bits32 aSig;
1091         bits64 aSig64, aSigExtra;
1092
1093         aSig = extractFloat32Frac( a );
1094         aExp = extractFloat32Exp( a );
1095         aSign = extractFloat32Sign( a );
1096         shiftCount = 0xBE - aExp;
1097         if ( shiftCount < 0 ) {
1098                 float_raise( float_flag_invalid );
1099                 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1100                         return LIT64( 0x7FFFFFFFFFFFFFFF );
1101                 }
1102                 return (sbits64) LIT64( 0x8000000000000000 );
1103         }
1104         if ( aExp ) aSig |= 0x00800000;
1105         aSig64 = aSig;
1106         aSig64 <<= 40;
1107         shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1108         return roundAndPackInt64( aSign, aSig64, aSigExtra );
1109
1110 }
1111
1112 /*----------------------------------------------------------------------------
1113 | Returns the result of converting the single-precision floating-point value
1114 | `a' to the 64-bit two's complement integer format.  The conversion is
1115 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1116 | Arithmetic, except that the conversion is always rounded toward zero.  If
1117 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1118 | conversion overflows, the largest integer with the same sign as `a' is
1119 | returned.
1120 *----------------------------------------------------------------------------*/
1121
1122 int64 float32_to_int64_round_to_zero( float32 a )
1123 {
1124         flag aSign;
1125         int16 aExp, shiftCount;
1126         bits32 aSig;
1127         bits64 aSig64;
1128         int64 z;
1129
1130         aSig = extractFloat32Frac( a );
1131         aExp = extractFloat32Exp( a );
1132         aSign = extractFloat32Sign( a );
1133         shiftCount = aExp - 0xBE;
1134         if ( 0 <= shiftCount ) {
1135                 if ( a != 0xDF000000 ) {
1136                         float_raise( float_flag_invalid );
1137                         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1138                                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1139                         }
1140                 }
1141                 return (sbits64) LIT64( 0x8000000000000000 );
1142         }
1143         else if ( aExp <= 0x7E ) {
1144                 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1145                 return 0;
1146         }
1147         aSig64 = aSig | 0x00800000;
1148         aSig64 <<= 40;
1149         z = aSig64>>( - shiftCount );
1150         if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1151                 float_exception_flags |= float_flag_inexact;
1152         }
1153         if ( aSign ) z = - z;
1154         return z;
1155
1156 }
1157
1158 /*----------------------------------------------------------------------------
1159 | Returns the result of converting the single-precision floating-point value
1160 | `a' to the double-precision floating-point format.  The conversion is
1161 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1162 | Arithmetic.
1163 *----------------------------------------------------------------------------*/
1164
1165 float64 float32_to_float64( float32 a )
1166 {
1167         flag aSign;
1168         int16 aExp;
1169         bits32 aSig;
1170
1171         aSig = extractFloat32Frac( a );
1172         aExp = extractFloat32Exp( a );
1173         aSign = extractFloat32Sign( a );
1174         if ( aExp == 0xFF ) {
1175                 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1176                 return packFloat64( aSign, 0x7FF, 0 );
1177         }
1178         if ( aExp == 0 ) {
1179                 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1180                 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1181                 --aExp;
1182         }
1183         return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1184
1185 }
1186
1187 #ifdef FLOATX80
1188
1189 /*----------------------------------------------------------------------------
1190 | Returns the result of converting the single-precision floating-point value
1191 | `a' to the extended double-precision floating-point format.  The conversion
1192 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1193 | Arithmetic.
1194 *----------------------------------------------------------------------------*/
1195
1196 floatx80 float32_to_floatx80( float32 a )
1197 {
1198         flag aSign;
1199         int16 aExp;
1200         bits32 aSig;
1201
1202         aSig = extractFloat32Frac( a );
1203         aExp = extractFloat32Exp( a );
1204         aSign = extractFloat32Sign( a );
1205         if ( aExp == 0xFF ) {
1206                 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1207                 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1208         }
1209         if ( aExp == 0 ) {
1210                 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1211                 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1212         }
1213         aSig |= 0x00800000;
1214         return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1215
1216 }
1217
1218 #endif
1219
1220 #ifdef FLOAT128
1221
1222 /*----------------------------------------------------------------------------
1223 | Returns the result of converting the single-precision floating-point value
1224 | `a' to the double-precision floating-point format.  The conversion is
1225 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1226 | Arithmetic.
1227 *----------------------------------------------------------------------------*/
1228
1229 float128 float32_to_float128( float32 a )
1230 {
1231         flag aSign;
1232         int16 aExp;
1233         bits32 aSig;
1234
1235         aSig = extractFloat32Frac( a );
1236         aExp = extractFloat32Exp( a );
1237         aSign = extractFloat32Sign( a );
1238         if ( aExp == 0xFF ) {
1239                 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1240                 return packFloat128( aSign, 0x7FFF, 0, 0 );
1241         }
1242         if ( aExp == 0 ) {
1243                 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1244                 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1245                 --aExp;
1246         }
1247         return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1248
1249 }
1250
1251 #endif
1252
1253 /*----------------------------------------------------------------------------
1254 | Rounds the single-precision floating-point value `a' to an integer, and
1255 | returns the result as a single-precision floating-point value.  The
1256 | operation is performed according to the IEC/IEEE Standard for Binary
1257 | Floating-Point Arithmetic.
1258 *----------------------------------------------------------------------------*/
1259
1260 float32 float32_round_to_int( float32 a )
1261 {
1262         flag aSign;
1263         int16 aExp;
1264         bits32 lastBitMask, roundBitsMask;
1265         int8 roundingMode;
1266         float32 z;
1267
1268         aExp = extractFloat32Exp( a );
1269         if ( 0x96 <= aExp ) {
1270                 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1271                         return propagateFloat32NaN( a, a );
1272                 }
1273                 return a;
1274         }
1275         if ( aExp <= 0x7E ) {
1276                 if ( (bits32) ( a<<1 ) == 0 ) return a;
1277                 float_exception_flags |= float_flag_inexact;
1278                 aSign = extractFloat32Sign( a );
1279                 switch ( float_rounding_mode ) {
1280                         case float_round_nearest_even:
1281                         if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1282                                 return packFloat32( aSign, 0x7F, 0 );
1283                         }
1284                         break;
1285                         case float_round_down:
1286                         return aSign ? 0xBF800000 : 0;
1287                         case float_round_up:
1288                         return aSign ? 0x80000000 : 0x3F800000;
1289                 }
1290                 return packFloat32( aSign, 0, 0 );
1291         }
1292         lastBitMask = 1;
1293         lastBitMask <<= 0x96 - aExp;
1294         roundBitsMask = lastBitMask - 1;
1295         z = a;
1296         roundingMode = float_rounding_mode;
1297         if ( roundingMode == float_round_nearest_even ) {
1298                 z += lastBitMask>>1;
1299                 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1300         }
1301         else if ( roundingMode != float_round_to_zero ) {
1302                 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1303                         z += roundBitsMask;
1304                 }
1305         }
1306         z &= ~ roundBitsMask;
1307         if ( z != a ) float_exception_flags |= float_flag_inexact;
1308         return z;
1309
1310 }
1311
1312 /*----------------------------------------------------------------------------
1313 | Returns the result of adding the absolute values of the single-precision
1314 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1315 | before being returned.  `zSign' is ignored if the result is a NaN.
1316 | The addition is performed according to the IEC/IEEE Standard for Binary
1317 | Floating-Point Arithmetic.
1318 *----------------------------------------------------------------------------*/
1319
1320 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1321 {
1322         int16 aExp, bExp, zExp;
1323         bits32 aSig, bSig, zSig;
1324         int16 expDiff;
1325
1326         aSig = extractFloat32Frac( a );
1327         aExp = extractFloat32Exp( a );
1328         bSig = extractFloat32Frac( b );
1329         bExp = extractFloat32Exp( b );
1330         expDiff = aExp - bExp;
1331         aSig <<= 6;
1332         bSig <<= 6;
1333         if ( 0 < expDiff ) {
1334                 if ( aExp == 0xFF ) {
1335                         if ( aSig ) return propagateFloat32NaN( a, b );
1336                         return a;
1337                 }
1338                 if ( bExp == 0 ) {
1339                         --expDiff;
1340                 }
1341                 else {
1342                         bSig |= 0x20000000;
1343                 }
1344                 shift32RightJamming( bSig, expDiff, &bSig );
1345                 zExp = aExp;
1346         }
1347         else if ( expDiff < 0 ) {
1348                 if ( bExp == 0xFF ) {
1349                         if ( bSig ) return propagateFloat32NaN( a, b );
1350                         return packFloat32( zSign, 0xFF, 0 );
1351                 }
1352                 if ( aExp == 0 ) {
1353                         ++expDiff;
1354                 }
1355                 else {
1356                         aSig |= 0x20000000;
1357                 }
1358                 shift32RightJamming( aSig, - expDiff, &aSig );
1359                 zExp = bExp;
1360         }
1361         else {
1362                 if ( aExp == 0xFF ) {
1363                         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1364                         return a;
1365                 }
1366                 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1367                 zSig = 0x40000000 + aSig + bSig;
1368                 zExp = aExp;
1369                 goto roundAndPack;
1370         }
1371         aSig |= 0x20000000;
1372         zSig = ( aSig + bSig )<<1;
1373         --zExp;
1374         if ( (sbits32) zSig < 0 ) {
1375                 zSig = aSig + bSig;
1376                 ++zExp;
1377         }
1378         roundAndPack:
1379         return roundAndPackFloat32( zSign, zExp, zSig );
1380
1381 }
1382
1383 /*----------------------------------------------------------------------------
1384 | Returns the result of subtracting the absolute values of the single-
1385 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1386 | difference is negated before being returned.  `zSign' is ignored if the
1387 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1388 | Standard for Binary Floating-Point Arithmetic.
1389 *----------------------------------------------------------------------------*/
1390
1391 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1392 {
1393         int16 aExp, bExp, zExp;
1394         bits32 aSig, bSig, zSig;
1395         int16 expDiff;
1396
1397         aSig = extractFloat32Frac( a );
1398         aExp = extractFloat32Exp( a );
1399         bSig = extractFloat32Frac( b );
1400         bExp = extractFloat32Exp( b );
1401         expDiff = aExp - bExp;
1402         aSig <<= 7;
1403         bSig <<= 7;
1404         if ( 0 < expDiff ) goto aExpBigger;
1405         if ( expDiff < 0 ) goto bExpBigger;
1406         if ( aExp == 0xFF ) {
1407                 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1408                 float_raise( float_flag_invalid );
1409                 return float32_default_nan;
1410         }
1411         if ( aExp == 0 ) {
1412                 aExp = 1;
1413                 bExp = 1;
1414         }
1415         if ( bSig < aSig ) goto aBigger;
1416         if ( aSig < bSig ) goto bBigger;
1417         return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1418         bExpBigger:
1419         if ( bExp == 0xFF ) {
1420                 if ( bSig ) return propagateFloat32NaN( a, b );
1421                 return packFloat32( zSign ^ 1, 0xFF, 0 );
1422         }
1423         if ( aExp == 0 ) {
1424                 ++expDiff;
1425         }
1426         else {
1427                 aSig |= 0x40000000;
1428         }
1429         shift32RightJamming( aSig, - expDiff, &aSig );
1430         bSig |= 0x40000000;
1431         bBigger:
1432         zSig = bSig - aSig;
1433         zExp = bExp;
1434         zSign ^= 1;
1435         goto normalizeRoundAndPack;
1436         aExpBigger:
1437         if ( aExp == 0xFF ) {
1438                 if ( aSig ) return propagateFloat32NaN( a, b );
1439                 return a;
1440         }
1441         if ( bExp == 0 ) {
1442                 --expDiff;
1443         }
1444         else {
1445                 bSig |= 0x40000000;
1446         }
1447         shift32RightJamming( bSig, expDiff, &bSig );
1448         aSig |= 0x40000000;
1449         aBigger:
1450         zSig = aSig - bSig;
1451         zExp = aExp;
1452         normalizeRoundAndPack:
1453         --zExp;
1454         return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1455
1456 }
1457
1458 /*----------------------------------------------------------------------------
1459 | Returns the result of adding the single-precision floating-point values `a'
1460 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1461 | Binary Floating-Point Arithmetic.
1462 *----------------------------------------------------------------------------*/
1463
1464 float32 float32_add( float32 a, float32 b )
1465 {
1466         flag aSign, bSign;
1467
1468         aSign = extractFloat32Sign( a );
1469         bSign = extractFloat32Sign( b );
1470         if ( aSign == bSign ) {
1471                 return addFloat32Sigs( a, b, aSign );
1472         }
1473         else {
1474                 return subFloat32Sigs( a, b, aSign );
1475         }
1476
1477 }
1478
1479 /*----------------------------------------------------------------------------
1480 | Returns the result of subtracting the single-precision floating-point values
1481 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1482 | for Binary Floating-Point Arithmetic.
1483 *----------------------------------------------------------------------------*/
1484
1485 float32 float32_sub( float32 a, float32 b )
1486 {
1487         flag aSign, bSign;
1488
1489         aSign = extractFloat32Sign( a );
1490         bSign = extractFloat32Sign( b );
1491         if ( aSign == bSign ) {
1492                 return subFloat32Sigs( a, b, aSign );
1493         }
1494         else {
1495                 return addFloat32Sigs( a, b, aSign );
1496         }
1497
1498 }
1499
1500 /*----------------------------------------------------------------------------
1501 | Returns the result of multiplying the single-precision floating-point values
1502 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1503 | for Binary Floating-Point Arithmetic.
1504 *----------------------------------------------------------------------------*/
1505
1506 float32 float32_mul( float32 a, float32 b )
1507 {
1508         flag aSign, bSign, zSign;
1509         int16 aExp, bExp, zExp;
1510         bits32 aSig, bSig;
1511         bits64 zSig64;
1512         bits32 zSig;
1513
1514         aSig = extractFloat32Frac( a );
1515         aExp = extractFloat32Exp( a );
1516         aSign = extractFloat32Sign( a );
1517         bSig = extractFloat32Frac( b );
1518         bExp = extractFloat32Exp( b );
1519         bSign = extractFloat32Sign( b );
1520         zSign = aSign ^ bSign;
1521         if ( aExp == 0xFF ) {
1522                 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1523                         return propagateFloat32NaN( a, b );
1524                 }
1525                 if ( ( bExp | bSig ) == 0 ) {
1526                         float_raise( float_flag_invalid );
1527                         return float32_default_nan;
1528                 }
1529                 return packFloat32( zSign, 0xFF, 0 );
1530         }
1531         if ( bExp == 0xFF ) {
1532                 if ( bSig ) return propagateFloat32NaN( a, b );
1533                 if ( ( aExp | aSig ) == 0 ) {
1534                         float_raise( float_flag_invalid );
1535                         return float32_default_nan;
1536                 }
1537                 return packFloat32( zSign, 0xFF, 0 );
1538         }
1539         if ( aExp == 0 ) {
1540                 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1541                 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1542         }
1543         if ( bExp == 0 ) {
1544                 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1545                 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1546         }
1547         zExp = aExp + bExp - 0x7F;
1548         aSig = ( aSig | 0x00800000 )<<7;
1549         bSig = ( bSig | 0x00800000 )<<8;
1550         shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1551         zSig = zSig64;
1552         if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1553                 zSig <<= 1;
1554                 --zExp;
1555         }
1556         return roundAndPackFloat32( zSign, zExp, zSig );
1557
1558 }
1559
1560 /*----------------------------------------------------------------------------
1561 | Returns the result of dividing the single-precision floating-point value `a'
1562 | by the corresponding value `b'.  The operation is performed according to the
1563 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1564 *----------------------------------------------------------------------------*/
1565
1566 float32 float32_div( float32 a, float32 b )
1567 {
1568         flag aSign, bSign, zSign;
1569         int16 aExp, bExp, zExp;
1570         bits32 aSig, bSig, zSig;
1571
1572         aSig = extractFloat32Frac( a );
1573         aExp = extractFloat32Exp( a );
1574         aSign = extractFloat32Sign( a );
1575         bSig = extractFloat32Frac( b );
1576         bExp = extractFloat32Exp( b );
1577         bSign = extractFloat32Sign( b );
1578         zSign = aSign ^ bSign;
1579         if ( aExp == 0xFF ) {
1580                 if ( aSig ) return propagateFloat32NaN( a, b );
1581                 if ( bExp == 0xFF ) {
1582                         if ( bSig ) return propagateFloat32NaN( a, b );
1583                         float_raise( float_flag_invalid );
1584                         return float32_default_nan;
1585                 }
1586                 return packFloat32( zSign, 0xFF, 0 );
1587         }
1588         if ( bExp == 0xFF ) {
1589                 if ( bSig ) return propagateFloat32NaN( a, b );
1590                 return packFloat32( zSign, 0, 0 );
1591         }
1592         if ( bExp == 0 ) {
1593                 if ( bSig == 0 ) {
1594                         if ( ( aExp | aSig ) == 0 ) {
1595                                 float_raise( float_flag_invalid );
1596                                 return float32_default_nan;
1597                         }
1598                         float_raise( float_flag_divbyzero );
1599                         return packFloat32( zSign, 0xFF, 0 );
1600                 }
1601                 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1602         }
1603         if ( aExp == 0 ) {
1604                 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1605                 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1606         }
1607         zExp = aExp - bExp + 0x7D;
1608         aSig = ( aSig | 0x00800000 )<<7;
1609         bSig = ( bSig | 0x00800000 )<<8;
1610         if ( bSig <= ( aSig + aSig ) ) {
1611                 aSig >>= 1;
1612                 ++zExp;
1613         }
1614         zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1615         if ( ( zSig & 0x3F ) == 0 ) {
1616                 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1617         }
1618         return roundAndPackFloat32( zSign, zExp, zSig );
1619
1620 }
1621
1622 /*----------------------------------------------------------------------------
1623 | Returns the remainder of the single-precision floating-point value `a'
1624 | with respect to the corresponding value `b'.  The operation is performed
1625 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1626 *----------------------------------------------------------------------------*/
1627
1628 float32 float32_rem( float32 a, float32 b )
1629 {
1630         flag aSign, zSign;
1631         int16 aExp, bExp, expDiff;
1632         bits32 aSig, bSig;
1633         bits32 q;
1634         bits64 aSig64, bSig64, q64;
1635         bits32 alternateASig;
1636         sbits32 sigMean;
1637
1638         aSig = extractFloat32Frac( a );
1639         aExp = extractFloat32Exp( a );
1640         aSign = extractFloat32Sign( a );
1641         bSig = extractFloat32Frac( b );
1642         bExp = extractFloat32Exp( b );
1643 //    bSign = extractFloat32Sign( b );
1644         if ( aExp == 0xFF ) {
1645                 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1646                         return propagateFloat32NaN( a, b );
1647                 }
1648                 float_raise( float_flag_invalid );
1649                 return float32_default_nan;
1650         }
1651         if ( bExp == 0xFF ) {
1652                 if ( bSig ) return propagateFloat32NaN( a, b );
1653                 return a;
1654         }
1655         if ( bExp == 0 ) {
1656                 if ( bSig == 0 ) {
1657                         float_raise( float_flag_invalid );
1658                         return float32_default_nan;
1659                 }
1660                 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1661         }
1662         if ( aExp == 0 ) {
1663                 if ( aSig == 0 ) return a;
1664                 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1665         }
1666         expDiff = aExp - bExp;
1667         aSig |= 0x00800000;
1668         bSig |= 0x00800000;
1669         if ( expDiff < 32 ) {
1670                 aSig <<= 8;
1671                 bSig <<= 8;
1672                 if ( expDiff < 0 ) {
1673                         if ( expDiff < -1 ) return a;
1674                         aSig >>= 1;
1675                 }
1676                 q = ( bSig <= aSig );
1677                 if ( q ) aSig -= bSig;
1678                 if ( 0 < expDiff ) {
1679                         q = ( ( (bits64) aSig )<<32 ) / bSig;
1680                         q >>= 32 - expDiff;
1681                         bSig >>= 2;
1682                         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1683                 }
1684                 else {
1685                         aSig >>= 2;
1686                         bSig >>= 2;
1687                 }
1688         }
1689         else {
1690                 if ( bSig <= aSig ) aSig -= bSig;
1691                 aSig64 = ( (bits64) aSig )<<40;
1692                 bSig64 = ( (bits64) bSig )<<40;
1693                 expDiff -= 64;
1694                 while ( 0 < expDiff ) {
1695                         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1696                         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1697                         aSig64 = - ( ( bSig * q64 )<<38 );
1698                         expDiff -= 62;
1699                 }
1700                 expDiff += 64;
1701                 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1702                 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1703                 q = q64>>( 64 - expDiff );
1704                 bSig <<= 6;
1705                 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1706         }
1707         do {
1708                 alternateASig = aSig;
1709                 ++q;
1710                 aSig -= bSig;
1711         } while ( 0 <= (sbits32) aSig );
1712         sigMean = aSig + alternateASig;
1713         if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1714                 aSig = alternateASig;
1715         }
1716         zSign = ( (sbits32) aSig < 0 );
1717         if ( zSign ) aSig = - aSig;
1718         return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1719
1720 }
1721
1722 /*----------------------------------------------------------------------------
1723 | Returns the square root of the single-precision floating-point value `a'.
1724 | The operation is performed according to the IEC/IEEE Standard for Binary
1725 | Floating-Point Arithmetic.
1726 *----------------------------------------------------------------------------*/
1727
1728 float32 float32_sqrt( float32 a )
1729 {
1730         flag aSign;
1731         int16 aExp, zExp;
1732         bits32 aSig, zSig;
1733         bits64 rem, term;
1734
1735         aSig = extractFloat32Frac( a );
1736         aExp = extractFloat32Exp( a );
1737         aSign = extractFloat32Sign( a );
1738         if ( aExp == 0xFF ) {
1739                 if ( aSig ) return propagateFloat32NaN( a, 0 );
1740                 if ( ! aSign ) return a;
1741                 float_raise( float_flag_invalid );
1742                 return float32_default_nan;
1743         }
1744         if ( aSign ) {
1745                 if ( ( aExp | aSig ) == 0 ) return a;
1746                 float_raise( float_flag_invalid );
1747                 return float32_default_nan;
1748         }
1749         if ( aExp == 0 ) {
1750                 if ( aSig == 0 ) return 0;
1751                 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1752         }
1753         zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1754         aSig = ( aSig | 0x00800000 )<<8;
1755         zSig = estimateSqrt32( aExp, aSig ) + 2;
1756         if ( ( zSig & 0x7F ) <= 5 ) {
1757                 if ( zSig < 2 ) {
1758                         zSig = 0x7FFFFFFF;
1759                         goto roundAndPack;
1760                 }
1761                 aSig >>= aExp & 1;
1762                 term = ( (bits64) zSig ) * zSig;
1763                 rem = ( ( (bits64) aSig )<<32 ) - term;
1764                 while ( (sbits64) rem < 0 ) {
1765                         --zSig;
1766                         rem += ( ( (bits64) zSig )<<1 ) | 1;
1767                 }
1768                 zSig |= ( rem != 0 );
1769         }
1770         shift32RightJamming( zSig, 1, &zSig );
1771         roundAndPack:
1772         return roundAndPackFloat32( 0, zExp, zSig );
1773
1774 }
1775
1776 /*----------------------------------------------------------------------------
1777 | Returns 1 if the single-precision floating-point value `a' is equal to
1778 | the corresponding value `b', and 0 otherwise.  The comparison is performed
1779 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1780 *----------------------------------------------------------------------------*/
1781
1782 flag float32_eq( float32 a, float32 b )
1783 {
1784         if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1785                         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1786                 ) {
1787                 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1788                         float_raise( float_flag_invalid );
1789                 }
1790                 return 0;
1791         }
1792         return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1793
1794 }
1795
1796 /*----------------------------------------------------------------------------
1797 | Returns 1 if the single-precision floating-point value `a' is less than
1798 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
1799 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1800 | Arithmetic.
1801 *----------------------------------------------------------------------------*/
1802
1803 flag float32_le( float32 a, float32 b )
1804 {
1805         flag aSign, bSign;
1806
1807         if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1808                         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1809                 ) {
1810                 float_raise( float_flag_invalid );
1811                 return 0;
1812         }
1813         aSign = extractFloat32Sign( a );
1814         bSign = extractFloat32Sign( b );
1815         if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1816         return ( a == b ) || ( aSign ^ ( a < b ) );
1817
1818 }
1819
1820 /*----------------------------------------------------------------------------
1821 | Returns 1 if the single-precision floating-point value `a' is less than
1822 | the corresponding value `b', and 0 otherwise.  The comparison is performed
1823 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1824 *----------------------------------------------------------------------------*/
1825
1826 flag float32_lt( float32 a, float32 b )
1827 {
1828         flag aSign, bSign;
1829
1830         if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1831                         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1832                 ) {
1833                 float_raise( float_flag_invalid );
1834                 return 0;
1835         }
1836         aSign = extractFloat32Sign( a );
1837         bSign = extractFloat32Sign( b );
1838         if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1839         return ( a != b ) && ( aSign ^ ( a < b ) );
1840
1841 }
1842
1843 /*----------------------------------------------------------------------------
1844 | Returns 1 if the single-precision floating-point value `a' is equal to
1845 | the corresponding value `b', and 0 otherwise.  The invalid exception is
1846 | raised if either operand is a NaN.  Otherwise, the comparison is performed
1847 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1848 *----------------------------------------------------------------------------*/
1849
1850 flag float32_eq_signaling( float32 a, float32 b )
1851 {
1852         if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1853                         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1854                 ) {
1855                 float_raise( float_flag_invalid );
1856                 return 0;
1857         }
1858         return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1859
1860 }
1861
1862 /*----------------------------------------------------------------------------
1863 | Returns 1 if the single-precision floating-point value `a' is less than or
1864 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1865 | cause an exception.  Otherwise, the comparison is performed according to the
1866 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1867 *----------------------------------------------------------------------------*/
1868
1869 flag float32_le_quiet( float32 a, float32 b )
1870 {
1871         flag aSign, bSign;
1872 //    int16 aExp, bExp;
1873
1874         if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1875                         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1876                 ) {
1877                 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1878                         float_raise( float_flag_invalid );
1879                 }
1880                 return 0;
1881         }
1882         aSign = extractFloat32Sign( a );
1883         bSign = extractFloat32Sign( b );
1884         if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1885         return ( a == b ) || ( aSign ^ ( a < b ) );
1886
1887 }
1888
1889 /*----------------------------------------------------------------------------
1890 | Returns 1 if the single-precision floating-point value `a' is less than
1891 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1892 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1893 | Standard for Binary Floating-Point Arithmetic.
1894 *----------------------------------------------------------------------------*/
1895
1896 flag float32_lt_quiet( float32 a, float32 b )
1897 {
1898         flag aSign, bSign;
1899
1900         if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1901                         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1902                 ) {
1903                 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1904                         float_raise( float_flag_invalid );
1905                 }
1906                 return 0;
1907         }
1908         aSign = extractFloat32Sign( a );
1909         bSign = extractFloat32Sign( b );
1910         if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1911         return ( a != b ) && ( aSign ^ ( a < b ) );
1912
1913 }
1914
1915 /*----------------------------------------------------------------------------
1916 | Returns the result of converting the double-precision floating-point value
1917 | `a' to the 32-bit two's complement integer format.  The conversion is
1918 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1919 | Arithmetic---which means in particular that the conversion is rounded
1920 | according to the current rounding mode.  If `a' is a NaN, the largest
1921 | positive integer is returned.  Otherwise, if the conversion overflows, the
1922 | largest integer with the same sign as `a' is returned.
1923 *----------------------------------------------------------------------------*/
1924
1925 int32 float64_to_int32( float64 a )
1926 {
1927         flag aSign;
1928         int16 aExp, shiftCount;
1929         bits64 aSig;
1930
1931         aSig = extractFloat64Frac( a );
1932         aExp = extractFloat64Exp( a );
1933         aSign = extractFloat64Sign( a );
1934         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1935         if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1936         shiftCount = 0x42C - aExp;
1937         if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1938         return roundAndPackInt32( aSign, aSig );
1939
1940 }
1941
1942 /*----------------------------------------------------------------------------
1943 | Returns the result of converting the double-precision floating-point value
1944 | `a' to the 32-bit two's complement integer format.  The conversion is
1945 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1946 | Arithmetic, except that the conversion is always rounded toward zero.
1947 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1948 | the conversion overflows, the largest integer with the same sign as `a' is
1949 | returned.
1950 *----------------------------------------------------------------------------*/
1951
1952 int32 float64_to_int32_round_to_zero( float64 a )
1953 {
1954         flag aSign;
1955         int16 aExp, shiftCount;
1956         bits64 aSig, savedASig;
1957         int32 z;
1958
1959         aSig = extractFloat64Frac( a );
1960         aExp = extractFloat64Exp( a );
1961         aSign = extractFloat64Sign( a );
1962         if ( 0x41E < aExp ) {
1963                 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1964                 goto invalid;
1965         }
1966         else if ( aExp < 0x3FF ) {
1967                 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1968                 return 0;
1969         }
1970         aSig |= LIT64( 0x0010000000000000 );
1971         shiftCount = 0x433 - aExp;
1972         savedASig = aSig;
1973         aSig >>= shiftCount;
1974         z = aSig;
1975         if ( aSign ) z = - z;
1976         if ( ( z < 0 ) ^ aSign ) {
1977         invalid:
1978                 float_raise( float_flag_invalid );
1979                 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1980         }
1981         if ( ( aSig<<shiftCount ) != savedASig ) {
1982                 float_exception_flags |= float_flag_inexact;
1983         }
1984         return z;
1985
1986 }
1987
1988 /*----------------------------------------------------------------------------
1989 | Returns the result of converting the double-precision floating-point value
1990 | `a' to the 64-bit two's complement integer format.  The conversion is
1991 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1992 | Arithmetic---which means in particular that the conversion is rounded
1993 | according to the current rounding mode.  If `a' is a NaN, the largest
1994 | positive integer is returned.  Otherwise, if the conversion overflows, the
1995 | largest integer with the same sign as `a' is returned.
1996 *----------------------------------------------------------------------------*/
1997
1998 int64 float64_to_int64( float64 a )
1999 {
2000         flag aSign;
2001         int16 aExp, shiftCount;
2002         bits64 aSig, aSigExtra;
2003
2004         aSig = extractFloat64Frac( a );
2005         aExp = extractFloat64Exp( a );
2006         aSign = extractFloat64Sign( a );
2007         if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2008         shiftCount = 0x433 - aExp;
2009         if ( shiftCount <= 0 ) {
2010                 if ( 0x43E < aExp ) {
2011                         float_raise( float_flag_invalid );
2012                         if (    ! aSign
2013                                         || (    ( aExp == 0x7FF )
2014                                                 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2015                                 ) {
2016                                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2017                         }
2018                         return (sbits64) LIT64( 0x8000000000000000 );
2019                 }
2020                 aSigExtra = 0;
2021                 aSig <<= - shiftCount;
2022         }
2023         else {
2024                 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2025         }
2026         return roundAndPackInt64( aSign, aSig, aSigExtra );
2027
2028 }
2029
2030 /*----------------------------------------------------------------------------
2031 | Returns the result of converting the double-precision floating-point value
2032 | `a' to the 64-bit two's complement integer format.  The conversion is
2033 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2034 | Arithmetic, except that the conversion is always rounded toward zero.
2035 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2036 | the conversion overflows, the largest integer with the same sign as `a' is
2037 | returned.
2038 *----------------------------------------------------------------------------*/
2039
2040 int64 float64_to_int64_round_to_zero( float64 a )
2041 {
2042         flag aSign;
2043         int16 aExp, shiftCount;
2044         bits64 aSig;
2045         int64 z;
2046
2047         aSig = extractFloat64Frac( a );
2048         aExp = extractFloat64Exp( a );
2049         aSign = extractFloat64Sign( a );
2050         if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2051         shiftCount = aExp - 0x433;
2052         if ( 0 <= shiftCount ) {
2053                 if ( 0x43E <= aExp ) {
2054                         if ( a != LIT64( 0xC3E0000000000000 ) ) {
2055                                 float_raise( float_flag_invalid );
2056                                 if (    ! aSign
2057                                                 || (    ( aExp == 0x7FF )
2058                                                         && ( aSig != LIT64( 0x0010000000000000 ) ) )
2059                                         ) {
2060                                         return LIT64( 0x7FFFFFFFFFFFFFFF );
2061                                 }
2062                         }
2063                         return (sbits64) LIT64( 0x8000000000000000 );
2064                 }
2065                 z = aSig<<shiftCount;
2066         }
2067         else {
2068                 if ( aExp < 0x3FE ) {
2069                         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2070                         return 0;
2071                 }
2072                 z = aSig>>( - shiftCount );
2073                 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2074                         float_exception_flags |= float_flag_inexact;
2075                 }
2076         }
2077         if ( aSign ) z = - z;
2078         return z;
2079
2080 }
2081
2082 /*----------------------------------------------------------------------------
2083 | Returns the result of converting the double-precision floating-point value
2084 | `a' to the single-precision floating-point format.  The conversion is
2085 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2086 | Arithmetic.
2087 *----------------------------------------------------------------------------*/
2088
2089 float32 float64_to_float32( float64 a )
2090 {
2091         flag aSign;
2092         int16 aExp;
2093         bits64 aSig;
2094         bits32 zSig;
2095
2096         aSig = extractFloat64Frac( a );
2097         aExp = extractFloat64Exp( a );
2098         aSign = extractFloat64Sign( a );
2099         if ( aExp == 0x7FF ) {
2100                 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2101                 return packFloat32( aSign, 0xFF, 0 );
2102         }
2103         shift64RightJamming( aSig, 22, &aSig );
2104         zSig = aSig;
2105         if ( aExp || zSig ) {
2106                 zSig |= 0x40000000;
2107                 aExp -= 0x381;
2108         }
2109         return roundAndPackFloat32( aSign, aExp, zSig );
2110
2111 }
2112
2113 #ifdef FLOATX80
2114
2115 /*----------------------------------------------------------------------------
2116 | Returns the result of converting the double-precision floating-point value
2117 | `a' to the extended double-precision floating-point format.  The conversion
2118 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2119 | Arithmetic.
2120 *----------------------------------------------------------------------------*/
2121
2122 floatx80 float64_to_floatx80( float64 a )
2123 {
2124         flag aSign;
2125         int16 aExp;
2126         bits64 aSig;
2127
2128         aSig = extractFloat64Frac( a );
2129         aExp = extractFloat64Exp( a );
2130         aSign = extractFloat64Sign( a );
2131         if ( aExp == 0x7FF ) {
2132                 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2133                 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2134         }
2135         if ( aExp == 0 ) {
2136                 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2137                 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2138         }
2139         return
2140                 packFloatx80(
2141                         aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2142
2143 }
2144
2145 #endif
2146
2147 #ifdef FLOAT128
2148
2149 /*----------------------------------------------------------------------------
2150 | Returns the result of converting the double-precision floating-point value
2151 | `a' to the quadruple-precision floating-point format.  The conversion is
2152 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2153 | Arithmetic.
2154 *----------------------------------------------------------------------------*/
2155
2156 float128 float64_to_float128( float64 a )
2157 {
2158         flag aSign;
2159         int16 aExp;
2160         bits64 aSig, zSig0, zSig1;
2161
2162         aSig = extractFloat64Frac( a );
2163         aExp = extractFloat64Exp( a );
2164         aSign = extractFloat64Sign( a );
2165         if ( aExp == 0x7FF ) {
2166                 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2167                 return packFloat128( aSign, 0x7FFF, 0, 0 );
2168         }
2169         if ( aExp == 0 ) {
2170                 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2171                 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2172                 --aExp;
2173         }
2174         shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2175         return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2176
2177 }
2178
2179 #endif
2180
2181 /*----------------------------------------------------------------------------
2182 | Rounds the double-precision floating-point value `a' to an integer, and
2183 | returns the result as a double-precision floating-point value.  The
2184 | operation is performed according to the IEC/IEEE Standard for Binary
2185 | Floating-Point Arithmetic.
2186 *----------------------------------------------------------------------------*/
2187
2188 float64 float64_round_to_int( float64 a )
2189 {
2190         flag aSign;
2191         int16 aExp;
2192         bits64 lastBitMask, roundBitsMask;
2193         int8 roundingMode;
2194         float64 z;
2195
2196         aExp = extractFloat64Exp( a );
2197         if ( 0x433 <= aExp ) {
2198                 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2199                         return propagateFloat64NaN( a, a );
2200                 }
2201                 return a;
2202         }
2203         if ( aExp < 0x3FF ) {
2204                 if ( (bits64) ( a<<1 ) == 0 ) return a;
2205                 float_exception_flags |= float_flag_inexact;
2206                 aSign = extractFloat64Sign( a );
2207                 switch ( float_rounding_mode ) {
2208                         case float_round_nearest_even:
2209                         if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2210                                 return packFloat64( aSign, 0x3FF, 0 );
2211                         }
2212                         break;
2213                         case float_round_down:
2214                         return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2215                         case float_round_up:
2216                         return
2217                         aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2218                 }
2219                 return packFloat64( aSign, 0, 0 );
2220         }
2221         lastBitMask = 1;
2222         lastBitMask <<= 0x433 - aExp;
2223         roundBitsMask = lastBitMask - 1;
2224         z = a;
2225         roundingMode = float_rounding_mode;
2226         if ( roundingMode == float_round_nearest_even ) {
2227                 z += lastBitMask>>1;
2228                 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2229         }
2230         else if ( roundingMode != float_round_to_zero ) {
2231                 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2232                         z += roundBitsMask;
2233                 }
2234         }
2235         z &= ~ roundBitsMask;
2236         if ( z != a ) float_exception_flags |= float_flag_inexact;
2237         return z;
2238
2239 }
2240
2241 /*----------------------------------------------------------------------------
2242 | Returns the result of adding the absolute values of the double-precision
2243 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2244 | before being returned.  `zSign' is ignored if the result is a NaN.
2245 | The addition is performed according to the IEC/IEEE Standard for Binary
2246 | Floating-Point Arithmetic.
2247 *----------------------------------------------------------------------------*/
2248
2249 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2250 {
2251         int16 aExp, bExp, zExp;
2252         bits64 aSig, bSig, zSig;
2253         int16 expDiff;
2254
2255         aSig = extractFloat64Frac( a );
2256         aExp = extractFloat64Exp( a );
2257         bSig = extractFloat64Frac( b );
2258         bExp = extractFloat64Exp( b );
2259         expDiff = aExp - bExp;
2260         aSig <<= 9;
2261         bSig <<= 9;
2262         if ( 0 < expDiff ) {
2263                 if ( aExp == 0x7FF ) {
2264                         if ( aSig ) return propagateFloat64NaN( a, b );
2265                         return a;
2266                 }
2267                 if ( bExp == 0 ) {
2268                         --expDiff;
2269                 }
2270                 else {
2271                         bSig |= LIT64( 0x2000000000000000 );
2272                 }
2273                 shift64RightJamming( bSig, expDiff, &bSig );
2274                 zExp = aExp;
2275         }
2276         else if ( expDiff < 0 ) {
2277                 if ( bExp == 0x7FF ) {
2278                         if ( bSig ) return propagateFloat64NaN( a, b );
2279                         return packFloat64( zSign, 0x7FF, 0 );
2280                 }
2281                 if ( aExp == 0 ) {
2282                         ++expDiff;
2283                 }
2284                 else {
2285                         aSig |= LIT64( 0x2000000000000000 );
2286                 }
2287                 shift64RightJamming( aSig, - expDiff, &aSig );
2288                 zExp = bExp;
2289         }
2290         else {
2291                 if ( aExp == 0x7FF ) {
2292                         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2293                         return a;
2294                 }
2295                 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2296                 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2297                 zExp = aExp;
2298                 goto roundAndPack;
2299         }
2300         aSig |= LIT64( 0x2000000000000000 );
2301         zSig = ( aSig + bSig )<<1;
2302         --zExp;
2303         if ( (sbits64) zSig < 0 ) {
2304                 zSig = aSig + bSig;
2305                 ++zExp;
2306         }
2307         roundAndPack:
2308         return roundAndPackFloat64( zSign, zExp, zSig );
2309
2310 }
2311
2312 /*----------------------------------------------------------------------------
2313 | Returns the result of subtracting the absolute values of the double-
2314 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2315 | difference is negated before being returned.  `zSign' is ignored if the
2316 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2317 | Standard for Binary Floating-Point Arithmetic.
2318 *----------------------------------------------------------------------------*/
2319
2320 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2321 {
2322         int16 aExp, bExp, zExp;
2323         bits64 aSig, bSig, zSig;
2324         int16 expDiff;
2325
2326         aSig = extractFloat64Frac( a );
2327         aExp = extractFloat64Exp( a );
2328         bSig = extractFloat64Frac( b );
2329         bExp = extractFloat64Exp( b );
2330         expDiff = aExp - bExp;
2331         aSig <<= 10;
2332         bSig <<= 10;
2333         if ( 0 < expDiff ) goto aExpBigger;
2334         if ( expDiff < 0 ) goto bExpBigger;
2335         if ( aExp == 0x7FF ) {
2336                 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2337                 float_raise( float_flag_invalid );
2338                 return float64_default_nan;
2339         }
2340         if ( aExp == 0 ) {
2341                 aExp = 1;
2342                 bExp = 1;
2343         }
2344         if ( bSig < aSig ) goto aBigger;
2345         if ( aSig < bSig ) goto bBigger;
2346         return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2347         bExpBigger:
2348         if ( bExp == 0x7FF ) {
2349                 if ( bSig ) return propagateFloat64NaN( a, b );
2350                 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2351         }
2352         if ( aExp == 0 ) {
2353                 ++expDiff;
2354         }
2355         else {
2356                 aSig |= LIT64( 0x4000000000000000 );
2357         }
2358         shift64RightJamming( aSig, - expDiff, &aSig );
2359         bSig |= LIT64( 0x4000000000000000 );
2360         bBigger:
2361         zSig = bSig - aSig;
2362         zExp = bExp;
2363         zSign ^= 1;
2364         goto normalizeRoundAndPack;
2365         aExpBigger:
2366         if ( aExp == 0x7FF ) {
2367                 if ( aSig ) return propagateFloat64NaN( a, b );
2368                 return a;
2369         }
2370         if ( bExp == 0 ) {
2371                 --expDiff;
2372         }
2373         else {
2374                 bSig |= LIT64( 0x4000000000000000 );
2375         }
2376         shift64RightJamming( bSig, expDiff, &bSig );
2377         aSig |= LIT64( 0x4000000000000000 );
2378         aBigger:
2379         zSig = aSig - bSig;
2380         zExp = aExp;
2381         normalizeRoundAndPack:
2382         --zExp;
2383         return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2384
2385 }
2386
2387 /*----------------------------------------------------------------------------
2388 | Returns the result of adding the double-precision floating-point values `a'
2389 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2390 | Binary Floating-Point Arithmetic.
2391 *----------------------------------------------------------------------------*/
2392
2393 float64 float64_add( float64 a, float64 b )
2394 {
2395         flag aSign, bSign;
2396
2397         aSign = extractFloat64Sign( a );
2398         bSign = extractFloat64Sign( b );
2399         if ( aSign == bSign ) {
2400                 return addFloat64Sigs( a, b, aSign );
2401         }
2402         else {
2403                 return subFloat64Sigs( a, b, aSign );
2404         }
2405
2406 }
2407
2408 /*----------------------------------------------------------------------------
2409 | Returns the result of subtracting the double-precision floating-point values
2410 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2411 | for Binary Floating-Point Arithmetic.
2412 *----------------------------------------------------------------------------*/
2413
2414 float64 float64_sub( float64 a, float64 b )
2415 {
2416         flag aSign, bSign;
2417
2418         aSign = extractFloat64Sign( a );
2419         bSign = extractFloat64Sign( b );
2420         if ( aSign == bSign ) {
2421                 return subFloat64Sigs( a, b, aSign );
2422         }
2423         else {
2424                 return addFloat64Sigs( a, b, aSign );
2425         }
2426
2427 }
2428
2429 /*----------------------------------------------------------------------------
2430 | Returns the result of multiplying the double-precision floating-point values
2431 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2432 | for Binary Floating-Point Arithmetic.
2433 *----------------------------------------------------------------------------*/
2434
2435 float64 float64_mul( float64 a, float64 b )
2436 {
2437         flag aSign, bSign, zSign;
2438         int16 aExp, bExp, zExp;
2439         bits64 aSig, bSig, zSig0, zSig1;
2440
2441         aSig = extractFloat64Frac( a );
2442         aExp = extractFloat64Exp( a );
2443         aSign = extractFloat64Sign( a );
2444         bSig = extractFloat64Frac( b );
2445         bExp = extractFloat64Exp( b );
2446         bSign = extractFloat64Sign( b );
2447         zSign = aSign ^ bSign;
2448         if ( aExp == 0x7FF ) {
2449                 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2450                         return propagateFloat64NaN( a, b );
2451                 }
2452                 if ( ( bExp | bSig ) == 0 ) {
2453                         float_raise( float_flag_invalid );
2454                         return float64_default_nan;
2455                 }
2456                 return packFloat64( zSign, 0x7FF, 0 );
2457         }
2458         if ( bExp == 0x7FF ) {
2459                 if ( bSig ) return propagateFloat64NaN( a, b );
2460                 if ( ( aExp | aSig ) == 0 ) {
2461                         float_raise( float_flag_invalid );
2462                         return float64_default_nan;
2463                 }
2464                 return packFloat64( zSign, 0x7FF, 0 );
2465         }
2466         if ( aExp == 0 ) {
2467                 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2468                 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2469         }
2470         if ( bExp == 0 ) {
2471                 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2472                 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2473         }
2474         zExp = aExp + bExp - 0x3FF;
2475         aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2476         bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2477         mul64To128( aSig, bSig, &zSig0, &zSig1 );
2478         zSig0 |= ( zSig1 != 0 );
2479         if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2480                 zSig0 <<= 1;
2481                 --zExp;
2482         }
2483         return roundAndPackFloat64( zSign, zExp, zSig0 );
2484
2485 }
2486
2487 /*----------------------------------------------------------------------------
2488 | Returns the result of dividing the double-precision floating-point value `a'
2489 | by the corresponding value `b'.  The operation is performed according to
2490 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2491 *----------------------------------------------------------------------------*/
2492
2493 float64 float64_div( float64 a, float64 b )
2494 {
2495         flag aSign, bSign, zSign;
2496         int16 aExp, bExp, zExp;
2497         bits64 aSig, bSig, zSig;
2498         bits64 rem0, rem1;
2499         bits64 term0, term1;
2500
2501         aSig = extractFloat64Frac( a );
2502         aExp = extractFloat64Exp( a );
2503         aSign = extractFloat64Sign( a );
2504         bSig = extractFloat64Frac( b );
2505         bExp = extractFloat64Exp( b );
2506         bSign = extractFloat64Sign( b );
2507         zSign = aSign ^ bSign;
2508         if ( aExp == 0x7FF ) {
2509                 if ( aSig ) return propagateFloat64NaN( a, b );
2510                 if ( bExp == 0x7FF ) {
2511                         if ( bSig ) return propagateFloat64NaN( a, b );
2512                         float_raise( float_flag_invalid );
2513                         return float64_default_nan;
2514                 }
2515                 return packFloat64( zSign, 0x7FF, 0 );
2516         }
2517         if ( bExp == 0x7FF ) {
2518                 if ( bSig ) return propagateFloat64NaN( a, b );
2519                 return packFloat64( zSign, 0, 0 );
2520         }
2521         if ( bExp == 0 ) {
2522                 if ( bSig == 0 ) {
2523                         if ( ( aExp | aSig ) == 0 ) {
2524                                 float_raise( float_flag_invalid );
2525                                 return float64_default_nan;
2526                         }
2527                         float_raise( float_flag_divbyzero );
2528                         return packFloat64( zSign, 0x7FF, 0 );
2529                 }
2530                 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2531         }
2532         if ( aExp == 0 ) {
2533                 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2534                 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2535         }
2536         zExp = aExp - bExp + 0x3FD;
2537         aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2538         bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2539         if ( bSig <= ( aSig + aSig ) ) {
2540                 aSig >>= 1;
2541                 ++zExp;
2542         }
2543         zSig = estimateDiv128To64( aSig, 0, bSig );
2544         if ( ( zSig & 0x1FF ) <= 2 ) {
2545                 mul64To128( bSig, zSig, &term0, &term1 );
2546                 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2547                 while ( (sbits64) rem0 < 0 ) {
2548                         --zSig;
2549                         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2550                 }
2551                 zSig |= ( rem1 != 0 );
2552         }
2553         return roundAndPackFloat64( zSign, zExp, zSig );
2554
2555 }
2556
2557 /*----------------------------------------------------------------------------
2558 | Returns the remainder of the double-precision floating-point value `a'
2559 | with respect to the corresponding value `b'.  The operation is performed
2560 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2561 *----------------------------------------------------------------------------*/
2562
2563 float64 float64_rem( float64 a, float64 b )
2564 {
2565         flag aSign, zSign;
2566         int16 aExp, bExp, expDiff;
2567         bits64 aSig, bSig;
2568         bits64 q, alternateASig;
2569         sbits64 sigMean;
2570
2571         aSig = extractFloat64Frac( a );
2572         aExp = extractFloat64Exp( a );
2573         aSign = extractFloat64Sign( a );
2574         bSig = extractFloat64Frac( b );
2575         bExp = extractFloat64Exp( b );
2576 //    bSign = extractFloat64Sign( b );
2577         if ( aExp == 0x7FF ) {
2578                 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2579                         return propagateFloat64NaN( a, b );
2580                 }
2581                 float_raise( float_flag_invalid );
2582                 return float64_default_nan;
2583         }
2584         if ( bExp == 0x7FF ) {
2585                 if ( bSig ) return propagateFloat64NaN( a, b );
2586                 return a;
2587         }
2588         if ( bExp == 0 ) {
2589                 if ( bSig == 0 ) {
2590                         float_raise( float_flag_invalid );
2591                         return float64_default_nan;
2592                 }
2593                 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2594         }
2595         if ( aExp == 0 ) {
2596                 if ( aSig == 0 ) return a;
2597                 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2598         }
2599         expDiff = aExp - bExp;
2600         aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2601         bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2602         if ( expDiff < 0 ) {
2603                 if ( expDiff < -1 ) return a;
2604                 aSig >>= 1;
2605         }
2606         q = ( bSig <= aSig );
2607         if ( q ) aSig -= bSig;
2608         expDiff -= 64;
2609         while ( 0 < expDiff ) {
2610                 q = estimateDiv128To64( aSig, 0, bSig );
2611                 q = ( 2 < q ) ? q - 2 : 0;
2612                 aSig = - ( ( bSig>>2 ) * q );
2613                 expDiff -= 62;
2614         }
2615         expDiff += 64;
2616         if ( 0 < expDiff ) {
2617                 q = estimateDiv128To64( aSig, 0, bSig );
2618                 q = ( 2 < q ) ? q - 2 : 0;
2619                 q >>= 64 - expDiff;
2620                 bSig >>= 2;
2621                 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2622         }
2623         else {
2624                 aSig >>= 2;
2625                 bSig >>= 2;
2626         }
2627         do {
2628                 alternateASig = aSig;
2629                 ++q;
2630                 aSig -= bSig;
2631         } while ( 0 <= (sbits64) aSig );
2632         sigMean = aSig + alternateASig;
2633         if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2634                 aSig = alternateASig;
2635         }
2636         zSign = ( (sbits64) aSig < 0 );
2637         if ( zSign ) aSig = - aSig;
2638         return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2639
2640 }
2641
2642 /*----------------------------------------------------------------------------
2643 | Returns the square root of the double-precision floating-point value `a'.
2644 | The operation is performed according to the IEC/IEEE Standard for Binary
2645 | Floating-Point Arithmetic.
2646 *----------------------------------------------------------------------------*/
2647
2648 float64 float64_sqrt( float64 a )
2649 {
2650         flag aSign;
2651         int16 aExp, zExp;
2652         bits64 aSig, zSig, doubleZSig;
2653         bits64 rem0, rem1, term0, term1;
2654 //    float64 z;
2655
2656         aSig = extractFloat64Frac( a );
2657         aExp = extractFloat64Exp( a );
2658         aSign = extractFloat64Sign( a );
2659         if ( aExp == 0x7FF ) {
2660                 if ( aSig ) return propagateFloat64NaN( a, a );
2661                 if ( ! aSign ) return a;
2662                 float_raise( float_flag_invalid );
2663                 return float64_default_nan;
2664         }
2665         if ( aSign ) {
2666                 if ( ( aExp | aSig ) == 0 ) return a;
2667                 float_raise( float_flag_invalid );
2668                 return float64_default_nan;
2669         }
2670         if ( aExp == 0 ) {
2671                 if ( aSig == 0 ) return 0;
2672                 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2673         }
2674         zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2675         aSig |= LIT64( 0x0010000000000000 );
2676         zSig = estimateSqrt32( aExp, aSig>>21 );
2677         aSig <<= 9 - ( aExp & 1 );
2678         zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2679         if ( ( zSig & 0x1FF ) <= 5 ) {
2680                 doubleZSig = zSig<<1;
2681                 mul64To128( zSig, zSig, &term0, &term1 );
2682                 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2683                 while ( (sbits64) rem0 < 0 ) {
2684                         --zSig;
2685                         doubleZSig -= 2;
2686                         add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2687                 }
2688                 zSig |= ( ( rem0 | rem1 ) != 0 );
2689         }
2690         return roundAndPackFloat64( 0, zExp, zSig );
2691
2692 }
2693
2694 /*----------------------------------------------------------------------------
2695 | Returns 1 if the double-precision floating-point value `a' is equal to the
2696 | corresponding value `b', and 0 otherwise.  The comparison is performed
2697 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2698 *----------------------------------------------------------------------------*/
2699
2700 flag float64_eq( float64 a, float64 b )
2701 {
2702         if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2703                         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2704                 ) {
2705                 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2706                         float_raise( float_flag_invalid );
2707                 }
2708                 return 0;
2709         }
2710         return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2711
2712 }
2713
2714 /*----------------------------------------------------------------------------
2715 | Returns 1 if the double-precision floating-point value `a' is less than or
2716 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
2717 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2718 | Arithmetic.
2719 *----------------------------------------------------------------------------*/
2720
2721 flag float64_le( float64 a, float64 b )
2722 {
2723         flag aSign, bSign;
2724
2725         if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2726                         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2727                 ) {
2728                 float_raise( float_flag_invalid );
2729                 return 0;
2730         }
2731         aSign = extractFloat64Sign( a );
2732         bSign = extractFloat64Sign( b );
2733         if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2734         return ( a == b ) || ( aSign ^ ( a < b ) );
2735
2736 }
2737
2738 /*----------------------------------------------------------------------------
2739 | Returns 1 if the double-precision floating-point value `a' is less than
2740 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2741 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2742 *----------------------------------------------------------------------------*/
2743
2744 flag float64_lt( float64 a, float64 b )
2745 {
2746         flag aSign, bSign;
2747
2748         if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2749                         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2750                 ) {
2751                 float_raise( float_flag_invalid );
2752                 return 0;
2753         }
2754         aSign = extractFloat64Sign( a );
2755         bSign = extractFloat64Sign( b );
2756         if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2757         return ( a != b ) && ( aSign ^ ( a < b ) );
2758
2759 }
2760
2761 /*----------------------------------------------------------------------------
2762 | Returns 1 if the double-precision floating-point value `a' is equal to the
2763 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
2764 | if either operand is a NaN.  Otherwise, the comparison is performed
2765 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2766 *----------------------------------------------------------------------------*/
2767
2768 flag float64_eq_signaling( float64 a, float64 b )
2769 {
2770         if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2771                         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2772                 ) {
2773                 float_raise( float_flag_invalid );
2774                 return 0;
2775         }
2776         return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2777
2778 }
2779
2780 /*----------------------------------------------------------------------------
2781 | Returns 1 if the double-precision floating-point value `a' is less than or
2782 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2783 | cause an exception.  Otherwise, the comparison is performed according to the
2784 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2785 *----------------------------------------------------------------------------*/
2786
2787 flag float64_le_quiet( float64 a, float64 b )
2788 {
2789         flag aSign, bSign;
2790 //    int16 aExp, bExp;
2791
2792         if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2793                         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2794                 ) {
2795                 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2796                         float_raise( float_flag_invalid );
2797                 }
2798                 return 0;
2799         }
2800         aSign = extractFloat64Sign( a );
2801         bSign = extractFloat64Sign( b );
2802         if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2803         return ( a == b ) || ( aSign ^ ( a < b ) );
2804
2805 }
2806
2807 /*----------------------------------------------------------------------------
2808 | Returns 1 if the double-precision floating-point value `a' is less than
2809 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2810 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2811 | Standard for Binary Floating-Point Arithmetic.
2812 *----------------------------------------------------------------------------*/
2813
2814 flag float64_lt_quiet( float64 a, float64 b )
2815 {
2816         flag aSign, bSign;
2817
2818         if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2819                         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2820                 ) {
2821                 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2822                         float_raise( float_flag_invalid );
2823                 }
2824                 return 0;
2825         }
2826         aSign = extractFloat64Sign( a );
2827         bSign = extractFloat64Sign( b );
2828         if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2829         return ( a != b ) && ( aSign ^ ( a < b ) );
2830
2831 }
2832
2833 #ifdef FLOATX80
2834
2835 /*----------------------------------------------------------------------------
2836 | Returns the result of converting the extended double-precision floating-
2837 | point value `a' to the 32-bit two's complement integer format.  The
2838 | conversion is performed according to the IEC/IEEE Standard for Binary
2839 | Floating-Point Arithmetic---which means in particular that the conversion
2840 | is rounded according to the current rounding mode.  If `a' is a NaN, the
2841 | largest positive integer is returned.  Otherwise, if the conversion
2842 | overflows, the largest integer with the same sign as `a' is returned.
2843 *----------------------------------------------------------------------------*/
2844
2845 int32 floatx80_to_int32( floatx80 a )
2846 {
2847         flag aSign;
2848         int32 aExp, shiftCount;
2849         bits64 aSig;
2850
2851         aSig = extractFloatx80Frac( a );
2852         aExp = extractFloatx80Exp( a );
2853         aSign = extractFloatx80Sign( a );
2854         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2855         shiftCount = 0x4037 - aExp;
2856         if ( shiftCount <= 0 ) shiftCount = 1;
2857         shift64RightJamming( aSig, shiftCount, &aSig );
2858         return roundAndPackInt32( aSign, aSig );
2859
2860 }
2861
2862 /*----------------------------------------------------------------------------
2863 | Returns the result of converting the extended double-precision floating-
2864 | point value `a' to the 32-bit two's complement integer format.  The
2865 | conversion is performed according to the IEC/IEEE Standard for Binary
2866 | Floating-Point Arithmetic, except that the conversion is always rounded
2867 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
2868 | Otherwise, if the conversion overflows, the largest integer with the same
2869 | sign as `a' is returned.
2870 *----------------------------------------------------------------------------*/
2871
2872 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2873 {
2874         flag aSign;
2875         int32 aExp, shiftCount;
2876         bits64 aSig, savedASig;
2877         int32 z;
2878
2879         aSig = extractFloatx80Frac( a );
2880         aExp = extractFloatx80Exp( a );
2881         aSign = extractFloatx80Sign( a );
2882         if ( 0x401E < aExp ) {
2883                 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2884                 goto invalid;
2885         }
2886         else if ( aExp < 0x3FFF ) {
2887                 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2888                 return 0;
2889         }
2890         shiftCount = 0x403E - aExp;
2891         savedASig = aSig;
2892         aSig >>= shiftCount;
2893         z = aSig;
2894         if ( aSign ) z = - z;
2895         if ( ( z < 0 ) ^ aSign ) {
2896         invalid:
2897                 float_raise( float_flag_invalid );
2898                 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2899         }
2900         if ( ( aSig<<shiftCount ) != savedASig ) {
2901                 float_exception_flags |= float_flag_inexact;
2902         }
2903         return z;
2904
2905 }
2906
2907 /*----------------------------------------------------------------------------
2908 | Returns the result of converting the extended double-precision floating-
2909 | point value `a' to the 64-bit two's complement integer format.  The
2910 | conversion is performed according to the IEC/IEEE Standard for Binary
2911 | Floating-Point Arithmetic---which means in particular that the conversion
2912 | is rounded according to the current rounding mode.  If `a' is a NaN,
2913 | the largest positive integer is returned.  Otherwise, if the conversion
2914 | overflows, the largest integer with the same sign as `a' is returned.
2915 *----------------------------------------------------------------------------*/
2916
2917 int64 floatx80_to_int64( floatx80 a )
2918 {
2919         flag aSign;
2920         int32 aExp, shiftCount;
2921         bits64 aSig, aSigExtra;
2922
2923         aSig = extractFloatx80Frac( a );
2924         aExp = extractFloatx80Exp( a );
2925         aSign = extractFloatx80Sign( a );
2926         shiftCount = 0x403E - aExp;
2927         if ( shiftCount <= 0 ) {
2928                 if ( shiftCount ) {
2929                         float_raise( float_flag_invalid );
2930                         if (    ! aSign
2931                                         || (    ( aExp == 0x7FFF )
2932                                                 && ( aSig != LIT64( 0x8000000000000000 ) ) )
2933                                 ) {
2934                                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2935                         }
2936                         return (sbits64) LIT64( 0x8000000000000000 );
2937                 }
2938                 aSigExtra = 0;
2939         }
2940         else {
2941                 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2942         }
2943         return roundAndPackInt64( aSign, aSig, aSigExtra );
2944
2945 }
2946
2947 /*----------------------------------------------------------------------------
2948 | Returns the result of converting the extended double-precision floating-
2949 | point value `a' to the 64-bit two's complement integer format.  The
2950 | conversion is performed according to the IEC/IEEE Standard for Binary
2951 | Floating-Point Arithmetic, except that the conversion is always rounded
2952 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
2953 | Otherwise, if the conversion overflows, the largest integer with the same
2954 | sign as `a' is returned.
2955 *----------------------------------------------------------------------------*/
2956
2957 int64 floatx80_to_int64_round_to_zero( floatx80 a )
2958 {
2959         flag aSign;
2960         int32 aExp, shiftCount;
2961         bits64 aSig;
2962         int64 z;
2963
2964         aSig = extractFloatx80Frac( a );
2965         aExp = extractFloatx80Exp( a );
2966         aSign = extractFloatx80Sign( a );
2967         shiftCount = aExp - 0x403E;
2968         if ( 0 <= shiftCount ) {
2969                 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
2970                 if ( ( a.high != 0xC03E ) || aSig ) {
2971                         float_raise( float_flag_invalid );
2972                         if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
2973                                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2974                         }
2975                 }
2976                 return (sbits64) LIT64( 0x8000000000000000 );
2977         }
2978         else if ( aExp < 0x3FFF ) {
2979                 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2980                 return 0;
2981         }
2982         z = aSig>>( - shiftCount );
2983         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2984                 float_exception_flags |= float_flag_inexact;
2985         }
2986         if ( aSign ) z = - z;
2987         return z;
2988
2989 }
2990
2991 /*----------------------------------------------------------------------------
2992 | Returns the result of converting the extended double-precision floating-
2993 | point value `a' to the single-precision floating-point format.  The
2994 | conversion is performed according to the IEC/IEEE Standard for Binary
2995 | Floating-Point Arithmetic.
2996 *----------------------------------------------------------------------------*/
2997
2998 float32 floatx80_to_float32( floatx80 a )
2999 {
3000         flag aSign;
3001         int32 aExp;
3002         bits64 aSig;
3003
3004         aSig = extractFloatx80Frac( a );
3005         aExp = extractFloatx80Exp( a );
3006         aSign = extractFloatx80Sign( a );
3007         if ( aExp == 0x7FFF ) {
3008                 if ( (bits64) ( aSig<<1 ) ) {
3009                         return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3010                 }
3011                 return packFloat32( aSign, 0xFF, 0 );
3012         }
3013         shift64RightJamming( aSig, 33, &aSig );
3014         if ( aExp || aSig ) aExp -= 0x3F81;
3015         return roundAndPackFloat32( aSign, aExp, aSig );
3016
3017 }
3018
3019 /*----------------------------------------------------------------------------
3020 | Returns the result of converting the extended double-precision floating-
3021 | point value `a' to the double-precision floating-point format.  The
3022 | conversion is performed according to the IEC/IEEE Standard for Binary
3023 | Floating-Point Arithmetic.
3024 *----------------------------------------------------------------------------*/
3025
3026 float64 floatx80_to_float64( floatx80 a )
3027 {
3028         flag aSign;
3029         int32 aExp;
3030         bits64 aSig, zSig;
3031
3032         aSig = extractFloatx80Frac( a );
3033         aExp = extractFloatx80Exp( a );
3034         aSign = extractFloatx80Sign( a );
3035         if ( aExp == 0x7FFF ) {
3036                 if ( (bits64) ( aSig<<1 ) ) {
3037                         return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3038                 }
3039                 return packFloat64( aSign, 0x7FF, 0 );
3040         }
3041         shift64RightJamming( aSig, 1, &zSig );
3042         if ( aExp || aSig ) aExp -= 0x3C01;
3043         return roundAndPackFloat64( aSign, aExp, zSig );
3044
3045 }
3046
3047 #ifdef FLOAT128
3048
3049 /*----------------------------------------------------------------------------
3050 | Returns the result of converting the extended double-precision floating-
3051 | point value `a' to the quadruple-precision floating-point format.  The
3052 | conversion is performed according to the IEC/IEEE Standard for Binary
3053 | Floating-Point Arithmetic.
3054 *----------------------------------------------------------------------------*/
3055
3056 float128 floatx80_to_float128( floatx80 a )
3057 {
3058         flag aSign;
3059         int16 aExp;
3060         bits64 aSig, zSig0, zSig1;
3061
3062         aSig = extractFloatx80Frac( a );
3063         aExp = extractFloatx80Exp( a );
3064         aSign = extractFloatx80Sign( a );
3065         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3066                 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3067         }
3068         shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3069         return packFloat128( aSign, aExp, zSig0, zSig1 );
3070
3071 }
3072
3073 #endif
3074
3075 /*----------------------------------------------------------------------------
3076 | Rounds the extended double-precision floating-point value `a' to an integer,
3077 | and returns the result as an extended quadruple-precision floating-point
3078 | value.  The operation is performed according to the IEC/IEEE Standard for
3079 | Binary Floating-Point Arithmetic.
3080 *----------------------------------------------------------------------------*/
3081
3082 floatx80 floatx80_round_to_int( floatx80 a )
3083 {
3084         flag aSign;
3085         int32 aExp;
3086         bits64 lastBitMask, roundBitsMask;
3087         int8 roundingMode;
3088         floatx80 z;
3089
3090         aExp = extractFloatx80Exp( a );
3091         if ( 0x403E <= aExp ) {
3092                 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3093                         return propagateFloatx80NaN( a, a );
3094                 }
3095                 return a;
3096         }
3097         if ( aExp < 0x3FFF ) {
3098                 if (    ( aExp == 0 )
3099                                 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3100                         return a;
3101                 }
3102                 float_exception_flags |= float_flag_inexact;
3103                 aSign = extractFloatx80Sign( a );
3104                 switch ( float_rounding_mode ) {
3105                         case float_round_nearest_even:
3106                         if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3107                                 ) {
3108                                 return
3109                                         packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3110                         }
3111                         break;
3112                         case float_round_down:
3113                         return
3114                                         aSign ?
3115                                                 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3116                                 : packFloatx80( 0, 0, 0 );
3117                         case float_round_up:
3118                         return
3119                                         aSign ? packFloatx80( 1, 0, 0 )
3120                                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3121                 }
3122                 return packFloatx80( aSign, 0, 0 );
3123         }
3124         lastBitMask = 1;
3125         lastBitMask <<= 0x403E - aExp;
3126         roundBitsMask = lastBitMask - 1;
3127         z = a;
3128         roundingMode = float_rounding_mode;
3129         if ( roundingMode == float_round_nearest_even ) {
3130                 z.low += lastBitMask>>1;
3131                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3132         }
3133         else if ( roundingMode != float_round_to_zero ) {
3134                 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3135                         z.low += roundBitsMask;
3136                 }
3137         }
3138         z.low &= ~ roundBitsMask;
3139         if ( z.low == 0 ) {
3140                 ++z.high;
3141                 z.low = LIT64( 0x8000000000000000 );
3142         }
3143         if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3144         return z;
3145
3146 }
3147
3148 /*----------------------------------------------------------------------------
3149 | Returns the result of adding the absolute values of the extended double-
3150 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3151 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3152 | The addition is performed according to the IEC/IEEE Standard for Binary
3153 | Floating-Point Arithmetic.
3154 *----------------------------------------------------------------------------*/
3155
3156 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3157 {
3158         int32 aExp, bExp, zExp;
3159         bits64 aSig, bSig, zSig0, zSig1;
3160         int32 expDiff;
3161
3162         aSig = extractFloatx80Frac( a );
3163         aExp = extractFloatx80Exp( a );
3164         bSig = extractFloatx80Frac( b );
3165         bExp = extractFloatx80Exp( b );
3166         expDiff = aExp - bExp;
3167         if ( 0 < expDiff ) {
3168                 if ( aExp == 0x7FFF ) {
3169                         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3170                         return a;
3171                 }
3172                 if ( bExp == 0 ) --expDiff;
3173                 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3174                 zExp = aExp;
3175         }
3176         else if ( expDiff < 0 ) {
3177                 if ( bExp == 0x7FFF ) {
3178                         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3179                         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3180                 }
3181                 if ( aExp == 0 ) ++expDiff;
3182                 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3183                 zExp = bExp;
3184         }
3185         else {
3186                 if ( aExp == 0x7FFF ) {
3187                         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3188                                 return propagateFloatx80NaN( a, b );
3189                         }
3190                         return a;
3191                 }
3192                 zSig1 = 0;
3193                 zSig0 = aSig + bSig;
3194                 if ( aExp == 0 ) {
3195                         normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3196                         goto roundAndPack;
3197                 }
3198                 zExp = aExp;
3199                 goto shiftRight1;
3200         }
3201         zSig0 = aSig + bSig;
3202         if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3203         shiftRight1:
3204         shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3205         zSig0 |= LIT64( 0x8000000000000000 );
3206         ++zExp;
3207         roundAndPack:
3208         return
3209                 roundAndPackFloatx80(
3210                         floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3211
3212 }
3213
3214 /*----------------------------------------------------------------------------
3215 | Returns the result of subtracting the absolute values of the extended
3216 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3217 | difference is negated before being returned.  `zSign' is ignored if the
3218 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3219 | Standard for Binary Floating-Point Arithmetic.
3220 *----------------------------------------------------------------------------*/
3221
3222 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3223 {
3224         int32 aExp, bExp, zExp;
3225         bits64 aSig, bSig, zSig0, zSig1;
3226         int32 expDiff;
3227         floatx80 z;
3228
3229         aSig = extractFloatx80Frac( a );
3230         aExp = extractFloatx80Exp( a );
3231         bSig = extractFloatx80Frac( b );
3232         bExp = extractFloatx80Exp( b );
3233         expDiff = aExp - bExp;
3234         if ( 0 < expDiff ) goto aExpBigger;
3235         if ( expDiff < 0 ) goto bExpBigger;
3236         if ( aExp == 0x7FFF ) {
3237                 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3238                         return propagateFloatx80NaN( a, b );
3239                 }
3240                 float_raise( float_flag_invalid );
3241                 z.low = floatx80_default_nan_low;
3242                 z.high = floatx80_default_nan_high;
3243                 return z;
3244         }
3245         if ( aExp == 0 ) {
3246                 aExp = 1;
3247                 bExp = 1;
3248         }
3249         zSig1 = 0;
3250         if ( bSig < aSig ) goto aBigger;
3251         if ( aSig < bSig ) goto bBigger;
3252         return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3253         bExpBigger:
3254         if ( bExp == 0x7FFF ) {
3255                 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3256                 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3257         }
3258         if ( aExp == 0 ) ++expDiff;
3259         shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3260         bBigger:
3261         sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3262         zExp = bExp;
3263         zSign ^= 1;
3264         goto normalizeRoundAndPack;
3265         aExpBigger:
3266         if ( aExp == 0x7FFF ) {
3267                 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3268                 return a;
3269         }
3270         if ( bExp == 0 ) --expDiff;
3271         shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3272         aBigger:
3273         sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3274         zExp = aExp;
3275         normalizeRoundAndPack:
3276         return
3277                 normalizeRoundAndPackFloatx80(
3278                         floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3279
3280 }
3281
3282 /*----------------------------------------------------------------------------
3283 | Returns the result of adding the extended double-precision floating-point
3284 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3285 | Standard for Binary Floating-Point Arithmetic.
3286 *----------------------------------------------------------------------------*/
3287
3288 floatx80 floatx80_add( floatx80 a, floatx80 b )
3289 {
3290         flag aSign, bSign;
3291
3292         aSign = extractFloatx80Sign( a );
3293         bSign = extractFloatx80Sign( b );
3294         if ( aSign == bSign ) {
3295                 return addFloatx80Sigs( a, b, aSign );
3296         }
3297         else {
3298                 return subFloatx80Sigs( a, b, aSign );
3299         }
3300
3301 }
3302
3303 /*----------------------------------------------------------------------------
3304 | Returns the result of subtracting the extended double-precision floating-
3305 | point values `a' and `b'.  The operation is performed according to the
3306 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3307 *----------------------------------------------------------------------------*/
3308
3309 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3310 {
3311         flag aSign, bSign;
3312
3313         aSign = extractFloatx80Sign( a );
3314         bSign = extractFloatx80Sign( b );
3315         if ( aSign == bSign ) {
3316                 return subFloatx80Sigs( a, b, aSign );
3317         }
3318         else {
3319                 return addFloatx80Sigs( a, b, aSign );
3320         }
3321
3322 }
3323
3324 /*----------------------------------------------------------------------------
3325 | Returns the result of multiplying the extended double-precision floating-
3326 | point values `a' and `b'.  The operation is performed according to the
3327 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3328 *----------------------------------------------------------------------------*/
3329
3330 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3331 {
3332         flag aSign, bSign, zSign;
3333         int32 aExp, bExp, zExp;
3334         bits64 aSig, bSig, zSig0, zSig1;
3335         floatx80 z;
3336
3337         aSig = extractFloatx80Frac( a );
3338         aExp = extractFloatx80Exp( a );
3339         aSign = extractFloatx80Sign( a );
3340         bSig = extractFloatx80Frac( b );
3341         bExp = extractFloatx80Exp( b );
3342         bSign = extractFloatx80Sign( b );
3343         zSign = aSign ^ bSign;
3344         if ( aExp == 0x7FFF ) {
3345                 if (    (bits64) ( aSig<<1 )
3346                                 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3347                         return propagateFloatx80NaN( a, b );
3348                 }
3349                 if ( ( bExp | bSig ) == 0 ) goto invalid;
3350                 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3351         }
3352         if ( bExp == 0x7FFF ) {
3353                 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3354                 if ( ( aExp | aSig ) == 0 ) {
3355         invalid:
3356                         float_raise( float_flag_invalid );
3357                         z.low = floatx80_default_nan_low;
3358                         z.high = floatx80_default_nan_high;
3359                         return z;
3360                 }
3361                 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3362         }
3363         if ( aExp == 0 ) {
3364                 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3365                 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3366         }
3367         if ( bExp == 0 ) {
3368                 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3369                 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3370         }
3371         zExp = aExp + bExp - 0x3FFE;
3372         mul64To128( aSig, bSig, &zSig0, &zSig1 );
3373         if ( 0 < (sbits64) zSig0 ) {
3374                 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3375                 --zExp;
3376         }
3377         return
3378                 roundAndPackFloatx80(
3379                         floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3380
3381 }
3382
3383 /*----------------------------------------------------------------------------
3384 | Returns the result of dividing the extended double-precision floating-point
3385 | value `a' by the corresponding value `b'.  The operation is performed
3386 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3387 *----------------------------------------------------------------------------*/
3388
3389 floatx80 floatx80_div( floatx80 a, floatx80 b )
3390 {
3391         flag aSign, bSign, zSign;
3392         int32 aExp, bExp, zExp;
3393         bits64 aSig, bSig, zSig0, zSig1;
3394         bits64 rem0, rem1, rem2, term0, term1, term2;
3395         floatx80 z;
3396
3397         aSig = extractFloatx80Frac( a );
3398         aExp = extractFloatx80Exp( a );
3399         aSign = extractFloatx80Sign( a );
3400         bSig = extractFloatx80Frac( b );
3401         bExp = extractFloatx80Exp( b );
3402         bSign = extractFloatx80Sign( b );
3403         zSign = aSign ^ bSign;
3404         if ( aExp == 0x7FFF ) {
3405                 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3406                 if ( bExp == 0x7FFF ) {
3407                         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3408                         goto invalid;
3409                 }
3410                 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3411         }
3412         if ( bExp == 0x7FFF ) {
3413                 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3414                 return packFloatx80( zSign, 0, 0 );
3415         }
3416         if ( bExp == 0 ) {
3417                 if ( bSig == 0 ) {
3418                         if ( ( aExp | aSig ) == 0 ) {
3419         invalid:
3420                                 float_raise( float_flag_invalid );
3421                                 z.low = floatx80_default_nan_low;
3422                                 z.high = floatx80_default_nan_high;
3423                                 return z;
3424                         }
3425                         float_raise( float_flag_divbyzero );
3426                         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3427                 }
3428                 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3429         }
3430         if ( aExp == 0 ) {
3431                 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3432                 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3433         }
3434         zExp = aExp - bExp + 0x3FFE;
3435         rem1 = 0;
3436         if ( bSig <= aSig ) {
3437                 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3438                 ++zExp;
3439         }
3440         zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3441         mul64To128( bSig, zSig0, &term0, &term1 );
3442         sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3443         while ( (sbits64) rem0 < 0 ) {
3444                 --zSig0;
3445                 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3446         }
3447         zSig1 = estimateDiv128To64( rem1, 0, bSig );
3448         if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3449                 mul64To128( bSig, zSig1, &term1, &term2 );
3450                 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3451                 while ( (sbits64) rem1 < 0 ) {
3452                         --zSig1;
3453                         add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3454                 }
3455                 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3456         }
3457         return
3458                 roundAndPackFloatx80(
3459                         floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3460
3461 }
3462
3463 /*----------------------------------------------------------------------------
3464 | Returns the remainder of the extended double-precision floating-point value
3465 | `a' with respect to the corresponding value `b'.  The operation is performed
3466 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3467 *----------------------------------------------------------------------------*/
3468
3469 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3470 {
3471         flag aSign, zSign;
3472         int32 aExp, bExp, expDiff;
3473         bits64 aSig0, aSig1, bSig;
3474         bits64 q, term0, term1, alternateASig0, alternateASig1;
3475         floatx80 z;
3476
3477         aSig0 = extractFloatx80Frac( a );
3478         aExp = extractFloatx80Exp( a );
3479         aSign = extractFloatx80Sign( a );
3480         bSig = extractFloatx80Frac( b );
3481         bExp = extractFloatx80Exp( b );
3482 //    bSign = extractFloatx80Sign( b );
3483         if ( aExp == 0x7FFF ) {
3484                 if (    (bits64) ( aSig0<<1 )
3485                                 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3486                         return propagateFloatx80NaN( a, b );
3487                 }
3488                 goto invalid;
3489         }
3490         if ( bExp == 0x7FFF ) {
3491                 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3492                 return a;
3493         }
3494         if ( bExp == 0 ) {
3495                 if ( bSig == 0 ) {
3496         invalid:
3497                         float_raise( float_flag_invalid );
3498                         z.low = floatx80_default_nan_low;
3499                         z.high = floatx80_default_nan_high;
3500                         return z;
3501                 }
3502                 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3503         }
3504         if ( aExp == 0 ) {
3505                 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3506                 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3507         }
3508         bSig |= LIT64( 0x8000000000000000 );
3509         zSign = aSign;
3510         expDiff = aExp - bExp;
3511         aSig1 = 0;
3512         if ( expDiff < 0 ) {
3513                 if ( expDiff < -1 ) return a;
3514                 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3515                 expDiff = 0;
3516         }
3517         q = ( bSig <= aSig0 );
3518         if ( q ) aSig0 -= bSig;
3519         expDiff -= 64;
3520         while ( 0 < expDiff ) {
3521                 q = estimateDiv128To64( aSig0, aSig1, bSig );
3522                 q = ( 2 < q ) ? q - 2 : 0;
3523                 mul64To128( bSig, q, &term0, &term1 );
3524                 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3525                 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3526                 expDiff -= 62;
3527         }
3528         expDiff += 64;
3529         if ( 0 < expDiff ) {
3530                 q = estimateDiv128To64( aSig0, aSig1, bSig );
3531                 q = ( 2 < q ) ? q - 2 : 0;
3532                 q >>= 64 - expDiff;
3533                 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3534                 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3535                 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3536                 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3537                         ++q;
3538                         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3539                 }
3540         }
3541         else {
3542                 term1 = 0;
3543                 term0 = bSig;
3544         }
3545         sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3546         if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3547                         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3548                                 && ( q & 1 ) )
3549                 ) {
3550                 aSig0 = alternateASig0;
3551                 aSig1 = alternateASig1;
3552                 zSign = ! zSign;
3553         }
3554         return
3555                 normalizeRoundAndPackFloatx80(
3556                         80, zSign, bExp + expDiff, aSig0, aSig1 );
3557
3558 }
3559
3560 /*----------------------------------------------------------------------------
3561 | Returns the square root of the extended double-precision floating-point
3562 | value `a'.  The operation is performed according to the IEC/IEEE Standard
3563 | for Binary Floating-Point Arithmetic.
3564 *----------------------------------------------------------------------------*/
3565
3566 floatx80 floatx80_sqrt( floatx80 a )
3567 {
3568         flag aSign;
3569         int32 aExp, zExp;
3570         bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3571         bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3572         floatx80 z;
3573
3574         aSig0 = extractFloatx80Frac( a );
3575         aExp = extractFloatx80Exp( a );
3576         aSign = extractFloatx80Sign( a );
3577         if ( aExp == 0x7FFF ) {
3578                 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3579                 if ( ! aSign ) return a;
3580                 goto invalid;
3581         }
3582         if ( aSign ) {
3583                 if ( ( aExp | aSig0 ) == 0 ) return a;
3584         invalid:
3585                 float_raise( float_flag_invalid );
3586                 z.low = floatx80_default_nan_low;
3587                 z.high = floatx80_default_nan_high;
3588                 return z;
3589         }
3590         if ( aExp == 0 ) {
3591                 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3592                 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3593         }
3594         zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3595         zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3596         shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3597         zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3598         doubleZSig0 = zSig0<<1;
3599         mul64To128( zSig0, zSig0, &term0, &term1 );
3600         sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3601         while ( (sbits64) rem0 < 0 ) {
3602                 --zSig0;
3603                 doubleZSig0 -= 2;
3604                 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3605         }
3606         zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3607         if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3608                 if ( zSig1 == 0 ) zSig1 = 1;
3609                 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3610                 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3611                 mul64To128( zSig1, zSig1, &term2, &term3 );
3612                 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3613                 while ( (sbits64) rem1 < 0 ) {
3614                         --zSig1;
3615                         shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3616                         term3 |= 1;
3617                         term2 |= doubleZSig0;
3618                         add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3619                 }
3620                 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3621         }
3622         shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3623         zSig0 |= doubleZSig0;
3624         return
3625                 roundAndPackFloatx80(
3626                         floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3627
3628 }
3629
3630 /*----------------------------------------------------------------------------
3631 | Returns 1 if the extended double-precision floating-point value `a' is
3632 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3633 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3634 | Arithmetic.
3635 *----------------------------------------------------------------------------*/
3636
3637 flag floatx80_eq( floatx80 a, floatx80 b )
3638 {
3639         if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3640                                 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3641                         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3642                                 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3643                 ) {
3644                 if (    floatx80_is_signaling_nan( a )
3645                                 || floatx80_is_signaling_nan( b ) ) {
3646                         float_raise( float_flag_invalid );
3647                 }
3648                 return 0;
3649         }
3650         return
3651                         ( a.low == b.low )
3652                 && (    ( a.high == b.high )
3653                                 || (    ( a.low == 0 )
3654                                         && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3655                         );
3656
3657 }
3658
3659 /*----------------------------------------------------------------------------
3660 | Returns 1 if the extended double-precision floating-point value `a' is
3661 | less than or equal to the corresponding value `b', and 0 otherwise.  The
3662 | comparison is performed according to the IEC/IEEE Standard for Binary
3663 | Floating-Point Arithmetic.
3664 *----------------------------------------------------------------------------*/
3665
3666 flag floatx80_le( floatx80 a, floatx80 b )
3667 {
3668         flag aSign, bSign;
3669
3670         if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3671                                 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3672                         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3673                                 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3674                 ) {
3675                 float_raise( float_flag_invalid );
3676                 return 0;
3677         }
3678         aSign = extractFloatx80Sign( a );
3679         bSign = extractFloatx80Sign( b );
3680         if ( aSign != bSign ) {
3681                 return
3682                                 aSign
3683                         || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3684                                         == 0 );
3685         }
3686         return
3687                         aSign ? le128( b.high, b.low, a.high, a.low )
3688                 : le128( a.high, a.low, b.high, b.low );
3689
3690 }
3691
3692 /*----------------------------------------------------------------------------
3693 | Returns 1 if the extended double-precision floating-point value `a' is
3694 | less than the corresponding value `b', and 0 otherwise.  The comparison
3695 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3696 | Arithmetic.
3697 *----------------------------------------------------------------------------*/
3698
3699 flag floatx80_lt( floatx80 a, floatx80 b )
3700 {
3701         flag aSign, bSign;
3702
3703         if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3704                                 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3705                         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3706                                 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3707                 ) {
3708                 float_raise( float_flag_invalid );
3709                 return 0;
3710         }
3711         aSign = extractFloatx80Sign( a );
3712         bSign = extractFloatx80Sign( b );
3713         if ( aSign != bSign ) {
3714                 return
3715                                 aSign
3716                         && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3717                                         != 0 );
3718         }
3719         return
3720                         aSign ? lt128( b.high, b.low, a.high, a.low )
3721                 : lt128( a.high, a.low, b.high, b.low );
3722
3723 }
3724
3725 /*----------------------------------------------------------------------------
3726 | Returns 1 if the extended double-precision floating-point value `a' is equal
3727 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
3728 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3729 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3730 *----------------------------------------------------------------------------*/
3731
3732 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3733 {
3734         if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3735                                 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3736                         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3737                                 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3738                 ) {
3739                 float_raise( float_flag_invalid );
3740                 return 0;
3741         }
3742         return
3743                         ( a.low == b.low )
3744                 && (    ( a.high == b.high )
3745                                 || (    ( a.low == 0 )
3746                                         && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3747                         );
3748
3749 }
3750
3751 /*----------------------------------------------------------------------------
3752 | Returns 1 if the extended double-precision floating-point value `a' is less
3753 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3754 | do not cause an exception.  Otherwise, the comparison is performed according
3755 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3756 *----------------------------------------------------------------------------*/
3757
3758 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3759 {
3760         flag aSign, bSign;
3761
3762         if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3763                                 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3764                         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3765                                 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3766                 ) {
3767                 if (    floatx80_is_signaling_nan( a )
3768                                 || floatx80_is_signaling_nan( b ) ) {
3769                         float_raise( float_flag_invalid );
3770                 }
3771                 return 0;
3772         }
3773         aSign = extractFloatx80Sign( a );
3774         bSign = extractFloatx80Sign( b );
3775         if ( aSign != bSign ) {
3776                 return
3777                                 aSign
3778                         || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3779                                         == 0 );
3780         }
3781         return
3782                         aSign ? le128( b.high, b.low, a.high, a.low )
3783                 : le128( a.high, a.low, b.high, b.low );
3784
3785 }
3786
3787 /*----------------------------------------------------------------------------
3788 | Returns 1 if the extended double-precision floating-point value `a' is less
3789 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3790 | an exception.  Otherwise, the comparison is performed according to the
3791 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3792 *----------------------------------------------------------------------------*/
3793
3794 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3795 {
3796         flag aSign, bSign;
3797
3798         if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3799                                 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3800                         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3801                                 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3802                 ) {
3803                 if (    floatx80_is_signaling_nan( a )
3804                                 || floatx80_is_signaling_nan( b ) ) {
3805                         float_raise( float_flag_invalid );
3806                 }
3807                 return 0;
3808         }
3809         aSign = extractFloatx80Sign( a );
3810         bSign = extractFloatx80Sign( b );
3811         if ( aSign != bSign ) {
3812                 return
3813                                 aSign
3814                         && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3815                                         != 0 );
3816         }
3817         return
3818                         aSign ? lt128( b.high, b.low, a.high, a.low )
3819                 : lt128( a.high, a.low, b.high, b.low );
3820
3821 }
3822
3823 #endif
3824
3825 #ifdef FLOAT128
3826
3827 /*----------------------------------------------------------------------------
3828 | Returns the result of converting the quadruple-precision floating-point
3829 | value `a' to the 32-bit two's complement integer format.  The conversion
3830 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3831 | Arithmetic---which means in particular that the conversion is rounded
3832 | according to the current rounding mode.  If `a' is a NaN, the largest
3833 | positive integer is returned.  Otherwise, if the conversion overflows, the
3834 | largest integer with the same sign as `a' is returned.
3835 *----------------------------------------------------------------------------*/
3836
3837 int32 float128_to_int32( float128 a )
3838 {
3839         flag aSign;
3840         int32 aExp, shiftCount;
3841         bits64 aSig0, aSig1;
3842
3843         aSig1 = extractFloat128Frac1( a );
3844         aSig0 = extractFloat128Frac0( a );
3845         aExp = extractFloat128Exp( a );
3846         aSign = extractFloat128Sign( a );
3847         if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
3848         if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3849         aSig0 |= ( aSig1 != 0 );
3850         shiftCount = 0x4028 - aExp;
3851         if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
3852         return roundAndPackInt32( aSign, aSig0 );
3853
3854 }
3855
3856 /*----------------------------------------------------------------------------
3857 | Returns the result of converting the quadruple-precision floating-point
3858 | value `a' to the 32-bit two's complement integer format.  The conversion
3859 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3860 | Arithmetic, except that the conversion is always rounded toward zero.  If
3861 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
3862 | conversion overflows, the largest integer with the same sign as `a' is
3863 | returned.
3864 *----------------------------------------------------------------------------*/
3865
3866 int32 float128_to_int32_round_to_zero( float128 a )
3867 {
3868         flag aSign;
3869         int32 aExp, shiftCount;
3870         bits64 aSig0, aSig1, savedASig;
3871         int32 z;
3872
3873         aSig1 = extractFloat128Frac1( a );
3874         aSig0 = extractFloat128Frac0( a );
3875         aExp = extractFloat128Exp( a );
3876         aSign = extractFloat128Sign( a );
3877         aSig0 |= ( aSig1 != 0 );
3878         if ( 0x401E < aExp ) {
3879                 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
3880                 goto invalid;
3881         }
3882         else if ( aExp < 0x3FFF ) {
3883                 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
3884                 return 0;
3885         }
3886         aSig0 |= LIT64( 0x0001000000000000 );
3887         shiftCount = 0x402F - aExp;
3888         savedASig = aSig0;
3889         aSig0 >>= shiftCount;
3890         z = aSig0;
3891         if ( aSign ) z = - z;
3892         if ( ( z < 0 ) ^ aSign ) {
3893         invalid:
3894                 float_raise( float_flag_invalid );
3895                 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3896         }
3897         if ( ( aSig0<<shiftCount ) != savedASig ) {
3898                 float_exception_flags |= float_flag_inexact;
3899         }
3900         return z;
3901
3902 }
3903
3904 /*----------------------------------------------------------------------------
3905 | Returns the result of converting the quadruple-precision floating-point
3906 | value `a' to the 64-bit two's complement integer format.  The conversion
3907 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3908 | Arithmetic---which means in particular that the conversion is rounded
3909 | according to the current rounding mode.  If `a' is a NaN, the largest
3910 | positive integer is returned.  Otherwise, if the conversion overflows, the
3911 | largest integer with the same sign as `a' is returned.
3912 *----------------------------------------------------------------------------*/
3913
3914 int64 float128_to_int64( float128 a )
3915 {
3916         flag aSign;
3917         int32 aExp, shiftCount;
3918         bits64 aSig0, aSig1;
3919
3920         aSig1 = extractFloat128Frac1( a );
3921         aSig0 = extractFloat128Frac0( a );
3922         aExp = extractFloat128Exp( a );
3923         aSign = extractFloat128Sign( a );
3924         if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3925         shiftCount = 0x402F - aExp;
3926         if ( shiftCount <= 0 ) {
3927                 if ( 0x403E < aExp ) {
3928                         float_raise( float_flag_invalid );
3929                         if (    ! aSign
3930                                         || (    ( aExp == 0x7FFF )
3931                                                 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
3932                                         )
3933                                 ) {
3934                                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3935                         }
3936                         return (sbits64) LIT64( 0x8000000000000000 );
3937                 }
3938                 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
3939         }
3940         else {
3941                 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
3942         }
3943         return roundAndPackInt64( aSign, aSig0, aSig1 );
3944
3945 }
3946
3947 /*----------------------------------------------------------------------------
3948 | Returns the result of converting the quadruple-precision floating-point
3949 | value `a' to the 64-bit two's complement integer format.  The conversion
3950 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3951 | Arithmetic, except that the conversion is always rounded toward zero.
3952 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3953 | the conversion overflows, the largest integer with the same sign as `a' is
3954 | returned.
3955 *----------------------------------------------------------------------------*/
3956
3957 int64 float128_to_int64_round_to_zero( float128 a )
3958 {
3959         flag aSign;
3960         int32 aExp, shiftCount;
3961         bits64 aSig0, aSig1;
3962         int64 z;
3963
3964         aSig1 = extractFloat128Frac1( a );
3965         aSig0 = extractFloat128Frac0( a );
3966         aExp = extractFloat128Exp( a );
3967         aSign = extractFloat128Sign( a );
3968         if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3969         shiftCount = aExp - 0x402F;
3970         if ( 0 < shiftCount ) {
3971                 if ( 0x403E <= aExp ) {
3972                         aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
3973                         if (    ( a.high == LIT64( 0xC03E000000000000 ) )
3974                                         && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
3975                                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
3976                         }
3977                         else {
3978                                 float_raise( float_flag_invalid );
3979                                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
3980                                         return LIT64( 0x7FFFFFFFFFFFFFFF );
3981                                 }
3982                         }
3983                         return (sbits64) LIT64( 0x8000000000000000 );
3984                 }
3985                 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
3986                 if ( (bits64) ( aSig1<<shiftCount ) ) {
3987                         float_exception_flags |= float_flag_inexact;
3988                 }
3989         }
3990         else {
3991                 if ( aExp < 0x3FFF ) {
3992                         if ( aExp | aSig0 | aSig1 ) {
3993                                 float_exception_flags |= float_flag_inexact;
3994                         }
3995                         return 0;
3996                 }
3997                 z = aSig0>>( - shiftCount );
3998                 if (    aSig1
3999                                 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4000                         float_exception_flags |= float_flag_inexact;
4001                 }
4002         }
4003         if ( aSign ) z = - z;
4004         return z;
4005
4006 }
4007
4008 /*----------------------------------------------------------------------------
4009 | Returns the result of converting the quadruple-precision floating-point
4010 | value `a' to the single-precision floating-point format.  The conversion
4011 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4012 | Arithmetic.
4013 *----------------------------------------------------------------------------*/
4014
4015 float32 float128_to_float32( float128 a )
4016 {
4017         flag aSign;
4018         int32 aExp;
4019         bits64 aSig0, aSig1;
4020         bits32 zSig;
4021
4022         aSig1 = extractFloat128Frac1( a );
4023         aSig0 = extractFloat128Frac0( a );
4024         aExp = extractFloat128Exp( a );
4025         aSign = extractFloat128Sign( a );
4026         if ( aExp == 0x7FFF ) {
4027                 if ( aSig0 | aSig1 ) {
4028                         return commonNaNToFloat32( float128ToCommonNaN( a ) );
4029                 }
4030                 return packFloat32( aSign, 0xFF, 0 );
4031         }
4032         aSig0 |= ( aSig1 != 0 );
4033         shift64RightJamming( aSig0, 18, &aSig0 );
4034         zSig = aSig0;
4035         if ( aExp || zSig ) {
4036                 zSig |= 0x40000000;
4037                 aExp -= 0x3F81;
4038         }
4039         return roundAndPackFloat32( aSign, aExp, zSig );
4040
4041 }
4042
4043 /*----------------------------------------------------------------------------
4044 | Returns the result of converting the quadruple-precision floating-point
4045 | value `a' to the double-precision floating-point format.  The conversion
4046 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4047 | Arithmetic.
4048 *----------------------------------------------------------------------------*/
4049
4050 float64 float128_to_float64( float128 a )
4051 {
4052         flag aSign;
4053         int32 aExp;
4054         bits64 aSig0, aSig1;
4055
4056         aSig1 = extractFloat128Frac1( a );
4057         aSig0 = extractFloat128Frac0( a );
4058         aExp = extractFloat128Exp( a );
4059         aSign = extractFloat128Sign( a );
4060         if ( aExp == 0x7FFF ) {
4061                 if ( aSig0 | aSig1 ) {
4062                         return commonNaNToFloat64( float128ToCommonNaN( a ) );
4063                 }
4064                 return packFloat64( aSign, 0x7FF, 0 );
4065         }
4066         shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4067         aSig0 |= ( aSig1 != 0 );
4068         if ( aExp || aSig0 ) {
4069                 aSig0 |= LIT64( 0x4000000000000000 );
4070                 aExp -= 0x3C01;
4071         }
4072         return roundAndPackFloat64( aSign, aExp, aSig0 );
4073
4074 }
4075
4076 #ifdef FLOATX80
4077
4078 /*----------------------------------------------------------------------------
4079 | Returns the result of converting the quadruple-precision floating-point
4080 | value `a' to the extended double-precision floating-point format.  The
4081 | conversion is performed according to the IEC/IEEE Standard for Binary
4082 | Floating-Point Arithmetic.
4083 *----------------------------------------------------------------------------*/
4084
4085 floatx80 float128_to_floatx80( float128 a )
4086 {
4087         flag aSign;
4088         int32 aExp;
4089         bits64 aSig0, aSig1;
4090
4091         aSig1 = extractFloat128Frac1( a );
4092         aSig0 = extractFloat128Frac0( a );
4093         aExp = extractFloat128Exp( a );
4094         aSign = extractFloat128Sign( a );
4095         if ( aExp == 0x7FFF ) {
4096                 if ( aSig0 | aSig1 ) {
4097                         return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4098                 }
4099                 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4100         }
4101         if ( aExp == 0 ) {
4102                 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4103                 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4104         }
4105         else {
4106                 aSig0 |= LIT64( 0x0001000000000000 );
4107         }
4108         shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4109         return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4110
4111 }
4112
4113 #endif
4114
4115 /*----------------------------------------------------------------------------
4116 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4117 | returns the result as a quadruple-precision floating-point value.  The
4118 | operation is performed according to the IEC/IEEE Standard for Binary
4119 | Floating-Point Arithmetic.
4120 *----------------------------------------------------------------------------*/
4121
4122 float128 float128_round_to_int( float128 a )
4123 {
4124         flag aSign;
4125         int32 aExp;
4126         bits64 lastBitMask, roundBitsMask;
4127         int8 roundingMode;
4128         float128 z;
4129
4130         aExp = extractFloat128Exp( a );
4131         if ( 0x402F <= aExp ) {
4132                 if ( 0x406F <= aExp ) {
4133                         if (    ( aExp == 0x7FFF )
4134                                         && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4135                                 ) {
4136                                 return propagateFloat128NaN( a, a );
4137                         }
4138                         return a;
4139                 }
4140                 lastBitMask = 1;
4141                 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4142                 roundBitsMask = lastBitMask - 1;
4143                 z = a;
4144                 roundingMode = float_rounding_mode;
4145                 if ( roundingMode == float_round_nearest_even ) {
4146                         if ( lastBitMask ) {
4147                                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4148                                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4149                         }
4150                         else {
4151                                 if ( (sbits64) z.low < 0 ) {
4152                                         ++z.high;
4153                                         if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4154                                 }
4155                         }
4156                 }
4157                 else if ( roundingMode != float_round_to_zero ) {
4158                         if (   extractFloat128Sign( z )
4159                                         ^ ( roundingMode == float_round_up ) ) {
4160                                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4161                         }
4162                 }
4163                 z.low &= ~ roundBitsMask;
4164         }
4165         else {
4166                 if ( aExp < 0x3FFF ) {
4167                         if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4168                         float_exception_flags |= float_flag_inexact;
4169                         aSign = extractFloat128Sign( a );
4170                         switch ( float_rounding_mode ) {
4171                                 case float_round_nearest_even:
4172                                 if (    ( aExp == 0x3FFE )
4173                                                 && (   extractFloat128Frac0( a )
4174                                                         | extractFloat128Frac1( a ) )
4175                                         ) {
4176                                         return packFloat128( aSign, 0x3FFF, 0, 0 );
4177                                 }
4178                                 break;
4179                                 case float_round_down:
4180                                 return
4181                                                 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4182                                         : packFloat128( 0, 0, 0, 0 );
4183                                 case float_round_up:
4184                                 return
4185                                                 aSign ? packFloat128( 1, 0, 0, 0 )
4186                                         : packFloat128( 0, 0x3FFF, 0, 0 );
4187                         }
4188                         return packFloat128( aSign, 0, 0, 0 );
4189                 }
4190                 lastBitMask = 1;
4191                 lastBitMask <<= 0x402F - aExp;
4192                 roundBitsMask = lastBitMask - 1;
4193                 z.low = 0;
4194                 z.high = a.high;
4195                 roundingMode = float_rounding_mode;
4196                 if ( roundingMode == float_round_nearest_even ) {
4197                         z.high += lastBitMask>>1;
4198                         if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4199                                 z.high &= ~ lastBitMask;
4200                         }
4201                 }
4202                 else if ( roundingMode != float_round_to_zero ) {
4203                         if (   extractFloat128Sign( z )
4204                                         ^ ( roundingMode == float_round_up ) ) {
4205                                 z.high |= ( a.low != 0 );
4206                                 z.high += roundBitsMask;
4207                         }
4208                 }
4209                 z.high &= ~ roundBitsMask;
4210         }
4211         if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4212                 float_exception_flags |= float_flag_inexact;
4213         }
4214         return z;
4215
4216 }
4217
4218 /*----------------------------------------------------------------------------
4219 | Returns the result of adding the absolute values of the quadruple-precision
4220 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4221 | before being returned.  `zSign' is ignored if the result is a NaN.
4222 | The addition is performed according to the IEC/IEEE Standard for Binary
4223 | Floating-Point Arithmetic.
4224 *----------------------------------------------------------------------------*/
4225
4226 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4227 {
4228         int32 aExp, bExp, zExp;
4229         bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4230         int32 expDiff;
4231
4232         aSig1 = extractFloat128Frac1( a );
4233         aSig0 = extractFloat128Frac0( a );
4234         aExp = extractFloat128Exp( a );
4235         bSig1 = extractFloat128Frac1( b );
4236         bSig0 = extractFloat128Frac0( b );
4237         bExp = extractFloat128Exp( b );
4238         expDiff = aExp - bExp;
4239         if ( 0 < expDiff ) {
4240                 if ( aExp == 0x7FFF ) {
4241                         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4242                         return a;
4243                 }
4244                 if ( bExp == 0 ) {
4245                         --expDiff;
4246                 }
4247                 else {
4248                         bSig0 |= LIT64( 0x0001000000000000 );
4249                 }
4250                 shift128ExtraRightJamming(
4251                         bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4252                 zExp = aExp;
4253         }
4254         else if ( expDiff < 0 ) {
4255                 if ( bExp == 0x7FFF ) {
4256                         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4257                         return packFloat128( zSign, 0x7FFF, 0, 0 );
4258                 }
4259                 if ( aExp == 0 ) {
4260                         ++expDiff;
4261                 }
4262                 else {
4263                         aSig0 |= LIT64( 0x0001000000000000 );
4264                 }
4265                 shift128ExtraRightJamming(
4266                         aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4267                 zExp = bExp;
4268         }
4269         else {
4270                 if ( aExp == 0x7FFF ) {
4271                         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4272                                 return propagateFloat128NaN( a, b );
4273                         }
4274                         return a;
4275                 }
4276                 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4277                 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4278                 zSig2 = 0;
4279                 zSig0 |= LIT64( 0x0002000000000000 );
4280                 zExp = aExp;
4281                 goto shiftRight1;
4282         }
4283         aSig0 |= LIT64( 0x0001000000000000 );
4284         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4285         --zExp;
4286         if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4287         ++zExp;
4288         shiftRight1:
4289         shift128ExtraRightJamming(
4290                 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4291         roundAndPack:
4292         return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4293
4294 }
4295
4296 /*----------------------------------------------------------------------------
4297 | Returns the result of subtracting the absolute values of the quadruple-
4298 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
4299 | difference is negated before being returned.  `zSign' is ignored if the
4300 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4301 | Standard for Binary Floating-Point Arithmetic.
4302 *----------------------------------------------------------------------------*/
4303
4304 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4305 {
4306         int32 aExp, bExp, zExp;
4307         bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4308         int32 expDiff;
4309         float128 z;
4310
4311         aSig1 = extractFloat128Frac1( a );
4312         aSig0 = extractFloat128Frac0( a );
4313         aExp = extractFloat128Exp( a );
4314         bSig1 = extractFloat128Frac1( b );
4315         bSig0 = extractFloat128Frac0( b );
4316         bExp = extractFloat128Exp( b );
4317         expDiff = aExp - bExp;
4318         shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4319         shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4320         if ( 0 < expDiff ) goto aExpBigger;
4321         if ( expDiff < 0 ) goto bExpBigger;
4322         if ( aExp == 0x7FFF ) {
4323                 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4324                         return propagateFloat128NaN( a, b );
4325                 }
4326                 float_raise( float_flag_invalid );
4327                 z.low = float128_default_nan_low;
4328                 z.high = float128_default_nan_high;
4329                 return z;
4330         }
4331         if ( aExp == 0 ) {
4332                 aExp = 1;
4333                 bExp = 1;
4334         }
4335         if ( bSig0 < aSig0 ) goto aBigger;
4336         if ( aSig0 < bSig0 ) goto bBigger;
4337         if ( bSig1 < aSig1 ) goto aBigger;
4338         if ( aSig1 < bSig1 ) goto bBigger;
4339         return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4340         bExpBigger:
4341         if ( bExp == 0x7FFF ) {
4342                 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4343                 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4344         }
4345         if ( aExp == 0 ) {
4346                 ++expDiff;
4347         }
4348         else {
4349                 aSig0 |= LIT64( 0x4000000000000000 );
4350         }
4351         shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4352         bSig0 |= LIT64( 0x4000000000000000 );
4353         bBigger:
4354         sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4355         zExp = bExp;
4356         zSign ^= 1;
4357         goto normalizeRoundAndPack;
4358         aExpBigger:
4359         if ( aExp == 0x7FFF ) {
4360                 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4361                 return a;
4362         }
4363         if ( bExp == 0 ) {
4364                 --expDiff;
4365         }
4366         else {
4367                 bSig0 |= LIT64( 0x4000000000000000 );
4368         }
4369         shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4370         aSig0 |= LIT64( 0x4000000000000000 );
4371         aBigger:
4372         sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4373         zExp = aExp;
4374         normalizeRoundAndPack:
4375         --zExp;
4376         return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4377
4378 }
4379
4380 /*----------------------------------------------------------------------------
4381 | Returns the result of adding the quadruple-precision floating-point values
4382 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4383 | for Binary Floating-Point Arithmetic.
4384 *----------------------------------------------------------------------------*/
4385
4386 float128 float128_add( float128 a, float128 b )
4387 {
4388         flag aSign, bSign;
4389
4390         aSign = extractFloat128Sign( a );
4391         bSign = extractFloat128Sign( b );
4392         if ( aSign == bSign ) {
4393                 return addFloat128Sigs( a, b, aSign );
4394         }
4395         else {
4396                 return subFloat128Sigs( a, b, aSign );
4397         }
4398
4399 }
4400
4401 /*----------------------------------------------------------------------------
4402 | Returns the result of subtracting the quadruple-precision floating-point
4403 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4404 | Standard for Binary Floating-Point Arithmetic.
4405 *----------------------------------------------------------------------------*/
4406
4407 float128 float128_sub( float128 a, float128 b )
4408 {
4409         flag aSign, bSign;
4410
4411         aSign = extractFloat128Sign( a );
4412         bSign = extractFloat128Sign( b );
4413         if ( aSign == bSign ) {
4414                 return subFloat128Sigs( a, b, aSign );
4415         }
4416         else {
4417                 return addFloat128Sigs( a, b, aSign );
4418         }
4419
4420 }
4421
4422 /*----------------------------------------------------------------------------
4423 | Returns the result of multiplying the quadruple-precision floating-point
4424 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4425 | Standard for Binary Floating-Point Arithmetic.
4426 *----------------------------------------------------------------------------*/
4427
4428 float128 float128_mul( float128 a, float128 b )
4429 {
4430         flag aSign, bSign, zSign;
4431         int32 aExp, bExp, zExp;
4432         bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4433         float128 z;
4434
4435         aSig1 = extractFloat128Frac1( a );
4436         aSig0 = extractFloat128Frac0( a );
4437         aExp = extractFloat128Exp( a );
4438         aSign = extractFloat128Sign( a );
4439         bSig1 = extractFloat128Frac1( b );
4440         bSig0 = extractFloat128Frac0( b );
4441         bExp = extractFloat128Exp( b );
4442         bSign = extractFloat128Sign( b );
4443         zSign = aSign ^ bSign;
4444         if ( aExp == 0x7FFF ) {
4445                 if (    ( aSig0 | aSig1 )
4446                                 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4447                         return propagateFloat128NaN( a, b );
4448                 }
4449                 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4450                 return packFloat128( zSign, 0x7FFF, 0, 0 );
4451         }
4452         if ( bExp == 0x7FFF ) {
4453                 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4454                 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4455         invalid:
4456                         float_raise( float_flag_invalid );
4457                         z.low = float128_default_nan_low;
4458                         z.high = float128_default_nan_high;
4459                         return z;
4460                 }
4461                 return packFloat128( zSign, 0x7FFF, 0, 0 );
4462         }
4463         if ( aExp == 0 ) {
4464                 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4465                 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4466         }
4467         if ( bExp == 0 ) {
4468                 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4469                 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4470         }
4471         zExp = aExp + bExp - 0x4000;
4472         aSig0 |= LIT64( 0x0001000000000000 );
4473         shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4474         mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4475         add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4476         zSig2 |= ( zSig3 != 0 );
4477         if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4478                 shift128ExtraRightJamming(
4479                         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4480                 ++zExp;
4481         }
4482         return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4483
4484 }
4485
4486 /*----------------------------------------------------------------------------
4487 | Returns the result of dividing the quadruple-precision floating-point value
4488 | `a' by the corresponding value `b'.  The operation is performed according to
4489 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4490 *----------------------------------------------------------------------------*/
4491
4492 float128 float128_div( float128 a, float128 b )
4493 {
4494         flag aSign, bSign, zSign;
4495         int32 aExp, bExp, zExp;
4496         bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4497         bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4498         float128 z;
4499
4500         aSig1 = extractFloat128Frac1( a );
4501         aSig0 = extractFloat128Frac0( a );
4502         aExp = extractFloat128Exp( a );
4503         aSign = extractFloat128Sign( a );
4504         bSig1 = extractFloat128Frac1( b );
4505         bSig0 = extractFloat128Frac0( b );
4506         bExp = extractFloat128Exp( b );
4507         bSign = extractFloat128Sign( b );
4508         zSign = aSign ^ bSign;
4509         if ( aExp == 0x7FFF ) {
4510                 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4511                 if ( bExp == 0x7FFF ) {
4512                         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4513                         goto invalid;
4514                 }
4515                 return packFloat128( zSign, 0x7FFF, 0, 0 );
4516         }
4517         if ( bExp == 0x7FFF ) {
4518                 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4519                 return packFloat128( zSign, 0, 0, 0 );
4520         }
4521         if ( bExp == 0 ) {
4522                 if ( ( bSig0 | bSig1 ) == 0 ) {
4523                         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4524         invalid:
4525                                 float_raise( float_flag_invalid );
4526                                 z.low = float128_default_nan_low;
4527                                 z.high = float128_default_nan_high;
4528                                 return z;
4529                         }
4530                         float_raise( float_flag_divbyzero );
4531                         return packFloat128( zSign, 0x7FFF, 0, 0 );
4532                 }
4533                 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4534         }
4535         if ( aExp == 0 ) {
4536                 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4537                 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4538         }
4539         zExp = aExp - bExp + 0x3FFD;
4540         shortShift128Left(
4541                 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4542         shortShift128Left(
4543                 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4544         if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4545                 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4546                 ++zExp;
4547         }
4548         zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4549         mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4550         sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4551         while ( (sbits64) rem0 < 0 ) {
4552                 --zSig0;
4553                 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4554         }
4555         zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4556         if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4557                 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4558                 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4559                 while ( (sbits64) rem1 < 0 ) {
4560                         --zSig1;
4561                         add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4562                 }
4563                 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4564         }
4565         shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4566         return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4567
4568 }
4569
4570 /*----------------------------------------------------------------------------
4571 | Returns the remainder of the quadruple-precision floating-point value `a'
4572 | with respect to the corresponding value `b'.  The operation is performed
4573 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4574 *----------------------------------------------------------------------------*/
4575
4576 float128 float128_rem( float128 a, float128 b )
4577 {
4578         flag aSign, zSign;
4579         int32 aExp, bExp, expDiff;
4580         bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4581         bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4582         sbits64 sigMean0;
4583         float128 z;
4584
4585         aSig1 = extractFloat128Frac1( a );
4586         aSig0 = extractFloat128Frac0( a );
4587         aExp = extractFloat128Exp( a );
4588         aSign = extractFloat128Sign( a );
4589         bSig1 = extractFloat128Frac1( b );
4590         bSig0 = extractFloat128Frac0( b );
4591         bExp = extractFloat128Exp( b );
4592 //    bSign = extractFloat128Sign( b );
4593         if ( aExp == 0x7FFF ) {
4594                 if (    ( aSig0 | aSig1 )
4595                                 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4596                         return propagateFloat128NaN( a, b );
4597                 }
4598                 goto invalid;
4599         }
4600         if ( bExp == 0x7FFF ) {
4601                 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4602                 return a;
4603         }
4604         if ( bExp == 0 ) {
4605                 if ( ( bSig0 | bSig1 ) == 0 ) {
4606         invalid:
4607                         float_raise( float_flag_invalid );
4608                         z.low = float128_default_nan_low;
4609                         z.high = float128_default_nan_high;
4610                         return z;
4611                 }
4612                 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4613         }
4614         if ( aExp == 0 ) {
4615                 if ( ( aSig0 | aSig1 ) == 0 ) return a;
4616                 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4617         }
4618         expDiff = aExp - bExp;
4619         if ( expDiff < -1 ) return a;
4620         shortShift128Left(
4621                 aSig0 | LIT64( 0x0001000000000000 ),
4622                 aSig1,
4623                 15 - ( expDiff < 0 ),
4624                 &aSig0,
4625                 &aSig1
4626         );
4627         shortShift128Left(
4628                 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4629         q = le128( bSig0, bSig1, aSig0, aSig1 );
4630         if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4631         expDiff -= 64;
4632         while ( 0 < expDiff ) {
4633                 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4634                 q = ( 4 < q ) ? q - 4 : 0;
4635                 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4636                 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4637                 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4638                 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4639                 expDiff -= 61;
4640         }
4641         if ( -64 < expDiff ) {
4642                 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4643                 q = ( 4 < q ) ? q - 4 : 0;
4644                 q >>= - expDiff;
4645                 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4646                 expDiff += 52;
4647                 if ( expDiff < 0 ) {
4648                         shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4649                 }
4650                 else {
4651                         shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4652                 }
4653                 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4654                 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4655         }
4656         else {
4657                 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4658                 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4659         }
4660         do {
4661                 alternateASig0 = aSig0;
4662                 alternateASig1 = aSig1;
4663                 ++q;
4664                 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4665         } while ( 0 <= (sbits64) aSig0 );
4666         add128(
4667                 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
4668         if (    ( sigMean0 < 0 )
4669                         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4670                 aSig0 = alternateASig0;
4671                 aSig1 = alternateASig1;
4672         }
4673         zSign = ( (sbits64) aSig0 < 0 );
4674         if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4675         return
4676                 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
4677
4678 }
4679
4680 /*----------------------------------------------------------------------------
4681 | Returns the square root of the quadruple-precision floating-point value `a'.
4682 | The operation is performed according to the IEC/IEEE Standard for Binary
4683 | Floating-Point Arithmetic.
4684 *----------------------------------------------------------------------------*/
4685
4686 float128 float128_sqrt( float128 a )
4687 {
4688         flag aSign;
4689         int32 aExp, zExp;
4690         bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4691         bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4692         float128 z;
4693
4694         aSig1 = extractFloat128Frac1( a );
4695         aSig0 = extractFloat128Frac0( a );
4696         aExp = extractFloat128Exp( a );
4697         aSign = extractFloat128Sign( a );
4698         if ( aExp == 0x7FFF ) {
4699                 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
4700                 if ( ! aSign ) return a;
4701                 goto invalid;
4702         }
4703         if ( aSign ) {
4704                 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4705         invalid:
4706                 float_raise( float_flag_invalid );
4707                 z.low = float128_default_nan_low;
4708                 z.high = float128_default_nan_high;
4709                 return z;
4710         }
4711         if ( aExp == 0 ) {
4712                 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4713                 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4714         }
4715         zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4716         aSig0 |= LIT64( 0x0001000000000000 );
4717         zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4718         shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4719         zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4720         doubleZSig0 = zSig0<<1;
4721         mul64To128( zSig0, zSig0, &term0, &term1 );
4722         sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4723         while ( (sbits64) rem0 < 0 ) {
4724                 --zSig0;
4725                 doubleZSig0 -= 2;
4726                 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4727         }
4728         zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4729         if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4730                 if ( zSig1 == 0 ) zSig1 = 1;
4731                 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4732                 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4733                 mul64To128( zSig1, zSig1, &term2, &term3 );
4734                 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4735                 while ( (sbits64) rem1 < 0 ) {
4736                         --zSig1;
4737                         shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4738                         term3 |= 1;
4739                         term2 |= doubleZSig0;
4740                         add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4741                 }
4742                 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4743         }
4744         shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4745         return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
4746
4747 }
4748
4749 /*----------------------------------------------------------------------------
4750 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4751 | the corresponding value `b', and 0 otherwise.  The comparison is performed
4752 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4753 *----------------------------------------------------------------------------*/
4754
4755 flag float128_eq( float128 a, float128 b )
4756 {
4757         if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4758                                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4759                         || (    ( extractFloat128Exp( b ) == 0x7FFF )
4760                                 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4761                 ) {
4762                 if (    float128_is_signaling_nan( a )
4763                                 || float128_is_signaling_nan( b ) ) {
4764                         float_raise( float_flag_invalid );
4765                 }
4766                 return 0;
4767         }
4768         return
4769                         ( a.low == b.low )
4770                 && (    ( a.high == b.high )
4771                                 || (    ( a.low == 0 )
4772                                         && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4773                         );
4774
4775 }
4776
4777 /*----------------------------------------------------------------------------
4778 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4779 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
4780 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4781 | Arithmetic.
4782 *----------------------------------------------------------------------------*/
4783
4784 flag float128_le( float128 a, float128 b )
4785 {
4786         flag aSign, bSign;
4787
4788         if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4789                                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4790                         || (    ( extractFloat128Exp( b ) == 0x7FFF )
4791                                 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4792                 ) {
4793                 float_raise( float_flag_invalid );
4794                 return 0;
4795         }
4796         aSign = extractFloat128Sign( a );
4797         bSign = extractFloat128Sign( b );
4798         if ( aSign != bSign ) {
4799                 return
4800                                 aSign
4801                         || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4802                                         == 0 );
4803         }
4804         return
4805                         aSign ? le128( b.high, b.low, a.high, a.low )
4806                 : le128( a.high, a.low, b.high, b.low );
4807
4808 }
4809
4810 /*----------------------------------------------------------------------------
4811 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4812 | the corresponding value `b', and 0 otherwise.  The comparison is performed
4813 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4814 *----------------------------------------------------------------------------*/
4815
4816 flag float128_lt( float128 a, float128 b )
4817 {
4818         flag aSign, bSign;
4819
4820         if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4821                                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4822                         || (    ( extractFloat128Exp( b ) == 0x7FFF )
4823                                 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4824                 ) {
4825                 float_raise( float_flag_invalid );
4826                 return 0;
4827         }
4828         aSign = extractFloat128Sign( a );
4829         bSign = extractFloat128Sign( b );
4830         if ( aSign != bSign ) {
4831                 return
4832                                 aSign
4833                         && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4834                                         != 0 );
4835         }
4836         return
4837                         aSign ? lt128( b.high, b.low, a.high, a.low )
4838                 : lt128( a.high, a.low, b.high, b.low );
4839
4840 }
4841
4842 /*----------------------------------------------------------------------------
4843 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4844 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4845 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4846 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4847 *----------------------------------------------------------------------------*/
4848
4849 flag float128_eq_signaling( float128 a, float128 b )
4850 {
4851         if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4852                                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4853                         || (    ( extractFloat128Exp( b ) == 0x7FFF )
4854                                 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4855                 ) {
4856                 float_raise( float_flag_invalid );
4857                 return 0;
4858         }
4859         return
4860                         ( a.low == b.low )
4861                 && (    ( a.high == b.high )
4862                                 || (    ( a.low == 0 )
4863                                         && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4864                         );
4865
4866 }
4867
4868 /*----------------------------------------------------------------------------
4869 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4870 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4871 | cause an exception.  Otherwise, the comparison is performed according to the
4872 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4873 *----------------------------------------------------------------------------*/
4874
4875 flag float128_le_quiet( float128 a, float128 b )
4876 {
4877         flag aSign, bSign;
4878
4879         if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4880                                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4881                         || (    ( extractFloat128Exp( b ) == 0x7FFF )
4882                                 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4883                 ) {
4884                 if (    float128_is_signaling_nan( a )
4885                                 || float128_is_signaling_nan( b ) ) {
4886                         float_raise( float_flag_invalid );
4887                 }
4888                 return 0;
4889         }
4890         aSign = extractFloat128Sign( a );
4891         bSign = extractFloat128Sign( b );
4892         if ( aSign != bSign ) {
4893                 return
4894                                 aSign
4895                         || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4896                                         == 0 );
4897         }
4898         return
4899                         aSign ? le128( b.high, b.low, a.high, a.low )
4900                 : le128( a.high, a.low, b.high, b.low );
4901
4902 }
4903
4904 /*----------------------------------------------------------------------------
4905 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4906 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4907 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4908 | Standard for Binary Floating-Point Arithmetic.
4909 *----------------------------------------------------------------------------*/
4910
4911 flag float128_lt_quiet( float128 a, float128 b )
4912 {
4913         flag aSign, bSign;
4914
4915         if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4916                                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4917                         || (    ( extractFloat128Exp( b ) == 0x7FFF )
4918                                 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4919                 ) {
4920                 if (    float128_is_signaling_nan( a )
4921                                 || float128_is_signaling_nan( b ) ) {
4922                         float_raise( float_flag_invalid );
4923                 }
4924                 return 0;
4925         }
4926         aSign = extractFloat128Sign( a );
4927         bSign = extractFloat128Sign( b );
4928         if ( aSign != bSign ) {
4929                 return
4930                                 aSign
4931                         && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4932                                         != 0 );
4933         }
4934         return
4935                         aSign ? lt128( b.high, b.low, a.high, a.low )
4936                 : lt128( a.high, a.low, b.high, b.low );
4937
4938 }
4939
4940 #endif