2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 #include "m68kcpu.h" // which includes softfloat.h after defining the basic types
35 /*----------------------------------------------------------------------------
36 | Floating-point rounding mode, extended double-precision rounding precision,
37 | and exception flags.
38 *----------------------------------------------------------------------------*/
39 int8 float_exception_flags = 0;
41 int8 floatx80_rounding_precision = 80;
44 int8 float_rounding_mode = float_round_nearest_even;
46 /*----------------------------------------------------------------------------
47 | Functions and definitions to determine: (1) whether tininess for underflow
48 | is detected before or after rounding by default, (2) what (if anything)
49 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
50 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
51 | are propagated from function inputs to output. These details are target-
53 *----------------------------------------------------------------------------*/
54 #include "softfloat-specialize"
56 /*----------------------------------------------------------------------------
57 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
58 | and 7, and returns the properly rounded 32-bit integer corresponding to the
59 | input. If `zSign' is 1, the input is negated before being converted to an
60 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
61 | is simply rounded to an integer, with the inexact exception raised if the
62 | input cannot be represented exactly as an integer. However, if the fixed-
63 | point input is too large, the invalid exception is raised and the largest
64 | positive or negative integer is returned.
65 *----------------------------------------------------------------------------*/
67 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
70 flag roundNearestEven;
71 int8 roundIncrement, roundBits;
74 roundingMode = float_rounding_mode;
75 roundNearestEven = ( roundingMode == float_round_nearest_even );
76 roundIncrement = 0x40;
77 if ( ! roundNearestEven ) {
78 if ( roundingMode == float_round_to_zero ) {
82 roundIncrement = 0x7F;
84 if ( roundingMode == float_round_up ) roundIncrement = 0;
87 if ( roundingMode == float_round_down ) roundIncrement = 0;
91 roundBits = absZ & 0x7F;
92 absZ = ( absZ + roundIncrement )>>7;
93 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
96 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
97 float_raise( float_flag_invalid );
98 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
100 if ( roundBits ) float_exception_flags |= float_flag_inexact;
105 /*----------------------------------------------------------------------------
106 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
107 | `absZ1', with binary point between bits 63 and 64 (between the input words),
108 | and returns the properly rounded 64-bit integer corresponding to the input.
109 | If `zSign' is 1, the input is negated before being converted to an integer.
110 | Ordinarily, the fixed-point input is simply rounded to an integer, with
111 | the inexact exception raised if the input cannot be represented exactly as
112 | an integer. However, if the fixed-point input is too large, the invalid
113 | exception is raised and the largest positive or negative integer is
115 *----------------------------------------------------------------------------*/
117 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
120 flag roundNearestEven, increment;
123 roundingMode = float_rounding_mode;
124 roundNearestEven = ( roundingMode == float_round_nearest_even );
125 increment = ( (sbits64) absZ1 < 0 );
126 if ( ! roundNearestEven ) {
127 if ( roundingMode == float_round_to_zero ) {
132 increment = ( roundingMode == float_round_down ) && absZ1;
135 increment = ( roundingMode == float_round_up ) && absZ1;
141 if ( absZ0 == 0 ) goto overflow;
142 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
145 if ( zSign ) z = - z;
146 if ( z && ( ( z < 0 ) ^ zSign ) ) {
148 float_raise( float_flag_invalid );
150 zSign ? (sbits64) LIT64( 0x8000000000000000 )
151 : (sbits64) LIT64( 0x7FFFFFFFFFFFFFFF );
153 if ( absZ1 ) float_exception_flags |= float_flag_inexact;
158 /*----------------------------------------------------------------------------
159 | Returns the fraction bits of the single-precision floating-point value `a'.
160 *----------------------------------------------------------------------------*/
162 static inline bits32 extractFloat32Frac( float32 a )
164 return a & 0x007FFFFF;
168 /*----------------------------------------------------------------------------
169 | Returns the exponent bits of the single-precision floating-point value `a'.
170 *----------------------------------------------------------------------------*/
172 static inline int16 extractFloat32Exp( float32 a )
174 return ( a>>23 ) & 0xFF;
178 /*----------------------------------------------------------------------------
179 | Returns the sign bit of the single-precision floating-point value `a'.
180 *----------------------------------------------------------------------------*/
182 static inline flag extractFloat32Sign( float32 a )
188 /*----------------------------------------------------------------------------
189 | Normalizes the subnormal single-precision floating-point value represented
190 | by the denormalized significand `aSig'. The normalized exponent and
191 | significand are stored at the locations pointed to by `zExpPtr' and
192 | `zSigPtr', respectively.
193 *----------------------------------------------------------------------------*/
196 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
200 shiftCount = countLeadingZeros32( aSig ) - 8;
201 *zSigPtr = aSig<<shiftCount;
202 *zExpPtr = 1 - shiftCount;
206 /*----------------------------------------------------------------------------
207 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
208 | single-precision floating-point value, returning the result. After being
209 | shifted into the proper positions, the three fields are simply added
210 | together to form the result. This means that any integer portion of `zSig'
211 | will be added into the exponent. Since a properly normalized significand
212 | will have an integer portion equal to 1, the `zExp' input should be 1 less
213 | than the desired result exponent whenever `zSig' is a complete, normalized
215 *----------------------------------------------------------------------------*/
217 static inline float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
219 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
223 /*----------------------------------------------------------------------------
224 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
225 | and significand `zSig', and returns the proper single-precision floating-
226 | point value corresponding to the abstract input. Ordinarily, the abstract
227 | value is simply rounded and packed into the single-precision format, with
228 | the inexact exception raised if the abstract input cannot be represented
229 | exactly. However, if the abstract value is too large, the overflow and
230 | inexact exceptions are raised and an infinity or maximal finite value is
231 | returned. If the abstract value is too small, the input value is rounded to
232 | a subnormal number, and the underflow and inexact exceptions are raised if
233 | the abstract input cannot be represented exactly as a subnormal single-
234 | precision floating-point number.
235 | The input significand `zSig' has its binary point between bits 30
236 | and 29, which is 7 bits to the left of the usual location. This shifted
237 | significand must be normalized or smaller. If `zSig' is not normalized,
238 | `zExp' must be 0; in that case, the result returned is a subnormal number,
239 | and it must not require rounding. In the usual case that `zSig' is
240 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
241 | The handling of underflow and overflow follows the IEC/IEEE Standard for
242 | Binary Floating-Point Arithmetic.
243 *----------------------------------------------------------------------------*/
245 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
248 flag roundNearestEven;
249 int8 roundIncrement, roundBits;
252 roundingMode = float_rounding_mode;
253 roundNearestEven = ( roundingMode == float_round_nearest_even );
254 roundIncrement = 0x40;
255 if ( ! roundNearestEven ) {
256 if ( roundingMode == float_round_to_zero ) {
260 roundIncrement = 0x7F;
262 if ( roundingMode == float_round_up ) roundIncrement = 0;
265 if ( roundingMode == float_round_down ) roundIncrement = 0;
269 roundBits = zSig & 0x7F;
270 if ( 0xFD <= (bits16) zExp ) {
272 || ( ( zExp == 0xFD )
273 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
275 float_raise( float_flag_overflow | float_flag_inexact );
276 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
280 ( float_detect_tininess == float_tininess_before_rounding )
282 || ( zSig + roundIncrement < 0x80000000 );
283 shift32RightJamming( zSig, - zExp, &zSig );
285 roundBits = zSig & 0x7F;
286 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
289 if ( roundBits ) float_exception_flags |= float_flag_inexact;
290 zSig = ( zSig + roundIncrement )>>7;
291 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
292 if ( zSig == 0 ) zExp = 0;
293 return packFloat32( zSign, zExp, zSig );
297 /*----------------------------------------------------------------------------
298 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
299 | and significand `zSig', and returns the proper single-precision floating-
300 | point value corresponding to the abstract input. This routine is just like
301 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
302 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
303 | floating-point exponent.
304 *----------------------------------------------------------------------------*/
307 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
311 shiftCount = countLeadingZeros32( zSig ) - 1;
312 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
316 /*----------------------------------------------------------------------------
317 | Returns the fraction bits of the double-precision floating-point value `a'.
318 *----------------------------------------------------------------------------*/
320 static inline bits64 extractFloat64Frac( float64 a )
322 return a & LIT64( 0x000FFFFFFFFFFFFF );
326 /*----------------------------------------------------------------------------
327 | Returns the exponent bits of the double-precision floating-point value `a'.
328 *----------------------------------------------------------------------------*/
330 static inline int16 extractFloat64Exp( float64 a )
332 return ( a>>52 ) & 0x7FF;
336 /*----------------------------------------------------------------------------
337 | Returns the sign bit of the double-precision floating-point value `a'.
338 *----------------------------------------------------------------------------*/
340 static inline flag extractFloat64Sign( float64 a )
346 /*----------------------------------------------------------------------------
347 | Normalizes the subnormal double-precision floating-point value represented
348 | by the denormalized significand `aSig'. The normalized exponent and
349 | significand are stored at the locations pointed to by `zExpPtr' and
350 | `zSigPtr', respectively.
351 *----------------------------------------------------------------------------*/
354 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
358 shiftCount = countLeadingZeros64( aSig ) - 11;
359 *zSigPtr = aSig<<shiftCount;
360 *zExpPtr = 1 - shiftCount;
364 /*----------------------------------------------------------------------------
365 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
366 | double-precision floating-point value, returning the result. After being
367 | shifted into the proper positions, the three fields are simply added
368 | together to form the result. This means that any integer portion of `zSig'
369 | will be added into the exponent. Since a properly normalized significand
370 | will have an integer portion equal to 1, the `zExp' input should be 1 less
371 | than the desired result exponent whenever `zSig' is a complete, normalized
373 *----------------------------------------------------------------------------*/
375 static inline float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
377 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
381 /*----------------------------------------------------------------------------
382 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
383 | and significand `zSig', and returns the proper double-precision floating-
384 | point value corresponding to the abstract input. Ordinarily, the abstract
385 | value is simply rounded and packed into the double-precision format, with
386 | the inexact exception raised if the abstract input cannot be represented
387 | exactly. However, if the abstract value is too large, the overflow and
388 | inexact exceptions are raised and an infinity or maximal finite value is
389 | returned. If the abstract value is too small, the input value is rounded
390 | to a subnormal number, and the underflow and inexact exceptions are raised
391 | if the abstract input cannot be represented exactly as a subnormal double-
392 | precision floating-point number.
393 | The input significand `zSig' has its binary point between bits 62
394 | and 61, which is 10 bits to the left of the usual location. This shifted
395 | significand must be normalized or smaller. If `zSig' is not normalized,
396 | `zExp' must be 0; in that case, the result returned is a subnormal number,
397 | and it must not require rounding. In the usual case that `zSig' is
398 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
399 | The handling of underflow and overflow follows the IEC/IEEE Standard for
400 | Binary Floating-Point Arithmetic.
401 *----------------------------------------------------------------------------*/
403 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
406 flag roundNearestEven;
407 int16 roundIncrement, roundBits;
410 roundingMode = float_rounding_mode;
411 roundNearestEven = ( roundingMode == float_round_nearest_even );
412 roundIncrement = 0x200;
413 if ( ! roundNearestEven ) {
414 if ( roundingMode == float_round_to_zero ) {
418 roundIncrement = 0x3FF;
420 if ( roundingMode == float_round_up ) roundIncrement = 0;
423 if ( roundingMode == float_round_down ) roundIncrement = 0;
427 roundBits = zSig & 0x3FF;
428 if ( 0x7FD <= (bits16) zExp ) {
429 if ( ( 0x7FD < zExp )
430 || ( ( zExp == 0x7FD )
431 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
433 float_raise( float_flag_overflow | float_flag_inexact );
434 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
438 ( float_detect_tininess == float_tininess_before_rounding )
440 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
441 shift64RightJamming( zSig, - zExp, &zSig );
443 roundBits = zSig & 0x3FF;
444 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
447 if ( roundBits ) float_exception_flags |= float_flag_inexact;
448 zSig = ( zSig + roundIncrement )>>10;
449 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
450 if ( zSig == 0 ) zExp = 0;
451 return packFloat64( zSign, zExp, zSig );
455 /*----------------------------------------------------------------------------
456 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
457 | and significand `zSig', and returns the proper double-precision floating-
458 | point value corresponding to the abstract input. This routine is just like
459 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
460 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
461 | floating-point exponent.
462 *----------------------------------------------------------------------------*/
465 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
469 shiftCount = countLeadingZeros64( zSig ) - 1;
470 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
476 /*----------------------------------------------------------------------------
477 | Normalizes the subnormal extended double-precision floating-point value
478 | represented by the denormalized significand `aSig'. The normalized exponent
479 | and significand are stored at the locations pointed to by `zExpPtr' and
480 | `zSigPtr', respectively.
481 *----------------------------------------------------------------------------*/
484 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
488 shiftCount = countLeadingZeros64( aSig );
489 *zSigPtr = aSig<<shiftCount;
490 *zExpPtr = 1 - shiftCount;
494 /*----------------------------------------------------------------------------
495 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
496 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
497 | and returns the proper extended double-precision floating-point value
498 | corresponding to the abstract input. Ordinarily, the abstract value is
499 | rounded and packed into the extended double-precision format, with the
500 | inexact exception raised if the abstract input cannot be represented
501 | exactly. However, if the abstract value is too large, the overflow and
502 | inexact exceptions are raised and an infinity or maximal finite value is
503 | returned. If the abstract value is too small, the input value is rounded to
504 | a subnormal number, and the underflow and inexact exceptions are raised if
505 | the abstract input cannot be represented exactly as a subnormal extended
506 | double-precision floating-point number.
507 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
508 | number of bits as single or double precision, respectively. Otherwise, the
509 | result is rounded to the full precision of the extended double-precision
511 | The input significand must be normalized or smaller. If the input
512 | significand is not normalized, `zExp' must be 0; in that case, the result
513 | returned is a subnormal number, and it must not require rounding. The
514 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
515 | Floating-Point Arithmetic.
516 *----------------------------------------------------------------------------*/
518 // roundAndPackFloatx80 is now also used in fyl2x.c
520 /* static */ floatx80
521 roundAndPackFloatx80(
522 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
526 flag roundNearestEven, increment, isTiny;
527 int64 roundIncrement, roundMask, roundBits;
529 roundingMode = float_rounding_mode;
530 roundNearestEven = ( roundingMode == float_round_nearest_even );
531 if ( roundingPrecision == 80 ) goto precision80;
532 if ( roundingPrecision == 64 ) {
533 roundIncrement = LIT64( 0x0000000000000400 );
534 roundMask = LIT64( 0x00000000000007FF );
536 else if ( roundingPrecision == 32 ) {
537 roundIncrement = LIT64( 0x0000008000000000 );
538 roundMask = LIT64( 0x000000FFFFFFFFFF );
543 zSig0 |= ( zSig1 != 0 );
544 if ( ! roundNearestEven ) {
545 if ( roundingMode == float_round_to_zero ) {
549 roundIncrement = roundMask;
551 if ( roundingMode == float_round_up ) roundIncrement = 0;
554 if ( roundingMode == float_round_down ) roundIncrement = 0;
558 roundBits = zSig0 & roundMask;
559 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
560 if ( ( 0x7FFE < zExp )
561 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
567 ( float_detect_tininess == float_tininess_before_rounding )
569 || ( zSig0 <= zSig0 + roundIncrement );
570 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
572 roundBits = zSig0 & roundMask;
573 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
574 if ( roundBits ) float_exception_flags |= float_flag_inexact;
575 zSig0 += roundIncrement;
576 if ( (sbits64) zSig0 < 0 ) zExp = 1;
577 roundIncrement = roundMask + 1;
578 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
579 roundMask |= roundIncrement;
581 zSig0 &= ~ roundMask;
582 return packFloatx80( zSign, zExp, zSig0 );
585 if ( roundBits ) float_exception_flags |= float_flag_inexact;
586 zSig0 += roundIncrement;
587 if ( zSig0 < (bits64)roundIncrement ) {
589 zSig0 = LIT64( 0x8000000000000000 );
591 roundIncrement = roundMask + 1;
592 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
593 roundMask |= roundIncrement;
595 zSig0 &= ~ roundMask;
596 if ( zSig0 == 0 ) zExp = 0;
597 return packFloatx80( zSign, zExp, zSig0 );
599 increment = ( (sbits64) zSig1 < 0 );
600 if ( ! roundNearestEven ) {
601 if ( roundingMode == float_round_to_zero ) {
606 increment = ( roundingMode == float_round_down ) && zSig1;
609 increment = ( roundingMode == float_round_up ) && zSig1;
613 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
614 if ( ( 0x7FFE < zExp )
615 || ( ( zExp == 0x7FFE )
616 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
622 float_raise( float_flag_overflow | float_flag_inexact );
623 if ( ( roundingMode == float_round_to_zero )
624 || ( zSign && ( roundingMode == float_round_up ) )
625 || ( ! zSign && ( roundingMode == float_round_down ) )
627 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
629 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
633 ( float_detect_tininess == float_tininess_before_rounding )
636 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
637 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
639 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
640 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
641 if ( roundNearestEven ) {
642 increment = ( (sbits64) zSig1 < 0 );
646 increment = ( roundingMode == float_round_down ) && zSig1;
649 increment = ( roundingMode == float_round_up ) && zSig1;
655 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
656 if ( (sbits64) zSig0 < 0 ) zExp = 1;
658 return packFloatx80( zSign, zExp, zSig0 );
661 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
666 zSig0 = LIT64( 0x8000000000000000 );
669 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
673 if ( zSig0 == 0 ) zExp = 0;
675 return packFloatx80( zSign, zExp, zSig0 );
679 /*----------------------------------------------------------------------------
680 | Takes an abstract floating-point value having sign `zSign', exponent
681 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
682 | and returns the proper extended double-precision floating-point value
683 | corresponding to the abstract input. This routine is just like
684 | `roundAndPackFloatx80' except that the input significand does not have to be
686 *----------------------------------------------------------------------------*/
689 normalizeRoundAndPackFloatx80(
690 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
700 shiftCount = countLeadingZeros64( zSig0 );
701 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
704 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
712 /*----------------------------------------------------------------------------
713 | Returns the least-significant 64 fraction bits of the quadruple-precision
714 | floating-point value `a'.
715 *----------------------------------------------------------------------------*/
717 static inline bits64 extractFloat128Frac1( float128 a )
723 /*----------------------------------------------------------------------------
724 | Returns the most-significant 48 fraction bits of the quadruple-precision
725 | floating-point value `a'.
726 *----------------------------------------------------------------------------*/
728 static inline bits64 extractFloat128Frac0( float128 a )
730 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
734 /*----------------------------------------------------------------------------
735 | Returns the exponent bits of the quadruple-precision floating-point value
737 *----------------------------------------------------------------------------*/
739 static inline int32 extractFloat128Exp( float128 a )
741 return ( a.high>>48 ) & 0x7FFF;
745 /*----------------------------------------------------------------------------
746 | Returns the sign bit of the quadruple-precision floating-point value `a'.
747 *----------------------------------------------------------------------------*/
749 static inline flag extractFloat128Sign( float128 a )
755 /*----------------------------------------------------------------------------
756 | Normalizes the subnormal quadruple-precision floating-point value
757 | represented by the denormalized significand formed by the concatenation of
758 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
759 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
760 | significand are stored at the location pointed to by `zSig0Ptr', and the
761 | least significant 64 bits of the normalized significand are stored at the
762 | location pointed to by `zSig1Ptr'.
763 *----------------------------------------------------------------------------*/
766 normalizeFloat128Subnormal(
777 shiftCount = countLeadingZeros64( aSig1 ) - 15;
778 if ( shiftCount < 0 ) {
779 *zSig0Ptr = aSig1>>( - shiftCount );
780 *zSig1Ptr = aSig1<<( shiftCount & 63 );
783 *zSig0Ptr = aSig1<<shiftCount;
786 *zExpPtr = - shiftCount - 63;
789 shiftCount = countLeadingZeros64( aSig0 ) - 15;
790 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
791 *zExpPtr = 1 - shiftCount;
798 /*----------------------------------------------------------------------------
799 | Returns the result of converting the 32-bit two's complement integer `a'
800 | to the single-precision floating-point format. The conversion is performed
801 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
802 *----------------------------------------------------------------------------*/
804 float32 int32_to_float32( int32 a )
808 if ( a == 0 ) return 0;
809 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
811 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
815 /*----------------------------------------------------------------------------
816 | Returns the result of converting the 32-bit two's complement integer `a'
817 | to the double-precision floating-point format. The conversion is performed
818 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
819 *----------------------------------------------------------------------------*/
821 float64 int32_to_float64( int32 a )
828 if ( a == 0 ) return 0;
830 absA = zSign ? - a : a;
831 shiftCount = countLeadingZeros32( absA ) + 21;
833 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
839 /*----------------------------------------------------------------------------
840 | Returns the result of converting the 32-bit two's complement integer `a'
841 | to the extended double-precision floating-point format. The conversion
842 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
844 *----------------------------------------------------------------------------*/
846 floatx80 int32_to_floatx80( int32 a )
853 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
855 absA = zSign ? - a : a;
856 shiftCount = countLeadingZeros32( absA ) + 32;
858 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
866 /*----------------------------------------------------------------------------
867 | Returns the result of converting the 32-bit two's complement integer `a' to
868 | the quadruple-precision floating-point format. The conversion is performed
869 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
870 *----------------------------------------------------------------------------*/
872 float128 int32_to_float128( int32 a )
879 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
881 absA = zSign ? - a : a;
882 shiftCount = countLeadingZeros32( absA ) + 17;
884 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
890 /*----------------------------------------------------------------------------
891 | Returns the result of converting the 64-bit two's complement integer `a'
892 | to the single-precision floating-point format. The conversion is performed
893 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
894 *----------------------------------------------------------------------------*/
896 float32 int64_to_float32( int64 a )
903 if ( a == 0 ) return 0;
905 absA = zSign ? - a : a;
906 shiftCount = countLeadingZeros64( absA ) - 40;
907 if ( 0 <= shiftCount ) {
908 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
912 if ( shiftCount < 0 ) {
913 shift64RightJamming( absA, - shiftCount, &absA );
918 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
923 /*----------------------------------------------------------------------------
924 | Returns the result of converting the 64-bit two's complement integer `a'
925 | to the double-precision floating-point format. The conversion is performed
926 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
927 *----------------------------------------------------------------------------*/
929 float64 int64_to_float64( int64 a )
933 if ( a == 0 ) return 0;
934 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
935 return packFloat64( 1, 0x43E, 0 );
938 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
944 /*----------------------------------------------------------------------------
945 | Returns the result of converting the 64-bit two's complement integer `a'
946 | to the extended double-precision floating-point format. The conversion
947 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
949 *----------------------------------------------------------------------------*/
951 floatx80 int64_to_floatx80( int64 a )
957 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
959 absA = zSign ? - a : a;
960 shiftCount = countLeadingZeros64( absA );
961 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
969 /*----------------------------------------------------------------------------
970 | Returns the result of converting the 64-bit two's complement integer `a' to
971 | the quadruple-precision floating-point format. The conversion is performed
972 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
973 *----------------------------------------------------------------------------*/
975 float128 int64_to_float128( int64 a )
983 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
985 absA = zSign ? - a : a;
986 shiftCount = countLeadingZeros64( absA ) + 49;
987 zExp = 0x406E - shiftCount;
988 if ( 64 <= shiftCount ) {
997 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
998 return packFloat128( zSign, zExp, zSig0, zSig1 );
1004 /*----------------------------------------------------------------------------
1005 | Returns the result of converting the single-precision floating-point value
1006 | `a' to the 32-bit two's complement integer format. The conversion is
1007 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1008 | Arithmetic---which means in particular that the conversion is rounded
1009 | according to the current rounding mode. If `a' is a NaN, the largest
1010 | positive integer is returned. Otherwise, if the conversion overflows, the
1011 | largest integer with the same sign as `a' is returned.
1012 *----------------------------------------------------------------------------*/
1014 int32 float32_to_int32( float32 a )
1017 int16 aExp, shiftCount;
1021 aSig = extractFloat32Frac( a );
1022 aExp = extractFloat32Exp( a );
1023 aSign = extractFloat32Sign( a );
1024 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1025 if ( aExp ) aSig |= 0x00800000;
1026 shiftCount = 0xAF - aExp;
1029 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1030 return roundAndPackInt32( aSign, aSig64 );
1034 /*----------------------------------------------------------------------------
1035 | Returns the result of converting the single-precision floating-point value
1036 | `a' to the 32-bit two's complement integer format. The conversion is
1037 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1038 | Arithmetic, except that the conversion is always rounded toward zero.
1039 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1040 | the conversion overflows, the largest integer with the same sign as `a' is
1042 *----------------------------------------------------------------------------*/
1044 int32 float32_to_int32_round_to_zero( float32 a )
1047 int16 aExp, shiftCount;
1051 aSig = extractFloat32Frac( a );
1052 aExp = extractFloat32Exp( a );
1053 aSign = extractFloat32Sign( a );
1054 shiftCount = aExp - 0x9E;
1055 if ( 0 <= shiftCount ) {
1056 if ( a != 0xCF000000 ) {
1057 float_raise( float_flag_invalid );
1058 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1060 return (sbits32) 0x80000000;
1062 else if ( aExp <= 0x7E ) {
1063 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1066 aSig = ( aSig | 0x00800000 )<<8;
1067 z = aSig>>( - shiftCount );
1068 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1069 float_exception_flags |= float_flag_inexact;
1071 if ( aSign ) z = - z;
1076 /*----------------------------------------------------------------------------
1077 | Returns the result of converting the single-precision floating-point value
1078 | `a' to the 64-bit two's complement integer format. The conversion is
1079 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1080 | Arithmetic---which means in particular that the conversion is rounded
1081 | according to the current rounding mode. If `a' is a NaN, the largest
1082 | positive integer is returned. Otherwise, if the conversion overflows, the
1083 | largest integer with the same sign as `a' is returned.
1084 *----------------------------------------------------------------------------*/
1086 int64 float32_to_int64( float32 a )
1089 int16 aExp, shiftCount;
1091 bits64 aSig64, aSigExtra;
1093 aSig = extractFloat32Frac( a );
1094 aExp = extractFloat32Exp( a );
1095 aSign = extractFloat32Sign( a );
1096 shiftCount = 0xBE - aExp;
1097 if ( shiftCount < 0 ) {
1098 float_raise( float_flag_invalid );
1099 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1100 return LIT64( 0x7FFFFFFFFFFFFFFF );
1102 return (sbits64) LIT64( 0x8000000000000000 );
1104 if ( aExp ) aSig |= 0x00800000;
1107 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1108 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1112 /*----------------------------------------------------------------------------
1113 | Returns the result of converting the single-precision floating-point value
1114 | `a' to the 64-bit two's complement integer format. The conversion is
1115 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1116 | Arithmetic, except that the conversion is always rounded toward zero. If
1117 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1118 | conversion overflows, the largest integer with the same sign as `a' is
1120 *----------------------------------------------------------------------------*/
1122 int64 float32_to_int64_round_to_zero( float32 a )
1125 int16 aExp, shiftCount;
1130 aSig = extractFloat32Frac( a );
1131 aExp = extractFloat32Exp( a );
1132 aSign = extractFloat32Sign( a );
1133 shiftCount = aExp - 0xBE;
1134 if ( 0 <= shiftCount ) {
1135 if ( a != 0xDF000000 ) {
1136 float_raise( float_flag_invalid );
1137 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1138 return LIT64( 0x7FFFFFFFFFFFFFFF );
1141 return (sbits64) LIT64( 0x8000000000000000 );
1143 else if ( aExp <= 0x7E ) {
1144 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1147 aSig64 = aSig | 0x00800000;
1149 z = aSig64>>( - shiftCount );
1150 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1151 float_exception_flags |= float_flag_inexact;
1153 if ( aSign ) z = - z;
1158 /*----------------------------------------------------------------------------
1159 | Returns the result of converting the single-precision floating-point value
1160 | `a' to the double-precision floating-point format. The conversion is
1161 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1163 *----------------------------------------------------------------------------*/
1165 float64 float32_to_float64( float32 a )
1171 aSig = extractFloat32Frac( a );
1172 aExp = extractFloat32Exp( a );
1173 aSign = extractFloat32Sign( a );
1174 if ( aExp == 0xFF ) {
1175 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1176 return packFloat64( aSign, 0x7FF, 0 );
1179 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1180 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1183 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1189 /*----------------------------------------------------------------------------
1190 | Returns the result of converting the single-precision floating-point value
1191 | `a' to the extended double-precision floating-point format. The conversion
1192 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1194 *----------------------------------------------------------------------------*/
1196 floatx80 float32_to_floatx80( float32 a )
1202 aSig = extractFloat32Frac( a );
1203 aExp = extractFloat32Exp( a );
1204 aSign = extractFloat32Sign( a );
1205 if ( aExp == 0xFF ) {
1206 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1207 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1210 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1211 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1214 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1222 /*----------------------------------------------------------------------------
1223 | Returns the result of converting the single-precision floating-point value
1224 | `a' to the double-precision floating-point format. The conversion is
1225 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1227 *----------------------------------------------------------------------------*/
1229 float128 float32_to_float128( float32 a )
1235 aSig = extractFloat32Frac( a );
1236 aExp = extractFloat32Exp( a );
1237 aSign = extractFloat32Sign( a );
1238 if ( aExp == 0xFF ) {
1239 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1240 return packFloat128( aSign, 0x7FFF, 0, 0 );
1243 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1244 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1247 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1253 /*----------------------------------------------------------------------------
1254 | Rounds the single-precision floating-point value `a' to an integer, and
1255 | returns the result as a single-precision floating-point value. The
1256 | operation is performed according to the IEC/IEEE Standard for Binary
1257 | Floating-Point Arithmetic.
1258 *----------------------------------------------------------------------------*/
1260 float32 float32_round_to_int( float32 a )
1264 bits32 lastBitMask, roundBitsMask;
1268 aExp = extractFloat32Exp( a );
1269 if ( 0x96 <= aExp ) {
1270 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1271 return propagateFloat32NaN( a, a );
1275 if ( aExp <= 0x7E ) {
1276 if ( (bits32) ( a<<1 ) == 0 ) return a;
1277 float_exception_flags |= float_flag_inexact;
1278 aSign = extractFloat32Sign( a );
1279 switch ( float_rounding_mode ) {
1280 case float_round_nearest_even:
1281 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1282 return packFloat32( aSign, 0x7F, 0 );
1285 case float_round_down:
1286 return aSign ? 0xBF800000 : 0;
1287 case float_round_up:
1288 return aSign ? 0x80000000 : 0x3F800000;
1290 return packFloat32( aSign, 0, 0 );
1293 lastBitMask <<= 0x96 - aExp;
1294 roundBitsMask = lastBitMask - 1;
1296 roundingMode = float_rounding_mode;
1297 if ( roundingMode == float_round_nearest_even ) {
1298 z += lastBitMask>>1;
1299 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1301 else if ( roundingMode != float_round_to_zero ) {
1302 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1306 z &= ~ roundBitsMask;
1307 if ( z != a ) float_exception_flags |= float_flag_inexact;
1312 /*----------------------------------------------------------------------------
1313 | Returns the result of adding the absolute values of the single-precision
1314 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1315 | before being returned. `zSign' is ignored if the result is a NaN.
1316 | The addition is performed according to the IEC/IEEE Standard for Binary
1317 | Floating-Point Arithmetic.
1318 *----------------------------------------------------------------------------*/
1320 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1322 int16 aExp, bExp, zExp;
1323 bits32 aSig, bSig, zSig;
1326 aSig = extractFloat32Frac( a );
1327 aExp = extractFloat32Exp( a );
1328 bSig = extractFloat32Frac( b );
1329 bExp = extractFloat32Exp( b );
1330 expDiff = aExp - bExp;
1333 if ( 0 < expDiff ) {
1334 if ( aExp == 0xFF ) {
1335 if ( aSig ) return propagateFloat32NaN( a, b );
1344 shift32RightJamming( bSig, expDiff, &bSig );
1347 else if ( expDiff < 0 ) {
1348 if ( bExp == 0xFF ) {
1349 if ( bSig ) return propagateFloat32NaN( a, b );
1350 return packFloat32( zSign, 0xFF, 0 );
1358 shift32RightJamming( aSig, - expDiff, &aSig );
1362 if ( aExp == 0xFF ) {
1363 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1366 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1367 zSig = 0x40000000 + aSig + bSig;
1372 zSig = ( aSig + bSig )<<1;
1374 if ( (sbits32) zSig < 0 ) {
1379 return roundAndPackFloat32( zSign, zExp, zSig );
1383 /*----------------------------------------------------------------------------
1384 | Returns the result of subtracting the absolute values of the single-
1385 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1386 | difference is negated before being returned. `zSign' is ignored if the
1387 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1388 | Standard for Binary Floating-Point Arithmetic.
1389 *----------------------------------------------------------------------------*/
1391 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1393 int16 aExp, bExp, zExp;
1394 bits32 aSig, bSig, zSig;
1397 aSig = extractFloat32Frac( a );
1398 aExp = extractFloat32Exp( a );
1399 bSig = extractFloat32Frac( b );
1400 bExp = extractFloat32Exp( b );
1401 expDiff = aExp - bExp;
1404 if ( 0 < expDiff ) goto aExpBigger;
1405 if ( expDiff < 0 ) goto bExpBigger;
1406 if ( aExp == 0xFF ) {
1407 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1408 float_raise( float_flag_invalid );
1409 return float32_default_nan;
1415 if ( bSig < aSig ) goto aBigger;
1416 if ( aSig < bSig ) goto bBigger;
1417 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1419 if ( bExp == 0xFF ) {
1420 if ( bSig ) return propagateFloat32NaN( a, b );
1421 return packFloat32( zSign ^ 1, 0xFF, 0 );
1429 shift32RightJamming( aSig, - expDiff, &aSig );
1435 goto normalizeRoundAndPack;
1437 if ( aExp == 0xFF ) {
1438 if ( aSig ) return propagateFloat32NaN( a, b );
1447 shift32RightJamming( bSig, expDiff, &bSig );
1452 normalizeRoundAndPack:
1454 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1458 /*----------------------------------------------------------------------------
1459 | Returns the result of adding the single-precision floating-point values `a'
1460 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1461 | Binary Floating-Point Arithmetic.
1462 *----------------------------------------------------------------------------*/
1464 float32 float32_add( float32 a, float32 b )
1468 aSign = extractFloat32Sign( a );
1469 bSign = extractFloat32Sign( b );
1470 if ( aSign == bSign ) {
1471 return addFloat32Sigs( a, b, aSign );
1474 return subFloat32Sigs( a, b, aSign );
1479 /*----------------------------------------------------------------------------
1480 | Returns the result of subtracting the single-precision floating-point values
1481 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1482 | for Binary Floating-Point Arithmetic.
1483 *----------------------------------------------------------------------------*/
1485 float32 float32_sub( float32 a, float32 b )
1489 aSign = extractFloat32Sign( a );
1490 bSign = extractFloat32Sign( b );
1491 if ( aSign == bSign ) {
1492 return subFloat32Sigs( a, b, aSign );
1495 return addFloat32Sigs( a, b, aSign );
1500 /*----------------------------------------------------------------------------
1501 | Returns the result of multiplying the single-precision floating-point values
1502 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1503 | for Binary Floating-Point Arithmetic.
1504 *----------------------------------------------------------------------------*/
1506 float32 float32_mul( float32 a, float32 b )
1508 flag aSign, bSign, zSign;
1509 int16 aExp, bExp, zExp;
1514 aSig = extractFloat32Frac( a );
1515 aExp = extractFloat32Exp( a );
1516 aSign = extractFloat32Sign( a );
1517 bSig = extractFloat32Frac( b );
1518 bExp = extractFloat32Exp( b );
1519 bSign = extractFloat32Sign( b );
1520 zSign = aSign ^ bSign;
1521 if ( aExp == 0xFF ) {
1522 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1523 return propagateFloat32NaN( a, b );
1525 if ( ( bExp | bSig ) == 0 ) {
1526 float_raise( float_flag_invalid );
1527 return float32_default_nan;
1529 return packFloat32( zSign, 0xFF, 0 );
1531 if ( bExp == 0xFF ) {
1532 if ( bSig ) return propagateFloat32NaN( a, b );
1533 if ( ( aExp | aSig ) == 0 ) {
1534 float_raise( float_flag_invalid );
1535 return float32_default_nan;
1537 return packFloat32( zSign, 0xFF, 0 );
1540 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1541 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1544 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1545 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1547 zExp = aExp + bExp - 0x7F;
1548 aSig = ( aSig | 0x00800000 )<<7;
1549 bSig = ( bSig | 0x00800000 )<<8;
1550 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1552 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1556 return roundAndPackFloat32( zSign, zExp, zSig );
1560 /*----------------------------------------------------------------------------
1561 | Returns the result of dividing the single-precision floating-point value `a'
1562 | by the corresponding value `b'. The operation is performed according to the
1563 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1564 *----------------------------------------------------------------------------*/
1566 float32 float32_div( float32 a, float32 b )
1568 flag aSign, bSign, zSign;
1569 int16 aExp, bExp, zExp;
1570 bits32 aSig, bSig, zSig;
1572 aSig = extractFloat32Frac( a );
1573 aExp = extractFloat32Exp( a );
1574 aSign = extractFloat32Sign( a );
1575 bSig = extractFloat32Frac( b );
1576 bExp = extractFloat32Exp( b );
1577 bSign = extractFloat32Sign( b );
1578 zSign = aSign ^ bSign;
1579 if ( aExp == 0xFF ) {
1580 if ( aSig ) return propagateFloat32NaN( a, b );
1581 if ( bExp == 0xFF ) {
1582 if ( bSig ) return propagateFloat32NaN( a, b );
1583 float_raise( float_flag_invalid );
1584 return float32_default_nan;
1586 return packFloat32( zSign, 0xFF, 0 );
1588 if ( bExp == 0xFF ) {
1589 if ( bSig ) return propagateFloat32NaN( a, b );
1590 return packFloat32( zSign, 0, 0 );
1594 if ( ( aExp | aSig ) == 0 ) {
1595 float_raise( float_flag_invalid );
1596 return float32_default_nan;
1598 float_raise( float_flag_divbyzero );
1599 return packFloat32( zSign, 0xFF, 0 );
1601 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1604 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1605 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1607 zExp = aExp - bExp + 0x7D;
1608 aSig = ( aSig | 0x00800000 )<<7;
1609 bSig = ( bSig | 0x00800000 )<<8;
1610 if ( bSig <= ( aSig + aSig ) ) {
1614 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1615 if ( ( zSig & 0x3F ) == 0 ) {
1616 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1618 return roundAndPackFloat32( zSign, zExp, zSig );
1622 /*----------------------------------------------------------------------------
1623 | Returns the remainder of the single-precision floating-point value `a'
1624 | with respect to the corresponding value `b'. The operation is performed
1625 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1626 *----------------------------------------------------------------------------*/
1628 float32 float32_rem( float32 a, float32 b )
1631 int16 aExp, bExp, expDiff;
1634 bits64 aSig64, bSig64, q64;
1635 bits32 alternateASig;
1638 aSig = extractFloat32Frac( a );
1639 aExp = extractFloat32Exp( a );
1640 aSign = extractFloat32Sign( a );
1641 bSig = extractFloat32Frac( b );
1642 bExp = extractFloat32Exp( b );
1643 // bSign = extractFloat32Sign( b );
1644 if ( aExp == 0xFF ) {
1645 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1646 return propagateFloat32NaN( a, b );
1648 float_raise( float_flag_invalid );
1649 return float32_default_nan;
1651 if ( bExp == 0xFF ) {
1652 if ( bSig ) return propagateFloat32NaN( a, b );
1657 float_raise( float_flag_invalid );
1658 return float32_default_nan;
1660 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1663 if ( aSig == 0 ) return a;
1664 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1666 expDiff = aExp - bExp;
1669 if ( expDiff < 32 ) {
1672 if ( expDiff < 0 ) {
1673 if ( expDiff < -1 ) return a;
1676 q = ( bSig <= aSig );
1677 if ( q ) aSig -= bSig;
1678 if ( 0 < expDiff ) {
1679 q = ( ( (bits64) aSig )<<32 ) / bSig;
1682 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1690 if ( bSig <= aSig ) aSig -= bSig;
1691 aSig64 = ( (bits64) aSig )<<40;
1692 bSig64 = ( (bits64) bSig )<<40;
1694 while ( 0 < expDiff ) {
1695 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1696 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1697 aSig64 = - ( ( bSig * q64 )<<38 );
1701 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1702 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1703 q = q64>>( 64 - expDiff );
1705 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1708 alternateASig = aSig;
1711 } while ( 0 <= (sbits32) aSig );
1712 sigMean = aSig + alternateASig;
1713 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1714 aSig = alternateASig;
1716 zSign = ( (sbits32) aSig < 0 );
1717 if ( zSign ) aSig = - aSig;
1718 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1722 /*----------------------------------------------------------------------------
1723 | Returns the square root of the single-precision floating-point value `a'.
1724 | The operation is performed according to the IEC/IEEE Standard for Binary
1725 | Floating-Point Arithmetic.
1726 *----------------------------------------------------------------------------*/
1728 float32 float32_sqrt( float32 a )
1735 aSig = extractFloat32Frac( a );
1736 aExp = extractFloat32Exp( a );
1737 aSign = extractFloat32Sign( a );
1738 if ( aExp == 0xFF ) {
1739 if ( aSig ) return propagateFloat32NaN( a, 0 );
1740 if ( ! aSign ) return a;
1741 float_raise( float_flag_invalid );
1742 return float32_default_nan;
1745 if ( ( aExp | aSig ) == 0 ) return a;
1746 float_raise( float_flag_invalid );
1747 return float32_default_nan;
1750 if ( aSig == 0 ) return 0;
1751 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1753 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1754 aSig = ( aSig | 0x00800000 )<<8;
1755 zSig = estimateSqrt32( aExp, aSig ) + 2;
1756 if ( ( zSig & 0x7F ) <= 5 ) {
1762 term = ( (bits64) zSig ) * zSig;
1763 rem = ( ( (bits64) aSig )<<32 ) - term;
1764 while ( (sbits64) rem < 0 ) {
1766 rem += ( ( (bits64) zSig )<<1 ) | 1;
1768 zSig |= ( rem != 0 );
1770 shift32RightJamming( zSig, 1, &zSig );
1772 return roundAndPackFloat32( 0, zExp, zSig );
1776 /*----------------------------------------------------------------------------
1777 | Returns 1 if the single-precision floating-point value `a' is equal to
1778 | the corresponding value `b', and 0 otherwise. The comparison is performed
1779 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1780 *----------------------------------------------------------------------------*/
1782 flag float32_eq( float32 a, float32 b )
1784 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1785 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1787 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1788 float_raise( float_flag_invalid );
1792 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1796 /*----------------------------------------------------------------------------
1797 | Returns 1 if the single-precision floating-point value `a' is less than
1798 | or equal to the corresponding value `b', and 0 otherwise. The comparison
1799 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1801 *----------------------------------------------------------------------------*/
1803 flag float32_le( float32 a, float32 b )
1807 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1808 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1810 float_raise( float_flag_invalid );
1813 aSign = extractFloat32Sign( a );
1814 bSign = extractFloat32Sign( b );
1815 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1816 return ( a == b ) || ( aSign ^ ( a < b ) );
1820 /*----------------------------------------------------------------------------
1821 | Returns 1 if the single-precision floating-point value `a' is less than
1822 | the corresponding value `b', and 0 otherwise. The comparison is performed
1823 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1824 *----------------------------------------------------------------------------*/
1826 flag float32_lt( float32 a, float32 b )
1830 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1831 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1833 float_raise( float_flag_invalid );
1836 aSign = extractFloat32Sign( a );
1837 bSign = extractFloat32Sign( b );
1838 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1839 return ( a != b ) && ( aSign ^ ( a < b ) );
1843 /*----------------------------------------------------------------------------
1844 | Returns 1 if the single-precision floating-point value `a' is equal to
1845 | the corresponding value `b', and 0 otherwise. The invalid exception is
1846 | raised if either operand is a NaN. Otherwise, the comparison is performed
1847 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1848 *----------------------------------------------------------------------------*/
1850 flag float32_eq_signaling( float32 a, float32 b )
1852 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1853 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1855 float_raise( float_flag_invalid );
1858 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1862 /*----------------------------------------------------------------------------
1863 | Returns 1 if the single-precision floating-point value `a' is less than or
1864 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1865 | cause an exception. Otherwise, the comparison is performed according to the
1866 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1867 *----------------------------------------------------------------------------*/
1869 flag float32_le_quiet( float32 a, float32 b )
1872 // int16 aExp, bExp;
1874 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1875 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1877 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1878 float_raise( float_flag_invalid );
1882 aSign = extractFloat32Sign( a );
1883 bSign = extractFloat32Sign( b );
1884 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1885 return ( a == b ) || ( aSign ^ ( a < b ) );
1889 /*----------------------------------------------------------------------------
1890 | Returns 1 if the single-precision floating-point value `a' is less than
1891 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1892 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
1893 | Standard for Binary Floating-Point Arithmetic.
1894 *----------------------------------------------------------------------------*/
1896 flag float32_lt_quiet( float32 a, float32 b )
1900 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1901 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1903 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1904 float_raise( float_flag_invalid );
1908 aSign = extractFloat32Sign( a );
1909 bSign = extractFloat32Sign( b );
1910 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1911 return ( a != b ) && ( aSign ^ ( a < b ) );
1915 /*----------------------------------------------------------------------------
1916 | Returns the result of converting the double-precision floating-point value
1917 | `a' to the 32-bit two's complement integer format. The conversion is
1918 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1919 | Arithmetic---which means in particular that the conversion is rounded
1920 | according to the current rounding mode. If `a' is a NaN, the largest
1921 | positive integer is returned. Otherwise, if the conversion overflows, the
1922 | largest integer with the same sign as `a' is returned.
1923 *----------------------------------------------------------------------------*/
1925 int32 float64_to_int32( float64 a )
1928 int16 aExp, shiftCount;
1931 aSig = extractFloat64Frac( a );
1932 aExp = extractFloat64Exp( a );
1933 aSign = extractFloat64Sign( a );
1934 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1935 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1936 shiftCount = 0x42C - aExp;
1937 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1938 return roundAndPackInt32( aSign, aSig );
1942 /*----------------------------------------------------------------------------
1943 | Returns the result of converting the double-precision floating-point value
1944 | `a' to the 32-bit two's complement integer format. The conversion is
1945 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1946 | Arithmetic, except that the conversion is always rounded toward zero.
1947 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1948 | the conversion overflows, the largest integer with the same sign as `a' is
1950 *----------------------------------------------------------------------------*/
1952 int32 float64_to_int32_round_to_zero( float64 a )
1955 int16 aExp, shiftCount;
1956 bits64 aSig, savedASig;
1959 aSig = extractFloat64Frac( a );
1960 aExp = extractFloat64Exp( a );
1961 aSign = extractFloat64Sign( a );
1962 if ( 0x41E < aExp ) {
1963 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1966 else if ( aExp < 0x3FF ) {
1967 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1970 aSig |= LIT64( 0x0010000000000000 );
1971 shiftCount = 0x433 - aExp;
1973 aSig >>= shiftCount;
1975 if ( aSign ) z = - z;
1976 if ( ( z < 0 ) ^ aSign ) {
1978 float_raise( float_flag_invalid );
1979 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1981 if ( ( aSig<<shiftCount ) != savedASig ) {
1982 float_exception_flags |= float_flag_inexact;
1988 /*----------------------------------------------------------------------------
1989 | Returns the result of converting the double-precision floating-point value
1990 | `a' to the 64-bit two's complement integer format. The conversion is
1991 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1992 | Arithmetic---which means in particular that the conversion is rounded
1993 | according to the current rounding mode. If `a' is a NaN, the largest
1994 | positive integer is returned. Otherwise, if the conversion overflows, the
1995 | largest integer with the same sign as `a' is returned.
1996 *----------------------------------------------------------------------------*/
1998 int64 float64_to_int64( float64 a )
2001 int16 aExp, shiftCount;
2002 bits64 aSig, aSigExtra;
2004 aSig = extractFloat64Frac( a );
2005 aExp = extractFloat64Exp( a );
2006 aSign = extractFloat64Sign( a );
2007 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2008 shiftCount = 0x433 - aExp;
2009 if ( shiftCount <= 0 ) {
2010 if ( 0x43E < aExp ) {
2011 float_raise( float_flag_invalid );
2013 || ( ( aExp == 0x7FF )
2014 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2016 return LIT64( 0x7FFFFFFFFFFFFFFF );
2018 return (sbits64) LIT64( 0x8000000000000000 );
2021 aSig <<= - shiftCount;
2024 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2026 return roundAndPackInt64( aSign, aSig, aSigExtra );
2030 /*----------------------------------------------------------------------------
2031 | Returns the result of converting the double-precision floating-point value
2032 | `a' to the 64-bit two's complement integer format. The conversion is
2033 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2034 | Arithmetic, except that the conversion is always rounded toward zero.
2035 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2036 | the conversion overflows, the largest integer with the same sign as `a' is
2038 *----------------------------------------------------------------------------*/
2040 int64 float64_to_int64_round_to_zero( float64 a )
2043 int16 aExp, shiftCount;
2047 aSig = extractFloat64Frac( a );
2048 aExp = extractFloat64Exp( a );
2049 aSign = extractFloat64Sign( a );
2050 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2051 shiftCount = aExp - 0x433;
2052 if ( 0 <= shiftCount ) {
2053 if ( 0x43E <= aExp ) {
2054 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2055 float_raise( float_flag_invalid );
2057 || ( ( aExp == 0x7FF )
2058 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2060 return LIT64( 0x7FFFFFFFFFFFFFFF );
2063 return (sbits64) LIT64( 0x8000000000000000 );
2065 z = aSig<<shiftCount;
2068 if ( aExp < 0x3FE ) {
2069 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2072 z = aSig>>( - shiftCount );
2073 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2074 float_exception_flags |= float_flag_inexact;
2077 if ( aSign ) z = - z;
2082 /*----------------------------------------------------------------------------
2083 | Returns the result of converting the double-precision floating-point value
2084 | `a' to the single-precision floating-point format. The conversion is
2085 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2087 *----------------------------------------------------------------------------*/
2089 float32 float64_to_float32( float64 a )
2096 aSig = extractFloat64Frac( a );
2097 aExp = extractFloat64Exp( a );
2098 aSign = extractFloat64Sign( a );
2099 if ( aExp == 0x7FF ) {
2100 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2101 return packFloat32( aSign, 0xFF, 0 );
2103 shift64RightJamming( aSig, 22, &aSig );
2105 if ( aExp || zSig ) {
2109 return roundAndPackFloat32( aSign, aExp, zSig );
2115 /*----------------------------------------------------------------------------
2116 | Returns the result of converting the double-precision floating-point value
2117 | `a' to the extended double-precision floating-point format. The conversion
2118 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2120 *----------------------------------------------------------------------------*/
2122 floatx80 float64_to_floatx80( float64 a )
2128 aSig = extractFloat64Frac( a );
2129 aExp = extractFloat64Exp( a );
2130 aSign = extractFloat64Sign( a );
2131 if ( aExp == 0x7FF ) {
2132 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2133 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2136 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2137 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2141 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2149 /*----------------------------------------------------------------------------
2150 | Returns the result of converting the double-precision floating-point value
2151 | `a' to the quadruple-precision floating-point format. The conversion is
2152 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2154 *----------------------------------------------------------------------------*/
2156 float128 float64_to_float128( float64 a )
2160 bits64 aSig, zSig0, zSig1;
2162 aSig = extractFloat64Frac( a );
2163 aExp = extractFloat64Exp( a );
2164 aSign = extractFloat64Sign( a );
2165 if ( aExp == 0x7FF ) {
2166 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2167 return packFloat128( aSign, 0x7FFF, 0, 0 );
2170 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2171 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2174 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2175 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2181 /*----------------------------------------------------------------------------
2182 | Rounds the double-precision floating-point value `a' to an integer, and
2183 | returns the result as a double-precision floating-point value. The
2184 | operation is performed according to the IEC/IEEE Standard for Binary
2185 | Floating-Point Arithmetic.
2186 *----------------------------------------------------------------------------*/
2188 float64 float64_round_to_int( float64 a )
2192 bits64 lastBitMask, roundBitsMask;
2196 aExp = extractFloat64Exp( a );
2197 if ( 0x433 <= aExp ) {
2198 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2199 return propagateFloat64NaN( a, a );
2203 if ( aExp < 0x3FF ) {
2204 if ( (bits64) ( a<<1 ) == 0 ) return a;
2205 float_exception_flags |= float_flag_inexact;
2206 aSign = extractFloat64Sign( a );
2207 switch ( float_rounding_mode ) {
2208 case float_round_nearest_even:
2209 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2210 return packFloat64( aSign, 0x3FF, 0 );
2213 case float_round_down:
2214 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2215 case float_round_up:
2217 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2219 return packFloat64( aSign, 0, 0 );
2222 lastBitMask <<= 0x433 - aExp;
2223 roundBitsMask = lastBitMask - 1;
2225 roundingMode = float_rounding_mode;
2226 if ( roundingMode == float_round_nearest_even ) {
2227 z += lastBitMask>>1;
2228 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2230 else if ( roundingMode != float_round_to_zero ) {
2231 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2235 z &= ~ roundBitsMask;
2236 if ( z != a ) float_exception_flags |= float_flag_inexact;
2241 /*----------------------------------------------------------------------------
2242 | Returns the result of adding the absolute values of the double-precision
2243 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2244 | before being returned. `zSign' is ignored if the result is a NaN.
2245 | The addition is performed according to the IEC/IEEE Standard for Binary
2246 | Floating-Point Arithmetic.
2247 *----------------------------------------------------------------------------*/
2249 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2251 int16 aExp, bExp, zExp;
2252 bits64 aSig, bSig, zSig;
2255 aSig = extractFloat64Frac( a );
2256 aExp = extractFloat64Exp( a );
2257 bSig = extractFloat64Frac( b );
2258 bExp = extractFloat64Exp( b );
2259 expDiff = aExp - bExp;
2262 if ( 0 < expDiff ) {
2263 if ( aExp == 0x7FF ) {
2264 if ( aSig ) return propagateFloat64NaN( a, b );
2271 bSig |= LIT64( 0x2000000000000000 );
2273 shift64RightJamming( bSig, expDiff, &bSig );
2276 else if ( expDiff < 0 ) {
2277 if ( bExp == 0x7FF ) {
2278 if ( bSig ) return propagateFloat64NaN( a, b );
2279 return packFloat64( zSign, 0x7FF, 0 );
2285 aSig |= LIT64( 0x2000000000000000 );
2287 shift64RightJamming( aSig, - expDiff, &aSig );
2291 if ( aExp == 0x7FF ) {
2292 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2295 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2296 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2300 aSig |= LIT64( 0x2000000000000000 );
2301 zSig = ( aSig + bSig )<<1;
2303 if ( (sbits64) zSig < 0 ) {
2308 return roundAndPackFloat64( zSign, zExp, zSig );
2312 /*----------------------------------------------------------------------------
2313 | Returns the result of subtracting the absolute values of the double-
2314 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2315 | difference is negated before being returned. `zSign' is ignored if the
2316 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2317 | Standard for Binary Floating-Point Arithmetic.
2318 *----------------------------------------------------------------------------*/
2320 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2322 int16 aExp, bExp, zExp;
2323 bits64 aSig, bSig, zSig;
2326 aSig = extractFloat64Frac( a );
2327 aExp = extractFloat64Exp( a );
2328 bSig = extractFloat64Frac( b );
2329 bExp = extractFloat64Exp( b );
2330 expDiff = aExp - bExp;
2333 if ( 0 < expDiff ) goto aExpBigger;
2334 if ( expDiff < 0 ) goto bExpBigger;
2335 if ( aExp == 0x7FF ) {
2336 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2337 float_raise( float_flag_invalid );
2338 return float64_default_nan;
2344 if ( bSig < aSig ) goto aBigger;
2345 if ( aSig < bSig ) goto bBigger;
2346 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2348 if ( bExp == 0x7FF ) {
2349 if ( bSig ) return propagateFloat64NaN( a, b );
2350 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2356 aSig |= LIT64( 0x4000000000000000 );
2358 shift64RightJamming( aSig, - expDiff, &aSig );
2359 bSig |= LIT64( 0x4000000000000000 );
2364 goto normalizeRoundAndPack;
2366 if ( aExp == 0x7FF ) {
2367 if ( aSig ) return propagateFloat64NaN( a, b );
2374 bSig |= LIT64( 0x4000000000000000 );
2376 shift64RightJamming( bSig, expDiff, &bSig );
2377 aSig |= LIT64( 0x4000000000000000 );
2381 normalizeRoundAndPack:
2383 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2387 /*----------------------------------------------------------------------------
2388 | Returns the result of adding the double-precision floating-point values `a'
2389 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2390 | Binary Floating-Point Arithmetic.
2391 *----------------------------------------------------------------------------*/
2393 float64 float64_add( float64 a, float64 b )
2397 aSign = extractFloat64Sign( a );
2398 bSign = extractFloat64Sign( b );
2399 if ( aSign == bSign ) {
2400 return addFloat64Sigs( a, b, aSign );
2403 return subFloat64Sigs( a, b, aSign );
2408 /*----------------------------------------------------------------------------
2409 | Returns the result of subtracting the double-precision floating-point values
2410 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2411 | for Binary Floating-Point Arithmetic.
2412 *----------------------------------------------------------------------------*/
2414 float64 float64_sub( float64 a, float64 b )
2418 aSign = extractFloat64Sign( a );
2419 bSign = extractFloat64Sign( b );
2420 if ( aSign == bSign ) {
2421 return subFloat64Sigs( a, b, aSign );
2424 return addFloat64Sigs( a, b, aSign );
2429 /*----------------------------------------------------------------------------
2430 | Returns the result of multiplying the double-precision floating-point values
2431 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2432 | for Binary Floating-Point Arithmetic.
2433 *----------------------------------------------------------------------------*/
2435 float64 float64_mul( float64 a, float64 b )
2437 flag aSign, bSign, zSign;
2438 int16 aExp, bExp, zExp;
2439 bits64 aSig, bSig, zSig0, zSig1;
2441 aSig = extractFloat64Frac( a );
2442 aExp = extractFloat64Exp( a );
2443 aSign = extractFloat64Sign( a );
2444 bSig = extractFloat64Frac( b );
2445 bExp = extractFloat64Exp( b );
2446 bSign = extractFloat64Sign( b );
2447 zSign = aSign ^ bSign;
2448 if ( aExp == 0x7FF ) {
2449 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2450 return propagateFloat64NaN( a, b );
2452 if ( ( bExp | bSig ) == 0 ) {
2453 float_raise( float_flag_invalid );
2454 return float64_default_nan;
2456 return packFloat64( zSign, 0x7FF, 0 );
2458 if ( bExp == 0x7FF ) {
2459 if ( bSig ) return propagateFloat64NaN( a, b );
2460 if ( ( aExp | aSig ) == 0 ) {
2461 float_raise( float_flag_invalid );
2462 return float64_default_nan;
2464 return packFloat64( zSign, 0x7FF, 0 );
2467 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2468 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2471 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2472 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2474 zExp = aExp + bExp - 0x3FF;
2475 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2476 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2477 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2478 zSig0 |= ( zSig1 != 0 );
2479 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2483 return roundAndPackFloat64( zSign, zExp, zSig0 );
2487 /*----------------------------------------------------------------------------
2488 | Returns the result of dividing the double-precision floating-point value `a'
2489 | by the corresponding value `b'. The operation is performed according to
2490 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2491 *----------------------------------------------------------------------------*/
2493 float64 float64_div( float64 a, float64 b )
2495 flag aSign, bSign, zSign;
2496 int16 aExp, bExp, zExp;
2497 bits64 aSig, bSig, zSig;
2499 bits64 term0, term1;
2501 aSig = extractFloat64Frac( a );
2502 aExp = extractFloat64Exp( a );
2503 aSign = extractFloat64Sign( a );
2504 bSig = extractFloat64Frac( b );
2505 bExp = extractFloat64Exp( b );
2506 bSign = extractFloat64Sign( b );
2507 zSign = aSign ^ bSign;
2508 if ( aExp == 0x7FF ) {
2509 if ( aSig ) return propagateFloat64NaN( a, b );
2510 if ( bExp == 0x7FF ) {
2511 if ( bSig ) return propagateFloat64NaN( a, b );
2512 float_raise( float_flag_invalid );
2513 return float64_default_nan;
2515 return packFloat64( zSign, 0x7FF, 0 );
2517 if ( bExp == 0x7FF ) {
2518 if ( bSig ) return propagateFloat64NaN( a, b );
2519 return packFloat64( zSign, 0, 0 );
2523 if ( ( aExp | aSig ) == 0 ) {
2524 float_raise( float_flag_invalid );
2525 return float64_default_nan;
2527 float_raise( float_flag_divbyzero );
2528 return packFloat64( zSign, 0x7FF, 0 );
2530 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2533 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2534 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2536 zExp = aExp - bExp + 0x3FD;
2537 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2538 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2539 if ( bSig <= ( aSig + aSig ) ) {
2543 zSig = estimateDiv128To64( aSig, 0, bSig );
2544 if ( ( zSig & 0x1FF ) <= 2 ) {
2545 mul64To128( bSig, zSig, &term0, &term1 );
2546 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2547 while ( (sbits64) rem0 < 0 ) {
2549 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2551 zSig |= ( rem1 != 0 );
2553 return roundAndPackFloat64( zSign, zExp, zSig );
2557 /*----------------------------------------------------------------------------
2558 | Returns the remainder of the double-precision floating-point value `a'
2559 | with respect to the corresponding value `b'. The operation is performed
2560 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2561 *----------------------------------------------------------------------------*/
2563 float64 float64_rem( float64 a, float64 b )
2566 int16 aExp, bExp, expDiff;
2568 bits64 q, alternateASig;
2571 aSig = extractFloat64Frac( a );
2572 aExp = extractFloat64Exp( a );
2573 aSign = extractFloat64Sign( a );
2574 bSig = extractFloat64Frac( b );
2575 bExp = extractFloat64Exp( b );
2576 // bSign = extractFloat64Sign( b );
2577 if ( aExp == 0x7FF ) {
2578 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2579 return propagateFloat64NaN( a, b );
2581 float_raise( float_flag_invalid );
2582 return float64_default_nan;
2584 if ( bExp == 0x7FF ) {
2585 if ( bSig ) return propagateFloat64NaN( a, b );
2590 float_raise( float_flag_invalid );
2591 return float64_default_nan;
2593 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2596 if ( aSig == 0 ) return a;
2597 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2599 expDiff = aExp - bExp;
2600 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2601 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2602 if ( expDiff < 0 ) {
2603 if ( expDiff < -1 ) return a;
2606 q = ( bSig <= aSig );
2607 if ( q ) aSig -= bSig;
2609 while ( 0 < expDiff ) {
2610 q = estimateDiv128To64( aSig, 0, bSig );
2611 q = ( 2 < q ) ? q - 2 : 0;
2612 aSig = - ( ( bSig>>2 ) * q );
2616 if ( 0 < expDiff ) {
2617 q = estimateDiv128To64( aSig, 0, bSig );
2618 q = ( 2 < q ) ? q - 2 : 0;
2621 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2628 alternateASig = aSig;
2631 } while ( 0 <= (sbits64) aSig );
2632 sigMean = aSig + alternateASig;
2633 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2634 aSig = alternateASig;
2636 zSign = ( (sbits64) aSig < 0 );
2637 if ( zSign ) aSig = - aSig;
2638 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2642 /*----------------------------------------------------------------------------
2643 | Returns the square root of the double-precision floating-point value `a'.
2644 | The operation is performed according to the IEC/IEEE Standard for Binary
2645 | Floating-Point Arithmetic.
2646 *----------------------------------------------------------------------------*/
2648 float64 float64_sqrt( float64 a )
2652 bits64 aSig, zSig, doubleZSig;
2653 bits64 rem0, rem1, term0, term1;
2656 aSig = extractFloat64Frac( a );
2657 aExp = extractFloat64Exp( a );
2658 aSign = extractFloat64Sign( a );
2659 if ( aExp == 0x7FF ) {
2660 if ( aSig ) return propagateFloat64NaN( a, a );
2661 if ( ! aSign ) return a;
2662 float_raise( float_flag_invalid );
2663 return float64_default_nan;
2666 if ( ( aExp | aSig ) == 0 ) return a;
2667 float_raise( float_flag_invalid );
2668 return float64_default_nan;
2671 if ( aSig == 0 ) return 0;
2672 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2674 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2675 aSig |= LIT64( 0x0010000000000000 );
2676 zSig = estimateSqrt32( aExp, aSig>>21 );
2677 aSig <<= 9 - ( aExp & 1 );
2678 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2679 if ( ( zSig & 0x1FF ) <= 5 ) {
2680 doubleZSig = zSig<<1;
2681 mul64To128( zSig, zSig, &term0, &term1 );
2682 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2683 while ( (sbits64) rem0 < 0 ) {
2686 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2688 zSig |= ( ( rem0 | rem1 ) != 0 );
2690 return roundAndPackFloat64( 0, zExp, zSig );
2694 /*----------------------------------------------------------------------------
2695 | Returns 1 if the double-precision floating-point value `a' is equal to the
2696 | corresponding value `b', and 0 otherwise. The comparison is performed
2697 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2698 *----------------------------------------------------------------------------*/
2700 flag float64_eq( float64 a, float64 b )
2702 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2703 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2705 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2706 float_raise( float_flag_invalid );
2710 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2714 /*----------------------------------------------------------------------------
2715 | Returns 1 if the double-precision floating-point value `a' is less than or
2716 | equal to the corresponding value `b', and 0 otherwise. The comparison is
2717 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2719 *----------------------------------------------------------------------------*/
2721 flag float64_le( float64 a, float64 b )
2725 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2726 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2728 float_raise( float_flag_invalid );
2731 aSign = extractFloat64Sign( a );
2732 bSign = extractFloat64Sign( b );
2733 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2734 return ( a == b ) || ( aSign ^ ( a < b ) );
2738 /*----------------------------------------------------------------------------
2739 | Returns 1 if the double-precision floating-point value `a' is less than
2740 | the corresponding value `b', and 0 otherwise. The comparison is performed
2741 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2742 *----------------------------------------------------------------------------*/
2744 flag float64_lt( float64 a, float64 b )
2748 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2749 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2751 float_raise( float_flag_invalid );
2754 aSign = extractFloat64Sign( a );
2755 bSign = extractFloat64Sign( b );
2756 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2757 return ( a != b ) && ( aSign ^ ( a < b ) );
2761 /*----------------------------------------------------------------------------
2762 | Returns 1 if the double-precision floating-point value `a' is equal to the
2763 | corresponding value `b', and 0 otherwise. The invalid exception is raised
2764 | if either operand is a NaN. Otherwise, the comparison is performed
2765 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2766 *----------------------------------------------------------------------------*/
2768 flag float64_eq_signaling( float64 a, float64 b )
2770 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2771 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2773 float_raise( float_flag_invalid );
2776 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2780 /*----------------------------------------------------------------------------
2781 | Returns 1 if the double-precision floating-point value `a' is less than or
2782 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2783 | cause an exception. Otherwise, the comparison is performed according to the
2784 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2785 *----------------------------------------------------------------------------*/
2787 flag float64_le_quiet( float64 a, float64 b )
2790 // int16 aExp, bExp;
2792 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2793 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2795 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2796 float_raise( float_flag_invalid );
2800 aSign = extractFloat64Sign( a );
2801 bSign = extractFloat64Sign( b );
2802 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2803 return ( a == b ) || ( aSign ^ ( a < b ) );
2807 /*----------------------------------------------------------------------------
2808 | Returns 1 if the double-precision floating-point value `a' is less than
2809 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2810 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2811 | Standard for Binary Floating-Point Arithmetic.
2812 *----------------------------------------------------------------------------*/
2814 flag float64_lt_quiet( float64 a, float64 b )
2818 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2819 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2821 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2822 float_raise( float_flag_invalid );
2826 aSign = extractFloat64Sign( a );
2827 bSign = extractFloat64Sign( b );
2828 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2829 return ( a != b ) && ( aSign ^ ( a < b ) );
2835 /*----------------------------------------------------------------------------
2836 | Returns the result of converting the extended double-precision floating-
2837 | point value `a' to the 32-bit two's complement integer format. The
2838 | conversion is performed according to the IEC/IEEE Standard for Binary
2839 | Floating-Point Arithmetic---which means in particular that the conversion
2840 | is rounded according to the current rounding mode. If `a' is a NaN, the
2841 | largest positive integer is returned. Otherwise, if the conversion
2842 | overflows, the largest integer with the same sign as `a' is returned.
2843 *----------------------------------------------------------------------------*/
2845 int32 floatx80_to_int32( floatx80 a )
2848 int32 aExp, shiftCount;
2851 aSig = extractFloatx80Frac( a );
2852 aExp = extractFloatx80Exp( a );
2853 aSign = extractFloatx80Sign( a );
2854 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2855 shiftCount = 0x4037 - aExp;
2856 if ( shiftCount <= 0 ) shiftCount = 1;
2857 shift64RightJamming( aSig, shiftCount, &aSig );
2858 return roundAndPackInt32( aSign, aSig );
2862 /*----------------------------------------------------------------------------
2863 | Returns the result of converting the extended double-precision floating-
2864 | point value `a' to the 32-bit two's complement integer format. The
2865 | conversion is performed according to the IEC/IEEE Standard for Binary
2866 | Floating-Point Arithmetic, except that the conversion is always rounded
2867 | toward zero. If `a' is a NaN, the largest positive integer is returned.
2868 | Otherwise, if the conversion overflows, the largest integer with the same
2869 | sign as `a' is returned.
2870 *----------------------------------------------------------------------------*/
2872 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2875 int32 aExp, shiftCount;
2876 bits64 aSig, savedASig;
2879 aSig = extractFloatx80Frac( a );
2880 aExp = extractFloatx80Exp( a );
2881 aSign = extractFloatx80Sign( a );
2882 if ( 0x401E < aExp ) {
2883 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2886 else if ( aExp < 0x3FFF ) {
2887 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2890 shiftCount = 0x403E - aExp;
2892 aSig >>= shiftCount;
2894 if ( aSign ) z = - z;
2895 if ( ( z < 0 ) ^ aSign ) {
2897 float_raise( float_flag_invalid );
2898 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2900 if ( ( aSig<<shiftCount ) != savedASig ) {
2901 float_exception_flags |= float_flag_inexact;
2907 /*----------------------------------------------------------------------------
2908 | Returns the result of converting the extended double-precision floating-
2909 | point value `a' to the 64-bit two's complement integer format. The
2910 | conversion is performed according to the IEC/IEEE Standard for Binary
2911 | Floating-Point Arithmetic---which means in particular that the conversion
2912 | is rounded according to the current rounding mode. If `a' is a NaN,
2913 | the largest positive integer is returned. Otherwise, if the conversion
2914 | overflows, the largest integer with the same sign as `a' is returned.
2915 *----------------------------------------------------------------------------*/
2917 int64 floatx80_to_int64( floatx80 a )
2920 int32 aExp, shiftCount;
2921 bits64 aSig, aSigExtra;
2923 aSig = extractFloatx80Frac( a );
2924 aExp = extractFloatx80Exp( a );
2925 aSign = extractFloatx80Sign( a );
2926 shiftCount = 0x403E - aExp;
2927 if ( shiftCount <= 0 ) {
2929 float_raise( float_flag_invalid );
2931 || ( ( aExp == 0x7FFF )
2932 && ( aSig != LIT64( 0x8000000000000000 ) ) )
2934 return LIT64( 0x7FFFFFFFFFFFFFFF );
2936 return (sbits64) LIT64( 0x8000000000000000 );
2941 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2943 return roundAndPackInt64( aSign, aSig, aSigExtra );
2947 /*----------------------------------------------------------------------------
2948 | Returns the result of converting the extended double-precision floating-
2949 | point value `a' to the 64-bit two's complement integer format. The
2950 | conversion is performed according to the IEC/IEEE Standard for Binary
2951 | Floating-Point Arithmetic, except that the conversion is always rounded
2952 | toward zero. If `a' is a NaN, the largest positive integer is returned.
2953 | Otherwise, if the conversion overflows, the largest integer with the same
2954 | sign as `a' is returned.
2955 *----------------------------------------------------------------------------*/
2957 int64 floatx80_to_int64_round_to_zero( floatx80 a )
2960 int32 aExp, shiftCount;
2964 aSig = extractFloatx80Frac( a );
2965 aExp = extractFloatx80Exp( a );
2966 aSign = extractFloatx80Sign( a );
2967 shiftCount = aExp - 0x403E;
2968 if ( 0 <= shiftCount ) {
2969 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
2970 if ( ( a.high != 0xC03E ) || aSig ) {
2971 float_raise( float_flag_invalid );
2972 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
2973 return LIT64( 0x7FFFFFFFFFFFFFFF );
2976 return (sbits64) LIT64( 0x8000000000000000 );
2978 else if ( aExp < 0x3FFF ) {
2979 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2982 z = aSig>>( - shiftCount );
2983 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2984 float_exception_flags |= float_flag_inexact;
2986 if ( aSign ) z = - z;
2991 /*----------------------------------------------------------------------------
2992 | Returns the result of converting the extended double-precision floating-
2993 | point value `a' to the single-precision floating-point format. The
2994 | conversion is performed according to the IEC/IEEE Standard for Binary
2995 | Floating-Point Arithmetic.
2996 *----------------------------------------------------------------------------*/
2998 float32 floatx80_to_float32( floatx80 a )
3004 aSig = extractFloatx80Frac( a );
3005 aExp = extractFloatx80Exp( a );
3006 aSign = extractFloatx80Sign( a );
3007 if ( aExp == 0x7FFF ) {
3008 if ( (bits64) ( aSig<<1 ) ) {
3009 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3011 return packFloat32( aSign, 0xFF, 0 );
3013 shift64RightJamming( aSig, 33, &aSig );
3014 if ( aExp || aSig ) aExp -= 0x3F81;
3015 return roundAndPackFloat32( aSign, aExp, aSig );
3019 /*----------------------------------------------------------------------------
3020 | Returns the result of converting the extended double-precision floating-
3021 | point value `a' to the double-precision floating-point format. The
3022 | conversion is performed according to the IEC/IEEE Standard for Binary
3023 | Floating-Point Arithmetic.
3024 *----------------------------------------------------------------------------*/
3026 float64 floatx80_to_float64( floatx80 a )
3032 aSig = extractFloatx80Frac( a );
3033 aExp = extractFloatx80Exp( a );
3034 aSign = extractFloatx80Sign( a );
3035 if ( aExp == 0x7FFF ) {
3036 if ( (bits64) ( aSig<<1 ) ) {
3037 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3039 return packFloat64( aSign, 0x7FF, 0 );
3041 shift64RightJamming( aSig, 1, &zSig );
3042 if ( aExp || aSig ) aExp -= 0x3C01;
3043 return roundAndPackFloat64( aSign, aExp, zSig );
3049 /*----------------------------------------------------------------------------
3050 | Returns the result of converting the extended double-precision floating-
3051 | point value `a' to the quadruple-precision floating-point format. The
3052 | conversion is performed according to the IEC/IEEE Standard for Binary
3053 | Floating-Point Arithmetic.
3054 *----------------------------------------------------------------------------*/
3056 float128 floatx80_to_float128( floatx80 a )
3060 bits64 aSig, zSig0, zSig1;
3062 aSig = extractFloatx80Frac( a );
3063 aExp = extractFloatx80Exp( a );
3064 aSign = extractFloatx80Sign( a );
3065 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3066 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3068 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3069 return packFloat128( aSign, aExp, zSig0, zSig1 );
3075 /*----------------------------------------------------------------------------
3076 | Rounds the extended double-precision floating-point value `a' to an integer,
3077 | and returns the result as an extended quadruple-precision floating-point
3078 | value. The operation is performed according to the IEC/IEEE Standard for
3079 | Binary Floating-Point Arithmetic.
3080 *----------------------------------------------------------------------------*/
3082 floatx80 floatx80_round_to_int( floatx80 a )
3086 bits64 lastBitMask, roundBitsMask;
3090 aExp = extractFloatx80Exp( a );
3091 if ( 0x403E <= aExp ) {
3092 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3093 return propagateFloatx80NaN( a, a );
3097 if ( aExp < 0x3FFF ) {
3099 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3102 float_exception_flags |= float_flag_inexact;
3103 aSign = extractFloatx80Sign( a );
3104 switch ( float_rounding_mode ) {
3105 case float_round_nearest_even:
3106 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3109 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3112 case float_round_down:
3115 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3116 : packFloatx80( 0, 0, 0 );
3117 case float_round_up:
3119 aSign ? packFloatx80( 1, 0, 0 )
3120 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3122 return packFloatx80( aSign, 0, 0 );
3125 lastBitMask <<= 0x403E - aExp;
3126 roundBitsMask = lastBitMask - 1;
3128 roundingMode = float_rounding_mode;
3129 if ( roundingMode == float_round_nearest_even ) {
3130 z.low += lastBitMask>>1;
3131 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3133 else if ( roundingMode != float_round_to_zero ) {
3134 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3135 z.low += roundBitsMask;
3138 z.low &= ~ roundBitsMask;
3141 z.low = LIT64( 0x8000000000000000 );
3143 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3148 /*----------------------------------------------------------------------------
3149 | Returns the result of adding the absolute values of the extended double-
3150 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3151 | negated before being returned. `zSign' is ignored if the result is a NaN.
3152 | The addition is performed according to the IEC/IEEE Standard for Binary
3153 | Floating-Point Arithmetic.
3154 *----------------------------------------------------------------------------*/
3156 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3158 int32 aExp, bExp, zExp;
3159 bits64 aSig, bSig, zSig0, zSig1;
3162 aSig = extractFloatx80Frac( a );
3163 aExp = extractFloatx80Exp( a );
3164 bSig = extractFloatx80Frac( b );
3165 bExp = extractFloatx80Exp( b );
3166 expDiff = aExp - bExp;
3167 if ( 0 < expDiff ) {
3168 if ( aExp == 0x7FFF ) {
3169 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3172 if ( bExp == 0 ) --expDiff;
3173 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3176 else if ( expDiff < 0 ) {
3177 if ( bExp == 0x7FFF ) {
3178 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3179 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3181 if ( aExp == 0 ) ++expDiff;
3182 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3186 if ( aExp == 0x7FFF ) {
3187 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3188 return propagateFloatx80NaN( a, b );
3193 zSig0 = aSig + bSig;
3195 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3201 zSig0 = aSig + bSig;
3202 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3204 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3205 zSig0 |= LIT64( 0x8000000000000000 );
3209 roundAndPackFloatx80(
3210 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3214 /*----------------------------------------------------------------------------
3215 | Returns the result of subtracting the absolute values of the extended
3216 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3217 | difference is negated before being returned. `zSign' is ignored if the
3218 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3219 | Standard for Binary Floating-Point Arithmetic.
3220 *----------------------------------------------------------------------------*/
3222 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3224 int32 aExp, bExp, zExp;
3225 bits64 aSig, bSig, zSig0, zSig1;
3229 aSig = extractFloatx80Frac( a );
3230 aExp = extractFloatx80Exp( a );
3231 bSig = extractFloatx80Frac( b );
3232 bExp = extractFloatx80Exp( b );
3233 expDiff = aExp - bExp;
3234 if ( 0 < expDiff ) goto aExpBigger;
3235 if ( expDiff < 0 ) goto bExpBigger;
3236 if ( aExp == 0x7FFF ) {
3237 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3238 return propagateFloatx80NaN( a, b );
3240 float_raise( float_flag_invalid );
3241 z.low = floatx80_default_nan_low;
3242 z.high = floatx80_default_nan_high;
3250 if ( bSig < aSig ) goto aBigger;
3251 if ( aSig < bSig ) goto bBigger;
3252 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3254 if ( bExp == 0x7FFF ) {
3255 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3256 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3258 if ( aExp == 0 ) ++expDiff;
3259 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3261 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3264 goto normalizeRoundAndPack;
3266 if ( aExp == 0x7FFF ) {
3267 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3270 if ( bExp == 0 ) --expDiff;
3271 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3273 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3275 normalizeRoundAndPack:
3277 normalizeRoundAndPackFloatx80(
3278 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3282 /*----------------------------------------------------------------------------
3283 | Returns the result of adding the extended double-precision floating-point
3284 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3285 | Standard for Binary Floating-Point Arithmetic.
3286 *----------------------------------------------------------------------------*/
3288 floatx80 floatx80_add( floatx80 a, floatx80 b )
3292 aSign = extractFloatx80Sign( a );
3293 bSign = extractFloatx80Sign( b );
3294 if ( aSign == bSign ) {
3295 return addFloatx80Sigs( a, b, aSign );
3298 return subFloatx80Sigs( a, b, aSign );
3303 /*----------------------------------------------------------------------------
3304 | Returns the result of subtracting the extended double-precision floating-
3305 | point values `a' and `b'. The operation is performed according to the
3306 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3307 *----------------------------------------------------------------------------*/
3309 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3313 aSign = extractFloatx80Sign( a );
3314 bSign = extractFloatx80Sign( b );
3315 if ( aSign == bSign ) {
3316 return subFloatx80Sigs( a, b, aSign );
3319 return addFloatx80Sigs( a, b, aSign );
3324 /*----------------------------------------------------------------------------
3325 | Returns the result of multiplying the extended double-precision floating-
3326 | point values `a' and `b'. The operation is performed according to the
3327 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3328 *----------------------------------------------------------------------------*/
3330 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3332 flag aSign, bSign, zSign;
3333 int32 aExp, bExp, zExp;
3334 bits64 aSig, bSig, zSig0, zSig1;
3337 aSig = extractFloatx80Frac( a );
3338 aExp = extractFloatx80Exp( a );
3339 aSign = extractFloatx80Sign( a );
3340 bSig = extractFloatx80Frac( b );
3341 bExp = extractFloatx80Exp( b );
3342 bSign = extractFloatx80Sign( b );
3343 zSign = aSign ^ bSign;
3344 if ( aExp == 0x7FFF ) {
3345 if ( (bits64) ( aSig<<1 )
3346 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3347 return propagateFloatx80NaN( a, b );
3349 if ( ( bExp | bSig ) == 0 ) goto invalid;
3350 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3352 if ( bExp == 0x7FFF ) {
3353 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3354 if ( ( aExp | aSig ) == 0 ) {
3356 float_raise( float_flag_invalid );
3357 z.low = floatx80_default_nan_low;
3358 z.high = floatx80_default_nan_high;
3361 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3364 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3365 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3368 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3369 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3371 zExp = aExp + bExp - 0x3FFE;
3372 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3373 if ( 0 < (sbits64) zSig0 ) {
3374 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3378 roundAndPackFloatx80(
3379 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3383 /*----------------------------------------------------------------------------
3384 | Returns the result of dividing the extended double-precision floating-point
3385 | value `a' by the corresponding value `b'. The operation is performed
3386 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3387 *----------------------------------------------------------------------------*/
3389 floatx80 floatx80_div( floatx80 a, floatx80 b )
3391 flag aSign, bSign, zSign;
3392 int32 aExp, bExp, zExp;
3393 bits64 aSig, bSig, zSig0, zSig1;
3394 bits64 rem0, rem1, rem2, term0, term1, term2;
3397 aSig = extractFloatx80Frac( a );
3398 aExp = extractFloatx80Exp( a );
3399 aSign = extractFloatx80Sign( a );
3400 bSig = extractFloatx80Frac( b );
3401 bExp = extractFloatx80Exp( b );
3402 bSign = extractFloatx80Sign( b );
3403 zSign = aSign ^ bSign;
3404 if ( aExp == 0x7FFF ) {
3405 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3406 if ( bExp == 0x7FFF ) {
3407 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3410 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3412 if ( bExp == 0x7FFF ) {
3413 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3414 return packFloatx80( zSign, 0, 0 );
3418 if ( ( aExp | aSig ) == 0 ) {
3420 float_raise( float_flag_invalid );
3421 z.low = floatx80_default_nan_low;
3422 z.high = floatx80_default_nan_high;
3425 float_raise( float_flag_divbyzero );
3426 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3428 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3431 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3432 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3434 zExp = aExp - bExp + 0x3FFE;
3436 if ( bSig <= aSig ) {
3437 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3440 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3441 mul64To128( bSig, zSig0, &term0, &term1 );
3442 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3443 while ( (sbits64) rem0 < 0 ) {
3445 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3447 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3448 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3449 mul64To128( bSig, zSig1, &term1, &term2 );
3450 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3451 while ( (sbits64) rem1 < 0 ) {
3453 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3455 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3458 roundAndPackFloatx80(
3459 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3463 /*----------------------------------------------------------------------------
3464 | Returns the remainder of the extended double-precision floating-point value
3465 | `a' with respect to the corresponding value `b'. The operation is performed
3466 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3467 *----------------------------------------------------------------------------*/
3469 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3472 int32 aExp, bExp, expDiff;
3473 bits64 aSig0, aSig1, bSig;
3474 bits64 q, term0, term1, alternateASig0, alternateASig1;
3477 aSig0 = extractFloatx80Frac( a );
3478 aExp = extractFloatx80Exp( a );
3479 aSign = extractFloatx80Sign( a );
3480 bSig = extractFloatx80Frac( b );
3481 bExp = extractFloatx80Exp( b );
3482 // bSign = extractFloatx80Sign( b );
3483 if ( aExp == 0x7FFF ) {
3484 if ( (bits64) ( aSig0<<1 )
3485 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3486 return propagateFloatx80NaN( a, b );
3490 if ( bExp == 0x7FFF ) {
3491 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3497 float_raise( float_flag_invalid );
3498 z.low = floatx80_default_nan_low;
3499 z.high = floatx80_default_nan_high;
3502 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3505 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3506 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3508 bSig |= LIT64( 0x8000000000000000 );
3510 expDiff = aExp - bExp;
3512 if ( expDiff < 0 ) {
3513 if ( expDiff < -1 ) return a;
3514 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3517 q = ( bSig <= aSig0 );
3518 if ( q ) aSig0 -= bSig;
3520 while ( 0 < expDiff ) {
3521 q = estimateDiv128To64( aSig0, aSig1, bSig );
3522 q = ( 2 < q ) ? q - 2 : 0;
3523 mul64To128( bSig, q, &term0, &term1 );
3524 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3525 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3529 if ( 0 < expDiff ) {
3530 q = estimateDiv128To64( aSig0, aSig1, bSig );
3531 q = ( 2 < q ) ? q - 2 : 0;
3533 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3534 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3535 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3536 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3538 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3545 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3546 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3547 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3550 aSig0 = alternateASig0;
3551 aSig1 = alternateASig1;
3555 normalizeRoundAndPackFloatx80(
3556 80, zSign, bExp + expDiff, aSig0, aSig1 );
3560 /*----------------------------------------------------------------------------
3561 | Returns the square root of the extended double-precision floating-point
3562 | value `a'. The operation is performed according to the IEC/IEEE Standard
3563 | for Binary Floating-Point Arithmetic.
3564 *----------------------------------------------------------------------------*/
3566 floatx80 floatx80_sqrt( floatx80 a )
3570 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3571 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3574 aSig0 = extractFloatx80Frac( a );
3575 aExp = extractFloatx80Exp( a );
3576 aSign = extractFloatx80Sign( a );
3577 if ( aExp == 0x7FFF ) {
3578 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3579 if ( ! aSign ) return a;
3583 if ( ( aExp | aSig0 ) == 0 ) return a;
3585 float_raise( float_flag_invalid );
3586 z.low = floatx80_default_nan_low;
3587 z.high = floatx80_default_nan_high;
3591 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3592 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3594 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3595 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3596 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3597 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3598 doubleZSig0 = zSig0<<1;
3599 mul64To128( zSig0, zSig0, &term0, &term1 );
3600 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3601 while ( (sbits64) rem0 < 0 ) {
3604 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3606 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3607 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3608 if ( zSig1 == 0 ) zSig1 = 1;
3609 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3610 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3611 mul64To128( zSig1, zSig1, &term2, &term3 );
3612 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3613 while ( (sbits64) rem1 < 0 ) {
3615 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3617 term2 |= doubleZSig0;
3618 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3620 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3622 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3623 zSig0 |= doubleZSig0;
3625 roundAndPackFloatx80(
3626 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3630 /*----------------------------------------------------------------------------
3631 | Returns 1 if the extended double-precision floating-point value `a' is
3632 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3633 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3635 *----------------------------------------------------------------------------*/
3637 flag floatx80_eq( floatx80 a, floatx80 b )
3639 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3640 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3641 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3642 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3644 if ( floatx80_is_signaling_nan( a )
3645 || floatx80_is_signaling_nan( b ) ) {
3646 float_raise( float_flag_invalid );
3652 && ( ( a.high == b.high )
3654 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3659 /*----------------------------------------------------------------------------
3660 | Returns 1 if the extended double-precision floating-point value `a' is
3661 | less than or equal to the corresponding value `b', and 0 otherwise. The
3662 | comparison is performed according to the IEC/IEEE Standard for Binary
3663 | Floating-Point Arithmetic.
3664 *----------------------------------------------------------------------------*/
3666 flag floatx80_le( floatx80 a, floatx80 b )
3670 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3671 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3672 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3673 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3675 float_raise( float_flag_invalid );
3678 aSign = extractFloatx80Sign( a );
3679 bSign = extractFloatx80Sign( b );
3680 if ( aSign != bSign ) {
3683 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3687 aSign ? le128( b.high, b.low, a.high, a.low )
3688 : le128( a.high, a.low, b.high, b.low );
3692 /*----------------------------------------------------------------------------
3693 | Returns 1 if the extended double-precision floating-point value `a' is
3694 | less than the corresponding value `b', and 0 otherwise. The comparison
3695 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3697 *----------------------------------------------------------------------------*/
3699 flag floatx80_lt( floatx80 a, floatx80 b )
3703 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3704 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3705 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3706 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3708 float_raise( float_flag_invalid );
3711 aSign = extractFloatx80Sign( a );
3712 bSign = extractFloatx80Sign( b );
3713 if ( aSign != bSign ) {
3716 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3720 aSign ? lt128( b.high, b.low, a.high, a.low )
3721 : lt128( a.high, a.low, b.high, b.low );
3725 /*----------------------------------------------------------------------------
3726 | Returns 1 if the extended double-precision floating-point value `a' is equal
3727 | to the corresponding value `b', and 0 otherwise. The invalid exception is
3728 | raised if either operand is a NaN. Otherwise, the comparison is performed
3729 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3730 *----------------------------------------------------------------------------*/
3732 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3734 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3735 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3736 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3737 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3739 float_raise( float_flag_invalid );
3744 && ( ( a.high == b.high )
3746 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3751 /*----------------------------------------------------------------------------
3752 | Returns 1 if the extended double-precision floating-point value `a' is less
3753 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3754 | do not cause an exception. Otherwise, the comparison is performed according
3755 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3756 *----------------------------------------------------------------------------*/
3758 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3762 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3763 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3764 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3765 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3767 if ( floatx80_is_signaling_nan( a )
3768 || floatx80_is_signaling_nan( b ) ) {
3769 float_raise( float_flag_invalid );
3773 aSign = extractFloatx80Sign( a );
3774 bSign = extractFloatx80Sign( b );
3775 if ( aSign != bSign ) {
3778 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3782 aSign ? le128( b.high, b.low, a.high, a.low )
3783 : le128( a.high, a.low, b.high, b.low );
3787 /*----------------------------------------------------------------------------
3788 | Returns 1 if the extended double-precision floating-point value `a' is less
3789 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3790 | an exception. Otherwise, the comparison is performed according to the
3791 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3792 *----------------------------------------------------------------------------*/
3794 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3798 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3799 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3800 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3801 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3803 if ( floatx80_is_signaling_nan( a )
3804 || floatx80_is_signaling_nan( b ) ) {
3805 float_raise( float_flag_invalid );
3809 aSign = extractFloatx80Sign( a );
3810 bSign = extractFloatx80Sign( b );
3811 if ( aSign != bSign ) {
3814 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3818 aSign ? lt128( b.high, b.low, a.high, a.low )
3819 : lt128( a.high, a.low, b.high, b.low );
3827 /*----------------------------------------------------------------------------
3828 | Returns the result of converting the quadruple-precision floating-point
3829 | value `a' to the 32-bit two's complement integer format. The conversion
3830 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3831 | Arithmetic---which means in particular that the conversion is rounded
3832 | according to the current rounding mode. If `a' is a NaN, the largest
3833 | positive integer is returned. Otherwise, if the conversion overflows, the
3834 | largest integer with the same sign as `a' is returned.
3835 *----------------------------------------------------------------------------*/
3837 int32 float128_to_int32( float128 a )
3840 int32 aExp, shiftCount;
3841 bits64 aSig0, aSig1;
3843 aSig1 = extractFloat128Frac1( a );
3844 aSig0 = extractFloat128Frac0( a );
3845 aExp = extractFloat128Exp( a );
3846 aSign = extractFloat128Sign( a );
3847 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
3848 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3849 aSig0 |= ( aSig1 != 0 );
3850 shiftCount = 0x4028 - aExp;
3851 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
3852 return roundAndPackInt32( aSign, aSig0 );
3856 /*----------------------------------------------------------------------------
3857 | Returns the result of converting the quadruple-precision floating-point
3858 | value `a' to the 32-bit two's complement integer format. The conversion
3859 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3860 | Arithmetic, except that the conversion is always rounded toward zero. If
3861 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
3862 | conversion overflows, the largest integer with the same sign as `a' is
3864 *----------------------------------------------------------------------------*/
3866 int32 float128_to_int32_round_to_zero( float128 a )
3869 int32 aExp, shiftCount;
3870 bits64 aSig0, aSig1, savedASig;
3873 aSig1 = extractFloat128Frac1( a );
3874 aSig0 = extractFloat128Frac0( a );
3875 aExp = extractFloat128Exp( a );
3876 aSign = extractFloat128Sign( a );
3877 aSig0 |= ( aSig1 != 0 );
3878 if ( 0x401E < aExp ) {
3879 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
3882 else if ( aExp < 0x3FFF ) {
3883 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
3886 aSig0 |= LIT64( 0x0001000000000000 );
3887 shiftCount = 0x402F - aExp;
3889 aSig0 >>= shiftCount;
3891 if ( aSign ) z = - z;
3892 if ( ( z < 0 ) ^ aSign ) {
3894 float_raise( float_flag_invalid );
3895 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3897 if ( ( aSig0<<shiftCount ) != savedASig ) {
3898 float_exception_flags |= float_flag_inexact;
3904 /*----------------------------------------------------------------------------
3905 | Returns the result of converting the quadruple-precision floating-point
3906 | value `a' to the 64-bit two's complement integer format. The conversion
3907 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3908 | Arithmetic---which means in particular that the conversion is rounded
3909 | according to the current rounding mode. If `a' is a NaN, the largest
3910 | positive integer is returned. Otherwise, if the conversion overflows, the
3911 | largest integer with the same sign as `a' is returned.
3912 *----------------------------------------------------------------------------*/
3914 int64 float128_to_int64( float128 a )
3917 int32 aExp, shiftCount;
3918 bits64 aSig0, aSig1;
3920 aSig1 = extractFloat128Frac1( a );
3921 aSig0 = extractFloat128Frac0( a );
3922 aExp = extractFloat128Exp( a );
3923 aSign = extractFloat128Sign( a );
3924 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3925 shiftCount = 0x402F - aExp;
3926 if ( shiftCount <= 0 ) {
3927 if ( 0x403E < aExp ) {
3928 float_raise( float_flag_invalid );
3930 || ( ( aExp == 0x7FFF )
3931 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
3934 return LIT64( 0x7FFFFFFFFFFFFFFF );
3936 return (sbits64) LIT64( 0x8000000000000000 );
3938 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
3941 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
3943 return roundAndPackInt64( aSign, aSig0, aSig1 );
3947 /*----------------------------------------------------------------------------
3948 | Returns the result of converting the quadruple-precision floating-point
3949 | value `a' to the 64-bit two's complement integer format. The conversion
3950 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3951 | Arithmetic, except that the conversion is always rounded toward zero.
3952 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3953 | the conversion overflows, the largest integer with the same sign as `a' is
3955 *----------------------------------------------------------------------------*/
3957 int64 float128_to_int64_round_to_zero( float128 a )
3960 int32 aExp, shiftCount;
3961 bits64 aSig0, aSig1;
3964 aSig1 = extractFloat128Frac1( a );
3965 aSig0 = extractFloat128Frac0( a );
3966 aExp = extractFloat128Exp( a );
3967 aSign = extractFloat128Sign( a );
3968 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3969 shiftCount = aExp - 0x402F;
3970 if ( 0 < shiftCount ) {
3971 if ( 0x403E <= aExp ) {
3972 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
3973 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
3974 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
3975 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
3978 float_raise( float_flag_invalid );
3979 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
3980 return LIT64( 0x7FFFFFFFFFFFFFFF );
3983 return (sbits64) LIT64( 0x8000000000000000 );
3985 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
3986 if ( (bits64) ( aSig1<<shiftCount ) ) {
3987 float_exception_flags |= float_flag_inexact;
3991 if ( aExp < 0x3FFF ) {
3992 if ( aExp | aSig0 | aSig1 ) {
3993 float_exception_flags |= float_flag_inexact;
3997 z = aSig0>>( - shiftCount );
3999 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4000 float_exception_flags |= float_flag_inexact;
4003 if ( aSign ) z = - z;
4008 /*----------------------------------------------------------------------------
4009 | Returns the result of converting the quadruple-precision floating-point
4010 | value `a' to the single-precision floating-point format. The conversion
4011 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4013 *----------------------------------------------------------------------------*/
4015 float32 float128_to_float32( float128 a )
4019 bits64 aSig0, aSig1;
4022 aSig1 = extractFloat128Frac1( a );
4023 aSig0 = extractFloat128Frac0( a );
4024 aExp = extractFloat128Exp( a );
4025 aSign = extractFloat128Sign( a );
4026 if ( aExp == 0x7FFF ) {
4027 if ( aSig0 | aSig1 ) {
4028 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4030 return packFloat32( aSign, 0xFF, 0 );
4032 aSig0 |= ( aSig1 != 0 );
4033 shift64RightJamming( aSig0, 18, &aSig0 );
4035 if ( aExp || zSig ) {
4039 return roundAndPackFloat32( aSign, aExp, zSig );
4043 /*----------------------------------------------------------------------------
4044 | Returns the result of converting the quadruple-precision floating-point
4045 | value `a' to the double-precision floating-point format. The conversion
4046 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4048 *----------------------------------------------------------------------------*/
4050 float64 float128_to_float64( float128 a )
4054 bits64 aSig0, aSig1;
4056 aSig1 = extractFloat128Frac1( a );
4057 aSig0 = extractFloat128Frac0( a );
4058 aExp = extractFloat128Exp( a );
4059 aSign = extractFloat128Sign( a );
4060 if ( aExp == 0x7FFF ) {
4061 if ( aSig0 | aSig1 ) {
4062 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4064 return packFloat64( aSign, 0x7FF, 0 );
4066 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4067 aSig0 |= ( aSig1 != 0 );
4068 if ( aExp || aSig0 ) {
4069 aSig0 |= LIT64( 0x4000000000000000 );
4072 return roundAndPackFloat64( aSign, aExp, aSig0 );
4078 /*----------------------------------------------------------------------------
4079 | Returns the result of converting the quadruple-precision floating-point
4080 | value `a' to the extended double-precision floating-point format. The
4081 | conversion is performed according to the IEC/IEEE Standard for Binary
4082 | Floating-Point Arithmetic.
4083 *----------------------------------------------------------------------------*/
4085 floatx80 float128_to_floatx80( float128 a )
4089 bits64 aSig0, aSig1;
4091 aSig1 = extractFloat128Frac1( a );
4092 aSig0 = extractFloat128Frac0( a );
4093 aExp = extractFloat128Exp( a );
4094 aSign = extractFloat128Sign( a );
4095 if ( aExp == 0x7FFF ) {
4096 if ( aSig0 | aSig1 ) {
4097 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4099 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4102 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4103 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4106 aSig0 |= LIT64( 0x0001000000000000 );
4108 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4109 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4115 /*----------------------------------------------------------------------------
4116 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4117 | returns the result as a quadruple-precision floating-point value. The
4118 | operation is performed according to the IEC/IEEE Standard for Binary
4119 | Floating-Point Arithmetic.
4120 *----------------------------------------------------------------------------*/
4122 float128 float128_round_to_int( float128 a )
4126 bits64 lastBitMask, roundBitsMask;
4130 aExp = extractFloat128Exp( a );
4131 if ( 0x402F <= aExp ) {
4132 if ( 0x406F <= aExp ) {
4133 if ( ( aExp == 0x7FFF )
4134 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4136 return propagateFloat128NaN( a, a );
4141 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4142 roundBitsMask = lastBitMask - 1;
4144 roundingMode = float_rounding_mode;
4145 if ( roundingMode == float_round_nearest_even ) {
4146 if ( lastBitMask ) {
4147 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4148 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4151 if ( (sbits64) z.low < 0 ) {
4153 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4157 else if ( roundingMode != float_round_to_zero ) {
4158 if ( extractFloat128Sign( z )
4159 ^ ( roundingMode == float_round_up ) ) {
4160 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4163 z.low &= ~ roundBitsMask;
4166 if ( aExp < 0x3FFF ) {
4167 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4168 float_exception_flags |= float_flag_inexact;
4169 aSign = extractFloat128Sign( a );
4170 switch ( float_rounding_mode ) {
4171 case float_round_nearest_even:
4172 if ( ( aExp == 0x3FFE )
4173 && ( extractFloat128Frac0( a )
4174 | extractFloat128Frac1( a ) )
4176 return packFloat128( aSign, 0x3FFF, 0, 0 );
4179 case float_round_down:
4181 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4182 : packFloat128( 0, 0, 0, 0 );
4183 case float_round_up:
4185 aSign ? packFloat128( 1, 0, 0, 0 )
4186 : packFloat128( 0, 0x3FFF, 0, 0 );
4188 return packFloat128( aSign, 0, 0, 0 );
4191 lastBitMask <<= 0x402F - aExp;
4192 roundBitsMask = lastBitMask - 1;
4195 roundingMode = float_rounding_mode;
4196 if ( roundingMode == float_round_nearest_even ) {
4197 z.high += lastBitMask>>1;
4198 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4199 z.high &= ~ lastBitMask;
4202 else if ( roundingMode != float_round_to_zero ) {
4203 if ( extractFloat128Sign( z )
4204 ^ ( roundingMode == float_round_up ) ) {
4205 z.high |= ( a.low != 0 );
4206 z.high += roundBitsMask;
4209 z.high &= ~ roundBitsMask;
4211 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4212 float_exception_flags |= float_flag_inexact;
4218 /*----------------------------------------------------------------------------
4219 | Returns the result of adding the absolute values of the quadruple-precision
4220 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4221 | before being returned. `zSign' is ignored if the result is a NaN.
4222 | The addition is performed according to the IEC/IEEE Standard for Binary
4223 | Floating-Point Arithmetic.
4224 *----------------------------------------------------------------------------*/
4226 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4228 int32 aExp, bExp, zExp;
4229 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4232 aSig1 = extractFloat128Frac1( a );
4233 aSig0 = extractFloat128Frac0( a );
4234 aExp = extractFloat128Exp( a );
4235 bSig1 = extractFloat128Frac1( b );
4236 bSig0 = extractFloat128Frac0( b );
4237 bExp = extractFloat128Exp( b );
4238 expDiff = aExp - bExp;
4239 if ( 0 < expDiff ) {
4240 if ( aExp == 0x7FFF ) {
4241 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4248 bSig0 |= LIT64( 0x0001000000000000 );
4250 shift128ExtraRightJamming(
4251 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4254 else if ( expDiff < 0 ) {
4255 if ( bExp == 0x7FFF ) {
4256 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4257 return packFloat128( zSign, 0x7FFF, 0, 0 );
4263 aSig0 |= LIT64( 0x0001000000000000 );
4265 shift128ExtraRightJamming(
4266 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4270 if ( aExp == 0x7FFF ) {
4271 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4272 return propagateFloat128NaN( a, b );
4276 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4277 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4279 zSig0 |= LIT64( 0x0002000000000000 );
4283 aSig0 |= LIT64( 0x0001000000000000 );
4284 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4286 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4289 shift128ExtraRightJamming(
4290 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4292 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4296 /*----------------------------------------------------------------------------
4297 | Returns the result of subtracting the absolute values of the quadruple-
4298 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4299 | difference is negated before being returned. `zSign' is ignored if the
4300 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4301 | Standard for Binary Floating-Point Arithmetic.
4302 *----------------------------------------------------------------------------*/
4304 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4306 int32 aExp, bExp, zExp;
4307 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4311 aSig1 = extractFloat128Frac1( a );
4312 aSig0 = extractFloat128Frac0( a );
4313 aExp = extractFloat128Exp( a );
4314 bSig1 = extractFloat128Frac1( b );
4315 bSig0 = extractFloat128Frac0( b );
4316 bExp = extractFloat128Exp( b );
4317 expDiff = aExp - bExp;
4318 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4319 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4320 if ( 0 < expDiff ) goto aExpBigger;
4321 if ( expDiff < 0 ) goto bExpBigger;
4322 if ( aExp == 0x7FFF ) {
4323 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4324 return propagateFloat128NaN( a, b );
4326 float_raise( float_flag_invalid );
4327 z.low = float128_default_nan_low;
4328 z.high = float128_default_nan_high;
4335 if ( bSig0 < aSig0 ) goto aBigger;
4336 if ( aSig0 < bSig0 ) goto bBigger;
4337 if ( bSig1 < aSig1 ) goto aBigger;
4338 if ( aSig1 < bSig1 ) goto bBigger;
4339 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4341 if ( bExp == 0x7FFF ) {
4342 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4343 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4349 aSig0 |= LIT64( 0x4000000000000000 );
4351 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4352 bSig0 |= LIT64( 0x4000000000000000 );
4354 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4357 goto normalizeRoundAndPack;
4359 if ( aExp == 0x7FFF ) {
4360 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4367 bSig0 |= LIT64( 0x4000000000000000 );
4369 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4370 aSig0 |= LIT64( 0x4000000000000000 );
4372 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4374 normalizeRoundAndPack:
4376 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4380 /*----------------------------------------------------------------------------
4381 | Returns the result of adding the quadruple-precision floating-point values
4382 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4383 | for Binary Floating-Point Arithmetic.
4384 *----------------------------------------------------------------------------*/
4386 float128 float128_add( float128 a, float128 b )
4390 aSign = extractFloat128Sign( a );
4391 bSign = extractFloat128Sign( b );
4392 if ( aSign == bSign ) {
4393 return addFloat128Sigs( a, b, aSign );
4396 return subFloat128Sigs( a, b, aSign );
4401 /*----------------------------------------------------------------------------
4402 | Returns the result of subtracting the quadruple-precision floating-point
4403 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4404 | Standard for Binary Floating-Point Arithmetic.
4405 *----------------------------------------------------------------------------*/
4407 float128 float128_sub( float128 a, float128 b )
4411 aSign = extractFloat128Sign( a );
4412 bSign = extractFloat128Sign( b );
4413 if ( aSign == bSign ) {
4414 return subFloat128Sigs( a, b, aSign );
4417 return addFloat128Sigs( a, b, aSign );
4422 /*----------------------------------------------------------------------------
4423 | Returns the result of multiplying the quadruple-precision floating-point
4424 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4425 | Standard for Binary Floating-Point Arithmetic.
4426 *----------------------------------------------------------------------------*/
4428 float128 float128_mul( float128 a, float128 b )
4430 flag aSign, bSign, zSign;
4431 int32 aExp, bExp, zExp;
4432 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4435 aSig1 = extractFloat128Frac1( a );
4436 aSig0 = extractFloat128Frac0( a );
4437 aExp = extractFloat128Exp( a );
4438 aSign = extractFloat128Sign( a );
4439 bSig1 = extractFloat128Frac1( b );
4440 bSig0 = extractFloat128Frac0( b );
4441 bExp = extractFloat128Exp( b );
4442 bSign = extractFloat128Sign( b );
4443 zSign = aSign ^ bSign;
4444 if ( aExp == 0x7FFF ) {
4445 if ( ( aSig0 | aSig1 )
4446 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4447 return propagateFloat128NaN( a, b );
4449 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4450 return packFloat128( zSign, 0x7FFF, 0, 0 );
4452 if ( bExp == 0x7FFF ) {
4453 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4454 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4456 float_raise( float_flag_invalid );
4457 z.low = float128_default_nan_low;
4458 z.high = float128_default_nan_high;
4461 return packFloat128( zSign, 0x7FFF, 0, 0 );
4464 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4465 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4468 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4469 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4471 zExp = aExp + bExp - 0x4000;
4472 aSig0 |= LIT64( 0x0001000000000000 );
4473 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4474 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4475 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4476 zSig2 |= ( zSig3 != 0 );
4477 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4478 shift128ExtraRightJamming(
4479 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4482 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4486 /*----------------------------------------------------------------------------
4487 | Returns the result of dividing the quadruple-precision floating-point value
4488 | `a' by the corresponding value `b'. The operation is performed according to
4489 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4490 *----------------------------------------------------------------------------*/
4492 float128 float128_div( float128 a, float128 b )
4494 flag aSign, bSign, zSign;
4495 int32 aExp, bExp, zExp;
4496 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4497 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4500 aSig1 = extractFloat128Frac1( a );
4501 aSig0 = extractFloat128Frac0( a );
4502 aExp = extractFloat128Exp( a );
4503 aSign = extractFloat128Sign( a );
4504 bSig1 = extractFloat128Frac1( b );
4505 bSig0 = extractFloat128Frac0( b );
4506 bExp = extractFloat128Exp( b );
4507 bSign = extractFloat128Sign( b );
4508 zSign = aSign ^ bSign;
4509 if ( aExp == 0x7FFF ) {
4510 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4511 if ( bExp == 0x7FFF ) {
4512 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4515 return packFloat128( zSign, 0x7FFF, 0, 0 );
4517 if ( bExp == 0x7FFF ) {
4518 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4519 return packFloat128( zSign, 0, 0, 0 );
4522 if ( ( bSig0 | bSig1 ) == 0 ) {
4523 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4525 float_raise( float_flag_invalid );
4526 z.low = float128_default_nan_low;
4527 z.high = float128_default_nan_high;
4530 float_raise( float_flag_divbyzero );
4531 return packFloat128( zSign, 0x7FFF, 0, 0 );
4533 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4536 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4537 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4539 zExp = aExp - bExp + 0x3FFD;
4541 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4543 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4544 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4545 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4548 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4549 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4550 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4551 while ( (sbits64) rem0 < 0 ) {
4553 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4555 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4556 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4557 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4558 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4559 while ( (sbits64) rem1 < 0 ) {
4561 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4563 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4565 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4566 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4570 /*----------------------------------------------------------------------------
4571 | Returns the remainder of the quadruple-precision floating-point value `a'
4572 | with respect to the corresponding value `b'. The operation is performed
4573 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4574 *----------------------------------------------------------------------------*/
4576 float128 float128_rem( float128 a, float128 b )
4579 int32 aExp, bExp, expDiff;
4580 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4581 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4585 aSig1 = extractFloat128Frac1( a );
4586 aSig0 = extractFloat128Frac0( a );
4587 aExp = extractFloat128Exp( a );
4588 aSign = extractFloat128Sign( a );
4589 bSig1 = extractFloat128Frac1( b );
4590 bSig0 = extractFloat128Frac0( b );
4591 bExp = extractFloat128Exp( b );
4592 // bSign = extractFloat128Sign( b );
4593 if ( aExp == 0x7FFF ) {
4594 if ( ( aSig0 | aSig1 )
4595 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4596 return propagateFloat128NaN( a, b );
4600 if ( bExp == 0x7FFF ) {
4601 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4605 if ( ( bSig0 | bSig1 ) == 0 ) {
4607 float_raise( float_flag_invalid );
4608 z.low = float128_default_nan_low;
4609 z.high = float128_default_nan_high;
4612 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4615 if ( ( aSig0 | aSig1 ) == 0 ) return a;
4616 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4618 expDiff = aExp - bExp;
4619 if ( expDiff < -1 ) return a;
4621 aSig0 | LIT64( 0x0001000000000000 ),
4623 15 - ( expDiff < 0 ),
4628 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4629 q = le128( bSig0, bSig1, aSig0, aSig1 );
4630 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4632 while ( 0 < expDiff ) {
4633 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4634 q = ( 4 < q ) ? q - 4 : 0;
4635 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4636 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4637 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4638 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4641 if ( -64 < expDiff ) {
4642 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4643 q = ( 4 < q ) ? q - 4 : 0;
4645 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4647 if ( expDiff < 0 ) {
4648 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4651 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4653 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4654 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4657 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4658 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4661 alternateASig0 = aSig0;
4662 alternateASig1 = aSig1;
4664 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4665 } while ( 0 <= (sbits64) aSig0 );
4667 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
4668 if ( ( sigMean0 < 0 )
4669 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4670 aSig0 = alternateASig0;
4671 aSig1 = alternateASig1;
4673 zSign = ( (sbits64) aSig0 < 0 );
4674 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4676 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
4680 /*----------------------------------------------------------------------------
4681 | Returns the square root of the quadruple-precision floating-point value `a'.
4682 | The operation is performed according to the IEC/IEEE Standard for Binary
4683 | Floating-Point Arithmetic.
4684 *----------------------------------------------------------------------------*/
4686 float128 float128_sqrt( float128 a )
4690 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4691 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4694 aSig1 = extractFloat128Frac1( a );
4695 aSig0 = extractFloat128Frac0( a );
4696 aExp = extractFloat128Exp( a );
4697 aSign = extractFloat128Sign( a );
4698 if ( aExp == 0x7FFF ) {
4699 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
4700 if ( ! aSign ) return a;
4704 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4706 float_raise( float_flag_invalid );
4707 z.low = float128_default_nan_low;
4708 z.high = float128_default_nan_high;
4712 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4713 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4715 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4716 aSig0 |= LIT64( 0x0001000000000000 );
4717 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4718 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4719 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4720 doubleZSig0 = zSig0<<1;
4721 mul64To128( zSig0, zSig0, &term0, &term1 );
4722 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4723 while ( (sbits64) rem0 < 0 ) {
4726 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4728 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4729 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4730 if ( zSig1 == 0 ) zSig1 = 1;
4731 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4732 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4733 mul64To128( zSig1, zSig1, &term2, &term3 );
4734 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4735 while ( (sbits64) rem1 < 0 ) {
4737 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4739 term2 |= doubleZSig0;
4740 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4742 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4744 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4745 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
4749 /*----------------------------------------------------------------------------
4750 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4751 | the corresponding value `b', and 0 otherwise. The comparison is performed
4752 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4753 *----------------------------------------------------------------------------*/
4755 flag float128_eq( float128 a, float128 b )
4757 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4758 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4759 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4760 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4762 if ( float128_is_signaling_nan( a )
4763 || float128_is_signaling_nan( b ) ) {
4764 float_raise( float_flag_invalid );
4770 && ( ( a.high == b.high )
4772 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4777 /*----------------------------------------------------------------------------
4778 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4779 | or equal to the corresponding value `b', and 0 otherwise. The comparison
4780 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4782 *----------------------------------------------------------------------------*/
4784 flag float128_le( float128 a, float128 b )
4788 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4789 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4790 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4791 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4793 float_raise( float_flag_invalid );
4796 aSign = extractFloat128Sign( a );
4797 bSign = extractFloat128Sign( b );
4798 if ( aSign != bSign ) {
4801 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4805 aSign ? le128( b.high, b.low, a.high, a.low )
4806 : le128( a.high, a.low, b.high, b.low );
4810 /*----------------------------------------------------------------------------
4811 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4812 | the corresponding value `b', and 0 otherwise. The comparison is performed
4813 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4814 *----------------------------------------------------------------------------*/
4816 flag float128_lt( float128 a, float128 b )
4820 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4821 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4822 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4823 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4825 float_raise( float_flag_invalid );
4828 aSign = extractFloat128Sign( a );
4829 bSign = extractFloat128Sign( b );
4830 if ( aSign != bSign ) {
4833 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4837 aSign ? lt128( b.high, b.low, a.high, a.low )
4838 : lt128( a.high, a.low, b.high, b.low );
4842 /*----------------------------------------------------------------------------
4843 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4844 | the corresponding value `b', and 0 otherwise. The invalid exception is
4845 | raised if either operand is a NaN. Otherwise, the comparison is performed
4846 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4847 *----------------------------------------------------------------------------*/
4849 flag float128_eq_signaling( float128 a, float128 b )
4851 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4852 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4853 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4854 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4856 float_raise( float_flag_invalid );
4861 && ( ( a.high == b.high )
4863 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4868 /*----------------------------------------------------------------------------
4869 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4870 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4871 | cause an exception. Otherwise, the comparison is performed according to the
4872 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4873 *----------------------------------------------------------------------------*/
4875 flag float128_le_quiet( float128 a, float128 b )
4879 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4880 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4881 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4882 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4884 if ( float128_is_signaling_nan( a )
4885 || float128_is_signaling_nan( b ) ) {
4886 float_raise( float_flag_invalid );
4890 aSign = extractFloat128Sign( a );
4891 bSign = extractFloat128Sign( b );
4892 if ( aSign != bSign ) {
4895 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4899 aSign ? le128( b.high, b.low, a.high, a.low )
4900 : le128( a.high, a.low, b.high, b.low );
4904 /*----------------------------------------------------------------------------
4905 | Returns 1 if the quadruple-precision floating-point value `a' is less than
4906 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4907 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4908 | Standard for Binary Floating-Point Arithmetic.
4909 *----------------------------------------------------------------------------*/
4911 flag float128_lt_quiet( float128 a, float128 b )
4915 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4916 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4917 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4918 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4920 if ( float128_is_signaling_nan( a )
4921 || float128_is_signaling_nan( b ) ) {
4922 float_raise( float_flag_invalid );
4926 aSign = extractFloat128Sign( a );
4927 bSign = extractFloat128Sign( b );
4928 if ( aSign != bSign ) {
4931 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4935 aSign ? lt128( b.high, b.low, a.high, a.low )
4936 : lt128( a.high, a.low, b.high, b.low );