2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
33 #include "softfloat.h"
36 -------------------------------------------------------------------------------
37 Floating-point rounding mode, extended double-precision rounding precision,
39 -------------------------------------------------------------------------------
41 int8 float_rounding_mode = float_round_nearest_even;
42 int8 floatx80_rounding_precision = 80;
43 int8 float_exception_flags;
46 -------------------------------------------------------------------------------
47 Primitive arithmetic functions, including multi-word arithmetic, and
48 division and square root approximations. (Can be specialized to target if
50 -------------------------------------------------------------------------------
52 #include "softfloat-macros"
55 -------------------------------------------------------------------------------
56 Functions and definitions to determine: (1) whether tininess for underflow
57 is detected before or after rounding by default, (2) what (if anything)
58 happens when exceptions are raised, (3) how signaling NaNs are distinguished
59 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60 are propagated from function inputs to output. These details are target-
62 -------------------------------------------------------------------------------
64 #include "softfloat-specialize"
67 -------------------------------------------------------------------------------
68 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
69 and 7, and returns the properly rounded 32-bit integer corresponding to the
70 input. If `zSign' is nonzero, the input is negated before being converted
71 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
72 input is simply rounded to an integer, with the inexact exception raised if
73 the input cannot be represented exactly as an integer. If the fixed-point
74 input is too large, however, the invalid exception is raised and the largest
75 positive or negative integer is returned.
76 -------------------------------------------------------------------------------
78 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
81 flag roundNearestEven;
82 int8 roundIncrement, roundBits;
85 roundingMode = float_rounding_mode;
86 roundNearestEven = ( roundingMode == float_round_nearest_even );
87 roundIncrement = 0x40;
88 if ( ! roundNearestEven ) {
89 if ( roundingMode == float_round_to_zero ) {
93 roundIncrement = 0x7F;
95 if ( roundingMode == float_round_up ) roundIncrement = 0;
98 if ( roundingMode == float_round_down ) roundIncrement = 0;
102 roundBits = absZ & 0x7F;
103 absZ = ( absZ + roundIncrement )>>7;
104 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
106 if ( zSign ) z = - z;
107 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
108 float_exception_flags |= float_flag_invalid;
109 return zSign ? 0x80000000 : 0x7FFFFFFF;
111 if ( roundBits ) float_exception_flags |= float_flag_inexact;
117 -------------------------------------------------------------------------------
118 Returns the fraction bits of the single-precision floating-point value `a'.
119 -------------------------------------------------------------------------------
121 INLINE bits32 extractFloat32Frac( float32 a )
124 return a & 0x007FFFFF;
129 -------------------------------------------------------------------------------
130 Returns the exponent bits of the single-precision floating-point value `a'.
131 -------------------------------------------------------------------------------
133 INLINE int16 extractFloat32Exp( float32 a )
136 return ( a>>23 ) & 0xFF;
141 -------------------------------------------------------------------------------
142 Returns the sign bit of the single-precision floating-point value `a'.
143 -------------------------------------------------------------------------------
145 INLINE flag extractFloat32Sign( float32 a )
153 -------------------------------------------------------------------------------
154 Normalizes the subnormal single-precision floating-point value represented
155 by the denormalized significand `aSig'. The normalized exponent and
156 significand are stored at the locations pointed to by `zExpPtr' and
157 `zSigPtr', respectively.
158 -------------------------------------------------------------------------------
161 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
165 shiftCount = countLeadingZeros32( aSig ) - 8;
166 *zSigPtr = aSig<<shiftCount;
167 *zExpPtr = 1 - shiftCount;
172 -------------------------------------------------------------------------------
173 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
174 single-precision floating-point value, returning the result. After being
175 shifted into the proper positions, the three fields are simply added
176 together to form the result. This means that any integer portion of `zSig'
177 will be added into the exponent. Since a properly normalized significand
178 will have an integer portion equal to 1, the `zExp' input should be 1 less
179 than the desired result exponent whenever `zSig' is a complete, normalized
181 -------------------------------------------------------------------------------
183 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
187 __asm__("@ packFloat32; \n\
188 mov %0, %1, asl #31; \n\
189 orr %0, %2, asl #23; \n\
192 : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
196 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
201 -------------------------------------------------------------------------------
202 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
203 and significand `zSig', and returns the proper single-precision floating-
204 point value corresponding to the abstract input. Ordinarily, the abstract
205 value is simply rounded and packed into the single-precision format, with
206 the inexact exception raised if the abstract input cannot be represented
207 exactly. If the abstract value is too large, however, the overflow and
208 inexact exceptions are raised and an infinity or maximal finite value is
209 returned. If the abstract value is too small, the input value is rounded to
210 a subnormal number, and the underflow and inexact exceptions are raised if
211 the abstract input cannot be represented exactly as a subnormal single-
212 precision floating-point number.
213 The input significand `zSig' has its binary point between bits 30
214 and 29, which is 7 bits to the left of the usual location. This shifted
215 significand must be normalized or smaller. If `zSig' is not normalized,
216 `zExp' must be 0; in that case, the result returned is a subnormal number,
217 and it must not require rounding. In the usual case that `zSig' is
218 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
219 The handling of underflow and overflow follows the IEC/IEEE Standard for
220 Binary Floating-point Arithmetic.
221 -------------------------------------------------------------------------------
223 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
226 flag roundNearestEven;
227 int8 roundIncrement, roundBits;
230 roundingMode = float_rounding_mode;
231 roundNearestEven = ( roundingMode == float_round_nearest_even );
232 roundIncrement = 0x40;
233 if ( ! roundNearestEven ) {
234 if ( roundingMode == float_round_to_zero ) {
238 roundIncrement = 0x7F;
240 if ( roundingMode == float_round_up ) roundIncrement = 0;
243 if ( roundingMode == float_round_down ) roundIncrement = 0;
247 roundBits = zSig & 0x7F;
248 if ( 0xFD <= (bits16) zExp ) {
250 || ( ( zExp == 0xFD )
251 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
253 float_raise( float_flag_overflow | float_flag_inexact );
254 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
258 ( float_detect_tininess == float_tininess_before_rounding )
260 || ( zSig + roundIncrement < 0x80000000 );
261 shift32RightJamming( zSig, - zExp, &zSig );
263 roundBits = zSig & 0x7F;
264 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
267 if ( roundBits ) float_exception_flags |= float_flag_inexact;
268 zSig = ( zSig + roundIncrement )>>7;
269 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
270 if ( zSig == 0 ) zExp = 0;
271 return packFloat32( zSign, zExp, zSig );
276 -------------------------------------------------------------------------------
277 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
278 and significand `zSig', and returns the proper single-precision floating-
279 point value corresponding to the abstract input. This routine is just like
280 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
281 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
283 -------------------------------------------------------------------------------
286 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
290 shiftCount = countLeadingZeros32( zSig ) - 1;
291 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
296 -------------------------------------------------------------------------------
297 Returns the fraction bits of the double-precision floating-point value `a'.
298 -------------------------------------------------------------------------------
300 INLINE bits64 extractFloat64Frac( float64 a )
303 return a & LIT64( 0x000FFFFFFFFFFFFF );
308 -------------------------------------------------------------------------------
309 Returns the exponent bits of the double-precision floating-point value `a'.
310 -------------------------------------------------------------------------------
312 INLINE int16 extractFloat64Exp( float64 a )
315 return ( a>>52 ) & 0x7FF;
320 -------------------------------------------------------------------------------
321 Returns the sign bit of the double-precision floating-point value `a'.
322 -------------------------------------------------------------------------------
324 INLINE flag extractFloat64Sign( float64 a )
332 -------------------------------------------------------------------------------
333 Normalizes the subnormal double-precision floating-point value represented
334 by the denormalized significand `aSig'. The normalized exponent and
335 significand are stored at the locations pointed to by `zExpPtr' and
336 `zSigPtr', respectively.
337 -------------------------------------------------------------------------------
340 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
344 shiftCount = countLeadingZeros64( aSig ) - 11;
345 *zSigPtr = aSig<<shiftCount;
346 *zExpPtr = 1 - shiftCount;
351 -------------------------------------------------------------------------------
352 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
353 double-precision floating-point value, returning the result. After being
354 shifted into the proper positions, the three fields are simply added
355 together to form the result. This means that any integer portion of `zSig'
356 will be added into the exponent. Since a properly normalized significand
357 will have an integer portion equal to 1, the `zExp' input should be 1 less
358 than the desired result exponent whenever `zSig' is a complete, normalized
360 -------------------------------------------------------------------------------
362 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
365 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
370 -------------------------------------------------------------------------------
371 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
372 and significand `zSig', and returns the proper double-precision floating-
373 point value corresponding to the abstract input. Ordinarily, the abstract
374 value is simply rounded and packed into the double-precision format, with
375 the inexact exception raised if the abstract input cannot be represented
376 exactly. If the abstract value is too large, however, the overflow and
377 inexact exceptions are raised and an infinity or maximal finite value is
378 returned. If the abstract value is too small, the input value is rounded to
379 a subnormal number, and the underflow and inexact exceptions are raised if
380 the abstract input cannot be represented exactly as a subnormal double-
381 precision floating-point number.
382 The input significand `zSig' has its binary point between bits 62
383 and 61, which is 10 bits to the left of the usual location. This shifted
384 significand must be normalized or smaller. If `zSig' is not normalized,
385 `zExp' must be 0; in that case, the result returned is a subnormal number,
386 and it must not require rounding. In the usual case that `zSig' is
387 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
388 The handling of underflow and overflow follows the IEC/IEEE Standard for
389 Binary Floating-point Arithmetic.
390 -------------------------------------------------------------------------------
392 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
395 flag roundNearestEven;
396 int16 roundIncrement, roundBits;
399 roundingMode = float_rounding_mode;
400 roundNearestEven = ( roundingMode == float_round_nearest_even );
401 roundIncrement = 0x200;
402 if ( ! roundNearestEven ) {
403 if ( roundingMode == float_round_to_zero ) {
407 roundIncrement = 0x3FF;
409 if ( roundingMode == float_round_up ) roundIncrement = 0;
412 if ( roundingMode == float_round_down ) roundIncrement = 0;
416 roundBits = zSig & 0x3FF;
417 if ( 0x7FD <= (bits16) zExp ) {
418 if ( ( 0x7FD < zExp )
419 || ( ( zExp == 0x7FD )
420 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
422 //register int lr = __builtin_return_address(0);
423 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
424 float_raise( float_flag_overflow | float_flag_inexact );
425 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
429 ( float_detect_tininess == float_tininess_before_rounding )
431 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
432 shift64RightJamming( zSig, - zExp, &zSig );
434 roundBits = zSig & 0x3FF;
435 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
438 if ( roundBits ) float_exception_flags |= float_flag_inexact;
439 zSig = ( zSig + roundIncrement )>>10;
440 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
441 if ( zSig == 0 ) zExp = 0;
442 return packFloat64( zSign, zExp, zSig );
447 -------------------------------------------------------------------------------
448 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
449 and significand `zSig', and returns the proper double-precision floating-
450 point value corresponding to the abstract input. This routine is just like
451 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
452 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
454 -------------------------------------------------------------------------------
457 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
461 shiftCount = countLeadingZeros64( zSig ) - 1;
462 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
469 -------------------------------------------------------------------------------
470 Returns the fraction bits of the extended double-precision floating-point
472 -------------------------------------------------------------------------------
474 INLINE bits64 extractFloatx80Frac( floatx80 a )
482 -------------------------------------------------------------------------------
483 Returns the exponent bits of the extended double-precision floating-point
485 -------------------------------------------------------------------------------
487 INLINE int32 extractFloatx80Exp( floatx80 a )
490 return a.high & 0x7FFF;
495 -------------------------------------------------------------------------------
496 Returns the sign bit of the extended double-precision floating-point value
498 -------------------------------------------------------------------------------
500 INLINE flag extractFloatx80Sign( floatx80 a )
508 -------------------------------------------------------------------------------
509 Normalizes the subnormal extended double-precision floating-point value
510 represented by the denormalized significand `aSig'. The normalized exponent
511 and significand are stored at the locations pointed to by `zExpPtr' and
512 `zSigPtr', respectively.
513 -------------------------------------------------------------------------------
516 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
520 shiftCount = countLeadingZeros64( aSig );
521 *zSigPtr = aSig<<shiftCount;
522 *zExpPtr = 1 - shiftCount;
527 -------------------------------------------------------------------------------
528 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
529 extended double-precision floating-point value, returning the result.
530 -------------------------------------------------------------------------------
532 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
537 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
543 -------------------------------------------------------------------------------
544 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
545 and extended significand formed by the concatenation of `zSig0' and `zSig1',
546 and returns the proper extended double-precision floating-point value
547 corresponding to the abstract input. Ordinarily, the abstract value is
548 rounded and packed into the extended double-precision format, with the
549 inexact exception raised if the abstract input cannot be represented
550 exactly. If the abstract value is too large, however, the overflow and
551 inexact exceptions are raised and an infinity or maximal finite value is
552 returned. If the abstract value is too small, the input value is rounded to
553 a subnormal number, and the underflow and inexact exceptions are raised if
554 the abstract input cannot be represented exactly as a subnormal extended
555 double-precision floating-point number.
556 If `roundingPrecision' is 32 or 64, the result is rounded to the same
557 number of bits as single or double precision, respectively. Otherwise, the
558 result is rounded to the full precision of the extended double-precision
560 The input significand must be normalized or smaller. If the input
561 significand is not normalized, `zExp' must be 0; in that case, the result
562 returned is a subnormal number, and it must not require rounding. The
563 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
564 Floating-point Arithmetic.
565 -------------------------------------------------------------------------------
568 roundAndPackFloatx80(
569 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
573 flag roundNearestEven, increment, isTiny;
574 int64 roundIncrement, roundMask, roundBits;
576 roundingMode = float_rounding_mode;
577 roundNearestEven = ( roundingMode == float_round_nearest_even );
578 if ( roundingPrecision == 80 ) goto precision80;
579 if ( roundingPrecision == 64 ) {
580 roundIncrement = LIT64( 0x0000000000000400 );
581 roundMask = LIT64( 0x00000000000007FF );
583 else if ( roundingPrecision == 32 ) {
584 roundIncrement = LIT64( 0x0000008000000000 );
585 roundMask = LIT64( 0x000000FFFFFFFFFF );
590 zSig0 |= ( zSig1 != 0 );
591 if ( ! roundNearestEven ) {
592 if ( roundingMode == float_round_to_zero ) {
596 roundIncrement = roundMask;
598 if ( roundingMode == float_round_up ) roundIncrement = 0;
601 if ( roundingMode == float_round_down ) roundIncrement = 0;
605 roundBits = zSig0 & roundMask;
606 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
607 if ( ( 0x7FFE < zExp )
608 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
614 ( float_detect_tininess == float_tininess_before_rounding )
616 || ( zSig0 <= zSig0 + roundIncrement );
617 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
619 roundBits = zSig0 & roundMask;
620 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
621 if ( roundBits ) float_exception_flags |= float_flag_inexact;
622 zSig0 += roundIncrement;
623 if ( (sbits64) zSig0 < 0 ) zExp = 1;
624 roundIncrement = roundMask + 1;
625 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
626 roundMask |= roundIncrement;
628 zSig0 &= ~ roundMask;
629 return packFloatx80( zSign, zExp, zSig0 );
632 if ( roundBits ) float_exception_flags |= float_flag_inexact;
633 zSig0 += roundIncrement;
634 if ( zSig0 < roundIncrement ) {
636 zSig0 = LIT64( 0x8000000000000000 );
638 roundIncrement = roundMask + 1;
639 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
640 roundMask |= roundIncrement;
642 zSig0 &= ~ roundMask;
643 if ( zSig0 == 0 ) zExp = 0;
644 return packFloatx80( zSign, zExp, zSig0 );
646 increment = ( (sbits64) zSig1 < 0 );
647 if ( ! roundNearestEven ) {
648 if ( roundingMode == float_round_to_zero ) {
653 increment = ( roundingMode == float_round_down ) && zSig1;
656 increment = ( roundingMode == float_round_up ) && zSig1;
660 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
661 if ( ( 0x7FFE < zExp )
662 || ( ( zExp == 0x7FFE )
663 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
669 float_raise( float_flag_overflow | float_flag_inexact );
670 if ( ( roundingMode == float_round_to_zero )
671 || ( zSign && ( roundingMode == float_round_up ) )
672 || ( ! zSign && ( roundingMode == float_round_down ) )
674 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
676 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
680 ( float_detect_tininess == float_tininess_before_rounding )
683 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
684 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
686 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
687 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
688 if ( roundNearestEven ) {
689 increment = ( (sbits64) zSig1 < 0 );
693 increment = ( roundingMode == float_round_down ) && zSig1;
696 increment = ( roundingMode == float_round_up ) && zSig1;
701 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
702 if ( (sbits64) zSig0 < 0 ) zExp = 1;
704 return packFloatx80( zSign, zExp, zSig0 );
707 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
712 zSig0 = LIT64( 0x8000000000000000 );
715 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
719 if ( zSig0 == 0 ) zExp = 0;
722 return packFloatx80( zSign, zExp, zSig0 );
726 -------------------------------------------------------------------------------
727 Takes an abstract floating-point value having sign `zSign', exponent
728 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
729 and returns the proper extended double-precision floating-point value
730 corresponding to the abstract input. This routine is just like
731 `roundAndPackFloatx80' except that the input significand does not have to be
733 -------------------------------------------------------------------------------
736 normalizeRoundAndPackFloatx80(
737 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
747 shiftCount = countLeadingZeros64( zSig0 );
748 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
751 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
758 -------------------------------------------------------------------------------
759 Returns the result of converting the 32-bit two's complement integer `a' to
760 the single-precision floating-point format. The conversion is performed
761 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
762 -------------------------------------------------------------------------------
764 float32 int32_to_float32( int32 a )
768 if ( a == 0 ) return 0;
769 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
771 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
776 -------------------------------------------------------------------------------
777 Returns the result of converting the 32-bit two's complement integer `a' to
778 the double-precision floating-point format. The conversion is performed
779 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
780 -------------------------------------------------------------------------------
782 float64 int32_to_float64( int32 a )
789 if ( a == 0 ) return 0;
791 absA = aSign ? - a : a;
792 shiftCount = countLeadingZeros32( absA ) + 21;
794 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
801 -------------------------------------------------------------------------------
802 Returns the result of converting the 32-bit two's complement integer `a'
803 to the extended double-precision floating-point format. The conversion
804 is performed according to the IEC/IEEE Standard for Binary Floating-point
806 -------------------------------------------------------------------------------
808 floatx80 int32_to_floatx80( int32 a )
815 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
817 absA = zSign ? - a : a;
818 shiftCount = countLeadingZeros32( absA ) + 32;
820 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
827 -------------------------------------------------------------------------------
828 Returns the result of converting the single-precision floating-point value
829 `a' to the 32-bit two's complement integer format. The conversion is
830 performed according to the IEC/IEEE Standard for Binary Floating-point
831 Arithmetic---which means in particular that the conversion is rounded
832 according to the current rounding mode. If `a' is a NaN, the largest
833 positive integer is returned. Otherwise, if the conversion overflows, the
834 largest integer with the same sign as `a' is returned.
835 -------------------------------------------------------------------------------
837 int32 float32_to_int32( float32 a )
840 int16 aExp, shiftCount;
844 aSig = extractFloat32Frac( a );
845 aExp = extractFloat32Exp( a );
846 aSign = extractFloat32Sign( a );
847 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
848 if ( aExp ) aSig |= 0x00800000;
849 shiftCount = 0xAF - aExp;
852 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
853 return roundAndPackInt32( aSign, zSig );
858 -------------------------------------------------------------------------------
859 Returns the result of converting the single-precision floating-point value
860 `a' to the 32-bit two's complement integer format. The conversion is
861 performed according to the IEC/IEEE Standard for Binary Floating-point
862 Arithmetic, except that the conversion is always rounded toward zero. If
863 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
864 conversion overflows, the largest integer with the same sign as `a' is
866 -------------------------------------------------------------------------------
868 int32 float32_to_int32_round_to_zero( float32 a )
871 int16 aExp, shiftCount;
875 aSig = extractFloat32Frac( a );
876 aExp = extractFloat32Exp( a );
877 aSign = extractFloat32Sign( a );
878 shiftCount = aExp - 0x9E;
879 if ( 0 <= shiftCount ) {
880 if ( a == 0xCF000000 ) return 0x80000000;
881 float_raise( float_flag_invalid );
882 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
885 else if ( aExp <= 0x7E ) {
886 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
889 aSig = ( aSig | 0x00800000 )<<8;
890 z = aSig>>( - shiftCount );
891 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
892 float_exception_flags |= float_flag_inexact;
894 return aSign ? - z : z;
899 -------------------------------------------------------------------------------
900 Returns the result of converting the single-precision floating-point value
901 `a' to the double-precision floating-point format. The conversion is
902 performed according to the IEC/IEEE Standard for Binary Floating-point
904 -------------------------------------------------------------------------------
906 float64 float32_to_float64( float32 a )
912 aSig = extractFloat32Frac( a );
913 aExp = extractFloat32Exp( a );
914 aSign = extractFloat32Sign( a );
915 if ( aExp == 0xFF ) {
916 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
917 return packFloat64( aSign, 0x7FF, 0 );
920 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
921 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
924 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
931 -------------------------------------------------------------------------------
932 Returns the result of converting the single-precision floating-point value
933 `a' to the extended double-precision floating-point format. The conversion
934 is performed according to the IEC/IEEE Standard for Binary Floating-point
936 -------------------------------------------------------------------------------
938 floatx80 float32_to_floatx80( float32 a )
944 aSig = extractFloat32Frac( a );
945 aExp = extractFloat32Exp( a );
946 aSign = extractFloat32Sign( a );
947 if ( aExp == 0xFF ) {
948 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
949 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
952 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
953 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
956 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
963 -------------------------------------------------------------------------------
964 Rounds the single-precision floating-point value `a' to an integer, and
965 returns the result as a single-precision floating-point value. The
966 operation is performed according to the IEC/IEEE Standard for Binary
967 Floating-point Arithmetic.
968 -------------------------------------------------------------------------------
970 float32 float32_round_to_int( float32 a )
974 bits32 lastBitMask, roundBitsMask;
978 aExp = extractFloat32Exp( a );
979 if ( 0x96 <= aExp ) {
980 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
981 return propagateFloat32NaN( a, a );
985 if ( aExp <= 0x7E ) {
986 if ( (bits32) ( a<<1 ) == 0 ) return a;
987 float_exception_flags |= float_flag_inexact;
988 aSign = extractFloat32Sign( a );
989 switch ( float_rounding_mode ) {
990 case float_round_nearest_even:
991 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
992 return packFloat32( aSign, 0x7F, 0 );
995 case float_round_down:
996 return aSign ? 0xBF800000 : 0;
998 return aSign ? 0x80000000 : 0x3F800000;
1000 return packFloat32( aSign, 0, 0 );
1003 lastBitMask <<= 0x96 - aExp;
1004 roundBitsMask = lastBitMask - 1;
1006 roundingMode = float_rounding_mode;
1007 if ( roundingMode == float_round_nearest_even ) {
1008 z += lastBitMask>>1;
1009 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1011 else if ( roundingMode != float_round_to_zero ) {
1012 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1016 z &= ~ roundBitsMask;
1017 if ( z != a ) float_exception_flags |= float_flag_inexact;
1023 -------------------------------------------------------------------------------
1024 Returns the result of adding the absolute values of the single-precision
1025 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1026 before being returned. `zSign' is ignored if the result is a NaN. The
1027 addition is performed according to the IEC/IEEE Standard for Binary
1028 Floating-point Arithmetic.
1029 -------------------------------------------------------------------------------
1031 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1033 int16 aExp, bExp, zExp;
1034 bits32 aSig, bSig, zSig;
1037 aSig = extractFloat32Frac( a );
1038 aExp = extractFloat32Exp( a );
1039 bSig = extractFloat32Frac( b );
1040 bExp = extractFloat32Exp( b );
1041 expDiff = aExp - bExp;
1044 if ( 0 < expDiff ) {
1045 if ( aExp == 0xFF ) {
1046 if ( aSig ) return propagateFloat32NaN( a, b );
1055 shift32RightJamming( bSig, expDiff, &bSig );
1058 else if ( expDiff < 0 ) {
1059 if ( bExp == 0xFF ) {
1060 if ( bSig ) return propagateFloat32NaN( a, b );
1061 return packFloat32( zSign, 0xFF, 0 );
1069 shift32RightJamming( aSig, - expDiff, &aSig );
1073 if ( aExp == 0xFF ) {
1074 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1077 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1078 zSig = 0x40000000 + aSig + bSig;
1083 zSig = ( aSig + bSig )<<1;
1085 if ( (sbits32) zSig < 0 ) {
1090 return roundAndPackFloat32( zSign, zExp, zSig );
1095 -------------------------------------------------------------------------------
1096 Returns the result of subtracting the absolute values of the single-
1097 precision floating-point values `a' and `b'. If `zSign' is true, the
1098 difference is negated before being returned. `zSign' is ignored if the
1099 result is a NaN. The subtraction is performed according to the IEC/IEEE
1100 Standard for Binary Floating-point Arithmetic.
1101 -------------------------------------------------------------------------------
1103 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1105 int16 aExp, bExp, zExp;
1106 bits32 aSig, bSig, zSig;
1109 aSig = extractFloat32Frac( a );
1110 aExp = extractFloat32Exp( a );
1111 bSig = extractFloat32Frac( b );
1112 bExp = extractFloat32Exp( b );
1113 expDiff = aExp - bExp;
1116 if ( 0 < expDiff ) goto aExpBigger;
1117 if ( expDiff < 0 ) goto bExpBigger;
1118 if ( aExp == 0xFF ) {
1119 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1120 float_raise( float_flag_invalid );
1121 return float32_default_nan;
1127 if ( bSig < aSig ) goto aBigger;
1128 if ( aSig < bSig ) goto bBigger;
1129 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1131 if ( bExp == 0xFF ) {
1132 if ( bSig ) return propagateFloat32NaN( a, b );
1133 return packFloat32( zSign ^ 1, 0xFF, 0 );
1141 shift32RightJamming( aSig, - expDiff, &aSig );
1147 goto normalizeRoundAndPack;
1149 if ( aExp == 0xFF ) {
1150 if ( aSig ) return propagateFloat32NaN( a, b );
1159 shift32RightJamming( bSig, expDiff, &bSig );
1164 normalizeRoundAndPack:
1166 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1171 -------------------------------------------------------------------------------
1172 Returns the result of adding the single-precision floating-point values `a'
1173 and `b'. The operation is performed according to the IEC/IEEE Standard for
1174 Binary Floating-point Arithmetic.
1175 -------------------------------------------------------------------------------
1177 float32 float32_add( float32 a, float32 b )
1181 aSign = extractFloat32Sign( a );
1182 bSign = extractFloat32Sign( b );
1183 if ( aSign == bSign ) {
1184 return addFloat32Sigs( a, b, aSign );
1187 return subFloat32Sigs( a, b, aSign );
1193 -------------------------------------------------------------------------------
1194 Returns the result of subtracting the single-precision floating-point values
1195 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1196 for Binary Floating-point Arithmetic.
1197 -------------------------------------------------------------------------------
1199 float32 float32_sub( float32 a, float32 b )
1203 aSign = extractFloat32Sign( a );
1204 bSign = extractFloat32Sign( b );
1205 if ( aSign == bSign ) {
1206 return subFloat32Sigs( a, b, aSign );
1209 return addFloat32Sigs( a, b, aSign );
1215 -------------------------------------------------------------------------------
1216 Returns the result of multiplying the single-precision floating-point values
1217 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1218 for Binary Floating-point Arithmetic.
1219 -------------------------------------------------------------------------------
1221 float32 float32_mul( float32 a, float32 b )
1223 flag aSign, bSign, zSign;
1224 int16 aExp, bExp, zExp;
1229 aSig = extractFloat32Frac( a );
1230 aExp = extractFloat32Exp( a );
1231 aSign = extractFloat32Sign( a );
1232 bSig = extractFloat32Frac( b );
1233 bExp = extractFloat32Exp( b );
1234 bSign = extractFloat32Sign( b );
1235 zSign = aSign ^ bSign;
1236 if ( aExp == 0xFF ) {
1237 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1238 return propagateFloat32NaN( a, b );
1240 if ( ( bExp | bSig ) == 0 ) {
1241 float_raise( float_flag_invalid );
1242 return float32_default_nan;
1244 return packFloat32( zSign, 0xFF, 0 );
1246 if ( bExp == 0xFF ) {
1247 if ( bSig ) return propagateFloat32NaN( a, b );
1248 if ( ( aExp | aSig ) == 0 ) {
1249 float_raise( float_flag_invalid );
1250 return float32_default_nan;
1252 return packFloat32( zSign, 0xFF, 0 );
1255 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1256 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1259 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1260 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1262 zExp = aExp + bExp - 0x7F;
1263 aSig = ( aSig | 0x00800000 )<<7;
1264 bSig = ( bSig | 0x00800000 )<<8;
1265 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1267 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1271 return roundAndPackFloat32( zSign, zExp, zSig );
1276 -------------------------------------------------------------------------------
1277 Returns the result of dividing the single-precision floating-point value `a'
1278 by the corresponding value `b'. The operation is performed according to the
1279 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1280 -------------------------------------------------------------------------------
1282 float32 float32_div( float32 a, float32 b )
1284 flag aSign, bSign, zSign;
1285 int16 aExp, bExp, zExp;
1286 bits32 aSig, bSig, zSig;
1288 aSig = extractFloat32Frac( a );
1289 aExp = extractFloat32Exp( a );
1290 aSign = extractFloat32Sign( a );
1291 bSig = extractFloat32Frac( b );
1292 bExp = extractFloat32Exp( b );
1293 bSign = extractFloat32Sign( b );
1294 zSign = aSign ^ bSign;
1295 if ( aExp == 0xFF ) {
1296 if ( aSig ) return propagateFloat32NaN( a, b );
1297 if ( bExp == 0xFF ) {
1298 if ( bSig ) return propagateFloat32NaN( a, b );
1299 float_raise( float_flag_invalid );
1300 return float32_default_nan;
1302 return packFloat32( zSign, 0xFF, 0 );
1304 if ( bExp == 0xFF ) {
1305 if ( bSig ) return propagateFloat32NaN( a, b );
1306 return packFloat32( zSign, 0, 0 );
1310 if ( ( aExp | aSig ) == 0 ) {
1311 float_raise( float_flag_invalid );
1312 return float32_default_nan;
1314 float_raise( float_flag_divbyzero );
1315 return packFloat32( zSign, 0xFF, 0 );
1317 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1320 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1321 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1323 zExp = aExp - bExp + 0x7D;
1324 aSig = ( aSig | 0x00800000 )<<7;
1325 bSig = ( bSig | 0x00800000 )<<8;
1326 if ( bSig <= ( aSig + aSig ) ) {
1330 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1331 if ( ( zSig & 0x3F ) == 0 ) {
1332 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1334 return roundAndPackFloat32( zSign, zExp, zSig );
1339 -------------------------------------------------------------------------------
1340 Returns the remainder of the single-precision floating-point value `a'
1341 with respect to the corresponding value `b'. The operation is performed
1342 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1343 -------------------------------------------------------------------------------
1345 float32 float32_rem( float32 a, float32 b )
1347 flag aSign, bSign, zSign;
1348 int16 aExp, bExp, expDiff;
1351 bits64 aSig64, bSig64, q64;
1352 bits32 alternateASig;
1355 aSig = extractFloat32Frac( a );
1356 aExp = extractFloat32Exp( a );
1357 aSign = extractFloat32Sign( a );
1358 bSig = extractFloat32Frac( b );
1359 bExp = extractFloat32Exp( b );
1360 bSign = extractFloat32Sign( b );
1361 if ( aExp == 0xFF ) {
1362 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1363 return propagateFloat32NaN( a, b );
1365 float_raise( float_flag_invalid );
1366 return float32_default_nan;
1368 if ( bExp == 0xFF ) {
1369 if ( bSig ) return propagateFloat32NaN( a, b );
1374 float_raise( float_flag_invalid );
1375 return float32_default_nan;
1377 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1380 if ( aSig == 0 ) return a;
1381 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1383 expDiff = aExp - bExp;
1386 if ( expDiff < 32 ) {
1389 if ( expDiff < 0 ) {
1390 if ( expDiff < -1 ) return a;
1393 q = ( bSig <= aSig );
1394 if ( q ) aSig -= bSig;
1395 if ( 0 < expDiff ) {
1396 q = ( ( (bits64) aSig )<<32 ) / bSig;
1399 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1407 if ( bSig <= aSig ) aSig -= bSig;
1408 aSig64 = ( (bits64) aSig )<<40;
1409 bSig64 = ( (bits64) bSig )<<40;
1411 while ( 0 < expDiff ) {
1412 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1413 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1414 aSig64 = - ( ( bSig * q64 )<<38 );
1418 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1419 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1420 q = q64>>( 64 - expDiff );
1422 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1425 alternateASig = aSig;
1428 } while ( 0 <= (sbits32) aSig );
1429 sigMean = aSig + alternateASig;
1430 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1431 aSig = alternateASig;
1433 zSign = ( (sbits32) aSig < 0 );
1434 if ( zSign ) aSig = - aSig;
1435 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1440 -------------------------------------------------------------------------------
1441 Returns the square root of the single-precision floating-point value `a'.
1442 The operation is performed according to the IEC/IEEE Standard for Binary
1443 Floating-point Arithmetic.
1444 -------------------------------------------------------------------------------
1446 float32 float32_sqrt( float32 a )
1453 aSig = extractFloat32Frac( a );
1454 aExp = extractFloat32Exp( a );
1455 aSign = extractFloat32Sign( a );
1456 if ( aExp == 0xFF ) {
1457 if ( aSig ) return propagateFloat32NaN( a, 0 );
1458 if ( ! aSign ) return a;
1459 float_raise( float_flag_invalid );
1460 return float32_default_nan;
1463 if ( ( aExp | aSig ) == 0 ) return a;
1464 float_raise( float_flag_invalid );
1465 return float32_default_nan;
1468 if ( aSig == 0 ) return 0;
1469 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1471 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1472 aSig = ( aSig | 0x00800000 )<<8;
1473 zSig = estimateSqrt32( aExp, aSig ) + 2;
1474 if ( ( zSig & 0x7F ) <= 5 ) {
1480 term = ( (bits64) zSig ) * zSig;
1481 rem = ( ( (bits64) aSig )<<32 ) - term;
1482 while ( (sbits64) rem < 0 ) {
1484 rem += ( ( (bits64) zSig )<<1 ) | 1;
1486 zSig |= ( rem != 0 );
1489 shift32RightJamming( zSig, 1, &zSig );
1490 return roundAndPackFloat32( 0, zExp, zSig );
1495 -------------------------------------------------------------------------------
1496 Returns 1 if the single-precision floating-point value `a' is equal to the
1497 corresponding value `b', and 0 otherwise. The comparison is performed
1498 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1499 -------------------------------------------------------------------------------
1501 flag float32_eq( float32 a, float32 b )
1504 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1505 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1507 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1508 float_raise( float_flag_invalid );
1512 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1517 -------------------------------------------------------------------------------
1518 Returns 1 if the single-precision floating-point value `a' is less than or
1519 equal to the corresponding value `b', and 0 otherwise. The comparison is
1520 performed according to the IEC/IEEE Standard for Binary Floating-point
1522 -------------------------------------------------------------------------------
1524 flag float32_le( float32 a, float32 b )
1528 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1529 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1531 float_raise( float_flag_invalid );
1534 aSign = extractFloat32Sign( a );
1535 bSign = extractFloat32Sign( b );
1536 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1537 return ( a == b ) || ( aSign ^ ( a < b ) );
1542 -------------------------------------------------------------------------------
1543 Returns 1 if the single-precision floating-point value `a' is less than
1544 the corresponding value `b', and 0 otherwise. The comparison is performed
1545 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1546 -------------------------------------------------------------------------------
1548 flag float32_lt( float32 a, float32 b )
1552 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1553 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1555 float_raise( float_flag_invalid );
1558 aSign = extractFloat32Sign( a );
1559 bSign = extractFloat32Sign( b );
1560 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1561 return ( a != b ) && ( aSign ^ ( a < b ) );
1566 -------------------------------------------------------------------------------
1567 Returns 1 if the single-precision floating-point value `a' is equal to the
1568 corresponding value `b', and 0 otherwise. The invalid exception is raised
1569 if either operand is a NaN. Otherwise, the comparison is performed
1570 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1571 -------------------------------------------------------------------------------
1573 flag float32_eq_signaling( float32 a, float32 b )
1576 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1577 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1579 float_raise( float_flag_invalid );
1582 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1587 -------------------------------------------------------------------------------
1588 Returns 1 if the single-precision floating-point value `a' is less than or
1589 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1590 cause an exception. Otherwise, the comparison is performed according to the
1591 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1592 -------------------------------------------------------------------------------
1594 flag float32_le_quiet( float32 a, float32 b )
1599 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1600 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1602 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1603 float_raise( float_flag_invalid );
1607 aSign = extractFloat32Sign( a );
1608 bSign = extractFloat32Sign( b );
1609 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1610 return ( a == b ) || ( aSign ^ ( a < b ) );
1615 -------------------------------------------------------------------------------
1616 Returns 1 if the single-precision floating-point value `a' is less than
1617 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1618 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1619 Standard for Binary Floating-point Arithmetic.
1620 -------------------------------------------------------------------------------
1622 flag float32_lt_quiet( float32 a, float32 b )
1626 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1627 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1629 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1630 float_raise( float_flag_invalid );
1634 aSign = extractFloat32Sign( a );
1635 bSign = extractFloat32Sign( b );
1636 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1637 return ( a != b ) && ( aSign ^ ( a < b ) );
1642 -------------------------------------------------------------------------------
1643 Returns the result of converting the double-precision floating-point value
1644 `a' to the 32-bit two's complement integer format. The conversion is
1645 performed according to the IEC/IEEE Standard for Binary Floating-point
1646 Arithmetic---which means in particular that the conversion is rounded
1647 according to the current rounding mode. If `a' is a NaN, the largest
1648 positive integer is returned. Otherwise, if the conversion overflows, the
1649 largest integer with the same sign as `a' is returned.
1650 -------------------------------------------------------------------------------
1652 int32 float64_to_int32( float64 a )
1655 int16 aExp, shiftCount;
1658 aSig = extractFloat64Frac( a );
1659 aExp = extractFloat64Exp( a );
1660 aSign = extractFloat64Sign( a );
1661 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1662 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1663 shiftCount = 0x42C - aExp;
1664 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1665 return roundAndPackInt32( aSign, aSig );
1670 -------------------------------------------------------------------------------
1671 Returns the result of converting the double-precision floating-point value
1672 `a' to the 32-bit two's complement integer format. The conversion is
1673 performed according to the IEC/IEEE Standard for Binary Floating-point
1674 Arithmetic, except that the conversion is always rounded toward zero. If
1675 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1676 conversion overflows, the largest integer with the same sign as `a' is
1678 -------------------------------------------------------------------------------
1680 int32 float64_to_int32_round_to_zero( float64 a )
1683 int16 aExp, shiftCount;
1684 bits64 aSig, savedASig;
1687 aSig = extractFloat64Frac( a );
1688 aExp = extractFloat64Exp( a );
1689 aSign = extractFloat64Sign( a );
1690 shiftCount = 0x433 - aExp;
1691 if ( shiftCount < 21 ) {
1692 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1695 else if ( 52 < shiftCount ) {
1696 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1699 aSig |= LIT64( 0x0010000000000000 );
1701 aSig >>= shiftCount;
1703 if ( aSign ) z = - z;
1704 if ( ( z < 0 ) ^ aSign ) {
1706 float_exception_flags |= float_flag_invalid;
1707 return aSign ? 0x80000000 : 0x7FFFFFFF;
1709 if ( ( aSig<<shiftCount ) != savedASig ) {
1710 float_exception_flags |= float_flag_inexact;
1717 -------------------------------------------------------------------------------
1718 Returns the result of converting the double-precision floating-point value
1719 `a' to the 32-bit two's complement unsigned integer format. The conversion
1720 is performed according to the IEC/IEEE Standard for Binary Floating-point
1721 Arithmetic---which means in particular that the conversion is rounded
1722 according to the current rounding mode. If `a' is a NaN, the largest
1723 positive integer is returned. Otherwise, if the conversion overflows, the
1724 largest positive integer is returned.
1725 -------------------------------------------------------------------------------
1727 int32 float64_to_uint32( float64 a )
1730 int16 aExp, shiftCount;
1733 aSig = extractFloat64Frac( a );
1734 aExp = extractFloat64Exp( a );
1735 aSign = 0; //extractFloat64Sign( a );
1736 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1737 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1738 shiftCount = 0x42C - aExp;
1739 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1740 return roundAndPackInt32( aSign, aSig );
1744 -------------------------------------------------------------------------------
1745 Returns the result of converting the double-precision floating-point value
1746 `a' to the 32-bit two's complement integer format. The conversion is
1747 performed according to the IEC/IEEE Standard for Binary Floating-point
1748 Arithmetic, except that the conversion is always rounded toward zero. If
1749 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1750 conversion overflows, the largest positive integer is returned.
1751 -------------------------------------------------------------------------------
1753 int32 float64_to_uint32_round_to_zero( float64 a )
1756 int16 aExp, shiftCount;
1757 bits64 aSig, savedASig;
1760 aSig = extractFloat64Frac( a );
1761 aExp = extractFloat64Exp( a );
1762 aSign = extractFloat64Sign( a );
1763 shiftCount = 0x433 - aExp;
1764 if ( shiftCount < 21 ) {
1765 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1768 else if ( 52 < shiftCount ) {
1769 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1772 aSig |= LIT64( 0x0010000000000000 );
1774 aSig >>= shiftCount;
1776 if ( aSign ) z = - z;
1777 if ( ( z < 0 ) ^ aSign ) {
1779 float_exception_flags |= float_flag_invalid;
1780 return aSign ? 0x80000000 : 0x7FFFFFFF;
1782 if ( ( aSig<<shiftCount ) != savedASig ) {
1783 float_exception_flags |= float_flag_inexact;
1789 -------------------------------------------------------------------------------
1790 Returns the result of converting the double-precision floating-point value
1791 `a' to the single-precision floating-point format. The conversion is
1792 performed according to the IEC/IEEE Standard for Binary Floating-point
1794 -------------------------------------------------------------------------------
1796 float32 float64_to_float32( float64 a )
1803 aSig = extractFloat64Frac( a );
1804 aExp = extractFloat64Exp( a );
1805 aSign = extractFloat64Sign( a );
1806 if ( aExp == 0x7FF ) {
1807 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1808 return packFloat32( aSign, 0xFF, 0 );
1810 shift64RightJamming( aSig, 22, &aSig );
1812 if ( aExp || zSig ) {
1816 return roundAndPackFloat32( aSign, aExp, zSig );
1823 -------------------------------------------------------------------------------
1824 Returns the result of converting the double-precision floating-point value
1825 `a' to the extended double-precision floating-point format. The conversion
1826 is performed according to the IEC/IEEE Standard for Binary Floating-point
1828 -------------------------------------------------------------------------------
1830 floatx80 float64_to_floatx80( float64 a )
1836 aSig = extractFloat64Frac( a );
1837 aExp = extractFloat64Exp( a );
1838 aSign = extractFloat64Sign( a );
1839 if ( aExp == 0x7FF ) {
1840 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1841 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1844 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1845 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1849 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1856 -------------------------------------------------------------------------------
1857 Rounds the double-precision floating-point value `a' to an integer, and
1858 returns the result as a double-precision floating-point value. The
1859 operation is performed according to the IEC/IEEE Standard for Binary
1860 Floating-point Arithmetic.
1861 -------------------------------------------------------------------------------
1863 float64 float64_round_to_int( float64 a )
1867 bits64 lastBitMask, roundBitsMask;
1871 aExp = extractFloat64Exp( a );
1872 if ( 0x433 <= aExp ) {
1873 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1874 return propagateFloat64NaN( a, a );
1878 if ( aExp <= 0x3FE ) {
1879 if ( (bits64) ( a<<1 ) == 0 ) return a;
1880 float_exception_flags |= float_flag_inexact;
1881 aSign = extractFloat64Sign( a );
1882 switch ( float_rounding_mode ) {
1883 case float_round_nearest_even:
1884 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1885 return packFloat64( aSign, 0x3FF, 0 );
1888 case float_round_down:
1889 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1890 case float_round_up:
1892 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1894 return packFloat64( aSign, 0, 0 );
1897 lastBitMask <<= 0x433 - aExp;
1898 roundBitsMask = lastBitMask - 1;
1900 roundingMode = float_rounding_mode;
1901 if ( roundingMode == float_round_nearest_even ) {
1902 z += lastBitMask>>1;
1903 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1905 else if ( roundingMode != float_round_to_zero ) {
1906 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1910 z &= ~ roundBitsMask;
1911 if ( z != a ) float_exception_flags |= float_flag_inexact;
1917 -------------------------------------------------------------------------------
1918 Returns the result of adding the absolute values of the double-precision
1919 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1920 before being returned. `zSign' is ignored if the result is a NaN. The
1921 addition is performed according to the IEC/IEEE Standard for Binary
1922 Floating-point Arithmetic.
1923 -------------------------------------------------------------------------------
1925 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1927 int16 aExp, bExp, zExp;
1928 bits64 aSig, bSig, zSig;
1931 aSig = extractFloat64Frac( a );
1932 aExp = extractFloat64Exp( a );
1933 bSig = extractFloat64Frac( b );
1934 bExp = extractFloat64Exp( b );
1935 expDiff = aExp - bExp;
1938 if ( 0 < expDiff ) {
1939 if ( aExp == 0x7FF ) {
1940 if ( aSig ) return propagateFloat64NaN( a, b );
1947 bSig |= LIT64( 0x2000000000000000 );
1949 shift64RightJamming( bSig, expDiff, &bSig );
1952 else if ( expDiff < 0 ) {
1953 if ( bExp == 0x7FF ) {
1954 if ( bSig ) return propagateFloat64NaN( a, b );
1955 return packFloat64( zSign, 0x7FF, 0 );
1961 aSig |= LIT64( 0x2000000000000000 );
1963 shift64RightJamming( aSig, - expDiff, &aSig );
1967 if ( aExp == 0x7FF ) {
1968 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1971 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1972 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1976 aSig |= LIT64( 0x2000000000000000 );
1977 zSig = ( aSig + bSig )<<1;
1979 if ( (sbits64) zSig < 0 ) {
1984 return roundAndPackFloat64( zSign, zExp, zSig );
1989 -------------------------------------------------------------------------------
1990 Returns the result of subtracting the absolute values of the double-
1991 precision floating-point values `a' and `b'. If `zSign' is true, the
1992 difference is negated before being returned. `zSign' is ignored if the
1993 result is a NaN. The subtraction is performed according to the IEC/IEEE
1994 Standard for Binary Floating-point Arithmetic.
1995 -------------------------------------------------------------------------------
1997 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1999 int16 aExp, bExp, zExp;
2000 bits64 aSig, bSig, zSig;
2003 aSig = extractFloat64Frac( a );
2004 aExp = extractFloat64Exp( a );
2005 bSig = extractFloat64Frac( b );
2006 bExp = extractFloat64Exp( b );
2007 expDiff = aExp - bExp;
2010 if ( 0 < expDiff ) goto aExpBigger;
2011 if ( expDiff < 0 ) goto bExpBigger;
2012 if ( aExp == 0x7FF ) {
2013 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2014 float_raise( float_flag_invalid );
2015 return float64_default_nan;
2021 if ( bSig < aSig ) goto aBigger;
2022 if ( aSig < bSig ) goto bBigger;
2023 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2025 if ( bExp == 0x7FF ) {
2026 if ( bSig ) return propagateFloat64NaN( a, b );
2027 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2033 aSig |= LIT64( 0x4000000000000000 );
2035 shift64RightJamming( aSig, - expDiff, &aSig );
2036 bSig |= LIT64( 0x4000000000000000 );
2041 goto normalizeRoundAndPack;
2043 if ( aExp == 0x7FF ) {
2044 if ( aSig ) return propagateFloat64NaN( a, b );
2051 bSig |= LIT64( 0x4000000000000000 );
2053 shift64RightJamming( bSig, expDiff, &bSig );
2054 aSig |= LIT64( 0x4000000000000000 );
2058 normalizeRoundAndPack:
2060 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2065 -------------------------------------------------------------------------------
2066 Returns the result of adding the double-precision floating-point values `a'
2067 and `b'. The operation is performed according to the IEC/IEEE Standard for
2068 Binary Floating-point Arithmetic.
2069 -------------------------------------------------------------------------------
2071 float64 float64_add( float64 a, float64 b )
2075 aSign = extractFloat64Sign( a );
2076 bSign = extractFloat64Sign( b );
2077 if ( aSign == bSign ) {
2078 return addFloat64Sigs( a, b, aSign );
2081 return subFloat64Sigs( a, b, aSign );
2087 -------------------------------------------------------------------------------
2088 Returns the result of subtracting the double-precision floating-point values
2089 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2090 for Binary Floating-point Arithmetic.
2091 -------------------------------------------------------------------------------
2093 float64 float64_sub( float64 a, float64 b )
2097 aSign = extractFloat64Sign( a );
2098 bSign = extractFloat64Sign( b );
2099 if ( aSign == bSign ) {
2100 return subFloat64Sigs( a, b, aSign );
2103 return addFloat64Sigs( a, b, aSign );
2109 -------------------------------------------------------------------------------
2110 Returns the result of multiplying the double-precision floating-point values
2111 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2112 for Binary Floating-point Arithmetic.
2113 -------------------------------------------------------------------------------
2115 float64 float64_mul( float64 a, float64 b )
2117 flag aSign, bSign, zSign;
2118 int16 aExp, bExp, zExp;
2119 bits64 aSig, bSig, zSig0, zSig1;
2121 aSig = extractFloat64Frac( a );
2122 aExp = extractFloat64Exp( a );
2123 aSign = extractFloat64Sign( a );
2124 bSig = extractFloat64Frac( b );
2125 bExp = extractFloat64Exp( b );
2126 bSign = extractFloat64Sign( b );
2127 zSign = aSign ^ bSign;
2128 if ( aExp == 0x7FF ) {
2129 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2130 return propagateFloat64NaN( a, b );
2132 if ( ( bExp | bSig ) == 0 ) {
2133 float_raise( float_flag_invalid );
2134 return float64_default_nan;
2136 return packFloat64( zSign, 0x7FF, 0 );
2138 if ( bExp == 0x7FF ) {
2139 if ( bSig ) return propagateFloat64NaN( a, b );
2140 if ( ( aExp | aSig ) == 0 ) {
2141 float_raise( float_flag_invalid );
2142 return float64_default_nan;
2144 return packFloat64( zSign, 0x7FF, 0 );
2147 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2148 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2151 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2152 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2154 zExp = aExp + bExp - 0x3FF;
2155 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2156 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2157 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2158 zSig0 |= ( zSig1 != 0 );
2159 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2163 return roundAndPackFloat64( zSign, zExp, zSig0 );
2168 -------------------------------------------------------------------------------
2169 Returns the result of dividing the double-precision floating-point value `a'
2170 by the corresponding value `b'. The operation is performed according to
2171 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2172 -------------------------------------------------------------------------------
2174 float64 float64_div( float64 a, float64 b )
2176 flag aSign, bSign, zSign;
2177 int16 aExp, bExp, zExp;
2178 bits64 aSig, bSig, zSig;
2180 bits64 term0, term1;
2182 aSig = extractFloat64Frac( a );
2183 aExp = extractFloat64Exp( a );
2184 aSign = extractFloat64Sign( a );
2185 bSig = extractFloat64Frac( b );
2186 bExp = extractFloat64Exp( b );
2187 bSign = extractFloat64Sign( b );
2188 zSign = aSign ^ bSign;
2189 if ( aExp == 0x7FF ) {
2190 if ( aSig ) return propagateFloat64NaN( a, b );
2191 if ( bExp == 0x7FF ) {
2192 if ( bSig ) return propagateFloat64NaN( a, b );
2193 float_raise( float_flag_invalid );
2194 return float64_default_nan;
2196 return packFloat64( zSign, 0x7FF, 0 );
2198 if ( bExp == 0x7FF ) {
2199 if ( bSig ) return propagateFloat64NaN( a, b );
2200 return packFloat64( zSign, 0, 0 );
2204 if ( ( aExp | aSig ) == 0 ) {
2205 float_raise( float_flag_invalid );
2206 return float64_default_nan;
2208 float_raise( float_flag_divbyzero );
2209 return packFloat64( zSign, 0x7FF, 0 );
2211 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2214 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2215 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2217 zExp = aExp - bExp + 0x3FD;
2218 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2219 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2220 if ( bSig <= ( aSig + aSig ) ) {
2224 zSig = estimateDiv128To64( aSig, 0, bSig );
2225 if ( ( zSig & 0x1FF ) <= 2 ) {
2226 mul64To128( bSig, zSig, &term0, &term1 );
2227 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2228 while ( (sbits64) rem0 < 0 ) {
2230 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2232 zSig |= ( rem1 != 0 );
2234 return roundAndPackFloat64( zSign, zExp, zSig );
2239 -------------------------------------------------------------------------------
2240 Returns the remainder of the double-precision floating-point value `a'
2241 with respect to the corresponding value `b'. The operation is performed
2242 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2243 -------------------------------------------------------------------------------
2245 float64 float64_rem( float64 a, float64 b )
2247 flag aSign, bSign, zSign;
2248 int16 aExp, bExp, expDiff;
2250 bits64 q, alternateASig;
2253 aSig = extractFloat64Frac( a );
2254 aExp = extractFloat64Exp( a );
2255 aSign = extractFloat64Sign( a );
2256 bSig = extractFloat64Frac( b );
2257 bExp = extractFloat64Exp( b );
2258 bSign = extractFloat64Sign( b );
2259 if ( aExp == 0x7FF ) {
2260 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2261 return propagateFloat64NaN( a, b );
2263 float_raise( float_flag_invalid );
2264 return float64_default_nan;
2266 if ( bExp == 0x7FF ) {
2267 if ( bSig ) return propagateFloat64NaN( a, b );
2272 float_raise( float_flag_invalid );
2273 return float64_default_nan;
2275 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2278 if ( aSig == 0 ) return a;
2279 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2281 expDiff = aExp - bExp;
2282 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2283 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2284 if ( expDiff < 0 ) {
2285 if ( expDiff < -1 ) return a;
2288 q = ( bSig <= aSig );
2289 if ( q ) aSig -= bSig;
2291 while ( 0 < expDiff ) {
2292 q = estimateDiv128To64( aSig, 0, bSig );
2293 q = ( 2 < q ) ? q - 2 : 0;
2294 aSig = - ( ( bSig>>2 ) * q );
2298 if ( 0 < expDiff ) {
2299 q = estimateDiv128To64( aSig, 0, bSig );
2300 q = ( 2 < q ) ? q - 2 : 0;
2303 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2310 alternateASig = aSig;
2313 } while ( 0 <= (sbits64) aSig );
2314 sigMean = aSig + alternateASig;
2315 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2316 aSig = alternateASig;
2318 zSign = ( (sbits64) aSig < 0 );
2319 if ( zSign ) aSig = - aSig;
2320 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2325 -------------------------------------------------------------------------------
2326 Returns the square root of the double-precision floating-point value `a'.
2327 The operation is performed according to the IEC/IEEE Standard for Binary
2328 Floating-point Arithmetic.
2329 -------------------------------------------------------------------------------
2331 float64 float64_sqrt( float64 a )
2336 bits64 rem0, rem1, term0, term1; //, shiftedRem;
2339 aSig = extractFloat64Frac( a );
2340 aExp = extractFloat64Exp( a );
2341 aSign = extractFloat64Sign( a );
2342 if ( aExp == 0x7FF ) {
2343 if ( aSig ) return propagateFloat64NaN( a, a );
2344 if ( ! aSign ) return a;
2345 float_raise( float_flag_invalid );
2346 return float64_default_nan;
2349 if ( ( aExp | aSig ) == 0 ) return a;
2350 float_raise( float_flag_invalid );
2351 return float64_default_nan;
2354 if ( aSig == 0 ) return 0;
2355 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2357 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2358 aSig |= LIT64( 0x0010000000000000 );
2359 zSig = estimateSqrt32( aExp, aSig>>21 );
2361 aSig <<= 9 - ( aExp & 1 );
2362 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2363 if ( ( zSig & 0x3FF ) <= 5 ) {
2365 zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2369 mul64To128( zSig, zSig, &term0, &term1 );
2370 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2371 while ( (sbits64) rem0 < 0 ) {
2373 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2375 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2377 zSig |= ( ( rem0 | rem1 ) != 0 );
2380 shift64RightJamming( zSig, 1, &zSig );
2381 return roundAndPackFloat64( 0, zExp, zSig );
2386 -------------------------------------------------------------------------------
2387 Returns 1 if the double-precision floating-point value `a' is equal to the
2388 corresponding value `b', and 0 otherwise. The comparison is performed
2389 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2390 -------------------------------------------------------------------------------
2392 flag float64_eq( float64 a, float64 b )
2395 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2396 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2398 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2399 float_raise( float_flag_invalid );
2403 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2408 -------------------------------------------------------------------------------
2409 Returns 1 if the double-precision floating-point value `a' is less than or
2410 equal to the corresponding value `b', and 0 otherwise. The comparison is
2411 performed according to the IEC/IEEE Standard for Binary Floating-point
2413 -------------------------------------------------------------------------------
2415 flag float64_le( float64 a, float64 b )
2419 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2420 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2422 float_raise( float_flag_invalid );
2425 aSign = extractFloat64Sign( a );
2426 bSign = extractFloat64Sign( b );
2427 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2428 return ( a == b ) || ( aSign ^ ( a < b ) );
2433 -------------------------------------------------------------------------------
2434 Returns 1 if the double-precision floating-point value `a' is less than
2435 the corresponding value `b', and 0 otherwise. The comparison is performed
2436 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2437 -------------------------------------------------------------------------------
2439 flag float64_lt( float64 a, float64 b )
2443 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2444 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2446 float_raise( float_flag_invalid );
2449 aSign = extractFloat64Sign( a );
2450 bSign = extractFloat64Sign( b );
2451 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2452 return ( a != b ) && ( aSign ^ ( a < b ) );
2457 -------------------------------------------------------------------------------
2458 Returns 1 if the double-precision floating-point value `a' is equal to the
2459 corresponding value `b', and 0 otherwise. The invalid exception is raised
2460 if either operand is a NaN. Otherwise, the comparison is performed
2461 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2462 -------------------------------------------------------------------------------
2464 flag float64_eq_signaling( float64 a, float64 b )
2467 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2468 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2470 float_raise( float_flag_invalid );
2473 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2478 -------------------------------------------------------------------------------
2479 Returns 1 if the double-precision floating-point value `a' is less than or
2480 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2481 cause an exception. Otherwise, the comparison is performed according to the
2482 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2483 -------------------------------------------------------------------------------
2485 flag float64_le_quiet( float64 a, float64 b )
2490 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2491 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2493 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2494 float_raise( float_flag_invalid );
2498 aSign = extractFloat64Sign( a );
2499 bSign = extractFloat64Sign( b );
2500 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2501 return ( a == b ) || ( aSign ^ ( a < b ) );
2506 -------------------------------------------------------------------------------
2507 Returns 1 if the double-precision floating-point value `a' is less than
2508 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2509 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2510 Standard for Binary Floating-point Arithmetic.
2511 -------------------------------------------------------------------------------
2513 flag float64_lt_quiet( float64 a, float64 b )
2517 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2518 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2520 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2521 float_raise( float_flag_invalid );
2525 aSign = extractFloat64Sign( a );
2526 bSign = extractFloat64Sign( b );
2527 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2528 return ( a != b ) && ( aSign ^ ( a < b ) );
2535 -------------------------------------------------------------------------------
2536 Returns the result of converting the extended double-precision floating-
2537 point value `a' to the 32-bit two's complement integer format. The
2538 conversion is performed according to the IEC/IEEE Standard for Binary
2539 Floating-point Arithmetic---which means in particular that the conversion
2540 is rounded according to the current rounding mode. If `a' is a NaN, the
2541 largest positive integer is returned. Otherwise, if the conversion
2542 overflows, the largest integer with the same sign as `a' is returned.
2543 -------------------------------------------------------------------------------
2545 int32 floatx80_to_int32( floatx80 a )
2548 int32 aExp, shiftCount;
2551 aSig = extractFloatx80Frac( a );
2552 aExp = extractFloatx80Exp( a );
2553 aSign = extractFloatx80Sign( a );
2554 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2555 shiftCount = 0x4037 - aExp;
2556 if ( shiftCount <= 0 ) shiftCount = 1;
2557 shift64RightJamming( aSig, shiftCount, &aSig );
2558 return roundAndPackInt32( aSign, aSig );
2563 -------------------------------------------------------------------------------
2564 Returns the result of converting the extended double-precision floating-
2565 point value `a' to the 32-bit two's complement integer format. The
2566 conversion is performed according to the IEC/IEEE Standard for Binary
2567 Floating-point Arithmetic, except that the conversion is always rounded
2568 toward zero. If `a' is a NaN, the largest positive integer is returned.
2569 Otherwise, if the conversion overflows, the largest integer with the same
2570 sign as `a' is returned.
2571 -------------------------------------------------------------------------------
2573 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2576 int32 aExp, shiftCount;
2577 bits64 aSig, savedASig;
2580 aSig = extractFloatx80Frac( a );
2581 aExp = extractFloatx80Exp( a );
2582 aSign = extractFloatx80Sign( a );
2583 shiftCount = 0x403E - aExp;
2584 if ( shiftCount < 32 ) {
2585 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2588 else if ( 63 < shiftCount ) {
2589 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2593 aSig >>= shiftCount;
2595 if ( aSign ) z = - z;
2596 if ( ( z < 0 ) ^ aSign ) {
2598 float_exception_flags |= float_flag_invalid;
2599 return aSign ? 0x80000000 : 0x7FFFFFFF;
2601 if ( ( aSig<<shiftCount ) != savedASig ) {
2602 float_exception_flags |= float_flag_inexact;
2609 -------------------------------------------------------------------------------
2610 Returns the result of converting the extended double-precision floating-
2611 point value `a' to the single-precision floating-point format. The
2612 conversion is performed according to the IEC/IEEE Standard for Binary
2613 Floating-point Arithmetic.
2614 -------------------------------------------------------------------------------
2616 float32 floatx80_to_float32( floatx80 a )
2622 aSig = extractFloatx80Frac( a );
2623 aExp = extractFloatx80Exp( a );
2624 aSign = extractFloatx80Sign( a );
2625 if ( aExp == 0x7FFF ) {
2626 if ( (bits64) ( aSig<<1 ) ) {
2627 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2629 return packFloat32( aSign, 0xFF, 0 );
2631 shift64RightJamming( aSig, 33, &aSig );
2632 if ( aExp || aSig ) aExp -= 0x3F81;
2633 return roundAndPackFloat32( aSign, aExp, aSig );
2638 -------------------------------------------------------------------------------
2639 Returns the result of converting the extended double-precision floating-
2640 point value `a' to the double-precision floating-point format. The
2641 conversion is performed according to the IEC/IEEE Standard for Binary
2642 Floating-point Arithmetic.
2643 -------------------------------------------------------------------------------
2645 float64 floatx80_to_float64( floatx80 a )
2651 aSig = extractFloatx80Frac( a );
2652 aExp = extractFloatx80Exp( a );
2653 aSign = extractFloatx80Sign( a );
2654 if ( aExp == 0x7FFF ) {
2655 if ( (bits64) ( aSig<<1 ) ) {
2656 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2658 return packFloat64( aSign, 0x7FF, 0 );
2660 shift64RightJamming( aSig, 1, &zSig );
2661 if ( aExp || aSig ) aExp -= 0x3C01;
2662 return roundAndPackFloat64( aSign, aExp, zSig );
2667 -------------------------------------------------------------------------------
2668 Rounds the extended double-precision floating-point value `a' to an integer,
2669 and returns the result as an extended quadruple-precision floating-point
2670 value. The operation is performed according to the IEC/IEEE Standard for
2671 Binary Floating-point Arithmetic.
2672 -------------------------------------------------------------------------------
2674 floatx80 floatx80_round_to_int( floatx80 a )
2678 bits64 lastBitMask, roundBitsMask;
2682 aExp = extractFloatx80Exp( a );
2683 if ( 0x403E <= aExp ) {
2684 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2685 return propagateFloatx80NaN( a, a );
2689 if ( aExp <= 0x3FFE ) {
2691 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2694 float_exception_flags |= float_flag_inexact;
2695 aSign = extractFloatx80Sign( a );
2696 switch ( float_rounding_mode ) {
2697 case float_round_nearest_even:
2698 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2701 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2704 case float_round_down:
2707 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2708 : packFloatx80( 0, 0, 0 );
2709 case float_round_up:
2711 aSign ? packFloatx80( 1, 0, 0 )
2712 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2714 return packFloatx80( aSign, 0, 0 );
2717 lastBitMask <<= 0x403E - aExp;
2718 roundBitsMask = lastBitMask - 1;
2720 roundingMode = float_rounding_mode;
2721 if ( roundingMode == float_round_nearest_even ) {
2722 z.low += lastBitMask>>1;
2723 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2725 else if ( roundingMode != float_round_to_zero ) {
2726 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2727 z.low += roundBitsMask;
2730 z.low &= ~ roundBitsMask;
2733 z.low = LIT64( 0x8000000000000000 );
2735 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
2741 -------------------------------------------------------------------------------
2742 Returns the result of adding the absolute values of the extended double-
2743 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2744 negated before being returned. `zSign' is ignored if the result is a NaN.
2745 The addition is performed according to the IEC/IEEE Standard for Binary
2746 Floating-point Arithmetic.
2747 -------------------------------------------------------------------------------
2749 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2751 int32 aExp, bExp, zExp;
2752 bits64 aSig, bSig, zSig0, zSig1;
2755 aSig = extractFloatx80Frac( a );
2756 aExp = extractFloatx80Exp( a );
2757 bSig = extractFloatx80Frac( b );
2758 bExp = extractFloatx80Exp( b );
2759 expDiff = aExp - bExp;
2760 if ( 0 < expDiff ) {
2761 if ( aExp == 0x7FFF ) {
2762 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2765 if ( bExp == 0 ) --expDiff;
2766 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2769 else if ( expDiff < 0 ) {
2770 if ( bExp == 0x7FFF ) {
2771 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2772 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2774 if ( aExp == 0 ) ++expDiff;
2775 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2779 if ( aExp == 0x7FFF ) {
2780 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2781 return propagateFloatx80NaN( a, b );
2786 zSig0 = aSig + bSig;
2788 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2795 zSig0 = aSig + bSig;
2797 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2799 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2800 zSig0 |= LIT64( 0x8000000000000000 );
2804 roundAndPackFloatx80(
2805 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2810 -------------------------------------------------------------------------------
2811 Returns the result of subtracting the absolute values of the extended
2812 double-precision floating-point values `a' and `b'. If `zSign' is true,
2813 the difference is negated before being returned. `zSign' is ignored if the
2814 result is a NaN. The subtraction is performed according to the IEC/IEEE
2815 Standard for Binary Floating-point Arithmetic.
2816 -------------------------------------------------------------------------------
2818 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2820 int32 aExp, bExp, zExp;
2821 bits64 aSig, bSig, zSig0, zSig1;
2825 aSig = extractFloatx80Frac( a );
2826 aExp = extractFloatx80Exp( a );
2827 bSig = extractFloatx80Frac( b );
2828 bExp = extractFloatx80Exp( b );
2829 expDiff = aExp - bExp;
2830 if ( 0 < expDiff ) goto aExpBigger;
2831 if ( expDiff < 0 ) goto bExpBigger;
2832 if ( aExp == 0x7FFF ) {
2833 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2834 return propagateFloatx80NaN( a, b );
2836 float_raise( float_flag_invalid );
2837 z.low = floatx80_default_nan_low;
2838 z.high = floatx80_default_nan_high;
2846 if ( bSig < aSig ) goto aBigger;
2847 if ( aSig < bSig ) goto bBigger;
2848 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
2850 if ( bExp == 0x7FFF ) {
2851 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2852 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2854 if ( aExp == 0 ) ++expDiff;
2855 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2857 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2860 goto normalizeRoundAndPack;
2862 if ( aExp == 0x7FFF ) {
2863 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2866 if ( bExp == 0 ) --expDiff;
2867 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2869 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2871 normalizeRoundAndPack:
2873 normalizeRoundAndPackFloatx80(
2874 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2879 -------------------------------------------------------------------------------
2880 Returns the result of adding the extended double-precision floating-point
2881 values `a' and `b'. The operation is performed according to the IEC/IEEE
2882 Standard for Binary Floating-point Arithmetic.
2883 -------------------------------------------------------------------------------
2885 floatx80 floatx80_add( floatx80 a, floatx80 b )
2889 aSign = extractFloatx80Sign( a );
2890 bSign = extractFloatx80Sign( b );
2891 if ( aSign == bSign ) {
2892 return addFloatx80Sigs( a, b, aSign );
2895 return subFloatx80Sigs( a, b, aSign );
2901 -------------------------------------------------------------------------------
2902 Returns the result of subtracting the extended double-precision floating-
2903 point values `a' and `b'. The operation is performed according to the
2904 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2905 -------------------------------------------------------------------------------
2907 floatx80 floatx80_sub( floatx80 a, floatx80 b )
2911 aSign = extractFloatx80Sign( a );
2912 bSign = extractFloatx80Sign( b );
2913 if ( aSign == bSign ) {
2914 return subFloatx80Sigs( a, b, aSign );
2917 return addFloatx80Sigs( a, b, aSign );
2923 -------------------------------------------------------------------------------
2924 Returns the result of multiplying the extended double-precision floating-
2925 point values `a' and `b'. The operation is performed according to the
2926 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2927 -------------------------------------------------------------------------------
2929 floatx80 floatx80_mul( floatx80 a, floatx80 b )
2931 flag aSign, bSign, zSign;
2932 int32 aExp, bExp, zExp;
2933 bits64 aSig, bSig, zSig0, zSig1;
2936 aSig = extractFloatx80Frac( a );
2937 aExp = extractFloatx80Exp( a );
2938 aSign = extractFloatx80Sign( a );
2939 bSig = extractFloatx80Frac( b );
2940 bExp = extractFloatx80Exp( b );
2941 bSign = extractFloatx80Sign( b );
2942 zSign = aSign ^ bSign;
2943 if ( aExp == 0x7FFF ) {
2944 if ( (bits64) ( aSig<<1 )
2945 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2946 return propagateFloatx80NaN( a, b );
2948 if ( ( bExp | bSig ) == 0 ) goto invalid;
2949 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2951 if ( bExp == 0x7FFF ) {
2952 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2953 if ( ( aExp | aSig ) == 0 ) {
2955 float_raise( float_flag_invalid );
2956 z.low = floatx80_default_nan_low;
2957 z.high = floatx80_default_nan_high;
2960 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2963 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2964 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2967 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2968 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2970 zExp = aExp + bExp - 0x3FFE;
2971 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2972 if ( 0 < (sbits64) zSig0 ) {
2973 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2977 roundAndPackFloatx80(
2978 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2983 -------------------------------------------------------------------------------
2984 Returns the result of dividing the extended double-precision floating-point
2985 value `a' by the corresponding value `b'. The operation is performed
2986 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2987 -------------------------------------------------------------------------------
2989 floatx80 floatx80_div( floatx80 a, floatx80 b )
2991 flag aSign, bSign, zSign;
2992 int32 aExp, bExp, zExp;
2993 bits64 aSig, bSig, zSig0, zSig1;
2994 bits64 rem0, rem1, rem2, term0, term1, term2;
2997 aSig = extractFloatx80Frac( a );
2998 aExp = extractFloatx80Exp( a );
2999 aSign = extractFloatx80Sign( a );
3000 bSig = extractFloatx80Frac( b );
3001 bExp = extractFloatx80Exp( b );
3002 bSign = extractFloatx80Sign( b );
3003 zSign = aSign ^ bSign;
3004 if ( aExp == 0x7FFF ) {
3005 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3006 if ( bExp == 0x7FFF ) {
3007 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3010 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3012 if ( bExp == 0x7FFF ) {
3013 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3014 return packFloatx80( zSign, 0, 0 );
3018 if ( ( aExp | aSig ) == 0 ) {
3020 float_raise( float_flag_invalid );
3021 z.low = floatx80_default_nan_low;
3022 z.high = floatx80_default_nan_high;
3025 float_raise( float_flag_divbyzero );
3026 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3028 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3031 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3032 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3034 zExp = aExp - bExp + 0x3FFE;
3036 if ( bSig <= aSig ) {
3037 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3040 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3041 mul64To128( bSig, zSig0, &term0, &term1 );
3042 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3043 while ( (sbits64) rem0 < 0 ) {
3045 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3047 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3048 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3049 mul64To128( bSig, zSig1, &term1, &term2 );
3050 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3051 while ( (sbits64) rem1 < 0 ) {
3053 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3055 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3058 roundAndPackFloatx80(
3059 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3064 -------------------------------------------------------------------------------
3065 Returns the remainder of the extended double-precision floating-point value
3066 `a' with respect to the corresponding value `b'. The operation is performed
3067 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3068 -------------------------------------------------------------------------------
3070 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3072 flag aSign, bSign, zSign;
3073 int32 aExp, bExp, expDiff;
3074 bits64 aSig0, aSig1, bSig;
3075 bits64 q, term0, term1, alternateASig0, alternateASig1;
3078 aSig0 = extractFloatx80Frac( a );
3079 aExp = extractFloatx80Exp( a );
3080 aSign = extractFloatx80Sign( a );
3081 bSig = extractFloatx80Frac( b );
3082 bExp = extractFloatx80Exp( b );
3083 bSign = extractFloatx80Sign( b );
3084 if ( aExp == 0x7FFF ) {
3085 if ( (bits64) ( aSig0<<1 )
3086 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3087 return propagateFloatx80NaN( a, b );
3091 if ( bExp == 0x7FFF ) {
3092 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3098 float_raise( float_flag_invalid );
3099 z.low = floatx80_default_nan_low;
3100 z.high = floatx80_default_nan_high;
3103 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3106 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3107 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3109 bSig |= LIT64( 0x8000000000000000 );
3111 expDiff = aExp - bExp;
3113 if ( expDiff < 0 ) {
3114 if ( expDiff < -1 ) return a;
3115 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3118 q = ( bSig <= aSig0 );
3119 if ( q ) aSig0 -= bSig;
3121 while ( 0 < expDiff ) {
3122 q = estimateDiv128To64( aSig0, aSig1, bSig );
3123 q = ( 2 < q ) ? q - 2 : 0;
3124 mul64To128( bSig, q, &term0, &term1 );
3125 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3126 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3130 if ( 0 < expDiff ) {
3131 q = estimateDiv128To64( aSig0, aSig1, bSig );
3132 q = ( 2 < q ) ? q - 2 : 0;
3134 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3135 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3136 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3137 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3139 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3146 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3147 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3148 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3151 aSig0 = alternateASig0;
3152 aSig1 = alternateASig1;
3156 normalizeRoundAndPackFloatx80(
3157 80, zSign, bExp + expDiff, aSig0, aSig1 );
3162 -------------------------------------------------------------------------------
3163 Returns the square root of the extended double-precision floating-point
3164 value `a'. The operation is performed according to the IEC/IEEE Standard
3165 for Binary Floating-point Arithmetic.
3166 -------------------------------------------------------------------------------
3168 floatx80 floatx80_sqrt( floatx80 a )
3172 bits64 aSig0, aSig1, zSig0, zSig1;
3173 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3174 bits64 shiftedRem0, shiftedRem1;
3177 aSig0 = extractFloatx80Frac( a );
3178 aExp = extractFloatx80Exp( a );
3179 aSign = extractFloatx80Sign( a );
3180 if ( aExp == 0x7FFF ) {
3181 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3182 if ( ! aSign ) return a;
3186 if ( ( aExp | aSig0 ) == 0 ) return a;
3188 float_raise( float_flag_invalid );
3189 z.low = floatx80_default_nan_low;
3190 z.high = floatx80_default_nan_high;
3194 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3195 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3197 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3198 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3201 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3202 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3203 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3204 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3205 mul64To128( zSig0, zSig0, &term0, &term1 );
3206 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3207 while ( (sbits64) rem0 < 0 ) {
3209 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3211 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3213 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3214 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3215 if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3216 if ( zSig1 == 0 ) zSig1 = 1;
3217 mul64To128( zSig0, zSig1, &term1, &term2 );
3218 shortShift128Left( term1, term2, 1, &term1, &term2 );
3219 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3220 mul64To128( zSig1, zSig1, &term2, &term3 );
3221 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3222 while ( (sbits64) rem1 < 0 ) {
3224 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3227 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3229 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3232 roundAndPackFloatx80(
3233 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3238 -------------------------------------------------------------------------------
3239 Returns 1 if the extended double-precision floating-point value `a' is
3240 equal to the corresponding value `b', and 0 otherwise. The comparison is
3241 performed according to the IEC/IEEE Standard for Binary Floating-point
3243 -------------------------------------------------------------------------------
3245 flag floatx80_eq( floatx80 a, floatx80 b )
3248 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3249 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3250 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3251 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3253 if ( floatx80_is_signaling_nan( a )
3254 || floatx80_is_signaling_nan( b ) ) {
3255 float_raise( float_flag_invalid );
3261 && ( ( a.high == b.high )
3263 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3269 -------------------------------------------------------------------------------
3270 Returns 1 if the extended double-precision floating-point value `a' is
3271 less than or equal to the corresponding value `b', and 0 otherwise. The
3272 comparison is performed according to the IEC/IEEE Standard for Binary
3273 Floating-point Arithmetic.
3274 -------------------------------------------------------------------------------
3276 flag floatx80_le( floatx80 a, floatx80 b )
3280 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3281 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3282 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3283 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3285 float_raise( float_flag_invalid );
3288 aSign = extractFloatx80Sign( a );
3289 bSign = extractFloatx80Sign( b );
3290 if ( aSign != bSign ) {
3293 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3297 aSign ? le128( b.high, b.low, a.high, a.low )
3298 : le128( a.high, a.low, b.high, b.low );
3303 -------------------------------------------------------------------------------
3304 Returns 1 if the extended double-precision floating-point value `a' is
3305 less than the corresponding value `b', and 0 otherwise. The comparison
3306 is performed according to the IEC/IEEE Standard for Binary Floating-point
3308 -------------------------------------------------------------------------------
3310 flag floatx80_lt( floatx80 a, floatx80 b )
3314 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3315 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3316 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3317 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3319 float_raise( float_flag_invalid );
3322 aSign = extractFloatx80Sign( a );
3323 bSign = extractFloatx80Sign( b );
3324 if ( aSign != bSign ) {
3327 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3331 aSign ? lt128( b.high, b.low, a.high, a.low )
3332 : lt128( a.high, a.low, b.high, b.low );
3337 -------------------------------------------------------------------------------
3338 Returns 1 if the extended double-precision floating-point value `a' is equal
3339 to the corresponding value `b', and 0 otherwise. The invalid exception is
3340 raised if either operand is a NaN. Otherwise, the comparison is performed
3341 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3342 -------------------------------------------------------------------------------
3344 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3347 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3348 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3349 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3350 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3352 float_raise( float_flag_invalid );
3357 && ( ( a.high == b.high )
3359 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3365 -------------------------------------------------------------------------------
3366 Returns 1 if the extended double-precision floating-point value `a' is less
3367 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3368 do not cause an exception. Otherwise, the comparison is performed according
3369 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3370 -------------------------------------------------------------------------------
3372 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3376 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3377 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3378 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3379 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3381 if ( floatx80_is_signaling_nan( a )
3382 || floatx80_is_signaling_nan( b ) ) {
3383 float_raise( float_flag_invalid );
3387 aSign = extractFloatx80Sign( a );
3388 bSign = extractFloatx80Sign( b );
3389 if ( aSign != bSign ) {
3392 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3396 aSign ? le128( b.high, b.low, a.high, a.low )
3397 : le128( a.high, a.low, b.high, b.low );
3402 -------------------------------------------------------------------------------
3403 Returns 1 if the extended double-precision floating-point value `a' is less
3404 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3405 an exception. Otherwise, the comparison is performed according to the
3406 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3407 -------------------------------------------------------------------------------
3409 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3413 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3414 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3415 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3416 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3418 if ( floatx80_is_signaling_nan( a )
3419 || floatx80_is_signaling_nan( b ) ) {
3420 float_raise( float_flag_invalid );
3424 aSign = extractFloatx80Sign( a );
3425 bSign = extractFloatx80Sign( b );
3426 if ( aSign != bSign ) {
3429 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3433 aSign ? lt128( b.high, b.low, a.high, a.low )
3434 : lt128( a.high, a.low, b.high, b.low );