2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
31 #include <asm/div64.h>
35 //#include "softfloat.h"
38 -------------------------------------------------------------------------------
39 Primitive arithmetic functions, including multi-word arithmetic, and
40 division and square root approximations. (Can be specialized to target if
42 -------------------------------------------------------------------------------
44 #include "softfloat-macros"
47 -------------------------------------------------------------------------------
48 Functions and definitions to determine: (1) whether tininess for underflow
49 is detected before or after rounding by default, (2) what (if anything)
50 happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 are propagated from function inputs to output. These details are target-
54 -------------------------------------------------------------------------------
56 #include "softfloat-specialize"
59 -------------------------------------------------------------------------------
60 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
61 and 7, and returns the properly rounded 32-bit integer corresponding to the
62 input. If `zSign' is nonzero, the input is negated before being converted
63 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
64 input is simply rounded to an integer, with the inexact exception raised if
65 the input cannot be represented exactly as an integer. If the fixed-point
66 input is too large, however, the invalid exception is raised and the largest
67 positive or negative integer is returned.
68 -------------------------------------------------------------------------------
70 static int32 roundAndPackInt32( struct roundingData *roundData, flag zSign, bits64 absZ )
73 flag roundNearestEven;
74 int8 roundIncrement, roundBits;
77 roundingMode = roundData->mode;
78 roundNearestEven = ( roundingMode == float_round_nearest_even );
79 roundIncrement = 0x40;
80 if ( ! roundNearestEven ) {
81 if ( roundingMode == float_round_to_zero ) {
85 roundIncrement = 0x7F;
87 if ( roundingMode == float_round_up ) roundIncrement = 0;
90 if ( roundingMode == float_round_down ) roundIncrement = 0;
94 roundBits = absZ & 0x7F;
95 absZ = ( absZ + roundIncrement )>>7;
96 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
99 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
100 roundData->exception |= float_flag_invalid;
101 return zSign ? 0x80000000 : 0x7FFFFFFF;
103 if ( roundBits ) roundData->exception |= float_flag_inexact;
109 -------------------------------------------------------------------------------
110 Returns the fraction bits of the single-precision floating-point value `a'.
111 -------------------------------------------------------------------------------
113 INLINE bits32 extractFloat32Frac( float32 a )
116 return a & 0x007FFFFF;
121 -------------------------------------------------------------------------------
122 Returns the exponent bits of the single-precision floating-point value `a'.
123 -------------------------------------------------------------------------------
125 INLINE int16 extractFloat32Exp( float32 a )
128 return ( a>>23 ) & 0xFF;
133 -------------------------------------------------------------------------------
134 Returns the sign bit of the single-precision floating-point value `a'.
135 -------------------------------------------------------------------------------
137 #if 0 /* in softfloat.h */
138 INLINE flag extractFloat32Sign( float32 a )
147 -------------------------------------------------------------------------------
148 Normalizes the subnormal single-precision floating-point value represented
149 by the denormalized significand `aSig'. The normalized exponent and
150 significand are stored at the locations pointed to by `zExpPtr' and
151 `zSigPtr', respectively.
152 -------------------------------------------------------------------------------
155 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
159 shiftCount = countLeadingZeros32( aSig ) - 8;
160 *zSigPtr = aSig<<shiftCount;
161 *zExpPtr = 1 - shiftCount;
166 -------------------------------------------------------------------------------
167 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
168 single-precision floating-point value, returning the result. After being
169 shifted into the proper positions, the three fields are simply added
170 together to form the result. This means that any integer portion of `zSig'
171 will be added into the exponent. Since a properly normalized significand
172 will have an integer portion equal to 1, the `zExp' input should be 1 less
173 than the desired result exponent whenever `zSig' is a complete, normalized
175 -------------------------------------------------------------------------------
177 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
181 __asm__("@ packFloat32 \n\
182 mov %0, %1, asl #31 \n\
183 orr %0, %2, asl #23 \n\
186 : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
190 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
195 -------------------------------------------------------------------------------
196 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
197 and significand `zSig', and returns the proper single-precision floating-
198 point value corresponding to the abstract input. Ordinarily, the abstract
199 value is simply rounded and packed into the single-precision format, with
200 the inexact exception raised if the abstract input cannot be represented
201 exactly. If the abstract value is too large, however, the overflow and
202 inexact exceptions are raised and an infinity or maximal finite value is
203 returned. If the abstract value is too small, the input value is rounded to
204 a subnormal number, and the underflow and inexact exceptions are raised if
205 the abstract input cannot be represented exactly as a subnormal single-
206 precision floating-point number.
207 The input significand `zSig' has its binary point between bits 30
208 and 29, which is 7 bits to the left of the usual location. This shifted
209 significand must be normalized or smaller. If `zSig' is not normalized,
210 `zExp' must be 0; in that case, the result returned is a subnormal number,
211 and it must not require rounding. In the usual case that `zSig' is
212 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
213 The handling of underflow and overflow follows the IEC/IEEE Standard for
214 Binary Floating-point Arithmetic.
215 -------------------------------------------------------------------------------
217 static float32 roundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
220 flag roundNearestEven;
221 int8 roundIncrement, roundBits;
224 roundingMode = roundData->mode;
225 roundNearestEven = ( roundingMode == float_round_nearest_even );
226 roundIncrement = 0x40;
227 if ( ! roundNearestEven ) {
228 if ( roundingMode == float_round_to_zero ) {
232 roundIncrement = 0x7F;
234 if ( roundingMode == float_round_up ) roundIncrement = 0;
237 if ( roundingMode == float_round_down ) roundIncrement = 0;
241 roundBits = zSig & 0x7F;
242 if ( 0xFD <= (bits16) zExp ) {
244 || ( ( zExp == 0xFD )
245 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
247 roundData->exception |= float_flag_overflow | float_flag_inexact;
248 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
252 ( float_detect_tininess == float_tininess_before_rounding )
254 || ( zSig + roundIncrement < 0x80000000 );
255 shift32RightJamming( zSig, - zExp, &zSig );
257 roundBits = zSig & 0x7F;
258 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
261 if ( roundBits ) roundData->exception |= float_flag_inexact;
262 zSig = ( zSig + roundIncrement )>>7;
263 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
264 if ( zSig == 0 ) zExp = 0;
265 return packFloat32( zSign, zExp, zSig );
270 -------------------------------------------------------------------------------
271 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
272 and significand `zSig', and returns the proper single-precision floating-
273 point value corresponding to the abstract input. This routine is just like
274 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
275 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
277 -------------------------------------------------------------------------------
280 normalizeRoundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
284 shiftCount = countLeadingZeros32( zSig ) - 1;
285 return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
290 -------------------------------------------------------------------------------
291 Returns the fraction bits of the double-precision floating-point value `a'.
292 -------------------------------------------------------------------------------
294 INLINE bits64 extractFloat64Frac( float64 a )
297 return a & LIT64( 0x000FFFFFFFFFFFFF );
302 -------------------------------------------------------------------------------
303 Returns the exponent bits of the double-precision floating-point value `a'.
304 -------------------------------------------------------------------------------
306 INLINE int16 extractFloat64Exp( float64 a )
309 return ( a>>52 ) & 0x7FF;
314 -------------------------------------------------------------------------------
315 Returns the sign bit of the double-precision floating-point value `a'.
316 -------------------------------------------------------------------------------
318 #if 0 /* in softfloat.h */
319 INLINE flag extractFloat64Sign( float64 a )
328 -------------------------------------------------------------------------------
329 Normalizes the subnormal double-precision floating-point value represented
330 by the denormalized significand `aSig'. The normalized exponent and
331 significand are stored at the locations pointed to by `zExpPtr' and
332 `zSigPtr', respectively.
333 -------------------------------------------------------------------------------
336 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
340 shiftCount = countLeadingZeros64( aSig ) - 11;
341 *zSigPtr = aSig<<shiftCount;
342 *zExpPtr = 1 - shiftCount;
347 -------------------------------------------------------------------------------
348 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
349 double-precision floating-point value, returning the result. After being
350 shifted into the proper positions, the three fields are simply added
351 together to form the result. This means that any integer portion of `zSig'
352 will be added into the exponent. Since a properly normalized significand
353 will have an integer portion equal to 1, the `zExp' input should be 1 less
354 than the desired result exponent whenever `zSig' is a complete, normalized
356 -------------------------------------------------------------------------------
358 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
361 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
366 -------------------------------------------------------------------------------
367 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
368 and significand `zSig', and returns the proper double-precision floating-
369 point value corresponding to the abstract input. Ordinarily, the abstract
370 value is simply rounded and packed into the double-precision format, with
371 the inexact exception raised if the abstract input cannot be represented
372 exactly. If the abstract value is too large, however, the overflow and
373 inexact exceptions are raised and an infinity or maximal finite value is
374 returned. If the abstract value is too small, the input value is rounded to
375 a subnormal number, and the underflow and inexact exceptions are raised if
376 the abstract input cannot be represented exactly as a subnormal double-
377 precision floating-point number.
378 The input significand `zSig' has its binary point between bits 62
379 and 61, which is 10 bits to the left of the usual location. This shifted
380 significand must be normalized or smaller. If `zSig' is not normalized,
381 `zExp' must be 0; in that case, the result returned is a subnormal number,
382 and it must not require rounding. In the usual case that `zSig' is
383 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
384 The handling of underflow and overflow follows the IEC/IEEE Standard for
385 Binary Floating-point Arithmetic.
386 -------------------------------------------------------------------------------
388 static float64 roundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
391 flag roundNearestEven;
392 int16 roundIncrement, roundBits;
395 roundingMode = roundData->mode;
396 roundNearestEven = ( roundingMode == float_round_nearest_even );
397 roundIncrement = 0x200;
398 if ( ! roundNearestEven ) {
399 if ( roundingMode == float_round_to_zero ) {
403 roundIncrement = 0x3FF;
405 if ( roundingMode == float_round_up ) roundIncrement = 0;
408 if ( roundingMode == float_round_down ) roundIncrement = 0;
412 roundBits = zSig & 0x3FF;
413 if ( 0x7FD <= (bits16) zExp ) {
414 if ( ( 0x7FD < zExp )
415 || ( ( zExp == 0x7FD )
416 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
418 //register int lr = __builtin_return_address(0);
419 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
420 roundData->exception |= float_flag_overflow | float_flag_inexact;
421 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
425 ( float_detect_tininess == float_tininess_before_rounding )
427 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
428 shift64RightJamming( zSig, - zExp, &zSig );
430 roundBits = zSig & 0x3FF;
431 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
434 if ( roundBits ) roundData->exception |= float_flag_inexact;
435 zSig = ( zSig + roundIncrement )>>10;
436 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
437 if ( zSig == 0 ) zExp = 0;
438 return packFloat64( zSign, zExp, zSig );
443 -------------------------------------------------------------------------------
444 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
445 and significand `zSig', and returns the proper double-precision floating-
446 point value corresponding to the abstract input. This routine is just like
447 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
448 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
450 -------------------------------------------------------------------------------
453 normalizeRoundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
457 shiftCount = countLeadingZeros64( zSig ) - 1;
458 return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
465 -------------------------------------------------------------------------------
466 Returns the fraction bits of the extended double-precision floating-point
468 -------------------------------------------------------------------------------
470 INLINE bits64 extractFloatx80Frac( floatx80 a )
478 -------------------------------------------------------------------------------
479 Returns the exponent bits of the extended double-precision floating-point
481 -------------------------------------------------------------------------------
483 INLINE int32 extractFloatx80Exp( floatx80 a )
486 return a.high & 0x7FFF;
491 -------------------------------------------------------------------------------
492 Returns the sign bit of the extended double-precision floating-point value
494 -------------------------------------------------------------------------------
496 INLINE flag extractFloatx80Sign( floatx80 a )
504 -------------------------------------------------------------------------------
505 Normalizes the subnormal extended double-precision floating-point value
506 represented by the denormalized significand `aSig'. The normalized exponent
507 and significand are stored at the locations pointed to by `zExpPtr' and
508 `zSigPtr', respectively.
509 -------------------------------------------------------------------------------
512 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
516 shiftCount = countLeadingZeros64( aSig );
517 *zSigPtr = aSig<<shiftCount;
518 *zExpPtr = 1 - shiftCount;
523 -------------------------------------------------------------------------------
524 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
525 extended double-precision floating-point value, returning the result.
526 -------------------------------------------------------------------------------
528 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
533 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
540 -------------------------------------------------------------------------------
541 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
542 and extended significand formed by the concatenation of `zSig0' and `zSig1',
543 and returns the proper extended double-precision floating-point value
544 corresponding to the abstract input. Ordinarily, the abstract value is
545 rounded and packed into the extended double-precision format, with the
546 inexact exception raised if the abstract input cannot be represented
547 exactly. If the abstract value is too large, however, the overflow and
548 inexact exceptions are raised and an infinity or maximal finite value is
549 returned. If the abstract value is too small, the input value is rounded to
550 a subnormal number, and the underflow and inexact exceptions are raised if
551 the abstract input cannot be represented exactly as a subnormal extended
552 double-precision floating-point number.
553 If `roundingPrecision' is 32 or 64, the result is rounded to the same
554 number of bits as single or double precision, respectively. Otherwise, the
555 result is rounded to the full precision of the extended double-precision
557 The input significand must be normalized or smaller. If the input
558 significand is not normalized, `zExp' must be 0; in that case, the result
559 returned is a subnormal number, and it must not require rounding. The
560 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
561 Floating-point Arithmetic.
562 -------------------------------------------------------------------------------
565 roundAndPackFloatx80(
566 struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
569 int8 roundingMode, roundingPrecision;
570 flag roundNearestEven, increment, isTiny;
571 int64 roundIncrement, roundMask, roundBits;
573 roundingMode = roundData->mode;
574 roundingPrecision = roundData->precision;
575 roundNearestEven = ( roundingMode == float_round_nearest_even );
576 if ( roundingPrecision == 80 ) goto precision80;
577 if ( roundingPrecision == 64 ) {
578 roundIncrement = LIT64( 0x0000000000000400 );
579 roundMask = LIT64( 0x00000000000007FF );
581 else if ( roundingPrecision == 32 ) {
582 roundIncrement = LIT64( 0x0000008000000000 );
583 roundMask = LIT64( 0x000000FFFFFFFFFF );
588 zSig0 |= ( zSig1 != 0 );
589 if ( ! roundNearestEven ) {
590 if ( roundingMode == float_round_to_zero ) {
594 roundIncrement = roundMask;
596 if ( roundingMode == float_round_up ) roundIncrement = 0;
599 if ( roundingMode == float_round_down ) roundIncrement = 0;
603 roundBits = zSig0 & roundMask;
604 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
605 if ( ( 0x7FFE < zExp )
606 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
612 ( float_detect_tininess == float_tininess_before_rounding )
614 || ( zSig0 <= zSig0 + roundIncrement );
615 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
617 roundBits = zSig0 & roundMask;
618 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
619 if ( roundBits ) roundData->exception |= float_flag_inexact;
620 zSig0 += roundIncrement;
621 if ( (sbits64) zSig0 < 0 ) zExp = 1;
622 roundIncrement = roundMask + 1;
623 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
624 roundMask |= roundIncrement;
626 zSig0 &= ~ roundMask;
627 return packFloatx80( zSign, zExp, zSig0 );
630 if ( roundBits ) roundData->exception |= float_flag_inexact;
631 zSig0 += roundIncrement;
632 if ( zSig0 < roundIncrement ) {
634 zSig0 = LIT64( 0x8000000000000000 );
636 roundIncrement = roundMask + 1;
637 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
638 roundMask |= roundIncrement;
640 zSig0 &= ~ roundMask;
641 if ( zSig0 == 0 ) zExp = 0;
642 return packFloatx80( zSign, zExp, zSig0 );
644 increment = ( (sbits64) zSig1 < 0 );
645 if ( ! roundNearestEven ) {
646 if ( roundingMode == float_round_to_zero ) {
651 increment = ( roundingMode == float_round_down ) && zSig1;
654 increment = ( roundingMode == float_round_up ) && zSig1;
658 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
659 if ( ( 0x7FFE < zExp )
660 || ( ( zExp == 0x7FFE )
661 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
667 roundData->exception |= float_flag_overflow | float_flag_inexact;
668 if ( ( roundingMode == float_round_to_zero )
669 || ( zSign && ( roundingMode == float_round_up ) )
670 || ( ! zSign && ( roundingMode == float_round_down ) )
672 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
674 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
678 ( float_detect_tininess == float_tininess_before_rounding )
681 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
682 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
684 if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow;
685 if ( zSig1 ) roundData->exception |= float_flag_inexact;
686 if ( roundNearestEven ) {
687 increment = ( (sbits64) zSig1 < 0 );
691 increment = ( roundingMode == float_round_down ) && zSig1;
694 increment = ( roundingMode == float_round_up ) && zSig1;
699 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
700 if ( (sbits64) zSig0 < 0 ) zExp = 1;
702 return packFloatx80( zSign, zExp, zSig0 );
705 if ( zSig1 ) roundData->exception |= float_flag_inexact;
710 zSig0 = LIT64( 0x8000000000000000 );
713 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
717 if ( zSig0 == 0 ) zExp = 0;
720 return packFloatx80( zSign, zExp, zSig0 );
724 -------------------------------------------------------------------------------
725 Takes an abstract floating-point value having sign `zSign', exponent
726 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
727 and returns the proper extended double-precision floating-point value
728 corresponding to the abstract input. This routine is just like
729 `roundAndPackFloatx80' except that the input significand does not have to be
731 -------------------------------------------------------------------------------
734 normalizeRoundAndPackFloatx80(
735 struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
745 shiftCount = countLeadingZeros64( zSig0 );
746 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
749 roundAndPackFloatx80( roundData, zSign, zExp, zSig0, zSig1 );
756 -------------------------------------------------------------------------------
757 Returns the result of converting the 32-bit two's complement integer `a' to
758 the single-precision floating-point format. The conversion is performed
759 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
760 -------------------------------------------------------------------------------
762 float32 int32_to_float32(struct roundingData *roundData, int32 a)
766 if ( a == 0 ) return 0;
767 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
769 return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a );
774 -------------------------------------------------------------------------------
775 Returns the result of converting the 32-bit two's complement integer `a' to
776 the double-precision floating-point format. The conversion is performed
777 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
778 -------------------------------------------------------------------------------
780 float64 int32_to_float64( int32 a )
787 if ( a == 0 ) return 0;
789 absA = aSign ? - a : a;
790 shiftCount = countLeadingZeros32( absA ) + 21;
792 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
799 -------------------------------------------------------------------------------
800 Returns the result of converting the 32-bit two's complement integer `a'
801 to the extended double-precision floating-point format. The conversion
802 is performed according to the IEC/IEEE Standard for Binary Floating-point
804 -------------------------------------------------------------------------------
806 floatx80 int32_to_floatx80( int32 a )
813 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
815 absA = zSign ? - a : a;
816 shiftCount = countLeadingZeros32( absA ) + 32;
818 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
825 -------------------------------------------------------------------------------
826 Returns the result of converting the single-precision floating-point value
827 `a' to the 32-bit two's complement integer format. The conversion is
828 performed according to the IEC/IEEE Standard for Binary Floating-point
829 Arithmetic---which means in particular that the conversion is rounded
830 according to the current rounding mode. If `a' is a NaN, the largest
831 positive integer is returned. Otherwise, if the conversion overflows, the
832 largest integer with the same sign as `a' is returned.
833 -------------------------------------------------------------------------------
835 int32 float32_to_int32( struct roundingData *roundData, float32 a )
838 int16 aExp, shiftCount;
842 aSig = extractFloat32Frac( a );
843 aExp = extractFloat32Exp( a );
844 aSign = extractFloat32Sign( a );
845 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
846 if ( aExp ) aSig |= 0x00800000;
847 shiftCount = 0xAF - aExp;
850 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
851 return roundAndPackInt32( roundData, aSign, zSig );
856 -------------------------------------------------------------------------------
857 Returns the result of converting the single-precision floating-point value
858 `a' to the 32-bit two's complement integer format. The conversion is
859 performed according to the IEC/IEEE Standard for Binary Floating-point
860 Arithmetic, except that the conversion is always rounded toward zero. If
861 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
862 conversion overflows, the largest integer with the same sign as `a' is
864 -------------------------------------------------------------------------------
866 int32 float32_to_int32_round_to_zero( float32 a )
869 int16 aExp, shiftCount;
873 aSig = extractFloat32Frac( a );
874 aExp = extractFloat32Exp( a );
875 aSign = extractFloat32Sign( a );
876 shiftCount = aExp - 0x9E;
877 if ( 0 <= shiftCount ) {
878 if ( a == 0xCF000000 ) return 0x80000000;
879 float_raise( float_flag_invalid );
880 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
883 else if ( aExp <= 0x7E ) {
884 if ( aExp | aSig ) float_raise( float_flag_inexact );
887 aSig = ( aSig | 0x00800000 )<<8;
888 z = aSig>>( - shiftCount );
889 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
890 float_raise( float_flag_inexact );
892 return aSign ? - z : z;
897 -------------------------------------------------------------------------------
898 Returns the result of converting the single-precision floating-point value
899 `a' to the double-precision floating-point format. The conversion is
900 performed according to the IEC/IEEE Standard for Binary Floating-point
902 -------------------------------------------------------------------------------
904 float64 float32_to_float64( float32 a )
910 aSig = extractFloat32Frac( a );
911 aExp = extractFloat32Exp( a );
912 aSign = extractFloat32Sign( a );
913 if ( aExp == 0xFF ) {
914 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
915 return packFloat64( aSign, 0x7FF, 0 );
918 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
919 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
922 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
929 -------------------------------------------------------------------------------
930 Returns the result of converting the single-precision floating-point value
931 `a' to the extended double-precision floating-point format. The conversion
932 is performed according to the IEC/IEEE Standard for Binary Floating-point
934 -------------------------------------------------------------------------------
936 floatx80 float32_to_floatx80( float32 a )
942 aSig = extractFloat32Frac( a );
943 aExp = extractFloat32Exp( a );
944 aSign = extractFloat32Sign( a );
945 if ( aExp == 0xFF ) {
946 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
947 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
950 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
951 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
954 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
961 -------------------------------------------------------------------------------
962 Rounds the single-precision floating-point value `a' to an integer, and
963 returns the result as a single-precision floating-point value. The
964 operation is performed according to the IEC/IEEE Standard for Binary
965 Floating-point Arithmetic.
966 -------------------------------------------------------------------------------
968 float32 float32_round_to_int( struct roundingData *roundData, float32 a )
972 bits32 lastBitMask, roundBitsMask;
976 aExp = extractFloat32Exp( a );
977 if ( 0x96 <= aExp ) {
978 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
979 return propagateFloat32NaN( a, a );
983 roundingMode = roundData->mode;
984 if ( aExp <= 0x7E ) {
985 if ( (bits32) ( a<<1 ) == 0 ) return a;
986 roundData->exception |= float_flag_inexact;
987 aSign = extractFloat32Sign( a );
988 switch ( roundingMode ) {
989 case float_round_nearest_even:
990 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
991 return packFloat32( aSign, 0x7F, 0 );
994 case float_round_down:
995 return aSign ? 0xBF800000 : 0;
997 return aSign ? 0x80000000 : 0x3F800000;
999 return packFloat32( aSign, 0, 0 );
1002 lastBitMask <<= 0x96 - aExp;
1003 roundBitsMask = lastBitMask - 1;
1005 if ( roundingMode == float_round_nearest_even ) {
1006 z += lastBitMask>>1;
1007 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1009 else if ( roundingMode != float_round_to_zero ) {
1010 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1014 z &= ~ roundBitsMask;
1015 if ( z != a ) roundData->exception |= float_flag_inexact;
1021 -------------------------------------------------------------------------------
1022 Returns the result of adding the absolute values of the single-precision
1023 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1024 before being returned. `zSign' is ignored if the result is a NaN. The
1025 addition is performed according to the IEC/IEEE Standard for Binary
1026 Floating-point Arithmetic.
1027 -------------------------------------------------------------------------------
1029 static float32 addFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1031 int16 aExp, bExp, zExp;
1032 bits32 aSig, bSig, zSig;
1035 aSig = extractFloat32Frac( a );
1036 aExp = extractFloat32Exp( a );
1037 bSig = extractFloat32Frac( b );
1038 bExp = extractFloat32Exp( b );
1039 expDiff = aExp - bExp;
1042 if ( 0 < expDiff ) {
1043 if ( aExp == 0xFF ) {
1044 if ( aSig ) return propagateFloat32NaN( a, b );
1053 shift32RightJamming( bSig, expDiff, &bSig );
1056 else if ( expDiff < 0 ) {
1057 if ( bExp == 0xFF ) {
1058 if ( bSig ) return propagateFloat32NaN( a, b );
1059 return packFloat32( zSign, 0xFF, 0 );
1067 shift32RightJamming( aSig, - expDiff, &aSig );
1071 if ( aExp == 0xFF ) {
1072 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1075 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1076 zSig = 0x40000000 + aSig + bSig;
1081 zSig = ( aSig + bSig )<<1;
1083 if ( (sbits32) zSig < 0 ) {
1088 return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1093 -------------------------------------------------------------------------------
1094 Returns the result of subtracting the absolute values of the single-
1095 precision floating-point values `a' and `b'. If `zSign' is true, the
1096 difference is negated before being returned. `zSign' is ignored if the
1097 result is a NaN. The subtraction is performed according to the IEC/IEEE
1098 Standard for Binary Floating-point Arithmetic.
1099 -------------------------------------------------------------------------------
1101 static float32 subFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1103 int16 aExp, bExp, zExp;
1104 bits32 aSig, bSig, zSig;
1107 aSig = extractFloat32Frac( a );
1108 aExp = extractFloat32Exp( a );
1109 bSig = extractFloat32Frac( b );
1110 bExp = extractFloat32Exp( b );
1111 expDiff = aExp - bExp;
1114 if ( 0 < expDiff ) goto aExpBigger;
1115 if ( expDiff < 0 ) goto bExpBigger;
1116 if ( aExp == 0xFF ) {
1117 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1118 roundData->exception |= float_flag_invalid;
1119 return float32_default_nan;
1125 if ( bSig < aSig ) goto aBigger;
1126 if ( aSig < bSig ) goto bBigger;
1127 return packFloat32( roundData->mode == float_round_down, 0, 0 );
1129 if ( bExp == 0xFF ) {
1130 if ( bSig ) return propagateFloat32NaN( a, b );
1131 return packFloat32( zSign ^ 1, 0xFF, 0 );
1139 shift32RightJamming( aSig, - expDiff, &aSig );
1145 goto normalizeRoundAndPack;
1147 if ( aExp == 0xFF ) {
1148 if ( aSig ) return propagateFloat32NaN( a, b );
1157 shift32RightJamming( bSig, expDiff, &bSig );
1162 normalizeRoundAndPack:
1164 return normalizeRoundAndPackFloat32( roundData, zSign, zExp, zSig );
1169 -------------------------------------------------------------------------------
1170 Returns the result of adding the single-precision floating-point values `a'
1171 and `b'. The operation is performed according to the IEC/IEEE Standard for
1172 Binary Floating-point Arithmetic.
1173 -------------------------------------------------------------------------------
1175 float32 float32_add( struct roundingData *roundData, float32 a, float32 b )
1179 aSign = extractFloat32Sign( a );
1180 bSign = extractFloat32Sign( b );
1181 if ( aSign == bSign ) {
1182 return addFloat32Sigs( roundData, a, b, aSign );
1185 return subFloat32Sigs( roundData, a, b, aSign );
1191 -------------------------------------------------------------------------------
1192 Returns the result of subtracting the single-precision floating-point values
1193 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1194 for Binary Floating-point Arithmetic.
1195 -------------------------------------------------------------------------------
1197 float32 float32_sub( struct roundingData *roundData, float32 a, float32 b )
1201 aSign = extractFloat32Sign( a );
1202 bSign = extractFloat32Sign( b );
1203 if ( aSign == bSign ) {
1204 return subFloat32Sigs( roundData, a, b, aSign );
1207 return addFloat32Sigs( roundData, a, b, aSign );
1213 -------------------------------------------------------------------------------
1214 Returns the result of multiplying the single-precision floating-point values
1215 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1216 for Binary Floating-point Arithmetic.
1217 -------------------------------------------------------------------------------
1219 float32 float32_mul( struct roundingData *roundData, float32 a, float32 b )
1221 flag aSign, bSign, zSign;
1222 int16 aExp, bExp, zExp;
1227 aSig = extractFloat32Frac( a );
1228 aExp = extractFloat32Exp( a );
1229 aSign = extractFloat32Sign( a );
1230 bSig = extractFloat32Frac( b );
1231 bExp = extractFloat32Exp( b );
1232 bSign = extractFloat32Sign( b );
1233 zSign = aSign ^ bSign;
1234 if ( aExp == 0xFF ) {
1235 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1236 return propagateFloat32NaN( a, b );
1238 if ( ( bExp | bSig ) == 0 ) {
1239 roundData->exception |= float_flag_invalid;
1240 return float32_default_nan;
1242 return packFloat32( zSign, 0xFF, 0 );
1244 if ( bExp == 0xFF ) {
1245 if ( bSig ) return propagateFloat32NaN( a, b );
1246 if ( ( aExp | aSig ) == 0 ) {
1247 roundData->exception |= float_flag_invalid;
1248 return float32_default_nan;
1250 return packFloat32( zSign, 0xFF, 0 );
1253 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1254 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1257 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1258 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1260 zExp = aExp + bExp - 0x7F;
1261 aSig = ( aSig | 0x00800000 )<<7;
1262 bSig = ( bSig | 0x00800000 )<<8;
1263 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1265 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1269 return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1274 -------------------------------------------------------------------------------
1275 Returns the result of dividing the single-precision floating-point value `a'
1276 by the corresponding value `b'. The operation is performed according to the
1277 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1278 -------------------------------------------------------------------------------
1280 float32 float32_div( struct roundingData *roundData, float32 a, float32 b )
1282 flag aSign, bSign, zSign;
1283 int16 aExp, bExp, zExp;
1284 bits32 aSig, bSig, zSig;
1286 aSig = extractFloat32Frac( a );
1287 aExp = extractFloat32Exp( a );
1288 aSign = extractFloat32Sign( a );
1289 bSig = extractFloat32Frac( b );
1290 bExp = extractFloat32Exp( b );
1291 bSign = extractFloat32Sign( b );
1292 zSign = aSign ^ bSign;
1293 if ( aExp == 0xFF ) {
1294 if ( aSig ) return propagateFloat32NaN( a, b );
1295 if ( bExp == 0xFF ) {
1296 if ( bSig ) return propagateFloat32NaN( a, b );
1297 roundData->exception |= float_flag_invalid;
1298 return float32_default_nan;
1300 return packFloat32( zSign, 0xFF, 0 );
1302 if ( bExp == 0xFF ) {
1303 if ( bSig ) return propagateFloat32NaN( a, b );
1304 return packFloat32( zSign, 0, 0 );
1308 if ( ( aExp | aSig ) == 0 ) {
1309 roundData->exception |= float_flag_invalid;
1310 return float32_default_nan;
1312 roundData->exception |= float_flag_divbyzero;
1313 return packFloat32( zSign, 0xFF, 0 );
1315 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1318 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1319 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1321 zExp = aExp - bExp + 0x7D;
1322 aSig = ( aSig | 0x00800000 )<<7;
1323 bSig = ( bSig | 0x00800000 )<<8;
1324 if ( bSig <= ( aSig + aSig ) ) {
1329 bits64 tmp = ( (bits64) aSig )<<32;
1330 do_div( tmp, bSig );
1333 if ( ( zSig & 0x3F ) == 0 ) {
1334 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1336 return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1341 -------------------------------------------------------------------------------
1342 Returns the remainder of the single-precision floating-point value `a'
1343 with respect to the corresponding value `b'. The operation is performed
1344 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1345 -------------------------------------------------------------------------------
1347 float32 float32_rem( struct roundingData *roundData, float32 a, float32 b )
1349 flag aSign, bSign, zSign;
1350 int16 aExp, bExp, expDiff;
1353 bits64 aSig64, bSig64, q64;
1354 bits32 alternateASig;
1357 aSig = extractFloat32Frac( a );
1358 aExp = extractFloat32Exp( a );
1359 aSign = extractFloat32Sign( a );
1360 bSig = extractFloat32Frac( b );
1361 bExp = extractFloat32Exp( b );
1362 bSign = extractFloat32Sign( b );
1363 if ( aExp == 0xFF ) {
1364 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1365 return propagateFloat32NaN( a, b );
1367 roundData->exception |= float_flag_invalid;
1368 return float32_default_nan;
1370 if ( bExp == 0xFF ) {
1371 if ( bSig ) return propagateFloat32NaN( a, b );
1376 roundData->exception |= float_flag_invalid;
1377 return float32_default_nan;
1379 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1382 if ( aSig == 0 ) return a;
1383 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1385 expDiff = aExp - bExp;
1388 if ( expDiff < 32 ) {
1391 if ( expDiff < 0 ) {
1392 if ( expDiff < -1 ) return a;
1395 q = ( bSig <= aSig );
1396 if ( q ) aSig -= bSig;
1397 if ( 0 < expDiff ) {
1398 bits64 tmp = ( (bits64) aSig )<<32;
1399 do_div( tmp, bSig );
1403 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1411 if ( bSig <= aSig ) aSig -= bSig;
1412 aSig64 = ( (bits64) aSig )<<40;
1413 bSig64 = ( (bits64) bSig )<<40;
1415 while ( 0 < expDiff ) {
1416 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1417 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1418 aSig64 = - ( ( bSig * q64 )<<38 );
1422 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1423 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1424 q = q64>>( 64 - expDiff );
1426 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1429 alternateASig = aSig;
1432 } while ( 0 <= (sbits32) aSig );
1433 sigMean = aSig + alternateASig;
1434 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1435 aSig = alternateASig;
1437 zSign = ( (sbits32) aSig < 0 );
1438 if ( zSign ) aSig = - aSig;
1439 return normalizeRoundAndPackFloat32( roundData, aSign ^ zSign, bExp, aSig );
1444 -------------------------------------------------------------------------------
1445 Returns the square root of the single-precision floating-point value `a'.
1446 The operation is performed according to the IEC/IEEE Standard for Binary
1447 Floating-point Arithmetic.
1448 -------------------------------------------------------------------------------
1450 float32 float32_sqrt( struct roundingData *roundData, float32 a )
1457 aSig = extractFloat32Frac( a );
1458 aExp = extractFloat32Exp( a );
1459 aSign = extractFloat32Sign( a );
1460 if ( aExp == 0xFF ) {
1461 if ( aSig ) return propagateFloat32NaN( a, 0 );
1462 if ( ! aSign ) return a;
1463 roundData->exception |= float_flag_invalid;
1464 return float32_default_nan;
1467 if ( ( aExp | aSig ) == 0 ) return a;
1468 roundData->exception |= float_flag_invalid;
1469 return float32_default_nan;
1472 if ( aSig == 0 ) return 0;
1473 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1475 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1476 aSig = ( aSig | 0x00800000 )<<8;
1477 zSig = estimateSqrt32( aExp, aSig ) + 2;
1478 if ( ( zSig & 0x7F ) <= 5 ) {
1484 term = ( (bits64) zSig ) * zSig;
1485 rem = ( ( (bits64) aSig )<<32 ) - term;
1486 while ( (sbits64) rem < 0 ) {
1488 rem += ( ( (bits64) zSig )<<1 ) | 1;
1490 zSig |= ( rem != 0 );
1493 shift32RightJamming( zSig, 1, &zSig );
1494 return roundAndPackFloat32( roundData, 0, zExp, zSig );
1499 -------------------------------------------------------------------------------
1500 Returns 1 if the single-precision floating-point value `a' is equal to the
1501 corresponding value `b', and 0 otherwise. The comparison is performed
1502 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1503 -------------------------------------------------------------------------------
1505 flag float32_eq( float32 a, float32 b )
1508 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1509 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1511 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1512 float_raise( float_flag_invalid );
1516 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1521 -------------------------------------------------------------------------------
1522 Returns 1 if the single-precision floating-point value `a' is less than or
1523 equal to the corresponding value `b', and 0 otherwise. The comparison is
1524 performed according to the IEC/IEEE Standard for Binary Floating-point
1526 -------------------------------------------------------------------------------
1528 flag float32_le( float32 a, float32 b )
1532 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1533 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1535 float_raise( float_flag_invalid );
1538 aSign = extractFloat32Sign( a );
1539 bSign = extractFloat32Sign( b );
1540 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1541 return ( a == b ) || ( aSign ^ ( a < b ) );
1546 -------------------------------------------------------------------------------
1547 Returns 1 if the single-precision floating-point value `a' is less than
1548 the corresponding value `b', and 0 otherwise. The comparison is performed
1549 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1550 -------------------------------------------------------------------------------
1552 flag float32_lt( float32 a, float32 b )
1556 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1557 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1559 float_raise( float_flag_invalid );
1562 aSign = extractFloat32Sign( a );
1563 bSign = extractFloat32Sign( b );
1564 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1565 return ( a != b ) && ( aSign ^ ( a < b ) );
1570 -------------------------------------------------------------------------------
1571 Returns 1 if the single-precision floating-point value `a' is equal to the
1572 corresponding value `b', and 0 otherwise. The invalid exception is raised
1573 if either operand is a NaN. Otherwise, the comparison is performed
1574 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1575 -------------------------------------------------------------------------------
1577 flag float32_eq_signaling( float32 a, float32 b )
1580 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1581 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1583 float_raise( float_flag_invalid );
1586 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1591 -------------------------------------------------------------------------------
1592 Returns 1 if the single-precision floating-point value `a' is less than or
1593 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1594 cause an exception. Otherwise, the comparison is performed according to the
1595 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1596 -------------------------------------------------------------------------------
1598 flag float32_le_quiet( float32 a, float32 b )
1603 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1604 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1606 /* Do nothing, even if NaN as we're quiet */
1609 aSign = extractFloat32Sign( a );
1610 bSign = extractFloat32Sign( b );
1611 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1612 return ( a == b ) || ( aSign ^ ( a < b ) );
1617 -------------------------------------------------------------------------------
1618 Returns 1 if the single-precision floating-point value `a' is less than
1619 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1620 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1621 Standard for Binary Floating-point Arithmetic.
1622 -------------------------------------------------------------------------------
1624 flag float32_lt_quiet( float32 a, float32 b )
1628 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1629 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1631 /* Do nothing, even if NaN as we're quiet */
1634 aSign = extractFloat32Sign( a );
1635 bSign = extractFloat32Sign( b );
1636 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1637 return ( a != b ) && ( aSign ^ ( a < b ) );
1642 -------------------------------------------------------------------------------
1643 Returns the result of converting the double-precision floating-point value
1644 `a' to the 32-bit two's complement integer format. The conversion is
1645 performed according to the IEC/IEEE Standard for Binary Floating-point
1646 Arithmetic---which means in particular that the conversion is rounded
1647 according to the current rounding mode. If `a' is a NaN, the largest
1648 positive integer is returned. Otherwise, if the conversion overflows, the
1649 largest integer with the same sign as `a' is returned.
1650 -------------------------------------------------------------------------------
1652 int32 float64_to_int32( struct roundingData *roundData, float64 a )
1655 int16 aExp, shiftCount;
1658 aSig = extractFloat64Frac( a );
1659 aExp = extractFloat64Exp( a );
1660 aSign = extractFloat64Sign( a );
1661 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1662 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1663 shiftCount = 0x42C - aExp;
1664 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1665 return roundAndPackInt32( roundData, aSign, aSig );
1670 -------------------------------------------------------------------------------
1671 Returns the result of converting the double-precision floating-point value
1672 `a' to the 32-bit two's complement integer format. The conversion is
1673 performed according to the IEC/IEEE Standard for Binary Floating-point
1674 Arithmetic, except that the conversion is always rounded toward zero. If
1675 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1676 conversion overflows, the largest integer with the same sign as `a' is
1678 -------------------------------------------------------------------------------
1680 int32 float64_to_int32_round_to_zero( float64 a )
1683 int16 aExp, shiftCount;
1684 bits64 aSig, savedASig;
1687 aSig = extractFloat64Frac( a );
1688 aExp = extractFloat64Exp( a );
1689 aSign = extractFloat64Sign( a );
1690 shiftCount = 0x433 - aExp;
1691 if ( shiftCount < 21 ) {
1692 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1695 else if ( 52 < shiftCount ) {
1696 if ( aExp || aSig ) float_raise( float_flag_inexact );
1699 aSig |= LIT64( 0x0010000000000000 );
1701 aSig >>= shiftCount;
1703 if ( aSign ) z = - z;
1704 if ( ( z < 0 ) ^ aSign ) {
1706 float_raise( float_flag_invalid );
1707 return aSign ? 0x80000000 : 0x7FFFFFFF;
1709 if ( ( aSig<<shiftCount ) != savedASig ) {
1710 float_raise( float_flag_inexact );
1717 -------------------------------------------------------------------------------
1718 Returns the result of converting the double-precision floating-point value
1719 `a' to the 32-bit two's complement unsigned integer format. The conversion
1720 is performed according to the IEC/IEEE Standard for Binary Floating-point
1721 Arithmetic---which means in particular that the conversion is rounded
1722 according to the current rounding mode. If `a' is a NaN, the largest
1723 positive integer is returned. Otherwise, if the conversion overflows, the
1724 largest positive integer is returned.
1725 -------------------------------------------------------------------------------
1727 int32 float64_to_uint32( struct roundingData *roundData, float64 a )
1730 int16 aExp, shiftCount;
1733 aSig = extractFloat64Frac( a );
1734 aExp = extractFloat64Exp( a );
1735 aSign = 0; //extractFloat64Sign( a );
1736 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1737 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1738 shiftCount = 0x42C - aExp;
1739 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1740 return roundAndPackInt32( roundData, aSign, aSig );
1744 -------------------------------------------------------------------------------
1745 Returns the result of converting the double-precision floating-point value
1746 `a' to the 32-bit two's complement integer format. The conversion is
1747 performed according to the IEC/IEEE Standard for Binary Floating-point
1748 Arithmetic, except that the conversion is always rounded toward zero. If
1749 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1750 conversion overflows, the largest positive integer is returned.
1751 -------------------------------------------------------------------------------
1753 int32 float64_to_uint32_round_to_zero( float64 a )
1756 int16 aExp, shiftCount;
1757 bits64 aSig, savedASig;
1760 aSig = extractFloat64Frac( a );
1761 aExp = extractFloat64Exp( a );
1762 aSign = extractFloat64Sign( a );
1763 shiftCount = 0x433 - aExp;
1764 if ( shiftCount < 21 ) {
1765 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1768 else if ( 52 < shiftCount ) {
1769 if ( aExp || aSig ) float_raise( float_flag_inexact );
1772 aSig |= LIT64( 0x0010000000000000 );
1774 aSig >>= shiftCount;
1776 if ( aSign ) z = - z;
1777 if ( ( z < 0 ) ^ aSign ) {
1779 float_raise( float_flag_invalid );
1780 return aSign ? 0x80000000 : 0x7FFFFFFF;
1782 if ( ( aSig<<shiftCount ) != savedASig ) {
1783 float_raise( float_flag_inexact );
1789 -------------------------------------------------------------------------------
1790 Returns the result of converting the double-precision floating-point value
1791 `a' to the single-precision floating-point format. The conversion is
1792 performed according to the IEC/IEEE Standard for Binary Floating-point
1794 -------------------------------------------------------------------------------
1796 float32 float64_to_float32( struct roundingData *roundData, float64 a )
1803 aSig = extractFloat64Frac( a );
1804 aExp = extractFloat64Exp( a );
1805 aSign = extractFloat64Sign( a );
1806 if ( aExp == 0x7FF ) {
1807 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1808 return packFloat32( aSign, 0xFF, 0 );
1810 shift64RightJamming( aSig, 22, &aSig );
1812 if ( aExp || zSig ) {
1816 return roundAndPackFloat32( roundData, aSign, aExp, zSig );
1823 -------------------------------------------------------------------------------
1824 Returns the result of converting the double-precision floating-point value
1825 `a' to the extended double-precision floating-point format. The conversion
1826 is performed according to the IEC/IEEE Standard for Binary Floating-point
1828 -------------------------------------------------------------------------------
1830 floatx80 float64_to_floatx80( float64 a )
1836 aSig = extractFloat64Frac( a );
1837 aExp = extractFloat64Exp( a );
1838 aSign = extractFloat64Sign( a );
1839 if ( aExp == 0x7FF ) {
1840 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1841 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1844 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1845 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1849 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1856 -------------------------------------------------------------------------------
1857 Rounds the double-precision floating-point value `a' to an integer, and
1858 returns the result as a double-precision floating-point value. The
1859 operation is performed according to the IEC/IEEE Standard for Binary
1860 Floating-point Arithmetic.
1861 -------------------------------------------------------------------------------
1863 float64 float64_round_to_int( struct roundingData *roundData, float64 a )
1867 bits64 lastBitMask, roundBitsMask;
1871 aExp = extractFloat64Exp( a );
1872 if ( 0x433 <= aExp ) {
1873 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1874 return propagateFloat64NaN( a, a );
1878 if ( aExp <= 0x3FE ) {
1879 if ( (bits64) ( a<<1 ) == 0 ) return a;
1880 roundData->exception |= float_flag_inexact;
1881 aSign = extractFloat64Sign( a );
1882 switch ( roundData->mode ) {
1883 case float_round_nearest_even:
1884 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1885 return packFloat64( aSign, 0x3FF, 0 );
1888 case float_round_down:
1889 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1890 case float_round_up:
1892 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1894 return packFloat64( aSign, 0, 0 );
1897 lastBitMask <<= 0x433 - aExp;
1898 roundBitsMask = lastBitMask - 1;
1900 roundingMode = roundData->mode;
1901 if ( roundingMode == float_round_nearest_even ) {
1902 z += lastBitMask>>1;
1903 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1905 else if ( roundingMode != float_round_to_zero ) {
1906 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1910 z &= ~ roundBitsMask;
1911 if ( z != a ) roundData->exception |= float_flag_inexact;
1917 -------------------------------------------------------------------------------
1918 Returns the result of adding the absolute values of the double-precision
1919 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1920 before being returned. `zSign' is ignored if the result is a NaN. The
1921 addition is performed according to the IEC/IEEE Standard for Binary
1922 Floating-point Arithmetic.
1923 -------------------------------------------------------------------------------
1925 static float64 addFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1927 int16 aExp, bExp, zExp;
1928 bits64 aSig, bSig, zSig;
1931 aSig = extractFloat64Frac( a );
1932 aExp = extractFloat64Exp( a );
1933 bSig = extractFloat64Frac( b );
1934 bExp = extractFloat64Exp( b );
1935 expDiff = aExp - bExp;
1938 if ( 0 < expDiff ) {
1939 if ( aExp == 0x7FF ) {
1940 if ( aSig ) return propagateFloat64NaN( a, b );
1947 bSig |= LIT64( 0x2000000000000000 );
1949 shift64RightJamming( bSig, expDiff, &bSig );
1952 else if ( expDiff < 0 ) {
1953 if ( bExp == 0x7FF ) {
1954 if ( bSig ) return propagateFloat64NaN( a, b );
1955 return packFloat64( zSign, 0x7FF, 0 );
1961 aSig |= LIT64( 0x2000000000000000 );
1963 shift64RightJamming( aSig, - expDiff, &aSig );
1967 if ( aExp == 0x7FF ) {
1968 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1971 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1972 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1976 aSig |= LIT64( 0x2000000000000000 );
1977 zSig = ( aSig + bSig )<<1;
1979 if ( (sbits64) zSig < 0 ) {
1984 return roundAndPackFloat64( roundData, zSign, zExp, zSig );
1989 -------------------------------------------------------------------------------
1990 Returns the result of subtracting the absolute values of the double-
1991 precision floating-point values `a' and `b'. If `zSign' is true, the
1992 difference is negated before being returned. `zSign' is ignored if the
1993 result is a NaN. The subtraction is performed according to the IEC/IEEE
1994 Standard for Binary Floating-point Arithmetic.
1995 -------------------------------------------------------------------------------
1997 static float64 subFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1999 int16 aExp, bExp, zExp;
2000 bits64 aSig, bSig, zSig;
2003 aSig = extractFloat64Frac( a );
2004 aExp = extractFloat64Exp( a );
2005 bSig = extractFloat64Frac( b );
2006 bExp = extractFloat64Exp( b );
2007 expDiff = aExp - bExp;
2010 if ( 0 < expDiff ) goto aExpBigger;
2011 if ( expDiff < 0 ) goto bExpBigger;
2012 if ( aExp == 0x7FF ) {
2013 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2014 roundData->exception |= float_flag_invalid;
2015 return float64_default_nan;
2021 if ( bSig < aSig ) goto aBigger;
2022 if ( aSig < bSig ) goto bBigger;
2023 return packFloat64( roundData->mode == float_round_down, 0, 0 );
2025 if ( bExp == 0x7FF ) {
2026 if ( bSig ) return propagateFloat64NaN( a, b );
2027 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2033 aSig |= LIT64( 0x4000000000000000 );
2035 shift64RightJamming( aSig, - expDiff, &aSig );
2036 bSig |= LIT64( 0x4000000000000000 );
2041 goto normalizeRoundAndPack;
2043 if ( aExp == 0x7FF ) {
2044 if ( aSig ) return propagateFloat64NaN( a, b );
2051 bSig |= LIT64( 0x4000000000000000 );
2053 shift64RightJamming( bSig, expDiff, &bSig );
2054 aSig |= LIT64( 0x4000000000000000 );
2058 normalizeRoundAndPack:
2060 return normalizeRoundAndPackFloat64( roundData, zSign, zExp, zSig );
2065 -------------------------------------------------------------------------------
2066 Returns the result of adding the double-precision floating-point values `a'
2067 and `b'. The operation is performed according to the IEC/IEEE Standard for
2068 Binary Floating-point Arithmetic.
2069 -------------------------------------------------------------------------------
2071 float64 float64_add( struct roundingData *roundData, float64 a, float64 b )
2075 aSign = extractFloat64Sign( a );
2076 bSign = extractFloat64Sign( b );
2077 if ( aSign == bSign ) {
2078 return addFloat64Sigs( roundData, a, b, aSign );
2081 return subFloat64Sigs( roundData, a, b, aSign );
2087 -------------------------------------------------------------------------------
2088 Returns the result of subtracting the double-precision floating-point values
2089 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2090 for Binary Floating-point Arithmetic.
2091 -------------------------------------------------------------------------------
2093 float64 float64_sub( struct roundingData *roundData, float64 a, float64 b )
2097 aSign = extractFloat64Sign( a );
2098 bSign = extractFloat64Sign( b );
2099 if ( aSign == bSign ) {
2100 return subFloat64Sigs( roundData, a, b, aSign );
2103 return addFloat64Sigs( roundData, a, b, aSign );
2109 -------------------------------------------------------------------------------
2110 Returns the result of multiplying the double-precision floating-point values
2111 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2112 for Binary Floating-point Arithmetic.
2113 -------------------------------------------------------------------------------
2115 float64 float64_mul( struct roundingData *roundData, float64 a, float64 b )
2117 flag aSign, bSign, zSign;
2118 int16 aExp, bExp, zExp;
2119 bits64 aSig, bSig, zSig0, zSig1;
2121 aSig = extractFloat64Frac( a );
2122 aExp = extractFloat64Exp( a );
2123 aSign = extractFloat64Sign( a );
2124 bSig = extractFloat64Frac( b );
2125 bExp = extractFloat64Exp( b );
2126 bSign = extractFloat64Sign( b );
2127 zSign = aSign ^ bSign;
2128 if ( aExp == 0x7FF ) {
2129 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2130 return propagateFloat64NaN( a, b );
2132 if ( ( bExp | bSig ) == 0 ) {
2133 roundData->exception |= float_flag_invalid;
2134 return float64_default_nan;
2136 return packFloat64( zSign, 0x7FF, 0 );
2138 if ( bExp == 0x7FF ) {
2139 if ( bSig ) return propagateFloat64NaN( a, b );
2140 if ( ( aExp | aSig ) == 0 ) {
2141 roundData->exception |= float_flag_invalid;
2142 return float64_default_nan;
2144 return packFloat64( zSign, 0x7FF, 0 );
2147 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2148 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2151 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2152 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2154 zExp = aExp + bExp - 0x3FF;
2155 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2156 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2157 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2158 zSig0 |= ( zSig1 != 0 );
2159 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2163 return roundAndPackFloat64( roundData, zSign, zExp, zSig0 );
2168 -------------------------------------------------------------------------------
2169 Returns the result of dividing the double-precision floating-point value `a'
2170 by the corresponding value `b'. The operation is performed according to
2171 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2172 -------------------------------------------------------------------------------
2174 float64 float64_div( struct roundingData *roundData, float64 a, float64 b )
2176 flag aSign, bSign, zSign;
2177 int16 aExp, bExp, zExp;
2178 bits64 aSig, bSig, zSig;
2180 bits64 term0, term1;
2182 aSig = extractFloat64Frac( a );
2183 aExp = extractFloat64Exp( a );
2184 aSign = extractFloat64Sign( a );
2185 bSig = extractFloat64Frac( b );
2186 bExp = extractFloat64Exp( b );
2187 bSign = extractFloat64Sign( b );
2188 zSign = aSign ^ bSign;
2189 if ( aExp == 0x7FF ) {
2190 if ( aSig ) return propagateFloat64NaN( a, b );
2191 if ( bExp == 0x7FF ) {
2192 if ( bSig ) return propagateFloat64NaN( a, b );
2193 roundData->exception |= float_flag_invalid;
2194 return float64_default_nan;
2196 return packFloat64( zSign, 0x7FF, 0 );
2198 if ( bExp == 0x7FF ) {
2199 if ( bSig ) return propagateFloat64NaN( a, b );
2200 return packFloat64( zSign, 0, 0 );
2204 if ( ( aExp | aSig ) == 0 ) {
2205 roundData->exception |= float_flag_invalid;
2206 return float64_default_nan;
2208 roundData->exception |= float_flag_divbyzero;
2209 return packFloat64( zSign, 0x7FF, 0 );
2211 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2214 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2215 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2217 zExp = aExp - bExp + 0x3FD;
2218 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2219 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2220 if ( bSig <= ( aSig + aSig ) ) {
2224 zSig = estimateDiv128To64( aSig, 0, bSig );
2225 if ( ( zSig & 0x1FF ) <= 2 ) {
2226 mul64To128( bSig, zSig, &term0, &term1 );
2227 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2228 while ( (sbits64) rem0 < 0 ) {
2230 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2232 zSig |= ( rem1 != 0 );
2234 return roundAndPackFloat64( roundData, zSign, zExp, zSig );
2239 -------------------------------------------------------------------------------
2240 Returns the remainder of the double-precision floating-point value `a'
2241 with respect to the corresponding value `b'. The operation is performed
2242 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2243 -------------------------------------------------------------------------------
2245 float64 float64_rem( struct roundingData *roundData, float64 a, float64 b )
2247 flag aSign, bSign, zSign;
2248 int16 aExp, bExp, expDiff;
2250 bits64 q, alternateASig;
2253 aSig = extractFloat64Frac( a );
2254 aExp = extractFloat64Exp( a );
2255 aSign = extractFloat64Sign( a );
2256 bSig = extractFloat64Frac( b );
2257 bExp = extractFloat64Exp( b );
2258 bSign = extractFloat64Sign( b );
2259 if ( aExp == 0x7FF ) {
2260 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2261 return propagateFloat64NaN( a, b );
2263 roundData->exception |= float_flag_invalid;
2264 return float64_default_nan;
2266 if ( bExp == 0x7FF ) {
2267 if ( bSig ) return propagateFloat64NaN( a, b );
2272 roundData->exception |= float_flag_invalid;
2273 return float64_default_nan;
2275 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2278 if ( aSig == 0 ) return a;
2279 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2281 expDiff = aExp - bExp;
2282 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2283 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2284 if ( expDiff < 0 ) {
2285 if ( expDiff < -1 ) return a;
2288 q = ( bSig <= aSig );
2289 if ( q ) aSig -= bSig;
2291 while ( 0 < expDiff ) {
2292 q = estimateDiv128To64( aSig, 0, bSig );
2293 q = ( 2 < q ) ? q - 2 : 0;
2294 aSig = - ( ( bSig>>2 ) * q );
2298 if ( 0 < expDiff ) {
2299 q = estimateDiv128To64( aSig, 0, bSig );
2300 q = ( 2 < q ) ? q - 2 : 0;
2303 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2310 alternateASig = aSig;
2313 } while ( 0 <= (sbits64) aSig );
2314 sigMean = aSig + alternateASig;
2315 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2316 aSig = alternateASig;
2318 zSign = ( (sbits64) aSig < 0 );
2319 if ( zSign ) aSig = - aSig;
2320 return normalizeRoundAndPackFloat64( roundData, aSign ^ zSign, bExp, aSig );
2325 -------------------------------------------------------------------------------
2326 Returns the square root of the double-precision floating-point value `a'.
2327 The operation is performed according to the IEC/IEEE Standard for Binary
2328 Floating-point Arithmetic.
2329 -------------------------------------------------------------------------------
2331 float64 float64_sqrt( struct roundingData *roundData, float64 a )
2336 bits64 rem0, rem1, term0, term1; //, shiftedRem;
2339 aSig = extractFloat64Frac( a );
2340 aExp = extractFloat64Exp( a );
2341 aSign = extractFloat64Sign( a );
2342 if ( aExp == 0x7FF ) {
2343 if ( aSig ) return propagateFloat64NaN( a, a );
2344 if ( ! aSign ) return a;
2345 roundData->exception |= float_flag_invalid;
2346 return float64_default_nan;
2349 if ( ( aExp | aSig ) == 0 ) return a;
2350 roundData->exception |= float_flag_invalid;
2351 return float64_default_nan;
2354 if ( aSig == 0 ) return 0;
2355 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2357 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2358 aSig |= LIT64( 0x0010000000000000 );
2359 zSig = estimateSqrt32( aExp, aSig>>21 );
2361 aSig <<= 9 - ( aExp & 1 );
2362 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2363 if ( ( zSig & 0x3FF ) <= 5 ) {
2365 zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2369 mul64To128( zSig, zSig, &term0, &term1 );
2370 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2371 while ( (sbits64) rem0 < 0 ) {
2373 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2375 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2377 zSig |= ( ( rem0 | rem1 ) != 0 );
2380 shift64RightJamming( zSig, 1, &zSig );
2381 return roundAndPackFloat64( roundData, 0, zExp, zSig );
2386 -------------------------------------------------------------------------------
2387 Returns 1 if the double-precision floating-point value `a' is equal to the
2388 corresponding value `b', and 0 otherwise. The comparison is performed
2389 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2390 -------------------------------------------------------------------------------
2392 flag float64_eq( float64 a, float64 b )
2395 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2396 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2398 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2399 float_raise( float_flag_invalid );
2403 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2408 -------------------------------------------------------------------------------
2409 Returns 1 if the double-precision floating-point value `a' is less than or
2410 equal to the corresponding value `b', and 0 otherwise. The comparison is
2411 performed according to the IEC/IEEE Standard for Binary Floating-point
2413 -------------------------------------------------------------------------------
2415 flag float64_le( float64 a, float64 b )
2419 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2420 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2422 float_raise( float_flag_invalid );
2425 aSign = extractFloat64Sign( a );
2426 bSign = extractFloat64Sign( b );
2427 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2428 return ( a == b ) || ( aSign ^ ( a < b ) );
2433 -------------------------------------------------------------------------------
2434 Returns 1 if the double-precision floating-point value `a' is less than
2435 the corresponding value `b', and 0 otherwise. The comparison is performed
2436 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2437 -------------------------------------------------------------------------------
2439 flag float64_lt( float64 a, float64 b )
2443 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2444 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2446 float_raise( float_flag_invalid );
2449 aSign = extractFloat64Sign( a );
2450 bSign = extractFloat64Sign( b );
2451 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2452 return ( a != b ) && ( aSign ^ ( a < b ) );
2457 -------------------------------------------------------------------------------
2458 Returns 1 if the double-precision floating-point value `a' is equal to the
2459 corresponding value `b', and 0 otherwise. The invalid exception is raised
2460 if either operand is a NaN. Otherwise, the comparison is performed
2461 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2462 -------------------------------------------------------------------------------
2464 flag float64_eq_signaling( float64 a, float64 b )
2467 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2468 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2470 float_raise( float_flag_invalid );
2473 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2478 -------------------------------------------------------------------------------
2479 Returns 1 if the double-precision floating-point value `a' is less than or
2480 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2481 cause an exception. Otherwise, the comparison is performed according to the
2482 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2483 -------------------------------------------------------------------------------
2485 flag float64_le_quiet( float64 a, float64 b )
2490 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2491 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2493 /* Do nothing, even if NaN as we're quiet */
2496 aSign = extractFloat64Sign( a );
2497 bSign = extractFloat64Sign( b );
2498 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2499 return ( a == b ) || ( aSign ^ ( a < b ) );
2504 -------------------------------------------------------------------------------
2505 Returns 1 if the double-precision floating-point value `a' is less than
2506 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2507 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2508 Standard for Binary Floating-point Arithmetic.
2509 -------------------------------------------------------------------------------
2511 flag float64_lt_quiet( float64 a, float64 b )
2515 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2516 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2518 /* Do nothing, even if NaN as we're quiet */
2521 aSign = extractFloat64Sign( a );
2522 bSign = extractFloat64Sign( b );
2523 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2524 return ( a != b ) && ( aSign ^ ( a < b ) );
2531 -------------------------------------------------------------------------------
2532 Returns the result of converting the extended double-precision floating-
2533 point value `a' to the 32-bit two's complement integer format. The
2534 conversion is performed according to the IEC/IEEE Standard for Binary
2535 Floating-point Arithmetic---which means in particular that the conversion
2536 is rounded according to the current rounding mode. If `a' is a NaN, the
2537 largest positive integer is returned. Otherwise, if the conversion
2538 overflows, the largest integer with the same sign as `a' is returned.
2539 -------------------------------------------------------------------------------
2541 int32 floatx80_to_int32( struct roundingData *roundData, floatx80 a )
2544 int32 aExp, shiftCount;
2547 aSig = extractFloatx80Frac( a );
2548 aExp = extractFloatx80Exp( a );
2549 aSign = extractFloatx80Sign( a );
2550 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2551 shiftCount = 0x4037 - aExp;
2552 if ( shiftCount <= 0 ) shiftCount = 1;
2553 shift64RightJamming( aSig, shiftCount, &aSig );
2554 return roundAndPackInt32( roundData, aSign, aSig );
2559 -------------------------------------------------------------------------------
2560 Returns the result of converting the extended double-precision floating-
2561 point value `a' to the 32-bit two's complement integer format. The
2562 conversion is performed according to the IEC/IEEE Standard for Binary
2563 Floating-point Arithmetic, except that the conversion is always rounded
2564 toward zero. If `a' is a NaN, the largest positive integer is returned.
2565 Otherwise, if the conversion overflows, the largest integer with the same
2566 sign as `a' is returned.
2567 -------------------------------------------------------------------------------
2569 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2572 int32 aExp, shiftCount;
2573 bits64 aSig, savedASig;
2576 aSig = extractFloatx80Frac( a );
2577 aExp = extractFloatx80Exp( a );
2578 aSign = extractFloatx80Sign( a );
2579 shiftCount = 0x403E - aExp;
2580 if ( shiftCount < 32 ) {
2581 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2584 else if ( 63 < shiftCount ) {
2585 if ( aExp || aSig ) float_raise( float_flag_inexact );
2589 aSig >>= shiftCount;
2591 if ( aSign ) z = - z;
2592 if ( ( z < 0 ) ^ aSign ) {
2594 float_raise( float_flag_invalid );
2595 return aSign ? 0x80000000 : 0x7FFFFFFF;
2597 if ( ( aSig<<shiftCount ) != savedASig ) {
2598 float_raise( float_flag_inexact );
2605 -------------------------------------------------------------------------------
2606 Returns the result of converting the extended double-precision floating-
2607 point value `a' to the single-precision floating-point format. The
2608 conversion is performed according to the IEC/IEEE Standard for Binary
2609 Floating-point Arithmetic.
2610 -------------------------------------------------------------------------------
2612 float32 floatx80_to_float32( struct roundingData *roundData, floatx80 a )
2618 aSig = extractFloatx80Frac( a );
2619 aExp = extractFloatx80Exp( a );
2620 aSign = extractFloatx80Sign( a );
2621 if ( aExp == 0x7FFF ) {
2622 if ( (bits64) ( aSig<<1 ) ) {
2623 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2625 return packFloat32( aSign, 0xFF, 0 );
2627 shift64RightJamming( aSig, 33, &aSig );
2628 if ( aExp || aSig ) aExp -= 0x3F81;
2629 return roundAndPackFloat32( roundData, aSign, aExp, aSig );
2634 -------------------------------------------------------------------------------
2635 Returns the result of converting the extended double-precision floating-
2636 point value `a' to the double-precision floating-point format. The
2637 conversion is performed according to the IEC/IEEE Standard for Binary
2638 Floating-point Arithmetic.
2639 -------------------------------------------------------------------------------
2641 float64 floatx80_to_float64( struct roundingData *roundData, floatx80 a )
2647 aSig = extractFloatx80Frac( a );
2648 aExp = extractFloatx80Exp( a );
2649 aSign = extractFloatx80Sign( a );
2650 if ( aExp == 0x7FFF ) {
2651 if ( (bits64) ( aSig<<1 ) ) {
2652 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2654 return packFloat64( aSign, 0x7FF, 0 );
2656 shift64RightJamming( aSig, 1, &zSig );
2657 if ( aExp || aSig ) aExp -= 0x3C01;
2658 return roundAndPackFloat64( roundData, aSign, aExp, zSig );
2663 -------------------------------------------------------------------------------
2664 Rounds the extended double-precision floating-point value `a' to an integer,
2665 and returns the result as an extended quadruple-precision floating-point
2666 value. The operation is performed according to the IEC/IEEE Standard for
2667 Binary Floating-point Arithmetic.
2668 -------------------------------------------------------------------------------
2670 floatx80 floatx80_round_to_int( struct roundingData *roundData, floatx80 a )
2674 bits64 lastBitMask, roundBitsMask;
2678 aExp = extractFloatx80Exp( a );
2679 if ( 0x403E <= aExp ) {
2680 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2681 return propagateFloatx80NaN( a, a );
2685 if ( aExp <= 0x3FFE ) {
2687 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2690 roundData->exception |= float_flag_inexact;
2691 aSign = extractFloatx80Sign( a );
2692 switch ( roundData->mode ) {
2693 case float_round_nearest_even:
2694 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2697 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2700 case float_round_down:
2703 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2704 : packFloatx80( 0, 0, 0 );
2705 case float_round_up:
2707 aSign ? packFloatx80( 1, 0, 0 )
2708 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2710 return packFloatx80( aSign, 0, 0 );
2713 lastBitMask <<= 0x403E - aExp;
2714 roundBitsMask = lastBitMask - 1;
2716 roundingMode = roundData->mode;
2717 if ( roundingMode == float_round_nearest_even ) {
2718 z.low += lastBitMask>>1;
2719 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2721 else if ( roundingMode != float_round_to_zero ) {
2722 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2723 z.low += roundBitsMask;
2726 z.low &= ~ roundBitsMask;
2729 z.low = LIT64( 0x8000000000000000 );
2731 if ( z.low != a.low ) roundData->exception |= float_flag_inexact;
2737 -------------------------------------------------------------------------------
2738 Returns the result of adding the absolute values of the extended double-
2739 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2740 negated before being returned. `zSign' is ignored if the result is a NaN.
2741 The addition is performed according to the IEC/IEEE Standard for Binary
2742 Floating-point Arithmetic.
2743 -------------------------------------------------------------------------------
2745 static floatx80 addFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2747 int32 aExp, bExp, zExp;
2748 bits64 aSig, bSig, zSig0, zSig1;
2751 aSig = extractFloatx80Frac( a );
2752 aExp = extractFloatx80Exp( a );
2753 bSig = extractFloatx80Frac( b );
2754 bExp = extractFloatx80Exp( b );
2755 expDiff = aExp - bExp;
2756 if ( 0 < expDiff ) {
2757 if ( aExp == 0x7FFF ) {
2758 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2761 if ( bExp == 0 ) --expDiff;
2762 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2765 else if ( expDiff < 0 ) {
2766 if ( bExp == 0x7FFF ) {
2767 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2768 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2770 if ( aExp == 0 ) ++expDiff;
2771 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2775 if ( aExp == 0x7FFF ) {
2776 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2777 return propagateFloatx80NaN( a, b );
2782 zSig0 = aSig + bSig;
2784 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2791 zSig0 = aSig + bSig;
2793 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2795 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2796 zSig0 |= LIT64( 0x8000000000000000 );
2800 roundAndPackFloatx80(
2801 roundData, zSign, zExp, zSig0, zSig1 );
2806 -------------------------------------------------------------------------------
2807 Returns the result of subtracting the absolute values of the extended
2808 double-precision floating-point values `a' and `b'. If `zSign' is true,
2809 the difference is negated before being returned. `zSign' is ignored if the
2810 result is a NaN. The subtraction is performed according to the IEC/IEEE
2811 Standard for Binary Floating-point Arithmetic.
2812 -------------------------------------------------------------------------------
2814 static floatx80 subFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2816 int32 aExp, bExp, zExp;
2817 bits64 aSig, bSig, zSig0, zSig1;
2821 aSig = extractFloatx80Frac( a );
2822 aExp = extractFloatx80Exp( a );
2823 bSig = extractFloatx80Frac( b );
2824 bExp = extractFloatx80Exp( b );
2825 expDiff = aExp - bExp;
2826 if ( 0 < expDiff ) goto aExpBigger;
2827 if ( expDiff < 0 ) goto bExpBigger;
2828 if ( aExp == 0x7FFF ) {
2829 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2830 return propagateFloatx80NaN( a, b );
2832 roundData->exception |= float_flag_invalid;
2833 z.low = floatx80_default_nan_low;
2834 z.high = floatx80_default_nan_high;
2843 if ( bSig < aSig ) goto aBigger;
2844 if ( aSig < bSig ) goto bBigger;
2845 return packFloatx80( roundData->mode == float_round_down, 0, 0 );
2847 if ( bExp == 0x7FFF ) {
2848 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2849 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2851 if ( aExp == 0 ) ++expDiff;
2852 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2854 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2857 goto normalizeRoundAndPack;
2859 if ( aExp == 0x7FFF ) {
2860 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2863 if ( bExp == 0 ) --expDiff;
2864 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2866 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2868 normalizeRoundAndPack:
2870 normalizeRoundAndPackFloatx80(
2871 roundData, zSign, zExp, zSig0, zSig1 );
2876 -------------------------------------------------------------------------------
2877 Returns the result of adding the extended double-precision floating-point
2878 values `a' and `b'. The operation is performed according to the IEC/IEEE
2879 Standard for Binary Floating-point Arithmetic.
2880 -------------------------------------------------------------------------------
2882 floatx80 floatx80_add( struct roundingData *roundData, floatx80 a, floatx80 b )
2886 aSign = extractFloatx80Sign( a );
2887 bSign = extractFloatx80Sign( b );
2888 if ( aSign == bSign ) {
2889 return addFloatx80Sigs( roundData, a, b, aSign );
2892 return subFloatx80Sigs( roundData, a, b, aSign );
2898 -------------------------------------------------------------------------------
2899 Returns the result of subtracting the extended double-precision floating-
2900 point values `a' and `b'. The operation is performed according to the
2901 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2902 -------------------------------------------------------------------------------
2904 floatx80 floatx80_sub( struct roundingData *roundData, floatx80 a, floatx80 b )
2908 aSign = extractFloatx80Sign( a );
2909 bSign = extractFloatx80Sign( b );
2910 if ( aSign == bSign ) {
2911 return subFloatx80Sigs( roundData, a, b, aSign );
2914 return addFloatx80Sigs( roundData, a, b, aSign );
2920 -------------------------------------------------------------------------------
2921 Returns the result of multiplying the extended double-precision floating-
2922 point values `a' and `b'. The operation is performed according to the
2923 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2924 -------------------------------------------------------------------------------
2926 floatx80 floatx80_mul( struct roundingData *roundData, floatx80 a, floatx80 b )
2928 flag aSign, bSign, zSign;
2929 int32 aExp, bExp, zExp;
2930 bits64 aSig, bSig, zSig0, zSig1;
2933 aSig = extractFloatx80Frac( a );
2934 aExp = extractFloatx80Exp( a );
2935 aSign = extractFloatx80Sign( a );
2936 bSig = extractFloatx80Frac( b );
2937 bExp = extractFloatx80Exp( b );
2938 bSign = extractFloatx80Sign( b );
2939 zSign = aSign ^ bSign;
2940 if ( aExp == 0x7FFF ) {
2941 if ( (bits64) ( aSig<<1 )
2942 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2943 return propagateFloatx80NaN( a, b );
2945 if ( ( bExp | bSig ) == 0 ) goto invalid;
2946 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2948 if ( bExp == 0x7FFF ) {
2949 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2950 if ( ( aExp | aSig ) == 0 ) {
2952 roundData->exception |= float_flag_invalid;
2953 z.low = floatx80_default_nan_low;
2954 z.high = floatx80_default_nan_high;
2958 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2961 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2962 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2965 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2966 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2968 zExp = aExp + bExp - 0x3FFE;
2969 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2970 if ( 0 < (sbits64) zSig0 ) {
2971 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2975 roundAndPackFloatx80(
2976 roundData, zSign, zExp, zSig0, zSig1 );
2981 -------------------------------------------------------------------------------
2982 Returns the result of dividing the extended double-precision floating-point
2983 value `a' by the corresponding value `b'. The operation is performed
2984 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2985 -------------------------------------------------------------------------------
2987 floatx80 floatx80_div( struct roundingData *roundData, floatx80 a, floatx80 b )
2989 flag aSign, bSign, zSign;
2990 int32 aExp, bExp, zExp;
2991 bits64 aSig, bSig, zSig0, zSig1;
2992 bits64 rem0, rem1, rem2, term0, term1, term2;
2995 aSig = extractFloatx80Frac( a );
2996 aExp = extractFloatx80Exp( a );
2997 aSign = extractFloatx80Sign( a );
2998 bSig = extractFloatx80Frac( b );
2999 bExp = extractFloatx80Exp( b );
3000 bSign = extractFloatx80Sign( b );
3001 zSign = aSign ^ bSign;
3002 if ( aExp == 0x7FFF ) {
3003 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3004 if ( bExp == 0x7FFF ) {
3005 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3008 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3010 if ( bExp == 0x7FFF ) {
3011 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3012 return packFloatx80( zSign, 0, 0 );
3016 if ( ( aExp | aSig ) == 0 ) {
3018 roundData->exception |= float_flag_invalid;
3019 z.low = floatx80_default_nan_low;
3020 z.high = floatx80_default_nan_high;
3024 roundData->exception |= float_flag_divbyzero;
3025 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3027 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3030 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3031 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3033 zExp = aExp - bExp + 0x3FFE;
3035 if ( bSig <= aSig ) {
3036 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3039 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3040 mul64To128( bSig, zSig0, &term0, &term1 );
3041 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3042 while ( (sbits64) rem0 < 0 ) {
3044 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3046 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3047 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3048 mul64To128( bSig, zSig1, &term1, &term2 );
3049 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3050 while ( (sbits64) rem1 < 0 ) {
3052 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3054 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3057 roundAndPackFloatx80(
3058 roundData, zSign, zExp, zSig0, zSig1 );
3063 -------------------------------------------------------------------------------
3064 Returns the remainder of the extended double-precision floating-point value
3065 `a' with respect to the corresponding value `b'. The operation is performed
3066 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3067 -------------------------------------------------------------------------------
3069 floatx80 floatx80_rem( struct roundingData *roundData, floatx80 a, floatx80 b )
3071 flag aSign, bSign, zSign;
3072 int32 aExp, bExp, expDiff;
3073 bits64 aSig0, aSig1, bSig;
3074 bits64 q, term0, term1, alternateASig0, alternateASig1;
3077 aSig0 = extractFloatx80Frac( a );
3078 aExp = extractFloatx80Exp( a );
3079 aSign = extractFloatx80Sign( a );
3080 bSig = extractFloatx80Frac( b );
3081 bExp = extractFloatx80Exp( b );
3082 bSign = extractFloatx80Sign( b );
3083 if ( aExp == 0x7FFF ) {
3084 if ( (bits64) ( aSig0<<1 )
3085 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3086 return propagateFloatx80NaN( a, b );
3090 if ( bExp == 0x7FFF ) {
3091 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3097 roundData->exception |= float_flag_invalid;
3098 z.low = floatx80_default_nan_low;
3099 z.high = floatx80_default_nan_high;
3103 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3106 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3107 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3109 bSig |= LIT64( 0x8000000000000000 );
3111 expDiff = aExp - bExp;
3113 if ( expDiff < 0 ) {
3114 if ( expDiff < -1 ) return a;
3115 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3118 q = ( bSig <= aSig0 );
3119 if ( q ) aSig0 -= bSig;
3121 while ( 0 < expDiff ) {
3122 q = estimateDiv128To64( aSig0, aSig1, bSig );
3123 q = ( 2 < q ) ? q - 2 : 0;
3124 mul64To128( bSig, q, &term0, &term1 );
3125 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3126 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3130 if ( 0 < expDiff ) {
3131 q = estimateDiv128To64( aSig0, aSig1, bSig );
3132 q = ( 2 < q ) ? q - 2 : 0;
3134 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3135 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3136 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3137 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3139 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3146 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3147 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3148 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3151 aSig0 = alternateASig0;
3152 aSig1 = alternateASig1;
3157 normalizeRoundAndPackFloatx80(
3158 roundData, zSign, bExp + expDiff, aSig0, aSig1 );
3163 -------------------------------------------------------------------------------
3164 Returns the square root of the extended double-precision floating-point
3165 value `a'. The operation is performed according to the IEC/IEEE Standard
3166 for Binary Floating-point Arithmetic.
3167 -------------------------------------------------------------------------------
3169 floatx80 floatx80_sqrt( struct roundingData *roundData, floatx80 a )
3173 bits64 aSig0, aSig1, zSig0, zSig1;
3174 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3175 bits64 shiftedRem0, shiftedRem1;
3178 aSig0 = extractFloatx80Frac( a );
3179 aExp = extractFloatx80Exp( a );
3180 aSign = extractFloatx80Sign( a );
3181 if ( aExp == 0x7FFF ) {
3182 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3183 if ( ! aSign ) return a;
3187 if ( ( aExp | aSig0 ) == 0 ) return a;
3189 roundData->exception |= float_flag_invalid;
3190 z.low = floatx80_default_nan_low;
3191 z.high = floatx80_default_nan_high;
3196 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3197 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3199 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3200 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3203 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3204 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3205 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3206 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3207 mul64To128( zSig0, zSig0, &term0, &term1 );
3208 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3209 while ( (sbits64) rem0 < 0 ) {
3211 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3213 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3215 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3216 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3217 if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3218 if ( zSig1 == 0 ) zSig1 = 1;
3219 mul64To128( zSig0, zSig1, &term1, &term2 );
3220 shortShift128Left( term1, term2, 1, &term1, &term2 );
3221 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3222 mul64To128( zSig1, zSig1, &term2, &term3 );
3223 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3224 while ( (sbits64) rem1 < 0 ) {
3226 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3229 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3231 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3234 roundAndPackFloatx80(
3235 roundData, 0, zExp, zSig0, zSig1 );
3240 -------------------------------------------------------------------------------
3241 Returns 1 if the extended double-precision floating-point value `a' is
3242 equal to the corresponding value `b', and 0 otherwise. The comparison is
3243 performed according to the IEC/IEEE Standard for Binary Floating-point
3245 -------------------------------------------------------------------------------
3247 flag floatx80_eq( floatx80 a, floatx80 b )
3250 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3251 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3252 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3253 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3255 if ( floatx80_is_signaling_nan( a )
3256 || floatx80_is_signaling_nan( b ) ) {
3257 float_raise( float_flag_invalid );
3263 && ( ( a.high == b.high )
3265 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3271 -------------------------------------------------------------------------------
3272 Returns 1 if the extended double-precision floating-point value `a' is
3273 less than or equal to the corresponding value `b', and 0 otherwise. The
3274 comparison is performed according to the IEC/IEEE Standard for Binary
3275 Floating-point Arithmetic.
3276 -------------------------------------------------------------------------------
3278 flag floatx80_le( floatx80 a, floatx80 b )
3282 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3283 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3284 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3285 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3287 float_raise( float_flag_invalid );
3290 aSign = extractFloatx80Sign( a );
3291 bSign = extractFloatx80Sign( b );
3292 if ( aSign != bSign ) {
3295 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3299 aSign ? le128( b.high, b.low, a.high, a.low )
3300 : le128( a.high, a.low, b.high, b.low );
3305 -------------------------------------------------------------------------------
3306 Returns 1 if the extended double-precision floating-point value `a' is
3307 less than the corresponding value `b', and 0 otherwise. The comparison
3308 is performed according to the IEC/IEEE Standard for Binary Floating-point
3310 -------------------------------------------------------------------------------
3312 flag floatx80_lt( floatx80 a, floatx80 b )
3316 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3317 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3318 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3319 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3321 float_raise( float_flag_invalid );
3324 aSign = extractFloatx80Sign( a );
3325 bSign = extractFloatx80Sign( b );
3326 if ( aSign != bSign ) {
3329 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3333 aSign ? lt128( b.high, b.low, a.high, a.low )
3334 : lt128( a.high, a.low, b.high, b.low );
3339 -------------------------------------------------------------------------------
3340 Returns 1 if the extended double-precision floating-point value `a' is equal
3341 to the corresponding value `b', and 0 otherwise. The invalid exception is
3342 raised if either operand is a NaN. Otherwise, the comparison is performed
3343 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3344 -------------------------------------------------------------------------------
3346 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3349 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3350 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3351 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3352 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3354 float_raise( float_flag_invalid );
3359 && ( ( a.high == b.high )
3361 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3367 -------------------------------------------------------------------------------
3368 Returns 1 if the extended double-precision floating-point value `a' is less
3369 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3370 do not cause an exception. Otherwise, the comparison is performed according
3371 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3372 -------------------------------------------------------------------------------
3374 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3378 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3379 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3380 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3381 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3383 /* Do nothing, even if NaN as we're quiet */
3386 aSign = extractFloatx80Sign( a );
3387 bSign = extractFloatx80Sign( b );
3388 if ( aSign != bSign ) {
3391 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3395 aSign ? le128( b.high, b.low, a.high, a.low )
3396 : le128( a.high, a.low, b.high, b.low );
3401 -------------------------------------------------------------------------------
3402 Returns 1 if the extended double-precision floating-point value `a' is less
3403 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3404 an exception. Otherwise, the comparison is performed according to the
3405 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3406 -------------------------------------------------------------------------------
3408 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3412 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3413 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3414 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3415 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3417 /* Do nothing, even if NaN as we're quiet */
3420 aSign = extractFloatx80Sign( a );
3421 bSign = extractFloatx80Sign( b );
3422 if ( aSign != bSign ) {
3425 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3429 aSign ? lt128( b.high, b.low, a.high, a.low )
3430 : lt128( a.high, a.low, b.high, b.low );