1 /* $OpenBSD: softfloat.c,v 1.3 2015/09/13 14:21:46 miod Exp $ */
2 /* $NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:07 bjh21 Exp $ */
3
4 /*
5 * This version hacked for use with gcc -msoft-float by bjh21.
6 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
7 * itself).
8 */
9
10 /*
11 * Things you may want to define:
12 *
13 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
14 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
15 * properly renamed.
16 */
17
18 /*
19 * This differs from the standard bits32/softfloat.c in that float64
20 * is defined to be a 64-bit integer rather than a structure. The
21 * structure is float64s, with translation between the two going via
22 * float64u.
23 */
24
25 /*
26 ===============================================================================
27
28 This C source file is part of the SoftFloat IEC/IEEE Floating-Point
29 Arithmetic Package, Release 2a.
30
31 Written by John R. Hauser. This work was made possible in part by the
32 International Computer Science Institute, located at Suite 600, 1947 Center
33 Street, Berkeley, California 94704. Funding was partially provided by the
34 National Science Foundation under grant MIP-9311980. The original version
35 of this code was written as part of a project to build a fixed-point vector
36 processor in collaboration with the University of California at Berkeley,
37 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
38 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
39 arithmetic/SoftFloat.html'.
40
41 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
42 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
43 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
44 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
45 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
46
47 Derivative works are acceptable, even for commercial purposes, so long as
48 (1) they include prominent notice that the work is derivative, and (2) they
49 include prominent notice akin to these four paragraphs for those parts of
50 this code that are retained.
51
52 ===============================================================================
53 */
54
55 #ifdef SOFTFLOAT_FOR_GCC
56 #include "softfloat-for-gcc.h"
57 #endif
58
59 #include "milieu.h"
60 #include <softfloat.h>
61
62 /*
63 * Conversions between floats as stored in memory and floats as
64 * SoftFloat uses them
65 */
66 #ifndef FLOAT64_DEMANGLE
67 #define FLOAT64_DEMANGLE(a) (a)
68 #endif
69 #ifndef FLOAT64_MANGLE
70 #define FLOAT64_MANGLE(a) (a)
71 #endif
72
73 /*
74 -------------------------------------------------------------------------------
75 Primitive arithmetic functions, including multi-word arithmetic, and
76 division and square root approximations. (Can be specialized to target if
77 desired.)
78 -------------------------------------------------------------------------------
79 */
80 #include "softfloat-macros.h"
81
82 /*
83 -------------------------------------------------------------------------------
84 Functions and definitions to determine: (1) whether tininess for underflow
85 is detected before or after rounding by default, (2) what (if anything)
86 happens when exceptions are raised, (3) how signaling NaNs are distinguished
87 from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
88 are propagated from function inputs to output. These details are target-
89 specific.
90 -------------------------------------------------------------------------------
91 */
92 #include "softfloat-specialize.h"
93
94 /*
95 -------------------------------------------------------------------------------
96 Floating-point rounding mode and exception flags.
97 -------------------------------------------------------------------------------
98 */
99 fp_rnd float_rounding_mode = float_round_nearest_even;
100 fp_except float_exception_flags = 0;
101
102 /*
103 -------------------------------------------------------------------------------
104 Returns the fraction bits of the single-precision floating-point value `a'.
105 -------------------------------------------------------------------------------
106 */
extractFloat32Frac(float32 a)107 INLINE bits32 extractFloat32Frac( float32 a )
108 {
109
110 return a & 0x007FFFFF;
111
112 }
113
114 /*
115 -------------------------------------------------------------------------------
116 Returns the exponent bits of the single-precision floating-point value `a'.
117 -------------------------------------------------------------------------------
118 */
extractFloat32Exp(float32 a)119 INLINE int16 extractFloat32Exp( float32 a )
120 {
121
122 return ( a>>23 ) & 0xFF;
123
124 }
125
126 /*
127 -------------------------------------------------------------------------------
128 Returns the sign bit of the single-precision floating-point value `a'.
129 -------------------------------------------------------------------------------
130 */
extractFloat32Sign(float32 a)131 INLINE flag extractFloat32Sign( float32 a )
132 {
133
134 return a>>31;
135
136 }
137
138 /*
139 -------------------------------------------------------------------------------
140 Normalizes the subnormal single-precision floating-point value represented
141 by the denormalized significand `aSig'. The normalized exponent and
142 significand are stored at the locations pointed to by `zExpPtr' and
143 `zSigPtr', respectively.
144 -------------------------------------------------------------------------------
145 */
146 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)147 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
148 {
149 int8 shiftCount;
150
151 shiftCount = countLeadingZeros32( aSig ) - 8;
152 *zSigPtr = aSig<<shiftCount;
153 *zExpPtr = 1 - shiftCount;
154
155 }
156
157 /*
158 -------------------------------------------------------------------------------
159 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
160 single-precision floating-point value, returning the result. After being
161 shifted into the proper positions, the three fields are simply added
162 together to form the result. This means that any integer portion of `zSig'
163 will be added into the exponent. Since a properly normalized significand
164 will have an integer portion equal to 1, the `zExp' input should be 1 less
165 than the desired result exponent whenever `zSig' is a complete, normalized
166 significand.
167 -------------------------------------------------------------------------------
168 */
packFloat32(flag zSign,int16 zExp,bits32 zSig)169 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
170 {
171
172 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
173
174 }
175
176 /*
177 -------------------------------------------------------------------------------
178 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
179 and significand `zSig', and returns the proper single-precision floating-
180 point value corresponding to the abstract input. Ordinarily, the abstract
181 value is simply rounded and packed into the single-precision format, with
182 the inexact exception raised if the abstract input cannot be represented
183 exactly. However, if the abstract value is too large, the overflow and
184 inexact exceptions are raised and an infinity or maximal finite value is
185 returned. If the abstract value is too small, the input value is rounded to
186 a subnormal number, and the underflow and inexact exceptions are raised if
187 the abstract input cannot be represented exactly as a subnormal single-
188 precision floating-point number.
189 The input significand `zSig' has its binary point between bits 30
190 and 29, which is 7 bits to the left of the usual location. This shifted
191 significand must be normalized or smaller. If `zSig' is not normalized,
192 `zExp' must be 0; in that case, the result returned is a subnormal number,
193 and it must not require rounding. In the usual case that `zSig' is
194 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
195 The handling of underflow and overflow follows the IEC/IEEE Standard for
196 Binary Floating-Point Arithmetic.
197 -------------------------------------------------------------------------------
198 */
roundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)199 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
200 {
201 int8 roundingMode;
202 flag roundNearestEven;
203 int8 roundIncrement, roundBits;
204 flag isTiny;
205
206 roundingMode = float_rounding_mode;
207 roundNearestEven = roundingMode == float_round_nearest_even;
208 roundIncrement = 0x40;
209 if ( ! roundNearestEven ) {
210 if ( roundingMode == float_round_to_zero ) {
211 roundIncrement = 0;
212 }
213 else {
214 roundIncrement = 0x7F;
215 if ( zSign ) {
216 if ( roundingMode == float_round_up ) roundIncrement = 0;
217 }
218 else {
219 if ( roundingMode == float_round_down ) roundIncrement = 0;
220 }
221 }
222 }
223 roundBits = zSig & 0x7F;
224 if ( 0xFD <= (bits16) zExp ) {
225 if ( ( 0xFD < zExp )
226 || ( ( zExp == 0xFD )
227 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
228 ) {
229 float_raise( float_flag_overflow | float_flag_inexact );
230 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
231 }
232 if ( zExp < 0 ) {
233 isTiny =
234 ( float_detect_tininess == float_tininess_before_rounding )
235 || ( zExp < -1 )
236 || ( zSig + roundIncrement < 0x80000000 );
237 shift32RightJamming( zSig, - zExp, &zSig );
238 zExp = 0;
239 roundBits = zSig & 0x7F;
240 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
241 }
242 }
243 if ( roundBits ) float_exception_flags |= float_flag_inexact;
244 zSig = ( zSig + roundIncrement )>>7;
245 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
246 if ( zSig == 0 ) zExp = 0;
247 return packFloat32( zSign, zExp, zSig );
248
249 }
250
251 /*
252 -------------------------------------------------------------------------------
253 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
254 and significand `zSig', and returns the proper single-precision floating-
255 point value corresponding to the abstract input. This routine is just like
256 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
257 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
258 floating-point exponent.
259 -------------------------------------------------------------------------------
260 */
261 static float32
normalizeRoundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)262 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
263 {
264 int8 shiftCount;
265
266 shiftCount = countLeadingZeros32( zSig ) - 1;
267 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
268
269 }
270
271 /*
272 -------------------------------------------------------------------------------
273 Returns the least-significant 32 fraction bits of the double-precision
274 floating-point value `a'.
275 -------------------------------------------------------------------------------
276 */
extractFloat64Frac1(float64 a)277 INLINE bits32 extractFloat64Frac1( float64 a )
278 {
279
280 return FLOAT64_DEMANGLE(a) & LIT64( 0x00000000FFFFFFFF );
281
282 }
283
284 /*
285 -------------------------------------------------------------------------------
286 Returns the most-significant 20 fraction bits of the double-precision
287 floating-point value `a'.
288 -------------------------------------------------------------------------------
289 */
extractFloat64Frac0(float64 a)290 INLINE bits32 extractFloat64Frac0( float64 a )
291 {
292
293 return ( FLOAT64_DEMANGLE(a)>>32 ) & 0x000FFFFF;
294
295 }
296
297 /*
298 -------------------------------------------------------------------------------
299 Returns the exponent bits of the double-precision floating-point value `a'.
300 -------------------------------------------------------------------------------
301 */
extractFloat64Exp(float64 a)302 INLINE int16 extractFloat64Exp( float64 a )
303 {
304
305 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
306
307 }
308
309 /*
310 -------------------------------------------------------------------------------
311 Returns the sign bit of the double-precision floating-point value `a'.
312 -------------------------------------------------------------------------------
313 */
extractFloat64Sign(float64 a)314 INLINE flag extractFloat64Sign( float64 a )
315 {
316
317 return FLOAT64_DEMANGLE(a)>>63;
318
319 }
320
321 /*
322 -------------------------------------------------------------------------------
323 Normalizes the subnormal double-precision floating-point value represented
324 by the denormalized significand formed by the concatenation of `aSig0' and
325 `aSig1'. The normalized exponent is stored at the location pointed to by
326 `zExpPtr'. The most significant 21 bits of the normalized significand are
327 stored at the location pointed to by `zSig0Ptr', and the least significant
328 32 bits of the normalized significand are stored at the location pointed to
329 by `zSig1Ptr'.
330 -------------------------------------------------------------------------------
331 */
332 static void
normalizeFloat64Subnormal(bits32 aSig0,bits32 aSig1,int16 * zExpPtr,bits32 * zSig0Ptr,bits32 * zSig1Ptr)333 normalizeFloat64Subnormal(
334 bits32 aSig0,
335 bits32 aSig1,
336 int16 *zExpPtr,
337 bits32 *zSig0Ptr,
338 bits32 *zSig1Ptr
339 )
340 {
341 int8 shiftCount;
342
343 if ( aSig0 == 0 ) {
344 shiftCount = countLeadingZeros32( aSig1 ) - 11;
345 if ( shiftCount < 0 ) {
346 *zSig0Ptr = aSig1>>( - shiftCount );
347 *zSig1Ptr = aSig1<<( shiftCount & 31 );
348 }
349 else {
350 *zSig0Ptr = aSig1<<shiftCount;
351 *zSig1Ptr = 0;
352 }
353 *zExpPtr = - shiftCount - 31;
354 }
355 else {
356 shiftCount = countLeadingZeros32( aSig0 ) - 11;
357 shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
358 *zExpPtr = 1 - shiftCount;
359 }
360
361 }
362
363 /*
364 -------------------------------------------------------------------------------
365 Packs the sign `zSign', the exponent `zExp', and the significand formed by
366 the concatenation of `zSig0' and `zSig1' into a double-precision floating-
367 point value, returning the result. After being shifted into the proper
368 positions, the three fields `zSign', `zExp', and `zSig0' are simply added
369 together to form the most significant 32 bits of the result. This means
370 that any integer portion of `zSig0' will be added into the exponent. Since
371 a properly normalized significand will have an integer portion equal to 1,
372 the `zExp' input should be 1 less than the desired result exponent whenever
373 `zSig0' and `zSig1' concatenated form a complete, normalized significand.
374 -------------------------------------------------------------------------------
375 */
376 INLINE float64
packFloat64(flag zSign,int16 zExp,bits32 zSig0,bits32 zSig1)377 packFloat64( flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
378 {
379
380 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
381 ( ( (bits64) zExp )<<52 ) +
382 ( ( (bits64) zSig0 )<<32 ) + zSig1 );
383
384
385 }
386
387 /*
388 -------------------------------------------------------------------------------
389 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
390 and extended significand formed by the concatenation of `zSig0', `zSig1',
391 and `zSig2', and returns the proper double-precision floating-point value
392 corresponding to the abstract input. Ordinarily, the abstract value is
393 simply rounded and packed into the double-precision format, with the inexact
394 exception raised if the abstract input cannot be represented exactly.
395 However, if the abstract value is too large, the overflow and inexact
396 exceptions are raised and an infinity or maximal finite value is returned.
397 If the abstract value is too small, the input value is rounded to a
398 subnormal number, and the underflow and inexact exceptions are raised if the
399 abstract input cannot be represented exactly as a subnormal double-precision
400 floating-point number.
401 The input significand must be normalized or smaller. If the input
402 significand is not normalized, `zExp' must be 0; in that case, the result
403 returned is a subnormal number, and it must not require rounding. In the
404 usual case that the input significand is normalized, `zExp' must be 1 less
405 than the ``true'' floating-point exponent. The handling of underflow and
406 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
407 -------------------------------------------------------------------------------
408 */
409 static float64
roundAndPackFloat64(flag zSign,int16 zExp,bits32 zSig0,bits32 zSig1,bits32 zSig2)410 roundAndPackFloat64(
411 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1, bits32 zSig2 )
412 {
413 int8 roundingMode;
414 flag roundNearestEven, increment, isTiny;
415
416 roundingMode = float_rounding_mode;
417 roundNearestEven = ( roundingMode == float_round_nearest_even );
418 increment = ( (sbits32) zSig2 < 0 );
419 if ( ! roundNearestEven ) {
420 if ( roundingMode == float_round_to_zero ) {
421 increment = 0;
422 }
423 else {
424 if ( zSign ) {
425 increment = ( roundingMode == float_round_down ) && zSig2;
426 }
427 else {
428 increment = ( roundingMode == float_round_up ) && zSig2;
429 }
430 }
431 }
432 if ( 0x7FD <= (bits16) zExp ) {
433 if ( ( 0x7FD < zExp )
434 || ( ( zExp == 0x7FD )
435 && eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0, zSig1 )
436 && increment
437 )
438 ) {
439 float_raise( float_flag_overflow | float_flag_inexact );
440 if ( ( roundingMode == float_round_to_zero )
441 || ( zSign && ( roundingMode == float_round_up ) )
442 || ( ! zSign && ( roundingMode == float_round_down ) )
443 ) {
444 return packFloat64( zSign, 0x7FE, 0x000FFFFF, 0xFFFFFFFF );
445 }
446 return packFloat64( zSign, 0x7FF, 0, 0 );
447 }
448 if ( zExp < 0 ) {
449 isTiny =
450 ( float_detect_tininess == float_tininess_before_rounding )
451 || ( zExp < -1 )
452 || ! increment
453 || lt64( zSig0, zSig1, 0x001FFFFF, 0xFFFFFFFF );
454 shift64ExtraRightJamming(
455 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
456 zExp = 0;
457 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
458 if ( roundNearestEven ) {
459 increment = ( (sbits32) zSig2 < 0 );
460 }
461 else {
462 if ( zSign ) {
463 increment = ( roundingMode == float_round_down ) && zSig2;
464 }
465 else {
466 increment = ( roundingMode == float_round_up ) && zSig2;
467 }
468 }
469 }
470 }
471 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
472 if ( increment ) {
473 add64( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
474 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
475 }
476 else {
477 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
478 }
479 return packFloat64( zSign, zExp, zSig0, zSig1 );
480
481 }
482
483 /*
484 -------------------------------------------------------------------------------
485 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
486 and significand formed by the concatenation of `zSig0' and `zSig1', and
487 returns the proper double-precision floating-point value corresponding
488 to the abstract input. This routine is just like `roundAndPackFloat64'
489 except that the input significand has fewer bits and does not have to be
490 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
491 point exponent.
492 -------------------------------------------------------------------------------
493 */
494 static float64
normalizeRoundAndPackFloat64(flag zSign,int16 zExp,bits32 zSig0,bits32 zSig1)495 normalizeRoundAndPackFloat64(
496 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
497 {
498 int8 shiftCount;
499 bits32 zSig2;
500
501 if ( zSig0 == 0 ) {
502 zSig0 = zSig1;
503 zSig1 = 0;
504 zExp -= 32;
505 }
506 shiftCount = countLeadingZeros32( zSig0 ) - 11;
507 if ( 0 <= shiftCount ) {
508 zSig2 = 0;
509 shortShift64Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
510 }
511 else {
512 shift64ExtraRightJamming(
513 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
514 }
515 zExp -= shiftCount;
516 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
517
518 }
519
520 /*
521 -------------------------------------------------------------------------------
522 Returns the result of converting the 32-bit two's complement integer `a' to
523 the single-precision floating-point format. The conversion is performed
524 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
525 -------------------------------------------------------------------------------
526 */
int32_to_float32(int32 a)527 float32 int32_to_float32( int32 a )
528 {
529 flag zSign;
530
531 if ( a == 0 ) return 0;
532 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
533 zSign = ( a < 0 );
534 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
535
536 }
537
538 /*
539 -------------------------------------------------------------------------------
540 Returns the result of converting the 32-bit two's complement integer `a' to
541 the double-precision floating-point format. The conversion is performed
542 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
543 -------------------------------------------------------------------------------
544 */
int32_to_float64(int32 a)545 float64 int32_to_float64( int32 a )
546 {
547 flag zSign;
548 bits32 absA;
549 int8 shiftCount;
550 bits32 zSig0, zSig1;
551
552 if ( a == 0 ) return packFloat64( 0, 0, 0, 0 );
553 zSign = ( a < 0 );
554 absA = zSign ? - a : a;
555 shiftCount = countLeadingZeros32( absA ) - 11;
556 if ( 0 <= shiftCount ) {
557 zSig0 = absA<<shiftCount;
558 zSig1 = 0;
559 }
560 else {
561 shift64Right( absA, 0, - shiftCount, &zSig0, &zSig1 );
562 }
563 return packFloat64( zSign, 0x412 - shiftCount, zSig0, zSig1 );
564
565 }
566
567 #ifndef SOFTFLOAT_FOR_GCC
568 /*
569 -------------------------------------------------------------------------------
570 Returns the result of converting the single-precision floating-point value
571 `a' to the 32-bit two's complement integer format. The conversion is
572 performed according to the IEC/IEEE Standard for Binary Floating-Point
573 Arithmetic---which means in particular that the conversion is rounded
574 according to the current rounding mode. If `a' is a NaN, the largest
575 positive integer is returned. Otherwise, if the conversion overflows, the
576 largest integer with the same sign as `a' is returned.
577 -------------------------------------------------------------------------------
578 */
float32_to_int32(float32 a)579 int32 float32_to_int32( float32 a )
580 {
581 flag aSign;
582 int16 aExp, shiftCount;
583 bits32 aSig, aSigExtra;
584 int32 z;
585 int8 roundingMode;
586
587 aSig = extractFloat32Frac( a );
588 aExp = extractFloat32Exp( a );
589 aSign = extractFloat32Sign( a );
590 shiftCount = aExp - 0x96;
591 if ( 0 <= shiftCount ) {
592 if ( 0x9E <= aExp ) {
593 if ( a != 0xCF000000 ) {
594 float_raise( float_flag_invalid );
595 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
596 return 0x7FFFFFFF;
597 }
598 }
599 return (sbits32) 0x80000000;
600 }
601 z = ( aSig | 0x00800000 )<<shiftCount;
602 if ( aSign ) z = - z;
603 }
604 else {
605 if ( aExp < 0x7E ) {
606 aSigExtra = aExp | aSig;
607 z = 0;
608 }
609 else {
610 aSig |= 0x00800000;
611 aSigExtra = aSig<<( shiftCount & 31 );
612 z = aSig>>( - shiftCount );
613 }
614 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
615 roundingMode = float_rounding_mode;
616 if ( roundingMode == float_round_nearest_even ) {
617 if ( (sbits32) aSigExtra < 0 ) {
618 ++z;
619 if ( (bits32) ( aSigExtra<<1 ) == 0 ) z &= ~1;
620 }
621 if ( aSign ) z = - z;
622 }
623 else {
624 aSigExtra = ( aSigExtra != 0 );
625 if ( aSign ) {
626 z += ( roundingMode == float_round_down ) & aSigExtra;
627 z = - z;
628 }
629 else {
630 z += ( roundingMode == float_round_up ) & aSigExtra;
631 }
632 }
633 }
634 return z;
635
636 }
637 #endif
638
639 /*
640 -------------------------------------------------------------------------------
641 Returns the result of converting the single-precision floating-point value
642 `a' to the 32-bit two's complement integer format. The conversion is
643 performed according to the IEC/IEEE Standard for Binary Floating-Point
644 Arithmetic, except that the conversion is always rounded toward zero.
645 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
646 the conversion overflows, the largest integer with the same sign as `a' is
647 returned.
648 -------------------------------------------------------------------------------
649 */
float32_to_int32_round_to_zero(float32 a)650 int32 float32_to_int32_round_to_zero( float32 a )
651 {
652 flag aSign;
653 int16 aExp, shiftCount;
654 bits32 aSig;
655 int32 z;
656
657 aSig = extractFloat32Frac( a );
658 aExp = extractFloat32Exp( a );
659 aSign = extractFloat32Sign( a );
660 shiftCount = aExp - 0x9E;
661 if ( 0 <= shiftCount ) {
662 if ( a != 0xCF000000 ) {
663 float_raise( float_flag_invalid );
664 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
665 }
666 return (sbits32) 0x80000000;
667 }
668 else if ( aExp <= 0x7E ) {
669 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
670 return 0;
671 }
672 aSig = ( aSig | 0x00800000 )<<8;
673 z = aSig>>( - shiftCount );
674 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
675 float_exception_flags |= float_flag_inexact;
676 }
677 if ( aSign ) z = - z;
678 return z;
679
680 }
681
682 /*
683 -------------------------------------------------------------------------------
684 Returns the result of converting the single-precision floating-point value
685 `a' to the double-precision floating-point format. The conversion is
686 performed according to the IEC/IEEE Standard for Binary Floating-Point
687 Arithmetic.
688 -------------------------------------------------------------------------------
689 */
float32_to_float64(float32 a)690 float64 float32_to_float64( float32 a )
691 {
692 flag aSign;
693 int16 aExp;
694 bits32 aSig, zSig0, zSig1;
695
696 aSig = extractFloat32Frac( a );
697 aExp = extractFloat32Exp( a );
698 aSign = extractFloat32Sign( a );
699 if ( aExp == 0xFF ) {
700 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
701 return packFloat64( aSign, 0x7FF, 0, 0 );
702 }
703 if ( aExp == 0 ) {
704 if ( aSig == 0 ) return packFloat64( aSign, 0, 0, 0 );
705 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
706 --aExp;
707 }
708 shift64Right( aSig, 0, 3, &zSig0, &zSig1 );
709 return packFloat64( aSign, aExp + 0x380, zSig0, zSig1 );
710
711 }
712
713 #ifndef SOFTFLOAT_FOR_GCC
714 /*
715 -------------------------------------------------------------------------------
716 Rounds the single-precision floating-point value `a' to an integer,
717 and returns the result as a single-precision floating-point value. The
718 operation is performed according to the IEC/IEEE Standard for Binary
719 Floating-Point Arithmetic.
720 -------------------------------------------------------------------------------
721 */
float32_round_to_int(float32 a)722 float32 float32_round_to_int( float32 a )
723 {
724 flag aSign;
725 int16 aExp;
726 bits32 lastBitMask, roundBitsMask;
727 int8 roundingMode;
728 float32 z;
729
730 aExp = extractFloat32Exp( a );
731 if ( 0x96 <= aExp ) {
732 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
733 return propagateFloat32NaN( a, a );
734 }
735 return a;
736 }
737 if ( aExp <= 0x7E ) {
738 if ( (bits32) ( a<<1 ) == 0 ) return a;
739 float_exception_flags |= float_flag_inexact;
740 aSign = extractFloat32Sign( a );
741 switch ( float_rounding_mode ) {
742 case float_round_nearest_even:
743 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
744 return packFloat32( aSign, 0x7F, 0 );
745 }
746 break;
747 case float_round_to_zero:
748 break;
749 case float_round_down:
750 return aSign ? 0xBF800000 : 0;
751 case float_round_up:
752 return aSign ? 0x80000000 : 0x3F800000;
753 }
754 return packFloat32( aSign, 0, 0 );
755 }
756 lastBitMask = 1;
757 lastBitMask <<= 0x96 - aExp;
758 roundBitsMask = lastBitMask - 1;
759 z = a;
760 roundingMode = float_rounding_mode;
761 if ( roundingMode == float_round_nearest_even ) {
762 z += lastBitMask>>1;
763 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
764 }
765 else if ( roundingMode != float_round_to_zero ) {
766 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
767 z += roundBitsMask;
768 }
769 }
770 z &= ~ roundBitsMask;
771 if ( z != a ) float_exception_flags |= float_flag_inexact;
772 return z;
773
774 }
775 #endif
776
777 /*
778 -------------------------------------------------------------------------------
779 Returns the result of adding the absolute values of the single-precision
780 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
781 before being returned. `zSign' is ignored if the result is a NaN.
782 The addition is performed according to the IEC/IEEE Standard for Binary
783 Floating-Point Arithmetic.
784 -------------------------------------------------------------------------------
785 */
addFloat32Sigs(float32 a,float32 b,flag zSign)786 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
787 {
788 int16 aExp, bExp, zExp;
789 bits32 aSig, bSig, zSig;
790 int16 expDiff;
791
792 aSig = extractFloat32Frac( a );
793 aExp = extractFloat32Exp( a );
794 bSig = extractFloat32Frac( b );
795 bExp = extractFloat32Exp( b );
796 expDiff = aExp - bExp;
797 aSig <<= 6;
798 bSig <<= 6;
799 if ( 0 < expDiff ) {
800 if ( aExp == 0xFF ) {
801 if ( aSig ) return propagateFloat32NaN( a, b );
802 return a;
803 }
804 if ( bExp == 0 ) {
805 --expDiff;
806 }
807 else {
808 bSig |= 0x20000000;
809 }
810 shift32RightJamming( bSig, expDiff, &bSig );
811 zExp = aExp;
812 }
813 else if ( expDiff < 0 ) {
814 if ( bExp == 0xFF ) {
815 if ( bSig ) return propagateFloat32NaN( a, b );
816 return packFloat32( zSign, 0xFF, 0 );
817 }
818 if ( aExp == 0 ) {
819 ++expDiff;
820 }
821 else {
822 aSig |= 0x20000000;
823 }
824 shift32RightJamming( aSig, - expDiff, &aSig );
825 zExp = bExp;
826 }
827 else {
828 if ( aExp == 0xFF ) {
829 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
830 return a;
831 }
832 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
833 zSig = 0x40000000 + aSig + bSig;
834 zExp = aExp;
835 goto roundAndPack;
836 }
837 aSig |= 0x20000000;
838 zSig = ( aSig + bSig )<<1;
839 --zExp;
840 if ( (sbits32) zSig < 0 ) {
841 zSig = aSig + bSig;
842 ++zExp;
843 }
844 roundAndPack:
845 return roundAndPackFloat32( zSign, zExp, zSig );
846
847 }
848
849 /*
850 -------------------------------------------------------------------------------
851 Returns the result of subtracting the absolute values of the single-
852 precision floating-point values `a' and `b'. If `zSign' is 1, the
853 difference is negated before being returned. `zSign' is ignored if the
854 result is a NaN. The subtraction is performed according to the IEC/IEEE
855 Standard for Binary Floating-Point Arithmetic.
856 -------------------------------------------------------------------------------
857 */
subFloat32Sigs(float32 a,float32 b,flag zSign)858 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
859 {
860 int16 aExp, bExp, zExp;
861 bits32 aSig, bSig, zSig;
862 int16 expDiff;
863
864 aSig = extractFloat32Frac( a );
865 aExp = extractFloat32Exp( a );
866 bSig = extractFloat32Frac( b );
867 bExp = extractFloat32Exp( b );
868 expDiff = aExp - bExp;
869 aSig <<= 7;
870 bSig <<= 7;
871 if ( 0 < expDiff ) goto aExpBigger;
872 if ( expDiff < 0 ) goto bExpBigger;
873 if ( aExp == 0xFF ) {
874 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
875 float_raise( float_flag_invalid );
876 return float32_default_nan;
877 }
878 if ( aExp == 0 ) {
879 aExp = 1;
880 bExp = 1;
881 }
882 if ( bSig < aSig ) goto aBigger;
883 if ( aSig < bSig ) goto bBigger;
884 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
885 bExpBigger:
886 if ( bExp == 0xFF ) {
887 if ( bSig ) return propagateFloat32NaN( a, b );
888 return packFloat32( zSign ^ 1, 0xFF, 0 );
889 }
890 if ( aExp == 0 ) {
891 ++expDiff;
892 }
893 else {
894 aSig |= 0x40000000;
895 }
896 shift32RightJamming( aSig, - expDiff, &aSig );
897 bSig |= 0x40000000;
898 bBigger:
899 zSig = bSig - aSig;
900 zExp = bExp;
901 zSign ^= 1;
902 goto normalizeRoundAndPack;
903 aExpBigger:
904 if ( aExp == 0xFF ) {
905 if ( aSig ) return propagateFloat32NaN( a, b );
906 return a;
907 }
908 if ( bExp == 0 ) {
909 --expDiff;
910 }
911 else {
912 bSig |= 0x40000000;
913 }
914 shift32RightJamming( bSig, expDiff, &bSig );
915 aSig |= 0x40000000;
916 aBigger:
917 zSig = aSig - bSig;
918 zExp = aExp;
919 normalizeRoundAndPack:
920 --zExp;
921 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
922
923 }
924
925 /*
926 -------------------------------------------------------------------------------
927 Returns the result of adding the single-precision floating-point values `a'
928 and `b'. The operation is performed according to the IEC/IEEE Standard for
929 Binary Floating-Point Arithmetic.
930 -------------------------------------------------------------------------------
931 */
float32_add(float32 a,float32 b)932 float32 float32_add( float32 a, float32 b )
933 {
934 flag aSign, bSign;
935
936 aSign = extractFloat32Sign( a );
937 bSign = extractFloat32Sign( b );
938 if ( aSign == bSign ) {
939 return addFloat32Sigs( a, b, aSign );
940 }
941 else {
942 return subFloat32Sigs( a, b, aSign );
943 }
944
945 }
946
947 /*
948 -------------------------------------------------------------------------------
949 Returns the result of subtracting the single-precision floating-point values
950 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
951 for Binary Floating-Point Arithmetic.
952 -------------------------------------------------------------------------------
953 */
float32_sub(float32 a,float32 b)954 float32 float32_sub( float32 a, float32 b )
955 {
956 flag aSign, bSign;
957
958 aSign = extractFloat32Sign( a );
959 bSign = extractFloat32Sign( b );
960 if ( aSign == bSign ) {
961 return subFloat32Sigs( a, b, aSign );
962 }
963 else {
964 return addFloat32Sigs( a, b, aSign );
965 }
966
967 }
968
969 /*
970 -------------------------------------------------------------------------------
971 Returns the result of multiplying the single-precision floating-point values
972 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
973 for Binary Floating-Point Arithmetic.
974 -------------------------------------------------------------------------------
975 */
float32_mul(float32 a,float32 b)976 float32 float32_mul( float32 a, float32 b )
977 {
978 flag aSign, bSign, zSign;
979 int16 aExp, bExp, zExp;
980 bits32 aSig, bSig, zSig0, zSig1;
981
982 aSig = extractFloat32Frac( a );
983 aExp = extractFloat32Exp( a );
984 aSign = extractFloat32Sign( a );
985 bSig = extractFloat32Frac( b );
986 bExp = extractFloat32Exp( b );
987 bSign = extractFloat32Sign( b );
988 zSign = aSign ^ bSign;
989 if ( aExp == 0xFF ) {
990 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
991 return propagateFloat32NaN( a, b );
992 }
993 if ( ( bExp | bSig ) == 0 ) {
994 float_raise( float_flag_invalid );
995 return float32_default_nan;
996 }
997 return packFloat32( zSign, 0xFF, 0 );
998 }
999 if ( bExp == 0xFF ) {
1000 if ( bSig ) return propagateFloat32NaN( a, b );
1001 if ( ( aExp | aSig ) == 0 ) {
1002 float_raise( float_flag_invalid );
1003 return float32_default_nan;
1004 }
1005 return packFloat32( zSign, 0xFF, 0 );
1006 }
1007 if ( aExp == 0 ) {
1008 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1009 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1010 }
1011 if ( bExp == 0 ) {
1012 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1013 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1014 }
1015 zExp = aExp + bExp - 0x7F;
1016 aSig = ( aSig | 0x00800000 )<<7;
1017 bSig = ( bSig | 0x00800000 )<<8;
1018 mul32To64( aSig, bSig, &zSig0, &zSig1 );
1019 zSig0 |= ( zSig1 != 0 );
1020 if ( 0 <= (sbits32) ( zSig0<<1 ) ) {
1021 zSig0 <<= 1;
1022 --zExp;
1023 }
1024 return roundAndPackFloat32( zSign, zExp, zSig0 );
1025
1026 }
1027
1028 /*
1029 -------------------------------------------------------------------------------
1030 Returns the result of dividing the single-precision floating-point value `a'
1031 by the corresponding value `b'. The operation is performed according to the
1032 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1033 -------------------------------------------------------------------------------
1034 */
float32_div(float32 a,float32 b)1035 float32 float32_div( float32 a, float32 b )
1036 {
1037 flag aSign, bSign, zSign;
1038 int16 aExp, bExp, zExp;
1039 bits32 aSig, bSig, zSig, rem0, rem1, term0, term1;
1040
1041 aSig = extractFloat32Frac( a );
1042 aExp = extractFloat32Exp( a );
1043 aSign = extractFloat32Sign( a );
1044 bSig = extractFloat32Frac( b );
1045 bExp = extractFloat32Exp( b );
1046 bSign = extractFloat32Sign( b );
1047 zSign = aSign ^ bSign;
1048 if ( aExp == 0xFF ) {
1049 if ( aSig ) return propagateFloat32NaN( a, b );
1050 if ( bExp == 0xFF ) {
1051 if ( bSig ) return propagateFloat32NaN( a, b );
1052 float_raise( float_flag_invalid );
1053 return float32_default_nan;
1054 }
1055 return packFloat32( zSign, 0xFF, 0 );
1056 }
1057 if ( bExp == 0xFF ) {
1058 if ( bSig ) return propagateFloat32NaN( a, b );
1059 return packFloat32( zSign, 0, 0 );
1060 }
1061 if ( bExp == 0 ) {
1062 if ( bSig == 0 ) {
1063 if ( ( aExp | aSig ) == 0 ) {
1064 float_raise( float_flag_invalid );
1065 return float32_default_nan;
1066 }
1067 float_raise( float_flag_divbyzero );
1068 return packFloat32( zSign, 0xFF, 0 );
1069 }
1070 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1071 }
1072 if ( aExp == 0 ) {
1073 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1074 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1075 }
1076 zExp = aExp - bExp + 0x7D;
1077 aSig = ( aSig | 0x00800000 )<<7;
1078 bSig = ( bSig | 0x00800000 )<<8;
1079 if ( bSig <= ( aSig + aSig ) ) {
1080 aSig >>= 1;
1081 ++zExp;
1082 }
1083 zSig = estimateDiv64To32( aSig, 0, bSig );
1084 if ( ( zSig & 0x3F ) <= 2 ) {
1085 mul32To64( bSig, zSig, &term0, &term1 );
1086 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1087 while ( (sbits32) rem0 < 0 ) {
1088 --zSig;
1089 add64( rem0, rem1, 0, bSig, &rem0, &rem1 );
1090 }
1091 zSig |= ( rem1 != 0 );
1092 }
1093 return roundAndPackFloat32( zSign, zExp, zSig );
1094
1095 }
1096
1097 #ifndef SOFTFLOAT_FOR_GCC
1098 /*
1099 -------------------------------------------------------------------------------
1100 Returns the remainder of the single-precision floating-point value `a'
1101 with respect to the corresponding value `b'. The operation is performed
1102 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1103 -------------------------------------------------------------------------------
1104 */
float32_rem(float32 a,float32 b)1105 float32 float32_rem( float32 a, float32 b )
1106 {
1107 flag aSign, bSign, zSign;
1108 int16 aExp, bExp, expDiff;
1109 bits32 aSig, bSig, q, allZero, alternateASig;
1110 sbits32 sigMean;
1111
1112 aSig = extractFloat32Frac( a );
1113 aExp = extractFloat32Exp( a );
1114 aSign = extractFloat32Sign( a );
1115 bSig = extractFloat32Frac( b );
1116 bExp = extractFloat32Exp( b );
1117 bSign = extractFloat32Sign( b );
1118 if ( aExp == 0xFF ) {
1119 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1120 return propagateFloat32NaN( a, b );
1121 }
1122 float_raise( float_flag_invalid );
1123 return float32_default_nan;
1124 }
1125 if ( bExp == 0xFF ) {
1126 if ( bSig ) return propagateFloat32NaN( a, b );
1127 return a;
1128 }
1129 if ( bExp == 0 ) {
1130 if ( bSig == 0 ) {
1131 float_raise( float_flag_invalid );
1132 return float32_default_nan;
1133 }
1134 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1135 }
1136 if ( aExp == 0 ) {
1137 if ( aSig == 0 ) return a;
1138 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1139 }
1140 expDiff = aExp - bExp;
1141 aSig = ( aSig | 0x00800000 )<<8;
1142 bSig = ( bSig | 0x00800000 )<<8;
1143 if ( expDiff < 0 ) {
1144 if ( expDiff < -1 ) return a;
1145 aSig >>= 1;
1146 }
1147 q = ( bSig <= aSig );
1148 if ( q ) aSig -= bSig;
1149 expDiff -= 32;
1150 while ( 0 < expDiff ) {
1151 q = estimateDiv64To32( aSig, 0, bSig );
1152 q = ( 2 < q ) ? q - 2 : 0;
1153 aSig = - ( ( bSig>>2 ) * q );
1154 expDiff -= 30;
1155 }
1156 expDiff += 32;
1157 if ( 0 < expDiff ) {
1158 q = estimateDiv64To32( aSig, 0, bSig );
1159 q = ( 2 < q ) ? q - 2 : 0;
1160 q >>= 32 - expDiff;
1161 bSig >>= 2;
1162 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1163 }
1164 else {
1165 aSig >>= 2;
1166 bSig >>= 2;
1167 }
1168 do {
1169 alternateASig = aSig;
1170 ++q;
1171 aSig -= bSig;
1172 } while ( 0 <= (sbits32) aSig );
1173 sigMean = aSig + alternateASig;
1174 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1175 aSig = alternateASig;
1176 }
1177 zSign = ( (sbits32) aSig < 0 );
1178 if ( zSign ) aSig = - aSig;
1179 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1180
1181 }
1182 #endif
1183
1184 #ifndef SOFTFLOAT_FOR_GCC
1185 /*
1186 -------------------------------------------------------------------------------
1187 Returns the square root of the single-precision floating-point value `a'.
1188 The operation is performed according to the IEC/IEEE Standard for Binary
1189 Floating-Point Arithmetic.
1190 -------------------------------------------------------------------------------
1191 */
float32_sqrt(float32 a)1192 float32 float32_sqrt( float32 a )
1193 {
1194 flag aSign;
1195 int16 aExp, zExp;
1196 bits32 aSig, zSig, rem0, rem1, term0, term1;
1197
1198 aSig = extractFloat32Frac( a );
1199 aExp = extractFloat32Exp( a );
1200 aSign = extractFloat32Sign( a );
1201 if ( aExp == 0xFF ) {
1202 if ( aSig ) return propagateFloat32NaN( a, 0 );
1203 if ( ! aSign ) return a;
1204 float_raise( float_flag_invalid );
1205 return float32_default_nan;
1206 }
1207 if ( aSign ) {
1208 if ( ( aExp | aSig ) == 0 ) return a;
1209 float_raise( float_flag_invalid );
1210 return float32_default_nan;
1211 }
1212 if ( aExp == 0 ) {
1213 if ( aSig == 0 ) return 0;
1214 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1215 }
1216 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1217 aSig = ( aSig | 0x00800000 )<<8;
1218 zSig = estimateSqrt32( aExp, aSig ) + 2;
1219 if ( ( zSig & 0x7F ) <= 5 ) {
1220 if ( zSig < 2 ) {
1221 zSig = 0x7FFFFFFF;
1222 goto roundAndPack;
1223 }
1224 else {
1225 aSig >>= aExp & 1;
1226 mul32To64( zSig, zSig, &term0, &term1 );
1227 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1228 while ( (sbits32) rem0 < 0 ) {
1229 --zSig;
1230 shortShift64Left( 0, zSig, 1, &term0, &term1 );
1231 term1 |= 1;
1232 add64( rem0, rem1, term0, term1, &rem0, &rem1 );
1233 }
1234 zSig |= ( ( rem0 | rem1 ) != 0 );
1235 }
1236 }
1237 shift32RightJamming( zSig, 1, &zSig );
1238 roundAndPack:
1239 return roundAndPackFloat32( 0, zExp, zSig );
1240
1241 }
1242 #endif
1243
1244 /*
1245 -------------------------------------------------------------------------------
1246 Returns 1 if the single-precision floating-point value `a' is equal to
1247 the corresponding value `b', and 0 otherwise. The comparison is performed
1248 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1249 -------------------------------------------------------------------------------
1250 */
float32_eq(float32 a,float32 b)1251 flag float32_eq( float32 a, float32 b )
1252 {
1253
1254 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1255 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1256 ) {
1257 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1258 float_raise( float_flag_invalid );
1259 }
1260 return 0;
1261 }
1262 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1263
1264 }
1265
1266 /*
1267 -------------------------------------------------------------------------------
1268 Returns 1 if the single-precision floating-point value `a' is less than
1269 or equal to the corresponding value `b', and 0 otherwise. The comparison
1270 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1271 Arithmetic.
1272 -------------------------------------------------------------------------------
1273 */
float32_le(float32 a,float32 b)1274 flag float32_le( float32 a, float32 b )
1275 {
1276 flag aSign, bSign;
1277
1278 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1279 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1280 ) {
1281 float_raise( float_flag_invalid );
1282 return 0;
1283 }
1284 aSign = extractFloat32Sign( a );
1285 bSign = extractFloat32Sign( b );
1286 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1287 return ( a == b ) || ( aSign ^ ( a < b ) );
1288
1289 }
1290
1291 /*
1292 -------------------------------------------------------------------------------
1293 Returns 1 if the single-precision floating-point value `a' is less than
1294 the corresponding value `b', and 0 otherwise. The comparison is performed
1295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 -------------------------------------------------------------------------------
1297 */
float32_lt(float32 a,float32 b)1298 flag float32_lt( float32 a, float32 b )
1299 {
1300 flag aSign, bSign;
1301
1302 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1303 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1304 ) {
1305 float_raise( float_flag_invalid );
1306 return 0;
1307 }
1308 aSign = extractFloat32Sign( a );
1309 bSign = extractFloat32Sign( b );
1310 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1311 return ( a != b ) && ( aSign ^ ( a < b ) );
1312
1313 }
1314
1315 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1316 /*
1317 -------------------------------------------------------------------------------
1318 Returns 1 if the single-precision floating-point value `a' is equal to
1319 the corresponding value `b', and 0 otherwise. The invalid exception is
1320 raised if either operand is a NaN. Otherwise, the comparison is performed
1321 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1322 -------------------------------------------------------------------------------
1323 */
float32_eq_signaling(float32 a,float32 b)1324 flag float32_eq_signaling( float32 a, float32 b )
1325 {
1326
1327 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1328 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1329 ) {
1330 float_raise( float_flag_invalid );
1331 return 0;
1332 }
1333 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1334
1335 }
1336
1337 /*
1338 -------------------------------------------------------------------------------
1339 Returns 1 if the single-precision floating-point value `a' is less than or
1340 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1341 cause an exception. Otherwise, the comparison is performed according to the
1342 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1343 -------------------------------------------------------------------------------
1344 */
float32_le_quiet(float32 a,float32 b)1345 flag float32_le_quiet( float32 a, float32 b )
1346 {
1347 flag aSign, bSign;
1348 int16 aExp, bExp;
1349
1350 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1351 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1352 ) {
1353 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1354 float_raise( float_flag_invalid );
1355 }
1356 return 0;
1357 }
1358 aSign = extractFloat32Sign( a );
1359 bSign = extractFloat32Sign( b );
1360 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1361 return ( a == b ) || ( aSign ^ ( a < b ) );
1362
1363 }
1364
1365 /*
1366 -------------------------------------------------------------------------------
1367 Returns 1 if the single-precision floating-point value `a' is less than
1368 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1369 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1370 Standard for Binary Floating-Point Arithmetic.
1371 -------------------------------------------------------------------------------
1372 */
float32_lt_quiet(float32 a,float32 b)1373 flag float32_lt_quiet( float32 a, float32 b )
1374 {
1375 flag aSign, bSign;
1376
1377 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1378 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1379 ) {
1380 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1381 float_raise( float_flag_invalid );
1382 }
1383 return 0;
1384 }
1385 aSign = extractFloat32Sign( a );
1386 bSign = extractFloat32Sign( b );
1387 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1388 return ( a != b ) && ( aSign ^ ( a < b ) );
1389
1390 }
1391 #endif /* !SOFTFLOAT_FOR_GCC */
1392
1393 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1394 /*
1395 -------------------------------------------------------------------------------
1396 Returns the result of converting the double-precision floating-point value
1397 `a' to the 32-bit two's complement integer format. The conversion is
1398 performed according to the IEC/IEEE Standard for Binary Floating-Point
1399 Arithmetic---which means in particular that the conversion is rounded
1400 according to the current rounding mode. If `a' is a NaN, the largest
1401 positive integer is returned. Otherwise, if the conversion overflows, the
1402 largest integer with the same sign as `a' is returned.
1403 -------------------------------------------------------------------------------
1404 */
float64_to_int32(float64 a)1405 int32 float64_to_int32( float64 a )
1406 {
1407 flag aSign;
1408 int16 aExp, shiftCount;
1409 bits32 aSig0, aSig1, absZ, aSigExtra;
1410 int32 z;
1411 int8 roundingMode;
1412
1413 aSig1 = extractFloat64Frac1( a );
1414 aSig0 = extractFloat64Frac0( a );
1415 aExp = extractFloat64Exp( a );
1416 aSign = extractFloat64Sign( a );
1417 shiftCount = aExp - 0x413;
1418 if ( 0 <= shiftCount ) {
1419 if ( 0x41E < aExp ) {
1420 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1421 goto invalid;
1422 }
1423 shortShift64Left(
1424 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1425 if ( 0x80000000 < absZ ) goto invalid;
1426 }
1427 else {
1428 aSig1 = ( aSig1 != 0 );
1429 if ( aExp < 0x3FE ) {
1430 aSigExtra = aExp | aSig0 | aSig1;
1431 absZ = 0;
1432 }
1433 else {
1434 aSig0 |= 0x00100000;
1435 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1436 absZ = aSig0>>( - shiftCount );
1437 }
1438 }
1439 roundingMode = float_rounding_mode;
1440 if ( roundingMode == float_round_nearest_even ) {
1441 if ( (sbits32) aSigExtra < 0 ) {
1442 ++absZ;
1443 if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1;
1444 }
1445 z = aSign ? - absZ : absZ;
1446 }
1447 else {
1448 aSigExtra = ( aSigExtra != 0 );
1449 if ( aSign ) {
1450 z = - ( absZ
1451 + ( ( roundingMode == float_round_down ) & aSigExtra ) );
1452 }
1453 else {
1454 z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra );
1455 }
1456 }
1457 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1458 invalid:
1459 float_raise( float_flag_invalid );
1460 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1461 }
1462 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1463 return z;
1464
1465 }
1466 #endif /* !SOFTFLOAT_FOR_GCC */
1467
1468 /*
1469 -------------------------------------------------------------------------------
1470 Returns the result of converting the double-precision floating-point value
1471 `a' to the 32-bit two's complement integer format. The conversion is
1472 performed according to the IEC/IEEE Standard for Binary Floating-Point
1473 Arithmetic, except that the conversion is always rounded toward zero.
1474 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1475 the conversion overflows, the largest integer with the same sign as `a' is
1476 returned.
1477 -------------------------------------------------------------------------------
1478 */
float64_to_int32_round_to_zero(float64 a)1479 int32 float64_to_int32_round_to_zero( float64 a )
1480 {
1481 flag aSign;
1482 int16 aExp, shiftCount;
1483 bits32 aSig0, aSig1, absZ, aSigExtra;
1484 int32 z;
1485
1486 aSig1 = extractFloat64Frac1( a );
1487 aSig0 = extractFloat64Frac0( a );
1488 aExp = extractFloat64Exp( a );
1489 aSign = extractFloat64Sign( a );
1490 shiftCount = aExp - 0x413;
1491 if ( 0 <= shiftCount ) {
1492 if ( 0x41E < aExp ) {
1493 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1494 goto invalid;
1495 }
1496 shortShift64Left(
1497 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1498 }
1499 else {
1500 if ( aExp < 0x3FF ) {
1501 if ( aExp | aSig0 | aSig1 ) {
1502 float_exception_flags |= float_flag_inexact;
1503 }
1504 return 0;
1505 }
1506 aSig0 |= 0x00100000;
1507 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1508 absZ = aSig0>>( - shiftCount );
1509 }
1510 z = aSign ? - absZ : absZ;
1511 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1512 invalid:
1513 float_raise( float_flag_invalid );
1514 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1515 }
1516 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1517 return z;
1518
1519 }
1520
1521 /*
1522 -------------------------------------------------------------------------------
1523 Returns the result of converting the double-precision floating-point value
1524 `a' to the single-precision floating-point format. The conversion is
1525 performed according to the IEC/IEEE Standard for Binary Floating-Point
1526 Arithmetic.
1527 -------------------------------------------------------------------------------
1528 */
float64_to_float32(float64 a)1529 float32 float64_to_float32( float64 a )
1530 {
1531 flag aSign;
1532 int16 aExp;
1533 bits32 aSig0, aSig1, zSig;
1534 bits32 allZero;
1535
1536 aSig1 = extractFloat64Frac1( a );
1537 aSig0 = extractFloat64Frac0( a );
1538 aExp = extractFloat64Exp( a );
1539 aSign = extractFloat64Sign( a );
1540 if ( aExp == 0x7FF ) {
1541 if ( aSig0 | aSig1 ) {
1542 return commonNaNToFloat32( float64ToCommonNaN( a ) );
1543 }
1544 return packFloat32( aSign, 0xFF, 0 );
1545 }
1546 shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig );
1547 if ( aExp ) zSig |= 0x40000000;
1548 return roundAndPackFloat32( aSign, aExp - 0x381, zSig );
1549
1550 }
1551
1552 #ifndef SOFTFLOAT_FOR_GCC
1553 /*
1554 -------------------------------------------------------------------------------
1555 Rounds the double-precision floating-point value `a' to an integer,
1556 and returns the result as a double-precision floating-point value. The
1557 operation is performed according to the IEC/IEEE Standard for Binary
1558 Floating-Point Arithmetic.
1559 -------------------------------------------------------------------------------
1560 */
float64_round_to_int(float64 a)1561 float64 float64_round_to_int( float64 a )
1562 {
1563 flag aSign;
1564 int16 aExp;
1565 bits32 lastBitMask, roundBitsMask;
1566 int8 roundingMode;
1567 float64 z;
1568
1569 aExp = extractFloat64Exp( a );
1570 if ( 0x413 <= aExp ) {
1571 if ( 0x433 <= aExp ) {
1572 if ( ( aExp == 0x7FF )
1573 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) {
1574 return propagateFloat64NaN( a, a );
1575 }
1576 return a;
1577 }
1578 lastBitMask = 1;
1579 lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1;
1580 roundBitsMask = lastBitMask - 1;
1581 z = a;
1582 roundingMode = float_rounding_mode;
1583 if ( roundingMode == float_round_nearest_even ) {
1584 if ( lastBitMask ) {
1585 add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
1586 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
1587 }
1588 else {
1589 if ( (sbits32) z.low < 0 ) {
1590 ++z.high;
1591 if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1;
1592 }
1593 }
1594 }
1595 else if ( roundingMode != float_round_to_zero ) {
1596 if ( extractFloat64Sign( z )
1597 ^ ( roundingMode == float_round_up ) ) {
1598 add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
1599 }
1600 }
1601 z.low &= ~ roundBitsMask;
1602 }
1603 else {
1604 if ( aExp <= 0x3FE ) {
1605 if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
1606 float_exception_flags |= float_flag_inexact;
1607 aSign = extractFloat64Sign( a );
1608 switch ( float_rounding_mode ) {
1609 case float_round_nearest_even:
1610 if ( ( aExp == 0x3FE )
1611 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) )
1612 ) {
1613 return packFloat64( aSign, 0x3FF, 0, 0 );
1614 }
1615 break;
1616 case float_round_down:
1617 return
1618 aSign ? packFloat64( 1, 0x3FF, 0, 0 )
1619 : packFloat64( 0, 0, 0, 0 );
1620 case float_round_up:
1621 return
1622 aSign ? packFloat64( 1, 0, 0, 0 )
1623 : packFloat64( 0, 0x3FF, 0, 0 );
1624 }
1625 return packFloat64( aSign, 0, 0, 0 );
1626 }
1627 lastBitMask = 1;
1628 lastBitMask <<= 0x413 - aExp;
1629 roundBitsMask = lastBitMask - 1;
1630 z.low = 0;
1631 z.high = a.high;
1632 roundingMode = float_rounding_mode;
1633 if ( roundingMode == float_round_nearest_even ) {
1634 z.high += lastBitMask>>1;
1635 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
1636 z.high &= ~ lastBitMask;
1637 }
1638 }
1639 else if ( roundingMode != float_round_to_zero ) {
1640 if ( extractFloat64Sign( z )
1641 ^ ( roundingMode == float_round_up ) ) {
1642 z.high |= ( a.low != 0 );
1643 z.high += roundBitsMask;
1644 }
1645 }
1646 z.high &= ~ roundBitsMask;
1647 }
1648 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
1649 float_exception_flags |= float_flag_inexact;
1650 }
1651 return z;
1652
1653 }
1654 #endif
1655
1656 /*
1657 -------------------------------------------------------------------------------
1658 Returns the result of adding the absolute values of the double-precision
1659 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1660 before being returned. `zSign' is ignored if the result is a NaN.
1661 The addition is performed according to the IEC/IEEE Standard for Binary
1662 Floating-Point Arithmetic.
1663 -------------------------------------------------------------------------------
1664 */
addFloat64Sigs(float64 a,float64 b,flag zSign)1665 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1666 {
1667 int16 aExp, bExp, zExp;
1668 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1669 int16 expDiff;
1670
1671 aSig1 = extractFloat64Frac1( a );
1672 aSig0 = extractFloat64Frac0( a );
1673 aExp = extractFloat64Exp( a );
1674 bSig1 = extractFloat64Frac1( b );
1675 bSig0 = extractFloat64Frac0( b );
1676 bExp = extractFloat64Exp( b );
1677 expDiff = aExp - bExp;
1678 if ( 0 < expDiff ) {
1679 if ( aExp == 0x7FF ) {
1680 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1681 return a;
1682 }
1683 if ( bExp == 0 ) {
1684 --expDiff;
1685 }
1686 else {
1687 bSig0 |= 0x00100000;
1688 }
1689 shift64ExtraRightJamming(
1690 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
1691 zExp = aExp;
1692 }
1693 else if ( expDiff < 0 ) {
1694 if ( bExp == 0x7FF ) {
1695 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1696 return packFloat64( zSign, 0x7FF, 0, 0 );
1697 }
1698 if ( aExp == 0 ) {
1699 ++expDiff;
1700 }
1701 else {
1702 aSig0 |= 0x00100000;
1703 }
1704 shift64ExtraRightJamming(
1705 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
1706 zExp = bExp;
1707 }
1708 else {
1709 if ( aExp == 0x7FF ) {
1710 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1711 return propagateFloat64NaN( a, b );
1712 }
1713 return a;
1714 }
1715 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1716 if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 );
1717 zSig2 = 0;
1718 zSig0 |= 0x00200000;
1719 zExp = aExp;
1720 goto shiftRight1;
1721 }
1722 aSig0 |= 0x00100000;
1723 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1724 --zExp;
1725 if ( zSig0 < 0x00200000 ) goto roundAndPack;
1726 ++zExp;
1727 shiftRight1:
1728 shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1729 roundAndPack:
1730 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1731
1732 }
1733
1734 /*
1735 -------------------------------------------------------------------------------
1736 Returns the result of subtracting the absolute values of the double-
1737 precision floating-point values `a' and `b'. If `zSign' is 1, the
1738 difference is negated before being returned. `zSign' is ignored if the
1739 result is a NaN. The subtraction is performed according to the IEC/IEEE
1740 Standard for Binary Floating-Point Arithmetic.
1741 -------------------------------------------------------------------------------
1742 */
subFloat64Sigs(float64 a,float64 b,flag zSign)1743 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1744 {
1745 int16 aExp, bExp, zExp;
1746 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
1747 int16 expDiff;
1748
1749 aSig1 = extractFloat64Frac1( a );
1750 aSig0 = extractFloat64Frac0( a );
1751 aExp = extractFloat64Exp( a );
1752 bSig1 = extractFloat64Frac1( b );
1753 bSig0 = extractFloat64Frac0( b );
1754 bExp = extractFloat64Exp( b );
1755 expDiff = aExp - bExp;
1756 shortShift64Left( aSig0, aSig1, 10, &aSig0, &aSig1 );
1757 shortShift64Left( bSig0, bSig1, 10, &bSig0, &bSig1 );
1758 if ( 0 < expDiff ) goto aExpBigger;
1759 if ( expDiff < 0 ) goto bExpBigger;
1760 if ( aExp == 0x7FF ) {
1761 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1762 return propagateFloat64NaN( a, b );
1763 }
1764 float_raise( float_flag_invalid );
1765 return float64_default_nan;
1766 }
1767 if ( aExp == 0 ) {
1768 aExp = 1;
1769 bExp = 1;
1770 }
1771 if ( bSig0 < aSig0 ) goto aBigger;
1772 if ( aSig0 < bSig0 ) goto bBigger;
1773 if ( bSig1 < aSig1 ) goto aBigger;
1774 if ( aSig1 < bSig1 ) goto bBigger;
1775 return packFloat64( float_rounding_mode == float_round_down, 0, 0, 0 );
1776 bExpBigger:
1777 if ( bExp == 0x7FF ) {
1778 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1779 return packFloat64( zSign ^ 1, 0x7FF, 0, 0 );
1780 }
1781 if ( aExp == 0 ) {
1782 ++expDiff;
1783 }
1784 else {
1785 aSig0 |= 0x40000000;
1786 }
1787 shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
1788 bSig0 |= 0x40000000;
1789 bBigger:
1790 sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
1791 zExp = bExp;
1792 zSign ^= 1;
1793 goto normalizeRoundAndPack;
1794 aExpBigger:
1795 if ( aExp == 0x7FF ) {
1796 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1797 return a;
1798 }
1799 if ( bExp == 0 ) {
1800 --expDiff;
1801 }
1802 else {
1803 bSig0 |= 0x40000000;
1804 }
1805 shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
1806 aSig0 |= 0x40000000;
1807 aBigger:
1808 sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1809 zExp = aExp;
1810 normalizeRoundAndPack:
1811 --zExp;
1812 return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 );
1813
1814 }
1815
1816 /*
1817 -------------------------------------------------------------------------------
1818 Returns the result of adding the double-precision floating-point values `a'
1819 and `b'. The operation is performed according to the IEC/IEEE Standard for
1820 Binary Floating-Point Arithmetic.
1821 -------------------------------------------------------------------------------
1822 */
float64_add(float64 a,float64 b)1823 float64 float64_add( float64 a, float64 b )
1824 {
1825 flag aSign, bSign;
1826
1827 aSign = extractFloat64Sign( a );
1828 bSign = extractFloat64Sign( b );
1829 if ( aSign == bSign ) {
1830 return addFloat64Sigs( a, b, aSign );
1831 }
1832 else {
1833 return subFloat64Sigs( a, b, aSign );
1834 }
1835
1836 }
1837
1838 /*
1839 -------------------------------------------------------------------------------
1840 Returns the result of subtracting the double-precision floating-point values
1841 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1842 for Binary Floating-Point Arithmetic.
1843 -------------------------------------------------------------------------------
1844 */
float64_sub(float64 a,float64 b)1845 float64 float64_sub( float64 a, float64 b )
1846 {
1847 flag aSign, bSign;
1848
1849 aSign = extractFloat64Sign( a );
1850 bSign = extractFloat64Sign( b );
1851 if ( aSign == bSign ) {
1852 return subFloat64Sigs( a, b, aSign );
1853 }
1854 else {
1855 return addFloat64Sigs( a, b, aSign );
1856 }
1857
1858 }
1859
1860 /*
1861 -------------------------------------------------------------------------------
1862 Returns the result of multiplying the double-precision floating-point values
1863 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1864 for Binary Floating-Point Arithmetic.
1865 -------------------------------------------------------------------------------
1866 */
float64_mul(float64 a,float64 b)1867 float64 float64_mul( float64 a, float64 b )
1868 {
1869 flag aSign, bSign, zSign;
1870 int16 aExp, bExp, zExp;
1871 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
1872
1873 aSig1 = extractFloat64Frac1( a );
1874 aSig0 = extractFloat64Frac0( a );
1875 aExp = extractFloat64Exp( a );
1876 aSign = extractFloat64Sign( a );
1877 bSig1 = extractFloat64Frac1( b );
1878 bSig0 = extractFloat64Frac0( b );
1879 bExp = extractFloat64Exp( b );
1880 bSign = extractFloat64Sign( b );
1881 zSign = aSign ^ bSign;
1882 if ( aExp == 0x7FF ) {
1883 if ( ( aSig0 | aSig1 )
1884 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
1885 return propagateFloat64NaN( a, b );
1886 }
1887 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
1888 return packFloat64( zSign, 0x7FF, 0, 0 );
1889 }
1890 if ( bExp == 0x7FF ) {
1891 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1892 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1893 invalid:
1894 float_raise( float_flag_invalid );
1895 return float64_default_nan;
1896 }
1897 return packFloat64( zSign, 0x7FF, 0, 0 );
1898 }
1899 if ( aExp == 0 ) {
1900 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1901 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1902 }
1903 if ( bExp == 0 ) {
1904 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1905 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1906 }
1907 zExp = aExp + bExp - 0x400;
1908 aSig0 |= 0x00100000;
1909 shortShift64Left( bSig0, bSig1, 12, &bSig0, &bSig1 );
1910 mul64To128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
1911 add64( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
1912 zSig2 |= ( zSig3 != 0 );
1913 if ( 0x00200000 <= zSig0 ) {
1914 shift64ExtraRightJamming(
1915 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1916 ++zExp;
1917 }
1918 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1919
1920 }
1921
1922 /*
1923 -------------------------------------------------------------------------------
1924 Returns the result of dividing the double-precision floating-point value `a'
1925 by the corresponding value `b'. The operation is performed according to the
1926 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1927 -------------------------------------------------------------------------------
1928 */
float64_div(float64 a,float64 b)1929 float64 float64_div( float64 a, float64 b )
1930 {
1931 flag aSign, bSign, zSign;
1932 int16 aExp, bExp, zExp;
1933 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1934 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
1935
1936 aSig1 = extractFloat64Frac1( a );
1937 aSig0 = extractFloat64Frac0( a );
1938 aExp = extractFloat64Exp( a );
1939 aSign = extractFloat64Sign( a );
1940 bSig1 = extractFloat64Frac1( b );
1941 bSig0 = extractFloat64Frac0( b );
1942 bExp = extractFloat64Exp( b );
1943 bSign = extractFloat64Sign( b );
1944 zSign = aSign ^ bSign;
1945 if ( aExp == 0x7FF ) {
1946 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1947 if ( bExp == 0x7FF ) {
1948 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1949 goto invalid;
1950 }
1951 return packFloat64( zSign, 0x7FF, 0, 0 );
1952 }
1953 if ( bExp == 0x7FF ) {
1954 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1955 return packFloat64( zSign, 0, 0, 0 );
1956 }
1957 if ( bExp == 0 ) {
1958 if ( ( bSig0 | bSig1 ) == 0 ) {
1959 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1960 invalid:
1961 float_raise( float_flag_invalid );
1962 return float64_default_nan;
1963 }
1964 float_raise( float_flag_divbyzero );
1965 return packFloat64( zSign, 0x7FF, 0, 0 );
1966 }
1967 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1968 }
1969 if ( aExp == 0 ) {
1970 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1971 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1972 }
1973 zExp = aExp - bExp + 0x3FD;
1974 shortShift64Left( aSig0 | 0x00100000, aSig1, 11, &aSig0, &aSig1 );
1975 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
1976 if ( le64( bSig0, bSig1, aSig0, aSig1 ) ) {
1977 shift64Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
1978 ++zExp;
1979 }
1980 zSig0 = estimateDiv64To32( aSig0, aSig1, bSig0 );
1981 mul64By32To96( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
1982 sub96( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
1983 while ( (sbits32) rem0 < 0 ) {
1984 --zSig0;
1985 add96( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
1986 }
1987 zSig1 = estimateDiv64To32( rem1, rem2, bSig0 );
1988 if ( ( zSig1 & 0x3FF ) <= 4 ) {
1989 mul64By32To96( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
1990 sub96( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
1991 while ( (sbits32) rem1 < 0 ) {
1992 --zSig1;
1993 add96( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
1994 }
1995 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
1996 }
1997 shift64ExtraRightJamming( zSig0, zSig1, 0, 11, &zSig0, &zSig1, &zSig2 );
1998 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1999
2000 }
2001
2002 #ifndef SOFTFLOAT_FOR_GCC
2003 /*
2004 -------------------------------------------------------------------------------
2005 Returns the remainder of the double-precision floating-point value `a'
2006 with respect to the corresponding value `b'. The operation is performed
2007 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2008 -------------------------------------------------------------------------------
2009 */
float64_rem(float64 a,float64 b)2010 float64 float64_rem( float64 a, float64 b )
2011 {
2012 flag aSign, bSign, zSign;
2013 int16 aExp, bExp, expDiff;
2014 bits32 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
2015 bits32 allZero, alternateASig0, alternateASig1, sigMean1;
2016 sbits32 sigMean0;
2017 float64 z;
2018
2019 aSig1 = extractFloat64Frac1( a );
2020 aSig0 = extractFloat64Frac0( a );
2021 aExp = extractFloat64Exp( a );
2022 aSign = extractFloat64Sign( a );
2023 bSig1 = extractFloat64Frac1( b );
2024 bSig0 = extractFloat64Frac0( b );
2025 bExp = extractFloat64Exp( b );
2026 bSign = extractFloat64Sign( b );
2027 if ( aExp == 0x7FF ) {
2028 if ( ( aSig0 | aSig1 )
2029 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
2030 return propagateFloat64NaN( a, b );
2031 }
2032 goto invalid;
2033 }
2034 if ( bExp == 0x7FF ) {
2035 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
2036 return a;
2037 }
2038 if ( bExp == 0 ) {
2039 if ( ( bSig0 | bSig1 ) == 0 ) {
2040 invalid:
2041 float_raise( float_flag_invalid );
2042 return float64_default_nan;
2043 }
2044 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2045 }
2046 if ( aExp == 0 ) {
2047 if ( ( aSig0 | aSig1 ) == 0 ) return a;
2048 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2049 }
2050 expDiff = aExp - bExp;
2051 if ( expDiff < -1 ) return a;
2052 shortShift64Left(
2053 aSig0 | 0x00100000, aSig1, 11 - ( expDiff < 0 ), &aSig0, &aSig1 );
2054 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
2055 q = le64( bSig0, bSig1, aSig0, aSig1 );
2056 if ( q ) sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2057 expDiff -= 32;
2058 while ( 0 < expDiff ) {
2059 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2060 q = ( 4 < q ) ? q - 4 : 0;
2061 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2062 shortShift96Left( term0, term1, term2, 29, &term1, &term2, &allZero );
2063 shortShift64Left( aSig0, aSig1, 29, &aSig0, &allZero );
2064 sub64( aSig0, 0, term1, term2, &aSig0, &aSig1 );
2065 expDiff -= 29;
2066 }
2067 if ( -32 < expDiff ) {
2068 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2069 q = ( 4 < q ) ? q - 4 : 0;
2070 q >>= - expDiff;
2071 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2072 expDiff += 24;
2073 if ( expDiff < 0 ) {
2074 shift64Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
2075 }
2076 else {
2077 shortShift64Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
2078 }
2079 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2080 sub64( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
2081 }
2082 else {
2083 shift64Right( aSig0, aSig1, 8, &aSig0, &aSig1 );
2084 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2085 }
2086 do {
2087 alternateASig0 = aSig0;
2088 alternateASig1 = aSig1;
2089 ++q;
2090 sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2091 } while ( 0 <= (sbits32) aSig0 );
2092 add64(
2093 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
2094 if ( ( sigMean0 < 0 )
2095 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
2096 aSig0 = alternateASig0;
2097 aSig1 = alternateASig1;
2098 }
2099 zSign = ( (sbits32) aSig0 < 0 );
2100 if ( zSign ) sub64( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
2101 return
2102 normalizeRoundAndPackFloat64( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
2103
2104 }
2105 #endif
2106
2107 #ifndef SOFTFLOAT_FOR_GCC
2108 /*
2109 -------------------------------------------------------------------------------
2110 Returns the square root of the double-precision floating-point value `a'.
2111 The operation is performed according to the IEC/IEEE Standard for Binary
2112 Floating-Point Arithmetic.
2113 -------------------------------------------------------------------------------
2114 */
float64_sqrt(float64 a)2115 float64 float64_sqrt( float64 a )
2116 {
2117 flag aSign;
2118 int16 aExp, zExp;
2119 bits32 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
2120 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2121 float64 z;
2122
2123 aSig1 = extractFloat64Frac1( a );
2124 aSig0 = extractFloat64Frac0( a );
2125 aExp = extractFloat64Exp( a );
2126 aSign = extractFloat64Sign( a );
2127 if ( aExp == 0x7FF ) {
2128 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, a );
2129 if ( ! aSign ) return a;
2130 goto invalid;
2131 }
2132 if ( aSign ) {
2133 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
2134 invalid:
2135 float_raise( float_flag_invalid );
2136 return float64_default_nan;
2137 }
2138 if ( aExp == 0 ) {
2139 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( 0, 0, 0, 0 );
2140 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2141 }
2142 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2143 aSig0 |= 0x00100000;
2144 shortShift64Left( aSig0, aSig1, 11, &term0, &term1 );
2145 zSig0 = ( estimateSqrt32( aExp, term0 )>>1 ) + 1;
2146 if ( zSig0 == 0 ) zSig0 = 0x7FFFFFFF;
2147 doubleZSig0 = zSig0 + zSig0;
2148 shortShift64Left( aSig0, aSig1, 9 - ( aExp & 1 ), &aSig0, &aSig1 );
2149 mul32To64( zSig0, zSig0, &term0, &term1 );
2150 sub64( aSig0, aSig1, term0, term1, &rem0, &rem1 );
2151 while ( (sbits32) rem0 < 0 ) {
2152 --zSig0;
2153 doubleZSig0 -= 2;
2154 add64( rem0, rem1, 0, doubleZSig0 | 1, &rem0, &rem1 );
2155 }
2156 zSig1 = estimateDiv64To32( rem1, 0, doubleZSig0 );
2157 if ( ( zSig1 & 0x1FF ) <= 5 ) {
2158 if ( zSig1 == 0 ) zSig1 = 1;
2159 mul32To64( doubleZSig0, zSig1, &term1, &term2 );
2160 sub64( rem1, 0, term1, term2, &rem1, &rem2 );
2161 mul32To64( zSig1, zSig1, &term2, &term3 );
2162 sub96( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
2163 while ( (sbits32) rem1 < 0 ) {
2164 --zSig1;
2165 shortShift64Left( 0, zSig1, 1, &term2, &term3 );
2166 term3 |= 1;
2167 term2 |= doubleZSig0;
2168 add96( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
2169 }
2170 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2171 }
2172 shift64ExtraRightJamming( zSig0, zSig1, 0, 10, &zSig0, &zSig1, &zSig2 );
2173 return roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2 );
2174
2175 }
2176 #endif
2177
2178 /*
2179 -------------------------------------------------------------------------------
2180 Returns 1 if the double-precision floating-point value `a' is equal to
2181 the corresponding value `b', and 0 otherwise. The comparison is performed
2182 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2183 -------------------------------------------------------------------------------
2184 */
float64_eq(float64 a,float64 b)2185 flag float64_eq( float64 a, float64 b )
2186 {
2187
2188 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2189 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2190 || ( ( extractFloat64Exp( b ) == 0x7FF )
2191 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2192 ) {
2193 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2194 float_raise( float_flag_invalid );
2195 }
2196 return 0;
2197 }
2198 return ( a == b ) ||
2199 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
2200
2201 }
2202
2203 /*
2204 -------------------------------------------------------------------------------
2205 Returns 1 if the double-precision floating-point value `a' is less than
2206 or equal to the corresponding value `b', and 0 otherwise. The comparison
2207 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2208 Arithmetic.
2209 -------------------------------------------------------------------------------
2210 */
float64_le(float64 a,float64 b)2211 flag float64_le( float64 a, float64 b )
2212 {
2213 flag aSign, bSign;
2214
2215 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2216 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2217 || ( ( extractFloat64Exp( b ) == 0x7FF )
2218 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2219 ) {
2220 float_raise( float_flag_invalid );
2221 return 0;
2222 }
2223 aSign = extractFloat64Sign( a );
2224 bSign = extractFloat64Sign( b );
2225 if ( aSign != bSign )
2226 return aSign ||
2227 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
2228 0 );
2229 return ( a == b ) ||
2230 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2231 }
2232
2233 /*
2234 -------------------------------------------------------------------------------
2235 Returns 1 if the double-precision floating-point value `a' is less than
2236 the corresponding value `b', and 0 otherwise. The comparison is performed
2237 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2238 -------------------------------------------------------------------------------
2239 */
float64_lt(float64 a,float64 b)2240 flag float64_lt( float64 a, float64 b )
2241 {
2242 flag aSign, bSign;
2243
2244 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2245 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2246 || ( ( extractFloat64Exp( b ) == 0x7FF )
2247 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2248 ) {
2249 float_raise( float_flag_invalid );
2250 return 0;
2251 }
2252 aSign = extractFloat64Sign( a );
2253 bSign = extractFloat64Sign( b );
2254 if ( aSign != bSign )
2255 return aSign &&
2256 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
2257 0 );
2258 return ( a != b ) &&
2259 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2260
2261 }
2262
2263 #ifndef SOFTFLOAT_FOR_GCC
2264 /*
2265 -------------------------------------------------------------------------------
2266 Returns 1 if the double-precision floating-point value `a' is equal to
2267 the corresponding value `b', and 0 otherwise. The invalid exception is
2268 raised if either operand is a NaN. Otherwise, the comparison is performed
2269 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270 -------------------------------------------------------------------------------
2271 */
float64_eq_signaling(float64 a,float64 b)2272 flag float64_eq_signaling( float64 a, float64 b )
2273 {
2274
2275 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2276 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2277 || ( ( extractFloat64Exp( b ) == 0x7FF )
2278 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2279 ) {
2280 float_raise( float_flag_invalid );
2281 return 0;
2282 }
2283 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2284
2285 }
2286
2287 /*
2288 -------------------------------------------------------------------------------
2289 Returns 1 if the double-precision floating-point value `a' is less than or
2290 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2291 cause an exception. Otherwise, the comparison is performed according to the
2292 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2293 -------------------------------------------------------------------------------
2294 */
float64_le_quiet(float64 a,float64 b)2295 flag float64_le_quiet( float64 a, float64 b )
2296 {
2297 flag aSign, bSign;
2298
2299 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2300 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2301 || ( ( extractFloat64Exp( b ) == 0x7FF )
2302 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2303 ) {
2304 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2305 float_raise( float_flag_invalid );
2306 }
2307 return 0;
2308 }
2309 aSign = extractFloat64Sign( a );
2310 bSign = extractFloat64Sign( b );
2311 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2312 return ( a == b ) || ( aSign ^ ( a < b ) );
2313
2314 }
2315
2316 /*
2317 -------------------------------------------------------------------------------
2318 Returns 1 if the double-precision floating-point value `a' is less than
2319 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2320 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2321 Standard for Binary Floating-Point Arithmetic.
2322 -------------------------------------------------------------------------------
2323 */
float64_lt_quiet(float64 a,float64 b)2324 flag float64_lt_quiet( float64 a, float64 b )
2325 {
2326 flag aSign, bSign;
2327
2328 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2329 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2330 || ( ( extractFloat64Exp( b ) == 0x7FF )
2331 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2332 ) {
2333 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2334 float_raise( float_flag_invalid );
2335 }
2336 return 0;
2337 }
2338 aSign = extractFloat64Sign( a );
2339 bSign = extractFloat64Sign( b );
2340 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2341 return ( a != b ) && ( aSign ^ ( a < b ) );
2342
2343 }
2344
2345 #endif
2346