xref: /openbsd/sys/lib/libkern/softfloat.c (revision 37ecf0c7)
1 /*	$OpenBSD: softfloat.c,v 1.6 2014/07/01 20:21:17 miod Exp $	*/
2 /*	$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $	*/
3 
4 /*
5  * This version hacked for use with gcc -msoft-float by bjh21.
6  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
7  *  itself).
8  */
9 
10 /*
11  * Things you may want to define:
12  *
13  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
14  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
15  *   properly renamed.
16  */
17 
18 /*
19 ===============================================================================
20 
21 This C source file is part of the SoftFloat IEC/IEEE Floating-point
22 Arithmetic Package, Release 2a.
23 
24 Written by John R. Hauser.  This work was made possible in part by the
25 International Computer Science Institute, located at Suite 600, 1947 Center
26 Street, Berkeley, California 94704.  Funding was partially provided by the
27 National Science Foundation under grant MIP-9311980.  The original version
28 of this code was written as part of a project to build a fixed-point vector
29 processor in collaboration with the University of California at Berkeley,
30 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
31 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
32 arithmetic/SoftFloat.html'.
33 
34 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable
35 effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT
36 WILL AT TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS
37 RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL
38 RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM
39 THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY
40 (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL
41 COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS
42 ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS.
43 
44 Derivative works are acceptable, even for commercial purposes, so long as
45 (1) they include prominent notice that the work is derivative, and (2) they
46 include prominent notice akin to these four paragraphs for those parts of
47 this code that are retained.
48 
49 ===============================================================================
50 */
51 
52 #ifdef SOFTFLOAT_FOR_GCC
53 #include "softfloat-for-gcc.h"
54 #endif
55 
56 #include "milieu.h"
57 #include "softfloat.h"
58 
59 float32 normalizeRoundAndPackFloat32(flag, int16, bits32);
60 float64 normalizeRoundAndPackFloat64(flag, int16, bits64);
61 
62 /*
63  * Conversions between floats as stored in memory and floats as
64  * SoftFloat uses them
65  */
66 #ifndef FLOAT64_DEMANGLE
67 #define FLOAT64_DEMANGLE(a)	(a)
68 #endif
69 #ifndef FLOAT64_MANGLE
70 #define FLOAT64_MANGLE(a)	(a)
71 #endif
72 
73 /*
74 -------------------------------------------------------------------------------
75 Floating-point rounding mode, extended double-precision rounding precision,
76 and exception flags.
77 -------------------------------------------------------------------------------
78 */
79 
80 /*
81  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
82  * 	Right now, it does not.  I've removed all other dynamic global
83  * 	variables. [ross]
84  */
85 #ifdef FLOATX80
86 int8 floatx80_rounding_precision = 80;
87 #endif
88 
89 /*
90 -------------------------------------------------------------------------------
91 Primitive arithmetic functions, including multi-word arithmetic, and
92 division and square root approximations.  (Can be specialized to target if
93 desired.)
94 -------------------------------------------------------------------------------
95 */
96 #include "softfloat-macros.h"
97 
98 /*
99 -------------------------------------------------------------------------------
100 Functions and definitions to determine:  (1) whether tininess for underflow
101 is detected before or after rounding by default, (2) what (if anything)
102 happens when exceptions are raised, (3) how signaling NaNs are distinguished
103 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104 are propagated from function inputs to output.  These details are target-
105 specific.
106 -------------------------------------------------------------------------------
107 */
108 #include "softfloat-specialize.h"
109 
110 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
111 /*
112 -------------------------------------------------------------------------------
113 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
114 and 7, and returns the properly rounded 32-bit integer corresponding to the
115 input.  If `zSign' is 1, the input is negated before being converted to an
116 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
117 is simply rounded to an integer, with the inexact exception raised if the
118 input cannot be represented exactly as an integer.  However, if the fixed-
119 point input is too large, the invalid exception is raised and the largest
120 positive or negative integer is returned.
121 -------------------------------------------------------------------------------
122 */
roundAndPackInt32(flag zSign,bits64 absZ)123 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
124 {
125     int8 roundingMode;
126     flag roundNearestEven;
127     int8 roundIncrement, roundBits;
128     int32 z;
129 
130     roundingMode = float_rounding_mode();
131     roundNearestEven = ( roundingMode == float_round_nearest_even );
132     roundIncrement = 0x40;
133     if ( ! roundNearestEven ) {
134         if ( roundingMode == float_round_to_zero ) {
135             roundIncrement = 0;
136         }
137         else {
138             roundIncrement = 0x7F;
139             if ( zSign ) {
140                 if ( roundingMode == float_round_up ) roundIncrement = 0;
141             }
142             else {
143                 if ( roundingMode == float_round_down ) roundIncrement = 0;
144             }
145         }
146     }
147     roundBits = absZ & 0x7F;
148     absZ = ( absZ + roundIncrement )>>7;
149     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
150     z = absZ;
151     if ( zSign ) z = - z;
152     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
153         float_raise( float_flag_invalid );
154         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
155     }
156     if ( roundBits ) float_set_inexact();
157     return z;
158 
159 }
160 
161 /*
162 -------------------------------------------------------------------------------
163 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
164 `absZ1', with binary point between bits 63 and 64 (between the input words),
165 and returns the properly rounded 64-bit integer corresponding to the input.
166 If `zSign' is 1, the input is negated before being converted to an integer.
167 Ordinarily, the fixed-point input is simply rounded to an integer, with
168 the inexact exception raised if the input cannot be represented exactly as
169 an integer.  However, if the fixed-point input is too large, the invalid
170 exception is raised and the largest positive or negative integer is
171 returned.
172 -------------------------------------------------------------------------------
173 */
roundAndPackInt64(flag zSign,bits64 absZ0,bits64 absZ1)174 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
175 {
176     int8 roundingMode;
177     flag roundNearestEven, increment;
178     int64 z;
179 
180     roundingMode = float_rounding_mode();
181     roundNearestEven = ( roundingMode == float_round_nearest_even );
182     increment = ( (sbits64) absZ1 < 0 );
183     if ( ! roundNearestEven ) {
184         if ( roundingMode == float_round_to_zero ) {
185             increment = 0;
186         }
187         else {
188             if ( zSign ) {
189                 increment = ( roundingMode == float_round_down ) && absZ1;
190             }
191             else {
192                 increment = ( roundingMode == float_round_up ) && absZ1;
193             }
194         }
195     }
196     if ( increment ) {
197         ++absZ0;
198         if ( absZ0 == 0 ) goto overflow;
199         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
200     }
201     z = absZ0;
202     if ( zSign ) z = - z;
203     if ( z && ( ( z < 0 ) ^ zSign ) ) {
204  overflow:
205         float_raise( float_flag_invalid );
206         return
207               zSign ? (sbits64) LIT64( 0x8000000000000000 )
208             : LIT64( 0x7FFFFFFFFFFFFFFF );
209     }
210     if ( absZ1 ) float_set_inexact();
211     return z;
212 
213 }
214 
215 #ifdef __alpha__
216 /*
217 -------------------------------------------------------------------------------
218 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
219 `absZ1', with binary point between bits 63 and 64 (between the input words),
220 and returns the properly rounded 64-bit integer corresponding to the input.
221 If `zSign' is 1, the input is negated before being converted to an integer.
222 Ordinarily, the fixed-point input is simply rounded to an integer, with
223 the inexact exception raised if the input cannot be represented exactly as
224 an integer.
225 -------------------------------------------------------------------------------
226 */
roundAndPackInt64NoOverflow(flag zSign,bits64 absZ0,bits64 absZ1)227 static int64 roundAndPackInt64NoOverflow( flag zSign, bits64 absZ0,
228     bits64 absZ1 )
229 {
230     int8 roundingMode;
231     flag roundNearestEven, increment;
232     int64 z;
233 
234     roundingMode = float_rounding_mode();
235     roundNearestEven = ( roundingMode == float_round_nearest_even );
236     increment = ( (sbits64) absZ1 < 0 );
237     if ( ! roundNearestEven ) {
238         if ( roundingMode == float_round_to_zero ) {
239             increment = 0;
240         }
241         else {
242             if ( zSign ) {
243                 increment = ( roundingMode == float_round_down ) && absZ1;
244             }
245             else {
246                 increment = ( roundingMode == float_round_up ) && absZ1;
247             }
248         }
249     }
250     if ( increment ) {
251         ++absZ0;
252         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
253     }
254     z = absZ0;
255     if ( zSign ) z = - z;
256     if ( absZ1 ) float_set_inexact();
257     return z;
258 
259 }
260 #endif /* __alpha__ */
261 #endif
262 
263 /*
264 -------------------------------------------------------------------------------
265 Returns the fraction bits of the single-precision floating-point value `a'.
266 -------------------------------------------------------------------------------
267 */
extractFloat32Frac(float32 a)268 INLINE bits32 extractFloat32Frac( float32 a )
269 {
270 
271     return a & 0x007FFFFF;
272 
273 }
274 
275 /*
276 -------------------------------------------------------------------------------
277 Returns the exponent bits of the single-precision floating-point value `a'.
278 -------------------------------------------------------------------------------
279 */
extractFloat32Exp(float32 a)280 INLINE int16 extractFloat32Exp( float32 a )
281 {
282 
283     return ( a>>23 ) & 0xFF;
284 
285 }
286 
287 /*
288 -------------------------------------------------------------------------------
289 Returns the sign bit of the single-precision floating-point value `a'.
290 -------------------------------------------------------------------------------
291 */
extractFloat32Sign(float32 a)292 INLINE flag extractFloat32Sign( float32 a )
293 {
294 
295     return a>>31;
296 
297 }
298 
299 /*
300 -------------------------------------------------------------------------------
301 Normalizes the subnormal single-precision floating-point value represented
302 by the denormalized significand `aSig'.  The normalized exponent and
303 significand are stored at the locations pointed to by `zExpPtr' and
304 `zSigPtr', respectively.
305 -------------------------------------------------------------------------------
306 */
307 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)308  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
309 {
310     int8 shiftCount;
311 
312     shiftCount = countLeadingZeros32( aSig ) - 8;
313     *zSigPtr = aSig<<shiftCount;
314     *zExpPtr = 1 - shiftCount;
315 
316 }
317 
318 /*
319 -------------------------------------------------------------------------------
320 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
321 single-precision floating-point value, returning the result.  After being
322 shifted into the proper positions, the three fields are simply added
323 together to form the result.  This means that any integer portion of `zSig'
324 will be added into the exponent.  Since a properly normalized significand
325 will have an integer portion equal to 1, the `zExp' input should be 1 less
326 than the desired result exponent whenever `zSig' is a complete, normalized
327 significand.
328 -------------------------------------------------------------------------------
329 */
packFloat32(flag zSign,int16 zExp,bits32 zSig)330 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
331 {
332 
333     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
334 
335 }
336 
337 /*
338 -------------------------------------------------------------------------------
339 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
340 and significand `zSig', and returns the proper single-precision floating-
341 point value corresponding to the abstract input.  Ordinarily, the abstract
342 value is simply rounded and packed into the single-precision format, with
343 the inexact exception raised if the abstract input cannot be represented
344 exactly.  However, if the abstract value is too large, the overflow and
345 inexact exceptions are raised and an infinity or maximal finite value is
346 returned.  If the abstract value is too small, the input value is rounded to
347 a subnormal number, and the underflow and inexact exceptions are raised if
348 the abstract input cannot be represented exactly as a subnormal single-
349 precision floating-point number.
350     The input significand `zSig' has its binary point between bits 30
351 and 29, which is 7 bits to the left of the usual location.  This shifted
352 significand must be normalized or smaller.  If `zSig' is not normalized,
353 `zExp' must be 0; in that case, the result returned is a subnormal number,
354 and it must not require rounding.  In the usual case that `zSig' is
355 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
356 The handling of underflow and overflow follows the IEC/IEEE Standard for
357 Binary Floating-Point Arithmetic.
358 -------------------------------------------------------------------------------
359 */
roundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)360 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
361 {
362     int8 roundingMode;
363     flag roundNearestEven;
364     int8 roundIncrement, roundBits;
365     flag isTiny;
366 
367     roundingMode = float_rounding_mode();
368     roundNearestEven = ( roundingMode == float_round_nearest_even );
369     roundIncrement = 0x40;
370     if ( ! roundNearestEven ) {
371         if ( roundingMode == float_round_to_zero ) {
372             roundIncrement = 0;
373         }
374         else {
375             roundIncrement = 0x7F;
376             if ( zSign ) {
377                 if ( roundingMode == float_round_up ) roundIncrement = 0;
378             }
379             else {
380                 if ( roundingMode == float_round_down ) roundIncrement = 0;
381             }
382         }
383     }
384     roundBits = zSig & 0x7F;
385     if ( 0xFD <= (bits16) zExp ) {
386         if (    ( 0xFD < zExp )
387              || (    ( zExp == 0xFD )
388                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
389            ) {
390             float_raise( float_flag_overflow | float_flag_inexact );
391             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
392         }
393         if ( zExp < 0 ) {
394             isTiny =
395                    ( float_detect_tininess == float_tininess_before_rounding )
396                 || ( zExp < -1 )
397                 || ( zSig + roundIncrement < 0x80000000 );
398             shift32RightJamming( zSig, - zExp, &zSig );
399             zExp = 0;
400             roundBits = zSig & 0x7F;
401             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
402         }
403     }
404     if ( roundBits ) float_set_inexact();
405     zSig = ( zSig + roundIncrement )>>7;
406     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
407     if ( zSig == 0 ) zExp = 0;
408     return packFloat32( zSign, zExp, zSig );
409 
410 }
411 
412 /*
413 -------------------------------------------------------------------------------
414 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
415 and significand `zSig', and returns the proper single-precision floating-
416 point value corresponding to the abstract input.  This routine is just like
417 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
418 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
419 floating-point exponent.
420 -------------------------------------------------------------------------------
421 */
422 float32
normalizeRoundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)423  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
424 {
425     int8 shiftCount;
426 
427     shiftCount = countLeadingZeros32( zSig ) - 1;
428     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
429 
430 }
431 
432 /*
433 -------------------------------------------------------------------------------
434 Returns the fraction bits of the double-precision floating-point value `a'.
435 -------------------------------------------------------------------------------
436 */
extractFloat64Frac(float64 a)437 INLINE bits64 extractFloat64Frac( float64 a )
438 {
439 
440     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
441 
442 }
443 
444 /*
445 -------------------------------------------------------------------------------
446 Returns the exponent bits of the double-precision floating-point value `a'.
447 -------------------------------------------------------------------------------
448 */
extractFloat64Exp(float64 a)449 INLINE int16 extractFloat64Exp( float64 a )
450 {
451 
452     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
453 
454 }
455 
456 /*
457 -------------------------------------------------------------------------------
458 Returns the sign bit of the double-precision floating-point value `a'.
459 -------------------------------------------------------------------------------
460 */
extractFloat64Sign(float64 a)461 INLINE flag extractFloat64Sign( float64 a )
462 {
463 
464     return FLOAT64_DEMANGLE(a)>>63;
465 
466 }
467 
468 /*
469 -------------------------------------------------------------------------------
470 Normalizes the subnormal double-precision floating-point value represented
471 by the denormalized significand `aSig'.  The normalized exponent and
472 significand are stored at the locations pointed to by `zExpPtr' and
473 `zSigPtr', respectively.
474 -------------------------------------------------------------------------------
475 */
476 static void
normalizeFloat64Subnormal(bits64 aSig,int16 * zExpPtr,bits64 * zSigPtr)477  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
478 {
479     int8 shiftCount;
480 
481     shiftCount = countLeadingZeros64( aSig ) - 11;
482     *zSigPtr = aSig<<shiftCount;
483     *zExpPtr = 1 - shiftCount;
484 
485 }
486 
487 /*
488 -------------------------------------------------------------------------------
489 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
490 double-precision floating-point value, returning the result.  After being
491 shifted into the proper positions, the three fields are simply added
492 together to form the result.  This means that any integer portion of `zSig'
493 will be added into the exponent.  Since a properly normalized significand
494 will have an integer portion equal to 1, the `zExp' input should be 1 less
495 than the desired result exponent whenever `zSig' is a complete, normalized
496 significand.
497 -------------------------------------------------------------------------------
498 */
packFloat64(flag zSign,int16 zExp,bits64 zSig)499 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
500 {
501 
502     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
503 			   ( ( (bits64) zExp )<<52 ) + zSig );
504 
505 }
506 
507 /*
508 -------------------------------------------------------------------------------
509 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
510 and significand `zSig', and returns the proper double-precision floating-
511 point value corresponding to the abstract input.  Ordinarily, the abstract
512 value is simply rounded and packed into the double-precision format, with
513 the inexact exception raised if the abstract input cannot be represented
514 exactly.  However, if the abstract value is too large, the overflow and
515 inexact exceptions are raised and an infinity or maximal finite value is
516 returned.  If the abstract value is too small, the input value is rounded to
517 a subnormal number, and the underflow and inexact exceptions are raised if
518 the abstract input cannot be represented exactly as a subnormal double-
519 precision floating-point number.
520     The input significand `zSig' has its binary point between bits 62
521 and 61, which is 10 bits to the left of the usual location.  This shifted
522 significand must be normalized or smaller.  If `zSig' is not normalized,
523 `zExp' must be 0; in that case, the result returned is a subnormal number,
524 and it must not require rounding.  In the usual case that `zSig' is
525 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
526 The handling of underflow and overflow follows the IEC/IEEE Standard for
527 Binary Floating-Point Arithmetic.
528 -------------------------------------------------------------------------------
529 */
roundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)530 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
531 {
532     int8 roundingMode;
533     flag roundNearestEven;
534     int16 roundIncrement, roundBits;
535     flag isTiny;
536 
537     roundingMode = float_rounding_mode();
538     roundNearestEven = ( roundingMode == float_round_nearest_even );
539     roundIncrement = 0x200;
540     if ( ! roundNearestEven ) {
541         if ( roundingMode == float_round_to_zero ) {
542             roundIncrement = 0;
543         }
544         else {
545             roundIncrement = 0x3FF;
546             if ( zSign ) {
547                 if ( roundingMode == float_round_up ) roundIncrement = 0;
548             }
549             else {
550                 if ( roundingMode == float_round_down ) roundIncrement = 0;
551             }
552         }
553     }
554     roundBits = zSig & 0x3FF;
555     if ( 0x7FD <= (bits16) zExp ) {
556         if (    ( 0x7FD < zExp )
557              || (    ( zExp == 0x7FD )
558                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
559            ) {
560             float_raise( float_flag_overflow | float_flag_inexact );
561             return FLOAT64_MANGLE(
562 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
563 		( roundIncrement == 0 ));
564         }
565         if ( zExp < 0 ) {
566             isTiny =
567                    ( float_detect_tininess == float_tininess_before_rounding )
568                 || ( zExp < -1 )
569                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
570             shift64RightJamming( zSig, - zExp, &zSig );
571             zExp = 0;
572             roundBits = zSig & 0x3FF;
573             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
574         }
575     }
576     if ( roundBits ) float_set_inexact();
577     zSig = ( zSig + roundIncrement )>>10;
578     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
579     if ( zSig == 0 ) zExp = 0;
580     return packFloat64( zSign, zExp, zSig );
581 
582 }
583 
584 /*
585 -------------------------------------------------------------------------------
586 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
587 and significand `zSig', and returns the proper double-precision floating-
588 point value corresponding to the abstract input.  This routine is just like
589 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
590 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
591 floating-point exponent.
592 -------------------------------------------------------------------------------
593 */
594 float64
normalizeRoundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)595  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
596 {
597     int8 shiftCount;
598 
599     shiftCount = countLeadingZeros64( zSig ) - 1;
600     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
601 
602 }
603 
604 #ifdef FLOATX80
605 
606 /*
607 -------------------------------------------------------------------------------
608 Returns the fraction bits of the extended double-precision floating-point
609 value `a'.
610 -------------------------------------------------------------------------------
611 */
extractFloatx80Frac(floatx80 a)612 INLINE bits64 extractFloatx80Frac( floatx80 a )
613 {
614 
615     return a.low;
616 
617 }
618 
619 /*
620 -------------------------------------------------------------------------------
621 Returns the exponent bits of the extended double-precision floating-point
622 value `a'.
623 -------------------------------------------------------------------------------
624 */
extractFloatx80Exp(floatx80 a)625 INLINE int32 extractFloatx80Exp( floatx80 a )
626 {
627 
628     return a.high & 0x7FFF;
629 
630 }
631 
632 /*
633 -------------------------------------------------------------------------------
634 Returns the sign bit of the extended double-precision floating-point value
635 `a'.
636 -------------------------------------------------------------------------------
637 */
extractFloatx80Sign(floatx80 a)638 INLINE flag extractFloatx80Sign( floatx80 a )
639 {
640 
641     return a.high>>15;
642 
643 }
644 
645 /*
646 -------------------------------------------------------------------------------
647 Normalizes the subnormal extended double-precision floating-point value
648 represented by the denormalized significand `aSig'.  The normalized exponent
649 and significand are stored at the locations pointed to by `zExpPtr' and
650 `zSigPtr', respectively.
651 -------------------------------------------------------------------------------
652 */
653 static void
normalizeFloatx80Subnormal(bits64 aSig,int32 * zExpPtr,bits64 * zSigPtr)654  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
655 {
656     int8 shiftCount;
657 
658     shiftCount = countLeadingZeros64( aSig );
659     *zSigPtr = aSig<<shiftCount;
660     *zExpPtr = 1 - shiftCount;
661 
662 }
663 
664 /*
665 -------------------------------------------------------------------------------
666 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
667 extended double-precision floating-point value, returning the result.
668 -------------------------------------------------------------------------------
669 */
packFloatx80(flag zSign,int32 zExp,bits64 zSig)670 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
671 {
672     floatx80 z;
673 
674     z.low = zSig;
675     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
676     return z;
677 
678 }
679 
680 /*
681 -------------------------------------------------------------------------------
682 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
683 and extended significand formed by the concatenation of `zSig0' and `zSig1',
684 and returns the proper extended double-precision floating-point value
685 corresponding to the abstract input.  Ordinarily, the abstract value is
686 rounded and packed into the extended double-precision format, with the
687 inexact exception raised if the abstract input cannot be represented
688 exactly.  However, if the abstract value is too large, the overflow and
689 inexact exceptions are raised and an infinity or maximal finite value is
690 returned.  If the abstract value is too small, the input value is rounded to
691 a subnormal number, and the underflow and inexact exceptions are raised if
692 the abstract input cannot be represented exactly as a subnormal extended
693 double-precision floating-point number.
694     If `roundingPrecision' is 32 or 64, the result is rounded to the same
695 number of bits as single or double precision, respectively.  Otherwise, the
696 result is rounded to the full precision of the extended double-precision
697 format.
698     The input significand must be normalized or smaller.  If the input
699 significand is not normalized, `zExp' must be 0; in that case, the result
700 returned is a subnormal number, and it must not require rounding.  The
701 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
702 Floating-Point Arithmetic.
703 -------------------------------------------------------------------------------
704 */
705 static floatx80
roundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)706  roundAndPackFloatx80(
707      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
708  )
709 {
710     int8 roundingMode;
711     flag roundNearestEven, increment, isTiny;
712     int64 roundIncrement, roundMask, roundBits;
713 
714     roundingMode = float_rounding_mode();
715     roundNearestEven = ( roundingMode == float_round_nearest_even );
716     if ( roundingPrecision == 80 ) goto precision80;
717     if ( roundingPrecision == 64 ) {
718         roundIncrement = LIT64( 0x0000000000000400 );
719         roundMask = LIT64( 0x00000000000007FF );
720     }
721     else if ( roundingPrecision == 32 ) {
722         roundIncrement = LIT64( 0x0000008000000000 );
723         roundMask = LIT64( 0x000000FFFFFFFFFF );
724     }
725     else {
726         goto precision80;
727     }
728     zSig0 |= ( zSig1 != 0 );
729     if ( ! roundNearestEven ) {
730         if ( roundingMode == float_round_to_zero ) {
731             roundIncrement = 0;
732         }
733         else {
734             roundIncrement = roundMask;
735             if ( zSign ) {
736                 if ( roundingMode == float_round_up ) roundIncrement = 0;
737             }
738             else {
739                 if ( roundingMode == float_round_down ) roundIncrement = 0;
740             }
741         }
742     }
743     roundBits = zSig0 & roundMask;
744     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
745         if (    ( 0x7FFE < zExp )
746              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
747            ) {
748             goto overflow;
749         }
750         if ( zExp <= 0 ) {
751             isTiny =
752                    ( float_detect_tininess == float_tininess_before_rounding )
753                 || ( zExp < 0 )
754                 || ( zSig0 <= zSig0 + roundIncrement );
755             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
756             zExp = 0;
757             roundBits = zSig0 & roundMask;
758             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
759             if ( roundBits ) float_set_inexact();
760             zSig0 += roundIncrement;
761             if ( (sbits64) zSig0 < 0 ) zExp = 1;
762             roundIncrement = roundMask + 1;
763             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
764                 roundMask |= roundIncrement;
765             }
766             zSig0 &= ~ roundMask;
767             return packFloatx80( zSign, zExp, zSig0 );
768         }
769     }
770     if ( roundBits ) float_set_inexact();
771     zSig0 += roundIncrement;
772     if ( zSig0 < roundIncrement ) {
773         ++zExp;
774         zSig0 = LIT64( 0x8000000000000000 );
775     }
776     roundIncrement = roundMask + 1;
777     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
778         roundMask |= roundIncrement;
779     }
780     zSig0 &= ~ roundMask;
781     if ( zSig0 == 0 ) zExp = 0;
782     return packFloatx80( zSign, zExp, zSig0 );
783  precision80:
784     increment = ( (sbits64) zSig1 < 0 );
785     if ( ! roundNearestEven ) {
786         if ( roundingMode == float_round_to_zero ) {
787             increment = 0;
788         }
789         else {
790             if ( zSign ) {
791                 increment = ( roundingMode == float_round_down ) && zSig1;
792             }
793             else {
794                 increment = ( roundingMode == float_round_up ) && zSig1;
795             }
796         }
797     }
798     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
799         if (    ( 0x7FFE < zExp )
800              || (    ( zExp == 0x7FFE )
801                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
802                   && increment
803                 )
804            ) {
805             roundMask = 0;
806  overflow:
807             float_raise( float_flag_overflow | float_flag_inexact );
808             if (    ( roundingMode == float_round_to_zero )
809                  || ( zSign && ( roundingMode == float_round_up ) )
810                  || ( ! zSign && ( roundingMode == float_round_down ) )
811                ) {
812                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
813             }
814             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
815         }
816         if ( zExp <= 0 ) {
817             isTiny =
818                    ( float_detect_tininess == float_tininess_before_rounding )
819                 || ( zExp < 0 )
820                 || ! increment
821                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
822             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
823             zExp = 0;
824             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
825             if ( zSig1 ) float_set_inexact();
826             if ( roundNearestEven ) {
827                 increment = ( (sbits64) zSig1 < 0 );
828             }
829             else {
830                 if ( zSign ) {
831                     increment = ( roundingMode == float_round_down ) && zSig1;
832                 }
833                 else {
834                     increment = ( roundingMode == float_round_up ) && zSig1;
835                 }
836             }
837             if ( increment ) {
838                 ++zSig0;
839                 zSig0 &=
840                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
841                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
842             }
843             return packFloatx80( zSign, zExp, zSig0 );
844         }
845     }
846     if ( zSig1 ) float_set_inexact();
847     if ( increment ) {
848         ++zSig0;
849         if ( zSig0 == 0 ) {
850             ++zExp;
851             zSig0 = LIT64( 0x8000000000000000 );
852         }
853         else {
854             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
855         }
856     }
857     else {
858         if ( zSig0 == 0 ) zExp = 0;
859     }
860     return packFloatx80( zSign, zExp, zSig0 );
861 
862 }
863 
864 /*
865 -------------------------------------------------------------------------------
866 Takes an abstract floating-point value having sign `zSign', exponent
867 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
868 and returns the proper extended double-precision floating-point value
869 corresponding to the abstract input.  This routine is just like
870 `roundAndPackFloatx80' except that the input significand does not have to be
871 normalized.
872 -------------------------------------------------------------------------------
873 */
874 static floatx80
normalizeRoundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)875  normalizeRoundAndPackFloatx80(
876      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
877  )
878 {
879     int8 shiftCount;
880 
881     if ( zSig0 == 0 ) {
882         zSig0 = zSig1;
883         zSig1 = 0;
884         zExp -= 64;
885     }
886     shiftCount = countLeadingZeros64( zSig0 );
887     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
888     zExp -= shiftCount;
889     return
890         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
891 
892 }
893 
894 #endif
895 
896 #ifdef FLOAT128
897 
898 /*
899 -------------------------------------------------------------------------------
900 Returns the least-significant 64 fraction bits of the quadruple-precision
901 floating-point value `a'.
902 -------------------------------------------------------------------------------
903 */
extractFloat128Frac1(float128 a)904 INLINE bits64 extractFloat128Frac1( float128 a )
905 {
906 
907     return a.low;
908 
909 }
910 
911 /*
912 -------------------------------------------------------------------------------
913 Returns the most-significant 48 fraction bits of the quadruple-precision
914 floating-point value `a'.
915 -------------------------------------------------------------------------------
916 */
extractFloat128Frac0(float128 a)917 INLINE bits64 extractFloat128Frac0( float128 a )
918 {
919 
920     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
921 
922 }
923 
924 /*
925 -------------------------------------------------------------------------------
926 Returns the exponent bits of the quadruple-precision floating-point value
927 `a'.
928 -------------------------------------------------------------------------------
929 */
extractFloat128Exp(float128 a)930 INLINE int32 extractFloat128Exp( float128 a )
931 {
932 
933     return ( a.high>>48 ) & 0x7FFF;
934 
935 }
936 
937 /*
938 -------------------------------------------------------------------------------
939 Returns the sign bit of the quadruple-precision floating-point value `a'.
940 -------------------------------------------------------------------------------
941 */
extractFloat128Sign(float128 a)942 INLINE flag extractFloat128Sign( float128 a )
943 {
944 
945     return a.high>>63;
946 
947 }
948 
949 /*
950 -------------------------------------------------------------------------------
951 Normalizes the subnormal quadruple-precision floating-point value
952 represented by the denormalized significand formed by the concatenation of
953 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
954 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
955 significand are stored at the location pointed to by `zSig0Ptr', and the
956 least significant 64 bits of the normalized significand are stored at the
957 location pointed to by `zSig1Ptr'.
958 -------------------------------------------------------------------------------
959 */
960 static void
normalizeFloat128Subnormal(bits64 aSig0,bits64 aSig1,int32 * zExpPtr,bits64 * zSig0Ptr,bits64 * zSig1Ptr)961  normalizeFloat128Subnormal(
962      bits64 aSig0,
963      bits64 aSig1,
964      int32 *zExpPtr,
965      bits64 *zSig0Ptr,
966      bits64 *zSig1Ptr
967  )
968 {
969     int8 shiftCount;
970 
971     if ( aSig0 == 0 ) {
972         shiftCount = countLeadingZeros64( aSig1 ) - 15;
973         if ( shiftCount < 0 ) {
974             *zSig0Ptr = aSig1>>( - shiftCount );
975             *zSig1Ptr = aSig1<<( shiftCount & 63 );
976         }
977         else {
978             *zSig0Ptr = aSig1<<shiftCount;
979             *zSig1Ptr = 0;
980         }
981         *zExpPtr = - shiftCount - 63;
982     }
983     else {
984         shiftCount = countLeadingZeros64( aSig0 ) - 15;
985         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
986         *zExpPtr = 1 - shiftCount;
987     }
988 
989 }
990 
991 /*
992 -------------------------------------------------------------------------------
993 Packs the sign `zSign', the exponent `zExp', and the significand formed
994 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
995 floating-point value, returning the result.  After being shifted into the
996 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
997 added together to form the most significant 32 bits of the result.  This
998 means that any integer portion of `zSig0' will be added into the exponent.
999 Since a properly normalized significand will have an integer portion equal
1000 to 1, the `zExp' input should be 1 less than the desired result exponent
1001 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1002 significand.
1003 -------------------------------------------------------------------------------
1004 */
1005 INLINE float128
packFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)1006  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1007 {
1008     float128 z;
1009 
1010     z.low = zSig1;
1011     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
1012     return z;
1013 
1014 }
1015 
1016 /*
1017 -------------------------------------------------------------------------------
1018 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1019 and extended significand formed by the concatenation of `zSig0', `zSig1',
1020 and `zSig2', and returns the proper quadruple-precision floating-point value
1021 corresponding to the abstract input.  Ordinarily, the abstract value is
1022 simply rounded and packed into the quadruple-precision format, with the
1023 inexact exception raised if the abstract input cannot be represented
1024 exactly.  However, if the abstract value is too large, the overflow and
1025 inexact exceptions are raised and an infinity or maximal finite value is
1026 returned.  If the abstract value is too small, the input value is rounded to
1027 a subnormal number, and the underflow and inexact exceptions are raised if
1028 the abstract input cannot be represented exactly as a subnormal quadruple-
1029 precision floating-point number.
1030     The input significand must be normalized or smaller.  If the input
1031 significand is not normalized, `zExp' must be 0; in that case, the result
1032 returned is a subnormal number, and it must not require rounding.  In the
1033 usual case that the input significand is normalized, `zExp' must be 1 less
1034 than the ``true'' floating-point exponent.  The handling of underflow and
1035 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1036 -------------------------------------------------------------------------------
1037 */
1038 static float128
roundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1,bits64 zSig2)1039  roundAndPackFloat128(
1040      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
1041 {
1042     int8 roundingMode;
1043     flag roundNearestEven, increment, isTiny;
1044 
1045     roundingMode = float_rounding_mode();
1046     roundNearestEven = ( roundingMode == float_round_nearest_even );
1047     increment = ( (sbits64) zSig2 < 0 );
1048     if ( ! roundNearestEven ) {
1049         if ( roundingMode == float_round_to_zero ) {
1050             increment = 0;
1051         }
1052         else {
1053             if ( zSign ) {
1054                 increment = ( roundingMode == float_round_down ) && zSig2;
1055             }
1056             else {
1057                 increment = ( roundingMode == float_round_up ) && zSig2;
1058             }
1059         }
1060     }
1061     if ( 0x7FFD <= (bits32) zExp ) {
1062         if (    ( 0x7FFD < zExp )
1063              || (    ( zExp == 0x7FFD )
1064                   && eq128(
1065                          LIT64( 0x0001FFFFFFFFFFFF ),
1066                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1067                          zSig0,
1068                          zSig1
1069                      )
1070                   && increment
1071                 )
1072            ) {
1073             float_raise( float_flag_overflow | float_flag_inexact );
1074             if (    ( roundingMode == float_round_to_zero )
1075                  || ( zSign && ( roundingMode == float_round_up ) )
1076                  || ( ! zSign && ( roundingMode == float_round_down ) )
1077                ) {
1078                 return
1079                     packFloat128(
1080                         zSign,
1081                         0x7FFE,
1082                         LIT64( 0x0000FFFFFFFFFFFF ),
1083                         LIT64( 0xFFFFFFFFFFFFFFFF )
1084                     );
1085             }
1086             return packFloat128( zSign, 0x7FFF, 0, 0 );
1087         }
1088         if ( zExp < 0 ) {
1089             isTiny =
1090                    ( float_detect_tininess == float_tininess_before_rounding )
1091                 || ( zExp < -1 )
1092                 || ! increment
1093                 || lt128(
1094                        zSig0,
1095                        zSig1,
1096                        LIT64( 0x0001FFFFFFFFFFFF ),
1097                        LIT64( 0xFFFFFFFFFFFFFFFF )
1098                    );
1099             shift128ExtraRightJamming(
1100                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1101             zExp = 0;
1102             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1103             if ( roundNearestEven ) {
1104                 increment = ( (sbits64) zSig2 < 0 );
1105             }
1106             else {
1107                 if ( zSign ) {
1108                     increment = ( roundingMode == float_round_down ) && zSig2;
1109                 }
1110                 else {
1111                     increment = ( roundingMode == float_round_up ) && zSig2;
1112                 }
1113             }
1114         }
1115     }
1116     if ( zSig2 ) float_set_inexact();
1117     if ( increment ) {
1118         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1119         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1120     }
1121     else {
1122         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1123     }
1124     return packFloat128( zSign, zExp, zSig0, zSig1 );
1125 
1126 }
1127 
1128 /*
1129 -------------------------------------------------------------------------------
1130 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1131 and significand formed by the concatenation of `zSig0' and `zSig1', and
1132 returns the proper quadruple-precision floating-point value corresponding
1133 to the abstract input.  This routine is just like `roundAndPackFloat128'
1134 except that the input significand has fewer bits and does not have to be
1135 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1136 point exponent.
1137 -------------------------------------------------------------------------------
1138 */
1139 static float128
normalizeRoundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)1140  normalizeRoundAndPackFloat128(
1141      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1142 {
1143     int8 shiftCount;
1144     bits64 zSig2;
1145 
1146     if ( zSig0 == 0 ) {
1147         zSig0 = zSig1;
1148         zSig1 = 0;
1149         zExp -= 64;
1150     }
1151     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1152     if ( 0 <= shiftCount ) {
1153         zSig2 = 0;
1154         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1155     }
1156     else {
1157         shift128ExtraRightJamming(
1158             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1159     }
1160     zExp -= shiftCount;
1161     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1162 
1163 }
1164 
1165 #endif
1166 
1167 /*
1168 -------------------------------------------------------------------------------
1169 Returns the result of converting the 32-bit two's complement integer `a'
1170 to the single-precision floating-point format.  The conversion is performed
1171 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1172 -------------------------------------------------------------------------------
1173 */
int32_to_float32(int32 a)1174 float32 int32_to_float32( int32 a )
1175 {
1176     flag zSign;
1177 
1178     if ( a == 0 ) return 0;
1179     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1180     zSign = ( a < 0 );
1181     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1182 
1183 }
1184 
1185 /*
1186 -------------------------------------------------------------------------------
1187 Returns the result of converting the 32-bit two's complement integer `a'
1188 to the double-precision floating-point format.  The conversion is performed
1189 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1190 -------------------------------------------------------------------------------
1191 */
int32_to_float64(int32 a)1192 float64 int32_to_float64( int32 a )
1193 {
1194     flag zSign;
1195     uint32 absA;
1196     int8 shiftCount;
1197     bits64 zSig;
1198 
1199     if ( a == 0 ) return 0;
1200     zSign = ( a < 0 );
1201     absA = zSign ? - a : a;
1202     shiftCount = countLeadingZeros32( absA ) + 21;
1203     zSig = absA;
1204     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1205 
1206 }
1207 
1208 #ifdef FLOATX80
1209 
1210 /*
1211 -------------------------------------------------------------------------------
1212 Returns the result of converting the 32-bit two's complement integer `a'
1213 to the extended double-precision floating-point format.  The conversion
1214 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1215 Arithmetic.
1216 -------------------------------------------------------------------------------
1217 */
int32_to_floatx80(int32 a)1218 floatx80 int32_to_floatx80( int32 a )
1219 {
1220     flag zSign;
1221     uint32 absA;
1222     int8 shiftCount;
1223     bits64 zSig;
1224 
1225     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1226     zSign = ( a < 0 );
1227     absA = zSign ? - a : a;
1228     shiftCount = countLeadingZeros32( absA ) + 32;
1229     zSig = absA;
1230     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1231 
1232 }
1233 
1234 #endif
1235 
1236 #ifdef FLOAT128
1237 
1238 /*
1239 -------------------------------------------------------------------------------
1240 Returns the result of converting the 32-bit two's complement integer `a' to
1241 the quadruple-precision floating-point format.  The conversion is performed
1242 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1243 -------------------------------------------------------------------------------
1244 */
int32_to_float128(int32 a)1245 float128 int32_to_float128( int32 a )
1246 {
1247     flag zSign;
1248     uint32 absA;
1249     int8 shiftCount;
1250     bits64 zSig0;
1251 
1252     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1253     zSign = ( a < 0 );
1254     absA = zSign ? - a : a;
1255     shiftCount = countLeadingZeros32( absA ) + 17;
1256     zSig0 = absA;
1257     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1258 
1259 }
1260 
1261 #endif
1262 
1263 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1264 /*
1265 -------------------------------------------------------------------------------
1266 Returns the result of converting the 64-bit two's complement integer `a'
1267 to the single-precision floating-point format.  The conversion is performed
1268 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1269 -------------------------------------------------------------------------------
1270 */
int64_to_float32(int64 a)1271 float32 int64_to_float32( int64 a )
1272 {
1273     flag zSign;
1274     uint64 absA;
1275     int8 shiftCount;
1276 
1277     if ( a == 0 ) return 0;
1278     zSign = ( a < 0 );
1279     absA = zSign ? - a : a;
1280     shiftCount = countLeadingZeros64( absA ) - 40;
1281     if ( 0 <= shiftCount ) {
1282         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1283     }
1284     else {
1285         shiftCount += 7;
1286         if ( shiftCount < 0 ) {
1287             shift64RightJamming( absA, - shiftCount, &absA );
1288         }
1289         else {
1290             absA <<= shiftCount;
1291         }
1292         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1293     }
1294 
1295 }
1296 
1297 /*
1298 -------------------------------------------------------------------------------
1299 Returns the result of converting the 64-bit two's complement integer `a'
1300 to the double-precision floating-point format.  The conversion is performed
1301 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1302 -------------------------------------------------------------------------------
1303 */
int64_to_float64(int64 a)1304 float64 int64_to_float64( int64 a )
1305 {
1306     flag zSign;
1307 
1308     if ( a == 0 ) return 0;
1309     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1310         return packFloat64( 1, 0x43E, 0 );
1311     }
1312     zSign = ( a < 0 );
1313     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1314 
1315 }
1316 
1317 #ifdef FLOATX80
1318 
1319 /*
1320 -------------------------------------------------------------------------------
1321 Returns the result of converting the 64-bit two's complement integer `a'
1322 to the extended double-precision floating-point format.  The conversion
1323 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1324 Arithmetic.
1325 -------------------------------------------------------------------------------
1326 */
int64_to_floatx80(int64 a)1327 floatx80 int64_to_floatx80( int64 a )
1328 {
1329     flag zSign;
1330     uint64 absA;
1331     int8 shiftCount;
1332 
1333     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1334     zSign = ( a < 0 );
1335     absA = zSign ? - a : a;
1336     shiftCount = countLeadingZeros64( absA );
1337     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1338 
1339 }
1340 
1341 #endif
1342 
1343 #ifdef FLOAT128
1344 
1345 /*
1346 -------------------------------------------------------------------------------
1347 Returns the result of converting the 64-bit two's complement integer `a' to
1348 the quadruple-precision floating-point format.  The conversion is performed
1349 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1350 -------------------------------------------------------------------------------
1351 */
int64_to_float128(int64 a)1352 float128 int64_to_float128( int64 a )
1353 {
1354     flag zSign;
1355     uint64 absA;
1356     int8 shiftCount;
1357     int32 zExp;
1358     bits64 zSig0, zSig1;
1359 
1360     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1361     zSign = ( a < 0 );
1362     absA = zSign ? - a : a;
1363     shiftCount = countLeadingZeros64( absA ) + 49;
1364     zExp = 0x406E - shiftCount;
1365     if ( 64 <= shiftCount ) {
1366         zSig1 = 0;
1367         zSig0 = absA;
1368         shiftCount -= 64;
1369     }
1370     else {
1371         zSig1 = absA;
1372         zSig0 = 0;
1373     }
1374     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1375     return packFloat128( zSign, zExp, zSig0, zSig1 );
1376 
1377 }
1378 
1379 #endif
1380 #endif /* !SOFTFLOAT_FOR_GCC */
1381 
1382 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1383 /*
1384 -------------------------------------------------------------------------------
1385 Returns the result of converting the single-precision floating-point value
1386 `a' to the 32-bit two's complement integer format.  The conversion is
1387 performed according to the IEC/IEEE Standard for Binary Floating-Point
1388 Arithmetic---which means in particular that the conversion is rounded
1389 according to the current rounding mode.  If `a' is a NaN, the largest
1390 positive integer is returned.  Otherwise, if the conversion overflows, the
1391 largest integer with the same sign as `a' is returned.
1392 -------------------------------------------------------------------------------
1393 */
float32_to_int32(float32 a)1394 int32 float32_to_int32( float32 a )
1395 {
1396     flag aSign;
1397     int16 aExp, shiftCount;
1398     bits32 aSig;
1399     bits64 aSig64;
1400 
1401     aSig = extractFloat32Frac( a );
1402     aExp = extractFloat32Exp( a );
1403     aSign = extractFloat32Sign( a );
1404     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1405     if ( aExp ) aSig |= 0x00800000;
1406     shiftCount = 0xAF - aExp;
1407     aSig64 = aSig;
1408     aSig64 <<= 32;
1409     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1410     return roundAndPackInt32( aSign, aSig64 );
1411 
1412 }
1413 #endif /* !SOFTFLOAT_FOR_GCC */
1414 
1415 /*
1416 -------------------------------------------------------------------------------
1417 Returns the result of converting the single-precision floating-point value
1418 `a' to the 32-bit two's complement integer format.  The conversion is
1419 performed according to the IEC/IEEE Standard for Binary Floating-Point
1420 Arithmetic, except that the conversion is always rounded toward zero.
1421 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1422 the conversion overflows, the largest integer with the same sign as `a' is
1423 returned.
1424 -------------------------------------------------------------------------------
1425 */
float32_to_int32_round_to_zero(float32 a)1426 int32 float32_to_int32_round_to_zero( float32 a )
1427 {
1428     flag aSign;
1429     int16 aExp, shiftCount;
1430     bits32 aSig;
1431     int32 z;
1432 
1433     aSig = extractFloat32Frac( a );
1434     aExp = extractFloat32Exp( a );
1435     aSign = extractFloat32Sign( a );
1436     shiftCount = aExp - 0x9E;
1437     if ( 0 <= shiftCount ) {
1438         if ( a != 0xCF000000 ) {
1439             float_raise( float_flag_invalid );
1440             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1441         }
1442         return (sbits32) 0x80000000;
1443     }
1444     else if ( aExp <= 0x7E ) {
1445         if ( aExp | aSig ) float_set_inexact();
1446         return 0;
1447     }
1448     aSig = ( aSig | 0x00800000 )<<8;
1449     z = aSig>>( - shiftCount );
1450     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1451         float_set_inexact();
1452     }
1453     if ( aSign ) z = - z;
1454     return z;
1455 
1456 }
1457 
1458 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1459 /*
1460 -------------------------------------------------------------------------------
1461 Returns the result of converting the single-precision floating-point value
1462 `a' to the 64-bit two's complement integer format.  The conversion is
1463 performed according to the IEC/IEEE Standard for Binary Floating-Point
1464 Arithmetic---which means in particular that the conversion is rounded
1465 according to the current rounding mode.  If `a' is a NaN, the largest
1466 positive integer is returned.  Otherwise, if the conversion overflows, the
1467 largest integer with the same sign as `a' is returned.
1468 -------------------------------------------------------------------------------
1469 */
float32_to_int64(float32 a)1470 int64 float32_to_int64( float32 a )
1471 {
1472     flag aSign;
1473     int16 aExp, shiftCount;
1474     bits32 aSig;
1475     bits64 aSig64, aSigExtra;
1476 
1477     aSig = extractFloat32Frac( a );
1478     aExp = extractFloat32Exp( a );
1479     aSign = extractFloat32Sign( a );
1480     shiftCount = 0xBE - aExp;
1481     if ( shiftCount < 0 ) {
1482         float_raise( float_flag_invalid );
1483         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1484             return LIT64( 0x7FFFFFFFFFFFFFFF );
1485         }
1486         return (sbits64) LIT64( 0x8000000000000000 );
1487     }
1488     if ( aExp ) aSig |= 0x00800000;
1489     aSig64 = aSig;
1490     aSig64 <<= 40;
1491     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1492     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1493 
1494 }
1495 
1496 /*
1497 -------------------------------------------------------------------------------
1498 Returns the result of converting the single-precision floating-point value
1499 `a' to the 64-bit two's complement integer format.  The conversion is
1500 performed according to the IEC/IEEE Standard for Binary Floating-Point
1501 Arithmetic, except that the conversion is always rounded toward zero.  If
1502 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1503 conversion overflows, the largest integer with the same sign as `a' is
1504 returned.
1505 -------------------------------------------------------------------------------
1506 */
float32_to_int64_round_to_zero(float32 a)1507 int64 float32_to_int64_round_to_zero( float32 a )
1508 {
1509     flag aSign;
1510     int16 aExp, shiftCount;
1511     bits32 aSig;
1512     bits64 aSig64;
1513     int64 z;
1514 
1515     aSig = extractFloat32Frac( a );
1516     aExp = extractFloat32Exp( a );
1517     aSign = extractFloat32Sign( a );
1518     shiftCount = aExp - 0xBE;
1519     if ( 0 <= shiftCount ) {
1520         if ( a != 0xDF000000 ) {
1521             float_raise( float_flag_invalid );
1522             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1523                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1524             }
1525         }
1526         return (sbits64) LIT64( 0x8000000000000000 );
1527     }
1528     else if ( aExp <= 0x7E ) {
1529         if ( aExp | aSig ) float_set_inexact();
1530         return 0;
1531     }
1532     aSig64 = aSig | 0x00800000;
1533     aSig64 <<= 40;
1534     z = aSig64>>( - shiftCount );
1535     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1536         float_set_inexact();
1537     }
1538     if ( aSign ) z = - z;
1539     return z;
1540 
1541 }
1542 #endif /* !SOFTFLOAT_FOR_GCC */
1543 
1544 /*
1545 -------------------------------------------------------------------------------
1546 Returns the result of converting the single-precision floating-point value
1547 `a' to the double-precision floating-point format.  The conversion is
1548 performed according to the IEC/IEEE Standard for Binary Floating-Point
1549 Arithmetic.
1550 -------------------------------------------------------------------------------
1551 */
float32_to_float64(float32 a)1552 float64 float32_to_float64( float32 a )
1553 {
1554     flag aSign;
1555     int16 aExp;
1556     bits32 aSig;
1557 
1558     aSig = extractFloat32Frac( a );
1559     aExp = extractFloat32Exp( a );
1560     aSign = extractFloat32Sign( a );
1561     if ( aExp == 0xFF ) {
1562         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1563         return packFloat64( aSign, 0x7FF, 0 );
1564     }
1565     if ( aExp == 0 ) {
1566         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1567         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1568         --aExp;
1569     }
1570     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1571 
1572 }
1573 
1574 #ifdef FLOATX80
1575 
1576 /*
1577 -------------------------------------------------------------------------------
1578 Returns the result of converting the single-precision floating-point value
1579 `a' to the extended double-precision floating-point format.  The conversion
1580 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1581 Arithmetic.
1582 -------------------------------------------------------------------------------
1583 */
float32_to_floatx80(float32 a)1584 floatx80 float32_to_floatx80( float32 a )
1585 {
1586     flag aSign;
1587     int16 aExp;
1588     bits32 aSig;
1589 
1590     aSig = extractFloat32Frac( a );
1591     aExp = extractFloat32Exp( a );
1592     aSign = extractFloat32Sign( a );
1593     if ( aExp == 0xFF ) {
1594         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1595         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1596     }
1597     if ( aExp == 0 ) {
1598         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1599         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1600     }
1601     aSig |= 0x00800000;
1602     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1603 
1604 }
1605 
1606 #endif
1607 
1608 #ifdef FLOAT128
1609 
1610 /*
1611 -------------------------------------------------------------------------------
1612 Returns the result of converting the single-precision floating-point value
1613 `a' to the double-precision floating-point format.  The conversion is
1614 performed according to the IEC/IEEE Standard for Binary Floating-Point
1615 Arithmetic.
1616 -------------------------------------------------------------------------------
1617 */
float32_to_float128(float32 a)1618 float128 float32_to_float128( float32 a )
1619 {
1620     flag aSign;
1621     int16 aExp;
1622     bits32 aSig;
1623 
1624     aSig = extractFloat32Frac( a );
1625     aExp = extractFloat32Exp( a );
1626     aSign = extractFloat32Sign( a );
1627     if ( aExp == 0xFF ) {
1628         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1629         return packFloat128( aSign, 0x7FFF, 0, 0 );
1630     }
1631     if ( aExp == 0 ) {
1632         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1633         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1634         --aExp;
1635     }
1636     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1637 
1638 }
1639 
1640 #endif
1641 
1642 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1643 /*
1644 -------------------------------------------------------------------------------
1645 Rounds the single-precision floating-point value `a' to an integer, and
1646 returns the result as a single-precision floating-point value.  The
1647 operation is performed according to the IEC/IEEE Standard for Binary
1648 Floating-Point Arithmetic.
1649 -------------------------------------------------------------------------------
1650 */
float32_round_to_int(float32 a)1651 float32 float32_round_to_int( float32 a )
1652 {
1653     flag aSign;
1654     int16 aExp;
1655     bits32 lastBitMask, roundBitsMask;
1656     int8 roundingMode;
1657     float32 z;
1658 
1659     aExp = extractFloat32Exp( a );
1660     if ( 0x96 <= aExp ) {
1661         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1662             return propagateFloat32NaN( a, a );
1663         }
1664         return a;
1665     }
1666     if ( aExp <= 0x7E ) {
1667         if ( (bits32) ( a<<1 ) == 0 ) return a;
1668         float_set_inexact();
1669         aSign = extractFloat32Sign( a );
1670         switch ( float_rounding_mode() ) {
1671          case float_round_nearest_even:
1672             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1673                 return packFloat32( aSign, 0x7F, 0 );
1674             }
1675             break;
1676          case float_round_down:
1677             return aSign ? 0xBF800000 : 0;
1678          case float_round_up:
1679             return aSign ? 0x80000000 : 0x3F800000;
1680         }
1681         return packFloat32( aSign, 0, 0 );
1682     }
1683     lastBitMask = 1;
1684     lastBitMask <<= 0x96 - aExp;
1685     roundBitsMask = lastBitMask - 1;
1686     z = a;
1687     roundingMode = float_rounding_mode();
1688     if ( roundingMode == float_round_nearest_even ) {
1689         z += lastBitMask>>1;
1690         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1691     }
1692     else if ( roundingMode != float_round_to_zero ) {
1693         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1694             z += roundBitsMask;
1695         }
1696     }
1697     z &= ~ roundBitsMask;
1698     if ( z != a ) float_set_inexact();
1699     return z;
1700 
1701 }
1702 #endif /* !SOFTFLOAT_FOR_GCC */
1703 
1704 /*
1705 -------------------------------------------------------------------------------
1706 Returns the result of adding the absolute values of the single-precision
1707 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1708 before being returned.  `zSign' is ignored if the result is a NaN.
1709 The addition is performed according to the IEC/IEEE Standard for Binary
1710 Floating-Point Arithmetic.
1711 -------------------------------------------------------------------------------
1712 */
addFloat32Sigs(float32 a,float32 b,flag zSign)1713 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1714 {
1715     int16 aExp, bExp, zExp;
1716     bits32 aSig, bSig, zSig;
1717     int16 expDiff;
1718 
1719     aSig = extractFloat32Frac( a );
1720     aExp = extractFloat32Exp( a );
1721     bSig = extractFloat32Frac( b );
1722     bExp = extractFloat32Exp( b );
1723     expDiff = aExp - bExp;
1724     aSig <<= 6;
1725     bSig <<= 6;
1726     if ( 0 < expDiff ) {
1727         if ( aExp == 0xFF ) {
1728             if ( aSig ) return propagateFloat32NaN( a, b );
1729             return a;
1730         }
1731         if ( bExp == 0 ) {
1732             --expDiff;
1733         }
1734         else {
1735             bSig |= 0x20000000;
1736         }
1737         shift32RightJamming( bSig, expDiff, &bSig );
1738         zExp = aExp;
1739     }
1740     else if ( expDiff < 0 ) {
1741         if ( bExp == 0xFF ) {
1742             if ( bSig ) return propagateFloat32NaN( a, b );
1743             return packFloat32( zSign, 0xFF, 0 );
1744         }
1745         if ( aExp == 0 ) {
1746             ++expDiff;
1747         }
1748         else {
1749             aSig |= 0x20000000;
1750         }
1751         shift32RightJamming( aSig, - expDiff, &aSig );
1752         zExp = bExp;
1753     }
1754     else {
1755         if ( aExp == 0xFF ) {
1756             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1757             return a;
1758         }
1759         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1760         zSig = 0x40000000 + aSig + bSig;
1761         zExp = aExp;
1762         goto roundAndPack;
1763     }
1764     aSig |= 0x20000000;
1765     zSig = ( aSig + bSig )<<1;
1766     --zExp;
1767     if ( (sbits32) zSig < 0 ) {
1768         zSig = aSig + bSig;
1769         ++zExp;
1770     }
1771  roundAndPack:
1772     return roundAndPackFloat32( zSign, zExp, zSig );
1773 
1774 }
1775 
1776 /*
1777 -------------------------------------------------------------------------------
1778 Returns the result of subtracting the absolute values of the single-
1779 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1780 difference is negated before being returned.  `zSign' is ignored if the
1781 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1782 Standard for Binary Floating-Point Arithmetic.
1783 -------------------------------------------------------------------------------
1784 */
subFloat32Sigs(float32 a,float32 b,flag zSign)1785 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1786 {
1787     int16 aExp, bExp, zExp;
1788     bits32 aSig, bSig, zSig;
1789     int16 expDiff;
1790 
1791     aSig = extractFloat32Frac( a );
1792     aExp = extractFloat32Exp( a );
1793     bSig = extractFloat32Frac( b );
1794     bExp = extractFloat32Exp( b );
1795     expDiff = aExp - bExp;
1796     aSig <<= 7;
1797     bSig <<= 7;
1798     if ( 0 < expDiff ) goto aExpBigger;
1799     if ( expDiff < 0 ) goto bExpBigger;
1800     if ( aExp == 0xFF ) {
1801         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1802         float_raise( float_flag_invalid );
1803         return float32_default_nan;
1804     }
1805     if ( aExp == 0 ) {
1806         aExp = 1;
1807         bExp = 1;
1808     }
1809     if ( bSig < aSig ) goto aBigger;
1810     if ( aSig < bSig ) goto bBigger;
1811     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1812  bExpBigger:
1813     if ( bExp == 0xFF ) {
1814         if ( bSig ) return propagateFloat32NaN( a, b );
1815         return packFloat32( zSign ^ 1, 0xFF, 0 );
1816     }
1817     if ( aExp == 0 ) {
1818         ++expDiff;
1819     }
1820     else {
1821         aSig |= 0x40000000;
1822     }
1823     shift32RightJamming( aSig, - expDiff, &aSig );
1824     bSig |= 0x40000000;
1825  bBigger:
1826     zSig = bSig - aSig;
1827     zExp = bExp;
1828     zSign ^= 1;
1829     goto normalizeRoundAndPack;
1830  aExpBigger:
1831     if ( aExp == 0xFF ) {
1832         if ( aSig ) return propagateFloat32NaN( a, b );
1833         return a;
1834     }
1835     if ( bExp == 0 ) {
1836         --expDiff;
1837     }
1838     else {
1839         bSig |= 0x40000000;
1840     }
1841     shift32RightJamming( bSig, expDiff, &bSig );
1842     aSig |= 0x40000000;
1843  aBigger:
1844     zSig = aSig - bSig;
1845     zExp = aExp;
1846  normalizeRoundAndPack:
1847     --zExp;
1848     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1849 
1850 }
1851 
1852 /*
1853 -------------------------------------------------------------------------------
1854 Returns the result of adding the single-precision floating-point values `a'
1855 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1856 Binary Floating-Point Arithmetic.
1857 -------------------------------------------------------------------------------
1858 */
float32_add(float32 a,float32 b)1859 float32 float32_add( float32 a, float32 b )
1860 {
1861     flag aSign, bSign;
1862 
1863     aSign = extractFloat32Sign( a );
1864     bSign = extractFloat32Sign( b );
1865     if ( aSign == bSign ) {
1866         return addFloat32Sigs( a, b, aSign );
1867     }
1868     else {
1869         return subFloat32Sigs( a, b, aSign );
1870     }
1871 
1872 }
1873 
1874 /*
1875 -------------------------------------------------------------------------------
1876 Returns the result of subtracting the single-precision floating-point values
1877 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1878 for Binary Floating-Point Arithmetic.
1879 -------------------------------------------------------------------------------
1880 */
float32_sub(float32 a,float32 b)1881 float32 float32_sub( float32 a, float32 b )
1882 {
1883     flag aSign, bSign;
1884 
1885     aSign = extractFloat32Sign( a );
1886     bSign = extractFloat32Sign( b );
1887     if ( aSign == bSign ) {
1888         return subFloat32Sigs( a, b, aSign );
1889     }
1890     else {
1891         return addFloat32Sigs( a, b, aSign );
1892     }
1893 
1894 }
1895 
1896 /*
1897 -------------------------------------------------------------------------------
1898 Returns the result of multiplying the single-precision floating-point values
1899 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1900 for Binary Floating-Point Arithmetic.
1901 -------------------------------------------------------------------------------
1902 */
float32_mul(float32 a,float32 b)1903 float32 float32_mul( float32 a, float32 b )
1904 {
1905     flag aSign, bSign, zSign;
1906     int16 aExp, bExp, zExp;
1907     bits32 aSig, bSig;
1908     bits64 zSig64;
1909     bits32 zSig;
1910 
1911     aSig = extractFloat32Frac( a );
1912     aExp = extractFloat32Exp( a );
1913     aSign = extractFloat32Sign( a );
1914     bSig = extractFloat32Frac( b );
1915     bExp = extractFloat32Exp( b );
1916     bSign = extractFloat32Sign( b );
1917     zSign = aSign ^ bSign;
1918     if ( aExp == 0xFF ) {
1919         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1920             return propagateFloat32NaN( a, b );
1921         }
1922         if ( ( bExp | bSig ) == 0 ) {
1923             float_raise( float_flag_invalid );
1924             return float32_default_nan;
1925         }
1926         return packFloat32( zSign, 0xFF, 0 );
1927     }
1928     if ( bExp == 0xFF ) {
1929         if ( bSig ) return propagateFloat32NaN( a, b );
1930         if ( ( aExp | aSig ) == 0 ) {
1931             float_raise( float_flag_invalid );
1932             return float32_default_nan;
1933         }
1934         return packFloat32( zSign, 0xFF, 0 );
1935     }
1936     if ( aExp == 0 ) {
1937         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1938         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1939     }
1940     if ( bExp == 0 ) {
1941         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1942         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1943     }
1944     zExp = aExp + bExp - 0x7F;
1945     aSig = ( aSig | 0x00800000 )<<7;
1946     bSig = ( bSig | 0x00800000 )<<8;
1947     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1948     zSig = zSig64;
1949     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1950         zSig <<= 1;
1951         --zExp;
1952     }
1953     return roundAndPackFloat32( zSign, zExp, zSig );
1954 
1955 }
1956 
1957 /*
1958 -------------------------------------------------------------------------------
1959 Returns the result of dividing the single-precision floating-point value `a'
1960 by the corresponding value `b'.  The operation is performed according to the
1961 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1962 -------------------------------------------------------------------------------
1963 */
float32_div(float32 a,float32 b)1964 float32 float32_div( float32 a, float32 b )
1965 {
1966     flag aSign, bSign, zSign;
1967     int16 aExp, bExp, zExp;
1968     bits32 aSig, bSig, zSig;
1969 
1970     aSig = extractFloat32Frac( a );
1971     aExp = extractFloat32Exp( a );
1972     aSign = extractFloat32Sign( a );
1973     bSig = extractFloat32Frac( b );
1974     bExp = extractFloat32Exp( b );
1975     bSign = extractFloat32Sign( b );
1976     zSign = aSign ^ bSign;
1977     if ( aExp == 0xFF ) {
1978         if ( aSig ) return propagateFloat32NaN( a, b );
1979         if ( bExp == 0xFF ) {
1980             if ( bSig ) return propagateFloat32NaN( a, b );
1981             float_raise( float_flag_invalid );
1982             return float32_default_nan;
1983         }
1984         return packFloat32( zSign, 0xFF, 0 );
1985     }
1986     if ( bExp == 0xFF ) {
1987         if ( bSig ) return propagateFloat32NaN( a, b );
1988         return packFloat32( zSign, 0, 0 );
1989     }
1990     if ( bExp == 0 ) {
1991         if ( bSig == 0 ) {
1992             if ( ( aExp | aSig ) == 0 ) {
1993                 float_raise( float_flag_invalid );
1994                 return float32_default_nan;
1995             }
1996             float_raise( float_flag_divbyzero );
1997             return packFloat32( zSign, 0xFF, 0 );
1998         }
1999         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2000     }
2001     if ( aExp == 0 ) {
2002         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2003         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2004     }
2005     zExp = aExp - bExp + 0x7D;
2006     aSig = ( aSig | 0x00800000 )<<7;
2007     bSig = ( bSig | 0x00800000 )<<8;
2008     if ( bSig <= ( aSig + aSig ) ) {
2009         aSig >>= 1;
2010         ++zExp;
2011     }
2012     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2013     if ( ( zSig & 0x3F ) == 0 ) {
2014         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2015     }
2016     return roundAndPackFloat32( zSign, zExp, zSig );
2017 
2018 }
2019 
2020 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2021 /*
2022 -------------------------------------------------------------------------------
2023 Returns the remainder of the single-precision floating-point value `a'
2024 with respect to the corresponding value `b'.  The operation is performed
2025 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2026 -------------------------------------------------------------------------------
2027 */
float32_rem(float32 a,float32 b)2028 float32 float32_rem( float32 a, float32 b )
2029 {
2030     flag aSign, bSign, zSign;
2031     int16 aExp, bExp, expDiff;
2032     bits32 aSig, bSig;
2033     bits32 q;
2034     bits64 aSig64, bSig64, q64;
2035     bits32 alternateASig;
2036     sbits32 sigMean;
2037 
2038     aSig = extractFloat32Frac( a );
2039     aExp = extractFloat32Exp( a );
2040     aSign = extractFloat32Sign( a );
2041     bSig = extractFloat32Frac( b );
2042     bExp = extractFloat32Exp( b );
2043     bSign = extractFloat32Sign( b );
2044     if ( aExp == 0xFF ) {
2045         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2046             return propagateFloat32NaN( a, b );
2047         }
2048         float_raise( float_flag_invalid );
2049         return float32_default_nan;
2050     }
2051     if ( bExp == 0xFF ) {
2052         if ( bSig ) return propagateFloat32NaN( a, b );
2053         return a;
2054     }
2055     if ( bExp == 0 ) {
2056         if ( bSig == 0 ) {
2057             float_raise( float_flag_invalid );
2058             return float32_default_nan;
2059         }
2060         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2061     }
2062     if ( aExp == 0 ) {
2063         if ( aSig == 0 ) return a;
2064         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2065     }
2066     expDiff = aExp - bExp;
2067     aSig |= 0x00800000;
2068     bSig |= 0x00800000;
2069     if ( expDiff < 32 ) {
2070         aSig <<= 8;
2071         bSig <<= 8;
2072         if ( expDiff < 0 ) {
2073             if ( expDiff < -1 ) return a;
2074             aSig >>= 1;
2075         }
2076         q = ( bSig <= aSig );
2077         if ( q ) aSig -= bSig;
2078         if ( 0 < expDiff ) {
2079             q = ( ( (bits64) aSig )<<32 ) / bSig;
2080             q >>= 32 - expDiff;
2081             bSig >>= 2;
2082             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2083         }
2084         else {
2085             aSig >>= 2;
2086             bSig >>= 2;
2087         }
2088     }
2089     else {
2090         if ( bSig <= aSig ) aSig -= bSig;
2091         aSig64 = ( (bits64) aSig )<<40;
2092         bSig64 = ( (bits64) bSig )<<40;
2093         expDiff -= 64;
2094         while ( 0 < expDiff ) {
2095             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2096             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2097             aSig64 = - ( ( bSig * q64 )<<38 );
2098             expDiff -= 62;
2099         }
2100         expDiff += 64;
2101         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2102         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2103         q = q64>>( 64 - expDiff );
2104         bSig <<= 6;
2105         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2106     }
2107     do {
2108         alternateASig = aSig;
2109         ++q;
2110         aSig -= bSig;
2111     } while ( 0 <= (sbits32) aSig );
2112     sigMean = aSig + alternateASig;
2113     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2114         aSig = alternateASig;
2115     }
2116     zSign = ( (sbits32) aSig < 0 );
2117     if ( zSign ) aSig = - aSig;
2118     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2119 
2120 }
2121 #endif /* !SOFTFLOAT_FOR_GCC */
2122 
2123 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2124 /*
2125 -------------------------------------------------------------------------------
2126 Returns the square root of the single-precision floating-point value `a'.
2127 The operation is performed according to the IEC/IEEE Standard for Binary
2128 Floating-Point Arithmetic.
2129 -------------------------------------------------------------------------------
2130 */
float32_sqrt(float32 a)2131 float32 float32_sqrt( float32 a )
2132 {
2133     flag aSign;
2134     int16 aExp, zExp;
2135     bits32 aSig, zSig;
2136     bits64 rem, term;
2137 
2138     aSig = extractFloat32Frac( a );
2139     aExp = extractFloat32Exp( a );
2140     aSign = extractFloat32Sign( a );
2141     if ( aExp == 0xFF ) {
2142         if ( aSig ) return propagateFloat32NaN( a, 0 );
2143         if ( ! aSign ) return a;
2144         float_raise( float_flag_invalid );
2145         return float32_default_nan;
2146     }
2147     if ( aSign ) {
2148         if ( ( aExp | aSig ) == 0 ) return a;
2149         float_raise( float_flag_invalid );
2150         return float32_default_nan;
2151     }
2152     if ( aExp == 0 ) {
2153         if ( aSig == 0 ) return 0;
2154         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2155     }
2156     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2157     aSig = ( aSig | 0x00800000 )<<8;
2158     zSig = estimateSqrt32( aExp, aSig ) + 2;
2159     if ( ( zSig & 0x7F ) <= 5 ) {
2160         if ( zSig < 2 ) {
2161             zSig = 0x7FFFFFFF;
2162             goto roundAndPack;
2163         }
2164         aSig >>= aExp & 1;
2165         term = ( (bits64) zSig ) * zSig;
2166         rem = ( ( (bits64) aSig )<<32 ) - term;
2167         while ( (sbits64) rem < 0 ) {
2168             --zSig;
2169             rem += ( ( (bits64) zSig )<<1 ) | 1;
2170         }
2171         zSig |= ( rem != 0 );
2172     }
2173     shift32RightJamming( zSig, 1, &zSig );
2174  roundAndPack:
2175     return roundAndPackFloat32( 0, zExp, zSig );
2176 
2177 }
2178 #endif /* !SOFTFLOAT_FOR_GCC */
2179 
2180 /*
2181 -------------------------------------------------------------------------------
2182 Returns 1 if the single-precision floating-point value `a' is equal to
2183 the corresponding value `b', and 0 otherwise.  The comparison is performed
2184 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2185 -------------------------------------------------------------------------------
2186 */
float32_eq(float32 a,float32 b)2187 flag float32_eq( float32 a, float32 b )
2188 {
2189 
2190     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2191          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2192        ) {
2193         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2194             float_raise( float_flag_invalid );
2195         }
2196         return 0;
2197     }
2198     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2199 
2200 }
2201 
2202 /*
2203 -------------------------------------------------------------------------------
2204 Returns 1 if the single-precision floating-point value `a' is less than
2205 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2206 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2207 Arithmetic.
2208 -------------------------------------------------------------------------------
2209 */
float32_le(float32 a,float32 b)2210 flag float32_le( float32 a, float32 b )
2211 {
2212     flag aSign, bSign;
2213 
2214     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2215          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2216        ) {
2217         float_raise( float_flag_invalid );
2218         return 0;
2219     }
2220     aSign = extractFloat32Sign( a );
2221     bSign = extractFloat32Sign( b );
2222     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2223     return ( a == b ) || ( aSign ^ ( a < b ) );
2224 
2225 }
2226 
2227 /*
2228 -------------------------------------------------------------------------------
2229 Returns 1 if the single-precision floating-point value `a' is less than
2230 the corresponding value `b', and 0 otherwise.  The comparison is performed
2231 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2232 -------------------------------------------------------------------------------
2233 */
float32_lt(float32 a,float32 b)2234 flag float32_lt( float32 a, float32 b )
2235 {
2236     flag aSign, bSign;
2237 
2238     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2239          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2240        ) {
2241         float_raise( float_flag_invalid );
2242         return 0;
2243     }
2244     aSign = extractFloat32Sign( a );
2245     bSign = extractFloat32Sign( b );
2246     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2247     return ( a != b ) && ( aSign ^ ( a < b ) );
2248 
2249 }
2250 
2251 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2252 /*
2253 -------------------------------------------------------------------------------
2254 Returns 1 if the single-precision floating-point value `a' is equal to
2255 the corresponding value `b', and 0 otherwise.  The invalid exception is
2256 raised if either operand is a NaN.  Otherwise, the comparison is performed
2257 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2258 -------------------------------------------------------------------------------
2259 */
float32_eq_signaling(float32 a,float32 b)2260 flag float32_eq_signaling( float32 a, float32 b )
2261 {
2262 
2263     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2264          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2265        ) {
2266         float_raise( float_flag_invalid );
2267         return 0;
2268     }
2269     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2270 
2271 }
2272 
2273 /*
2274 -------------------------------------------------------------------------------
2275 Returns 1 if the single-precision floating-point value `a' is less than or
2276 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2277 cause an exception.  Otherwise, the comparison is performed according to the
2278 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2279 -------------------------------------------------------------------------------
2280 */
float32_le_quiet(float32 a,float32 b)2281 flag float32_le_quiet( float32 a, float32 b )
2282 {
2283     flag aSign, bSign;
2284 
2285     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2286          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2287        ) {
2288         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2289             float_raise( float_flag_invalid );
2290         }
2291         return 0;
2292     }
2293     aSign = extractFloat32Sign( a );
2294     bSign = extractFloat32Sign( b );
2295     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2296     return ( a == b ) || ( aSign ^ ( a < b ) );
2297 
2298 }
2299 
2300 /*
2301 -------------------------------------------------------------------------------
2302 Returns 1 if the single-precision floating-point value `a' is less than
2303 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2304 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2305 Standard for Binary Floating-Point Arithmetic.
2306 -------------------------------------------------------------------------------
2307 */
float32_lt_quiet(float32 a,float32 b)2308 flag float32_lt_quiet( float32 a, float32 b )
2309 {
2310     flag aSign, bSign;
2311 
2312     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2313          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2314        ) {
2315         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2316             float_raise( float_flag_invalid );
2317         }
2318         return 0;
2319     }
2320     aSign = extractFloat32Sign( a );
2321     bSign = extractFloat32Sign( b );
2322     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2323     return ( a != b ) && ( aSign ^ ( a < b ) );
2324 
2325 }
2326 #endif /* !SOFTFLOAT_FOR_GCC */
2327 
2328 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2329 /*
2330 -------------------------------------------------------------------------------
2331 Returns the result of converting the double-precision floating-point value
2332 `a' to the 32-bit two's complement integer format.  The conversion is
2333 performed according to the IEC/IEEE Standard for Binary Floating-Point
2334 Arithmetic---which means in particular that the conversion is rounded
2335 according to the current rounding mode.  If `a' is a NaN, the largest
2336 positive integer is returned.  Otherwise, if the conversion overflows, the
2337 largest integer with the same sign as `a' is returned.
2338 -------------------------------------------------------------------------------
2339 */
float64_to_int32(float64 a)2340 int32 float64_to_int32( float64 a )
2341 {
2342     flag aSign;
2343     int16 aExp, shiftCount;
2344     bits64 aSig;
2345 
2346     aSig = extractFloat64Frac( a );
2347     aExp = extractFloat64Exp( a );
2348     aSign = extractFloat64Sign( a );
2349     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2350     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2351     shiftCount = 0x42C - aExp;
2352     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2353     return roundAndPackInt32( aSign, aSig );
2354 
2355 }
2356 #endif /* !SOFTFLOAT_FOR_GCC */
2357 
2358 /*
2359 -------------------------------------------------------------------------------
2360 Returns the result of converting the double-precision floating-point value
2361 `a' to the 32-bit two's complement integer format.  The conversion is
2362 performed according to the IEC/IEEE Standard for Binary Floating-Point
2363 Arithmetic, except that the conversion is always rounded toward zero.
2364 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2365 the conversion overflows, the largest integer with the same sign as `a' is
2366 returned.
2367 -------------------------------------------------------------------------------
2368 */
float64_to_int32_round_to_zero(float64 a)2369 int32 float64_to_int32_round_to_zero( float64 a )
2370 {
2371     flag aSign;
2372     int16 aExp, shiftCount;
2373     bits64 aSig, savedASig;
2374     int32 z;
2375 
2376     aSig = extractFloat64Frac( a );
2377     aExp = extractFloat64Exp( a );
2378     aSign = extractFloat64Sign( a );
2379     if ( 0x41E < aExp ) {
2380         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2381         goto invalid;
2382     }
2383     else if ( aExp < 0x3FF ) {
2384         if ( aExp || aSig ) float_set_inexact();
2385         return 0;
2386     }
2387     aSig |= LIT64( 0x0010000000000000 );
2388     shiftCount = 0x433 - aExp;
2389     savedASig = aSig;
2390     aSig >>= shiftCount;
2391     z = aSig;
2392     if ( aSign ) z = - z;
2393     if ( ( z < 0 ) ^ aSign ) {
2394  invalid:
2395         float_raise( float_flag_invalid );
2396         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2397     }
2398     if ( ( aSig<<shiftCount ) != savedASig ) {
2399         float_set_inexact();
2400     }
2401     return z;
2402 
2403 }
2404 
2405 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2406 /*
2407 -------------------------------------------------------------------------------
2408 Returns the result of converting the double-precision floating-point value
2409 `a' to the 64-bit two's complement integer format.  The conversion is
2410 performed according to the IEC/IEEE Standard for Binary Floating-Point
2411 Arithmetic---which means in particular that the conversion is rounded
2412 according to the current rounding mode.  If `a' is a NaN, the largest
2413 positive integer is returned.  Otherwise, if the conversion overflows, the
2414 largest integer with the same sign as `a' is returned.
2415 -------------------------------------------------------------------------------
2416 */
float64_to_int64(float64 a)2417 int64 float64_to_int64( float64 a )
2418 {
2419     flag aSign;
2420     int16 aExp, shiftCount;
2421     bits64 aSig, aSigExtra;
2422 
2423     aSig = extractFloat64Frac( a );
2424     aExp = extractFloat64Exp( a );
2425     aSign = extractFloat64Sign( a );
2426     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2427     shiftCount = 0x433 - aExp;
2428     if ( shiftCount <= 0 ) {
2429         if ( 0x43E < aExp ) {
2430             float_raise( float_flag_invalid );
2431             if (    ! aSign
2432                  || (    ( aExp == 0x7FF )
2433                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2434                ) {
2435                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2436             }
2437             return (sbits64) LIT64( 0x8000000000000000 );
2438         }
2439         aSigExtra = 0;
2440         aSig <<= - shiftCount;
2441     }
2442     else {
2443         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2444     }
2445     return roundAndPackInt64( aSign, aSig, aSigExtra );
2446 
2447 }
2448 
2449 #ifdef __alpha__
2450 /*
2451 -------------------------------------------------------------------------------
2452 Returns the result of converting the double-precision floating-point value
2453 `a' to the 64-bit two's complement integer format.  The conversion is
2454 performed according to the IEC/IEEE Standard for Binary Floating-Point
2455 Arithmetic---which means in particular that the conversion is rounded
2456 according to the current rounding mode.  If `a' is a NaN, the invalid
2457 exception is raised and zero is returned.
2458 -------------------------------------------------------------------------------
2459 */
float64_to_int64_no_overflow(float64 a)2460 int64 float64_to_int64_no_overflow( float64 a )
2461 {
2462     flag aSign;
2463     int16 aExp, shiftCount;
2464     bits64 aSig, aSigExtra;
2465 
2466     aSig = extractFloat64Frac( a );
2467     aExp = extractFloat64Exp( a );
2468     aSign = extractFloat64Sign( a );
2469     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2470     shiftCount = 0x433 - aExp;
2471     if ( shiftCount <= 0 ) {
2472         if ( 0x43E < aExp ) {
2473             float_raise( float_flag_invalid );
2474             return 0;
2475         }
2476         aSigExtra = 0;
2477         aSig <<= - shiftCount;
2478     }
2479     else {
2480         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2481     }
2482     return roundAndPackInt64NoOverflow( aSign, aSig, aSigExtra );
2483 
2484 }
2485 #endif /* __alpha__ */
2486 
2487 /*
2488 -------------------------------------------------------------------------------
2489 Returns the result of converting the double-precision floating-point value
2490 `a' to the 64-bit two's complement integer format.  The conversion is
2491 performed according to the IEC/IEEE Standard for Binary Floating-Point
2492 Arithmetic, except that the conversion is always rounded toward zero.
2493 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2494 the conversion overflows, the largest integer with the same sign as `a' is
2495 returned.
2496 -------------------------------------------------------------------------------
2497 */
float64_to_int64_round_to_zero(float64 a)2498 int64 float64_to_int64_round_to_zero( float64 a )
2499 {
2500     flag aSign;
2501     int16 aExp, shiftCount;
2502     bits64 aSig;
2503     int64 z;
2504 
2505     aSig = extractFloat64Frac( a );
2506     aExp = extractFloat64Exp( a );
2507     aSign = extractFloat64Sign( a );
2508     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2509     shiftCount = aExp - 0x433;
2510     if ( 0 <= shiftCount ) {
2511         if ( 0x43E <= aExp ) {
2512             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2513                 float_raise( float_flag_invalid );
2514                 if (    ! aSign
2515                      || (    ( aExp == 0x7FF )
2516                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2517                    ) {
2518                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2519                 }
2520             }
2521             return (sbits64) LIT64( 0x8000000000000000 );
2522         }
2523         z = aSig<<shiftCount;
2524     }
2525     else {
2526         if ( aExp < 0x3FE ) {
2527             if ( aExp | aSig ) float_set_inexact();
2528             return 0;
2529         }
2530         z = aSig>>( - shiftCount );
2531         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2532             float_set_inexact();
2533         }
2534     }
2535     if ( aSign ) z = - z;
2536     return z;
2537 
2538 }
2539 #endif /* !SOFTFLOAT_FOR_GCC */
2540 
2541 /*
2542 -------------------------------------------------------------------------------
2543 Returns the result of converting the double-precision floating-point value
2544 `a' to the single-precision floating-point format.  The conversion is
2545 performed according to the IEC/IEEE Standard for Binary Floating-Point
2546 Arithmetic.
2547 -------------------------------------------------------------------------------
2548 */
float64_to_float32(float64 a)2549 float32 float64_to_float32( float64 a )
2550 {
2551     flag aSign;
2552     int16 aExp;
2553     bits64 aSig;
2554     bits32 zSig;
2555 
2556     aSig = extractFloat64Frac( a );
2557     aExp = extractFloat64Exp( a );
2558     aSign = extractFloat64Sign( a );
2559     if ( aExp == 0x7FF ) {
2560         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2561         return packFloat32( aSign, 0xFF, 0 );
2562     }
2563     shift64RightJamming( aSig, 22, &aSig );
2564     zSig = aSig;
2565     if ( aExp || zSig ) {
2566         zSig |= 0x40000000;
2567         aExp -= 0x381;
2568     }
2569     return roundAndPackFloat32( aSign, aExp, zSig );
2570 
2571 }
2572 
2573 #ifdef FLOATX80
2574 
2575 /*
2576 -------------------------------------------------------------------------------
2577 Returns the result of converting the double-precision floating-point value
2578 `a' to the extended double-precision floating-point format.  The conversion
2579 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2580 Arithmetic.
2581 -------------------------------------------------------------------------------
2582 */
float64_to_floatx80(float64 a)2583 floatx80 float64_to_floatx80( float64 a )
2584 {
2585     flag aSign;
2586     int16 aExp;
2587     bits64 aSig;
2588 
2589     aSig = extractFloat64Frac( a );
2590     aExp = extractFloat64Exp( a );
2591     aSign = extractFloat64Sign( a );
2592     if ( aExp == 0x7FF ) {
2593         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2594         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2595     }
2596     if ( aExp == 0 ) {
2597         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2598         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2599     }
2600     return
2601         packFloatx80(
2602             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2603 
2604 }
2605 
2606 #endif
2607 
2608 #ifdef FLOAT128
2609 
2610 /*
2611 -------------------------------------------------------------------------------
2612 Returns the result of converting the double-precision floating-point value
2613 `a' to the quadruple-precision floating-point format.  The conversion is
2614 performed according to the IEC/IEEE Standard for Binary Floating-Point
2615 Arithmetic.
2616 -------------------------------------------------------------------------------
2617 */
float64_to_float128(float64 a)2618 float128 float64_to_float128( float64 a )
2619 {
2620     flag aSign;
2621     int16 aExp;
2622     bits64 aSig, zSig0, zSig1;
2623 
2624     aSig = extractFloat64Frac( a );
2625     aExp = extractFloat64Exp( a );
2626     aSign = extractFloat64Sign( a );
2627     if ( aExp == 0x7FF ) {
2628         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2629         return packFloat128( aSign, 0x7FFF, 0, 0 );
2630     }
2631     if ( aExp == 0 ) {
2632         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2633         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2634         --aExp;
2635     }
2636     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2637     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2638 
2639 }
2640 
2641 #endif
2642 
2643 #ifndef SOFTFLOAT_FOR_GCC
2644 /*
2645 -------------------------------------------------------------------------------
2646 Rounds the double-precision floating-point value `a' to an integer, and
2647 returns the result as a double-precision floating-point value.  The
2648 operation is performed according to the IEC/IEEE Standard for Binary
2649 Floating-Point Arithmetic.
2650 -------------------------------------------------------------------------------
2651 */
float64_round_to_int(float64 a)2652 float64 float64_round_to_int( float64 a )
2653 {
2654     flag aSign;
2655     int16 aExp;
2656     bits64 lastBitMask, roundBitsMask;
2657     int8 roundingMode;
2658     float64 z;
2659 
2660     aExp = extractFloat64Exp( a );
2661     if ( 0x433 <= aExp ) {
2662         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2663             return propagateFloat64NaN( a, a );
2664         }
2665         return a;
2666     }
2667     if ( aExp < 0x3FF ) {
2668         if ( (bits64) ( a<<1 ) == 0 ) return a;
2669         float_set_inexact();
2670         aSign = extractFloat64Sign( a );
2671         switch ( float_rounding_mode() ) {
2672          case float_round_nearest_even:
2673             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2674                 return packFloat64( aSign, 0x3FF, 0 );
2675             }
2676             break;
2677          case float_round_down:
2678             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2679          case float_round_up:
2680             return
2681             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2682         }
2683         return packFloat64( aSign, 0, 0 );
2684     }
2685     lastBitMask = 1;
2686     lastBitMask <<= 0x433 - aExp;
2687     roundBitsMask = lastBitMask - 1;
2688     z = a;
2689     roundingMode = float_rounding_mode();
2690     if ( roundingMode == float_round_nearest_even ) {
2691         z += lastBitMask>>1;
2692         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2693     }
2694     else if ( roundingMode != float_round_to_zero ) {
2695         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2696             z += roundBitsMask;
2697         }
2698     }
2699     z &= ~ roundBitsMask;
2700     if ( z != a ) float_set_inexact();
2701     return z;
2702 
2703 }
2704 #endif
2705 
2706 /*
2707 -------------------------------------------------------------------------------
2708 Returns the result of adding the absolute values of the double-precision
2709 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2710 before being returned.  `zSign' is ignored if the result is a NaN.
2711 The addition is performed according to the IEC/IEEE Standard for Binary
2712 Floating-Point Arithmetic.
2713 -------------------------------------------------------------------------------
2714 */
addFloat64Sigs(float64 a,float64 b,flag zSign)2715 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2716 {
2717     int16 aExp, bExp, zExp;
2718     bits64 aSig, bSig, zSig;
2719     int16 expDiff;
2720 
2721     aSig = extractFloat64Frac( a );
2722     aExp = extractFloat64Exp( a );
2723     bSig = extractFloat64Frac( b );
2724     bExp = extractFloat64Exp( b );
2725     expDiff = aExp - bExp;
2726     aSig <<= 9;
2727     bSig <<= 9;
2728     if ( 0 < expDiff ) {
2729         if ( aExp == 0x7FF ) {
2730             if ( aSig ) return propagateFloat64NaN( a, b );
2731             return a;
2732         }
2733         if ( bExp == 0 ) {
2734             --expDiff;
2735         }
2736         else {
2737             bSig |= LIT64( 0x2000000000000000 );
2738         }
2739         shift64RightJamming( bSig, expDiff, &bSig );
2740         zExp = aExp;
2741     }
2742     else if ( expDiff < 0 ) {
2743         if ( bExp == 0x7FF ) {
2744             if ( bSig ) return propagateFloat64NaN( a, b );
2745             return packFloat64( zSign, 0x7FF, 0 );
2746         }
2747         if ( aExp == 0 ) {
2748             ++expDiff;
2749         }
2750         else {
2751             aSig |= LIT64( 0x2000000000000000 );
2752         }
2753         shift64RightJamming( aSig, - expDiff, &aSig );
2754         zExp = bExp;
2755     }
2756     else {
2757         if ( aExp == 0x7FF ) {
2758             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2759             return a;
2760         }
2761         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2762         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2763         zExp = aExp;
2764         goto roundAndPack;
2765     }
2766     aSig |= LIT64( 0x2000000000000000 );
2767     zSig = ( aSig + bSig )<<1;
2768     --zExp;
2769     if ( (sbits64) zSig < 0 ) {
2770         zSig = aSig + bSig;
2771         ++zExp;
2772     }
2773  roundAndPack:
2774     return roundAndPackFloat64( zSign, zExp, zSig );
2775 
2776 }
2777 
2778 /*
2779 -------------------------------------------------------------------------------
2780 Returns the result of subtracting the absolute values of the double-
2781 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2782 difference is negated before being returned.  `zSign' is ignored if the
2783 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2784 Standard for Binary Floating-Point Arithmetic.
2785 -------------------------------------------------------------------------------
2786 */
subFloat64Sigs(float64 a,float64 b,flag zSign)2787 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2788 {
2789     int16 aExp, bExp, zExp;
2790     bits64 aSig, bSig, zSig;
2791     int16 expDiff;
2792 
2793     aSig = extractFloat64Frac( a );
2794     aExp = extractFloat64Exp( a );
2795     bSig = extractFloat64Frac( b );
2796     bExp = extractFloat64Exp( b );
2797     expDiff = aExp - bExp;
2798     aSig <<= 10;
2799     bSig <<= 10;
2800     if ( 0 < expDiff ) goto aExpBigger;
2801     if ( expDiff < 0 ) goto bExpBigger;
2802     if ( aExp == 0x7FF ) {
2803         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2804         float_raise( float_flag_invalid );
2805         return float64_default_nan;
2806     }
2807     if ( aExp == 0 ) {
2808         aExp = 1;
2809         bExp = 1;
2810     }
2811     if ( bSig < aSig ) goto aBigger;
2812     if ( aSig < bSig ) goto bBigger;
2813     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2814  bExpBigger:
2815     if ( bExp == 0x7FF ) {
2816         if ( bSig ) return propagateFloat64NaN( a, b );
2817         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2818     }
2819     if ( aExp == 0 ) {
2820         ++expDiff;
2821     }
2822     else {
2823         aSig |= LIT64( 0x4000000000000000 );
2824     }
2825     shift64RightJamming( aSig, - expDiff, &aSig );
2826     bSig |= LIT64( 0x4000000000000000 );
2827  bBigger:
2828     zSig = bSig - aSig;
2829     zExp = bExp;
2830     zSign ^= 1;
2831     goto normalizeRoundAndPack;
2832  aExpBigger:
2833     if ( aExp == 0x7FF ) {
2834         if ( aSig ) return propagateFloat64NaN( a, b );
2835         return a;
2836     }
2837     if ( bExp == 0 ) {
2838         --expDiff;
2839     }
2840     else {
2841         bSig |= LIT64( 0x4000000000000000 );
2842     }
2843     shift64RightJamming( bSig, expDiff, &bSig );
2844     aSig |= LIT64( 0x4000000000000000 );
2845  aBigger:
2846     zSig = aSig - bSig;
2847     zExp = aExp;
2848  normalizeRoundAndPack:
2849     --zExp;
2850     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2851 
2852 }
2853 
2854 /*
2855 -------------------------------------------------------------------------------
2856 Returns the result of adding the double-precision floating-point values `a'
2857 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2858 Binary Floating-Point Arithmetic.
2859 -------------------------------------------------------------------------------
2860 */
float64_add(float64 a,float64 b)2861 float64 float64_add( float64 a, float64 b )
2862 {
2863     flag aSign, bSign;
2864 
2865     aSign = extractFloat64Sign( a );
2866     bSign = extractFloat64Sign( b );
2867     if ( aSign == bSign ) {
2868         return addFloat64Sigs( a, b, aSign );
2869     }
2870     else {
2871         return subFloat64Sigs( a, b, aSign );
2872     }
2873 
2874 }
2875 
2876 /*
2877 -------------------------------------------------------------------------------
2878 Returns the result of subtracting the double-precision floating-point values
2879 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2880 for Binary Floating-Point Arithmetic.
2881 -------------------------------------------------------------------------------
2882 */
float64_sub(float64 a,float64 b)2883 float64 float64_sub( float64 a, float64 b )
2884 {
2885     flag aSign, bSign;
2886 
2887     aSign = extractFloat64Sign( a );
2888     bSign = extractFloat64Sign( b );
2889     if ( aSign == bSign ) {
2890         return subFloat64Sigs( a, b, aSign );
2891     }
2892     else {
2893         return addFloat64Sigs( a, b, aSign );
2894     }
2895 
2896 }
2897 
2898 /*
2899 -------------------------------------------------------------------------------
2900 Returns the result of multiplying the double-precision floating-point values
2901 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2902 for Binary Floating-Point Arithmetic.
2903 -------------------------------------------------------------------------------
2904 */
float64_mul(float64 a,float64 b)2905 float64 float64_mul( float64 a, float64 b )
2906 {
2907     flag aSign, bSign, zSign;
2908     int16 aExp, bExp, zExp;
2909     bits64 aSig, bSig, zSig0, zSig1;
2910 
2911     aSig = extractFloat64Frac( a );
2912     aExp = extractFloat64Exp( a );
2913     aSign = extractFloat64Sign( a );
2914     bSig = extractFloat64Frac( b );
2915     bExp = extractFloat64Exp( b );
2916     bSign = extractFloat64Sign( b );
2917     zSign = aSign ^ bSign;
2918     if ( aExp == 0x7FF ) {
2919         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2920             return propagateFloat64NaN( a, b );
2921         }
2922         if ( ( bExp | bSig ) == 0 ) {
2923             float_raise( float_flag_invalid );
2924             return float64_default_nan;
2925         }
2926         return packFloat64( zSign, 0x7FF, 0 );
2927     }
2928     if ( bExp == 0x7FF ) {
2929         if ( bSig ) return propagateFloat64NaN( a, b );
2930         if ( ( aExp | aSig ) == 0 ) {
2931             float_raise( float_flag_invalid );
2932             return float64_default_nan;
2933         }
2934         return packFloat64( zSign, 0x7FF, 0 );
2935     }
2936     if ( aExp == 0 ) {
2937         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2938         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2939     }
2940     if ( bExp == 0 ) {
2941         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2942         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2943     }
2944     zExp = aExp + bExp - 0x3FF;
2945     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2946     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2947     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2948     zSig0 |= ( zSig1 != 0 );
2949     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2950         zSig0 <<= 1;
2951         --zExp;
2952     }
2953     return roundAndPackFloat64( zSign, zExp, zSig0 );
2954 
2955 }
2956 
2957 /*
2958 -------------------------------------------------------------------------------
2959 Returns the result of dividing the double-precision floating-point value `a'
2960 by the corresponding value `b'.  The operation is performed according to
2961 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2962 -------------------------------------------------------------------------------
2963 */
float64_div(float64 a,float64 b)2964 float64 float64_div( float64 a, float64 b )
2965 {
2966     flag aSign, bSign, zSign;
2967     int16 aExp, bExp, zExp;
2968     bits64 aSig, bSig, zSig;
2969     bits64 rem0, rem1;
2970     bits64 term0, term1;
2971 
2972     aSig = extractFloat64Frac( a );
2973     aExp = extractFloat64Exp( a );
2974     aSign = extractFloat64Sign( a );
2975     bSig = extractFloat64Frac( b );
2976     bExp = extractFloat64Exp( b );
2977     bSign = extractFloat64Sign( b );
2978     zSign = aSign ^ bSign;
2979     if ( aExp == 0x7FF ) {
2980         if ( aSig ) return propagateFloat64NaN( a, b );
2981         if ( bExp == 0x7FF ) {
2982             if ( bSig ) return propagateFloat64NaN( a, b );
2983             float_raise( float_flag_invalid );
2984             return float64_default_nan;
2985         }
2986         return packFloat64( zSign, 0x7FF, 0 );
2987     }
2988     if ( bExp == 0x7FF ) {
2989         if ( bSig ) return propagateFloat64NaN( a, b );
2990         return packFloat64( zSign, 0, 0 );
2991     }
2992     if ( bExp == 0 ) {
2993         if ( bSig == 0 ) {
2994             if ( ( aExp | aSig ) == 0 ) {
2995                 float_raise( float_flag_invalid );
2996                 return float64_default_nan;
2997             }
2998             float_raise( float_flag_divbyzero );
2999             return packFloat64( zSign, 0x7FF, 0 );
3000         }
3001         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3002     }
3003     if ( aExp == 0 ) {
3004         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3005         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3006     }
3007     zExp = aExp - bExp + 0x3FD;
3008     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3009     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3010     if ( bSig <= ( aSig + aSig ) ) {
3011         aSig >>= 1;
3012         ++zExp;
3013     }
3014     zSig = estimateDiv128To64( aSig, 0, bSig );
3015     if ( ( zSig & 0x1FF ) <= 2 ) {
3016         mul64To128( bSig, zSig, &term0, &term1 );
3017         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3018         while ( (sbits64) rem0 < 0 ) {
3019             --zSig;
3020             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3021         }
3022         zSig |= ( rem1 != 0 );
3023     }
3024     return roundAndPackFloat64( zSign, zExp, zSig );
3025 
3026 }
3027 
3028 #ifndef SOFTFLOAT_FOR_GCC
3029 /*
3030 -------------------------------------------------------------------------------
3031 Returns the remainder of the double-precision floating-point value `a'
3032 with respect to the corresponding value `b'.  The operation is performed
3033 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3034 -------------------------------------------------------------------------------
3035 */
float64_rem(float64 a,float64 b)3036 float64 float64_rem( float64 a, float64 b )
3037 {
3038     flag aSign, bSign, zSign;
3039     int16 aExp, bExp, expDiff;
3040     bits64 aSig, bSig;
3041     bits64 q, alternateASig;
3042     sbits64 sigMean;
3043 
3044     aSig = extractFloat64Frac( a );
3045     aExp = extractFloat64Exp( a );
3046     aSign = extractFloat64Sign( a );
3047     bSig = extractFloat64Frac( b );
3048     bExp = extractFloat64Exp( b );
3049     bSign = extractFloat64Sign( b );
3050     if ( aExp == 0x7FF ) {
3051         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3052             return propagateFloat64NaN( a, b );
3053         }
3054         float_raise( float_flag_invalid );
3055         return float64_default_nan;
3056     }
3057     if ( bExp == 0x7FF ) {
3058         if ( bSig ) return propagateFloat64NaN( a, b );
3059         return a;
3060     }
3061     if ( bExp == 0 ) {
3062         if ( bSig == 0 ) {
3063             float_raise( float_flag_invalid );
3064             return float64_default_nan;
3065         }
3066         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3067     }
3068     if ( aExp == 0 ) {
3069         if ( aSig == 0 ) return a;
3070         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3071     }
3072     expDiff = aExp - bExp;
3073     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3074     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3075     if ( expDiff < 0 ) {
3076         if ( expDiff < -1 ) return a;
3077         aSig >>= 1;
3078     }
3079     q = ( bSig <= aSig );
3080     if ( q ) aSig -= bSig;
3081     expDiff -= 64;
3082     while ( 0 < expDiff ) {
3083         q = estimateDiv128To64( aSig, 0, bSig );
3084         q = ( 2 < q ) ? q - 2 : 0;
3085         aSig = - ( ( bSig>>2 ) * q );
3086         expDiff -= 62;
3087     }
3088     expDiff += 64;
3089     if ( 0 < expDiff ) {
3090         q = estimateDiv128To64( aSig, 0, bSig );
3091         q = ( 2 < q ) ? q - 2 : 0;
3092         q >>= 64 - expDiff;
3093         bSig >>= 2;
3094         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3095     }
3096     else {
3097         aSig >>= 2;
3098         bSig >>= 2;
3099     }
3100     do {
3101         alternateASig = aSig;
3102         ++q;
3103         aSig -= bSig;
3104     } while ( 0 <= (sbits64) aSig );
3105     sigMean = aSig + alternateASig;
3106     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3107         aSig = alternateASig;
3108     }
3109     zSign = ( (sbits64) aSig < 0 );
3110     if ( zSign ) aSig = - aSig;
3111     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3112 
3113 }
3114 
3115 /*
3116 -------------------------------------------------------------------------------
3117 Returns the square root of the double-precision floating-point value `a'.
3118 The operation is performed according to the IEC/IEEE Standard for Binary
3119 Floating-Point Arithmetic.
3120 -------------------------------------------------------------------------------
3121 */
float64_sqrt(float64 a)3122 float64 float64_sqrt( float64 a )
3123 {
3124     flag aSign;
3125     int16 aExp, zExp;
3126     bits64 aSig, zSig, doubleZSig;
3127     bits64 rem0, rem1, term0, term1;
3128 
3129     aSig = extractFloat64Frac( a );
3130     aExp = extractFloat64Exp( a );
3131     aSign = extractFloat64Sign( a );
3132     if ( aExp == 0x7FF ) {
3133         if ( aSig ) return propagateFloat64NaN( a, a );
3134         if ( ! aSign ) return a;
3135         float_raise( float_flag_invalid );
3136         return float64_default_nan;
3137     }
3138     if ( aSign ) {
3139         if ( ( aExp | aSig ) == 0 ) return a;
3140         float_raise( float_flag_invalid );
3141         return float64_default_nan;
3142     }
3143     if ( aExp == 0 ) {
3144         if ( aSig == 0 ) return 0;
3145         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3146     }
3147     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3148     aSig |= LIT64( 0x0010000000000000 );
3149     zSig = estimateSqrt32( aExp, aSig>>21 );
3150     aSig <<= 9 - ( aExp & 1 );
3151     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3152     if ( ( zSig & 0x1FF ) <= 5 ) {
3153         doubleZSig = zSig<<1;
3154         mul64To128( zSig, zSig, &term0, &term1 );
3155         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3156         while ( (sbits64) rem0 < 0 ) {
3157             --zSig;
3158             doubleZSig -= 2;
3159             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3160         }
3161         zSig |= ( ( rem0 | rem1 ) != 0 );
3162     }
3163     return roundAndPackFloat64( 0, zExp, zSig );
3164 
3165 }
3166 #endif
3167 
3168 /*
3169 -------------------------------------------------------------------------------
3170 Returns 1 if the double-precision floating-point value `a' is equal to the
3171 corresponding value `b', and 0 otherwise.  The comparison is performed
3172 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3173 -------------------------------------------------------------------------------
3174 */
float64_eq(float64 a,float64 b)3175 flag float64_eq( float64 a, float64 b )
3176 {
3177 
3178     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3179          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3180        ) {
3181         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3182             float_raise( float_flag_invalid );
3183         }
3184         return 0;
3185     }
3186     return ( a == b ) ||
3187 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3188 
3189 }
3190 
3191 /*
3192 -------------------------------------------------------------------------------
3193 Returns 1 if the double-precision floating-point value `a' is less than or
3194 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3195 performed according to the IEC/IEEE Standard for Binary Floating-Point
3196 Arithmetic.
3197 -------------------------------------------------------------------------------
3198 */
float64_le(float64 a,float64 b)3199 flag float64_le( float64 a, float64 b )
3200 {
3201     flag aSign, bSign;
3202 
3203     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3204          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3205        ) {
3206         float_raise( float_flag_invalid );
3207         return 0;
3208     }
3209     aSign = extractFloat64Sign( a );
3210     bSign = extractFloat64Sign( b );
3211     if ( aSign != bSign )
3212 	return aSign ||
3213 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3214 	      0 );
3215     return ( a == b ) ||
3216 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3217 
3218 }
3219 
3220 /*
3221 -------------------------------------------------------------------------------
3222 Returns 1 if the double-precision floating-point value `a' is less than
3223 the corresponding value `b', and 0 otherwise.  The comparison is performed
3224 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3225 -------------------------------------------------------------------------------
3226 */
float64_lt(float64 a,float64 b)3227 flag float64_lt( float64 a, float64 b )
3228 {
3229     flag aSign, bSign;
3230 
3231     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3232          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3233        ) {
3234         float_raise( float_flag_invalid );
3235         return 0;
3236     }
3237     aSign = extractFloat64Sign( a );
3238     bSign = extractFloat64Sign( b );
3239     if ( aSign != bSign )
3240 	return aSign &&
3241 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3242 	      0 );
3243     return ( a != b ) &&
3244 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3245 
3246 }
3247 
3248 #ifndef SOFTFLOAT_FOR_GCC
3249 /*
3250 -------------------------------------------------------------------------------
3251 Returns 1 if the double-precision floating-point value `a' is equal to the
3252 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3253 if either operand is a NaN.  Otherwise, the comparison is performed
3254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3255 -------------------------------------------------------------------------------
3256 */
float64_eq_signaling(float64 a,float64 b)3257 flag float64_eq_signaling( float64 a, float64 b )
3258 {
3259 
3260     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3261          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3262        ) {
3263         float_raise( float_flag_invalid );
3264         return 0;
3265     }
3266     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3267 
3268 }
3269 
3270 /*
3271 -------------------------------------------------------------------------------
3272 Returns 1 if the double-precision floating-point value `a' is less than or
3273 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3274 cause an exception.  Otherwise, the comparison is performed according to the
3275 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3276 -------------------------------------------------------------------------------
3277 */
float64_le_quiet(float64 a,float64 b)3278 flag float64_le_quiet( float64 a, float64 b )
3279 {
3280     flag aSign, bSign;
3281 
3282     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3283          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3284        ) {
3285         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3286             float_raise( float_flag_invalid );
3287         }
3288         return 0;
3289     }
3290     aSign = extractFloat64Sign( a );
3291     bSign = extractFloat64Sign( b );
3292     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3293     return ( a == b ) || ( aSign ^ ( a < b ) );
3294 
3295 }
3296 
3297 /*
3298 -------------------------------------------------------------------------------
3299 Returns 1 if the double-precision floating-point value `a' is less than
3300 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3301 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3302 Standard for Binary Floating-Point Arithmetic.
3303 -------------------------------------------------------------------------------
3304 */
float64_lt_quiet(float64 a,float64 b)3305 flag float64_lt_quiet( float64 a, float64 b )
3306 {
3307     flag aSign, bSign;
3308 
3309     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3310          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3311        ) {
3312         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3313             float_raise( float_flag_invalid );
3314         }
3315         return 0;
3316     }
3317     aSign = extractFloat64Sign( a );
3318     bSign = extractFloat64Sign( b );
3319     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3320     return ( a != b ) && ( aSign ^ ( a < b ) );
3321 
3322 }
3323 #endif
3324 
3325 #ifdef FLOATX80
3326 
3327 /*
3328 -------------------------------------------------------------------------------
3329 Returns the result of converting the extended double-precision floating-
3330 point value `a' to the 32-bit two's complement integer format.  The
3331 conversion is performed according to the IEC/IEEE Standard for Binary
3332 Floating-Point Arithmetic---which means in particular that the conversion
3333 is rounded according to the current rounding mode.  If `a' is a NaN, the
3334 largest positive integer is returned.  Otherwise, if the conversion
3335 overflows, the largest integer with the same sign as `a' is returned.
3336 -------------------------------------------------------------------------------
3337 */
floatx80_to_int32(floatx80 a)3338 int32 floatx80_to_int32( floatx80 a )
3339 {
3340     flag aSign;
3341     int32 aExp, shiftCount;
3342     bits64 aSig;
3343 
3344     aSig = extractFloatx80Frac( a );
3345     aExp = extractFloatx80Exp( a );
3346     aSign = extractFloatx80Sign( a );
3347     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3348     shiftCount = 0x4037 - aExp;
3349     if ( shiftCount <= 0 ) shiftCount = 1;
3350     shift64RightJamming( aSig, shiftCount, &aSig );
3351     return roundAndPackInt32( aSign, aSig );
3352 
3353 }
3354 
3355 /*
3356 -------------------------------------------------------------------------------
3357 Returns the result of converting the extended double-precision floating-
3358 point value `a' to the 32-bit two's complement integer format.  The
3359 conversion is performed according to the IEC/IEEE Standard for Binary
3360 Floating-Point Arithmetic, except that the conversion is always rounded
3361 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3362 Otherwise, if the conversion overflows, the largest integer with the same
3363 sign as `a' is returned.
3364 -------------------------------------------------------------------------------
3365 */
floatx80_to_int32_round_to_zero(floatx80 a)3366 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3367 {
3368     flag aSign;
3369     int32 aExp, shiftCount;
3370     bits64 aSig, savedASig;
3371     int32 z;
3372 
3373     aSig = extractFloatx80Frac( a );
3374     aExp = extractFloatx80Exp( a );
3375     aSign = extractFloatx80Sign( a );
3376     if ( 0x401E < aExp ) {
3377         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3378         goto invalid;
3379     }
3380     else if ( aExp < 0x3FFF ) {
3381         if ( aExp || aSig ) float_set_inexact();
3382         return 0;
3383     }
3384     shiftCount = 0x403E - aExp;
3385     savedASig = aSig;
3386     aSig >>= shiftCount;
3387     z = aSig;
3388     if ( aSign ) z = - z;
3389     if ( ( z < 0 ) ^ aSign ) {
3390  invalid:
3391         float_raise( float_flag_invalid );
3392         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3393     }
3394     if ( ( aSig<<shiftCount ) != savedASig ) {
3395         float_set_inexact();
3396     }
3397     return z;
3398 
3399 }
3400 
3401 /*
3402 -------------------------------------------------------------------------------
3403 Returns the result of converting the extended double-precision floating-
3404 point value `a' to the 64-bit two's complement integer format.  The
3405 conversion is performed according to the IEC/IEEE Standard for Binary
3406 Floating-Point Arithmetic---which means in particular that the conversion
3407 is rounded according to the current rounding mode.  If `a' is a NaN,
3408 the largest positive integer is returned.  Otherwise, if the conversion
3409 overflows, the largest integer with the same sign as `a' is returned.
3410 -------------------------------------------------------------------------------
3411 */
floatx80_to_int64(floatx80 a)3412 int64 floatx80_to_int64( floatx80 a )
3413 {
3414     flag aSign;
3415     int32 aExp, shiftCount;
3416     bits64 aSig, aSigExtra;
3417 
3418     aSig = extractFloatx80Frac( a );
3419     aExp = extractFloatx80Exp( a );
3420     aSign = extractFloatx80Sign( a );
3421     shiftCount = 0x403E - aExp;
3422     if ( shiftCount <= 0 ) {
3423         if ( shiftCount ) {
3424             float_raise( float_flag_invalid );
3425             if (    ! aSign
3426                  || (    ( aExp == 0x7FFF )
3427                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3428                ) {
3429                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3430             }
3431             return (sbits64) LIT64( 0x8000000000000000 );
3432         }
3433         aSigExtra = 0;
3434     }
3435     else {
3436         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3437     }
3438     return roundAndPackInt64( aSign, aSig, aSigExtra );
3439 
3440 }
3441 
3442 /*
3443 -------------------------------------------------------------------------------
3444 Returns the result of converting the extended double-precision floating-
3445 point value `a' to the 64-bit two's complement integer format.  The
3446 conversion is performed according to the IEC/IEEE Standard for Binary
3447 Floating-Point Arithmetic, except that the conversion is always rounded
3448 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3449 Otherwise, if the conversion overflows, the largest integer with the same
3450 sign as `a' is returned.
3451 -------------------------------------------------------------------------------
3452 */
floatx80_to_int64_round_to_zero(floatx80 a)3453 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3454 {
3455     flag aSign;
3456     int32 aExp, shiftCount;
3457     bits64 aSig;
3458     int64 z;
3459 
3460     aSig = extractFloatx80Frac( a );
3461     aExp = extractFloatx80Exp( a );
3462     aSign = extractFloatx80Sign( a );
3463     shiftCount = aExp - 0x403E;
3464     if ( 0 <= shiftCount ) {
3465         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3466         if ( ( a.high != 0xC03E ) || aSig ) {
3467             float_raise( float_flag_invalid );
3468             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3469                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3470             }
3471         }
3472         return (sbits64) LIT64( 0x8000000000000000 );
3473     }
3474     else if ( aExp < 0x3FFF ) {
3475         if ( aExp | aSig ) float_set_inexact();
3476         return 0;
3477     }
3478     z = aSig>>( - shiftCount );
3479     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3480         float_set_inexact();
3481     }
3482     if ( aSign ) z = - z;
3483     return z;
3484 
3485 }
3486 
3487 /*
3488 -------------------------------------------------------------------------------
3489 Returns the result of converting the extended double-precision floating-
3490 point value `a' to the single-precision floating-point format.  The
3491 conversion is performed according to the IEC/IEEE Standard for Binary
3492 Floating-Point Arithmetic.
3493 -------------------------------------------------------------------------------
3494 */
floatx80_to_float32(floatx80 a)3495 float32 floatx80_to_float32( floatx80 a )
3496 {
3497     flag aSign;
3498     int32 aExp;
3499     bits64 aSig;
3500 
3501     aSig = extractFloatx80Frac( a );
3502     aExp = extractFloatx80Exp( a );
3503     aSign = extractFloatx80Sign( a );
3504     if ( aExp == 0x7FFF ) {
3505         if ( (bits64) ( aSig<<1 ) ) {
3506             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3507         }
3508         return packFloat32( aSign, 0xFF, 0 );
3509     }
3510     shift64RightJamming( aSig, 33, &aSig );
3511     if ( aExp || aSig ) aExp -= 0x3F81;
3512     return roundAndPackFloat32( aSign, aExp, aSig );
3513 
3514 }
3515 
3516 /*
3517 -------------------------------------------------------------------------------
3518 Returns the result of converting the extended double-precision floating-
3519 point value `a' to the double-precision floating-point format.  The
3520 conversion is performed according to the IEC/IEEE Standard for Binary
3521 Floating-Point Arithmetic.
3522 -------------------------------------------------------------------------------
3523 */
floatx80_to_float64(floatx80 a)3524 float64 floatx80_to_float64( floatx80 a )
3525 {
3526     flag aSign;
3527     int32 aExp;
3528     bits64 aSig, zSig;
3529 
3530     aSig = extractFloatx80Frac( a );
3531     aExp = extractFloatx80Exp( a );
3532     aSign = extractFloatx80Sign( a );
3533     if ( aExp == 0x7FFF ) {
3534         if ( (bits64) ( aSig<<1 ) ) {
3535             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3536         }
3537         return packFloat64( aSign, 0x7FF, 0 );
3538     }
3539     shift64RightJamming( aSig, 1, &zSig );
3540     if ( aExp || aSig ) aExp -= 0x3C01;
3541     return roundAndPackFloat64( aSign, aExp, zSig );
3542 
3543 }
3544 
3545 #ifdef FLOAT128
3546 
3547 /*
3548 -------------------------------------------------------------------------------
3549 Returns the result of converting the extended double-precision floating-
3550 point value `a' to the quadruple-precision floating-point format.  The
3551 conversion is performed according to the IEC/IEEE Standard for Binary
3552 Floating-Point Arithmetic.
3553 -------------------------------------------------------------------------------
3554 */
floatx80_to_float128(floatx80 a)3555 float128 floatx80_to_float128( floatx80 a )
3556 {
3557     flag aSign;
3558     int16 aExp;
3559     bits64 aSig, zSig0, zSig1;
3560 
3561     aSig = extractFloatx80Frac( a );
3562     aExp = extractFloatx80Exp( a );
3563     aSign = extractFloatx80Sign( a );
3564     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3565         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3566     }
3567     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3568     return packFloat128( aSign, aExp, zSig0, zSig1 );
3569 
3570 }
3571 
3572 #endif
3573 
3574 /*
3575 -------------------------------------------------------------------------------
3576 Rounds the extended double-precision floating-point value `a' to an integer,
3577 and returns the result as an extended quadruple-precision floating-point
3578 value.  The operation is performed according to the IEC/IEEE Standard for
3579 Binary Floating-Point Arithmetic.
3580 -------------------------------------------------------------------------------
3581 */
floatx80_round_to_int(floatx80 a)3582 floatx80 floatx80_round_to_int( floatx80 a )
3583 {
3584     flag aSign;
3585     int32 aExp;
3586     bits64 lastBitMask, roundBitsMask;
3587     int8 roundingMode;
3588     floatx80 z;
3589 
3590     aExp = extractFloatx80Exp( a );
3591     if ( 0x403E <= aExp ) {
3592         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3593             return propagateFloatx80NaN( a, a );
3594         }
3595         return a;
3596     }
3597     if ( aExp < 0x3FFF ) {
3598         if (    ( aExp == 0 )
3599              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3600             return a;
3601         }
3602         float_set_inexact();
3603         aSign = extractFloatx80Sign( a );
3604         switch ( float_rounding_mode() ) {
3605          case float_round_nearest_even:
3606             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3607                ) {
3608                 return
3609                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3610             }
3611             break;
3612          case float_round_down:
3613             return
3614                   aSign ?
3615                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3616                 : packFloatx80( 0, 0, 0 );
3617          case float_round_up:
3618             return
3619                   aSign ? packFloatx80( 1, 0, 0 )
3620                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3621         }
3622         return packFloatx80( aSign, 0, 0 );
3623     }
3624     lastBitMask = 1;
3625     lastBitMask <<= 0x403E - aExp;
3626     roundBitsMask = lastBitMask - 1;
3627     z = a;
3628     roundingMode = float_rounding_mode();
3629     if ( roundingMode == float_round_nearest_even ) {
3630         z.low += lastBitMask>>1;
3631         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3632     }
3633     else if ( roundingMode != float_round_to_zero ) {
3634         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3635             z.low += roundBitsMask;
3636         }
3637     }
3638     z.low &= ~ roundBitsMask;
3639     if ( z.low == 0 ) {
3640         ++z.high;
3641         z.low = LIT64( 0x8000000000000000 );
3642     }
3643     if ( z.low != a.low ) float_set_inexact();
3644     return z;
3645 
3646 }
3647 
3648 /*
3649 -------------------------------------------------------------------------------
3650 Returns the result of adding the absolute values of the extended double-
3651 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3652 negated before being returned.  `zSign' is ignored if the result is a NaN.
3653 The addition is performed according to the IEC/IEEE Standard for Binary
3654 Floating-Point Arithmetic.
3655 -------------------------------------------------------------------------------
3656 */
addFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3657 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3658 {
3659     int32 aExp, bExp, zExp;
3660     bits64 aSig, bSig, zSig0, zSig1;
3661     int32 expDiff;
3662 
3663     aSig = extractFloatx80Frac( a );
3664     aExp = extractFloatx80Exp( a );
3665     bSig = extractFloatx80Frac( b );
3666     bExp = extractFloatx80Exp( b );
3667     expDiff = aExp - bExp;
3668     if ( 0 < expDiff ) {
3669         if ( aExp == 0x7FFF ) {
3670             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3671             return a;
3672         }
3673         if ( bExp == 0 ) --expDiff;
3674         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3675         zExp = aExp;
3676     }
3677     else if ( expDiff < 0 ) {
3678         if ( bExp == 0x7FFF ) {
3679             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3680             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3681         }
3682         if ( aExp == 0 ) ++expDiff;
3683         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3684         zExp = bExp;
3685     }
3686     else {
3687         if ( aExp == 0x7FFF ) {
3688             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3689                 return propagateFloatx80NaN( a, b );
3690             }
3691             return a;
3692         }
3693         zSig1 = 0;
3694         zSig0 = aSig + bSig;
3695         if ( aExp == 0 ) {
3696             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3697             goto roundAndPack;
3698         }
3699         zExp = aExp;
3700         goto shiftRight1;
3701     }
3702     zSig0 = aSig + bSig;
3703     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3704  shiftRight1:
3705     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3706     zSig0 |= LIT64( 0x8000000000000000 );
3707     ++zExp;
3708  roundAndPack:
3709     return
3710         roundAndPackFloatx80(
3711             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3712 
3713 }
3714 
3715 /*
3716 -------------------------------------------------------------------------------
3717 Returns the result of subtracting the absolute values of the extended
3718 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3719 difference is negated before being returned.  `zSign' is ignored if the
3720 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3721 Standard for Binary Floating-Point Arithmetic.
3722 -------------------------------------------------------------------------------
3723 */
subFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3724 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3725 {
3726     int32 aExp, bExp, zExp;
3727     bits64 aSig, bSig, zSig0, zSig1;
3728     int32 expDiff;
3729     floatx80 z;
3730 
3731     aSig = extractFloatx80Frac( a );
3732     aExp = extractFloatx80Exp( a );
3733     bSig = extractFloatx80Frac( b );
3734     bExp = extractFloatx80Exp( b );
3735     expDiff = aExp - bExp;
3736     if ( 0 < expDiff ) goto aExpBigger;
3737     if ( expDiff < 0 ) goto bExpBigger;
3738     if ( aExp == 0x7FFF ) {
3739         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3740             return propagateFloatx80NaN( a, b );
3741         }
3742         float_raise( float_flag_invalid );
3743         z.low = floatx80_default_nan_low;
3744         z.high = floatx80_default_nan_high;
3745         return z;
3746     }
3747     if ( aExp == 0 ) {
3748         aExp = 1;
3749         bExp = 1;
3750     }
3751     zSig1 = 0;
3752     if ( bSig < aSig ) goto aBigger;
3753     if ( aSig < bSig ) goto bBigger;
3754     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3755  bExpBigger:
3756     if ( bExp == 0x7FFF ) {
3757         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3758         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3759     }
3760     if ( aExp == 0 ) ++expDiff;
3761     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3762  bBigger:
3763     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3764     zExp = bExp;
3765     zSign ^= 1;
3766     goto normalizeRoundAndPack;
3767  aExpBigger:
3768     if ( aExp == 0x7FFF ) {
3769         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3770         return a;
3771     }
3772     if ( bExp == 0 ) --expDiff;
3773     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3774  aBigger:
3775     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3776     zExp = aExp;
3777  normalizeRoundAndPack:
3778     return
3779         normalizeRoundAndPackFloatx80(
3780             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3781 
3782 }
3783 
3784 /*
3785 -------------------------------------------------------------------------------
3786 Returns the result of adding the extended double-precision floating-point
3787 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3788 Standard for Binary Floating-Point Arithmetic.
3789 -------------------------------------------------------------------------------
3790 */
floatx80_add(floatx80 a,floatx80 b)3791 floatx80 floatx80_add( floatx80 a, floatx80 b )
3792 {
3793     flag aSign, bSign;
3794 
3795     aSign = extractFloatx80Sign( a );
3796     bSign = extractFloatx80Sign( b );
3797     if ( aSign == bSign ) {
3798         return addFloatx80Sigs( a, b, aSign );
3799     }
3800     else {
3801         return subFloatx80Sigs( a, b, aSign );
3802     }
3803 
3804 }
3805 
3806 /*
3807 -------------------------------------------------------------------------------
3808 Returns the result of subtracting the extended double-precision floating-
3809 point values `a' and `b'.  The operation is performed according to the
3810 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3811 -------------------------------------------------------------------------------
3812 */
floatx80_sub(floatx80 a,floatx80 b)3813 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3814 {
3815     flag aSign, bSign;
3816 
3817     aSign = extractFloatx80Sign( a );
3818     bSign = extractFloatx80Sign( b );
3819     if ( aSign == bSign ) {
3820         return subFloatx80Sigs( a, b, aSign );
3821     }
3822     else {
3823         return addFloatx80Sigs( a, b, aSign );
3824     }
3825 
3826 }
3827 
3828 /*
3829 -------------------------------------------------------------------------------
3830 Returns the result of multiplying the extended double-precision floating-
3831 point values `a' and `b'.  The operation is performed according to the
3832 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3833 -------------------------------------------------------------------------------
3834 */
floatx80_mul(floatx80 a,floatx80 b)3835 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3836 {
3837     flag aSign, bSign, zSign;
3838     int32 aExp, bExp, zExp;
3839     bits64 aSig, bSig, zSig0, zSig1;
3840     floatx80 z;
3841 
3842     aSig = extractFloatx80Frac( a );
3843     aExp = extractFloatx80Exp( a );
3844     aSign = extractFloatx80Sign( a );
3845     bSig = extractFloatx80Frac( b );
3846     bExp = extractFloatx80Exp( b );
3847     bSign = extractFloatx80Sign( b );
3848     zSign = aSign ^ bSign;
3849     if ( aExp == 0x7FFF ) {
3850         if (    (bits64) ( aSig<<1 )
3851              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3852             return propagateFloatx80NaN( a, b );
3853         }
3854         if ( ( bExp | bSig ) == 0 ) goto invalid;
3855         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3856     }
3857     if ( bExp == 0x7FFF ) {
3858         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3859         if ( ( aExp | aSig ) == 0 ) {
3860  invalid:
3861             float_raise( float_flag_invalid );
3862             z.low = floatx80_default_nan_low;
3863             z.high = floatx80_default_nan_high;
3864             return z;
3865         }
3866         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3867     }
3868     if ( aExp == 0 ) {
3869         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3870         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3871     }
3872     if ( bExp == 0 ) {
3873         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3874         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3875     }
3876     zExp = aExp + bExp - 0x3FFE;
3877     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3878     if ( 0 < (sbits64) zSig0 ) {
3879         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3880         --zExp;
3881     }
3882     return
3883         roundAndPackFloatx80(
3884             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3885 
3886 }
3887 
3888 /*
3889 -------------------------------------------------------------------------------
3890 Returns the result of dividing the extended double-precision floating-point
3891 value `a' by the corresponding value `b'.  The operation is performed
3892 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3893 -------------------------------------------------------------------------------
3894 */
floatx80_div(floatx80 a,floatx80 b)3895 floatx80 floatx80_div( floatx80 a, floatx80 b )
3896 {
3897     flag aSign, bSign, zSign;
3898     int32 aExp, bExp, zExp;
3899     bits64 aSig, bSig, zSig0, zSig1;
3900     bits64 rem0, rem1, rem2, term0, term1, term2;
3901     floatx80 z;
3902 
3903     aSig = extractFloatx80Frac( a );
3904     aExp = extractFloatx80Exp( a );
3905     aSign = extractFloatx80Sign( a );
3906     bSig = extractFloatx80Frac( b );
3907     bExp = extractFloatx80Exp( b );
3908     bSign = extractFloatx80Sign( b );
3909     zSign = aSign ^ bSign;
3910     if ( aExp == 0x7FFF ) {
3911         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3912         if ( bExp == 0x7FFF ) {
3913             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3914             goto invalid;
3915         }
3916         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3917     }
3918     if ( bExp == 0x7FFF ) {
3919         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3920         return packFloatx80( zSign, 0, 0 );
3921     }
3922     if ( bExp == 0 ) {
3923         if ( bSig == 0 ) {
3924             if ( ( aExp | aSig ) == 0 ) {
3925  invalid:
3926                 float_raise( float_flag_invalid );
3927                 z.low = floatx80_default_nan_low;
3928                 z.high = floatx80_default_nan_high;
3929                 return z;
3930             }
3931             float_raise( float_flag_divbyzero );
3932             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3933         }
3934         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3935     }
3936     if ( aExp == 0 ) {
3937         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3938         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3939     }
3940     zExp = aExp - bExp + 0x3FFE;
3941     rem1 = 0;
3942     if ( bSig <= aSig ) {
3943         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3944         ++zExp;
3945     }
3946     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3947     mul64To128( bSig, zSig0, &term0, &term1 );
3948     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3949     while ( (sbits64) rem0 < 0 ) {
3950         --zSig0;
3951         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3952     }
3953     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3954     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3955         mul64To128( bSig, zSig1, &term1, &term2 );
3956         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3957         while ( (sbits64) rem1 < 0 ) {
3958             --zSig1;
3959             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3960         }
3961         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3962     }
3963     return
3964         roundAndPackFloatx80(
3965             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3966 
3967 }
3968 
3969 /*
3970 -------------------------------------------------------------------------------
3971 Returns the remainder of the extended double-precision floating-point value
3972 `a' with respect to the corresponding value `b'.  The operation is performed
3973 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3974 -------------------------------------------------------------------------------
3975 */
floatx80_rem(floatx80 a,floatx80 b)3976 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3977 {
3978     flag aSign, bSign, zSign;
3979     int32 aExp, bExp, expDiff;
3980     bits64 aSig0, aSig1, bSig;
3981     bits64 q, term0, term1, alternateASig0, alternateASig1;
3982     floatx80 z;
3983 
3984     aSig0 = extractFloatx80Frac( a );
3985     aExp = extractFloatx80Exp( a );
3986     aSign = extractFloatx80Sign( a );
3987     bSig = extractFloatx80Frac( b );
3988     bExp = extractFloatx80Exp( b );
3989     bSign = extractFloatx80Sign( b );
3990     if ( aExp == 0x7FFF ) {
3991         if (    (bits64) ( aSig0<<1 )
3992              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3993             return propagateFloatx80NaN( a, b );
3994         }
3995         goto invalid;
3996     }
3997     if ( bExp == 0x7FFF ) {
3998         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3999         return a;
4000     }
4001     if ( bExp == 0 ) {
4002         if ( bSig == 0 ) {
4003  invalid:
4004             float_raise( float_flag_invalid );
4005             z.low = floatx80_default_nan_low;
4006             z.high = floatx80_default_nan_high;
4007             return z;
4008         }
4009         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4010     }
4011     if ( aExp == 0 ) {
4012         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4013         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4014     }
4015     bSig |= LIT64( 0x8000000000000000 );
4016     zSign = aSign;
4017     expDiff = aExp - bExp;
4018     aSig1 = 0;
4019     if ( expDiff < 0 ) {
4020         if ( expDiff < -1 ) return a;
4021         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4022         expDiff = 0;
4023     }
4024     q = ( bSig <= aSig0 );
4025     if ( q ) aSig0 -= bSig;
4026     expDiff -= 64;
4027     while ( 0 < expDiff ) {
4028         q = estimateDiv128To64( aSig0, aSig1, bSig );
4029         q = ( 2 < q ) ? q - 2 : 0;
4030         mul64To128( bSig, q, &term0, &term1 );
4031         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4032         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4033         expDiff -= 62;
4034     }
4035     expDiff += 64;
4036     if ( 0 < expDiff ) {
4037         q = estimateDiv128To64( aSig0, aSig1, bSig );
4038         q = ( 2 < q ) ? q - 2 : 0;
4039         q >>= 64 - expDiff;
4040         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4041         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4042         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4043         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4044             ++q;
4045             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4046         }
4047     }
4048     else {
4049         term1 = 0;
4050         term0 = bSig;
4051     }
4052     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4053     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4054          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4055               && ( q & 1 ) )
4056        ) {
4057         aSig0 = alternateASig0;
4058         aSig1 = alternateASig1;
4059         zSign = ! zSign;
4060     }
4061     return
4062         normalizeRoundAndPackFloatx80(
4063             80, zSign, bExp + expDiff, aSig0, aSig1 );
4064 
4065 }
4066 
4067 /*
4068 -------------------------------------------------------------------------------
4069 Returns the square root of the extended double-precision floating-point
4070 value `a'.  The operation is performed according to the IEC/IEEE Standard
4071 for Binary Floating-Point Arithmetic.
4072 -------------------------------------------------------------------------------
4073 */
floatx80_sqrt(floatx80 a)4074 floatx80 floatx80_sqrt( floatx80 a )
4075 {
4076     flag aSign;
4077     int32 aExp, zExp;
4078     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4079     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4080     floatx80 z;
4081 
4082     aSig0 = extractFloatx80Frac( a );
4083     aExp = extractFloatx80Exp( a );
4084     aSign = extractFloatx80Sign( a );
4085     if ( aExp == 0x7FFF ) {
4086         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4087         if ( ! aSign ) return a;
4088         goto invalid;
4089     }
4090     if ( aSign ) {
4091         if ( ( aExp | aSig0 ) == 0 ) return a;
4092  invalid:
4093         float_raise( float_flag_invalid );
4094         z.low = floatx80_default_nan_low;
4095         z.high = floatx80_default_nan_high;
4096         return z;
4097     }
4098     if ( aExp == 0 ) {
4099         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4100         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4101     }
4102     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4103     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4104     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4105     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4106     doubleZSig0 = zSig0<<1;
4107     mul64To128( zSig0, zSig0, &term0, &term1 );
4108     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4109     while ( (sbits64) rem0 < 0 ) {
4110         --zSig0;
4111         doubleZSig0 -= 2;
4112         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4113     }
4114     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4115     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4116         if ( zSig1 == 0 ) zSig1 = 1;
4117         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4118         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4119         mul64To128( zSig1, zSig1, &term2, &term3 );
4120         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4121         while ( (sbits64) rem1 < 0 ) {
4122             --zSig1;
4123             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4124             term3 |= 1;
4125             term2 |= doubleZSig0;
4126             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4127         }
4128         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4129     }
4130     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4131     zSig0 |= doubleZSig0;
4132     return
4133         roundAndPackFloatx80(
4134             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4135 
4136 }
4137 
4138 /*
4139 -------------------------------------------------------------------------------
4140 Returns 1 if the extended double-precision floating-point value `a' is
4141 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4142 performed according to the IEC/IEEE Standard for Binary Floating-Point
4143 Arithmetic.
4144 -------------------------------------------------------------------------------
4145 */
floatx80_eq(floatx80 a,floatx80 b)4146 flag floatx80_eq( floatx80 a, floatx80 b )
4147 {
4148 
4149     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4150               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4151          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4152               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4153        ) {
4154         if (    floatx80_is_signaling_nan( a )
4155              || floatx80_is_signaling_nan( b ) ) {
4156             float_raise( float_flag_invalid );
4157         }
4158         return 0;
4159     }
4160     return
4161            ( a.low == b.low )
4162         && (    ( a.high == b.high )
4163              || (    ( a.low == 0 )
4164                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4165            );
4166 
4167 }
4168 
4169 /*
4170 -------------------------------------------------------------------------------
4171 Returns 1 if the extended double-precision floating-point value `a' is
4172 less than or equal to the corresponding value `b', and 0 otherwise.  The
4173 comparison is performed according to the IEC/IEEE Standard for Binary
4174 Floating-Point Arithmetic.
4175 -------------------------------------------------------------------------------
4176 */
floatx80_le(floatx80 a,floatx80 b)4177 flag floatx80_le( floatx80 a, floatx80 b )
4178 {
4179     flag aSign, bSign;
4180 
4181     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4182               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4183          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4184               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4185        ) {
4186         float_raise( float_flag_invalid );
4187         return 0;
4188     }
4189     aSign = extractFloatx80Sign( a );
4190     bSign = extractFloatx80Sign( b );
4191     if ( aSign != bSign ) {
4192         return
4193                aSign
4194             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4195                  == 0 );
4196     }
4197     return
4198           aSign ? le128( b.high, b.low, a.high, a.low )
4199         : le128( a.high, a.low, b.high, b.low );
4200 
4201 }
4202 
4203 /*
4204 -------------------------------------------------------------------------------
4205 Returns 1 if the extended double-precision floating-point value `a' is
4206 less than the corresponding value `b', and 0 otherwise.  The comparison
4207 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4208 Arithmetic.
4209 -------------------------------------------------------------------------------
4210 */
floatx80_lt(floatx80 a,floatx80 b)4211 flag floatx80_lt( floatx80 a, floatx80 b )
4212 {
4213     flag aSign, bSign;
4214 
4215     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4216               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4217          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4218               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4219        ) {
4220         float_raise( float_flag_invalid );
4221         return 0;
4222     }
4223     aSign = extractFloatx80Sign( a );
4224     bSign = extractFloatx80Sign( b );
4225     if ( aSign != bSign ) {
4226         return
4227                aSign
4228             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4229                  != 0 );
4230     }
4231     return
4232           aSign ? lt128( b.high, b.low, a.high, a.low )
4233         : lt128( a.high, a.low, b.high, b.low );
4234 
4235 }
4236 
4237 /*
4238 -------------------------------------------------------------------------------
4239 Returns 1 if the extended double-precision floating-point value `a' is equal
4240 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4241 raised if either operand is a NaN.  Otherwise, the comparison is performed
4242 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4243 -------------------------------------------------------------------------------
4244 */
floatx80_eq_signaling(floatx80 a,floatx80 b)4245 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4246 {
4247 
4248     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4249               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4250          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4251               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4252        ) {
4253         float_raise( float_flag_invalid );
4254         return 0;
4255     }
4256     return
4257            ( a.low == b.low )
4258         && (    ( a.high == b.high )
4259              || (    ( a.low == 0 )
4260                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4261            );
4262 
4263 }
4264 
4265 /*
4266 -------------------------------------------------------------------------------
4267 Returns 1 if the extended double-precision floating-point value `a' is less
4268 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4269 do not cause an exception.  Otherwise, the comparison is performed according
4270 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4271 -------------------------------------------------------------------------------
4272 */
floatx80_le_quiet(floatx80 a,floatx80 b)4273 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4274 {
4275     flag aSign, bSign;
4276 
4277     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4278               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4279          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4280               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4281        ) {
4282         if (    floatx80_is_signaling_nan( a )
4283              || floatx80_is_signaling_nan( b ) ) {
4284             float_raise( float_flag_invalid );
4285         }
4286         return 0;
4287     }
4288     aSign = extractFloatx80Sign( a );
4289     bSign = extractFloatx80Sign( b );
4290     if ( aSign != bSign ) {
4291         return
4292                aSign
4293             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4294                  == 0 );
4295     }
4296     return
4297           aSign ? le128( b.high, b.low, a.high, a.low )
4298         : le128( a.high, a.low, b.high, b.low );
4299 
4300 }
4301 
4302 /*
4303 -------------------------------------------------------------------------------
4304 Returns 1 if the extended double-precision floating-point value `a' is less
4305 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4306 an exception.  Otherwise, the comparison is performed according to the
4307 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4308 -------------------------------------------------------------------------------
4309 */
floatx80_lt_quiet(floatx80 a,floatx80 b)4310 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4311 {
4312     flag aSign, bSign;
4313 
4314     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4315               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4316          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4317               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4318        ) {
4319         if (    floatx80_is_signaling_nan( a )
4320              || floatx80_is_signaling_nan( b ) ) {
4321             float_raise( float_flag_invalid );
4322         }
4323         return 0;
4324     }
4325     aSign = extractFloatx80Sign( a );
4326     bSign = extractFloatx80Sign( b );
4327     if ( aSign != bSign ) {
4328         return
4329                aSign
4330             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4331                  != 0 );
4332     }
4333     return
4334           aSign ? lt128( b.high, b.low, a.high, a.low )
4335         : lt128( a.high, a.low, b.high, b.low );
4336 
4337 }
4338 
4339 #endif
4340 
4341 #ifdef FLOAT128
4342 
4343 /*
4344 -------------------------------------------------------------------------------
4345 Returns the result of converting the quadruple-precision floating-point
4346 value `a' to the 32-bit two's complement integer format.  The conversion
4347 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4348 Arithmetic---which means in particular that the conversion is rounded
4349 according to the current rounding mode.  If `a' is a NaN, the largest
4350 positive integer is returned.  Otherwise, if the conversion overflows, the
4351 largest integer with the same sign as `a' is returned.
4352 -------------------------------------------------------------------------------
4353 */
float128_to_int32(float128 a)4354 int32 float128_to_int32( float128 a )
4355 {
4356     flag aSign;
4357     int32 aExp, shiftCount;
4358     bits64 aSig0, aSig1;
4359 
4360     aSig1 = extractFloat128Frac1( a );
4361     aSig0 = extractFloat128Frac0( a );
4362     aExp = extractFloat128Exp( a );
4363     aSign = extractFloat128Sign( a );
4364     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4365     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4366     aSig0 |= ( aSig1 != 0 );
4367     shiftCount = 0x4028 - aExp;
4368     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4369     return roundAndPackInt32( aSign, aSig0 );
4370 
4371 }
4372 
4373 /*
4374 -------------------------------------------------------------------------------
4375 Returns the result of converting the quadruple-precision floating-point
4376 value `a' to the 32-bit two's complement integer format.  The conversion
4377 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4378 Arithmetic, except that the conversion is always rounded toward zero.  If
4379 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4380 conversion overflows, the largest integer with the same sign as `a' is
4381 returned.
4382 -------------------------------------------------------------------------------
4383 */
float128_to_int32_round_to_zero(float128 a)4384 int32 float128_to_int32_round_to_zero( float128 a )
4385 {
4386     flag aSign;
4387     int32 aExp, shiftCount;
4388     bits64 aSig0, aSig1, savedASig;
4389     int32 z;
4390 
4391     aSig1 = extractFloat128Frac1( a );
4392     aSig0 = extractFloat128Frac0( a );
4393     aExp = extractFloat128Exp( a );
4394     aSign = extractFloat128Sign( a );
4395     aSig0 |= ( aSig1 != 0 );
4396     if ( 0x401E < aExp ) {
4397         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4398         goto invalid;
4399     }
4400     else if ( aExp < 0x3FFF ) {
4401         if ( aExp || aSig0 ) float_set_inexact();
4402         return 0;
4403     }
4404     aSig0 |= LIT64( 0x0001000000000000 );
4405     shiftCount = 0x402F - aExp;
4406     savedASig = aSig0;
4407     aSig0 >>= shiftCount;
4408     z = aSig0;
4409     if ( aSign ) z = - z;
4410     if ( ( z < 0 ) ^ aSign ) {
4411  invalid:
4412         float_raise( float_flag_invalid );
4413         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4414     }
4415     if ( ( aSig0<<shiftCount ) != savedASig ) {
4416         float_set_inexact();
4417     }
4418     return z;
4419 
4420 }
4421 
4422 /*
4423 -------------------------------------------------------------------------------
4424 Returns the result of converting the quadruple-precision floating-point
4425 value `a' to the 64-bit two's complement integer format.  The conversion
4426 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4427 Arithmetic---which means in particular that the conversion is rounded
4428 according to the current rounding mode.  If `a' is a NaN, the largest
4429 positive integer is returned.  Otherwise, if the conversion overflows, the
4430 largest integer with the same sign as `a' is returned.
4431 -------------------------------------------------------------------------------
4432 */
float128_to_int64(float128 a)4433 int64 float128_to_int64( float128 a )
4434 {
4435     flag aSign;
4436     int32 aExp, shiftCount;
4437     bits64 aSig0, aSig1;
4438 
4439     aSig1 = extractFloat128Frac1( a );
4440     aSig0 = extractFloat128Frac0( a );
4441     aExp = extractFloat128Exp( a );
4442     aSign = extractFloat128Sign( a );
4443     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4444     shiftCount = 0x402F - aExp;
4445     if ( shiftCount <= 0 ) {
4446         if ( 0x403E < aExp ) {
4447             float_raise( float_flag_invalid );
4448             if (    ! aSign
4449                  || (    ( aExp == 0x7FFF )
4450                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4451                     )
4452                ) {
4453                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4454             }
4455             return (sbits64) LIT64( 0x8000000000000000 );
4456         }
4457         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4458     }
4459     else {
4460         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4461     }
4462     return roundAndPackInt64( aSign, aSig0, aSig1 );
4463 
4464 }
4465 
4466 /*
4467 -------------------------------------------------------------------------------
4468 Returns the result of converting the quadruple-precision floating-point
4469 value `a' to the 64-bit two's complement integer format.  The conversion
4470 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4471 Arithmetic, except that the conversion is always rounded toward zero.
4472 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4473 the conversion overflows, the largest integer with the same sign as `a' is
4474 returned.
4475 -------------------------------------------------------------------------------
4476 */
float128_to_int64_round_to_zero(float128 a)4477 int64 float128_to_int64_round_to_zero( float128 a )
4478 {
4479     flag aSign;
4480     int32 aExp, shiftCount;
4481     bits64 aSig0, aSig1;
4482     int64 z;
4483 
4484     aSig1 = extractFloat128Frac1( a );
4485     aSig0 = extractFloat128Frac0( a );
4486     aExp = extractFloat128Exp( a );
4487     aSign = extractFloat128Sign( a );
4488     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4489     shiftCount = aExp - 0x402F;
4490     if ( 0 < shiftCount ) {
4491         if ( 0x403E <= aExp ) {
4492             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4493             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4494                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4495                 if ( aSig1 ) float_set_inexact();
4496             }
4497             else {
4498                 float_raise( float_flag_invalid );
4499                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4500                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4501                 }
4502             }
4503             return (sbits64) LIT64( 0x8000000000000000 );
4504         }
4505         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4506         if ( (bits64) ( aSig1<<shiftCount ) ) {
4507             float_set_inexact();
4508         }
4509     }
4510     else {
4511         if ( aExp < 0x3FFF ) {
4512             if ( aExp | aSig0 | aSig1 ) {
4513                 float_set_inexact();
4514             }
4515             return 0;
4516         }
4517         z = aSig0>>( - shiftCount );
4518         if (    aSig1
4519              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4520             float_set_inexact();
4521         }
4522     }
4523     if ( aSign ) z = - z;
4524     return z;
4525 
4526 }
4527 
4528 /*
4529 -------------------------------------------------------------------------------
4530 Returns the result of converting the quadruple-precision floating-point
4531 value `a' to the single-precision floating-point format.  The conversion
4532 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4533 Arithmetic.
4534 -------------------------------------------------------------------------------
4535 */
float128_to_float32(float128 a)4536 float32 float128_to_float32( float128 a )
4537 {
4538     flag aSign;
4539     int32 aExp;
4540     bits64 aSig0, aSig1;
4541     bits32 zSig;
4542 
4543     aSig1 = extractFloat128Frac1( a );
4544     aSig0 = extractFloat128Frac0( a );
4545     aExp = extractFloat128Exp( a );
4546     aSign = extractFloat128Sign( a );
4547     if ( aExp == 0x7FFF ) {
4548         if ( aSig0 | aSig1 ) {
4549             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4550         }
4551         return packFloat32( aSign, 0xFF, 0 );
4552     }
4553     aSig0 |= ( aSig1 != 0 );
4554     shift64RightJamming( aSig0, 18, &aSig0 );
4555     zSig = aSig0;
4556     if ( aExp || zSig ) {
4557         zSig |= 0x40000000;
4558         aExp -= 0x3F81;
4559     }
4560     return roundAndPackFloat32( aSign, aExp, zSig );
4561 
4562 }
4563 
4564 /*
4565 -------------------------------------------------------------------------------
4566 Returns the result of converting the quadruple-precision floating-point
4567 value `a' to the double-precision floating-point format.  The conversion
4568 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4569 Arithmetic.
4570 -------------------------------------------------------------------------------
4571 */
float128_to_float64(float128 a)4572 float64 float128_to_float64( float128 a )
4573 {
4574     flag aSign;
4575     int32 aExp;
4576     bits64 aSig0, aSig1;
4577 
4578     aSig1 = extractFloat128Frac1( a );
4579     aSig0 = extractFloat128Frac0( a );
4580     aExp = extractFloat128Exp( a );
4581     aSign = extractFloat128Sign( a );
4582     if ( aExp == 0x7FFF ) {
4583         if ( aSig0 | aSig1 ) {
4584             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4585         }
4586         return packFloat64( aSign, 0x7FF, 0 );
4587     }
4588     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4589     aSig0 |= ( aSig1 != 0 );
4590     if ( aExp || aSig0 ) {
4591         aSig0 |= LIT64( 0x4000000000000000 );
4592         aExp -= 0x3C01;
4593     }
4594     return roundAndPackFloat64( aSign, aExp, aSig0 );
4595 
4596 }
4597 
4598 #ifdef FLOATX80
4599 
4600 /*
4601 -------------------------------------------------------------------------------
4602 Returns the result of converting the quadruple-precision floating-point
4603 value `a' to the extended double-precision floating-point format.  The
4604 conversion is performed according to the IEC/IEEE Standard for Binary
4605 Floating-Point Arithmetic.
4606 -------------------------------------------------------------------------------
4607 */
float128_to_floatx80(float128 a)4608 floatx80 float128_to_floatx80( float128 a )
4609 {
4610     flag aSign;
4611     int32 aExp;
4612     bits64 aSig0, aSig1;
4613 
4614     aSig1 = extractFloat128Frac1( a );
4615     aSig0 = extractFloat128Frac0( a );
4616     aExp = extractFloat128Exp( a );
4617     aSign = extractFloat128Sign( a );
4618     if ( aExp == 0x7FFF ) {
4619         if ( aSig0 | aSig1 ) {
4620             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4621         }
4622         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4623     }
4624     if ( aExp == 0 ) {
4625         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4626         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4627     }
4628     else {
4629         aSig0 |= LIT64( 0x0001000000000000 );
4630     }
4631     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4632     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4633 
4634 }
4635 
4636 #endif
4637 
4638 /*
4639 -------------------------------------------------------------------------------
4640 Rounds the quadruple-precision floating-point value `a' to an integer, and
4641 returns the result as a quadruple-precision floating-point value.  The
4642 operation is performed according to the IEC/IEEE Standard for Binary
4643 Floating-Point Arithmetic.
4644 -------------------------------------------------------------------------------
4645 */
float128_round_to_int(float128 a)4646 float128 float128_round_to_int( float128 a )
4647 {
4648     flag aSign;
4649     int32 aExp;
4650     bits64 lastBitMask, roundBitsMask;
4651     int8 roundingMode;
4652     float128 z;
4653 
4654     aExp = extractFloat128Exp( a );
4655     if ( 0x402F <= aExp ) {
4656         if ( 0x406F <= aExp ) {
4657             if (    ( aExp == 0x7FFF )
4658                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4659                ) {
4660                 return propagateFloat128NaN( a, a );
4661             }
4662             return a;
4663         }
4664         lastBitMask = 1;
4665         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4666         roundBitsMask = lastBitMask - 1;
4667         z = a;
4668         roundingMode = float_rounding_mode();
4669         if ( roundingMode == float_round_nearest_even ) {
4670             if ( lastBitMask ) {
4671                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4672                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4673             }
4674             else {
4675                 if ( (sbits64) z.low < 0 ) {
4676                     ++z.high;
4677                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4678                 }
4679             }
4680         }
4681         else if ( roundingMode != float_round_to_zero ) {
4682             if (   extractFloat128Sign( z )
4683                  ^ ( roundingMode == float_round_up ) ) {
4684                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4685             }
4686         }
4687         z.low &= ~ roundBitsMask;
4688     }
4689     else {
4690         if ( aExp < 0x3FFF ) {
4691             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4692             float_set_inexact();
4693             aSign = extractFloat128Sign( a );
4694             switch ( float_rounding_mode() ) {
4695              case float_round_nearest_even:
4696                 if (    ( aExp == 0x3FFE )
4697                      && (   extractFloat128Frac0( a )
4698                           | extractFloat128Frac1( a ) )
4699                    ) {
4700                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4701                 }
4702                 break;
4703              case float_round_down:
4704                 return
4705                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4706                     : packFloat128( 0, 0, 0, 0 );
4707              case float_round_up:
4708                 return
4709                       aSign ? packFloat128( 1, 0, 0, 0 )
4710                     : packFloat128( 0, 0x3FFF, 0, 0 );
4711             }
4712             return packFloat128( aSign, 0, 0, 0 );
4713         }
4714         lastBitMask = 1;
4715         lastBitMask <<= 0x402F - aExp;
4716         roundBitsMask = lastBitMask - 1;
4717         z.low = 0;
4718         z.high = a.high;
4719         roundingMode = float_rounding_mode();
4720         if ( roundingMode == float_round_nearest_even ) {
4721             z.high += lastBitMask>>1;
4722             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4723                 z.high &= ~ lastBitMask;
4724             }
4725         }
4726         else if ( roundingMode != float_round_to_zero ) {
4727             if (   extractFloat128Sign( z )
4728                  ^ ( roundingMode == float_round_up ) ) {
4729                 z.high |= ( a.low != 0 );
4730                 z.high += roundBitsMask;
4731             }
4732         }
4733         z.high &= ~ roundBitsMask;
4734     }
4735     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4736         float_set_inexact();
4737     }
4738     return z;
4739 
4740 }
4741 
4742 /*
4743 -------------------------------------------------------------------------------
4744 Returns the result of adding the absolute values of the quadruple-precision
4745 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4746 before being returned.  `zSign' is ignored if the result is a NaN.
4747 The addition is performed according to the IEC/IEEE Standard for Binary
4748 Floating-Point Arithmetic.
4749 -------------------------------------------------------------------------------
4750 */
addFloat128Sigs(float128 a,float128 b,flag zSign)4751 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4752 {
4753     int32 aExp, bExp, zExp;
4754     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4755     int32 expDiff;
4756 
4757     aSig1 = extractFloat128Frac1( a );
4758     aSig0 = extractFloat128Frac0( a );
4759     aExp = extractFloat128Exp( a );
4760     bSig1 = extractFloat128Frac1( b );
4761     bSig0 = extractFloat128Frac0( b );
4762     bExp = extractFloat128Exp( b );
4763     expDiff = aExp - bExp;
4764     if ( 0 < expDiff ) {
4765         if ( aExp == 0x7FFF ) {
4766             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4767             return a;
4768         }
4769         if ( bExp == 0 ) {
4770             --expDiff;
4771         }
4772         else {
4773             bSig0 |= LIT64( 0x0001000000000000 );
4774         }
4775         shift128ExtraRightJamming(
4776             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4777         zExp = aExp;
4778     }
4779     else if ( expDiff < 0 ) {
4780         if ( bExp == 0x7FFF ) {
4781             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4782             return packFloat128( zSign, 0x7FFF, 0, 0 );
4783         }
4784         if ( aExp == 0 ) {
4785             ++expDiff;
4786         }
4787         else {
4788             aSig0 |= LIT64( 0x0001000000000000 );
4789         }
4790         shift128ExtraRightJamming(
4791             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4792         zExp = bExp;
4793     }
4794     else {
4795         if ( aExp == 0x7FFF ) {
4796             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4797                 return propagateFloat128NaN( a, b );
4798             }
4799             return a;
4800         }
4801         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4802         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4803         zSig2 = 0;
4804         zSig0 |= LIT64( 0x0002000000000000 );
4805         zExp = aExp;
4806         goto shiftRight1;
4807     }
4808     aSig0 |= LIT64( 0x0001000000000000 );
4809     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4810     --zExp;
4811     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4812     ++zExp;
4813  shiftRight1:
4814     shift128ExtraRightJamming(
4815         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4816  roundAndPack:
4817     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4818 
4819 }
4820 
4821 /*
4822 -------------------------------------------------------------------------------
4823 Returns the result of subtracting the absolute values of the quadruple-
4824 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4825 difference is negated before being returned.  `zSign' is ignored if the
4826 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4827 Standard for Binary Floating-Point Arithmetic.
4828 -------------------------------------------------------------------------------
4829 */
subFloat128Sigs(float128 a,float128 b,flag zSign)4830 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4831 {
4832     int32 aExp, bExp, zExp;
4833     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4834     int32 expDiff;
4835     float128 z;
4836 
4837     aSig1 = extractFloat128Frac1( a );
4838     aSig0 = extractFloat128Frac0( a );
4839     aExp = extractFloat128Exp( a );
4840     bSig1 = extractFloat128Frac1( b );
4841     bSig0 = extractFloat128Frac0( b );
4842     bExp = extractFloat128Exp( b );
4843     expDiff = aExp - bExp;
4844     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4845     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4846     if ( 0 < expDiff ) goto aExpBigger;
4847     if ( expDiff < 0 ) goto bExpBigger;
4848     if ( aExp == 0x7FFF ) {
4849         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4850             return propagateFloat128NaN( a, b );
4851         }
4852         float_raise( float_flag_invalid );
4853         z.low = float128_default_nan_low;
4854         z.high = float128_default_nan_high;
4855         return z;
4856     }
4857     if ( aExp == 0 ) {
4858         aExp = 1;
4859         bExp = 1;
4860     }
4861     if ( bSig0 < aSig0 ) goto aBigger;
4862     if ( aSig0 < bSig0 ) goto bBigger;
4863     if ( bSig1 < aSig1 ) goto aBigger;
4864     if ( aSig1 < bSig1 ) goto bBigger;
4865     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4866  bExpBigger:
4867     if ( bExp == 0x7FFF ) {
4868         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4869         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4870     }
4871     if ( aExp == 0 ) {
4872         ++expDiff;
4873     }
4874     else {
4875         aSig0 |= LIT64( 0x4000000000000000 );
4876     }
4877     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4878     bSig0 |= LIT64( 0x4000000000000000 );
4879  bBigger:
4880     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4881     zExp = bExp;
4882     zSign ^= 1;
4883     goto normalizeRoundAndPack;
4884  aExpBigger:
4885     if ( aExp == 0x7FFF ) {
4886         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4887         return a;
4888     }
4889     if ( bExp == 0 ) {
4890         --expDiff;
4891     }
4892     else {
4893         bSig0 |= LIT64( 0x4000000000000000 );
4894     }
4895     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4896     aSig0 |= LIT64( 0x4000000000000000 );
4897  aBigger:
4898     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4899     zExp = aExp;
4900  normalizeRoundAndPack:
4901     --zExp;
4902     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4903 
4904 }
4905 
4906 /*
4907 -------------------------------------------------------------------------------
4908 Returns the result of adding the quadruple-precision floating-point values
4909 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4910 for Binary Floating-Point Arithmetic.
4911 -------------------------------------------------------------------------------
4912 */
float128_add(float128 a,float128 b)4913 float128 float128_add( float128 a, float128 b )
4914 {
4915     flag aSign, bSign;
4916 
4917     aSign = extractFloat128Sign( a );
4918     bSign = extractFloat128Sign( b );
4919     if ( aSign == bSign ) {
4920         return addFloat128Sigs( a, b, aSign );
4921     }
4922     else {
4923         return subFloat128Sigs( a, b, aSign );
4924     }
4925 
4926 }
4927 
4928 /*
4929 -------------------------------------------------------------------------------
4930 Returns the result of subtracting the quadruple-precision floating-point
4931 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4932 Standard for Binary Floating-Point Arithmetic.
4933 -------------------------------------------------------------------------------
4934 */
float128_sub(float128 a,float128 b)4935 float128 float128_sub( float128 a, float128 b )
4936 {
4937     flag aSign, bSign;
4938 
4939     aSign = extractFloat128Sign( a );
4940     bSign = extractFloat128Sign( b );
4941     if ( aSign == bSign ) {
4942         return subFloat128Sigs( a, b, aSign );
4943     }
4944     else {
4945         return addFloat128Sigs( a, b, aSign );
4946     }
4947 
4948 }
4949 
4950 /*
4951 -------------------------------------------------------------------------------
4952 Returns the result of multiplying the quadruple-precision floating-point
4953 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4954 Standard for Binary Floating-Point Arithmetic.
4955 -------------------------------------------------------------------------------
4956 */
float128_mul(float128 a,float128 b)4957 float128 float128_mul( float128 a, float128 b )
4958 {
4959     flag aSign, bSign, zSign;
4960     int32 aExp, bExp, zExp;
4961     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4962     float128 z;
4963 
4964     aSig1 = extractFloat128Frac1( a );
4965     aSig0 = extractFloat128Frac0( a );
4966     aExp = extractFloat128Exp( a );
4967     aSign = extractFloat128Sign( a );
4968     bSig1 = extractFloat128Frac1( b );
4969     bSig0 = extractFloat128Frac0( b );
4970     bExp = extractFloat128Exp( b );
4971     bSign = extractFloat128Sign( b );
4972     zSign = aSign ^ bSign;
4973     if ( aExp == 0x7FFF ) {
4974         if (    ( aSig0 | aSig1 )
4975              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4976             return propagateFloat128NaN( a, b );
4977         }
4978         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4979         return packFloat128( zSign, 0x7FFF, 0, 0 );
4980     }
4981     if ( bExp == 0x7FFF ) {
4982         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4983         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4984  invalid:
4985             float_raise( float_flag_invalid );
4986             z.low = float128_default_nan_low;
4987             z.high = float128_default_nan_high;
4988             return z;
4989         }
4990         return packFloat128( zSign, 0x7FFF, 0, 0 );
4991     }
4992     if ( aExp == 0 ) {
4993         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4994         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4995     }
4996     if ( bExp == 0 ) {
4997         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4998         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4999     }
5000     zExp = aExp + bExp - 0x4000;
5001     aSig0 |= LIT64( 0x0001000000000000 );
5002     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5003     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5004     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5005     zSig2 |= ( zSig3 != 0 );
5006     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5007         shift128ExtraRightJamming(
5008             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5009         ++zExp;
5010     }
5011     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5012 
5013 }
5014 
5015 /*
5016 -------------------------------------------------------------------------------
5017 Returns the result of dividing the quadruple-precision floating-point value
5018 `a' by the corresponding value `b'.  The operation is performed according to
5019 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5020 -------------------------------------------------------------------------------
5021 */
float128_div(float128 a,float128 b)5022 float128 float128_div( float128 a, float128 b )
5023 {
5024     flag aSign, bSign, zSign;
5025     int32 aExp, bExp, zExp;
5026     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5027     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5028     float128 z;
5029 
5030     aSig1 = extractFloat128Frac1( a );
5031     aSig0 = extractFloat128Frac0( a );
5032     aExp = extractFloat128Exp( a );
5033     aSign = extractFloat128Sign( a );
5034     bSig1 = extractFloat128Frac1( b );
5035     bSig0 = extractFloat128Frac0( b );
5036     bExp = extractFloat128Exp( b );
5037     bSign = extractFloat128Sign( b );
5038     zSign = aSign ^ bSign;
5039     if ( aExp == 0x7FFF ) {
5040         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5041         if ( bExp == 0x7FFF ) {
5042             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5043             goto invalid;
5044         }
5045         return packFloat128( zSign, 0x7FFF, 0, 0 );
5046     }
5047     if ( bExp == 0x7FFF ) {
5048         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5049         return packFloat128( zSign, 0, 0, 0 );
5050     }
5051     if ( bExp == 0 ) {
5052         if ( ( bSig0 | bSig1 ) == 0 ) {
5053             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5054  invalid:
5055                 float_raise( float_flag_invalid );
5056                 z.low = float128_default_nan_low;
5057                 z.high = float128_default_nan_high;
5058                 return z;
5059             }
5060             float_raise( float_flag_divbyzero );
5061             return packFloat128( zSign, 0x7FFF, 0, 0 );
5062         }
5063         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5064     }
5065     if ( aExp == 0 ) {
5066         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5067         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5068     }
5069     zExp = aExp - bExp + 0x3FFD;
5070     shortShift128Left(
5071         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5072     shortShift128Left(
5073         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5074     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5075         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5076         ++zExp;
5077     }
5078     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5079     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5080     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5081     while ( (sbits64) rem0 < 0 ) {
5082         --zSig0;
5083         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5084     }
5085     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5086     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5087         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5088         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5089         while ( (sbits64) rem1 < 0 ) {
5090             --zSig1;
5091             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5092         }
5093         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5094     }
5095     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5096     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5097 
5098 }
5099 
5100 /*
5101 -------------------------------------------------------------------------------
5102 Returns the remainder of the quadruple-precision floating-point value `a'
5103 with respect to the corresponding value `b'.  The operation is performed
5104 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5105 -------------------------------------------------------------------------------
5106 */
float128_rem(float128 a,float128 b)5107 float128 float128_rem( float128 a, float128 b )
5108 {
5109     flag aSign, bSign, zSign;
5110     int32 aExp, bExp, expDiff;
5111     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5112     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5113     sbits64 sigMean0;
5114     float128 z;
5115 
5116     aSig1 = extractFloat128Frac1( a );
5117     aSig0 = extractFloat128Frac0( a );
5118     aExp = extractFloat128Exp( a );
5119     aSign = extractFloat128Sign( a );
5120     bSig1 = extractFloat128Frac1( b );
5121     bSig0 = extractFloat128Frac0( b );
5122     bExp = extractFloat128Exp( b );
5123     bSign = extractFloat128Sign( b );
5124     if ( aExp == 0x7FFF ) {
5125         if (    ( aSig0 | aSig1 )
5126              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5127             return propagateFloat128NaN( a, b );
5128         }
5129         goto invalid;
5130     }
5131     if ( bExp == 0x7FFF ) {
5132         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5133         return a;
5134     }
5135     if ( bExp == 0 ) {
5136         if ( ( bSig0 | bSig1 ) == 0 ) {
5137  invalid:
5138             float_raise( float_flag_invalid );
5139             z.low = float128_default_nan_low;
5140             z.high = float128_default_nan_high;
5141             return z;
5142         }
5143         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5144     }
5145     if ( aExp == 0 ) {
5146         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5147         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5148     }
5149     expDiff = aExp - bExp;
5150     if ( expDiff < -1 ) return a;
5151     shortShift128Left(
5152         aSig0 | LIT64( 0x0001000000000000 ),
5153         aSig1,
5154         15 - ( expDiff < 0 ),
5155         &aSig0,
5156         &aSig1
5157     );
5158     shortShift128Left(
5159         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5160     q = le128( bSig0, bSig1, aSig0, aSig1 );
5161     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5162     expDiff -= 64;
5163     while ( 0 < expDiff ) {
5164         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5165         q = ( 4 < q ) ? q - 4 : 0;
5166         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5167         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5168         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5169         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5170         expDiff -= 61;
5171     }
5172     if ( -64 < expDiff ) {
5173         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5174         q = ( 4 < q ) ? q - 4 : 0;
5175         q >>= - expDiff;
5176         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5177         expDiff += 52;
5178         if ( expDiff < 0 ) {
5179             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5180         }
5181         else {
5182             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5183         }
5184         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5185         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5186     }
5187     else {
5188         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5189         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5190     }
5191     do {
5192         alternateASig0 = aSig0;
5193         alternateASig1 = aSig1;
5194         ++q;
5195         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5196     } while ( 0 <= (sbits64) aSig0 );
5197     add128(
5198         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5199     if (    ( sigMean0 < 0 )
5200          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5201         aSig0 = alternateASig0;
5202         aSig1 = alternateASig1;
5203     }
5204     zSign = ( (sbits64) aSig0 < 0 );
5205     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5206     return
5207         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5208 
5209 }
5210 
5211 /*
5212 -------------------------------------------------------------------------------
5213 Returns the square root of the quadruple-precision floating-point value `a'.
5214 The operation is performed according to the IEC/IEEE Standard for Binary
5215 Floating-Point Arithmetic.
5216 -------------------------------------------------------------------------------
5217 */
float128_sqrt(float128 a)5218 float128 float128_sqrt( float128 a )
5219 {
5220     flag aSign;
5221     int32 aExp, zExp;
5222     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5223     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5224     float128 z;
5225 
5226     aSig1 = extractFloat128Frac1( a );
5227     aSig0 = extractFloat128Frac0( a );
5228     aExp = extractFloat128Exp( a );
5229     aSign = extractFloat128Sign( a );
5230     if ( aExp == 0x7FFF ) {
5231         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5232         if ( ! aSign ) return a;
5233         goto invalid;
5234     }
5235     if ( aSign ) {
5236         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5237  invalid:
5238         float_raise( float_flag_invalid );
5239         z.low = float128_default_nan_low;
5240         z.high = float128_default_nan_high;
5241         return z;
5242     }
5243     if ( aExp == 0 ) {
5244         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5245         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5246     }
5247     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5248     aSig0 |= LIT64( 0x0001000000000000 );
5249     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5250     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5251     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5252     doubleZSig0 = zSig0<<1;
5253     mul64To128( zSig0, zSig0, &term0, &term1 );
5254     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5255     while ( (sbits64) rem0 < 0 ) {
5256         --zSig0;
5257         doubleZSig0 -= 2;
5258         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5259     }
5260     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5261     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5262         if ( zSig1 == 0 ) zSig1 = 1;
5263         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5264         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5265         mul64To128( zSig1, zSig1, &term2, &term3 );
5266         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5267         while ( (sbits64) rem1 < 0 ) {
5268             --zSig1;
5269             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5270             term3 |= 1;
5271             term2 |= doubleZSig0;
5272             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5273         }
5274         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5275     }
5276     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5277     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5278 
5279 }
5280 
5281 /*
5282 -------------------------------------------------------------------------------
5283 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5284 the corresponding value `b', and 0 otherwise.  The comparison is performed
5285 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5286 -------------------------------------------------------------------------------
5287 */
float128_eq(float128 a,float128 b)5288 flag float128_eq( float128 a, float128 b )
5289 {
5290 
5291     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5292               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5293          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5294               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5295        ) {
5296         if (    float128_is_signaling_nan( a )
5297              || float128_is_signaling_nan( b ) ) {
5298             float_raise( float_flag_invalid );
5299         }
5300         return 0;
5301     }
5302     return
5303            ( a.low == b.low )
5304         && (    ( a.high == b.high )
5305              || (    ( a.low == 0 )
5306                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5307            );
5308 
5309 }
5310 
5311 /*
5312 -------------------------------------------------------------------------------
5313 Returns 1 if the quadruple-precision floating-point value `a' is less than
5314 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5315 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5316 Arithmetic.
5317 -------------------------------------------------------------------------------
5318 */
float128_le(float128 a,float128 b)5319 flag float128_le( float128 a, float128 b )
5320 {
5321     flag aSign, bSign;
5322 
5323     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5324               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5325          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5326               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5327        ) {
5328         float_raise( float_flag_invalid );
5329         return 0;
5330     }
5331     aSign = extractFloat128Sign( a );
5332     bSign = extractFloat128Sign( b );
5333     if ( aSign != bSign ) {
5334         return
5335                aSign
5336             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5337                  == 0 );
5338     }
5339     return
5340           aSign ? le128( b.high, b.low, a.high, a.low )
5341         : le128( a.high, a.low, b.high, b.low );
5342 
5343 }
5344 
5345 /*
5346 -------------------------------------------------------------------------------
5347 Returns 1 if the quadruple-precision floating-point value `a' is less than
5348 the corresponding value `b', and 0 otherwise.  The comparison is performed
5349 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5350 -------------------------------------------------------------------------------
5351 */
float128_lt(float128 a,float128 b)5352 flag float128_lt( float128 a, float128 b )
5353 {
5354     flag aSign, bSign;
5355 
5356     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5357               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5358          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5359               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5360        ) {
5361         float_raise( float_flag_invalid );
5362         return 0;
5363     }
5364     aSign = extractFloat128Sign( a );
5365     bSign = extractFloat128Sign( b );
5366     if ( aSign != bSign ) {
5367         return
5368                aSign
5369             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5370                  != 0 );
5371     }
5372     return
5373           aSign ? lt128( b.high, b.low, a.high, a.low )
5374         : lt128( a.high, a.low, b.high, b.low );
5375 
5376 }
5377 
5378 /*
5379 -------------------------------------------------------------------------------
5380 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5381 the corresponding value `b', and 0 otherwise.  The invalid exception is
5382 raised if either operand is a NaN.  Otherwise, the comparison is performed
5383 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5384 -------------------------------------------------------------------------------
5385 */
float128_eq_signaling(float128 a,float128 b)5386 flag float128_eq_signaling( float128 a, float128 b )
5387 {
5388 
5389     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5390               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5391          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5392               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5393        ) {
5394         float_raise( float_flag_invalid );
5395         return 0;
5396     }
5397     return
5398            ( a.low == b.low )
5399         && (    ( a.high == b.high )
5400              || (    ( a.low == 0 )
5401                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5402            );
5403 
5404 }
5405 
5406 /*
5407 -------------------------------------------------------------------------------
5408 Returns 1 if the quadruple-precision floating-point value `a' is less than
5409 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5410 cause an exception.  Otherwise, the comparison is performed according to the
5411 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5412 -------------------------------------------------------------------------------
5413 */
float128_le_quiet(float128 a,float128 b)5414 flag float128_le_quiet( float128 a, float128 b )
5415 {
5416     flag aSign, bSign;
5417 
5418     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5419               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5420          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5421               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5422        ) {
5423         if (    float128_is_signaling_nan( a )
5424              || float128_is_signaling_nan( b ) ) {
5425             float_raise( float_flag_invalid );
5426         }
5427         return 0;
5428     }
5429     aSign = extractFloat128Sign( a );
5430     bSign = extractFloat128Sign( b );
5431     if ( aSign != bSign ) {
5432         return
5433                aSign
5434             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5435                  == 0 );
5436     }
5437     return
5438           aSign ? le128( b.high, b.low, a.high, a.low )
5439         : le128( a.high, a.low, b.high, b.low );
5440 
5441 }
5442 
5443 /*
5444 -------------------------------------------------------------------------------
5445 Returns 1 if the quadruple-precision floating-point value `a' is less than
5446 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5447 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5448 Standard for Binary Floating-Point Arithmetic.
5449 -------------------------------------------------------------------------------
5450 */
float128_lt_quiet(float128 a,float128 b)5451 flag float128_lt_quiet( float128 a, float128 b )
5452 {
5453     flag aSign, bSign;
5454 
5455     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5456               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5457          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5458               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5459        ) {
5460         if (    float128_is_signaling_nan( a )
5461              || float128_is_signaling_nan( b ) ) {
5462             float_raise( float_flag_invalid );
5463         }
5464         return 0;
5465     }
5466     aSign = extractFloat128Sign( a );
5467     bSign = extractFloat128Sign( b );
5468     if ( aSign != bSign ) {
5469         return
5470                aSign
5471             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5472                  != 0 );
5473     }
5474     return
5475           aSign ? lt128( b.high, b.low, a.high, a.low )
5476         : lt128( a.high, a.low, b.high, b.low );
5477 
5478 }
5479 
5480 #endif
5481 
5482 
5483 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5484 
5485 /*
5486  * These two routines are not part of the original softfloat distribution.
5487  *
5488  * They are based on the corresponding conversions to integer but return
5489  * unsigned numbers instead since these functions are required by GCC.
5490  *
5491  * Added by Mark Brinicombe <mark@netbsd.org>	27/09/97
5492  *
5493  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5494  */
5495 
5496 /*
5497 -------------------------------------------------------------------------------
5498 Returns the result of converting the double-precision floating-point value
5499 `a' to the 32-bit unsigned integer format.  The conversion is
5500 performed according to the IEC/IEEE Standard for Binary Floating-point
5501 Arithmetic, except that the conversion is always rounded toward zero.  If
5502 `a' is a NaN, the largest positive integer is returned.  If the conversion
5503 overflows, the largest integer positive is returned.
5504 -------------------------------------------------------------------------------
5505 */
float64_to_uint32_round_to_zero(float64 a)5506 uint32 float64_to_uint32_round_to_zero( float64 a )
5507 {
5508     flag aSign;
5509     int16 aExp, shiftCount;
5510     bits64 aSig, savedASig;
5511     uint32 z;
5512 
5513     aSig = extractFloat64Frac( a );
5514     aExp = extractFloat64Exp( a );
5515     aSign = extractFloat64Sign( a );
5516 
5517     if (aSign) {
5518         float_raise( float_flag_invalid );
5519     	return(0);
5520     }
5521 
5522     if ( 0x41E < aExp ) {
5523         float_raise( float_flag_invalid );
5524         return 0xffffffff;
5525     }
5526     else if ( aExp < 0x3FF ) {
5527         if ( aExp || aSig ) float_set_inexact();
5528         return 0;
5529     }
5530     aSig |= LIT64( 0x0010000000000000 );
5531     shiftCount = 0x433 - aExp;
5532     savedASig = aSig;
5533     aSig >>= shiftCount;
5534     z = aSig;
5535     if ( ( aSig<<shiftCount ) != savedASig ) {
5536         float_set_inexact();
5537     }
5538     return z;
5539 
5540 }
5541 
5542 /*
5543 -------------------------------------------------------------------------------
5544 Returns the result of converting the single-precision floating-point value
5545 `a' to the 32-bit unsigned integer format.  The conversion is
5546 performed according to the IEC/IEEE Standard for Binary Floating-point
5547 Arithmetic, except that the conversion is always rounded toward zero.  If
5548 `a' is a NaN, the largest positive integer is returned.  If the conversion
5549 overflows, the largest positive integer is returned.
5550 -------------------------------------------------------------------------------
5551 */
float32_to_uint32_round_to_zero(float32 a)5552 uint32 float32_to_uint32_round_to_zero( float32 a )
5553 {
5554     flag aSign;
5555     int16 aExp, shiftCount;
5556     bits32 aSig;
5557     uint32 z;
5558 
5559     aSig = extractFloat32Frac( a );
5560     aExp = extractFloat32Exp( a );
5561     aSign = extractFloat32Sign( a );
5562     shiftCount = aExp - 0x9E;
5563 
5564     if (aSign) {
5565         float_raise( float_flag_invalid );
5566     	return(0);
5567     }
5568     if ( 0 < shiftCount ) {
5569         float_raise( float_flag_invalid );
5570         return 0xFFFFFFFF;
5571     }
5572     else if ( aExp <= 0x7E ) {
5573         if ( aExp | aSig ) float_set_inexact();
5574         return 0;
5575     }
5576     aSig = ( aSig | 0x800000 )<<8;
5577     z = aSig>>( - shiftCount );
5578     if ( aSig<<( shiftCount & 31 ) ) {
5579         float_set_inexact();
5580     }
5581     return z;
5582 
5583 }
5584 
5585 #endif
5586