xref: /freebsd/lib/libc/softfloat/bits64/softfloat.c (revision 315ee00f)
1 /* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */
2 
3 /*
4  * This version hacked for use with gcc -msoft-float by bjh21.
5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6  *  itself).
7  */
8 
9 /*
10  * Things you may want to define:
11  *
12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14  *   properly renamed.
15  */
16 
17 /*
18 ===============================================================================
19 
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 #include <sys/cdefs.h>
48 #ifdef SOFTFLOAT_FOR_GCC
49 #include "softfloat-for-gcc.h"
50 #endif
51 
52 #include "milieu.h"
53 #include "softfloat.h"
54 
55 /*
56  * Conversions between floats as stored in memory and floats as
57  * SoftFloat uses them
58  */
59 #ifndef FLOAT64_DEMANGLE
60 #define FLOAT64_DEMANGLE(a)	(a)
61 #endif
62 #ifndef FLOAT64_MANGLE
63 #define FLOAT64_MANGLE(a)	(a)
64 #endif
65 
66 /*
67 -------------------------------------------------------------------------------
68 Floating-point rounding mode, extended double-precision rounding precision,
69 and exception flags.
70 -------------------------------------------------------------------------------
71 */
72 int float_rounding_mode = float_round_nearest_even;
73 int float_exception_flags = 0;
74 #ifdef FLOATX80
75 int8 floatx80_rounding_precision = 80;
76 #endif
77 
78 /*
79 -------------------------------------------------------------------------------
80 Primitive arithmetic functions, including multi-word arithmetic, and
81 division and square root approximations.  (Can be specialized to target if
82 desired.)
83 -------------------------------------------------------------------------------
84 */
85 #include "softfloat-macros"
86 
87 /*
88 -------------------------------------------------------------------------------
89 Functions and definitions to determine:  (1) whether tininess for underflow
90 is detected before or after rounding by default, (2) what (if anything)
91 happens when exceptions are raised, (3) how signaling NaNs are distinguished
92 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
93 are propagated from function inputs to output.  These details are target-
94 specific.
95 -------------------------------------------------------------------------------
96 */
97 #include "softfloat-specialize"
98 
99 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
100 /*
101 -------------------------------------------------------------------------------
102 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
103 and 7, and returns the properly rounded 32-bit integer corresponding to the
104 input.  If `zSign' is 1, the input is negated before being converted to an
105 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
106 is simply rounded to an integer, with the inexact exception raised if the
107 input cannot be represented exactly as an integer.  However, if the fixed-
108 point input is too large, the invalid exception is raised and the largest
109 positive or negative integer is returned.
110 -------------------------------------------------------------------------------
111 */
112 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
113 {
114     int8 roundingMode;
115     flag roundNearestEven;
116     int8 roundIncrement, roundBits;
117     int32 z;
118 
119     roundingMode = float_rounding_mode;
120     roundNearestEven = ( roundingMode == float_round_nearest_even );
121     roundIncrement = 0x40;
122     if ( ! roundNearestEven ) {
123         if ( roundingMode == float_round_to_zero ) {
124             roundIncrement = 0;
125         }
126         else {
127             roundIncrement = 0x7F;
128             if ( zSign ) {
129                 if ( roundingMode == float_round_up ) roundIncrement = 0;
130             }
131             else {
132                 if ( roundingMode == float_round_down ) roundIncrement = 0;
133             }
134         }
135     }
136     roundBits = absZ & 0x7F;
137     absZ = ( absZ + roundIncrement )>>7;
138     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
139     z = absZ;
140     if ( zSign ) z = - z;
141     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
142         float_raise( float_flag_invalid );
143         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
144     }
145     if ( roundBits ) float_exception_flags |= float_flag_inexact;
146     return z;
147 
148 }
149 
150 /*
151 -------------------------------------------------------------------------------
152 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
153 `absZ1', with binary point between bits 63 and 64 (between the input words),
154 and returns the properly rounded 64-bit integer corresponding to the input.
155 If `zSign' is 1, the input is negated before being converted to an integer.
156 Ordinarily, the fixed-point input is simply rounded to an integer, with
157 the inexact exception raised if the input cannot be represented exactly as
158 an integer.  However, if the fixed-point input is too large, the invalid
159 exception is raised and the largest positive or negative integer is
160 returned.
161 -------------------------------------------------------------------------------
162 */
163 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
164 {
165     int8 roundingMode;
166     flag roundNearestEven, increment;
167     int64 z;
168 
169     roundingMode = float_rounding_mode;
170     roundNearestEven = ( roundingMode == float_round_nearest_even );
171     increment = ( (sbits64) absZ1 < 0 );
172     if ( ! roundNearestEven ) {
173         if ( roundingMode == float_round_to_zero ) {
174             increment = 0;
175         }
176         else {
177             if ( zSign ) {
178                 increment = ( roundingMode == float_round_down ) && absZ1;
179             }
180             else {
181                 increment = ( roundingMode == float_round_up ) && absZ1;
182             }
183         }
184     }
185     if ( increment ) {
186         ++absZ0;
187         if ( absZ0 == 0 ) goto overflow;
188         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
189     }
190     z = absZ0;
191     if ( zSign ) z = - z;
192     if ( z && ( ( z < 0 ) ^ zSign ) ) {
193  overflow:
194         float_raise( float_flag_invalid );
195         return
196               zSign ? (sbits64) LIT64( 0x8000000000000000 )
197             : LIT64( 0x7FFFFFFFFFFFFFFF );
198     }
199     if ( absZ1 ) float_exception_flags |= float_flag_inexact;
200     return z;
201 
202 }
203 #endif
204 
205 /*
206 -------------------------------------------------------------------------------
207 Returns the fraction bits of the single-precision floating-point value `a'.
208 -------------------------------------------------------------------------------
209 */
210 INLINE bits32 extractFloat32Frac( float32 a )
211 {
212 
213     return a & 0x007FFFFF;
214 
215 }
216 
217 /*
218 -------------------------------------------------------------------------------
219 Returns the exponent bits of the single-precision floating-point value `a'.
220 -------------------------------------------------------------------------------
221 */
222 INLINE int16 extractFloat32Exp( float32 a )
223 {
224 
225     return ( a>>23 ) & 0xFF;
226 
227 }
228 
229 /*
230 -------------------------------------------------------------------------------
231 Returns the sign bit of the single-precision floating-point value `a'.
232 -------------------------------------------------------------------------------
233 */
234 INLINE flag extractFloat32Sign( float32 a )
235 {
236 
237     return a>>31;
238 
239 }
240 
241 /*
242 -------------------------------------------------------------------------------
243 Normalizes the subnormal single-precision floating-point value represented
244 by the denormalized significand `aSig'.  The normalized exponent and
245 significand are stored at the locations pointed to by `zExpPtr' and
246 `zSigPtr', respectively.
247 -------------------------------------------------------------------------------
248 */
249 static void
250  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
251 {
252     int8 shiftCount;
253 
254     shiftCount = countLeadingZeros32( aSig ) - 8;
255     *zSigPtr = aSig<<shiftCount;
256     *zExpPtr = 1 - shiftCount;
257 
258 }
259 
260 /*
261 -------------------------------------------------------------------------------
262 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
263 single-precision floating-point value, returning the result.  After being
264 shifted into the proper positions, the three fields are simply added
265 together to form the result.  This means that any integer portion of `zSig'
266 will be added into the exponent.  Since a properly normalized significand
267 will have an integer portion equal to 1, the `zExp' input should be 1 less
268 than the desired result exponent whenever `zSig' is a complete, normalized
269 significand.
270 -------------------------------------------------------------------------------
271 */
272 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
273 {
274 
275     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
276 
277 }
278 
279 /*
280 -------------------------------------------------------------------------------
281 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
282 and significand `zSig', and returns the proper single-precision floating-
283 point value corresponding to the abstract input.  Ordinarily, the abstract
284 value is simply rounded and packed into the single-precision format, with
285 the inexact exception raised if the abstract input cannot be represented
286 exactly.  However, if the abstract value is too large, the overflow and
287 inexact exceptions are raised and an infinity or maximal finite value is
288 returned.  If the abstract value is too small, the input value is rounded to
289 a subnormal number, and the underflow and inexact exceptions are raised if
290 the abstract input cannot be represented exactly as a subnormal single-
291 precision floating-point number.
292     The input significand `zSig' has its binary point between bits 30
293 and 29, which is 7 bits to the left of the usual location.  This shifted
294 significand must be normalized or smaller.  If `zSig' is not normalized,
295 `zExp' must be 0; in that case, the result returned is a subnormal number,
296 and it must not require rounding.  In the usual case that `zSig' is
297 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
298 The handling of underflow and overflow follows the IEC/IEEE Standard for
299 Binary Floating-Point Arithmetic.
300 -------------------------------------------------------------------------------
301 */
302 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
303 {
304     int8 roundingMode;
305     flag roundNearestEven;
306     int8 roundIncrement, roundBits;
307     flag isTiny;
308 
309     roundingMode = float_rounding_mode;
310     roundNearestEven = ( roundingMode == float_round_nearest_even );
311     roundIncrement = 0x40;
312     if ( ! roundNearestEven ) {
313         if ( roundingMode == float_round_to_zero ) {
314             roundIncrement = 0;
315         }
316         else {
317             roundIncrement = 0x7F;
318             if ( zSign ) {
319                 if ( roundingMode == float_round_up ) roundIncrement = 0;
320             }
321             else {
322                 if ( roundingMode == float_round_down ) roundIncrement = 0;
323             }
324         }
325     }
326     roundBits = zSig & 0x7F;
327     if ( 0xFD <= (bits16) zExp ) {
328         if (    ( 0xFD < zExp )
329              || (    ( zExp == 0xFD )
330                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
331            ) {
332             float_raise( float_flag_overflow | float_flag_inexact );
333             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
334         }
335         if ( zExp < 0 ) {
336             isTiny =
337                    ( float_detect_tininess == float_tininess_before_rounding )
338                 || ( zExp < -1 )
339                 || ( zSig + roundIncrement < 0x80000000 );
340             shift32RightJamming( zSig, - zExp, &zSig );
341             zExp = 0;
342             roundBits = zSig & 0x7F;
343             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
344         }
345     }
346     if ( roundBits ) float_exception_flags |= float_flag_inexact;
347     zSig = ( zSig + roundIncrement )>>7;
348     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
349     if ( zSig == 0 ) zExp = 0;
350     return packFloat32( zSign, zExp, zSig );
351 
352 }
353 
354 /*
355 -------------------------------------------------------------------------------
356 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
357 and significand `zSig', and returns the proper single-precision floating-
358 point value corresponding to the abstract input.  This routine is just like
359 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
360 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
361 floating-point exponent.
362 -------------------------------------------------------------------------------
363 */
364 static float32
365  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
366 {
367     int8 shiftCount;
368 
369     shiftCount = countLeadingZeros32( zSig ) - 1;
370     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
371 
372 }
373 
374 /*
375 -------------------------------------------------------------------------------
376 Returns the fraction bits of the double-precision floating-point value `a'.
377 -------------------------------------------------------------------------------
378 */
379 INLINE bits64 extractFloat64Frac( float64 a )
380 {
381 
382     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
383 
384 }
385 
386 /*
387 -------------------------------------------------------------------------------
388 Returns the exponent bits of the double-precision floating-point value `a'.
389 -------------------------------------------------------------------------------
390 */
391 INLINE int16 extractFloat64Exp( float64 a )
392 {
393 
394     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
395 
396 }
397 
398 /*
399 -------------------------------------------------------------------------------
400 Returns the sign bit of the double-precision floating-point value `a'.
401 -------------------------------------------------------------------------------
402 */
403 INLINE flag extractFloat64Sign( float64 a )
404 {
405 
406     return FLOAT64_DEMANGLE(a)>>63;
407 
408 }
409 
410 /*
411 -------------------------------------------------------------------------------
412 Normalizes the subnormal double-precision floating-point value represented
413 by the denormalized significand `aSig'.  The normalized exponent and
414 significand are stored at the locations pointed to by `zExpPtr' and
415 `zSigPtr', respectively.
416 -------------------------------------------------------------------------------
417 */
418 static void
419  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
420 {
421     int8 shiftCount;
422 
423     shiftCount = countLeadingZeros64( aSig ) - 11;
424     *zSigPtr = aSig<<shiftCount;
425     *zExpPtr = 1 - shiftCount;
426 
427 }
428 
429 /*
430 -------------------------------------------------------------------------------
431 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
432 double-precision floating-point value, returning the result.  After being
433 shifted into the proper positions, the three fields are simply added
434 together to form the result.  This means that any integer portion of `zSig'
435 will be added into the exponent.  Since a properly normalized significand
436 will have an integer portion equal to 1, the `zExp' input should be 1 less
437 than the desired result exponent whenever `zSig' is a complete, normalized
438 significand.
439 -------------------------------------------------------------------------------
440 */
441 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
442 {
443 
444     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
445 			   ( ( (bits64) zExp )<<52 ) + zSig );
446 
447 }
448 
449 /*
450 -------------------------------------------------------------------------------
451 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
452 and significand `zSig', and returns the proper double-precision floating-
453 point value corresponding to the abstract input.  Ordinarily, the abstract
454 value is simply rounded and packed into the double-precision format, with
455 the inexact exception raised if the abstract input cannot be represented
456 exactly.  However, if the abstract value is too large, the overflow and
457 inexact exceptions are raised and an infinity or maximal finite value is
458 returned.  If the abstract value is too small, the input value is rounded to
459 a subnormal number, and the underflow and inexact exceptions are raised if
460 the abstract input cannot be represented exactly as a subnormal double-
461 precision floating-point number.
462     The input significand `zSig' has its binary point between bits 62
463 and 61, which is 10 bits to the left of the usual location.  This shifted
464 significand must be normalized or smaller.  If `zSig' is not normalized,
465 `zExp' must be 0; in that case, the result returned is a subnormal number,
466 and it must not require rounding.  In the usual case that `zSig' is
467 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
468 The handling of underflow and overflow follows the IEC/IEEE Standard for
469 Binary Floating-Point Arithmetic.
470 -------------------------------------------------------------------------------
471 */
472 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
473 {
474     int8 roundingMode;
475     flag roundNearestEven;
476     int16 roundIncrement, roundBits;
477     flag isTiny;
478 
479     roundingMode = float_rounding_mode;
480     roundNearestEven = ( roundingMode == float_round_nearest_even );
481     roundIncrement = 0x200;
482     if ( ! roundNearestEven ) {
483         if ( roundingMode == float_round_to_zero ) {
484             roundIncrement = 0;
485         }
486         else {
487             roundIncrement = 0x3FF;
488             if ( zSign ) {
489                 if ( roundingMode == float_round_up ) roundIncrement = 0;
490             }
491             else {
492                 if ( roundingMode == float_round_down ) roundIncrement = 0;
493             }
494         }
495     }
496     roundBits = zSig & 0x3FF;
497     if ( 0x7FD <= (bits16) zExp ) {
498         if (    ( 0x7FD < zExp )
499              || (    ( zExp == 0x7FD )
500                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
501            ) {
502             float_raise( float_flag_overflow | float_flag_inexact );
503             return FLOAT64_MANGLE(
504 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
505 		( roundIncrement == 0 ));
506         }
507         if ( zExp < 0 ) {
508             isTiny =
509                    ( float_detect_tininess == float_tininess_before_rounding )
510                 || ( zExp < -1 )
511                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
512             shift64RightJamming( zSig, - zExp, &zSig );
513             zExp = 0;
514             roundBits = zSig & 0x3FF;
515             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
516         }
517     }
518     if ( roundBits ) float_exception_flags |= float_flag_inexact;
519     zSig = ( zSig + roundIncrement )>>10;
520     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
521     if ( zSig == 0 ) zExp = 0;
522     return packFloat64( zSign, zExp, zSig );
523 
524 }
525 
526 /*
527 -------------------------------------------------------------------------------
528 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
529 and significand `zSig', and returns the proper double-precision floating-
530 point value corresponding to the abstract input.  This routine is just like
531 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
532 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
533 floating-point exponent.
534 -------------------------------------------------------------------------------
535 */
536 static float64
537  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
538 {
539     int8 shiftCount;
540 
541     shiftCount = countLeadingZeros64( zSig ) - 1;
542     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
543 
544 }
545 
546 #ifdef FLOATX80
547 
548 /*
549 -------------------------------------------------------------------------------
550 Returns the fraction bits of the extended double-precision floating-point
551 value `a'.
552 -------------------------------------------------------------------------------
553 */
554 INLINE bits64 extractFloatx80Frac( floatx80 a )
555 {
556 
557     return a.low;
558 
559 }
560 
561 /*
562 -------------------------------------------------------------------------------
563 Returns the exponent bits of the extended double-precision floating-point
564 value `a'.
565 -------------------------------------------------------------------------------
566 */
567 INLINE int32 extractFloatx80Exp( floatx80 a )
568 {
569 
570     return a.high & 0x7FFF;
571 
572 }
573 
574 /*
575 -------------------------------------------------------------------------------
576 Returns the sign bit of the extended double-precision floating-point value
577 `a'.
578 -------------------------------------------------------------------------------
579 */
580 INLINE flag extractFloatx80Sign( floatx80 a )
581 {
582 
583     return a.high>>15;
584 
585 }
586 
587 /*
588 -------------------------------------------------------------------------------
589 Normalizes the subnormal extended double-precision floating-point value
590 represented by the denormalized significand `aSig'.  The normalized exponent
591 and significand are stored at the locations pointed to by `zExpPtr' and
592 `zSigPtr', respectively.
593 -------------------------------------------------------------------------------
594 */
595 static void
596  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
597 {
598     int8 shiftCount;
599 
600     shiftCount = countLeadingZeros64( aSig );
601     *zSigPtr = aSig<<shiftCount;
602     *zExpPtr = 1 - shiftCount;
603 
604 }
605 
606 /*
607 -------------------------------------------------------------------------------
608 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
609 extended double-precision floating-point value, returning the result.
610 -------------------------------------------------------------------------------
611 */
612 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
613 {
614     floatx80 z;
615 
616     z.low = zSig;
617     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
618     return z;
619 
620 }
621 
622 /*
623 -------------------------------------------------------------------------------
624 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
625 and extended significand formed by the concatenation of `zSig0' and `zSig1',
626 and returns the proper extended double-precision floating-point value
627 corresponding to the abstract input.  Ordinarily, the abstract value is
628 rounded and packed into the extended double-precision format, with the
629 inexact exception raised if the abstract input cannot be represented
630 exactly.  However, if the abstract value is too large, the overflow and
631 inexact exceptions are raised and an infinity or maximal finite value is
632 returned.  If the abstract value is too small, the input value is rounded to
633 a subnormal number, and the underflow and inexact exceptions are raised if
634 the abstract input cannot be represented exactly as a subnormal extended
635 double-precision floating-point number.
636     If `roundingPrecision' is 32 or 64, the result is rounded to the same
637 number of bits as single or double precision, respectively.  Otherwise, the
638 result is rounded to the full precision of the extended double-precision
639 format.
640     The input significand must be normalized or smaller.  If the input
641 significand is not normalized, `zExp' must be 0; in that case, the result
642 returned is a subnormal number, and it must not require rounding.  The
643 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
644 Floating-Point Arithmetic.
645 -------------------------------------------------------------------------------
646 */
647 static floatx80
648  roundAndPackFloatx80(
649      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
650  )
651 {
652     int8 roundingMode;
653     flag roundNearestEven, increment, isTiny;
654     int64 roundIncrement, roundMask, roundBits;
655 
656     roundingMode = float_rounding_mode;
657     roundNearestEven = ( roundingMode == float_round_nearest_even );
658     if ( roundingPrecision == 80 ) goto precision80;
659     if ( roundingPrecision == 64 ) {
660         roundIncrement = LIT64( 0x0000000000000400 );
661         roundMask = LIT64( 0x00000000000007FF );
662     }
663     else if ( roundingPrecision == 32 ) {
664         roundIncrement = LIT64( 0x0000008000000000 );
665         roundMask = LIT64( 0x000000FFFFFFFFFF );
666     }
667     else {
668         goto precision80;
669     }
670     zSig0 |= ( zSig1 != 0 );
671     if ( ! roundNearestEven ) {
672         if ( roundingMode == float_round_to_zero ) {
673             roundIncrement = 0;
674         }
675         else {
676             roundIncrement = roundMask;
677             if ( zSign ) {
678                 if ( roundingMode == float_round_up ) roundIncrement = 0;
679             }
680             else {
681                 if ( roundingMode == float_round_down ) roundIncrement = 0;
682             }
683         }
684     }
685     roundBits = zSig0 & roundMask;
686     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
687         if (    ( 0x7FFE < zExp )
688              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
689            ) {
690             goto overflow;
691         }
692         if ( zExp <= 0 ) {
693             isTiny =
694                    ( float_detect_tininess == float_tininess_before_rounding )
695                 || ( zExp < 0 )
696                 || ( zSig0 <= zSig0 + roundIncrement );
697             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
698             zExp = 0;
699             roundBits = zSig0 & roundMask;
700             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
701             if ( roundBits ) float_exception_flags |= float_flag_inexact;
702             zSig0 += roundIncrement;
703             if ( (sbits64) zSig0 < 0 ) zExp = 1;
704             roundIncrement = roundMask + 1;
705             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
706                 roundMask |= roundIncrement;
707             }
708             zSig0 &= ~ roundMask;
709             return packFloatx80( zSign, zExp, zSig0 );
710         }
711     }
712     if ( roundBits ) float_exception_flags |= float_flag_inexact;
713     zSig0 += roundIncrement;
714     if ( zSig0 < roundIncrement ) {
715         ++zExp;
716         zSig0 = LIT64( 0x8000000000000000 );
717     }
718     roundIncrement = roundMask + 1;
719     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
720         roundMask |= roundIncrement;
721     }
722     zSig0 &= ~ roundMask;
723     if ( zSig0 == 0 ) zExp = 0;
724     return packFloatx80( zSign, zExp, zSig0 );
725  precision80:
726     increment = ( (sbits64) zSig1 < 0 );
727     if ( ! roundNearestEven ) {
728         if ( roundingMode == float_round_to_zero ) {
729             increment = 0;
730         }
731         else {
732             if ( zSign ) {
733                 increment = ( roundingMode == float_round_down ) && zSig1;
734             }
735             else {
736                 increment = ( roundingMode == float_round_up ) && zSig1;
737             }
738         }
739     }
740     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
741         if (    ( 0x7FFE < zExp )
742              || (    ( zExp == 0x7FFE )
743                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
744                   && increment
745                 )
746            ) {
747             roundMask = 0;
748  overflow:
749             float_raise( float_flag_overflow | float_flag_inexact );
750             if (    ( roundingMode == float_round_to_zero )
751                  || ( zSign && ( roundingMode == float_round_up ) )
752                  || ( ! zSign && ( roundingMode == float_round_down ) )
753                ) {
754                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
755             }
756             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
757         }
758         if ( zExp <= 0 ) {
759             isTiny =
760                    ( float_detect_tininess == float_tininess_before_rounding )
761                 || ( zExp < 0 )
762                 || ! increment
763                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
764             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
765             zExp = 0;
766             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
767             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
768             if ( roundNearestEven ) {
769                 increment = ( (sbits64) zSig1 < 0 );
770             }
771             else {
772                 if ( zSign ) {
773                     increment = ( roundingMode == float_round_down ) && zSig1;
774                 }
775                 else {
776                     increment = ( roundingMode == float_round_up ) && zSig1;
777                 }
778             }
779             if ( increment ) {
780                 ++zSig0;
781                 zSig0 &=
782                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
783                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
784             }
785             return packFloatx80( zSign, zExp, zSig0 );
786         }
787     }
788     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
789     if ( increment ) {
790         ++zSig0;
791         if ( zSig0 == 0 ) {
792             ++zExp;
793             zSig0 = LIT64( 0x8000000000000000 );
794         }
795         else {
796             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
797         }
798     }
799     else {
800         if ( zSig0 == 0 ) zExp = 0;
801     }
802     return packFloatx80( zSign, zExp, zSig0 );
803 
804 }
805 
806 /*
807 -------------------------------------------------------------------------------
808 Takes an abstract floating-point value having sign `zSign', exponent
809 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
810 and returns the proper extended double-precision floating-point value
811 corresponding to the abstract input.  This routine is just like
812 `roundAndPackFloatx80' except that the input significand does not have to be
813 normalized.
814 -------------------------------------------------------------------------------
815 */
816 static floatx80
817  normalizeRoundAndPackFloatx80(
818      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
819  )
820 {
821     int8 shiftCount;
822 
823     if ( zSig0 == 0 ) {
824         zSig0 = zSig1;
825         zSig1 = 0;
826         zExp -= 64;
827     }
828     shiftCount = countLeadingZeros64( zSig0 );
829     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
830     zExp -= shiftCount;
831     return
832         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
833 
834 }
835 
836 #endif
837 
838 #ifdef FLOAT128
839 
840 /*
841 -------------------------------------------------------------------------------
842 Returns the least-significant 64 fraction bits of the quadruple-precision
843 floating-point value `a'.
844 -------------------------------------------------------------------------------
845 */
846 INLINE bits64 extractFloat128Frac1( float128 a )
847 {
848 
849     return a.low;
850 
851 }
852 
853 /*
854 -------------------------------------------------------------------------------
855 Returns the most-significant 48 fraction bits of the quadruple-precision
856 floating-point value `a'.
857 -------------------------------------------------------------------------------
858 */
859 INLINE bits64 extractFloat128Frac0( float128 a )
860 {
861 
862     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
863 
864 }
865 
866 /*
867 -------------------------------------------------------------------------------
868 Returns the exponent bits of the quadruple-precision floating-point value
869 `a'.
870 -------------------------------------------------------------------------------
871 */
872 INLINE int32 extractFloat128Exp( float128 a )
873 {
874 
875     return ( a.high>>48 ) & 0x7FFF;
876 
877 }
878 
879 /*
880 -------------------------------------------------------------------------------
881 Returns the sign bit of the quadruple-precision floating-point value `a'.
882 -------------------------------------------------------------------------------
883 */
884 INLINE flag extractFloat128Sign( float128 a )
885 {
886 
887     return a.high>>63;
888 
889 }
890 
891 /*
892 -------------------------------------------------------------------------------
893 Normalizes the subnormal quadruple-precision floating-point value
894 represented by the denormalized significand formed by the concatenation of
895 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
896 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
897 significand are stored at the location pointed to by `zSig0Ptr', and the
898 least significant 64 bits of the normalized significand are stored at the
899 location pointed to by `zSig1Ptr'.
900 -------------------------------------------------------------------------------
901 */
902 static void
903  normalizeFloat128Subnormal(
904      bits64 aSig0,
905      bits64 aSig1,
906      int32 *zExpPtr,
907      bits64 *zSig0Ptr,
908      bits64 *zSig1Ptr
909  )
910 {
911     int8 shiftCount;
912 
913     if ( aSig0 == 0 ) {
914         shiftCount = countLeadingZeros64( aSig1 ) - 15;
915         if ( shiftCount < 0 ) {
916             *zSig0Ptr = aSig1>>( - shiftCount );
917             *zSig1Ptr = aSig1<<( shiftCount & 63 );
918         }
919         else {
920             *zSig0Ptr = aSig1<<shiftCount;
921             *zSig1Ptr = 0;
922         }
923         *zExpPtr = - shiftCount - 63;
924     }
925     else {
926         shiftCount = countLeadingZeros64( aSig0 ) - 15;
927         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
928         *zExpPtr = 1 - shiftCount;
929     }
930 
931 }
932 
933 /*
934 -------------------------------------------------------------------------------
935 Packs the sign `zSign', the exponent `zExp', and the significand formed
936 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
937 floating-point value, returning the result.  After being shifted into the
938 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
939 added together to form the most significant 32 bits of the result.  This
940 means that any integer portion of `zSig0' will be added into the exponent.
941 Since a properly normalized significand will have an integer portion equal
942 to 1, the `zExp' input should be 1 less than the desired result exponent
943 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
944 significand.
945 -------------------------------------------------------------------------------
946 */
947 INLINE float128
948  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
949 {
950     float128 z;
951 
952     z.low = zSig1;
953     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
954     return z;
955 
956 }
957 
958 /*
959 -------------------------------------------------------------------------------
960 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
961 and extended significand formed by the concatenation of `zSig0', `zSig1',
962 and `zSig2', and returns the proper quadruple-precision floating-point value
963 corresponding to the abstract input.  Ordinarily, the abstract value is
964 simply rounded and packed into the quadruple-precision format, with the
965 inexact exception raised if the abstract input cannot be represented
966 exactly.  However, if the abstract value is too large, the overflow and
967 inexact exceptions are raised and an infinity or maximal finite value is
968 returned.  If the abstract value is too small, the input value is rounded to
969 a subnormal number, and the underflow and inexact exceptions are raised if
970 the abstract input cannot be represented exactly as a subnormal quadruple-
971 precision floating-point number.
972     The input significand must be normalized or smaller.  If the input
973 significand is not normalized, `zExp' must be 0; in that case, the result
974 returned is a subnormal number, and it must not require rounding.  In the
975 usual case that the input significand is normalized, `zExp' must be 1 less
976 than the ``true'' floating-point exponent.  The handling of underflow and
977 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
978 -------------------------------------------------------------------------------
979 */
980 static float128
981  roundAndPackFloat128(
982      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
983 {
984     int8 roundingMode;
985     flag roundNearestEven, increment, isTiny;
986 
987     roundingMode = float_rounding_mode;
988     roundNearestEven = ( roundingMode == float_round_nearest_even );
989     increment = ( (sbits64) zSig2 < 0 );
990     if ( ! roundNearestEven ) {
991         if ( roundingMode == float_round_to_zero ) {
992             increment = 0;
993         }
994         else {
995             if ( zSign ) {
996                 increment = ( roundingMode == float_round_down ) && zSig2;
997             }
998             else {
999                 increment = ( roundingMode == float_round_up ) && zSig2;
1000             }
1001         }
1002     }
1003     if ( 0x7FFD <= (bits32) zExp ) {
1004         if (    ( 0x7FFD < zExp )
1005              || (    ( zExp == 0x7FFD )
1006                   && eq128(
1007                          LIT64( 0x0001FFFFFFFFFFFF ),
1008                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1009                          zSig0,
1010                          zSig1
1011                      )
1012                   && increment
1013                 )
1014            ) {
1015             float_raise( float_flag_overflow | float_flag_inexact );
1016             if (    ( roundingMode == float_round_to_zero )
1017                  || ( zSign && ( roundingMode == float_round_up ) )
1018                  || ( ! zSign && ( roundingMode == float_round_down ) )
1019                ) {
1020                 return
1021                     packFloat128(
1022                         zSign,
1023                         0x7FFE,
1024                         LIT64( 0x0000FFFFFFFFFFFF ),
1025                         LIT64( 0xFFFFFFFFFFFFFFFF )
1026                     );
1027             }
1028             return packFloat128( zSign, 0x7FFF, 0, 0 );
1029         }
1030         if ( zExp < 0 ) {
1031             isTiny =
1032                    ( float_detect_tininess == float_tininess_before_rounding )
1033                 || ( zExp < -1 )
1034                 || ! increment
1035                 || lt128(
1036                        zSig0,
1037                        zSig1,
1038                        LIT64( 0x0001FFFFFFFFFFFF ),
1039                        LIT64( 0xFFFFFFFFFFFFFFFF )
1040                    );
1041             shift128ExtraRightJamming(
1042                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1043             zExp = 0;
1044             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1045             if ( roundNearestEven ) {
1046                 increment = ( (sbits64) zSig2 < 0 );
1047             }
1048             else {
1049                 if ( zSign ) {
1050                     increment = ( roundingMode == float_round_down ) && zSig2;
1051                 }
1052                 else {
1053                     increment = ( roundingMode == float_round_up ) && zSig2;
1054                 }
1055             }
1056         }
1057     }
1058     if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1059     if ( increment ) {
1060         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1061         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1062     }
1063     else {
1064         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1065     }
1066     return packFloat128( zSign, zExp, zSig0, zSig1 );
1067 
1068 }
1069 
1070 /*
1071 -------------------------------------------------------------------------------
1072 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1073 and significand formed by the concatenation of `zSig0' and `zSig1', and
1074 returns the proper quadruple-precision floating-point value corresponding
1075 to the abstract input.  This routine is just like `roundAndPackFloat128'
1076 except that the input significand has fewer bits and does not have to be
1077 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1078 point exponent.
1079 -------------------------------------------------------------------------------
1080 */
1081 static float128
1082  normalizeRoundAndPackFloat128(
1083      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1084 {
1085     int8 shiftCount;
1086     bits64 zSig2;
1087 
1088     if ( zSig0 == 0 ) {
1089         zSig0 = zSig1;
1090         zSig1 = 0;
1091         zExp -= 64;
1092     }
1093     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1094     if ( 0 <= shiftCount ) {
1095         zSig2 = 0;
1096         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1097     }
1098     else {
1099         shift128ExtraRightJamming(
1100             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1101     }
1102     zExp -= shiftCount;
1103     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1104 
1105 }
1106 
1107 #endif
1108 
1109 /*
1110 -------------------------------------------------------------------------------
1111 Returns the result of converting the 32-bit two's complement integer `a'
1112 to the single-precision floating-point format.  The conversion is performed
1113 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1114 -------------------------------------------------------------------------------
1115 */
1116 float32 int32_to_float32( int32 a )
1117 {
1118     flag zSign;
1119 
1120     if ( a == 0 ) return 0;
1121     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1122     zSign = ( a < 0 );
1123     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1124 
1125 }
1126 
1127 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsisf is in libgcc */
1128 float32 uint32_to_float32( uint32 a )
1129 {
1130     if ( a == 0 ) return 0;
1131     if ( a & (bits32) 0x80000000 )
1132 	return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1133     return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1134 }
1135 #endif
1136 
1137 
1138 /*
1139 -------------------------------------------------------------------------------
1140 Returns the result of converting the 32-bit two's complement integer `a'
1141 to the double-precision floating-point format.  The conversion is performed
1142 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1143 -------------------------------------------------------------------------------
1144 */
1145 float64 int32_to_float64( int32 a )
1146 {
1147     flag zSign;
1148     uint32 absA;
1149     int8 shiftCount;
1150     bits64 zSig;
1151 
1152     if ( a == 0 ) return 0;
1153     zSign = ( a < 0 );
1154     absA = zSign ? - a : a;
1155     shiftCount = countLeadingZeros32( absA ) + 21;
1156     zSig = absA;
1157     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1158 
1159 }
1160 
1161 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsidf is in libgcc */
1162 float64 uint32_to_float64( uint32 a )
1163 {
1164     int8 shiftCount;
1165     bits64 zSig = a;
1166 
1167     if ( a == 0 ) return 0;
1168     shiftCount = countLeadingZeros32( a ) + 21;
1169     return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1170 
1171 }
1172 #endif
1173 
1174 #ifdef FLOATX80
1175 
1176 /*
1177 -------------------------------------------------------------------------------
1178 Returns the result of converting the 32-bit two's complement integer `a'
1179 to the extended double-precision floating-point format.  The conversion
1180 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1181 Arithmetic.
1182 -------------------------------------------------------------------------------
1183 */
1184 floatx80 int32_to_floatx80( int32 a )
1185 {
1186     flag zSign;
1187     uint32 absA;
1188     int8 shiftCount;
1189     bits64 zSig;
1190 
1191     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1192     zSign = ( a < 0 );
1193     absA = zSign ? - a : a;
1194     shiftCount = countLeadingZeros32( absA ) + 32;
1195     zSig = absA;
1196     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1197 
1198 }
1199 
1200 floatx80 uint32_to_floatx80( uint32 a )
1201 {
1202     int8 shiftCount;
1203     bits64 zSig = a;
1204 
1205     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1206     shiftCount = countLeadingZeros32( a ) + 32;
1207     return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1208 
1209 }
1210 
1211 #endif
1212 
1213 #ifdef FLOAT128
1214 
1215 /*
1216 -------------------------------------------------------------------------------
1217 Returns the result of converting the 32-bit two's complement integer `a' to
1218 the quadruple-precision floating-point format.  The conversion is performed
1219 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1220 -------------------------------------------------------------------------------
1221 */
1222 float128 int32_to_float128( int32 a )
1223 {
1224     flag zSign;
1225     uint32 absA;
1226     int8 shiftCount;
1227     bits64 zSig0;
1228 
1229     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1230     zSign = ( a < 0 );
1231     absA = zSign ? - a : a;
1232     shiftCount = countLeadingZeros32( absA ) + 17;
1233     zSig0 = absA;
1234     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1235 
1236 }
1237 
1238 float128 uint32_to_float128( uint32 a )
1239 {
1240     int8 shiftCount;
1241     bits64 zSig0 = a;
1242 
1243     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1244     shiftCount = countLeadingZeros32( a ) + 17;
1245     return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1246 
1247 }
1248 
1249 #endif
1250 
1251 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1252 /*
1253 -------------------------------------------------------------------------------
1254 Returns the result of converting the 64-bit two's complement integer `a'
1255 to the single-precision floating-point format.  The conversion is performed
1256 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257 -------------------------------------------------------------------------------
1258 */
1259 float32 int64_to_float32( int64 a )
1260 {
1261     flag zSign;
1262     uint64 absA;
1263     int8 shiftCount;
1264 
1265     if ( a == 0 ) return 0;
1266     zSign = ( a < 0 );
1267     absA = zSign ? - a : a;
1268     shiftCount = countLeadingZeros64( absA ) - 40;
1269     if ( 0 <= shiftCount ) {
1270         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1271     }
1272     else {
1273         shiftCount += 7;
1274         if ( shiftCount < 0 ) {
1275             shift64RightJamming( absA, - shiftCount, &absA );
1276         }
1277         else {
1278             absA <<= shiftCount;
1279         }
1280         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1281     }
1282 
1283 }
1284 
1285 /*
1286 -------------------------------------------------------------------------------
1287 Returns the result of converting the 64-bit two's complement integer `a'
1288 to the double-precision floating-point format.  The conversion is performed
1289 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1290 -------------------------------------------------------------------------------
1291 */
1292 float64 int64_to_float64( int64 a )
1293 {
1294     flag zSign;
1295 
1296     if ( a == 0 ) return 0;
1297     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1298         return packFloat64( 1, 0x43E, 0 );
1299     }
1300     zSign = ( a < 0 );
1301     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1302 
1303 }
1304 
1305 #ifdef FLOATX80
1306 
1307 /*
1308 -------------------------------------------------------------------------------
1309 Returns the result of converting the 64-bit two's complement integer `a'
1310 to the extended double-precision floating-point format.  The conversion
1311 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1312 Arithmetic.
1313 -------------------------------------------------------------------------------
1314 */
1315 floatx80 int64_to_floatx80( int64 a )
1316 {
1317     flag zSign;
1318     uint64 absA;
1319     int8 shiftCount;
1320 
1321     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1322     zSign = ( a < 0 );
1323     absA = zSign ? - a : a;
1324     shiftCount = countLeadingZeros64( absA );
1325     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1326 
1327 }
1328 
1329 #endif
1330 
1331 #endif /* !SOFTFLOAT_FOR_GCC */
1332 
1333 #ifdef FLOAT128
1334 
1335 /*
1336 -------------------------------------------------------------------------------
1337 Returns the result of converting the 64-bit two's complement integer `a' to
1338 the quadruple-precision floating-point format.  The conversion is performed
1339 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1340 -------------------------------------------------------------------------------
1341 */
1342 float128 int64_to_float128( int64 a )
1343 {
1344     flag zSign;
1345     uint64 absA;
1346     int8 shiftCount;
1347     int32 zExp;
1348     bits64 zSig0, zSig1;
1349 
1350     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1351     zSign = ( a < 0 );
1352     absA = zSign ? - a : a;
1353     shiftCount = countLeadingZeros64( absA ) + 49;
1354     zExp = 0x406E - shiftCount;
1355     if ( 64 <= shiftCount ) {
1356         zSig1 = 0;
1357         zSig0 = absA;
1358         shiftCount -= 64;
1359     }
1360     else {
1361         zSig1 = absA;
1362         zSig0 = 0;
1363     }
1364     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1365     return packFloat128( zSign, zExp, zSig0, zSig1 );
1366 
1367 }
1368 
1369 #endif
1370 
1371 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1372 /*
1373 -------------------------------------------------------------------------------
1374 Returns the result of converting the single-precision floating-point value
1375 `a' to the 32-bit two's complement integer format.  The conversion is
1376 performed according to the IEC/IEEE Standard for Binary Floating-Point
1377 Arithmetic---which means in particular that the conversion is rounded
1378 according to the current rounding mode.  If `a' is a NaN, the largest
1379 positive integer is returned.  Otherwise, if the conversion overflows, the
1380 largest integer with the same sign as `a' is returned.
1381 -------------------------------------------------------------------------------
1382 */
1383 int32 float32_to_int32( float32 a )
1384 {
1385     flag aSign;
1386     int16 aExp, shiftCount;
1387     bits32 aSig;
1388     bits64 aSig64;
1389 
1390     aSig = extractFloat32Frac( a );
1391     aExp = extractFloat32Exp( a );
1392     aSign = extractFloat32Sign( a );
1393     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1394     if ( aExp ) aSig |= 0x00800000;
1395     shiftCount = 0xAF - aExp;
1396     aSig64 = aSig;
1397     aSig64 <<= 32;
1398     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1399     return roundAndPackInt32( aSign, aSig64 );
1400 
1401 }
1402 #endif /* !SOFTFLOAT_FOR_GCC */
1403 
1404 /*
1405 -------------------------------------------------------------------------------
1406 Returns the result of converting the single-precision floating-point value
1407 `a' to the 32-bit two's complement integer format.  The conversion is
1408 performed according to the IEC/IEEE Standard for Binary Floating-Point
1409 Arithmetic, except that the conversion is always rounded toward zero.
1410 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1411 the conversion overflows, the largest integer with the same sign as `a' is
1412 returned.
1413 -------------------------------------------------------------------------------
1414 */
1415 int32 float32_to_int32_round_to_zero( float32 a )
1416 {
1417     flag aSign;
1418     int16 aExp, shiftCount;
1419     bits32 aSig;
1420     int32 z;
1421 
1422     aSig = extractFloat32Frac( a );
1423     aExp = extractFloat32Exp( a );
1424     aSign = extractFloat32Sign( a );
1425     shiftCount = aExp - 0x9E;
1426     if ( 0 <= shiftCount ) {
1427         if ( a != 0xCF000000 ) {
1428             float_raise( float_flag_invalid );
1429             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1430         }
1431         return (sbits32) 0x80000000;
1432     }
1433     else if ( aExp <= 0x7E ) {
1434         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1435         return 0;
1436     }
1437     aSig = ( aSig | 0x00800000 )<<8;
1438     z = aSig>>( - shiftCount );
1439     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1440         float_exception_flags |= float_flag_inexact;
1441     }
1442     if ( aSign ) z = - z;
1443     return z;
1444 
1445 }
1446 
1447 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1448 /*
1449 -------------------------------------------------------------------------------
1450 Returns the result of converting the single-precision floating-point value
1451 `a' to the 64-bit two's complement integer format.  The conversion is
1452 performed according to the IEC/IEEE Standard for Binary Floating-Point
1453 Arithmetic---which means in particular that the conversion is rounded
1454 according to the current rounding mode.  If `a' is a NaN, the largest
1455 positive integer is returned.  Otherwise, if the conversion overflows, the
1456 largest integer with the same sign as `a' is returned.
1457 -------------------------------------------------------------------------------
1458 */
1459 int64 float32_to_int64( float32 a )
1460 {
1461     flag aSign;
1462     int16 aExp, shiftCount;
1463     bits32 aSig;
1464     bits64 aSig64, aSigExtra;
1465 
1466     aSig = extractFloat32Frac( a );
1467     aExp = extractFloat32Exp( a );
1468     aSign = extractFloat32Sign( a );
1469     shiftCount = 0xBE - aExp;
1470     if ( shiftCount < 0 ) {
1471         float_raise( float_flag_invalid );
1472         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1473             return LIT64( 0x7FFFFFFFFFFFFFFF );
1474         }
1475         return (sbits64) LIT64( 0x8000000000000000 );
1476     }
1477     if ( aExp ) aSig |= 0x00800000;
1478     aSig64 = aSig;
1479     aSig64 <<= 40;
1480     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1481     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1482 
1483 }
1484 
1485 /*
1486 -------------------------------------------------------------------------------
1487 Returns the result of converting the single-precision floating-point value
1488 `a' to the 64-bit two's complement integer format.  The conversion is
1489 performed according to the IEC/IEEE Standard for Binary Floating-Point
1490 Arithmetic, except that the conversion is always rounded toward zero.  If
1491 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1492 conversion overflows, the largest integer with the same sign as `a' is
1493 returned.
1494 -------------------------------------------------------------------------------
1495 */
1496 int64 float32_to_int64_round_to_zero( float32 a )
1497 {
1498     flag aSign;
1499     int16 aExp, shiftCount;
1500     bits32 aSig;
1501     bits64 aSig64;
1502     int64 z;
1503 
1504     aSig = extractFloat32Frac( a );
1505     aExp = extractFloat32Exp( a );
1506     aSign = extractFloat32Sign( a );
1507     shiftCount = aExp - 0xBE;
1508     if ( 0 <= shiftCount ) {
1509         if ( a != 0xDF000000 ) {
1510             float_raise( float_flag_invalid );
1511             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1512                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1513             }
1514         }
1515         return (sbits64) LIT64( 0x8000000000000000 );
1516     }
1517     else if ( aExp <= 0x7E ) {
1518         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1519         return 0;
1520     }
1521     aSig64 = aSig | 0x00800000;
1522     aSig64 <<= 40;
1523     z = aSig64>>( - shiftCount );
1524     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1525         float_exception_flags |= float_flag_inexact;
1526     }
1527     if ( aSign ) z = - z;
1528     return z;
1529 
1530 }
1531 #endif /* !SOFTFLOAT_FOR_GCC */
1532 
1533 /*
1534 -------------------------------------------------------------------------------
1535 Returns the result of converting the single-precision floating-point value
1536 `a' to the double-precision floating-point format.  The conversion is
1537 performed according to the IEC/IEEE Standard for Binary Floating-Point
1538 Arithmetic.
1539 -------------------------------------------------------------------------------
1540 */
1541 float64 float32_to_float64( float32 a )
1542 {
1543     flag aSign;
1544     int16 aExp;
1545     bits32 aSig;
1546 
1547     aSig = extractFloat32Frac( a );
1548     aExp = extractFloat32Exp( a );
1549     aSign = extractFloat32Sign( a );
1550     if ( aExp == 0xFF ) {
1551         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1552         return packFloat64( aSign, 0x7FF, 0 );
1553     }
1554     if ( aExp == 0 ) {
1555         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1556         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1557         --aExp;
1558     }
1559     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1560 
1561 }
1562 
1563 #ifdef FLOATX80
1564 
1565 /*
1566 -------------------------------------------------------------------------------
1567 Returns the result of converting the single-precision floating-point value
1568 `a' to the extended double-precision floating-point format.  The conversion
1569 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1570 Arithmetic.
1571 -------------------------------------------------------------------------------
1572 */
1573 floatx80 float32_to_floatx80( float32 a )
1574 {
1575     flag aSign;
1576     int16 aExp;
1577     bits32 aSig;
1578 
1579     aSig = extractFloat32Frac( a );
1580     aExp = extractFloat32Exp( a );
1581     aSign = extractFloat32Sign( a );
1582     if ( aExp == 0xFF ) {
1583         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1584         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1585     }
1586     if ( aExp == 0 ) {
1587         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1588         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1589     }
1590     aSig |= 0x00800000;
1591     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1592 
1593 }
1594 
1595 #endif
1596 
1597 #ifdef FLOAT128
1598 
1599 /*
1600 -------------------------------------------------------------------------------
1601 Returns the result of converting the single-precision floating-point value
1602 `a' to the double-precision floating-point format.  The conversion is
1603 performed according to the IEC/IEEE Standard for Binary Floating-Point
1604 Arithmetic.
1605 -------------------------------------------------------------------------------
1606 */
1607 float128 float32_to_float128( float32 a )
1608 {
1609     flag aSign;
1610     int16 aExp;
1611     bits32 aSig;
1612 
1613     aSig = extractFloat32Frac( a );
1614     aExp = extractFloat32Exp( a );
1615     aSign = extractFloat32Sign( a );
1616     if ( aExp == 0xFF ) {
1617         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1618         return packFloat128( aSign, 0x7FFF, 0, 0 );
1619     }
1620     if ( aExp == 0 ) {
1621         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1622         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1623         --aExp;
1624     }
1625     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1626 
1627 }
1628 
1629 #endif
1630 
1631 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1632 /*
1633 -------------------------------------------------------------------------------
1634 Rounds the single-precision floating-point value `a' to an integer, and
1635 returns the result as a single-precision floating-point value.  The
1636 operation is performed according to the IEC/IEEE Standard for Binary
1637 Floating-Point Arithmetic.
1638 -------------------------------------------------------------------------------
1639 */
1640 float32 float32_round_to_int( float32 a )
1641 {
1642     flag aSign;
1643     int16 aExp;
1644     bits32 lastBitMask, roundBitsMask;
1645     int8 roundingMode;
1646     float32 z;
1647 
1648     aExp = extractFloat32Exp( a );
1649     if ( 0x96 <= aExp ) {
1650         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1651             return propagateFloat32NaN( a, a );
1652         }
1653         return a;
1654     }
1655     if ( aExp <= 0x7E ) {
1656         if ( (bits32) ( a<<1 ) == 0 ) return a;
1657         float_exception_flags |= float_flag_inexact;
1658         aSign = extractFloat32Sign( a );
1659         switch ( float_rounding_mode ) {
1660          case float_round_nearest_even:
1661             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1662                 return packFloat32( aSign, 0x7F, 0 );
1663             }
1664             break;
1665 	 case float_round_to_zero:
1666 	    break;
1667          case float_round_down:
1668             return aSign ? 0xBF800000 : 0;
1669          case float_round_up:
1670             return aSign ? 0x80000000 : 0x3F800000;
1671         }
1672         return packFloat32( aSign, 0, 0 );
1673     }
1674     lastBitMask = 1;
1675     lastBitMask <<= 0x96 - aExp;
1676     roundBitsMask = lastBitMask - 1;
1677     z = a;
1678     roundingMode = float_rounding_mode;
1679     if ( roundingMode == float_round_nearest_even ) {
1680         z += lastBitMask>>1;
1681         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1682     }
1683     else if ( roundingMode != float_round_to_zero ) {
1684         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1685             z += roundBitsMask;
1686         }
1687     }
1688     z &= ~ roundBitsMask;
1689     if ( z != a ) float_exception_flags |= float_flag_inexact;
1690     return z;
1691 
1692 }
1693 #endif /* !SOFTFLOAT_FOR_GCC */
1694 
1695 /*
1696 -------------------------------------------------------------------------------
1697 Returns the result of adding the absolute values of the single-precision
1698 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1699 before being returned.  `zSign' is ignored if the result is a NaN.
1700 The addition is performed according to the IEC/IEEE Standard for Binary
1701 Floating-Point Arithmetic.
1702 -------------------------------------------------------------------------------
1703 */
1704 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1705 {
1706     int16 aExp, bExp, zExp;
1707     bits32 aSig, bSig, zSig;
1708     int16 expDiff;
1709 
1710     aSig = extractFloat32Frac( a );
1711     aExp = extractFloat32Exp( a );
1712     bSig = extractFloat32Frac( b );
1713     bExp = extractFloat32Exp( b );
1714     expDiff = aExp - bExp;
1715     aSig <<= 6;
1716     bSig <<= 6;
1717     if ( 0 < expDiff ) {
1718         if ( aExp == 0xFF ) {
1719             if ( aSig ) return propagateFloat32NaN( a, b );
1720             return a;
1721         }
1722         if ( bExp == 0 ) {
1723             --expDiff;
1724         }
1725         else {
1726             bSig |= 0x20000000;
1727         }
1728         shift32RightJamming( bSig, expDiff, &bSig );
1729         zExp = aExp;
1730     }
1731     else if ( expDiff < 0 ) {
1732         if ( bExp == 0xFF ) {
1733             if ( bSig ) return propagateFloat32NaN( a, b );
1734             return packFloat32( zSign, 0xFF, 0 );
1735         }
1736         if ( aExp == 0 ) {
1737             ++expDiff;
1738         }
1739         else {
1740             aSig |= 0x20000000;
1741         }
1742         shift32RightJamming( aSig, - expDiff, &aSig );
1743         zExp = bExp;
1744     }
1745     else {
1746         if ( aExp == 0xFF ) {
1747             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1748             return a;
1749         }
1750         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1751         zSig = 0x40000000 + aSig + bSig;
1752         zExp = aExp;
1753         goto roundAndPack;
1754     }
1755     aSig |= 0x20000000;
1756     zSig = ( aSig + bSig )<<1;
1757     --zExp;
1758     if ( (sbits32) zSig < 0 ) {
1759         zSig = aSig + bSig;
1760         ++zExp;
1761     }
1762  roundAndPack:
1763     return roundAndPackFloat32( zSign, zExp, zSig );
1764 
1765 }
1766 
1767 /*
1768 -------------------------------------------------------------------------------
1769 Returns the result of subtracting the absolute values of the single-
1770 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1771 difference is negated before being returned.  `zSign' is ignored if the
1772 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1773 Standard for Binary Floating-Point Arithmetic.
1774 -------------------------------------------------------------------------------
1775 */
1776 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1777 {
1778     int16 aExp, bExp, zExp;
1779     bits32 aSig, bSig, zSig;
1780     int16 expDiff;
1781 
1782     aSig = extractFloat32Frac( a );
1783     aExp = extractFloat32Exp( a );
1784     bSig = extractFloat32Frac( b );
1785     bExp = extractFloat32Exp( b );
1786     expDiff = aExp - bExp;
1787     aSig <<= 7;
1788     bSig <<= 7;
1789     if ( 0 < expDiff ) goto aExpBigger;
1790     if ( expDiff < 0 ) goto bExpBigger;
1791     if ( aExp == 0xFF ) {
1792         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1793         float_raise( float_flag_invalid );
1794         return float32_default_nan;
1795     }
1796     if ( aExp == 0 ) {
1797         aExp = 1;
1798         bExp = 1;
1799     }
1800     if ( bSig < aSig ) goto aBigger;
1801     if ( aSig < bSig ) goto bBigger;
1802     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1803  bExpBigger:
1804     if ( bExp == 0xFF ) {
1805         if ( bSig ) return propagateFloat32NaN( a, b );
1806         return packFloat32( zSign ^ 1, 0xFF, 0 );
1807     }
1808     if ( aExp == 0 ) {
1809         ++expDiff;
1810     }
1811     else {
1812         aSig |= 0x40000000;
1813     }
1814     shift32RightJamming( aSig, - expDiff, &aSig );
1815     bSig |= 0x40000000;
1816  bBigger:
1817     zSig = bSig - aSig;
1818     zExp = bExp;
1819     zSign ^= 1;
1820     goto normalizeRoundAndPack;
1821  aExpBigger:
1822     if ( aExp == 0xFF ) {
1823         if ( aSig ) return propagateFloat32NaN( a, b );
1824         return a;
1825     }
1826     if ( bExp == 0 ) {
1827         --expDiff;
1828     }
1829     else {
1830         bSig |= 0x40000000;
1831     }
1832     shift32RightJamming( bSig, expDiff, &bSig );
1833     aSig |= 0x40000000;
1834  aBigger:
1835     zSig = aSig - bSig;
1836     zExp = aExp;
1837  normalizeRoundAndPack:
1838     --zExp;
1839     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1840 
1841 }
1842 
1843 /*
1844 -------------------------------------------------------------------------------
1845 Returns the result of adding the single-precision floating-point values `a'
1846 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1847 Binary Floating-Point Arithmetic.
1848 -------------------------------------------------------------------------------
1849 */
1850 float32 float32_add( float32 a, float32 b )
1851 {
1852     flag aSign, bSign;
1853 
1854     aSign = extractFloat32Sign( a );
1855     bSign = extractFloat32Sign( b );
1856     if ( aSign == bSign ) {
1857         return addFloat32Sigs( a, b, aSign );
1858     }
1859     else {
1860         return subFloat32Sigs( a, b, aSign );
1861     }
1862 
1863 }
1864 
1865 /*
1866 -------------------------------------------------------------------------------
1867 Returns the result of subtracting the single-precision floating-point values
1868 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1869 for Binary Floating-Point Arithmetic.
1870 -------------------------------------------------------------------------------
1871 */
1872 float32 float32_sub( float32 a, float32 b )
1873 {
1874     flag aSign, bSign;
1875 
1876     aSign = extractFloat32Sign( a );
1877     bSign = extractFloat32Sign( b );
1878     if ( aSign == bSign ) {
1879         return subFloat32Sigs( a, b, aSign );
1880     }
1881     else {
1882         return addFloat32Sigs( a, b, aSign );
1883     }
1884 
1885 }
1886 
1887 /*
1888 -------------------------------------------------------------------------------
1889 Returns the result of multiplying the single-precision floating-point values
1890 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1891 for Binary Floating-Point Arithmetic.
1892 -------------------------------------------------------------------------------
1893 */
1894 float32 float32_mul( float32 a, float32 b )
1895 {
1896     flag aSign, bSign, zSign;
1897     int16 aExp, bExp, zExp;
1898     bits32 aSig, bSig;
1899     bits64 zSig64;
1900     bits32 zSig;
1901 
1902     aSig = extractFloat32Frac( a );
1903     aExp = extractFloat32Exp( a );
1904     aSign = extractFloat32Sign( a );
1905     bSig = extractFloat32Frac( b );
1906     bExp = extractFloat32Exp( b );
1907     bSign = extractFloat32Sign( b );
1908     zSign = aSign ^ bSign;
1909     if ( aExp == 0xFF ) {
1910         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1911             return propagateFloat32NaN( a, b );
1912         }
1913         if ( ( bExp | bSig ) == 0 ) {
1914             float_raise( float_flag_invalid );
1915             return float32_default_nan;
1916         }
1917         return packFloat32( zSign, 0xFF, 0 );
1918     }
1919     if ( bExp == 0xFF ) {
1920         if ( bSig ) return propagateFloat32NaN( a, b );
1921         if ( ( aExp | aSig ) == 0 ) {
1922             float_raise( float_flag_invalid );
1923             return float32_default_nan;
1924         }
1925         return packFloat32( zSign, 0xFF, 0 );
1926     }
1927     if ( aExp == 0 ) {
1928         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1929         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1930     }
1931     if ( bExp == 0 ) {
1932         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1933         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1934     }
1935     zExp = aExp + bExp - 0x7F;
1936     aSig = ( aSig | 0x00800000 )<<7;
1937     bSig = ( bSig | 0x00800000 )<<8;
1938     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1939     zSig = zSig64;
1940     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1941         zSig <<= 1;
1942         --zExp;
1943     }
1944     return roundAndPackFloat32( zSign, zExp, zSig );
1945 
1946 }
1947 
1948 /*
1949 -------------------------------------------------------------------------------
1950 Returns the result of dividing the single-precision floating-point value `a'
1951 by the corresponding value `b'.  The operation is performed according to the
1952 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1953 -------------------------------------------------------------------------------
1954 */
1955 float32 float32_div( float32 a, float32 b )
1956 {
1957     flag aSign, bSign, zSign;
1958     int16 aExp, bExp, zExp;
1959     bits32 aSig, bSig, zSig;
1960 
1961     aSig = extractFloat32Frac( a );
1962     aExp = extractFloat32Exp( a );
1963     aSign = extractFloat32Sign( a );
1964     bSig = extractFloat32Frac( b );
1965     bExp = extractFloat32Exp( b );
1966     bSign = extractFloat32Sign( b );
1967     zSign = aSign ^ bSign;
1968     if ( aExp == 0xFF ) {
1969         if ( aSig ) return propagateFloat32NaN( a, b );
1970         if ( bExp == 0xFF ) {
1971             if ( bSig ) return propagateFloat32NaN( a, b );
1972             float_raise( float_flag_invalid );
1973             return float32_default_nan;
1974         }
1975         return packFloat32( zSign, 0xFF, 0 );
1976     }
1977     if ( bExp == 0xFF ) {
1978         if ( bSig ) return propagateFloat32NaN( a, b );
1979         return packFloat32( zSign, 0, 0 );
1980     }
1981     if ( bExp == 0 ) {
1982         if ( bSig == 0 ) {
1983             if ( ( aExp | aSig ) == 0 ) {
1984                 float_raise( float_flag_invalid );
1985                 return float32_default_nan;
1986             }
1987             float_raise( float_flag_divbyzero );
1988             return packFloat32( zSign, 0xFF, 0 );
1989         }
1990         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1991     }
1992     if ( aExp == 0 ) {
1993         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1994         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1995     }
1996     zExp = aExp - bExp + 0x7D;
1997     aSig = ( aSig | 0x00800000 )<<7;
1998     bSig = ( bSig | 0x00800000 )<<8;
1999     if ( bSig <= ( aSig + aSig ) ) {
2000         aSig >>= 1;
2001         ++zExp;
2002     }
2003     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2004     if ( ( zSig & 0x3F ) == 0 ) {
2005         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2006     }
2007     return roundAndPackFloat32( zSign, zExp, zSig );
2008 
2009 }
2010 
2011 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2012 /*
2013 -------------------------------------------------------------------------------
2014 Returns the remainder of the single-precision floating-point value `a'
2015 with respect to the corresponding value `b'.  The operation is performed
2016 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2017 -------------------------------------------------------------------------------
2018 */
2019 float32 float32_rem( float32 a, float32 b )
2020 {
2021     flag aSign, bSign, zSign;
2022     int16 aExp, bExp, expDiff;
2023     bits32 aSig, bSig;
2024     bits32 q;
2025     bits64 aSig64, bSig64, q64;
2026     bits32 alternateASig;
2027     sbits32 sigMean;
2028 
2029     aSig = extractFloat32Frac( a );
2030     aExp = extractFloat32Exp( a );
2031     aSign = extractFloat32Sign( a );
2032     bSig = extractFloat32Frac( b );
2033     bExp = extractFloat32Exp( b );
2034     bSign = extractFloat32Sign( b );
2035     if ( aExp == 0xFF ) {
2036         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2037             return propagateFloat32NaN( a, b );
2038         }
2039         float_raise( float_flag_invalid );
2040         return float32_default_nan;
2041     }
2042     if ( bExp == 0xFF ) {
2043         if ( bSig ) return propagateFloat32NaN( a, b );
2044         return a;
2045     }
2046     if ( bExp == 0 ) {
2047         if ( bSig == 0 ) {
2048             float_raise( float_flag_invalid );
2049             return float32_default_nan;
2050         }
2051         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2052     }
2053     if ( aExp == 0 ) {
2054         if ( aSig == 0 ) return a;
2055         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2056     }
2057     expDiff = aExp - bExp;
2058     aSig |= 0x00800000;
2059     bSig |= 0x00800000;
2060     if ( expDiff < 32 ) {
2061         aSig <<= 8;
2062         bSig <<= 8;
2063         if ( expDiff < 0 ) {
2064             if ( expDiff < -1 ) return a;
2065             aSig >>= 1;
2066         }
2067         q = ( bSig <= aSig );
2068         if ( q ) aSig -= bSig;
2069         if ( 0 < expDiff ) {
2070             q = ( ( (bits64) aSig )<<32 ) / bSig;
2071             q >>= 32 - expDiff;
2072             bSig >>= 2;
2073             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2074         }
2075         else {
2076             aSig >>= 2;
2077             bSig >>= 2;
2078         }
2079     }
2080     else {
2081         if ( bSig <= aSig ) aSig -= bSig;
2082         aSig64 = ( (bits64) aSig )<<40;
2083         bSig64 = ( (bits64) bSig )<<40;
2084         expDiff -= 64;
2085         while ( 0 < expDiff ) {
2086             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2087             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2088             aSig64 = - ( ( bSig * q64 )<<38 );
2089             expDiff -= 62;
2090         }
2091         expDiff += 64;
2092         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2093         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2094         q = q64>>( 64 - expDiff );
2095         bSig <<= 6;
2096         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2097     }
2098     do {
2099         alternateASig = aSig;
2100         ++q;
2101         aSig -= bSig;
2102     } while ( 0 <= (sbits32) aSig );
2103     sigMean = aSig + alternateASig;
2104     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2105         aSig = alternateASig;
2106     }
2107     zSign = ( (sbits32) aSig < 0 );
2108     if ( zSign ) aSig = - aSig;
2109     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2110 
2111 }
2112 #endif /* !SOFTFLOAT_FOR_GCC */
2113 
2114 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2115 /*
2116 -------------------------------------------------------------------------------
2117 Returns the square root of the single-precision floating-point value `a'.
2118 The operation is performed according to the IEC/IEEE Standard for Binary
2119 Floating-Point Arithmetic.
2120 -------------------------------------------------------------------------------
2121 */
2122 float32 float32_sqrt( float32 a )
2123 {
2124     flag aSign;
2125     int16 aExp, zExp;
2126     bits32 aSig, zSig;
2127     bits64 rem, term;
2128 
2129     aSig = extractFloat32Frac( a );
2130     aExp = extractFloat32Exp( a );
2131     aSign = extractFloat32Sign( a );
2132     if ( aExp == 0xFF ) {
2133         if ( aSig ) return propagateFloat32NaN( a, 0 );
2134         if ( ! aSign ) return a;
2135         float_raise( float_flag_invalid );
2136         return float32_default_nan;
2137     }
2138     if ( aSign ) {
2139         if ( ( aExp | aSig ) == 0 ) return a;
2140         float_raise( float_flag_invalid );
2141         return float32_default_nan;
2142     }
2143     if ( aExp == 0 ) {
2144         if ( aSig == 0 ) return 0;
2145         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2146     }
2147     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2148     aSig = ( aSig | 0x00800000 )<<8;
2149     zSig = estimateSqrt32( aExp, aSig ) + 2;
2150     if ( ( zSig & 0x7F ) <= 5 ) {
2151         if ( zSig < 2 ) {
2152             zSig = 0x7FFFFFFF;
2153             goto roundAndPack;
2154         }
2155         aSig >>= aExp & 1;
2156         term = ( (bits64) zSig ) * zSig;
2157         rem = ( ( (bits64) aSig )<<32 ) - term;
2158         while ( (sbits64) rem < 0 ) {
2159             --zSig;
2160             rem += ( ( (bits64) zSig )<<1 ) | 1;
2161         }
2162         zSig |= ( rem != 0 );
2163     }
2164     shift32RightJamming( zSig, 1, &zSig );
2165  roundAndPack:
2166     return roundAndPackFloat32( 0, zExp, zSig );
2167 
2168 }
2169 #endif /* !SOFTFLOAT_FOR_GCC */
2170 
2171 /*
2172 -------------------------------------------------------------------------------
2173 Returns 1 if the single-precision floating-point value `a' is equal to
2174 the corresponding value `b', and 0 otherwise.  The comparison is performed
2175 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2176 -------------------------------------------------------------------------------
2177 */
2178 flag float32_eq( float32 a, float32 b )
2179 {
2180 
2181     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2182          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2183        ) {
2184         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2185             float_raise( float_flag_invalid );
2186         }
2187         return 0;
2188     }
2189     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2190 
2191 }
2192 
2193 /*
2194 -------------------------------------------------------------------------------
2195 Returns 1 if the single-precision floating-point value `a' is less than
2196 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2197 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2198 Arithmetic.
2199 -------------------------------------------------------------------------------
2200 */
2201 flag float32_le( float32 a, float32 b )
2202 {
2203     flag aSign, bSign;
2204 
2205     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2206          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2207        ) {
2208         float_raise( float_flag_invalid );
2209         return 0;
2210     }
2211     aSign = extractFloat32Sign( a );
2212     bSign = extractFloat32Sign( b );
2213     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2214     return ( a == b ) || ( aSign ^ ( a < b ) );
2215 
2216 }
2217 
2218 /*
2219 -------------------------------------------------------------------------------
2220 Returns 1 if the single-precision floating-point value `a' is less than
2221 the corresponding value `b', and 0 otherwise.  The comparison is performed
2222 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2223 -------------------------------------------------------------------------------
2224 */
2225 flag float32_lt( float32 a, float32 b )
2226 {
2227     flag aSign, bSign;
2228 
2229     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2230          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2231        ) {
2232         float_raise( float_flag_invalid );
2233         return 0;
2234     }
2235     aSign = extractFloat32Sign( a );
2236     bSign = extractFloat32Sign( b );
2237     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2238     return ( a != b ) && ( aSign ^ ( a < b ) );
2239 
2240 }
2241 
2242 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2243 /*
2244 -------------------------------------------------------------------------------
2245 Returns 1 if the single-precision floating-point value `a' is equal to
2246 the corresponding value `b', and 0 otherwise.  The invalid exception is
2247 raised if either operand is a NaN.  Otherwise, the comparison is performed
2248 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2249 -------------------------------------------------------------------------------
2250 */
2251 flag float32_eq_signaling( float32 a, float32 b )
2252 {
2253 
2254     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2255          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2256        ) {
2257         float_raise( float_flag_invalid );
2258         return 0;
2259     }
2260     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2261 
2262 }
2263 
2264 /*
2265 -------------------------------------------------------------------------------
2266 Returns 1 if the single-precision floating-point value `a' is less than or
2267 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2268 cause an exception.  Otherwise, the comparison is performed according to the
2269 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270 -------------------------------------------------------------------------------
2271 */
2272 flag float32_le_quiet( float32 a, float32 b )
2273 {
2274     flag aSign, bSign;
2275 
2276     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2277          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2278        ) {
2279         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2280             float_raise( float_flag_invalid );
2281         }
2282         return 0;
2283     }
2284     aSign = extractFloat32Sign( a );
2285     bSign = extractFloat32Sign( b );
2286     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2287     return ( a == b ) || ( aSign ^ ( a < b ) );
2288 
2289 }
2290 
2291 /*
2292 -------------------------------------------------------------------------------
2293 Returns 1 if the single-precision floating-point value `a' is less than
2294 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2295 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2296 Standard for Binary Floating-Point Arithmetic.
2297 -------------------------------------------------------------------------------
2298 */
2299 flag float32_lt_quiet( float32 a, float32 b )
2300 {
2301     flag aSign, bSign;
2302 
2303     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2304          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2305        ) {
2306         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2307             float_raise( float_flag_invalid );
2308         }
2309         return 0;
2310     }
2311     aSign = extractFloat32Sign( a );
2312     bSign = extractFloat32Sign( b );
2313     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2314     return ( a != b ) && ( aSign ^ ( a < b ) );
2315 
2316 }
2317 #endif /* !SOFTFLOAT_FOR_GCC */
2318 
2319 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2320 /*
2321 -------------------------------------------------------------------------------
2322 Returns the result of converting the double-precision floating-point value
2323 `a' to the 32-bit two's complement integer format.  The conversion is
2324 performed according to the IEC/IEEE Standard for Binary Floating-Point
2325 Arithmetic---which means in particular that the conversion is rounded
2326 according to the current rounding mode.  If `a' is a NaN, the largest
2327 positive integer is returned.  Otherwise, if the conversion overflows, the
2328 largest integer with the same sign as `a' is returned.
2329 -------------------------------------------------------------------------------
2330 */
2331 int32 float64_to_int32( float64 a )
2332 {
2333     flag aSign;
2334     int16 aExp, shiftCount;
2335     bits64 aSig;
2336 
2337     aSig = extractFloat64Frac( a );
2338     aExp = extractFloat64Exp( a );
2339     aSign = extractFloat64Sign( a );
2340     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2341     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2342     shiftCount = 0x42C - aExp;
2343     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2344     return roundAndPackInt32( aSign, aSig );
2345 
2346 }
2347 #endif /* !SOFTFLOAT_FOR_GCC */
2348 
2349 /*
2350 -------------------------------------------------------------------------------
2351 Returns the result of converting the double-precision floating-point value
2352 `a' to the 32-bit two's complement integer format.  The conversion is
2353 performed according to the IEC/IEEE Standard for Binary Floating-Point
2354 Arithmetic, except that the conversion is always rounded toward zero.
2355 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2356 the conversion overflows, the largest integer with the same sign as `a' is
2357 returned.
2358 -------------------------------------------------------------------------------
2359 */
2360 int32 float64_to_int32_round_to_zero( float64 a )
2361 {
2362     flag aSign;
2363     int16 aExp, shiftCount;
2364     bits64 aSig, savedASig;
2365     int32 z;
2366 
2367     aSig = extractFloat64Frac( a );
2368     aExp = extractFloat64Exp( a );
2369     aSign = extractFloat64Sign( a );
2370     if ( 0x41E < aExp ) {
2371         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2372         goto invalid;
2373     }
2374     else if ( aExp < 0x3FF ) {
2375         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2376         return 0;
2377     }
2378     aSig |= LIT64( 0x0010000000000000 );
2379     shiftCount = 0x433 - aExp;
2380     savedASig = aSig;
2381     aSig >>= shiftCount;
2382     z = aSig;
2383     if ( aSign ) z = - z;
2384     if ( ( z < 0 ) ^ aSign ) {
2385  invalid:
2386         float_raise( float_flag_invalid );
2387         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2388     }
2389     if ( ( aSig<<shiftCount ) != savedASig ) {
2390         float_exception_flags |= float_flag_inexact;
2391     }
2392     return z;
2393 
2394 }
2395 
2396 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2397 /*
2398 -------------------------------------------------------------------------------
2399 Returns the result of converting the double-precision floating-point value
2400 `a' to the 64-bit two's complement integer format.  The conversion is
2401 performed according to the IEC/IEEE Standard for Binary Floating-Point
2402 Arithmetic---which means in particular that the conversion is rounded
2403 according to the current rounding mode.  If `a' is a NaN, the largest
2404 positive integer is returned.  Otherwise, if the conversion overflows, the
2405 largest integer with the same sign as `a' is returned.
2406 -------------------------------------------------------------------------------
2407 */
2408 int64 float64_to_int64( float64 a )
2409 {
2410     flag aSign;
2411     int16 aExp, shiftCount;
2412     bits64 aSig, aSigExtra;
2413 
2414     aSig = extractFloat64Frac( a );
2415     aExp = extractFloat64Exp( a );
2416     aSign = extractFloat64Sign( a );
2417     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2418     shiftCount = 0x433 - aExp;
2419     if ( shiftCount <= 0 ) {
2420         if ( 0x43E < aExp ) {
2421             float_raise( float_flag_invalid );
2422             if (    ! aSign
2423                  || (    ( aExp == 0x7FF )
2424                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2425                ) {
2426                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2427             }
2428             return (sbits64) LIT64( 0x8000000000000000 );
2429         }
2430         aSigExtra = 0;
2431         aSig <<= - shiftCount;
2432     }
2433     else {
2434         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2435     }
2436     return roundAndPackInt64( aSign, aSig, aSigExtra );
2437 
2438 }
2439 
2440 /*
2441 -------------------------------------------------------------------------------
2442 Returns the result of converting the double-precision floating-point value
2443 `a' to the 64-bit two's complement integer format.  The conversion is
2444 performed according to the IEC/IEEE Standard for Binary Floating-Point
2445 Arithmetic, except that the conversion is always rounded toward zero.
2446 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2447 the conversion overflows, the largest integer with the same sign as `a' is
2448 returned.
2449 -------------------------------------------------------------------------------
2450 */
2451 int64 float64_to_int64_round_to_zero( float64 a )
2452 {
2453     flag aSign;
2454     int16 aExp, shiftCount;
2455     bits64 aSig;
2456     int64 z;
2457 
2458     aSig = extractFloat64Frac( a );
2459     aExp = extractFloat64Exp( a );
2460     aSign = extractFloat64Sign( a );
2461     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2462     shiftCount = aExp - 0x433;
2463     if ( 0 <= shiftCount ) {
2464         if ( 0x43E <= aExp ) {
2465             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2466                 float_raise( float_flag_invalid );
2467                 if (    ! aSign
2468                      || (    ( aExp == 0x7FF )
2469                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2470                    ) {
2471                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2472                 }
2473             }
2474             return (sbits64) LIT64( 0x8000000000000000 );
2475         }
2476         z = aSig<<shiftCount;
2477     }
2478     else {
2479         if ( aExp < 0x3FE ) {
2480             if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2481             return 0;
2482         }
2483         z = aSig>>( - shiftCount );
2484         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2485             float_exception_flags |= float_flag_inexact;
2486         }
2487     }
2488     if ( aSign ) z = - z;
2489     return z;
2490 
2491 }
2492 #endif /* !SOFTFLOAT_FOR_GCC */
2493 
2494 /*
2495 -------------------------------------------------------------------------------
2496 Returns the result of converting the double-precision floating-point value
2497 `a' to the single-precision floating-point format.  The conversion is
2498 performed according to the IEC/IEEE Standard for Binary Floating-Point
2499 Arithmetic.
2500 -------------------------------------------------------------------------------
2501 */
2502 float32 float64_to_float32( float64 a )
2503 {
2504     flag aSign;
2505     int16 aExp;
2506     bits64 aSig;
2507     bits32 zSig;
2508 
2509     aSig = extractFloat64Frac( a );
2510     aExp = extractFloat64Exp( a );
2511     aSign = extractFloat64Sign( a );
2512     if ( aExp == 0x7FF ) {
2513         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2514         return packFloat32( aSign, 0xFF, 0 );
2515     }
2516     shift64RightJamming( aSig, 22, &aSig );
2517     zSig = aSig;
2518     if ( aExp || zSig ) {
2519         zSig |= 0x40000000;
2520         aExp -= 0x381;
2521     }
2522     return roundAndPackFloat32( aSign, aExp, zSig );
2523 
2524 }
2525 
2526 #ifdef FLOATX80
2527 
2528 /*
2529 -------------------------------------------------------------------------------
2530 Returns the result of converting the double-precision floating-point value
2531 `a' to the extended double-precision floating-point format.  The conversion
2532 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2533 Arithmetic.
2534 -------------------------------------------------------------------------------
2535 */
2536 floatx80 float64_to_floatx80( float64 a )
2537 {
2538     flag aSign;
2539     int16 aExp;
2540     bits64 aSig;
2541 
2542     aSig = extractFloat64Frac( a );
2543     aExp = extractFloat64Exp( a );
2544     aSign = extractFloat64Sign( a );
2545     if ( aExp == 0x7FF ) {
2546         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2547         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2548     }
2549     if ( aExp == 0 ) {
2550         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2551         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2552     }
2553     return
2554         packFloatx80(
2555             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2556 
2557 }
2558 
2559 #endif
2560 
2561 #ifdef FLOAT128
2562 
2563 /*
2564 -------------------------------------------------------------------------------
2565 Returns the result of converting the double-precision floating-point value
2566 `a' to the quadruple-precision floating-point format.  The conversion is
2567 performed according to the IEC/IEEE Standard for Binary Floating-Point
2568 Arithmetic.
2569 -------------------------------------------------------------------------------
2570 */
2571 float128 float64_to_float128( float64 a )
2572 {
2573     flag aSign;
2574     int16 aExp;
2575     bits64 aSig, zSig0, zSig1;
2576 
2577     aSig = extractFloat64Frac( a );
2578     aExp = extractFloat64Exp( a );
2579     aSign = extractFloat64Sign( a );
2580     if ( aExp == 0x7FF ) {
2581         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2582         return packFloat128( aSign, 0x7FFF, 0, 0 );
2583     }
2584     if ( aExp == 0 ) {
2585         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2586         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2587         --aExp;
2588     }
2589     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2590     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2591 
2592 }
2593 
2594 #endif
2595 
2596 #ifndef SOFTFLOAT_FOR_GCC
2597 /*
2598 -------------------------------------------------------------------------------
2599 Rounds the double-precision floating-point value `a' to an integer, and
2600 returns the result as a double-precision floating-point value.  The
2601 operation is performed according to the IEC/IEEE Standard for Binary
2602 Floating-Point Arithmetic.
2603 -------------------------------------------------------------------------------
2604 */
2605 float64 float64_round_to_int( float64 a )
2606 {
2607     flag aSign;
2608     int16 aExp;
2609     bits64 lastBitMask, roundBitsMask;
2610     int8 roundingMode;
2611     float64 z;
2612 
2613     aExp = extractFloat64Exp( a );
2614     if ( 0x433 <= aExp ) {
2615         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2616             return propagateFloat64NaN( a, a );
2617         }
2618         return a;
2619     }
2620     if ( aExp < 0x3FF ) {
2621         if ( (bits64) ( a<<1 ) == 0 ) return a;
2622         float_exception_flags |= float_flag_inexact;
2623         aSign = extractFloat64Sign( a );
2624         switch ( float_rounding_mode ) {
2625          case float_round_nearest_even:
2626             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2627                 return packFloat64( aSign, 0x3FF, 0 );
2628             }
2629             break;
2630 	 case float_round_to_zero:
2631 	    break;
2632          case float_round_down:
2633             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2634          case float_round_up:
2635             return
2636             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2637         }
2638         return packFloat64( aSign, 0, 0 );
2639     }
2640     lastBitMask = 1;
2641     lastBitMask <<= 0x433 - aExp;
2642     roundBitsMask = lastBitMask - 1;
2643     z = a;
2644     roundingMode = float_rounding_mode;
2645     if ( roundingMode == float_round_nearest_even ) {
2646         z += lastBitMask>>1;
2647         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2648     }
2649     else if ( roundingMode != float_round_to_zero ) {
2650         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2651             z += roundBitsMask;
2652         }
2653     }
2654     z &= ~ roundBitsMask;
2655     if ( z != a ) float_exception_flags |= float_flag_inexact;
2656     return z;
2657 
2658 }
2659 #endif
2660 
2661 /*
2662 -------------------------------------------------------------------------------
2663 Returns the result of adding the absolute values of the double-precision
2664 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2665 before being returned.  `zSign' is ignored if the result is a NaN.
2666 The addition is performed according to the IEC/IEEE Standard for Binary
2667 Floating-Point Arithmetic.
2668 -------------------------------------------------------------------------------
2669 */
2670 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2671 {
2672     int16 aExp, bExp, zExp;
2673     bits64 aSig, bSig, zSig;
2674     int16 expDiff;
2675 
2676     aSig = extractFloat64Frac( a );
2677     aExp = extractFloat64Exp( a );
2678     bSig = extractFloat64Frac( b );
2679     bExp = extractFloat64Exp( b );
2680     expDiff = aExp - bExp;
2681     aSig <<= 9;
2682     bSig <<= 9;
2683     if ( 0 < expDiff ) {
2684         if ( aExp == 0x7FF ) {
2685             if ( aSig ) return propagateFloat64NaN( a, b );
2686             return a;
2687         }
2688         if ( bExp == 0 ) {
2689             --expDiff;
2690         }
2691         else {
2692             bSig |= LIT64( 0x2000000000000000 );
2693         }
2694         shift64RightJamming( bSig, expDiff, &bSig );
2695         zExp = aExp;
2696     }
2697     else if ( expDiff < 0 ) {
2698         if ( bExp == 0x7FF ) {
2699             if ( bSig ) return propagateFloat64NaN( a, b );
2700             return packFloat64( zSign, 0x7FF, 0 );
2701         }
2702         if ( aExp == 0 ) {
2703             ++expDiff;
2704         }
2705         else {
2706             aSig |= LIT64( 0x2000000000000000 );
2707         }
2708         shift64RightJamming( aSig, - expDiff, &aSig );
2709         zExp = bExp;
2710     }
2711     else {
2712         if ( aExp == 0x7FF ) {
2713             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2714             return a;
2715         }
2716         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2717         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2718         zExp = aExp;
2719         goto roundAndPack;
2720     }
2721     aSig |= LIT64( 0x2000000000000000 );
2722     zSig = ( aSig + bSig )<<1;
2723     --zExp;
2724     if ( (sbits64) zSig < 0 ) {
2725         zSig = aSig + bSig;
2726         ++zExp;
2727     }
2728  roundAndPack:
2729     return roundAndPackFloat64( zSign, zExp, zSig );
2730 
2731 }
2732 
2733 /*
2734 -------------------------------------------------------------------------------
2735 Returns the result of subtracting the absolute values of the double-
2736 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2737 difference is negated before being returned.  `zSign' is ignored if the
2738 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2739 Standard for Binary Floating-Point Arithmetic.
2740 -------------------------------------------------------------------------------
2741 */
2742 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2743 {
2744     int16 aExp, bExp, zExp;
2745     bits64 aSig, bSig, zSig;
2746     int16 expDiff;
2747 
2748     aSig = extractFloat64Frac( a );
2749     aExp = extractFloat64Exp( a );
2750     bSig = extractFloat64Frac( b );
2751     bExp = extractFloat64Exp( b );
2752     expDiff = aExp - bExp;
2753     aSig <<= 10;
2754     bSig <<= 10;
2755     if ( 0 < expDiff ) goto aExpBigger;
2756     if ( expDiff < 0 ) goto bExpBigger;
2757     if ( aExp == 0x7FF ) {
2758         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2759         float_raise( float_flag_invalid );
2760         return float64_default_nan;
2761     }
2762     if ( aExp == 0 ) {
2763         aExp = 1;
2764         bExp = 1;
2765     }
2766     if ( bSig < aSig ) goto aBigger;
2767     if ( aSig < bSig ) goto bBigger;
2768     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2769  bExpBigger:
2770     if ( bExp == 0x7FF ) {
2771         if ( bSig ) return propagateFloat64NaN( a, b );
2772         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2773     }
2774     if ( aExp == 0 ) {
2775         ++expDiff;
2776     }
2777     else {
2778         aSig |= LIT64( 0x4000000000000000 );
2779     }
2780     shift64RightJamming( aSig, - expDiff, &aSig );
2781     bSig |= LIT64( 0x4000000000000000 );
2782  bBigger:
2783     zSig = bSig - aSig;
2784     zExp = bExp;
2785     zSign ^= 1;
2786     goto normalizeRoundAndPack;
2787  aExpBigger:
2788     if ( aExp == 0x7FF ) {
2789         if ( aSig ) return propagateFloat64NaN( a, b );
2790         return a;
2791     }
2792     if ( bExp == 0 ) {
2793         --expDiff;
2794     }
2795     else {
2796         bSig |= LIT64( 0x4000000000000000 );
2797     }
2798     shift64RightJamming( bSig, expDiff, &bSig );
2799     aSig |= LIT64( 0x4000000000000000 );
2800  aBigger:
2801     zSig = aSig - bSig;
2802     zExp = aExp;
2803  normalizeRoundAndPack:
2804     --zExp;
2805     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2806 
2807 }
2808 
2809 /*
2810 -------------------------------------------------------------------------------
2811 Returns the result of adding the double-precision floating-point values `a'
2812 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2813 Binary Floating-Point Arithmetic.
2814 -------------------------------------------------------------------------------
2815 */
2816 float64 float64_add( float64 a, float64 b )
2817 {
2818     flag aSign, bSign;
2819 
2820     aSign = extractFloat64Sign( a );
2821     bSign = extractFloat64Sign( b );
2822     if ( aSign == bSign ) {
2823         return addFloat64Sigs( a, b, aSign );
2824     }
2825     else {
2826         return subFloat64Sigs( a, b, aSign );
2827     }
2828 
2829 }
2830 
2831 /*
2832 -------------------------------------------------------------------------------
2833 Returns the result of subtracting the double-precision floating-point values
2834 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2835 for Binary Floating-Point Arithmetic.
2836 -------------------------------------------------------------------------------
2837 */
2838 float64 float64_sub( float64 a, float64 b )
2839 {
2840     flag aSign, bSign;
2841 
2842     aSign = extractFloat64Sign( a );
2843     bSign = extractFloat64Sign( b );
2844     if ( aSign == bSign ) {
2845         return subFloat64Sigs( a, b, aSign );
2846     }
2847     else {
2848         return addFloat64Sigs( a, b, aSign );
2849     }
2850 
2851 }
2852 
2853 /*
2854 -------------------------------------------------------------------------------
2855 Returns the result of multiplying the double-precision floating-point values
2856 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2857 for Binary Floating-Point Arithmetic.
2858 -------------------------------------------------------------------------------
2859 */
2860 float64 float64_mul( float64 a, float64 b )
2861 {
2862     flag aSign, bSign, zSign;
2863     int16 aExp, bExp, zExp;
2864     bits64 aSig, bSig, zSig0, zSig1;
2865 
2866     aSig = extractFloat64Frac( a );
2867     aExp = extractFloat64Exp( a );
2868     aSign = extractFloat64Sign( a );
2869     bSig = extractFloat64Frac( b );
2870     bExp = extractFloat64Exp( b );
2871     bSign = extractFloat64Sign( b );
2872     zSign = aSign ^ bSign;
2873     if ( aExp == 0x7FF ) {
2874         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2875             return propagateFloat64NaN( a, b );
2876         }
2877         if ( ( bExp | bSig ) == 0 ) {
2878             float_raise( float_flag_invalid );
2879             return float64_default_nan;
2880         }
2881         return packFloat64( zSign, 0x7FF, 0 );
2882     }
2883     if ( bExp == 0x7FF ) {
2884         if ( bSig ) return propagateFloat64NaN( a, b );
2885         if ( ( aExp | aSig ) == 0 ) {
2886             float_raise( float_flag_invalid );
2887             return float64_default_nan;
2888         }
2889         return packFloat64( zSign, 0x7FF, 0 );
2890     }
2891     if ( aExp == 0 ) {
2892         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2893         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2894     }
2895     if ( bExp == 0 ) {
2896         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2897         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2898     }
2899     zExp = aExp + bExp - 0x3FF;
2900     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2901     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2902     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2903     zSig0 |= ( zSig1 != 0 );
2904     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2905         zSig0 <<= 1;
2906         --zExp;
2907     }
2908     return roundAndPackFloat64( zSign, zExp, zSig0 );
2909 
2910 }
2911 
2912 /*
2913 -------------------------------------------------------------------------------
2914 Returns the result of dividing the double-precision floating-point value `a'
2915 by the corresponding value `b'.  The operation is performed according to
2916 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2917 -------------------------------------------------------------------------------
2918 */
2919 float64 float64_div( float64 a, float64 b )
2920 {
2921     flag aSign, bSign, zSign;
2922     int16 aExp, bExp, zExp;
2923     bits64 aSig, bSig, zSig;
2924     bits64 rem0, rem1;
2925     bits64 term0, term1;
2926 
2927     aSig = extractFloat64Frac( a );
2928     aExp = extractFloat64Exp( a );
2929     aSign = extractFloat64Sign( a );
2930     bSig = extractFloat64Frac( b );
2931     bExp = extractFloat64Exp( b );
2932     bSign = extractFloat64Sign( b );
2933     zSign = aSign ^ bSign;
2934     if ( aExp == 0x7FF ) {
2935         if ( aSig ) return propagateFloat64NaN( a, b );
2936         if ( bExp == 0x7FF ) {
2937             if ( bSig ) return propagateFloat64NaN( a, b );
2938             float_raise( float_flag_invalid );
2939             return float64_default_nan;
2940         }
2941         return packFloat64( zSign, 0x7FF, 0 );
2942     }
2943     if ( bExp == 0x7FF ) {
2944         if ( bSig ) return propagateFloat64NaN( a, b );
2945         return packFloat64( zSign, 0, 0 );
2946     }
2947     if ( bExp == 0 ) {
2948         if ( bSig == 0 ) {
2949             if ( ( aExp | aSig ) == 0 ) {
2950                 float_raise( float_flag_invalid );
2951                 return float64_default_nan;
2952             }
2953             float_raise( float_flag_divbyzero );
2954             return packFloat64( zSign, 0x7FF, 0 );
2955         }
2956         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2957     }
2958     if ( aExp == 0 ) {
2959         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2960         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2961     }
2962     zExp = aExp - bExp + 0x3FD;
2963     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2964     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2965     if ( bSig <= ( aSig + aSig ) ) {
2966         aSig >>= 1;
2967         ++zExp;
2968     }
2969     zSig = estimateDiv128To64( aSig, 0, bSig );
2970     if ( ( zSig & 0x1FF ) <= 2 ) {
2971         mul64To128( bSig, zSig, &term0, &term1 );
2972         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2973         while ( (sbits64) rem0 < 0 ) {
2974             --zSig;
2975             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2976         }
2977         zSig |= ( rem1 != 0 );
2978     }
2979     return roundAndPackFloat64( zSign, zExp, zSig );
2980 
2981 }
2982 
2983 #ifndef SOFTFLOAT_FOR_GCC
2984 /*
2985 -------------------------------------------------------------------------------
2986 Returns the remainder of the double-precision floating-point value `a'
2987 with respect to the corresponding value `b'.  The operation is performed
2988 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2989 -------------------------------------------------------------------------------
2990 */
2991 float64 float64_rem( float64 a, float64 b )
2992 {
2993     flag aSign, bSign, zSign;
2994     int16 aExp, bExp, expDiff;
2995     bits64 aSig, bSig;
2996     bits64 q, alternateASig;
2997     sbits64 sigMean;
2998 
2999     aSig = extractFloat64Frac( a );
3000     aExp = extractFloat64Exp( a );
3001     aSign = extractFloat64Sign( a );
3002     bSig = extractFloat64Frac( b );
3003     bExp = extractFloat64Exp( b );
3004     bSign = extractFloat64Sign( b );
3005     if ( aExp == 0x7FF ) {
3006         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3007             return propagateFloat64NaN( a, b );
3008         }
3009         float_raise( float_flag_invalid );
3010         return float64_default_nan;
3011     }
3012     if ( bExp == 0x7FF ) {
3013         if ( bSig ) return propagateFloat64NaN( a, b );
3014         return a;
3015     }
3016     if ( bExp == 0 ) {
3017         if ( bSig == 0 ) {
3018             float_raise( float_flag_invalid );
3019             return float64_default_nan;
3020         }
3021         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3022     }
3023     if ( aExp == 0 ) {
3024         if ( aSig == 0 ) return a;
3025         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3026     }
3027     expDiff = aExp - bExp;
3028     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3029     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3030     if ( expDiff < 0 ) {
3031         if ( expDiff < -1 ) return a;
3032         aSig >>= 1;
3033     }
3034     q = ( bSig <= aSig );
3035     if ( q ) aSig -= bSig;
3036     expDiff -= 64;
3037     while ( 0 < expDiff ) {
3038         q = estimateDiv128To64( aSig, 0, bSig );
3039         q = ( 2 < q ) ? q - 2 : 0;
3040         aSig = - ( ( bSig>>2 ) * q );
3041         expDiff -= 62;
3042     }
3043     expDiff += 64;
3044     if ( 0 < expDiff ) {
3045         q = estimateDiv128To64( aSig, 0, bSig );
3046         q = ( 2 < q ) ? q - 2 : 0;
3047         q >>= 64 - expDiff;
3048         bSig >>= 2;
3049         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3050     }
3051     else {
3052         aSig >>= 2;
3053         bSig >>= 2;
3054     }
3055     do {
3056         alternateASig = aSig;
3057         ++q;
3058         aSig -= bSig;
3059     } while ( 0 <= (sbits64) aSig );
3060     sigMean = aSig + alternateASig;
3061     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3062         aSig = alternateASig;
3063     }
3064     zSign = ( (sbits64) aSig < 0 );
3065     if ( zSign ) aSig = - aSig;
3066     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3067 
3068 }
3069 
3070 /*
3071 -------------------------------------------------------------------------------
3072 Returns the square root of the double-precision floating-point value `a'.
3073 The operation is performed according to the IEC/IEEE Standard for Binary
3074 Floating-Point Arithmetic.
3075 -------------------------------------------------------------------------------
3076 */
3077 float64 float64_sqrt( float64 a )
3078 {
3079     flag aSign;
3080     int16 aExp, zExp;
3081     bits64 aSig, zSig, doubleZSig;
3082     bits64 rem0, rem1, term0, term1;
3083 
3084     aSig = extractFloat64Frac( a );
3085     aExp = extractFloat64Exp( a );
3086     aSign = extractFloat64Sign( a );
3087     if ( aExp == 0x7FF ) {
3088         if ( aSig ) return propagateFloat64NaN( a, a );
3089         if ( ! aSign ) return a;
3090         float_raise( float_flag_invalid );
3091         return float64_default_nan;
3092     }
3093     if ( aSign ) {
3094         if ( ( aExp | aSig ) == 0 ) return a;
3095         float_raise( float_flag_invalid );
3096         return float64_default_nan;
3097     }
3098     if ( aExp == 0 ) {
3099         if ( aSig == 0 ) return 0;
3100         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3101     }
3102     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3103     aSig |= LIT64( 0x0010000000000000 );
3104     zSig = estimateSqrt32( aExp, aSig>>21 );
3105     aSig <<= 9 - ( aExp & 1 );
3106     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3107     if ( ( zSig & 0x1FF ) <= 5 ) {
3108         doubleZSig = zSig<<1;
3109         mul64To128( zSig, zSig, &term0, &term1 );
3110         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3111         while ( (sbits64) rem0 < 0 ) {
3112             --zSig;
3113             doubleZSig -= 2;
3114             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3115         }
3116         zSig |= ( ( rem0 | rem1 ) != 0 );
3117     }
3118     return roundAndPackFloat64( 0, zExp, zSig );
3119 
3120 }
3121 #endif
3122 
3123 /*
3124 -------------------------------------------------------------------------------
3125 Returns 1 if the double-precision floating-point value `a' is equal to the
3126 corresponding value `b', and 0 otherwise.  The comparison is performed
3127 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3128 -------------------------------------------------------------------------------
3129 */
3130 flag float64_eq( float64 a, float64 b )
3131 {
3132 
3133     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3134          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3135        ) {
3136         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3137             float_raise( float_flag_invalid );
3138         }
3139         return 0;
3140     }
3141     return ( a == b ) ||
3142 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3143 
3144 }
3145 
3146 /*
3147 -------------------------------------------------------------------------------
3148 Returns 1 if the double-precision floating-point value `a' is less than or
3149 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3150 performed according to the IEC/IEEE Standard for Binary Floating-Point
3151 Arithmetic.
3152 -------------------------------------------------------------------------------
3153 */
3154 flag float64_le( float64 a, float64 b )
3155 {
3156     flag aSign, bSign;
3157 
3158     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3159          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3160        ) {
3161         float_raise( float_flag_invalid );
3162         return 0;
3163     }
3164     aSign = extractFloat64Sign( a );
3165     bSign = extractFloat64Sign( b );
3166     if ( aSign != bSign )
3167 	return aSign ||
3168 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3169 	      0 );
3170     return ( a == b ) ||
3171 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3172 
3173 }
3174 
3175 /*
3176 -------------------------------------------------------------------------------
3177 Returns 1 if the double-precision floating-point value `a' is less than
3178 the corresponding value `b', and 0 otherwise.  The comparison is performed
3179 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3180 -------------------------------------------------------------------------------
3181 */
3182 flag float64_lt( float64 a, float64 b )
3183 {
3184     flag aSign, bSign;
3185 
3186     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3187          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3188        ) {
3189         float_raise( float_flag_invalid );
3190         return 0;
3191     }
3192     aSign = extractFloat64Sign( a );
3193     bSign = extractFloat64Sign( b );
3194     if ( aSign != bSign )
3195 	return aSign &&
3196 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3197 	      0 );
3198     return ( a != b ) &&
3199 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3200 
3201 }
3202 
3203 #ifndef SOFTFLOAT_FOR_GCC
3204 /*
3205 -------------------------------------------------------------------------------
3206 Returns 1 if the double-precision floating-point value `a' is equal to the
3207 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3208 if either operand is a NaN.  Otherwise, the comparison is performed
3209 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3210 -------------------------------------------------------------------------------
3211 */
3212 flag float64_eq_signaling( float64 a, float64 b )
3213 {
3214 
3215     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3216          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3217        ) {
3218         float_raise( float_flag_invalid );
3219         return 0;
3220     }
3221     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3222 
3223 }
3224 
3225 /*
3226 -------------------------------------------------------------------------------
3227 Returns 1 if the double-precision floating-point value `a' is less than or
3228 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3229 cause an exception.  Otherwise, the comparison is performed according to the
3230 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3231 -------------------------------------------------------------------------------
3232 */
3233 flag float64_le_quiet( float64 a, float64 b )
3234 {
3235     flag aSign, bSign;
3236 
3237     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3238          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3239        ) {
3240         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3241             float_raise( float_flag_invalid );
3242         }
3243         return 0;
3244     }
3245     aSign = extractFloat64Sign( a );
3246     bSign = extractFloat64Sign( b );
3247     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3248     return ( a == b ) || ( aSign ^ ( a < b ) );
3249 
3250 }
3251 
3252 /*
3253 -------------------------------------------------------------------------------
3254 Returns 1 if the double-precision floating-point value `a' is less than
3255 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3256 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3257 Standard for Binary Floating-Point Arithmetic.
3258 -------------------------------------------------------------------------------
3259 */
3260 flag float64_lt_quiet( float64 a, float64 b )
3261 {
3262     flag aSign, bSign;
3263 
3264     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3265          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3266        ) {
3267         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3268             float_raise( float_flag_invalid );
3269         }
3270         return 0;
3271     }
3272     aSign = extractFloat64Sign( a );
3273     bSign = extractFloat64Sign( b );
3274     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3275     return ( a != b ) && ( aSign ^ ( a < b ) );
3276 
3277 }
3278 #endif
3279 
3280 #ifdef FLOATX80
3281 
3282 /*
3283 -------------------------------------------------------------------------------
3284 Returns the result of converting the extended double-precision floating-
3285 point value `a' to the 32-bit two's complement integer format.  The
3286 conversion is performed according to the IEC/IEEE Standard for Binary
3287 Floating-Point Arithmetic---which means in particular that the conversion
3288 is rounded according to the current rounding mode.  If `a' is a NaN, the
3289 largest positive integer is returned.  Otherwise, if the conversion
3290 overflows, the largest integer with the same sign as `a' is returned.
3291 -------------------------------------------------------------------------------
3292 */
3293 int32 floatx80_to_int32( floatx80 a )
3294 {
3295     flag aSign;
3296     int32 aExp, shiftCount;
3297     bits64 aSig;
3298 
3299     aSig = extractFloatx80Frac( a );
3300     aExp = extractFloatx80Exp( a );
3301     aSign = extractFloatx80Sign( a );
3302     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3303     shiftCount = 0x4037 - aExp;
3304     if ( shiftCount <= 0 ) shiftCount = 1;
3305     shift64RightJamming( aSig, shiftCount, &aSig );
3306     return roundAndPackInt32( aSign, aSig );
3307 
3308 }
3309 
3310 /*
3311 -------------------------------------------------------------------------------
3312 Returns the result of converting the extended double-precision floating-
3313 point value `a' to the 32-bit two's complement integer format.  The
3314 conversion is performed according to the IEC/IEEE Standard for Binary
3315 Floating-Point Arithmetic, except that the conversion is always rounded
3316 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3317 Otherwise, if the conversion overflows, the largest integer with the same
3318 sign as `a' is returned.
3319 -------------------------------------------------------------------------------
3320 */
3321 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3322 {
3323     flag aSign;
3324     int32 aExp, shiftCount;
3325     bits64 aSig, savedASig;
3326     int32 z;
3327 
3328     aSig = extractFloatx80Frac( a );
3329     aExp = extractFloatx80Exp( a );
3330     aSign = extractFloatx80Sign( a );
3331     if ( 0x401E < aExp ) {
3332         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3333         goto invalid;
3334     }
3335     else if ( aExp < 0x3FFF ) {
3336         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3337         return 0;
3338     }
3339     shiftCount = 0x403E - aExp;
3340     savedASig = aSig;
3341     aSig >>= shiftCount;
3342     z = aSig;
3343     if ( aSign ) z = - z;
3344     if ( ( z < 0 ) ^ aSign ) {
3345  invalid:
3346         float_raise( float_flag_invalid );
3347         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3348     }
3349     if ( ( aSig<<shiftCount ) != savedASig ) {
3350         float_exception_flags |= float_flag_inexact;
3351     }
3352     return z;
3353 
3354 }
3355 
3356 /*
3357 -------------------------------------------------------------------------------
3358 Returns the result of converting the extended double-precision floating-
3359 point value `a' to the 64-bit two's complement integer format.  The
3360 conversion is performed according to the IEC/IEEE Standard for Binary
3361 Floating-Point Arithmetic---which means in particular that the conversion
3362 is rounded according to the current rounding mode.  If `a' is a NaN,
3363 the largest positive integer is returned.  Otherwise, if the conversion
3364 overflows, the largest integer with the same sign as `a' is returned.
3365 -------------------------------------------------------------------------------
3366 */
3367 int64 floatx80_to_int64( floatx80 a )
3368 {
3369     flag aSign;
3370     int32 aExp, shiftCount;
3371     bits64 aSig, aSigExtra;
3372 
3373     aSig = extractFloatx80Frac( a );
3374     aExp = extractFloatx80Exp( a );
3375     aSign = extractFloatx80Sign( a );
3376     shiftCount = 0x403E - aExp;
3377     if ( shiftCount <= 0 ) {
3378         if ( shiftCount ) {
3379             float_raise( float_flag_invalid );
3380             if (    ! aSign
3381                  || (    ( aExp == 0x7FFF )
3382                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3383                ) {
3384                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3385             }
3386             return (sbits64) LIT64( 0x8000000000000000 );
3387         }
3388         aSigExtra = 0;
3389     }
3390     else {
3391         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3392     }
3393     return roundAndPackInt64( aSign, aSig, aSigExtra );
3394 
3395 }
3396 
3397 /*
3398 -------------------------------------------------------------------------------
3399 Returns the result of converting the extended double-precision floating-
3400 point value `a' to the 64-bit two's complement integer format.  The
3401 conversion is performed according to the IEC/IEEE Standard for Binary
3402 Floating-Point Arithmetic, except that the conversion is always rounded
3403 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3404 Otherwise, if the conversion overflows, the largest integer with the same
3405 sign as `a' is returned.
3406 -------------------------------------------------------------------------------
3407 */
3408 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3409 {
3410     flag aSign;
3411     int32 aExp, shiftCount;
3412     bits64 aSig;
3413     int64 z;
3414 
3415     aSig = extractFloatx80Frac( a );
3416     aExp = extractFloatx80Exp( a );
3417     aSign = extractFloatx80Sign( a );
3418     shiftCount = aExp - 0x403E;
3419     if ( 0 <= shiftCount ) {
3420         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3421         if ( ( a.high != 0xC03E ) || aSig ) {
3422             float_raise( float_flag_invalid );
3423             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3424                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3425             }
3426         }
3427         return (sbits64) LIT64( 0x8000000000000000 );
3428     }
3429     else if ( aExp < 0x3FFF ) {
3430         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3431         return 0;
3432     }
3433     z = aSig>>( - shiftCount );
3434     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3435         float_exception_flags |= float_flag_inexact;
3436     }
3437     if ( aSign ) z = - z;
3438     return z;
3439 
3440 }
3441 
3442 /*
3443 -------------------------------------------------------------------------------
3444 Returns the result of converting the extended double-precision floating-
3445 point value `a' to the single-precision floating-point format.  The
3446 conversion is performed according to the IEC/IEEE Standard for Binary
3447 Floating-Point Arithmetic.
3448 -------------------------------------------------------------------------------
3449 */
3450 float32 floatx80_to_float32( floatx80 a )
3451 {
3452     flag aSign;
3453     int32 aExp;
3454     bits64 aSig;
3455 
3456     aSig = extractFloatx80Frac( a );
3457     aExp = extractFloatx80Exp( a );
3458     aSign = extractFloatx80Sign( a );
3459     if ( aExp == 0x7FFF ) {
3460         if ( (bits64) ( aSig<<1 ) ) {
3461             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3462         }
3463         return packFloat32( aSign, 0xFF, 0 );
3464     }
3465     shift64RightJamming( aSig, 33, &aSig );
3466     if ( aExp || aSig ) aExp -= 0x3F81;
3467     return roundAndPackFloat32( aSign, aExp, aSig );
3468 
3469 }
3470 
3471 /*
3472 -------------------------------------------------------------------------------
3473 Returns the result of converting the extended double-precision floating-
3474 point value `a' to the double-precision floating-point format.  The
3475 conversion is performed according to the IEC/IEEE Standard for Binary
3476 Floating-Point Arithmetic.
3477 -------------------------------------------------------------------------------
3478 */
3479 float64 floatx80_to_float64( floatx80 a )
3480 {
3481     flag aSign;
3482     int32 aExp;
3483     bits64 aSig, zSig;
3484 
3485     aSig = extractFloatx80Frac( a );
3486     aExp = extractFloatx80Exp( a );
3487     aSign = extractFloatx80Sign( a );
3488     if ( aExp == 0x7FFF ) {
3489         if ( (bits64) ( aSig<<1 ) ) {
3490             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3491         }
3492         return packFloat64( aSign, 0x7FF, 0 );
3493     }
3494     shift64RightJamming( aSig, 1, &zSig );
3495     if ( aExp || aSig ) aExp -= 0x3C01;
3496     return roundAndPackFloat64( aSign, aExp, zSig );
3497 
3498 }
3499 
3500 #ifdef FLOAT128
3501 
3502 /*
3503 -------------------------------------------------------------------------------
3504 Returns the result of converting the extended double-precision floating-
3505 point value `a' to the quadruple-precision floating-point format.  The
3506 conversion is performed according to the IEC/IEEE Standard for Binary
3507 Floating-Point Arithmetic.
3508 -------------------------------------------------------------------------------
3509 */
3510 float128 floatx80_to_float128( floatx80 a )
3511 {
3512     flag aSign;
3513     int16 aExp;
3514     bits64 aSig, zSig0, zSig1;
3515 
3516     aSig = extractFloatx80Frac( a );
3517     aExp = extractFloatx80Exp( a );
3518     aSign = extractFloatx80Sign( a );
3519     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3520         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3521     }
3522     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3523     return packFloat128( aSign, aExp, zSig0, zSig1 );
3524 
3525 }
3526 
3527 #endif
3528 
3529 /*
3530 -------------------------------------------------------------------------------
3531 Rounds the extended double-precision floating-point value `a' to an integer,
3532 and returns the result as an extended quadruple-precision floating-point
3533 value.  The operation is performed according to the IEC/IEEE Standard for
3534 Binary Floating-Point Arithmetic.
3535 -------------------------------------------------------------------------------
3536 */
3537 floatx80 floatx80_round_to_int( floatx80 a )
3538 {
3539     flag aSign;
3540     int32 aExp;
3541     bits64 lastBitMask, roundBitsMask;
3542     int8 roundingMode;
3543     floatx80 z;
3544 
3545     aExp = extractFloatx80Exp( a );
3546     if ( 0x403E <= aExp ) {
3547         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3548             return propagateFloatx80NaN( a, a );
3549         }
3550         return a;
3551     }
3552     if ( aExp < 0x3FFF ) {
3553         if (    ( aExp == 0 )
3554              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3555             return a;
3556         }
3557         float_exception_flags |= float_flag_inexact;
3558         aSign = extractFloatx80Sign( a );
3559         switch ( float_rounding_mode ) {
3560          case float_round_nearest_even:
3561             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3562                ) {
3563                 return
3564                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3565             }
3566             break;
3567 	 case float_round_to_zero:
3568 	    break;
3569          case float_round_down:
3570             return
3571                   aSign ?
3572                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3573                 : packFloatx80( 0, 0, 0 );
3574          case float_round_up:
3575             return
3576                   aSign ? packFloatx80( 1, 0, 0 )
3577                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3578         }
3579         return packFloatx80( aSign, 0, 0 );
3580     }
3581     lastBitMask = 1;
3582     lastBitMask <<= 0x403E - aExp;
3583     roundBitsMask = lastBitMask - 1;
3584     z = a;
3585     roundingMode = float_rounding_mode;
3586     if ( roundingMode == float_round_nearest_even ) {
3587         z.low += lastBitMask>>1;
3588         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3589     }
3590     else if ( roundingMode != float_round_to_zero ) {
3591         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3592             z.low += roundBitsMask;
3593         }
3594     }
3595     z.low &= ~ roundBitsMask;
3596     if ( z.low == 0 ) {
3597         ++z.high;
3598         z.low = LIT64( 0x8000000000000000 );
3599     }
3600     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3601     return z;
3602 
3603 }
3604 
3605 /*
3606 -------------------------------------------------------------------------------
3607 Returns the result of adding the absolute values of the extended double-
3608 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3609 negated before being returned.  `zSign' is ignored if the result is a NaN.
3610 The addition is performed according to the IEC/IEEE Standard for Binary
3611 Floating-Point Arithmetic.
3612 -------------------------------------------------------------------------------
3613 */
3614 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3615 {
3616     int32 aExp, bExp, zExp;
3617     bits64 aSig, bSig, zSig0, zSig1;
3618     int32 expDiff;
3619 
3620     aSig = extractFloatx80Frac( a );
3621     aExp = extractFloatx80Exp( a );
3622     bSig = extractFloatx80Frac( b );
3623     bExp = extractFloatx80Exp( b );
3624     expDiff = aExp - bExp;
3625     if ( 0 < expDiff ) {
3626         if ( aExp == 0x7FFF ) {
3627             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3628             return a;
3629         }
3630         if ( bExp == 0 ) --expDiff;
3631         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3632         zExp = aExp;
3633     }
3634     else if ( expDiff < 0 ) {
3635         if ( bExp == 0x7FFF ) {
3636             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3637             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3638         }
3639         if ( aExp == 0 ) ++expDiff;
3640         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3641         zExp = bExp;
3642     }
3643     else {
3644         if ( aExp == 0x7FFF ) {
3645             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3646                 return propagateFloatx80NaN( a, b );
3647             }
3648             return a;
3649         }
3650         zSig1 = 0;
3651         zSig0 = aSig + bSig;
3652         if ( aExp == 0 ) {
3653             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3654             goto roundAndPack;
3655         }
3656         zExp = aExp;
3657         goto shiftRight1;
3658     }
3659     zSig0 = aSig + bSig;
3660     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3661  shiftRight1:
3662     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3663     zSig0 |= LIT64( 0x8000000000000000 );
3664     ++zExp;
3665  roundAndPack:
3666     return
3667         roundAndPackFloatx80(
3668             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3669 
3670 }
3671 
3672 /*
3673 -------------------------------------------------------------------------------
3674 Returns the result of subtracting the absolute values of the extended
3675 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3676 difference is negated before being returned.  `zSign' is ignored if the
3677 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3678 Standard for Binary Floating-Point Arithmetic.
3679 -------------------------------------------------------------------------------
3680 */
3681 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3682 {
3683     int32 aExp, bExp, zExp;
3684     bits64 aSig, bSig, zSig0, zSig1;
3685     int32 expDiff;
3686     floatx80 z;
3687 
3688     aSig = extractFloatx80Frac( a );
3689     aExp = extractFloatx80Exp( a );
3690     bSig = extractFloatx80Frac( b );
3691     bExp = extractFloatx80Exp( b );
3692     expDiff = aExp - bExp;
3693     if ( 0 < expDiff ) goto aExpBigger;
3694     if ( expDiff < 0 ) goto bExpBigger;
3695     if ( aExp == 0x7FFF ) {
3696         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3697             return propagateFloatx80NaN( a, b );
3698         }
3699         float_raise( float_flag_invalid );
3700         z.low = floatx80_default_nan_low;
3701         z.high = floatx80_default_nan_high;
3702         return z;
3703     }
3704     if ( aExp == 0 ) {
3705         aExp = 1;
3706         bExp = 1;
3707     }
3708     zSig1 = 0;
3709     if ( bSig < aSig ) goto aBigger;
3710     if ( aSig < bSig ) goto bBigger;
3711     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3712  bExpBigger:
3713     if ( bExp == 0x7FFF ) {
3714         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3715         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3716     }
3717     if ( aExp == 0 ) ++expDiff;
3718     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3719  bBigger:
3720     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3721     zExp = bExp;
3722     zSign ^= 1;
3723     goto normalizeRoundAndPack;
3724  aExpBigger:
3725     if ( aExp == 0x7FFF ) {
3726         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3727         return a;
3728     }
3729     if ( bExp == 0 ) --expDiff;
3730     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3731  aBigger:
3732     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3733     zExp = aExp;
3734  normalizeRoundAndPack:
3735     return
3736         normalizeRoundAndPackFloatx80(
3737             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3738 
3739 }
3740 
3741 /*
3742 -------------------------------------------------------------------------------
3743 Returns the result of adding the extended double-precision floating-point
3744 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3745 Standard for Binary Floating-Point Arithmetic.
3746 -------------------------------------------------------------------------------
3747 */
3748 floatx80 floatx80_add( floatx80 a, floatx80 b )
3749 {
3750     flag aSign, bSign;
3751 
3752     aSign = extractFloatx80Sign( a );
3753     bSign = extractFloatx80Sign( b );
3754     if ( aSign == bSign ) {
3755         return addFloatx80Sigs( a, b, aSign );
3756     }
3757     else {
3758         return subFloatx80Sigs( a, b, aSign );
3759     }
3760 
3761 }
3762 
3763 /*
3764 -------------------------------------------------------------------------------
3765 Returns the result of subtracting the extended double-precision floating-
3766 point values `a' and `b'.  The operation is performed according to the
3767 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3768 -------------------------------------------------------------------------------
3769 */
3770 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3771 {
3772     flag aSign, bSign;
3773 
3774     aSign = extractFloatx80Sign( a );
3775     bSign = extractFloatx80Sign( b );
3776     if ( aSign == bSign ) {
3777         return subFloatx80Sigs( a, b, aSign );
3778     }
3779     else {
3780         return addFloatx80Sigs( a, b, aSign );
3781     }
3782 
3783 }
3784 
3785 /*
3786 -------------------------------------------------------------------------------
3787 Returns the result of multiplying the extended double-precision floating-
3788 point values `a' and `b'.  The operation is performed according to the
3789 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3790 -------------------------------------------------------------------------------
3791 */
3792 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3793 {
3794     flag aSign, bSign, zSign;
3795     int32 aExp, bExp, zExp;
3796     bits64 aSig, bSig, zSig0, zSig1;
3797     floatx80 z;
3798 
3799     aSig = extractFloatx80Frac( a );
3800     aExp = extractFloatx80Exp( a );
3801     aSign = extractFloatx80Sign( a );
3802     bSig = extractFloatx80Frac( b );
3803     bExp = extractFloatx80Exp( b );
3804     bSign = extractFloatx80Sign( b );
3805     zSign = aSign ^ bSign;
3806     if ( aExp == 0x7FFF ) {
3807         if (    (bits64) ( aSig<<1 )
3808              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3809             return propagateFloatx80NaN( a, b );
3810         }
3811         if ( ( bExp | bSig ) == 0 ) goto invalid;
3812         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3813     }
3814     if ( bExp == 0x7FFF ) {
3815         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3816         if ( ( aExp | aSig ) == 0 ) {
3817  invalid:
3818             float_raise( float_flag_invalid );
3819             z.low = floatx80_default_nan_low;
3820             z.high = floatx80_default_nan_high;
3821             return z;
3822         }
3823         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3824     }
3825     if ( aExp == 0 ) {
3826         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3827         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3828     }
3829     if ( bExp == 0 ) {
3830         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3831         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3832     }
3833     zExp = aExp + bExp - 0x3FFE;
3834     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3835     if ( 0 < (sbits64) zSig0 ) {
3836         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3837         --zExp;
3838     }
3839     return
3840         roundAndPackFloatx80(
3841             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3842 
3843 }
3844 
3845 /*
3846 -------------------------------------------------------------------------------
3847 Returns the result of dividing the extended double-precision floating-point
3848 value `a' by the corresponding value `b'.  The operation is performed
3849 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3850 -------------------------------------------------------------------------------
3851 */
3852 floatx80 floatx80_div( floatx80 a, floatx80 b )
3853 {
3854     flag aSign, bSign, zSign;
3855     int32 aExp, bExp, zExp;
3856     bits64 aSig, bSig, zSig0, zSig1;
3857     bits64 rem0, rem1, rem2, term0, term1, term2;
3858     floatx80 z;
3859 
3860     aSig = extractFloatx80Frac( a );
3861     aExp = extractFloatx80Exp( a );
3862     aSign = extractFloatx80Sign( a );
3863     bSig = extractFloatx80Frac( b );
3864     bExp = extractFloatx80Exp( b );
3865     bSign = extractFloatx80Sign( b );
3866     zSign = aSign ^ bSign;
3867     if ( aExp == 0x7FFF ) {
3868         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3869         if ( bExp == 0x7FFF ) {
3870             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3871             goto invalid;
3872         }
3873         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3874     }
3875     if ( bExp == 0x7FFF ) {
3876         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3877         return packFloatx80( zSign, 0, 0 );
3878     }
3879     if ( bExp == 0 ) {
3880         if ( bSig == 0 ) {
3881             if ( ( aExp | aSig ) == 0 ) {
3882  invalid:
3883                 float_raise( float_flag_invalid );
3884                 z.low = floatx80_default_nan_low;
3885                 z.high = floatx80_default_nan_high;
3886                 return z;
3887             }
3888             float_raise( float_flag_divbyzero );
3889             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3890         }
3891         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3892     }
3893     if ( aExp == 0 ) {
3894         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3895         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3896     }
3897     zExp = aExp - bExp + 0x3FFE;
3898     rem1 = 0;
3899     if ( bSig <= aSig ) {
3900         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3901         ++zExp;
3902     }
3903     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3904     mul64To128( bSig, zSig0, &term0, &term1 );
3905     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3906     while ( (sbits64) rem0 < 0 ) {
3907         --zSig0;
3908         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3909     }
3910     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3911     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3912         mul64To128( bSig, zSig1, &term1, &term2 );
3913         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3914         while ( (sbits64) rem1 < 0 ) {
3915             --zSig1;
3916             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3917         }
3918         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3919     }
3920     return
3921         roundAndPackFloatx80(
3922             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3923 
3924 }
3925 
3926 /*
3927 -------------------------------------------------------------------------------
3928 Returns the remainder of the extended double-precision floating-point value
3929 `a' with respect to the corresponding value `b'.  The operation is performed
3930 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3931 -------------------------------------------------------------------------------
3932 */
3933 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3934 {
3935     flag aSign, bSign, zSign;
3936     int32 aExp, bExp, expDiff;
3937     bits64 aSig0, aSig1, bSig;
3938     bits64 q, term0, term1, alternateASig0, alternateASig1;
3939     floatx80 z;
3940 
3941     aSig0 = extractFloatx80Frac( a );
3942     aExp = extractFloatx80Exp( a );
3943     aSign = extractFloatx80Sign( a );
3944     bSig = extractFloatx80Frac( b );
3945     bExp = extractFloatx80Exp( b );
3946     bSign = extractFloatx80Sign( b );
3947     if ( aExp == 0x7FFF ) {
3948         if (    (bits64) ( aSig0<<1 )
3949              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3950             return propagateFloatx80NaN( a, b );
3951         }
3952         goto invalid;
3953     }
3954     if ( bExp == 0x7FFF ) {
3955         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3956         return a;
3957     }
3958     if ( bExp == 0 ) {
3959         if ( bSig == 0 ) {
3960  invalid:
3961             float_raise( float_flag_invalid );
3962             z.low = floatx80_default_nan_low;
3963             z.high = floatx80_default_nan_high;
3964             return z;
3965         }
3966         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3967     }
3968     if ( aExp == 0 ) {
3969         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3970         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3971     }
3972     bSig |= LIT64( 0x8000000000000000 );
3973     zSign = aSign;
3974     expDiff = aExp - bExp;
3975     aSig1 = 0;
3976     if ( expDiff < 0 ) {
3977         if ( expDiff < -1 ) return a;
3978         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3979         expDiff = 0;
3980     }
3981     q = ( bSig <= aSig0 );
3982     if ( q ) aSig0 -= bSig;
3983     expDiff -= 64;
3984     while ( 0 < expDiff ) {
3985         q = estimateDiv128To64( aSig0, aSig1, bSig );
3986         q = ( 2 < q ) ? q - 2 : 0;
3987         mul64To128( bSig, q, &term0, &term1 );
3988         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3989         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3990         expDiff -= 62;
3991     }
3992     expDiff += 64;
3993     if ( 0 < expDiff ) {
3994         q = estimateDiv128To64( aSig0, aSig1, bSig );
3995         q = ( 2 < q ) ? q - 2 : 0;
3996         q >>= 64 - expDiff;
3997         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3998         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3999         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4000         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4001             ++q;
4002             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4003         }
4004     }
4005     else {
4006         term1 = 0;
4007         term0 = bSig;
4008     }
4009     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4010     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4011          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4012               && ( q & 1 ) )
4013        ) {
4014         aSig0 = alternateASig0;
4015         aSig1 = alternateASig1;
4016         zSign = ! zSign;
4017     }
4018     return
4019         normalizeRoundAndPackFloatx80(
4020             80, zSign, bExp + expDiff, aSig0, aSig1 );
4021 
4022 }
4023 
4024 /*
4025 -------------------------------------------------------------------------------
4026 Returns the square root of the extended double-precision floating-point
4027 value `a'.  The operation is performed according to the IEC/IEEE Standard
4028 for Binary Floating-Point Arithmetic.
4029 -------------------------------------------------------------------------------
4030 */
4031 floatx80 floatx80_sqrt( floatx80 a )
4032 {
4033     flag aSign;
4034     int32 aExp, zExp;
4035     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4036     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4037     floatx80 z;
4038 
4039     aSig0 = extractFloatx80Frac( a );
4040     aExp = extractFloatx80Exp( a );
4041     aSign = extractFloatx80Sign( a );
4042     if ( aExp == 0x7FFF ) {
4043         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4044         if ( ! aSign ) return a;
4045         goto invalid;
4046     }
4047     if ( aSign ) {
4048         if ( ( aExp | aSig0 ) == 0 ) return a;
4049  invalid:
4050         float_raise( float_flag_invalid );
4051         z.low = floatx80_default_nan_low;
4052         z.high = floatx80_default_nan_high;
4053         return z;
4054     }
4055     if ( aExp == 0 ) {
4056         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4057         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4058     }
4059     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4060     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4061     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4062     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4063     doubleZSig0 = zSig0<<1;
4064     mul64To128( zSig0, zSig0, &term0, &term1 );
4065     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4066     while ( (sbits64) rem0 < 0 ) {
4067         --zSig0;
4068         doubleZSig0 -= 2;
4069         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4070     }
4071     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4072     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4073         if ( zSig1 == 0 ) zSig1 = 1;
4074         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4075         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4076         mul64To128( zSig1, zSig1, &term2, &term3 );
4077         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4078         while ( (sbits64) rem1 < 0 ) {
4079             --zSig1;
4080             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4081             term3 |= 1;
4082             term2 |= doubleZSig0;
4083             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4084         }
4085         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4086     }
4087     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4088     zSig0 |= doubleZSig0;
4089     return
4090         roundAndPackFloatx80(
4091             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4092 
4093 }
4094 
4095 /*
4096 -------------------------------------------------------------------------------
4097 Returns 1 if the extended double-precision floating-point value `a' is
4098 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4099 performed according to the IEC/IEEE Standard for Binary Floating-Point
4100 Arithmetic.
4101 -------------------------------------------------------------------------------
4102 */
4103 flag floatx80_eq( floatx80 a, floatx80 b )
4104 {
4105 
4106     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4107               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4108          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4109               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4110        ) {
4111         if (    floatx80_is_signaling_nan( a )
4112              || floatx80_is_signaling_nan( b ) ) {
4113             float_raise( float_flag_invalid );
4114         }
4115         return 0;
4116     }
4117     return
4118            ( a.low == b.low )
4119         && (    ( a.high == b.high )
4120              || (    ( a.low == 0 )
4121                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4122            );
4123 
4124 }
4125 
4126 /*
4127 -------------------------------------------------------------------------------
4128 Returns 1 if the extended double-precision floating-point value `a' is
4129 less than or equal to the corresponding value `b', and 0 otherwise.  The
4130 comparison is performed according to the IEC/IEEE Standard for Binary
4131 Floating-Point Arithmetic.
4132 -------------------------------------------------------------------------------
4133 */
4134 flag floatx80_le( floatx80 a, floatx80 b )
4135 {
4136     flag aSign, bSign;
4137 
4138     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4139               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4140          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4141               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4142        ) {
4143         float_raise( float_flag_invalid );
4144         return 0;
4145     }
4146     aSign = extractFloatx80Sign( a );
4147     bSign = extractFloatx80Sign( b );
4148     if ( aSign != bSign ) {
4149         return
4150                aSign
4151             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4152                  == 0 );
4153     }
4154     return
4155           aSign ? le128( b.high, b.low, a.high, a.low )
4156         : le128( a.high, a.low, b.high, b.low );
4157 
4158 }
4159 
4160 /*
4161 -------------------------------------------------------------------------------
4162 Returns 1 if the extended double-precision floating-point value `a' is
4163 less than the corresponding value `b', and 0 otherwise.  The comparison
4164 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4165 Arithmetic.
4166 -------------------------------------------------------------------------------
4167 */
4168 flag floatx80_lt( floatx80 a, floatx80 b )
4169 {
4170     flag aSign, bSign;
4171 
4172     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4173               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4174          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4175               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4176        ) {
4177         float_raise( float_flag_invalid );
4178         return 0;
4179     }
4180     aSign = extractFloatx80Sign( a );
4181     bSign = extractFloatx80Sign( b );
4182     if ( aSign != bSign ) {
4183         return
4184                aSign
4185             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4186                  != 0 );
4187     }
4188     return
4189           aSign ? lt128( b.high, b.low, a.high, a.low )
4190         : lt128( a.high, a.low, b.high, b.low );
4191 
4192 }
4193 
4194 /*
4195 -------------------------------------------------------------------------------
4196 Returns 1 if the extended double-precision floating-point value `a' is equal
4197 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4198 raised if either operand is a NaN.  Otherwise, the comparison is performed
4199 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4200 -------------------------------------------------------------------------------
4201 */
4202 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4203 {
4204 
4205     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4206               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4207          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4208               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4209        ) {
4210         float_raise( float_flag_invalid );
4211         return 0;
4212     }
4213     return
4214            ( a.low == b.low )
4215         && (    ( a.high == b.high )
4216              || (    ( a.low == 0 )
4217                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4218            );
4219 
4220 }
4221 
4222 /*
4223 -------------------------------------------------------------------------------
4224 Returns 1 if the extended double-precision floating-point value `a' is less
4225 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4226 do not cause an exception.  Otherwise, the comparison is performed according
4227 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4228 -------------------------------------------------------------------------------
4229 */
4230 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4231 {
4232     flag aSign, bSign;
4233 
4234     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4235               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4236          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4237               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4238        ) {
4239         if (    floatx80_is_signaling_nan( a )
4240              || floatx80_is_signaling_nan( b ) ) {
4241             float_raise( float_flag_invalid );
4242         }
4243         return 0;
4244     }
4245     aSign = extractFloatx80Sign( a );
4246     bSign = extractFloatx80Sign( b );
4247     if ( aSign != bSign ) {
4248         return
4249                aSign
4250             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4251                  == 0 );
4252     }
4253     return
4254           aSign ? le128( b.high, b.low, a.high, a.low )
4255         : le128( a.high, a.low, b.high, b.low );
4256 
4257 }
4258 
4259 /*
4260 -------------------------------------------------------------------------------
4261 Returns 1 if the extended double-precision floating-point value `a' is less
4262 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4263 an exception.  Otherwise, the comparison is performed according to the
4264 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4265 -------------------------------------------------------------------------------
4266 */
4267 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4268 {
4269     flag aSign, bSign;
4270 
4271     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4272               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4273          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4274               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4275        ) {
4276         if (    floatx80_is_signaling_nan( a )
4277              || floatx80_is_signaling_nan( b ) ) {
4278             float_raise( float_flag_invalid );
4279         }
4280         return 0;
4281     }
4282     aSign = extractFloatx80Sign( a );
4283     bSign = extractFloatx80Sign( b );
4284     if ( aSign != bSign ) {
4285         return
4286                aSign
4287             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4288                  != 0 );
4289     }
4290     return
4291           aSign ? lt128( b.high, b.low, a.high, a.low )
4292         : lt128( a.high, a.low, b.high, b.low );
4293 
4294 }
4295 
4296 #endif
4297 
4298 #ifdef FLOAT128
4299 
4300 /*
4301 -------------------------------------------------------------------------------
4302 Returns the result of converting the quadruple-precision floating-point
4303 value `a' to the 32-bit two's complement integer format.  The conversion
4304 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4305 Arithmetic---which means in particular that the conversion is rounded
4306 according to the current rounding mode.  If `a' is a NaN, the largest
4307 positive integer is returned.  Otherwise, if the conversion overflows, the
4308 largest integer with the same sign as `a' is returned.
4309 -------------------------------------------------------------------------------
4310 */
4311 int32 float128_to_int32( float128 a )
4312 {
4313     flag aSign;
4314     int32 aExp, shiftCount;
4315     bits64 aSig0, aSig1;
4316 
4317     aSig1 = extractFloat128Frac1( a );
4318     aSig0 = extractFloat128Frac0( a );
4319     aExp = extractFloat128Exp( a );
4320     aSign = extractFloat128Sign( a );
4321     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4322     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4323     aSig0 |= ( aSig1 != 0 );
4324     shiftCount = 0x4028 - aExp;
4325     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4326     return roundAndPackInt32( aSign, aSig0 );
4327 
4328 }
4329 
4330 /*
4331 -------------------------------------------------------------------------------
4332 Returns the result of converting the quadruple-precision floating-point
4333 value `a' to the 32-bit two's complement integer format.  The conversion
4334 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4335 Arithmetic, except that the conversion is always rounded toward zero.  If
4336 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4337 conversion overflows, the largest integer with the same sign as `a' is
4338 returned.
4339 -------------------------------------------------------------------------------
4340 */
4341 int32 float128_to_int32_round_to_zero( float128 a )
4342 {
4343     flag aSign;
4344     int32 aExp, shiftCount;
4345     bits64 aSig0, aSig1, savedASig;
4346     int32 z;
4347 
4348     aSig1 = extractFloat128Frac1( a );
4349     aSig0 = extractFloat128Frac0( a );
4350     aExp = extractFloat128Exp( a );
4351     aSign = extractFloat128Sign( a );
4352     aSig0 |= ( aSig1 != 0 );
4353     if ( 0x401E < aExp ) {
4354         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4355         goto invalid;
4356     }
4357     else if ( aExp < 0x3FFF ) {
4358         if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4359         return 0;
4360     }
4361     aSig0 |= LIT64( 0x0001000000000000 );
4362     shiftCount = 0x402F - aExp;
4363     savedASig = aSig0;
4364     aSig0 >>= shiftCount;
4365     z = aSig0;
4366     if ( aSign ) z = - z;
4367     if ( ( z < 0 ) ^ aSign ) {
4368  invalid:
4369         float_raise( float_flag_invalid );
4370         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4371     }
4372     if ( ( aSig0<<shiftCount ) != savedASig ) {
4373         float_exception_flags |= float_flag_inexact;
4374     }
4375     return z;
4376 
4377 }
4378 
4379 /*
4380 -------------------------------------------------------------------------------
4381 Returns the result of converting the quadruple-precision floating-point
4382 value `a' to the 64-bit two's complement integer format.  The conversion
4383 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4384 Arithmetic---which means in particular that the conversion is rounded
4385 according to the current rounding mode.  If `a' is a NaN, the largest
4386 positive integer is returned.  Otherwise, if the conversion overflows, the
4387 largest integer with the same sign as `a' is returned.
4388 -------------------------------------------------------------------------------
4389 */
4390 int64 float128_to_int64( float128 a )
4391 {
4392     flag aSign;
4393     int32 aExp, shiftCount;
4394     bits64 aSig0, aSig1;
4395 
4396     aSig1 = extractFloat128Frac1( a );
4397     aSig0 = extractFloat128Frac0( a );
4398     aExp = extractFloat128Exp( a );
4399     aSign = extractFloat128Sign( a );
4400     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4401     shiftCount = 0x402F - aExp;
4402     if ( shiftCount <= 0 ) {
4403         if ( 0x403E < aExp ) {
4404             float_raise( float_flag_invalid );
4405             if (    ! aSign
4406                  || (    ( aExp == 0x7FFF )
4407                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4408                     )
4409                ) {
4410                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4411             }
4412             return (sbits64) LIT64( 0x8000000000000000 );
4413         }
4414         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4415     }
4416     else {
4417         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4418     }
4419     return roundAndPackInt64( aSign, aSig0, aSig1 );
4420 
4421 }
4422 
4423 /*
4424 -------------------------------------------------------------------------------
4425 Returns the result of converting the quadruple-precision floating-point
4426 value `a' to the 64-bit two's complement integer format.  The conversion
4427 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4428 Arithmetic, except that the conversion is always rounded toward zero.
4429 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4430 the conversion overflows, the largest integer with the same sign as `a' is
4431 returned.
4432 -------------------------------------------------------------------------------
4433 */
4434 int64 float128_to_int64_round_to_zero( float128 a )
4435 {
4436     flag aSign;
4437     int32 aExp, shiftCount;
4438     bits64 aSig0, aSig1;
4439     int64 z;
4440 
4441     aSig1 = extractFloat128Frac1( a );
4442     aSig0 = extractFloat128Frac0( a );
4443     aExp = extractFloat128Exp( a );
4444     aSign = extractFloat128Sign( a );
4445     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4446     shiftCount = aExp - 0x402F;
4447     if ( 0 < shiftCount ) {
4448         if ( 0x403E <= aExp ) {
4449             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4450             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4451                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4452                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4453             }
4454             else {
4455                 float_raise( float_flag_invalid );
4456                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4457                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4458                 }
4459             }
4460             return (sbits64) LIT64( 0x8000000000000000 );
4461         }
4462         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4463         if ( (bits64) ( aSig1<<shiftCount ) ) {
4464             float_exception_flags |= float_flag_inexact;
4465         }
4466     }
4467     else {
4468         if ( aExp < 0x3FFF ) {
4469             if ( aExp | aSig0 | aSig1 ) {
4470                 float_exception_flags |= float_flag_inexact;
4471             }
4472             return 0;
4473         }
4474         z = aSig0>>( - shiftCount );
4475         if (    aSig1
4476              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4477             float_exception_flags |= float_flag_inexact;
4478         }
4479     }
4480     if ( aSign ) z = - z;
4481     return z;
4482 
4483 }
4484 
4485 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4486     && defined(SOFTFLOAT_NEED_FIXUNS)
4487 /*
4488  * just like above - but do not care for overflow of signed results
4489  */
4490 uint64 float128_to_uint64_round_to_zero( float128 a )
4491 {
4492     flag aSign;
4493     int32 aExp, shiftCount;
4494     bits64 aSig0, aSig1;
4495     uint64 z;
4496 
4497     aSig1 = extractFloat128Frac1( a );
4498     aSig0 = extractFloat128Frac0( a );
4499     aExp = extractFloat128Exp( a );
4500     aSign = extractFloat128Sign( a );
4501     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4502     shiftCount = aExp - 0x402F;
4503     if ( 0 < shiftCount ) {
4504         if ( 0x403F <= aExp ) {
4505             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4506             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4507                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4508                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4509             }
4510             else {
4511                 float_raise( float_flag_invalid );
4512             }
4513             return LIT64( 0xFFFFFFFFFFFFFFFF );
4514         }
4515         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4516         if ( (bits64) ( aSig1<<shiftCount ) ) {
4517             float_exception_flags |= float_flag_inexact;
4518         }
4519     }
4520     else {
4521         if ( aExp < 0x3FFF ) {
4522             if ( aExp | aSig0 | aSig1 ) {
4523                 float_exception_flags |= float_flag_inexact;
4524             }
4525             return 0;
4526         }
4527         z = aSig0>>( - shiftCount );
4528         if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4529             float_exception_flags |= float_flag_inexact;
4530         }
4531     }
4532     if ( aSign ) z = - z;
4533     return z;
4534 
4535 }
4536 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4537 
4538 /*
4539 -------------------------------------------------------------------------------
4540 Returns the result of converting the quadruple-precision floating-point
4541 value `a' to the single-precision floating-point format.  The conversion
4542 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4543 Arithmetic.
4544 -------------------------------------------------------------------------------
4545 */
4546 float32 float128_to_float32( float128 a )
4547 {
4548     flag aSign;
4549     int32 aExp;
4550     bits64 aSig0, aSig1;
4551     bits32 zSig;
4552 
4553     aSig1 = extractFloat128Frac1( a );
4554     aSig0 = extractFloat128Frac0( a );
4555     aExp = extractFloat128Exp( a );
4556     aSign = extractFloat128Sign( a );
4557     if ( aExp == 0x7FFF ) {
4558         if ( aSig0 | aSig1 ) {
4559             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4560         }
4561         return packFloat32( aSign, 0xFF, 0 );
4562     }
4563     aSig0 |= ( aSig1 != 0 );
4564     shift64RightJamming( aSig0, 18, &aSig0 );
4565     zSig = aSig0;
4566     if ( aExp || zSig ) {
4567         zSig |= 0x40000000;
4568         aExp -= 0x3F81;
4569     }
4570     return roundAndPackFloat32( aSign, aExp, zSig );
4571 
4572 }
4573 
4574 /*
4575 -------------------------------------------------------------------------------
4576 Returns the result of converting the quadruple-precision floating-point
4577 value `a' to the double-precision floating-point format.  The conversion
4578 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4579 Arithmetic.
4580 -------------------------------------------------------------------------------
4581 */
4582 float64 float128_to_float64( float128 a )
4583 {
4584     flag aSign;
4585     int32 aExp;
4586     bits64 aSig0, aSig1;
4587 
4588     aSig1 = extractFloat128Frac1( a );
4589     aSig0 = extractFloat128Frac0( a );
4590     aExp = extractFloat128Exp( a );
4591     aSign = extractFloat128Sign( a );
4592     if ( aExp == 0x7FFF ) {
4593         if ( aSig0 | aSig1 ) {
4594             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4595         }
4596         return packFloat64( aSign, 0x7FF, 0 );
4597     }
4598     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4599     aSig0 |= ( aSig1 != 0 );
4600     if ( aExp || aSig0 ) {
4601         aSig0 |= LIT64( 0x4000000000000000 );
4602         aExp -= 0x3C01;
4603     }
4604     return roundAndPackFloat64( aSign, aExp, aSig0 );
4605 
4606 }
4607 
4608 #ifdef FLOATX80
4609 
4610 /*
4611 -------------------------------------------------------------------------------
4612 Returns the result of converting the quadruple-precision floating-point
4613 value `a' to the extended double-precision floating-point format.  The
4614 conversion is performed according to the IEC/IEEE Standard for Binary
4615 Floating-Point Arithmetic.
4616 -------------------------------------------------------------------------------
4617 */
4618 floatx80 float128_to_floatx80( float128 a )
4619 {
4620     flag aSign;
4621     int32 aExp;
4622     bits64 aSig0, aSig1;
4623 
4624     aSig1 = extractFloat128Frac1( a );
4625     aSig0 = extractFloat128Frac0( a );
4626     aExp = extractFloat128Exp( a );
4627     aSign = extractFloat128Sign( a );
4628     if ( aExp == 0x7FFF ) {
4629         if ( aSig0 | aSig1 ) {
4630             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4631         }
4632         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4633     }
4634     if ( aExp == 0 ) {
4635         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4636         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4637     }
4638     else {
4639         aSig0 |= LIT64( 0x0001000000000000 );
4640     }
4641     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4642     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4643 
4644 }
4645 
4646 #endif
4647 
4648 /*
4649 -------------------------------------------------------------------------------
4650 Rounds the quadruple-precision floating-point value `a' to an integer, and
4651 returns the result as a quadruple-precision floating-point value.  The
4652 operation is performed according to the IEC/IEEE Standard for Binary
4653 Floating-Point Arithmetic.
4654 -------------------------------------------------------------------------------
4655 */
4656 float128 float128_round_to_int( float128 a )
4657 {
4658     flag aSign;
4659     int32 aExp;
4660     bits64 lastBitMask, roundBitsMask;
4661     int8 roundingMode;
4662     float128 z;
4663 
4664     aExp = extractFloat128Exp( a );
4665     if ( 0x402F <= aExp ) {
4666         if ( 0x406F <= aExp ) {
4667             if (    ( aExp == 0x7FFF )
4668                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4669                ) {
4670                 return propagateFloat128NaN( a, a );
4671             }
4672             return a;
4673         }
4674         lastBitMask = 1;
4675         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4676         roundBitsMask = lastBitMask - 1;
4677         z = a;
4678         roundingMode = float_rounding_mode;
4679         if ( roundingMode == float_round_nearest_even ) {
4680             if ( lastBitMask ) {
4681                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4682                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4683             }
4684             else {
4685                 if ( (sbits64) z.low < 0 ) {
4686                     ++z.high;
4687                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4688                 }
4689             }
4690         }
4691         else if ( roundingMode != float_round_to_zero ) {
4692             if (   extractFloat128Sign( z )
4693                  ^ ( roundingMode == float_round_up ) ) {
4694                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4695             }
4696         }
4697         z.low &= ~ roundBitsMask;
4698     }
4699     else {
4700         if ( aExp < 0x3FFF ) {
4701             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4702             float_exception_flags |= float_flag_inexact;
4703             aSign = extractFloat128Sign( a );
4704             switch ( float_rounding_mode ) {
4705              case float_round_nearest_even:
4706                 if (    ( aExp == 0x3FFE )
4707                      && (   extractFloat128Frac0( a )
4708                           | extractFloat128Frac1( a ) )
4709                    ) {
4710                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4711                 }
4712                 break;
4713 	     case float_round_to_zero:
4714 		break;
4715              case float_round_down:
4716                 return
4717                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4718                     : packFloat128( 0, 0, 0, 0 );
4719              case float_round_up:
4720                 return
4721                       aSign ? packFloat128( 1, 0, 0, 0 )
4722                     : packFloat128( 0, 0x3FFF, 0, 0 );
4723             }
4724             return packFloat128( aSign, 0, 0, 0 );
4725         }
4726         lastBitMask = 1;
4727         lastBitMask <<= 0x402F - aExp;
4728         roundBitsMask = lastBitMask - 1;
4729         z.low = 0;
4730         z.high = a.high;
4731         roundingMode = float_rounding_mode;
4732         if ( roundingMode == float_round_nearest_even ) {
4733             z.high += lastBitMask>>1;
4734             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4735                 z.high &= ~ lastBitMask;
4736             }
4737         }
4738         else if ( roundingMode != float_round_to_zero ) {
4739             if (   extractFloat128Sign( z )
4740                  ^ ( roundingMode == float_round_up ) ) {
4741                 z.high |= ( a.low != 0 );
4742                 z.high += roundBitsMask;
4743             }
4744         }
4745         z.high &= ~ roundBitsMask;
4746     }
4747     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4748         float_exception_flags |= float_flag_inexact;
4749     }
4750     return z;
4751 
4752 }
4753 
4754 /*
4755 -------------------------------------------------------------------------------
4756 Returns the result of adding the absolute values of the quadruple-precision
4757 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4758 before being returned.  `zSign' is ignored if the result is a NaN.
4759 The addition is performed according to the IEC/IEEE Standard for Binary
4760 Floating-Point Arithmetic.
4761 -------------------------------------------------------------------------------
4762 */
4763 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4764 {
4765     int32 aExp, bExp, zExp;
4766     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4767     int32 expDiff;
4768 
4769     aSig1 = extractFloat128Frac1( a );
4770     aSig0 = extractFloat128Frac0( a );
4771     aExp = extractFloat128Exp( a );
4772     bSig1 = extractFloat128Frac1( b );
4773     bSig0 = extractFloat128Frac0( b );
4774     bExp = extractFloat128Exp( b );
4775     expDiff = aExp - bExp;
4776     if ( 0 < expDiff ) {
4777         if ( aExp == 0x7FFF ) {
4778             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4779             return a;
4780         }
4781         if ( bExp == 0 ) {
4782             --expDiff;
4783         }
4784         else {
4785             bSig0 |= LIT64( 0x0001000000000000 );
4786         }
4787         shift128ExtraRightJamming(
4788             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4789         zExp = aExp;
4790     }
4791     else if ( expDiff < 0 ) {
4792         if ( bExp == 0x7FFF ) {
4793             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4794             return packFloat128( zSign, 0x7FFF, 0, 0 );
4795         }
4796         if ( aExp == 0 ) {
4797             ++expDiff;
4798         }
4799         else {
4800             aSig0 |= LIT64( 0x0001000000000000 );
4801         }
4802         shift128ExtraRightJamming(
4803             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4804         zExp = bExp;
4805     }
4806     else {
4807         if ( aExp == 0x7FFF ) {
4808             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4809                 return propagateFloat128NaN( a, b );
4810             }
4811             return a;
4812         }
4813         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4814         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4815         zSig2 = 0;
4816         zSig0 |= LIT64( 0x0002000000000000 );
4817         zExp = aExp;
4818         goto shiftRight1;
4819     }
4820     aSig0 |= LIT64( 0x0001000000000000 );
4821     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4822     --zExp;
4823     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4824     ++zExp;
4825  shiftRight1:
4826     shift128ExtraRightJamming(
4827         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4828  roundAndPack:
4829     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4830 
4831 }
4832 
4833 /*
4834 -------------------------------------------------------------------------------
4835 Returns the result of subtracting the absolute values of the quadruple-
4836 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4837 difference is negated before being returned.  `zSign' is ignored if the
4838 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4839 Standard for Binary Floating-Point Arithmetic.
4840 -------------------------------------------------------------------------------
4841 */
4842 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4843 {
4844     int32 aExp, bExp, zExp;
4845     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4846     int32 expDiff;
4847     float128 z;
4848 
4849     aSig1 = extractFloat128Frac1( a );
4850     aSig0 = extractFloat128Frac0( a );
4851     aExp = extractFloat128Exp( a );
4852     bSig1 = extractFloat128Frac1( b );
4853     bSig0 = extractFloat128Frac0( b );
4854     bExp = extractFloat128Exp( b );
4855     expDiff = aExp - bExp;
4856     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4857     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4858     if ( 0 < expDiff ) goto aExpBigger;
4859     if ( expDiff < 0 ) goto bExpBigger;
4860     if ( aExp == 0x7FFF ) {
4861         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4862             return propagateFloat128NaN( a, b );
4863         }
4864         float_raise( float_flag_invalid );
4865         z.low = float128_default_nan_low;
4866         z.high = float128_default_nan_high;
4867         return z;
4868     }
4869     if ( aExp == 0 ) {
4870         aExp = 1;
4871         bExp = 1;
4872     }
4873     if ( bSig0 < aSig0 ) goto aBigger;
4874     if ( aSig0 < bSig0 ) goto bBigger;
4875     if ( bSig1 < aSig1 ) goto aBigger;
4876     if ( aSig1 < bSig1 ) goto bBigger;
4877     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4878  bExpBigger:
4879     if ( bExp == 0x7FFF ) {
4880         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4881         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4882     }
4883     if ( aExp == 0 ) {
4884         ++expDiff;
4885     }
4886     else {
4887         aSig0 |= LIT64( 0x4000000000000000 );
4888     }
4889     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4890     bSig0 |= LIT64( 0x4000000000000000 );
4891  bBigger:
4892     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4893     zExp = bExp;
4894     zSign ^= 1;
4895     goto normalizeRoundAndPack;
4896  aExpBigger:
4897     if ( aExp == 0x7FFF ) {
4898         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4899         return a;
4900     }
4901     if ( bExp == 0 ) {
4902         --expDiff;
4903     }
4904     else {
4905         bSig0 |= LIT64( 0x4000000000000000 );
4906     }
4907     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4908     aSig0 |= LIT64( 0x4000000000000000 );
4909  aBigger:
4910     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4911     zExp = aExp;
4912  normalizeRoundAndPack:
4913     --zExp;
4914     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4915 
4916 }
4917 
4918 /*
4919 -------------------------------------------------------------------------------
4920 Returns the result of adding the quadruple-precision floating-point values
4921 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4922 for Binary Floating-Point Arithmetic.
4923 -------------------------------------------------------------------------------
4924 */
4925 float128 float128_add( float128 a, float128 b )
4926 {
4927     flag aSign, bSign;
4928 
4929     aSign = extractFloat128Sign( a );
4930     bSign = extractFloat128Sign( b );
4931     if ( aSign == bSign ) {
4932         return addFloat128Sigs( a, b, aSign );
4933     }
4934     else {
4935         return subFloat128Sigs( a, b, aSign );
4936     }
4937 
4938 }
4939 
4940 /*
4941 -------------------------------------------------------------------------------
4942 Returns the result of subtracting the quadruple-precision floating-point
4943 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4944 Standard for Binary Floating-Point Arithmetic.
4945 -------------------------------------------------------------------------------
4946 */
4947 float128 float128_sub( float128 a, float128 b )
4948 {
4949     flag aSign, bSign;
4950 
4951     aSign = extractFloat128Sign( a );
4952     bSign = extractFloat128Sign( b );
4953     if ( aSign == bSign ) {
4954         return subFloat128Sigs( a, b, aSign );
4955     }
4956     else {
4957         return addFloat128Sigs( a, b, aSign );
4958     }
4959 
4960 }
4961 
4962 /*
4963 -------------------------------------------------------------------------------
4964 Returns the result of multiplying the quadruple-precision floating-point
4965 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4966 Standard for Binary Floating-Point Arithmetic.
4967 -------------------------------------------------------------------------------
4968 */
4969 float128 float128_mul( float128 a, float128 b )
4970 {
4971     flag aSign, bSign, zSign;
4972     int32 aExp, bExp, zExp;
4973     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4974     float128 z;
4975 
4976     aSig1 = extractFloat128Frac1( a );
4977     aSig0 = extractFloat128Frac0( a );
4978     aExp = extractFloat128Exp( a );
4979     aSign = extractFloat128Sign( a );
4980     bSig1 = extractFloat128Frac1( b );
4981     bSig0 = extractFloat128Frac0( b );
4982     bExp = extractFloat128Exp( b );
4983     bSign = extractFloat128Sign( b );
4984     zSign = aSign ^ bSign;
4985     if ( aExp == 0x7FFF ) {
4986         if (    ( aSig0 | aSig1 )
4987              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4988             return propagateFloat128NaN( a, b );
4989         }
4990         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4991         return packFloat128( zSign, 0x7FFF, 0, 0 );
4992     }
4993     if ( bExp == 0x7FFF ) {
4994         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4995         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4996  invalid:
4997             float_raise( float_flag_invalid );
4998             z.low = float128_default_nan_low;
4999             z.high = float128_default_nan_high;
5000             return z;
5001         }
5002         return packFloat128( zSign, 0x7FFF, 0, 0 );
5003     }
5004     if ( aExp == 0 ) {
5005         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5006         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5007     }
5008     if ( bExp == 0 ) {
5009         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5010         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5011     }
5012     zExp = aExp + bExp - 0x4000;
5013     aSig0 |= LIT64( 0x0001000000000000 );
5014     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5015     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5016     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5017     zSig2 |= ( zSig3 != 0 );
5018     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5019         shift128ExtraRightJamming(
5020             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5021         ++zExp;
5022     }
5023     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5024 
5025 }
5026 
5027 /*
5028 -------------------------------------------------------------------------------
5029 Returns the result of dividing the quadruple-precision floating-point value
5030 `a' by the corresponding value `b'.  The operation is performed according to
5031 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5032 -------------------------------------------------------------------------------
5033 */
5034 float128 float128_div( float128 a, float128 b )
5035 {
5036     flag aSign, bSign, zSign;
5037     int32 aExp, bExp, zExp;
5038     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5039     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5040     float128 z;
5041 
5042     aSig1 = extractFloat128Frac1( a );
5043     aSig0 = extractFloat128Frac0( a );
5044     aExp = extractFloat128Exp( a );
5045     aSign = extractFloat128Sign( a );
5046     bSig1 = extractFloat128Frac1( b );
5047     bSig0 = extractFloat128Frac0( b );
5048     bExp = extractFloat128Exp( b );
5049     bSign = extractFloat128Sign( b );
5050     zSign = aSign ^ bSign;
5051     if ( aExp == 0x7FFF ) {
5052         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5053         if ( bExp == 0x7FFF ) {
5054             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5055             goto invalid;
5056         }
5057         return packFloat128( zSign, 0x7FFF, 0, 0 );
5058     }
5059     if ( bExp == 0x7FFF ) {
5060         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5061         return packFloat128( zSign, 0, 0, 0 );
5062     }
5063     if ( bExp == 0 ) {
5064         if ( ( bSig0 | bSig1 ) == 0 ) {
5065             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5066  invalid:
5067                 float_raise( float_flag_invalid );
5068                 z.low = float128_default_nan_low;
5069                 z.high = float128_default_nan_high;
5070                 return z;
5071             }
5072             float_raise( float_flag_divbyzero );
5073             return packFloat128( zSign, 0x7FFF, 0, 0 );
5074         }
5075         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5076     }
5077     if ( aExp == 0 ) {
5078         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5079         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5080     }
5081     zExp = aExp - bExp + 0x3FFD;
5082     shortShift128Left(
5083         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5084     shortShift128Left(
5085         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5086     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5087         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5088         ++zExp;
5089     }
5090     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5091     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5092     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5093     while ( (sbits64) rem0 < 0 ) {
5094         --zSig0;
5095         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5096     }
5097     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5098     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5099         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5100         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5101         while ( (sbits64) rem1 < 0 ) {
5102             --zSig1;
5103             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5104         }
5105         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5106     }
5107     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5108     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5109 
5110 }
5111 
5112 /*
5113 -------------------------------------------------------------------------------
5114 Returns the remainder of the quadruple-precision floating-point value `a'
5115 with respect to the corresponding value `b'.  The operation is performed
5116 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5117 -------------------------------------------------------------------------------
5118 */
5119 float128 float128_rem( float128 a, float128 b )
5120 {
5121     flag aSign, bSign, zSign;
5122     int32 aExp, bExp, expDiff;
5123     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5124     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5125     sbits64 sigMean0;
5126     float128 z;
5127 
5128     aSig1 = extractFloat128Frac1( a );
5129     aSig0 = extractFloat128Frac0( a );
5130     aExp = extractFloat128Exp( a );
5131     aSign = extractFloat128Sign( a );
5132     bSig1 = extractFloat128Frac1( b );
5133     bSig0 = extractFloat128Frac0( b );
5134     bExp = extractFloat128Exp( b );
5135     bSign = extractFloat128Sign( b );
5136     if ( aExp == 0x7FFF ) {
5137         if (    ( aSig0 | aSig1 )
5138              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5139             return propagateFloat128NaN( a, b );
5140         }
5141         goto invalid;
5142     }
5143     if ( bExp == 0x7FFF ) {
5144         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5145         return a;
5146     }
5147     if ( bExp == 0 ) {
5148         if ( ( bSig0 | bSig1 ) == 0 ) {
5149  invalid:
5150             float_raise( float_flag_invalid );
5151             z.low = float128_default_nan_low;
5152             z.high = float128_default_nan_high;
5153             return z;
5154         }
5155         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5156     }
5157     if ( aExp == 0 ) {
5158         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5159         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5160     }
5161     expDiff = aExp - bExp;
5162     if ( expDiff < -1 ) return a;
5163     shortShift128Left(
5164         aSig0 | LIT64( 0x0001000000000000 ),
5165         aSig1,
5166         15 - ( expDiff < 0 ),
5167         &aSig0,
5168         &aSig1
5169     );
5170     shortShift128Left(
5171         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5172     q = le128( bSig0, bSig1, aSig0, aSig1 );
5173     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5174     expDiff -= 64;
5175     while ( 0 < expDiff ) {
5176         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5177         q = ( 4 < q ) ? q - 4 : 0;
5178         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5179         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5180         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5181         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5182         expDiff -= 61;
5183     }
5184     if ( -64 < expDiff ) {
5185         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5186         q = ( 4 < q ) ? q - 4 : 0;
5187         q >>= - expDiff;
5188         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5189         expDiff += 52;
5190         if ( expDiff < 0 ) {
5191             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5192         }
5193         else {
5194             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5195         }
5196         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5197         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5198     }
5199     else {
5200         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5201         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5202     }
5203     do {
5204         alternateASig0 = aSig0;
5205         alternateASig1 = aSig1;
5206         ++q;
5207         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5208     } while ( 0 <= (sbits64) aSig0 );
5209     add128(
5210         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5211     if (    ( sigMean0 < 0 )
5212          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5213         aSig0 = alternateASig0;
5214         aSig1 = alternateASig1;
5215     }
5216     zSign = ( (sbits64) aSig0 < 0 );
5217     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5218     return
5219         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5220 
5221 }
5222 
5223 /*
5224 -------------------------------------------------------------------------------
5225 Returns the square root of the quadruple-precision floating-point value `a'.
5226 The operation is performed according to the IEC/IEEE Standard for Binary
5227 Floating-Point Arithmetic.
5228 -------------------------------------------------------------------------------
5229 */
5230 float128 float128_sqrt( float128 a )
5231 {
5232     flag aSign;
5233     int32 aExp, zExp;
5234     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5235     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5236     float128 z;
5237 
5238     aSig1 = extractFloat128Frac1( a );
5239     aSig0 = extractFloat128Frac0( a );
5240     aExp = extractFloat128Exp( a );
5241     aSign = extractFloat128Sign( a );
5242     if ( aExp == 0x7FFF ) {
5243         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5244         if ( ! aSign ) return a;
5245         goto invalid;
5246     }
5247     if ( aSign ) {
5248         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5249  invalid:
5250         float_raise( float_flag_invalid );
5251         z.low = float128_default_nan_low;
5252         z.high = float128_default_nan_high;
5253         return z;
5254     }
5255     if ( aExp == 0 ) {
5256         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5257         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5258     }
5259     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5260     aSig0 |= LIT64( 0x0001000000000000 );
5261     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5262     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5263     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5264     doubleZSig0 = zSig0<<1;
5265     mul64To128( zSig0, zSig0, &term0, &term1 );
5266     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5267     while ( (sbits64) rem0 < 0 ) {
5268         --zSig0;
5269         doubleZSig0 -= 2;
5270         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5271     }
5272     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5273     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5274         if ( zSig1 == 0 ) zSig1 = 1;
5275         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5276         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5277         mul64To128( zSig1, zSig1, &term2, &term3 );
5278         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5279         while ( (sbits64) rem1 < 0 ) {
5280             --zSig1;
5281             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5282             term3 |= 1;
5283             term2 |= doubleZSig0;
5284             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5285         }
5286         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5287     }
5288     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5289     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5290 
5291 }
5292 
5293 /*
5294 -------------------------------------------------------------------------------
5295 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5296 the corresponding value `b', and 0 otherwise.  The comparison is performed
5297 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5298 -------------------------------------------------------------------------------
5299 */
5300 flag float128_eq( float128 a, float128 b )
5301 {
5302 
5303     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5304               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5305          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5306               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5307        ) {
5308         if (    float128_is_signaling_nan( a )
5309              || float128_is_signaling_nan( b ) ) {
5310             float_raise( float_flag_invalid );
5311         }
5312         return 0;
5313     }
5314     return
5315            ( a.low == b.low )
5316         && (    ( a.high == b.high )
5317              || (    ( a.low == 0 )
5318                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5319            );
5320 
5321 }
5322 
5323 /*
5324 -------------------------------------------------------------------------------
5325 Returns 1 if the quadruple-precision floating-point value `a' is less than
5326 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5327 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5328 Arithmetic.
5329 -------------------------------------------------------------------------------
5330 */
5331 flag float128_le( float128 a, float128 b )
5332 {
5333     flag aSign, bSign;
5334 
5335     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5336               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5337          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5338               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5339        ) {
5340         float_raise( float_flag_invalid );
5341         return 0;
5342     }
5343     aSign = extractFloat128Sign( a );
5344     bSign = extractFloat128Sign( b );
5345     if ( aSign != bSign ) {
5346         return
5347                aSign
5348             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5349                  == 0 );
5350     }
5351     return
5352           aSign ? le128( b.high, b.low, a.high, a.low )
5353         : le128( a.high, a.low, b.high, b.low );
5354 
5355 }
5356 
5357 /*
5358 -------------------------------------------------------------------------------
5359 Returns 1 if the quadruple-precision floating-point value `a' is less than
5360 the corresponding value `b', and 0 otherwise.  The comparison is performed
5361 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5362 -------------------------------------------------------------------------------
5363 */
5364 flag float128_lt( float128 a, float128 b )
5365 {
5366     flag aSign, bSign;
5367 
5368     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5369               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5370          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5371               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5372        ) {
5373         float_raise( float_flag_invalid );
5374         return 0;
5375     }
5376     aSign = extractFloat128Sign( a );
5377     bSign = extractFloat128Sign( b );
5378     if ( aSign != bSign ) {
5379         return
5380                aSign
5381             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5382                  != 0 );
5383     }
5384     return
5385           aSign ? lt128( b.high, b.low, a.high, a.low )
5386         : lt128( a.high, a.low, b.high, b.low );
5387 
5388 }
5389 
5390 /*
5391 -------------------------------------------------------------------------------
5392 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5393 the corresponding value `b', and 0 otherwise.  The invalid exception is
5394 raised if either operand is a NaN.  Otherwise, the comparison is performed
5395 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5396 -------------------------------------------------------------------------------
5397 */
5398 flag float128_eq_signaling( float128 a, float128 b )
5399 {
5400 
5401     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5402               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5403          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5404               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5405        ) {
5406         float_raise( float_flag_invalid );
5407         return 0;
5408     }
5409     return
5410            ( a.low == b.low )
5411         && (    ( a.high == b.high )
5412              || (    ( a.low == 0 )
5413                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5414            );
5415 
5416 }
5417 
5418 /*
5419 -------------------------------------------------------------------------------
5420 Returns 1 if the quadruple-precision floating-point value `a' is less than
5421 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5422 cause an exception.  Otherwise, the comparison is performed according to the
5423 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5424 -------------------------------------------------------------------------------
5425 */
5426 flag float128_le_quiet( float128 a, float128 b )
5427 {
5428     flag aSign, bSign;
5429 
5430     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5431               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5432          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5433               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5434        ) {
5435         if (    float128_is_signaling_nan( a )
5436              || float128_is_signaling_nan( b ) ) {
5437             float_raise( float_flag_invalid );
5438         }
5439         return 0;
5440     }
5441     aSign = extractFloat128Sign( a );
5442     bSign = extractFloat128Sign( b );
5443     if ( aSign != bSign ) {
5444         return
5445                aSign
5446             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5447                  == 0 );
5448     }
5449     return
5450           aSign ? le128( b.high, b.low, a.high, a.low )
5451         : le128( a.high, a.low, b.high, b.low );
5452 
5453 }
5454 
5455 /*
5456 -------------------------------------------------------------------------------
5457 Returns 1 if the quadruple-precision floating-point value `a' is less than
5458 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5459 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5460 Standard for Binary Floating-Point Arithmetic.
5461 -------------------------------------------------------------------------------
5462 */
5463 flag float128_lt_quiet( float128 a, float128 b )
5464 {
5465     flag aSign, bSign;
5466 
5467     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5468               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5469          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5470               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5471        ) {
5472         if (    float128_is_signaling_nan( a )
5473              || float128_is_signaling_nan( b ) ) {
5474             float_raise( float_flag_invalid );
5475         }
5476         return 0;
5477     }
5478     aSign = extractFloat128Sign( a );
5479     bSign = extractFloat128Sign( b );
5480     if ( aSign != bSign ) {
5481         return
5482                aSign
5483             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5484                  != 0 );
5485     }
5486     return
5487           aSign ? lt128( b.high, b.low, a.high, a.low )
5488         : lt128( a.high, a.low, b.high, b.low );
5489 
5490 }
5491 
5492 #endif
5493 
5494 
5495 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5496 
5497 /*
5498  * These two routines are not part of the original softfloat distribution.
5499  *
5500  * They are based on the corresponding conversions to integer but return
5501  * unsigned numbers instead since these functions are required by GCC.
5502  *
5503  * Added by Mark Brinicombe <mark@NetBSD.org>	27/09/97
5504  *
5505  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5506  */
5507 
5508 /*
5509 -------------------------------------------------------------------------------
5510 Returns the result of converting the double-precision floating-point value
5511 `a' to the 32-bit unsigned integer format.  The conversion is
5512 performed according to the IEC/IEEE Standard for Binary Floating-point
5513 Arithmetic, except that the conversion is always rounded toward zero.  If
5514 `a' is a NaN, the largest positive integer is returned.  If the conversion
5515 overflows, the largest integer positive is returned.
5516 -------------------------------------------------------------------------------
5517 */
5518 uint32 float64_to_uint32_round_to_zero( float64 a )
5519 {
5520     flag aSign;
5521     int16 aExp, shiftCount;
5522     bits64 aSig, savedASig;
5523     uint32 z;
5524 
5525     aSig = extractFloat64Frac( a );
5526     aExp = extractFloat64Exp( a );
5527     aSign = extractFloat64Sign( a );
5528 
5529     if (aSign) {
5530         float_raise( float_flag_invalid );
5531     	return(0);
5532     }
5533 
5534     if ( 0x41E < aExp ) {
5535         float_raise( float_flag_invalid );
5536         return 0xffffffff;
5537     }
5538     else if ( aExp < 0x3FF ) {
5539         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5540         return 0;
5541     }
5542     aSig |= LIT64( 0x0010000000000000 );
5543     shiftCount = 0x433 - aExp;
5544     savedASig = aSig;
5545     aSig >>= shiftCount;
5546     z = aSig;
5547     if ( ( aSig<<shiftCount ) != savedASig ) {
5548         float_exception_flags |= float_flag_inexact;
5549     }
5550     return z;
5551 
5552 }
5553 
5554 /*
5555 -------------------------------------------------------------------------------
5556 Returns the result of converting the single-precision floating-point value
5557 `a' to the 32-bit unsigned integer format.  The conversion is
5558 performed according to the IEC/IEEE Standard for Binary Floating-point
5559 Arithmetic, except that the conversion is always rounded toward zero.  If
5560 `a' is a NaN, the largest positive integer is returned.  If the conversion
5561 overflows, the largest positive integer is returned.
5562 -------------------------------------------------------------------------------
5563 */
5564 uint32 float32_to_uint32_round_to_zero( float32 a )
5565 {
5566     flag aSign;
5567     int16 aExp, shiftCount;
5568     bits32 aSig;
5569     uint32 z;
5570 
5571     aSig = extractFloat32Frac( a );
5572     aExp = extractFloat32Exp( a );
5573     aSign = extractFloat32Sign( a );
5574     shiftCount = aExp - 0x9E;
5575 
5576     if (aSign) {
5577         float_raise( float_flag_invalid );
5578     	return(0);
5579     }
5580     if ( 0 < shiftCount ) {
5581         float_raise( float_flag_invalid );
5582         return 0xFFFFFFFF;
5583     }
5584     else if ( aExp <= 0x7E ) {
5585         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5586         return 0;
5587     }
5588     aSig = ( aSig | 0x800000 )<<8;
5589     z = aSig>>( - shiftCount );
5590     if ( aSig<<( shiftCount & 31 ) ) {
5591         float_exception_flags |= float_flag_inexact;
5592     }
5593     return z;
5594 
5595 }
5596 
5597 #endif
5598