xref: /netbsd/sys/lib/libkern/softfloat.c (revision bf9ec67e)
1 /* $NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $ */
2 
3 /*
4  * This version hacked for use with gcc -msoft-float by bjh21.
5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6  *  itself).
7  */
8 
9 /*
10  * Things you may want to define:
11  *
12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14  *   properly renamed.
15  */
16 
17 /*
18 ===============================================================================
19 
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 #include <sys/cdefs.h>
48 #if defined(LIBC_SCCS) && !defined(lint)
49 __RCSID("$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $");
50 #endif /* LIBC_SCCS and not lint */
51 
52 #ifdef SOFTFLOAT_FOR_GCC
53 #include "softfloat-for-gcc.h"
54 #endif
55 
56 #include "milieu.h"
57 #include "softfloat.h"
58 
59 /*
60  * Conversions between floats as stored in memory and floats as
61  * SoftFloat uses them
62  */
63 #ifndef FLOAT64_DEMANGLE
64 #define FLOAT64_DEMANGLE(a)	(a)
65 #endif
66 #ifndef FLOAT64_MANGLE
67 #define FLOAT64_MANGLE(a)	(a)
68 #endif
69 
70 /*
71 -------------------------------------------------------------------------------
72 Floating-point rounding mode, extended double-precision rounding precision,
73 and exception flags.
74 -------------------------------------------------------------------------------
75 */
76 
77 /*
78  * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
79  * 	Right now, it does not.  I've removed all other dynamic global
80  * 	variables. [ross]
81  */
82 #ifdef FLOATX80
83 int8 floatx80_rounding_precision = 80;
84 #endif
85 
86 /*
87 -------------------------------------------------------------------------------
88 Primitive arithmetic functions, including multi-word arithmetic, and
89 division and square root approximations.  (Can be specialized to target if
90 desired.)
91 -------------------------------------------------------------------------------
92 */
93 #include "softfloat-macros.h"
94 
95 /*
96 -------------------------------------------------------------------------------
97 Functions and definitions to determine:  (1) whether tininess for underflow
98 is detected before or after rounding by default, (2) what (if anything)
99 happens when exceptions are raised, (3) how signaling NaNs are distinguished
100 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
101 are propagated from function inputs to output.  These details are target-
102 specific.
103 -------------------------------------------------------------------------------
104 */
105 #include "softfloat-specialize.h"
106 
107 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
108 /*
109 -------------------------------------------------------------------------------
110 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
111 and 7, and returns the properly rounded 32-bit integer corresponding to the
112 input.  If `zSign' is 1, the input is negated before being converted to an
113 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
114 is simply rounded to an integer, with the inexact exception raised if the
115 input cannot be represented exactly as an integer.  However, if the fixed-
116 point input is too large, the invalid exception is raised and the largest
117 positive or negative integer is returned.
118 -------------------------------------------------------------------------------
119 */
120 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
121 {
122     int8 roundingMode;
123     flag roundNearestEven;
124     int8 roundIncrement, roundBits;
125     int32 z;
126 
127     roundingMode = float_rounding_mode();
128     roundNearestEven = ( roundingMode == float_round_nearest_even );
129     roundIncrement = 0x40;
130     if ( ! roundNearestEven ) {
131         if ( roundingMode == float_round_to_zero ) {
132             roundIncrement = 0;
133         }
134         else {
135             roundIncrement = 0x7F;
136             if ( zSign ) {
137                 if ( roundingMode == float_round_up ) roundIncrement = 0;
138             }
139             else {
140                 if ( roundingMode == float_round_down ) roundIncrement = 0;
141             }
142         }
143     }
144     roundBits = absZ & 0x7F;
145     absZ = ( absZ + roundIncrement )>>7;
146     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
147     z = absZ;
148     if ( zSign ) z = - z;
149     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
150         float_raise( float_flag_invalid );
151         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
152     }
153     if ( roundBits ) float_set_inexact();
154     return z;
155 
156 }
157 
158 /*
159 -------------------------------------------------------------------------------
160 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
161 `absZ1', with binary point between bits 63 and 64 (between the input words),
162 and returns the properly rounded 64-bit integer corresponding to the input.
163 If `zSign' is 1, the input is negated before being converted to an integer.
164 Ordinarily, the fixed-point input is simply rounded to an integer, with
165 the inexact exception raised if the input cannot be represented exactly as
166 an integer.  However, if the fixed-point input is too large, the invalid
167 exception is raised and the largest positive or negative integer is
168 returned.
169 -------------------------------------------------------------------------------
170 */
171 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
172 {
173     int8 roundingMode;
174     flag roundNearestEven, increment;
175     int64 z;
176 
177     roundingMode = float_rounding_mode();
178     roundNearestEven = ( roundingMode == float_round_nearest_even );
179     increment = ( (sbits64) absZ1 < 0 );
180     if ( ! roundNearestEven ) {
181         if ( roundingMode == float_round_to_zero ) {
182             increment = 0;
183         }
184         else {
185             if ( zSign ) {
186                 increment = ( roundingMode == float_round_down ) && absZ1;
187             }
188             else {
189                 increment = ( roundingMode == float_round_up ) && absZ1;
190             }
191         }
192     }
193     if ( increment ) {
194         ++absZ0;
195         if ( absZ0 == 0 ) goto overflow;
196         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
197     }
198     z = absZ0;
199     if ( zSign ) z = - z;
200     if ( z && ( ( z < 0 ) ^ zSign ) ) {
201  overflow:
202         float_raise( float_flag_invalid );
203         return
204               zSign ? (sbits64) LIT64( 0x8000000000000000 )
205             : LIT64( 0x7FFFFFFFFFFFFFFF );
206     }
207     if ( absZ1 ) float_set_inexact();
208     return z;
209 
210 }
211 #endif
212 
213 /*
214 -------------------------------------------------------------------------------
215 Returns the fraction bits of the single-precision floating-point value `a'.
216 -------------------------------------------------------------------------------
217 */
218 INLINE bits32 extractFloat32Frac( float32 a )
219 {
220 
221     return a & 0x007FFFFF;
222 
223 }
224 
225 /*
226 -------------------------------------------------------------------------------
227 Returns the exponent bits of the single-precision floating-point value `a'.
228 -------------------------------------------------------------------------------
229 */
230 INLINE int16 extractFloat32Exp( float32 a )
231 {
232 
233     return ( a>>23 ) & 0xFF;
234 
235 }
236 
237 /*
238 -------------------------------------------------------------------------------
239 Returns the sign bit of the single-precision floating-point value `a'.
240 -------------------------------------------------------------------------------
241 */
242 INLINE flag extractFloat32Sign( float32 a )
243 {
244 
245     return a>>31;
246 
247 }
248 
249 /*
250 -------------------------------------------------------------------------------
251 Normalizes the subnormal single-precision floating-point value represented
252 by the denormalized significand `aSig'.  The normalized exponent and
253 significand are stored at the locations pointed to by `zExpPtr' and
254 `zSigPtr', respectively.
255 -------------------------------------------------------------------------------
256 */
257 static void
258  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
259 {
260     int8 shiftCount;
261 
262     shiftCount = countLeadingZeros32( aSig ) - 8;
263     *zSigPtr = aSig<<shiftCount;
264     *zExpPtr = 1 - shiftCount;
265 
266 }
267 
268 /*
269 -------------------------------------------------------------------------------
270 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
271 single-precision floating-point value, returning the result.  After being
272 shifted into the proper positions, the three fields are simply added
273 together to form the result.  This means that any integer portion of `zSig'
274 will be added into the exponent.  Since a properly normalized significand
275 will have an integer portion equal to 1, the `zExp' input should be 1 less
276 than the desired result exponent whenever `zSig' is a complete, normalized
277 significand.
278 -------------------------------------------------------------------------------
279 */
280 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
281 {
282 
283     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
284 
285 }
286 
287 /*
288 -------------------------------------------------------------------------------
289 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
290 and significand `zSig', and returns the proper single-precision floating-
291 point value corresponding to the abstract input.  Ordinarily, the abstract
292 value is simply rounded and packed into the single-precision format, with
293 the inexact exception raised if the abstract input cannot be represented
294 exactly.  However, if the abstract value is too large, the overflow and
295 inexact exceptions are raised and an infinity or maximal finite value is
296 returned.  If the abstract value is too small, the input value is rounded to
297 a subnormal number, and the underflow and inexact exceptions are raised if
298 the abstract input cannot be represented exactly as a subnormal single-
299 precision floating-point number.
300     The input significand `zSig' has its binary point between bits 30
301 and 29, which is 7 bits to the left of the usual location.  This shifted
302 significand must be normalized or smaller.  If `zSig' is not normalized,
303 `zExp' must be 0; in that case, the result returned is a subnormal number,
304 and it must not require rounding.  In the usual case that `zSig' is
305 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
306 The handling of underflow and overflow follows the IEC/IEEE Standard for
307 Binary Floating-Point Arithmetic.
308 -------------------------------------------------------------------------------
309 */
310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
311 {
312     int8 roundingMode;
313     flag roundNearestEven;
314     int8 roundIncrement, roundBits;
315     flag isTiny;
316 
317     roundingMode = float_rounding_mode();
318     roundNearestEven = ( roundingMode == float_round_nearest_even );
319     roundIncrement = 0x40;
320     if ( ! roundNearestEven ) {
321         if ( roundingMode == float_round_to_zero ) {
322             roundIncrement = 0;
323         }
324         else {
325             roundIncrement = 0x7F;
326             if ( zSign ) {
327                 if ( roundingMode == float_round_up ) roundIncrement = 0;
328             }
329             else {
330                 if ( roundingMode == float_round_down ) roundIncrement = 0;
331             }
332         }
333     }
334     roundBits = zSig & 0x7F;
335     if ( 0xFD <= (bits16) zExp ) {
336         if (    ( 0xFD < zExp )
337              || (    ( zExp == 0xFD )
338                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
339            ) {
340             float_raise( float_flag_overflow | float_flag_inexact );
341             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
342         }
343         if ( zExp < 0 ) {
344             isTiny =
345                    ( float_detect_tininess == float_tininess_before_rounding )
346                 || ( zExp < -1 )
347                 || ( zSig + roundIncrement < 0x80000000 );
348             shift32RightJamming( zSig, - zExp, &zSig );
349             zExp = 0;
350             roundBits = zSig & 0x7F;
351             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
352         }
353     }
354     if ( roundBits ) float_set_inexact();
355     zSig = ( zSig + roundIncrement )>>7;
356     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
357     if ( zSig == 0 ) zExp = 0;
358     return packFloat32( zSign, zExp, zSig );
359 
360 }
361 
362 /*
363 -------------------------------------------------------------------------------
364 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
365 and significand `zSig', and returns the proper single-precision floating-
366 point value corresponding to the abstract input.  This routine is just like
367 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
368 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
369 floating-point exponent.
370 -------------------------------------------------------------------------------
371 */
372 static float32
373  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
374 {
375     int8 shiftCount;
376 
377     shiftCount = countLeadingZeros32( zSig ) - 1;
378     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
379 
380 }
381 
382 /*
383 -------------------------------------------------------------------------------
384 Returns the fraction bits of the double-precision floating-point value `a'.
385 -------------------------------------------------------------------------------
386 */
387 INLINE bits64 extractFloat64Frac( float64 a )
388 {
389 
390     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
391 
392 }
393 
394 /*
395 -------------------------------------------------------------------------------
396 Returns the exponent bits of the double-precision floating-point value `a'.
397 -------------------------------------------------------------------------------
398 */
399 INLINE int16 extractFloat64Exp( float64 a )
400 {
401 
402     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
403 
404 }
405 
406 /*
407 -------------------------------------------------------------------------------
408 Returns the sign bit of the double-precision floating-point value `a'.
409 -------------------------------------------------------------------------------
410 */
411 INLINE flag extractFloat64Sign( float64 a )
412 {
413 
414     return FLOAT64_DEMANGLE(a)>>63;
415 
416 }
417 
418 /*
419 -------------------------------------------------------------------------------
420 Normalizes the subnormal double-precision floating-point value represented
421 by the denormalized significand `aSig'.  The normalized exponent and
422 significand are stored at the locations pointed to by `zExpPtr' and
423 `zSigPtr', respectively.
424 -------------------------------------------------------------------------------
425 */
426 static void
427  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
428 {
429     int8 shiftCount;
430 
431     shiftCount = countLeadingZeros64( aSig ) - 11;
432     *zSigPtr = aSig<<shiftCount;
433     *zExpPtr = 1 - shiftCount;
434 
435 }
436 
437 /*
438 -------------------------------------------------------------------------------
439 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
440 double-precision floating-point value, returning the result.  After being
441 shifted into the proper positions, the three fields are simply added
442 together to form the result.  This means that any integer portion of `zSig'
443 will be added into the exponent.  Since a properly normalized significand
444 will have an integer portion equal to 1, the `zExp' input should be 1 less
445 than the desired result exponent whenever `zSig' is a complete, normalized
446 significand.
447 -------------------------------------------------------------------------------
448 */
449 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
450 {
451 
452     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
453 			   ( ( (bits64) zExp )<<52 ) + zSig );
454 
455 }
456 
457 /*
458 -------------------------------------------------------------------------------
459 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
460 and significand `zSig', and returns the proper double-precision floating-
461 point value corresponding to the abstract input.  Ordinarily, the abstract
462 value is simply rounded and packed into the double-precision format, with
463 the inexact exception raised if the abstract input cannot be represented
464 exactly.  However, if the abstract value is too large, the overflow and
465 inexact exceptions are raised and an infinity or maximal finite value is
466 returned.  If the abstract value is too small, the input value is rounded to
467 a subnormal number, and the underflow and inexact exceptions are raised if
468 the abstract input cannot be represented exactly as a subnormal double-
469 precision floating-point number.
470     The input significand `zSig' has its binary point between bits 62
471 and 61, which is 10 bits to the left of the usual location.  This shifted
472 significand must be normalized or smaller.  If `zSig' is not normalized,
473 `zExp' must be 0; in that case, the result returned is a subnormal number,
474 and it must not require rounding.  In the usual case that `zSig' is
475 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
476 The handling of underflow and overflow follows the IEC/IEEE Standard for
477 Binary Floating-Point Arithmetic.
478 -------------------------------------------------------------------------------
479 */
480 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
481 {
482     int8 roundingMode;
483     flag roundNearestEven;
484     int16 roundIncrement, roundBits;
485     flag isTiny;
486 
487     roundingMode = float_rounding_mode();
488     roundNearestEven = ( roundingMode == float_round_nearest_even );
489     roundIncrement = 0x200;
490     if ( ! roundNearestEven ) {
491         if ( roundingMode == float_round_to_zero ) {
492             roundIncrement = 0;
493         }
494         else {
495             roundIncrement = 0x3FF;
496             if ( zSign ) {
497                 if ( roundingMode == float_round_up ) roundIncrement = 0;
498             }
499             else {
500                 if ( roundingMode == float_round_down ) roundIncrement = 0;
501             }
502         }
503     }
504     roundBits = zSig & 0x3FF;
505     if ( 0x7FD <= (bits16) zExp ) {
506         if (    ( 0x7FD < zExp )
507              || (    ( zExp == 0x7FD )
508                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
509            ) {
510             float_raise( float_flag_overflow | float_flag_inexact );
511             return FLOAT64_MANGLE(
512 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
513 		( roundIncrement == 0 ));
514         }
515         if ( zExp < 0 ) {
516             isTiny =
517                    ( float_detect_tininess == float_tininess_before_rounding )
518                 || ( zExp < -1 )
519                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
520             shift64RightJamming( zSig, - zExp, &zSig );
521             zExp = 0;
522             roundBits = zSig & 0x3FF;
523             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
524         }
525     }
526     if ( roundBits ) float_set_inexact();
527     zSig = ( zSig + roundIncrement )>>10;
528     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
529     if ( zSig == 0 ) zExp = 0;
530     return packFloat64( zSign, zExp, zSig );
531 
532 }
533 
534 /*
535 -------------------------------------------------------------------------------
536 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
537 and significand `zSig', and returns the proper double-precision floating-
538 point value corresponding to the abstract input.  This routine is just like
539 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
540 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
541 floating-point exponent.
542 -------------------------------------------------------------------------------
543 */
544 static float64
545  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
546 {
547     int8 shiftCount;
548 
549     shiftCount = countLeadingZeros64( zSig ) - 1;
550     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
551 
552 }
553 
554 #ifdef FLOATX80
555 
556 /*
557 -------------------------------------------------------------------------------
558 Returns the fraction bits of the extended double-precision floating-point
559 value `a'.
560 -------------------------------------------------------------------------------
561 */
562 INLINE bits64 extractFloatx80Frac( floatx80 a )
563 {
564 
565     return a.low;
566 
567 }
568 
569 /*
570 -------------------------------------------------------------------------------
571 Returns the exponent bits of the extended double-precision floating-point
572 value `a'.
573 -------------------------------------------------------------------------------
574 */
575 INLINE int32 extractFloatx80Exp( floatx80 a )
576 {
577 
578     return a.high & 0x7FFF;
579 
580 }
581 
582 /*
583 -------------------------------------------------------------------------------
584 Returns the sign bit of the extended double-precision floating-point value
585 `a'.
586 -------------------------------------------------------------------------------
587 */
588 INLINE flag extractFloatx80Sign( floatx80 a )
589 {
590 
591     return a.high>>15;
592 
593 }
594 
595 /*
596 -------------------------------------------------------------------------------
597 Normalizes the subnormal extended double-precision floating-point value
598 represented by the denormalized significand `aSig'.  The normalized exponent
599 and significand are stored at the locations pointed to by `zExpPtr' and
600 `zSigPtr', respectively.
601 -------------------------------------------------------------------------------
602 */
603 static void
604  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
605 {
606     int8 shiftCount;
607 
608     shiftCount = countLeadingZeros64( aSig );
609     *zSigPtr = aSig<<shiftCount;
610     *zExpPtr = 1 - shiftCount;
611 
612 }
613 
614 /*
615 -------------------------------------------------------------------------------
616 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
617 extended double-precision floating-point value, returning the result.
618 -------------------------------------------------------------------------------
619 */
620 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
621 {
622     floatx80 z;
623 
624     z.low = zSig;
625     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
626     return z;
627 
628 }
629 
630 /*
631 -------------------------------------------------------------------------------
632 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
633 and extended significand formed by the concatenation of `zSig0' and `zSig1',
634 and returns the proper extended double-precision floating-point value
635 corresponding to the abstract input.  Ordinarily, the abstract value is
636 rounded and packed into the extended double-precision format, with the
637 inexact exception raised if the abstract input cannot be represented
638 exactly.  However, if the abstract value is too large, the overflow and
639 inexact exceptions are raised and an infinity or maximal finite value is
640 returned.  If the abstract value is too small, the input value is rounded to
641 a subnormal number, and the underflow and inexact exceptions are raised if
642 the abstract input cannot be represented exactly as a subnormal extended
643 double-precision floating-point number.
644     If `roundingPrecision' is 32 or 64, the result is rounded to the same
645 number of bits as single or double precision, respectively.  Otherwise, the
646 result is rounded to the full precision of the extended double-precision
647 format.
648     The input significand must be normalized or smaller.  If the input
649 significand is not normalized, `zExp' must be 0; in that case, the result
650 returned is a subnormal number, and it must not require rounding.  The
651 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
652 Floating-Point Arithmetic.
653 -------------------------------------------------------------------------------
654 */
655 static floatx80
656  roundAndPackFloatx80(
657      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
658  )
659 {
660     int8 roundingMode;
661     flag roundNearestEven, increment, isTiny;
662     int64 roundIncrement, roundMask, roundBits;
663 
664     roundingMode = float_rounding_mode();
665     roundNearestEven = ( roundingMode == float_round_nearest_even );
666     if ( roundingPrecision == 80 ) goto precision80;
667     if ( roundingPrecision == 64 ) {
668         roundIncrement = LIT64( 0x0000000000000400 );
669         roundMask = LIT64( 0x00000000000007FF );
670     }
671     else if ( roundingPrecision == 32 ) {
672         roundIncrement = LIT64( 0x0000008000000000 );
673         roundMask = LIT64( 0x000000FFFFFFFFFF );
674     }
675     else {
676         goto precision80;
677     }
678     zSig0 |= ( zSig1 != 0 );
679     if ( ! roundNearestEven ) {
680         if ( roundingMode == float_round_to_zero ) {
681             roundIncrement = 0;
682         }
683         else {
684             roundIncrement = roundMask;
685             if ( zSign ) {
686                 if ( roundingMode == float_round_up ) roundIncrement = 0;
687             }
688             else {
689                 if ( roundingMode == float_round_down ) roundIncrement = 0;
690             }
691         }
692     }
693     roundBits = zSig0 & roundMask;
694     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
695         if (    ( 0x7FFE < zExp )
696              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
697            ) {
698             goto overflow;
699         }
700         if ( zExp <= 0 ) {
701             isTiny =
702                    ( float_detect_tininess == float_tininess_before_rounding )
703                 || ( zExp < 0 )
704                 || ( zSig0 <= zSig0 + roundIncrement );
705             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
706             zExp = 0;
707             roundBits = zSig0 & roundMask;
708             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
709             if ( roundBits ) float_set_inexact();
710             zSig0 += roundIncrement;
711             if ( (sbits64) zSig0 < 0 ) zExp = 1;
712             roundIncrement = roundMask + 1;
713             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
714                 roundMask |= roundIncrement;
715             }
716             zSig0 &= ~ roundMask;
717             return packFloatx80( zSign, zExp, zSig0 );
718         }
719     }
720     if ( roundBits ) float_set_inexact();
721     zSig0 += roundIncrement;
722     if ( zSig0 < roundIncrement ) {
723         ++zExp;
724         zSig0 = LIT64( 0x8000000000000000 );
725     }
726     roundIncrement = roundMask + 1;
727     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
728         roundMask |= roundIncrement;
729     }
730     zSig0 &= ~ roundMask;
731     if ( zSig0 == 0 ) zExp = 0;
732     return packFloatx80( zSign, zExp, zSig0 );
733  precision80:
734     increment = ( (sbits64) zSig1 < 0 );
735     if ( ! roundNearestEven ) {
736         if ( roundingMode == float_round_to_zero ) {
737             increment = 0;
738         }
739         else {
740             if ( zSign ) {
741                 increment = ( roundingMode == float_round_down ) && zSig1;
742             }
743             else {
744                 increment = ( roundingMode == float_round_up ) && zSig1;
745             }
746         }
747     }
748     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
749         if (    ( 0x7FFE < zExp )
750              || (    ( zExp == 0x7FFE )
751                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
752                   && increment
753                 )
754            ) {
755             roundMask = 0;
756  overflow:
757             float_raise( float_flag_overflow | float_flag_inexact );
758             if (    ( roundingMode == float_round_to_zero )
759                  || ( zSign && ( roundingMode == float_round_up ) )
760                  || ( ! zSign && ( roundingMode == float_round_down ) )
761                ) {
762                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
763             }
764             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
765         }
766         if ( zExp <= 0 ) {
767             isTiny =
768                    ( float_detect_tininess == float_tininess_before_rounding )
769                 || ( zExp < 0 )
770                 || ! increment
771                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
772             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
773             zExp = 0;
774             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
775             if ( zSig1 ) float_set_inexact();
776             if ( roundNearestEven ) {
777                 increment = ( (sbits64) zSig1 < 0 );
778             }
779             else {
780                 if ( zSign ) {
781                     increment = ( roundingMode == float_round_down ) && zSig1;
782                 }
783                 else {
784                     increment = ( roundingMode == float_round_up ) && zSig1;
785                 }
786             }
787             if ( increment ) {
788                 ++zSig0;
789                 zSig0 &=
790                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
791                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
792             }
793             return packFloatx80( zSign, zExp, zSig0 );
794         }
795     }
796     if ( zSig1 ) float_set_inexact();
797     if ( increment ) {
798         ++zSig0;
799         if ( zSig0 == 0 ) {
800             ++zExp;
801             zSig0 = LIT64( 0x8000000000000000 );
802         }
803         else {
804             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
805         }
806     }
807     else {
808         if ( zSig0 == 0 ) zExp = 0;
809     }
810     return packFloatx80( zSign, zExp, zSig0 );
811 
812 }
813 
814 /*
815 -------------------------------------------------------------------------------
816 Takes an abstract floating-point value having sign `zSign', exponent
817 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
818 and returns the proper extended double-precision floating-point value
819 corresponding to the abstract input.  This routine is just like
820 `roundAndPackFloatx80' except that the input significand does not have to be
821 normalized.
822 -------------------------------------------------------------------------------
823 */
824 static floatx80
825  normalizeRoundAndPackFloatx80(
826      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
827  )
828 {
829     int8 shiftCount;
830 
831     if ( zSig0 == 0 ) {
832         zSig0 = zSig1;
833         zSig1 = 0;
834         zExp -= 64;
835     }
836     shiftCount = countLeadingZeros64( zSig0 );
837     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
838     zExp -= shiftCount;
839     return
840         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
841 
842 }
843 
844 #endif
845 
846 #ifdef FLOAT128
847 
848 /*
849 -------------------------------------------------------------------------------
850 Returns the least-significant 64 fraction bits of the quadruple-precision
851 floating-point value `a'.
852 -------------------------------------------------------------------------------
853 */
854 INLINE bits64 extractFloat128Frac1( float128 a )
855 {
856 
857     return a.low;
858 
859 }
860 
861 /*
862 -------------------------------------------------------------------------------
863 Returns the most-significant 48 fraction bits of the quadruple-precision
864 floating-point value `a'.
865 -------------------------------------------------------------------------------
866 */
867 INLINE bits64 extractFloat128Frac0( float128 a )
868 {
869 
870     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
871 
872 }
873 
874 /*
875 -------------------------------------------------------------------------------
876 Returns the exponent bits of the quadruple-precision floating-point value
877 `a'.
878 -------------------------------------------------------------------------------
879 */
880 INLINE int32 extractFloat128Exp( float128 a )
881 {
882 
883     return ( a.high>>48 ) & 0x7FFF;
884 
885 }
886 
887 /*
888 -------------------------------------------------------------------------------
889 Returns the sign bit of the quadruple-precision floating-point value `a'.
890 -------------------------------------------------------------------------------
891 */
892 INLINE flag extractFloat128Sign( float128 a )
893 {
894 
895     return a.high>>63;
896 
897 }
898 
899 /*
900 -------------------------------------------------------------------------------
901 Normalizes the subnormal quadruple-precision floating-point value
902 represented by the denormalized significand formed by the concatenation of
903 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
904 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
905 significand are stored at the location pointed to by `zSig0Ptr', and the
906 least significant 64 bits of the normalized significand are stored at the
907 location pointed to by `zSig1Ptr'.
908 -------------------------------------------------------------------------------
909 */
910 static void
911  normalizeFloat128Subnormal(
912      bits64 aSig0,
913      bits64 aSig1,
914      int32 *zExpPtr,
915      bits64 *zSig0Ptr,
916      bits64 *zSig1Ptr
917  )
918 {
919     int8 shiftCount;
920 
921     if ( aSig0 == 0 ) {
922         shiftCount = countLeadingZeros64( aSig1 ) - 15;
923         if ( shiftCount < 0 ) {
924             *zSig0Ptr = aSig1>>( - shiftCount );
925             *zSig1Ptr = aSig1<<( shiftCount & 63 );
926         }
927         else {
928             *zSig0Ptr = aSig1<<shiftCount;
929             *zSig1Ptr = 0;
930         }
931         *zExpPtr = - shiftCount - 63;
932     }
933     else {
934         shiftCount = countLeadingZeros64( aSig0 ) - 15;
935         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
936         *zExpPtr = 1 - shiftCount;
937     }
938 
939 }
940 
941 /*
942 -------------------------------------------------------------------------------
943 Packs the sign `zSign', the exponent `zExp', and the significand formed
944 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
945 floating-point value, returning the result.  After being shifted into the
946 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
947 added together to form the most significant 32 bits of the result.  This
948 means that any integer portion of `zSig0' will be added into the exponent.
949 Since a properly normalized significand will have an integer portion equal
950 to 1, the `zExp' input should be 1 less than the desired result exponent
951 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
952 significand.
953 -------------------------------------------------------------------------------
954 */
955 INLINE float128
956  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
957 {
958     float128 z;
959 
960     z.low = zSig1;
961     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
962     return z;
963 
964 }
965 
966 /*
967 -------------------------------------------------------------------------------
968 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
969 and extended significand formed by the concatenation of `zSig0', `zSig1',
970 and `zSig2', and returns the proper quadruple-precision floating-point value
971 corresponding to the abstract input.  Ordinarily, the abstract value is
972 simply rounded and packed into the quadruple-precision format, with the
973 inexact exception raised if the abstract input cannot be represented
974 exactly.  However, if the abstract value is too large, the overflow and
975 inexact exceptions are raised and an infinity or maximal finite value is
976 returned.  If the abstract value is too small, the input value is rounded to
977 a subnormal number, and the underflow and inexact exceptions are raised if
978 the abstract input cannot be represented exactly as a subnormal quadruple-
979 precision floating-point number.
980     The input significand must be normalized or smaller.  If the input
981 significand is not normalized, `zExp' must be 0; in that case, the result
982 returned is a subnormal number, and it must not require rounding.  In the
983 usual case that the input significand is normalized, `zExp' must be 1 less
984 than the ``true'' floating-point exponent.  The handling of underflow and
985 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
986 -------------------------------------------------------------------------------
987 */
988 static float128
989  roundAndPackFloat128(
990      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
991 {
992     int8 roundingMode;
993     flag roundNearestEven, increment, isTiny;
994 
995     roundingMode = float_rounding_mode();
996     roundNearestEven = ( roundingMode == float_round_nearest_even );
997     increment = ( (sbits64) zSig2 < 0 );
998     if ( ! roundNearestEven ) {
999         if ( roundingMode == float_round_to_zero ) {
1000             increment = 0;
1001         }
1002         else {
1003             if ( zSign ) {
1004                 increment = ( roundingMode == float_round_down ) && zSig2;
1005             }
1006             else {
1007                 increment = ( roundingMode == float_round_up ) && zSig2;
1008             }
1009         }
1010     }
1011     if ( 0x7FFD <= (bits32) zExp ) {
1012         if (    ( 0x7FFD < zExp )
1013              || (    ( zExp == 0x7FFD )
1014                   && eq128(
1015                          LIT64( 0x0001FFFFFFFFFFFF ),
1016                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1017                          zSig0,
1018                          zSig1
1019                      )
1020                   && increment
1021                 )
1022            ) {
1023             float_raise( float_flag_overflow | float_flag_inexact );
1024             if (    ( roundingMode == float_round_to_zero )
1025                  || ( zSign && ( roundingMode == float_round_up ) )
1026                  || ( ! zSign && ( roundingMode == float_round_down ) )
1027                ) {
1028                 return
1029                     packFloat128(
1030                         zSign,
1031                         0x7FFE,
1032                         LIT64( 0x0000FFFFFFFFFFFF ),
1033                         LIT64( 0xFFFFFFFFFFFFFFFF )
1034                     );
1035             }
1036             return packFloat128( zSign, 0x7FFF, 0, 0 );
1037         }
1038         if ( zExp < 0 ) {
1039             isTiny =
1040                    ( float_detect_tininess == float_tininess_before_rounding )
1041                 || ( zExp < -1 )
1042                 || ! increment
1043                 || lt128(
1044                        zSig0,
1045                        zSig1,
1046                        LIT64( 0x0001FFFFFFFFFFFF ),
1047                        LIT64( 0xFFFFFFFFFFFFFFFF )
1048                    );
1049             shift128ExtraRightJamming(
1050                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1051             zExp = 0;
1052             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1053             if ( roundNearestEven ) {
1054                 increment = ( (sbits64) zSig2 < 0 );
1055             }
1056             else {
1057                 if ( zSign ) {
1058                     increment = ( roundingMode == float_round_down ) && zSig2;
1059                 }
1060                 else {
1061                     increment = ( roundingMode == float_round_up ) && zSig2;
1062                 }
1063             }
1064         }
1065     }
1066     if ( zSig2 ) float_set_inexact();
1067     if ( increment ) {
1068         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1069         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1070     }
1071     else {
1072         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1073     }
1074     return packFloat128( zSign, zExp, zSig0, zSig1 );
1075 
1076 }
1077 
1078 /*
1079 -------------------------------------------------------------------------------
1080 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1081 and significand formed by the concatenation of `zSig0' and `zSig1', and
1082 returns the proper quadruple-precision floating-point value corresponding
1083 to the abstract input.  This routine is just like `roundAndPackFloat128'
1084 except that the input significand has fewer bits and does not have to be
1085 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1086 point exponent.
1087 -------------------------------------------------------------------------------
1088 */
1089 static float128
1090  normalizeRoundAndPackFloat128(
1091      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1092 {
1093     int8 shiftCount;
1094     bits64 zSig2;
1095 
1096     if ( zSig0 == 0 ) {
1097         zSig0 = zSig1;
1098         zSig1 = 0;
1099         zExp -= 64;
1100     }
1101     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1102     if ( 0 <= shiftCount ) {
1103         zSig2 = 0;
1104         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1105     }
1106     else {
1107         shift128ExtraRightJamming(
1108             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1109     }
1110     zExp -= shiftCount;
1111     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1112 
1113 }
1114 
1115 #endif
1116 
1117 /*
1118 -------------------------------------------------------------------------------
1119 Returns the result of converting the 32-bit two's complement integer `a'
1120 to the single-precision floating-point format.  The conversion is performed
1121 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122 -------------------------------------------------------------------------------
1123 */
1124 float32 int32_to_float32( int32 a )
1125 {
1126     flag zSign;
1127 
1128     if ( a == 0 ) return 0;
1129     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1130     zSign = ( a < 0 );
1131     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1132 
1133 }
1134 
1135 /*
1136 -------------------------------------------------------------------------------
1137 Returns the result of converting the 32-bit two's complement integer `a'
1138 to the double-precision floating-point format.  The conversion is performed
1139 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1140 -------------------------------------------------------------------------------
1141 */
1142 float64 int32_to_float64( int32 a )
1143 {
1144     flag zSign;
1145     uint32 absA;
1146     int8 shiftCount;
1147     bits64 zSig;
1148 
1149     if ( a == 0 ) return 0;
1150     zSign = ( a < 0 );
1151     absA = zSign ? - a : a;
1152     shiftCount = countLeadingZeros32( absA ) + 21;
1153     zSig = absA;
1154     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1155 
1156 }
1157 
1158 #ifdef FLOATX80
1159 
1160 /*
1161 -------------------------------------------------------------------------------
1162 Returns the result of converting the 32-bit two's complement integer `a'
1163 to the extended double-precision floating-point format.  The conversion
1164 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1165 Arithmetic.
1166 -------------------------------------------------------------------------------
1167 */
1168 floatx80 int32_to_floatx80( int32 a )
1169 {
1170     flag zSign;
1171     uint32 absA;
1172     int8 shiftCount;
1173     bits64 zSig;
1174 
1175     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1176     zSign = ( a < 0 );
1177     absA = zSign ? - a : a;
1178     shiftCount = countLeadingZeros32( absA ) + 32;
1179     zSig = absA;
1180     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1181 
1182 }
1183 
1184 #endif
1185 
1186 #ifdef FLOAT128
1187 
1188 /*
1189 -------------------------------------------------------------------------------
1190 Returns the result of converting the 32-bit two's complement integer `a' to
1191 the quadruple-precision floating-point format.  The conversion is performed
1192 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1193 -------------------------------------------------------------------------------
1194 */
1195 float128 int32_to_float128( int32 a )
1196 {
1197     flag zSign;
1198     uint32 absA;
1199     int8 shiftCount;
1200     bits64 zSig0;
1201 
1202     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1203     zSign = ( a < 0 );
1204     absA = zSign ? - a : a;
1205     shiftCount = countLeadingZeros32( absA ) + 17;
1206     zSig0 = absA;
1207     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1208 
1209 }
1210 
1211 #endif
1212 
1213 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1214 /*
1215 -------------------------------------------------------------------------------
1216 Returns the result of converting the 64-bit two's complement integer `a'
1217 to the single-precision floating-point format.  The conversion is performed
1218 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219 -------------------------------------------------------------------------------
1220 */
1221 float32 int64_to_float32( int64 a )
1222 {
1223     flag zSign;
1224     uint64 absA;
1225     int8 shiftCount;
1226 
1227     if ( a == 0 ) return 0;
1228     zSign = ( a < 0 );
1229     absA = zSign ? - a : a;
1230     shiftCount = countLeadingZeros64( absA ) - 40;
1231     if ( 0 <= shiftCount ) {
1232         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1233     }
1234     else {
1235         shiftCount += 7;
1236         if ( shiftCount < 0 ) {
1237             shift64RightJamming( absA, - shiftCount, &absA );
1238         }
1239         else {
1240             absA <<= shiftCount;
1241         }
1242         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1243     }
1244 
1245 }
1246 
1247 /*
1248 -------------------------------------------------------------------------------
1249 Returns the result of converting the 64-bit two's complement integer `a'
1250 to the double-precision floating-point format.  The conversion is performed
1251 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1252 -------------------------------------------------------------------------------
1253 */
1254 float64 int64_to_float64( int64 a )
1255 {
1256     flag zSign;
1257 
1258     if ( a == 0 ) return 0;
1259     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1260         return packFloat64( 1, 0x43E, 0 );
1261     }
1262     zSign = ( a < 0 );
1263     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1264 
1265 }
1266 
1267 #ifdef FLOATX80
1268 
1269 /*
1270 -------------------------------------------------------------------------------
1271 Returns the result of converting the 64-bit two's complement integer `a'
1272 to the extended double-precision floating-point format.  The conversion
1273 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1274 Arithmetic.
1275 -------------------------------------------------------------------------------
1276 */
1277 floatx80 int64_to_floatx80( int64 a )
1278 {
1279     flag zSign;
1280     uint64 absA;
1281     int8 shiftCount;
1282 
1283     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1284     zSign = ( a < 0 );
1285     absA = zSign ? - a : a;
1286     shiftCount = countLeadingZeros64( absA );
1287     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1288 
1289 }
1290 
1291 #endif
1292 
1293 #ifdef FLOAT128
1294 
1295 /*
1296 -------------------------------------------------------------------------------
1297 Returns the result of converting the 64-bit two's complement integer `a' to
1298 the quadruple-precision floating-point format.  The conversion is performed
1299 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1300 -------------------------------------------------------------------------------
1301 */
1302 float128 int64_to_float128( int64 a )
1303 {
1304     flag zSign;
1305     uint64 absA;
1306     int8 shiftCount;
1307     int32 zExp;
1308     bits64 zSig0, zSig1;
1309 
1310     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1311     zSign = ( a < 0 );
1312     absA = zSign ? - a : a;
1313     shiftCount = countLeadingZeros64( absA ) + 49;
1314     zExp = 0x406E - shiftCount;
1315     if ( 64 <= shiftCount ) {
1316         zSig1 = 0;
1317         zSig0 = absA;
1318         shiftCount -= 64;
1319     }
1320     else {
1321         zSig1 = absA;
1322         zSig0 = 0;
1323     }
1324     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1325     return packFloat128( zSign, zExp, zSig0, zSig1 );
1326 
1327 }
1328 
1329 #endif
1330 #endif /* !SOFTFLOAT_FOR_GCC */
1331 
1332 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1333 /*
1334 -------------------------------------------------------------------------------
1335 Returns the result of converting the single-precision floating-point value
1336 `a' to the 32-bit two's complement integer format.  The conversion is
1337 performed according to the IEC/IEEE Standard for Binary Floating-Point
1338 Arithmetic---which means in particular that the conversion is rounded
1339 according to the current rounding mode.  If `a' is a NaN, the largest
1340 positive integer is returned.  Otherwise, if the conversion overflows, the
1341 largest integer with the same sign as `a' is returned.
1342 -------------------------------------------------------------------------------
1343 */
1344 int32 float32_to_int32( float32 a )
1345 {
1346     flag aSign;
1347     int16 aExp, shiftCount;
1348     bits32 aSig;
1349     bits64 aSig64;
1350 
1351     aSig = extractFloat32Frac( a );
1352     aExp = extractFloat32Exp( a );
1353     aSign = extractFloat32Sign( a );
1354     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1355     if ( aExp ) aSig |= 0x00800000;
1356     shiftCount = 0xAF - aExp;
1357     aSig64 = aSig;
1358     aSig64 <<= 32;
1359     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1360     return roundAndPackInt32( aSign, aSig64 );
1361 
1362 }
1363 #endif /* !SOFTFLOAT_FOR_GCC */
1364 
1365 /*
1366 -------------------------------------------------------------------------------
1367 Returns the result of converting the single-precision floating-point value
1368 `a' to the 32-bit two's complement integer format.  The conversion is
1369 performed according to the IEC/IEEE Standard for Binary Floating-Point
1370 Arithmetic, except that the conversion is always rounded toward zero.
1371 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1372 the conversion overflows, the largest integer with the same sign as `a' is
1373 returned.
1374 -------------------------------------------------------------------------------
1375 */
1376 int32 float32_to_int32_round_to_zero( float32 a )
1377 {
1378     flag aSign;
1379     int16 aExp, shiftCount;
1380     bits32 aSig;
1381     int32 z;
1382 
1383     aSig = extractFloat32Frac( a );
1384     aExp = extractFloat32Exp( a );
1385     aSign = extractFloat32Sign( a );
1386     shiftCount = aExp - 0x9E;
1387     if ( 0 <= shiftCount ) {
1388         if ( a != 0xCF000000 ) {
1389             float_raise( float_flag_invalid );
1390             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1391         }
1392         return (sbits32) 0x80000000;
1393     }
1394     else if ( aExp <= 0x7E ) {
1395         if ( aExp | aSig ) float_set_inexact();
1396         return 0;
1397     }
1398     aSig = ( aSig | 0x00800000 )<<8;
1399     z = aSig>>( - shiftCount );
1400     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1401         float_set_inexact();
1402     }
1403     if ( aSign ) z = - z;
1404     return z;
1405 
1406 }
1407 
1408 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1409 /*
1410 -------------------------------------------------------------------------------
1411 Returns the result of converting the single-precision floating-point value
1412 `a' to the 64-bit two's complement integer format.  The conversion is
1413 performed according to the IEC/IEEE Standard for Binary Floating-Point
1414 Arithmetic---which means in particular that the conversion is rounded
1415 according to the current rounding mode.  If `a' is a NaN, the largest
1416 positive integer is returned.  Otherwise, if the conversion overflows, the
1417 largest integer with the same sign as `a' is returned.
1418 -------------------------------------------------------------------------------
1419 */
1420 int64 float32_to_int64( float32 a )
1421 {
1422     flag aSign;
1423     int16 aExp, shiftCount;
1424     bits32 aSig;
1425     bits64 aSig64, aSigExtra;
1426 
1427     aSig = extractFloat32Frac( a );
1428     aExp = extractFloat32Exp( a );
1429     aSign = extractFloat32Sign( a );
1430     shiftCount = 0xBE - aExp;
1431     if ( shiftCount < 0 ) {
1432         float_raise( float_flag_invalid );
1433         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1434             return LIT64( 0x7FFFFFFFFFFFFFFF );
1435         }
1436         return (sbits64) LIT64( 0x8000000000000000 );
1437     }
1438     if ( aExp ) aSig |= 0x00800000;
1439     aSig64 = aSig;
1440     aSig64 <<= 40;
1441     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1442     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1443 
1444 }
1445 
1446 /*
1447 -------------------------------------------------------------------------------
1448 Returns the result of converting the single-precision floating-point value
1449 `a' to the 64-bit two's complement integer format.  The conversion is
1450 performed according to the IEC/IEEE Standard for Binary Floating-Point
1451 Arithmetic, except that the conversion is always rounded toward zero.  If
1452 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1453 conversion overflows, the largest integer with the same sign as `a' is
1454 returned.
1455 -------------------------------------------------------------------------------
1456 */
1457 int64 float32_to_int64_round_to_zero( float32 a )
1458 {
1459     flag aSign;
1460     int16 aExp, shiftCount;
1461     bits32 aSig;
1462     bits64 aSig64;
1463     int64 z;
1464 
1465     aSig = extractFloat32Frac( a );
1466     aExp = extractFloat32Exp( a );
1467     aSign = extractFloat32Sign( a );
1468     shiftCount = aExp - 0xBE;
1469     if ( 0 <= shiftCount ) {
1470         if ( a != 0xDF000000 ) {
1471             float_raise( float_flag_invalid );
1472             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1473                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1474             }
1475         }
1476         return (sbits64) LIT64( 0x8000000000000000 );
1477     }
1478     else if ( aExp <= 0x7E ) {
1479         if ( aExp | aSig ) float_set_inexact();
1480         return 0;
1481     }
1482     aSig64 = aSig | 0x00800000;
1483     aSig64 <<= 40;
1484     z = aSig64>>( - shiftCount );
1485     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1486         float_set_inexact();
1487     }
1488     if ( aSign ) z = - z;
1489     return z;
1490 
1491 }
1492 #endif /* !SOFTFLOAT_FOR_GCC */
1493 
1494 /*
1495 -------------------------------------------------------------------------------
1496 Returns the result of converting the single-precision floating-point value
1497 `a' to the double-precision floating-point format.  The conversion is
1498 performed according to the IEC/IEEE Standard for Binary Floating-Point
1499 Arithmetic.
1500 -------------------------------------------------------------------------------
1501 */
1502 float64 float32_to_float64( float32 a )
1503 {
1504     flag aSign;
1505     int16 aExp;
1506     bits32 aSig;
1507 
1508     aSig = extractFloat32Frac( a );
1509     aExp = extractFloat32Exp( a );
1510     aSign = extractFloat32Sign( a );
1511     if ( aExp == 0xFF ) {
1512         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1513         return packFloat64( aSign, 0x7FF, 0 );
1514     }
1515     if ( aExp == 0 ) {
1516         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1517         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1518         --aExp;
1519     }
1520     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1521 
1522 }
1523 
1524 #ifdef FLOATX80
1525 
1526 /*
1527 -------------------------------------------------------------------------------
1528 Returns the result of converting the single-precision floating-point value
1529 `a' to the extended double-precision floating-point format.  The conversion
1530 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1531 Arithmetic.
1532 -------------------------------------------------------------------------------
1533 */
1534 floatx80 float32_to_floatx80( float32 a )
1535 {
1536     flag aSign;
1537     int16 aExp;
1538     bits32 aSig;
1539 
1540     aSig = extractFloat32Frac( a );
1541     aExp = extractFloat32Exp( a );
1542     aSign = extractFloat32Sign( a );
1543     if ( aExp == 0xFF ) {
1544         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1545         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1546     }
1547     if ( aExp == 0 ) {
1548         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1549         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1550     }
1551     aSig |= 0x00800000;
1552     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1553 
1554 }
1555 
1556 #endif
1557 
1558 #ifdef FLOAT128
1559 
1560 /*
1561 -------------------------------------------------------------------------------
1562 Returns the result of converting the single-precision floating-point value
1563 `a' to the double-precision floating-point format.  The conversion is
1564 performed according to the IEC/IEEE Standard for Binary Floating-Point
1565 Arithmetic.
1566 -------------------------------------------------------------------------------
1567 */
1568 float128 float32_to_float128( float32 a )
1569 {
1570     flag aSign;
1571     int16 aExp;
1572     bits32 aSig;
1573 
1574     aSig = extractFloat32Frac( a );
1575     aExp = extractFloat32Exp( a );
1576     aSign = extractFloat32Sign( a );
1577     if ( aExp == 0xFF ) {
1578         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1579         return packFloat128( aSign, 0x7FFF, 0, 0 );
1580     }
1581     if ( aExp == 0 ) {
1582         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1583         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1584         --aExp;
1585     }
1586     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1587 
1588 }
1589 
1590 #endif
1591 
1592 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1593 /*
1594 -------------------------------------------------------------------------------
1595 Rounds the single-precision floating-point value `a' to an integer, and
1596 returns the result as a single-precision floating-point value.  The
1597 operation is performed according to the IEC/IEEE Standard for Binary
1598 Floating-Point Arithmetic.
1599 -------------------------------------------------------------------------------
1600 */
1601 float32 float32_round_to_int( float32 a )
1602 {
1603     flag aSign;
1604     int16 aExp;
1605     bits32 lastBitMask, roundBitsMask;
1606     int8 roundingMode;
1607     float32 z;
1608 
1609     aExp = extractFloat32Exp( a );
1610     if ( 0x96 <= aExp ) {
1611         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1612             return propagateFloat32NaN( a, a );
1613         }
1614         return a;
1615     }
1616     if ( aExp <= 0x7E ) {
1617         if ( (bits32) ( a<<1 ) == 0 ) return a;
1618         float_set_inexact();
1619         aSign = extractFloat32Sign( a );
1620         switch ( float_rounding_mode() ) {
1621          case float_round_nearest_even:
1622             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1623                 return packFloat32( aSign, 0x7F, 0 );
1624             }
1625             break;
1626          case float_round_down:
1627             return aSign ? 0xBF800000 : 0;
1628          case float_round_up:
1629             return aSign ? 0x80000000 : 0x3F800000;
1630         }
1631         return packFloat32( aSign, 0, 0 );
1632     }
1633     lastBitMask = 1;
1634     lastBitMask <<= 0x96 - aExp;
1635     roundBitsMask = lastBitMask - 1;
1636     z = a;
1637     roundingMode = float_rounding_mode();
1638     if ( roundingMode == float_round_nearest_even ) {
1639         z += lastBitMask>>1;
1640         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1641     }
1642     else if ( roundingMode != float_round_to_zero ) {
1643         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1644             z += roundBitsMask;
1645         }
1646     }
1647     z &= ~ roundBitsMask;
1648     if ( z != a ) float_set_inexact();
1649     return z;
1650 
1651 }
1652 #endif /* !SOFTFLOAT_FOR_GCC */
1653 
1654 /*
1655 -------------------------------------------------------------------------------
1656 Returns the result of adding the absolute values of the single-precision
1657 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1658 before being returned.  `zSign' is ignored if the result is a NaN.
1659 The addition is performed according to the IEC/IEEE Standard for Binary
1660 Floating-Point Arithmetic.
1661 -------------------------------------------------------------------------------
1662 */
1663 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1664 {
1665     int16 aExp, bExp, zExp;
1666     bits32 aSig, bSig, zSig;
1667     int16 expDiff;
1668 
1669     aSig = extractFloat32Frac( a );
1670     aExp = extractFloat32Exp( a );
1671     bSig = extractFloat32Frac( b );
1672     bExp = extractFloat32Exp( b );
1673     expDiff = aExp - bExp;
1674     aSig <<= 6;
1675     bSig <<= 6;
1676     if ( 0 < expDiff ) {
1677         if ( aExp == 0xFF ) {
1678             if ( aSig ) return propagateFloat32NaN( a, b );
1679             return a;
1680         }
1681         if ( bExp == 0 ) {
1682             --expDiff;
1683         }
1684         else {
1685             bSig |= 0x20000000;
1686         }
1687         shift32RightJamming( bSig, expDiff, &bSig );
1688         zExp = aExp;
1689     }
1690     else if ( expDiff < 0 ) {
1691         if ( bExp == 0xFF ) {
1692             if ( bSig ) return propagateFloat32NaN( a, b );
1693             return packFloat32( zSign, 0xFF, 0 );
1694         }
1695         if ( aExp == 0 ) {
1696             ++expDiff;
1697         }
1698         else {
1699             aSig |= 0x20000000;
1700         }
1701         shift32RightJamming( aSig, - expDiff, &aSig );
1702         zExp = bExp;
1703     }
1704     else {
1705         if ( aExp == 0xFF ) {
1706             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1707             return a;
1708         }
1709         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1710         zSig = 0x40000000 + aSig + bSig;
1711         zExp = aExp;
1712         goto roundAndPack;
1713     }
1714     aSig |= 0x20000000;
1715     zSig = ( aSig + bSig )<<1;
1716     --zExp;
1717     if ( (sbits32) zSig < 0 ) {
1718         zSig = aSig + bSig;
1719         ++zExp;
1720     }
1721  roundAndPack:
1722     return roundAndPackFloat32( zSign, zExp, zSig );
1723 
1724 }
1725 
1726 /*
1727 -------------------------------------------------------------------------------
1728 Returns the result of subtracting the absolute values of the single-
1729 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1730 difference is negated before being returned.  `zSign' is ignored if the
1731 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1732 Standard for Binary Floating-Point Arithmetic.
1733 -------------------------------------------------------------------------------
1734 */
1735 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1736 {
1737     int16 aExp, bExp, zExp;
1738     bits32 aSig, bSig, zSig;
1739     int16 expDiff;
1740 
1741     aSig = extractFloat32Frac( a );
1742     aExp = extractFloat32Exp( a );
1743     bSig = extractFloat32Frac( b );
1744     bExp = extractFloat32Exp( b );
1745     expDiff = aExp - bExp;
1746     aSig <<= 7;
1747     bSig <<= 7;
1748     if ( 0 < expDiff ) goto aExpBigger;
1749     if ( expDiff < 0 ) goto bExpBigger;
1750     if ( aExp == 0xFF ) {
1751         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1752         float_raise( float_flag_invalid );
1753         return float32_default_nan;
1754     }
1755     if ( aExp == 0 ) {
1756         aExp = 1;
1757         bExp = 1;
1758     }
1759     if ( bSig < aSig ) goto aBigger;
1760     if ( aSig < bSig ) goto bBigger;
1761     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1762  bExpBigger:
1763     if ( bExp == 0xFF ) {
1764         if ( bSig ) return propagateFloat32NaN( a, b );
1765         return packFloat32( zSign ^ 1, 0xFF, 0 );
1766     }
1767     if ( aExp == 0 ) {
1768         ++expDiff;
1769     }
1770     else {
1771         aSig |= 0x40000000;
1772     }
1773     shift32RightJamming( aSig, - expDiff, &aSig );
1774     bSig |= 0x40000000;
1775  bBigger:
1776     zSig = bSig - aSig;
1777     zExp = bExp;
1778     zSign ^= 1;
1779     goto normalizeRoundAndPack;
1780  aExpBigger:
1781     if ( aExp == 0xFF ) {
1782         if ( aSig ) return propagateFloat32NaN( a, b );
1783         return a;
1784     }
1785     if ( bExp == 0 ) {
1786         --expDiff;
1787     }
1788     else {
1789         bSig |= 0x40000000;
1790     }
1791     shift32RightJamming( bSig, expDiff, &bSig );
1792     aSig |= 0x40000000;
1793  aBigger:
1794     zSig = aSig - bSig;
1795     zExp = aExp;
1796  normalizeRoundAndPack:
1797     --zExp;
1798     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1799 
1800 }
1801 
1802 /*
1803 -------------------------------------------------------------------------------
1804 Returns the result of adding the single-precision floating-point values `a'
1805 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1806 Binary Floating-Point Arithmetic.
1807 -------------------------------------------------------------------------------
1808 */
1809 float32 float32_add( float32 a, float32 b )
1810 {
1811     flag aSign, bSign;
1812 
1813     aSign = extractFloat32Sign( a );
1814     bSign = extractFloat32Sign( b );
1815     if ( aSign == bSign ) {
1816         return addFloat32Sigs( a, b, aSign );
1817     }
1818     else {
1819         return subFloat32Sigs( a, b, aSign );
1820     }
1821 
1822 }
1823 
1824 /*
1825 -------------------------------------------------------------------------------
1826 Returns the result of subtracting the single-precision floating-point values
1827 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1828 for Binary Floating-Point Arithmetic.
1829 -------------------------------------------------------------------------------
1830 */
1831 float32 float32_sub( float32 a, float32 b )
1832 {
1833     flag aSign, bSign;
1834 
1835     aSign = extractFloat32Sign( a );
1836     bSign = extractFloat32Sign( b );
1837     if ( aSign == bSign ) {
1838         return subFloat32Sigs( a, b, aSign );
1839     }
1840     else {
1841         return addFloat32Sigs( a, b, aSign );
1842     }
1843 
1844 }
1845 
1846 /*
1847 -------------------------------------------------------------------------------
1848 Returns the result of multiplying the single-precision floating-point values
1849 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1850 for Binary Floating-Point Arithmetic.
1851 -------------------------------------------------------------------------------
1852 */
1853 float32 float32_mul( float32 a, float32 b )
1854 {
1855     flag aSign, bSign, zSign;
1856     int16 aExp, bExp, zExp;
1857     bits32 aSig, bSig;
1858     bits64 zSig64;
1859     bits32 zSig;
1860 
1861     aSig = extractFloat32Frac( a );
1862     aExp = extractFloat32Exp( a );
1863     aSign = extractFloat32Sign( a );
1864     bSig = extractFloat32Frac( b );
1865     bExp = extractFloat32Exp( b );
1866     bSign = extractFloat32Sign( b );
1867     zSign = aSign ^ bSign;
1868     if ( aExp == 0xFF ) {
1869         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1870             return propagateFloat32NaN( a, b );
1871         }
1872         if ( ( bExp | bSig ) == 0 ) {
1873             float_raise( float_flag_invalid );
1874             return float32_default_nan;
1875         }
1876         return packFloat32( zSign, 0xFF, 0 );
1877     }
1878     if ( bExp == 0xFF ) {
1879         if ( bSig ) return propagateFloat32NaN( a, b );
1880         if ( ( aExp | aSig ) == 0 ) {
1881             float_raise( float_flag_invalid );
1882             return float32_default_nan;
1883         }
1884         return packFloat32( zSign, 0xFF, 0 );
1885     }
1886     if ( aExp == 0 ) {
1887         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1889     }
1890     if ( bExp == 0 ) {
1891         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1892         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1893     }
1894     zExp = aExp + bExp - 0x7F;
1895     aSig = ( aSig | 0x00800000 )<<7;
1896     bSig = ( bSig | 0x00800000 )<<8;
1897     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1898     zSig = zSig64;
1899     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1900         zSig <<= 1;
1901         --zExp;
1902     }
1903     return roundAndPackFloat32( zSign, zExp, zSig );
1904 
1905 }
1906 
1907 /*
1908 -------------------------------------------------------------------------------
1909 Returns the result of dividing the single-precision floating-point value `a'
1910 by the corresponding value `b'.  The operation is performed according to the
1911 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1912 -------------------------------------------------------------------------------
1913 */
1914 float32 float32_div( float32 a, float32 b )
1915 {
1916     flag aSign, bSign, zSign;
1917     int16 aExp, bExp, zExp;
1918     bits32 aSig, bSig, zSig;
1919 
1920     aSig = extractFloat32Frac( a );
1921     aExp = extractFloat32Exp( a );
1922     aSign = extractFloat32Sign( a );
1923     bSig = extractFloat32Frac( b );
1924     bExp = extractFloat32Exp( b );
1925     bSign = extractFloat32Sign( b );
1926     zSign = aSign ^ bSign;
1927     if ( aExp == 0xFF ) {
1928         if ( aSig ) return propagateFloat32NaN( a, b );
1929         if ( bExp == 0xFF ) {
1930             if ( bSig ) return propagateFloat32NaN( a, b );
1931             float_raise( float_flag_invalid );
1932             return float32_default_nan;
1933         }
1934         return packFloat32( zSign, 0xFF, 0 );
1935     }
1936     if ( bExp == 0xFF ) {
1937         if ( bSig ) return propagateFloat32NaN( a, b );
1938         return packFloat32( zSign, 0, 0 );
1939     }
1940     if ( bExp == 0 ) {
1941         if ( bSig == 0 ) {
1942             if ( ( aExp | aSig ) == 0 ) {
1943                 float_raise( float_flag_invalid );
1944                 return float32_default_nan;
1945             }
1946             float_raise( float_flag_divbyzero );
1947             return packFloat32( zSign, 0xFF, 0 );
1948         }
1949         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1950     }
1951     if ( aExp == 0 ) {
1952         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1953         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1954     }
1955     zExp = aExp - bExp + 0x7D;
1956     aSig = ( aSig | 0x00800000 )<<7;
1957     bSig = ( bSig | 0x00800000 )<<8;
1958     if ( bSig <= ( aSig + aSig ) ) {
1959         aSig >>= 1;
1960         ++zExp;
1961     }
1962     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1963     if ( ( zSig & 0x3F ) == 0 ) {
1964         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1965     }
1966     return roundAndPackFloat32( zSign, zExp, zSig );
1967 
1968 }
1969 
1970 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1971 /*
1972 -------------------------------------------------------------------------------
1973 Returns the remainder of the single-precision floating-point value `a'
1974 with respect to the corresponding value `b'.  The operation is performed
1975 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1976 -------------------------------------------------------------------------------
1977 */
1978 float32 float32_rem( float32 a, float32 b )
1979 {
1980     flag aSign, bSign, zSign;
1981     int16 aExp, bExp, expDiff;
1982     bits32 aSig, bSig;
1983     bits32 q;
1984     bits64 aSig64, bSig64, q64;
1985     bits32 alternateASig;
1986     sbits32 sigMean;
1987 
1988     aSig = extractFloat32Frac( a );
1989     aExp = extractFloat32Exp( a );
1990     aSign = extractFloat32Sign( a );
1991     bSig = extractFloat32Frac( b );
1992     bExp = extractFloat32Exp( b );
1993     bSign = extractFloat32Sign( b );
1994     if ( aExp == 0xFF ) {
1995         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1996             return propagateFloat32NaN( a, b );
1997         }
1998         float_raise( float_flag_invalid );
1999         return float32_default_nan;
2000     }
2001     if ( bExp == 0xFF ) {
2002         if ( bSig ) return propagateFloat32NaN( a, b );
2003         return a;
2004     }
2005     if ( bExp == 0 ) {
2006         if ( bSig == 0 ) {
2007             float_raise( float_flag_invalid );
2008             return float32_default_nan;
2009         }
2010         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2011     }
2012     if ( aExp == 0 ) {
2013         if ( aSig == 0 ) return a;
2014         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2015     }
2016     expDiff = aExp - bExp;
2017     aSig |= 0x00800000;
2018     bSig |= 0x00800000;
2019     if ( expDiff < 32 ) {
2020         aSig <<= 8;
2021         bSig <<= 8;
2022         if ( expDiff < 0 ) {
2023             if ( expDiff < -1 ) return a;
2024             aSig >>= 1;
2025         }
2026         q = ( bSig <= aSig );
2027         if ( q ) aSig -= bSig;
2028         if ( 0 < expDiff ) {
2029             q = ( ( (bits64) aSig )<<32 ) / bSig;
2030             q >>= 32 - expDiff;
2031             bSig >>= 2;
2032             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2033         }
2034         else {
2035             aSig >>= 2;
2036             bSig >>= 2;
2037         }
2038     }
2039     else {
2040         if ( bSig <= aSig ) aSig -= bSig;
2041         aSig64 = ( (bits64) aSig )<<40;
2042         bSig64 = ( (bits64) bSig )<<40;
2043         expDiff -= 64;
2044         while ( 0 < expDiff ) {
2045             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2046             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2047             aSig64 = - ( ( bSig * q64 )<<38 );
2048             expDiff -= 62;
2049         }
2050         expDiff += 64;
2051         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2052         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2053         q = q64>>( 64 - expDiff );
2054         bSig <<= 6;
2055         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2056     }
2057     do {
2058         alternateASig = aSig;
2059         ++q;
2060         aSig -= bSig;
2061     } while ( 0 <= (sbits32) aSig );
2062     sigMean = aSig + alternateASig;
2063     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2064         aSig = alternateASig;
2065     }
2066     zSign = ( (sbits32) aSig < 0 );
2067     if ( zSign ) aSig = - aSig;
2068     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2069 
2070 }
2071 #endif /* !SOFTFLOAT_FOR_GCC */
2072 
2073 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2074 /*
2075 -------------------------------------------------------------------------------
2076 Returns the square root of the single-precision floating-point value `a'.
2077 The operation is performed according to the IEC/IEEE Standard for Binary
2078 Floating-Point Arithmetic.
2079 -------------------------------------------------------------------------------
2080 */
2081 float32 float32_sqrt( float32 a )
2082 {
2083     flag aSign;
2084     int16 aExp, zExp;
2085     bits32 aSig, zSig;
2086     bits64 rem, term;
2087 
2088     aSig = extractFloat32Frac( a );
2089     aExp = extractFloat32Exp( a );
2090     aSign = extractFloat32Sign( a );
2091     if ( aExp == 0xFF ) {
2092         if ( aSig ) return propagateFloat32NaN( a, 0 );
2093         if ( ! aSign ) return a;
2094         float_raise( float_flag_invalid );
2095         return float32_default_nan;
2096     }
2097     if ( aSign ) {
2098         if ( ( aExp | aSig ) == 0 ) return a;
2099         float_raise( float_flag_invalid );
2100         return float32_default_nan;
2101     }
2102     if ( aExp == 0 ) {
2103         if ( aSig == 0 ) return 0;
2104         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2105     }
2106     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2107     aSig = ( aSig | 0x00800000 )<<8;
2108     zSig = estimateSqrt32( aExp, aSig ) + 2;
2109     if ( ( zSig & 0x7F ) <= 5 ) {
2110         if ( zSig < 2 ) {
2111             zSig = 0x7FFFFFFF;
2112             goto roundAndPack;
2113         }
2114         aSig >>= aExp & 1;
2115         term = ( (bits64) zSig ) * zSig;
2116         rem = ( ( (bits64) aSig )<<32 ) - term;
2117         while ( (sbits64) rem < 0 ) {
2118             --zSig;
2119             rem += ( ( (bits64) zSig )<<1 ) | 1;
2120         }
2121         zSig |= ( rem != 0 );
2122     }
2123     shift32RightJamming( zSig, 1, &zSig );
2124  roundAndPack:
2125     return roundAndPackFloat32( 0, zExp, zSig );
2126 
2127 }
2128 #endif /* !SOFTFLOAT_FOR_GCC */
2129 
2130 /*
2131 -------------------------------------------------------------------------------
2132 Returns 1 if the single-precision floating-point value `a' is equal to
2133 the corresponding value `b', and 0 otherwise.  The comparison is performed
2134 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2135 -------------------------------------------------------------------------------
2136 */
2137 flag float32_eq( float32 a, float32 b )
2138 {
2139 
2140     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2141          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2142        ) {
2143         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2144             float_raise( float_flag_invalid );
2145         }
2146         return 0;
2147     }
2148     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2149 
2150 }
2151 
2152 /*
2153 -------------------------------------------------------------------------------
2154 Returns 1 if the single-precision floating-point value `a' is less than
2155 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2156 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2157 Arithmetic.
2158 -------------------------------------------------------------------------------
2159 */
2160 flag float32_le( float32 a, float32 b )
2161 {
2162     flag aSign, bSign;
2163 
2164     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2165          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2166        ) {
2167         float_raise( float_flag_invalid );
2168         return 0;
2169     }
2170     aSign = extractFloat32Sign( a );
2171     bSign = extractFloat32Sign( b );
2172     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2173     return ( a == b ) || ( aSign ^ ( a < b ) );
2174 
2175 }
2176 
2177 /*
2178 -------------------------------------------------------------------------------
2179 Returns 1 if the single-precision floating-point value `a' is less than
2180 the corresponding value `b', and 0 otherwise.  The comparison is performed
2181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2182 -------------------------------------------------------------------------------
2183 */
2184 flag float32_lt( float32 a, float32 b )
2185 {
2186     flag aSign, bSign;
2187 
2188     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2189          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2190        ) {
2191         float_raise( float_flag_invalid );
2192         return 0;
2193     }
2194     aSign = extractFloat32Sign( a );
2195     bSign = extractFloat32Sign( b );
2196     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2197     return ( a != b ) && ( aSign ^ ( a < b ) );
2198 
2199 }
2200 
2201 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2202 /*
2203 -------------------------------------------------------------------------------
2204 Returns 1 if the single-precision floating-point value `a' is equal to
2205 the corresponding value `b', and 0 otherwise.  The invalid exception is
2206 raised if either operand is a NaN.  Otherwise, the comparison is performed
2207 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2208 -------------------------------------------------------------------------------
2209 */
2210 flag float32_eq_signaling( float32 a, float32 b )
2211 {
2212 
2213     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2214          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2215        ) {
2216         float_raise( float_flag_invalid );
2217         return 0;
2218     }
2219     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2220 
2221 }
2222 
2223 /*
2224 -------------------------------------------------------------------------------
2225 Returns 1 if the single-precision floating-point value `a' is less than or
2226 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2227 cause an exception.  Otherwise, the comparison is performed according to the
2228 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2229 -------------------------------------------------------------------------------
2230 */
2231 flag float32_le_quiet( float32 a, float32 b )
2232 {
2233     flag aSign, bSign;
2234 
2235     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2236          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2237        ) {
2238         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2239             float_raise( float_flag_invalid );
2240         }
2241         return 0;
2242     }
2243     aSign = extractFloat32Sign( a );
2244     bSign = extractFloat32Sign( b );
2245     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2246     return ( a == b ) || ( aSign ^ ( a < b ) );
2247 
2248 }
2249 
2250 /*
2251 -------------------------------------------------------------------------------
2252 Returns 1 if the single-precision floating-point value `a' is less than
2253 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2254 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2255 Standard for Binary Floating-Point Arithmetic.
2256 -------------------------------------------------------------------------------
2257 */
2258 flag float32_lt_quiet( float32 a, float32 b )
2259 {
2260     flag aSign, bSign;
2261 
2262     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2263          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2264        ) {
2265         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2266             float_raise( float_flag_invalid );
2267         }
2268         return 0;
2269     }
2270     aSign = extractFloat32Sign( a );
2271     bSign = extractFloat32Sign( b );
2272     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2273     return ( a != b ) && ( aSign ^ ( a < b ) );
2274 
2275 }
2276 #endif /* !SOFTFLOAT_FOR_GCC */
2277 
2278 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2279 /*
2280 -------------------------------------------------------------------------------
2281 Returns the result of converting the double-precision floating-point value
2282 `a' to the 32-bit two's complement integer format.  The conversion is
2283 performed according to the IEC/IEEE Standard for Binary Floating-Point
2284 Arithmetic---which means in particular that the conversion is rounded
2285 according to the current rounding mode.  If `a' is a NaN, the largest
2286 positive integer is returned.  Otherwise, if the conversion overflows, the
2287 largest integer with the same sign as `a' is returned.
2288 -------------------------------------------------------------------------------
2289 */
2290 int32 float64_to_int32( float64 a )
2291 {
2292     flag aSign;
2293     int16 aExp, shiftCount;
2294     bits64 aSig;
2295 
2296     aSig = extractFloat64Frac( a );
2297     aExp = extractFloat64Exp( a );
2298     aSign = extractFloat64Sign( a );
2299     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2300     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2301     shiftCount = 0x42C - aExp;
2302     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2303     return roundAndPackInt32( aSign, aSig );
2304 
2305 }
2306 #endif /* !SOFTFLOAT_FOR_GCC */
2307 
2308 /*
2309 -------------------------------------------------------------------------------
2310 Returns the result of converting the double-precision floating-point value
2311 `a' to the 32-bit two's complement integer format.  The conversion is
2312 performed according to the IEC/IEEE Standard for Binary Floating-Point
2313 Arithmetic, except that the conversion is always rounded toward zero.
2314 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2315 the conversion overflows, the largest integer with the same sign as `a' is
2316 returned.
2317 -------------------------------------------------------------------------------
2318 */
2319 int32 float64_to_int32_round_to_zero( float64 a )
2320 {
2321     flag aSign;
2322     int16 aExp, shiftCount;
2323     bits64 aSig, savedASig;
2324     int32 z;
2325 
2326     aSig = extractFloat64Frac( a );
2327     aExp = extractFloat64Exp( a );
2328     aSign = extractFloat64Sign( a );
2329     if ( 0x41E < aExp ) {
2330         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2331         goto invalid;
2332     }
2333     else if ( aExp < 0x3FF ) {
2334         if ( aExp || aSig ) float_set_inexact();
2335         return 0;
2336     }
2337     aSig |= LIT64( 0x0010000000000000 );
2338     shiftCount = 0x433 - aExp;
2339     savedASig = aSig;
2340     aSig >>= shiftCount;
2341     z = aSig;
2342     if ( aSign ) z = - z;
2343     if ( ( z < 0 ) ^ aSign ) {
2344  invalid:
2345         float_raise( float_flag_invalid );
2346         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2347     }
2348     if ( ( aSig<<shiftCount ) != savedASig ) {
2349         float_set_inexact();
2350     }
2351     return z;
2352 
2353 }
2354 
2355 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2356 /*
2357 -------------------------------------------------------------------------------
2358 Returns the result of converting the double-precision floating-point value
2359 `a' to the 64-bit two's complement integer format.  The conversion is
2360 performed according to the IEC/IEEE Standard for Binary Floating-Point
2361 Arithmetic---which means in particular that the conversion is rounded
2362 according to the current rounding mode.  If `a' is a NaN, the largest
2363 positive integer is returned.  Otherwise, if the conversion overflows, the
2364 largest integer with the same sign as `a' is returned.
2365 -------------------------------------------------------------------------------
2366 */
2367 int64 float64_to_int64( float64 a )
2368 {
2369     flag aSign;
2370     int16 aExp, shiftCount;
2371     bits64 aSig, aSigExtra;
2372 
2373     aSig = extractFloat64Frac( a );
2374     aExp = extractFloat64Exp( a );
2375     aSign = extractFloat64Sign( a );
2376     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2377     shiftCount = 0x433 - aExp;
2378     if ( shiftCount <= 0 ) {
2379         if ( 0x43E < aExp ) {
2380             float_raise( float_flag_invalid );
2381             if (    ! aSign
2382                  || (    ( aExp == 0x7FF )
2383                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2384                ) {
2385                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2386             }
2387             return (sbits64) LIT64( 0x8000000000000000 );
2388         }
2389         aSigExtra = 0;
2390         aSig <<= - shiftCount;
2391     }
2392     else {
2393         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2394     }
2395     return roundAndPackInt64( aSign, aSig, aSigExtra );
2396 
2397 }
2398 
2399 /*
2400 -------------------------------------------------------------------------------
2401 Returns the result of converting the double-precision floating-point value
2402 `a' to the 64-bit two's complement integer format.  The conversion is
2403 performed according to the IEC/IEEE Standard for Binary Floating-Point
2404 Arithmetic, except that the conversion is always rounded toward zero.
2405 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2406 the conversion overflows, the largest integer with the same sign as `a' is
2407 returned.
2408 -------------------------------------------------------------------------------
2409 */
2410 int64 float64_to_int64_round_to_zero( float64 a )
2411 {
2412     flag aSign;
2413     int16 aExp, shiftCount;
2414     bits64 aSig;
2415     int64 z;
2416 
2417     aSig = extractFloat64Frac( a );
2418     aExp = extractFloat64Exp( a );
2419     aSign = extractFloat64Sign( a );
2420     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2421     shiftCount = aExp - 0x433;
2422     if ( 0 <= shiftCount ) {
2423         if ( 0x43E <= aExp ) {
2424             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2425                 float_raise( float_flag_invalid );
2426                 if (    ! aSign
2427                      || (    ( aExp == 0x7FF )
2428                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2429                    ) {
2430                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2431                 }
2432             }
2433             return (sbits64) LIT64( 0x8000000000000000 );
2434         }
2435         z = aSig<<shiftCount;
2436     }
2437     else {
2438         if ( aExp < 0x3FE ) {
2439             if ( aExp | aSig ) float_set_inexact();
2440             return 0;
2441         }
2442         z = aSig>>( - shiftCount );
2443         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2444             float_set_inexact();
2445         }
2446     }
2447     if ( aSign ) z = - z;
2448     return z;
2449 
2450 }
2451 #endif /* !SOFTFLOAT_FOR_GCC */
2452 
2453 /*
2454 -------------------------------------------------------------------------------
2455 Returns the result of converting the double-precision floating-point value
2456 `a' to the single-precision floating-point format.  The conversion is
2457 performed according to the IEC/IEEE Standard for Binary Floating-Point
2458 Arithmetic.
2459 -------------------------------------------------------------------------------
2460 */
2461 float32 float64_to_float32( float64 a )
2462 {
2463     flag aSign;
2464     int16 aExp;
2465     bits64 aSig;
2466     bits32 zSig;
2467 
2468     aSig = extractFloat64Frac( a );
2469     aExp = extractFloat64Exp( a );
2470     aSign = extractFloat64Sign( a );
2471     if ( aExp == 0x7FF ) {
2472         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2473         return packFloat32( aSign, 0xFF, 0 );
2474     }
2475     shift64RightJamming( aSig, 22, &aSig );
2476     zSig = aSig;
2477     if ( aExp || zSig ) {
2478         zSig |= 0x40000000;
2479         aExp -= 0x381;
2480     }
2481     return roundAndPackFloat32( aSign, aExp, zSig );
2482 
2483 }
2484 
2485 #ifdef FLOATX80
2486 
2487 /*
2488 -------------------------------------------------------------------------------
2489 Returns the result of converting the double-precision floating-point value
2490 `a' to the extended double-precision floating-point format.  The conversion
2491 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2492 Arithmetic.
2493 -------------------------------------------------------------------------------
2494 */
2495 floatx80 float64_to_floatx80( float64 a )
2496 {
2497     flag aSign;
2498     int16 aExp;
2499     bits64 aSig;
2500 
2501     aSig = extractFloat64Frac( a );
2502     aExp = extractFloat64Exp( a );
2503     aSign = extractFloat64Sign( a );
2504     if ( aExp == 0x7FF ) {
2505         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2506         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2507     }
2508     if ( aExp == 0 ) {
2509         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2510         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2511     }
2512     return
2513         packFloatx80(
2514             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2515 
2516 }
2517 
2518 #endif
2519 
2520 #ifdef FLOAT128
2521 
2522 /*
2523 -------------------------------------------------------------------------------
2524 Returns the result of converting the double-precision floating-point value
2525 `a' to the quadruple-precision floating-point format.  The conversion is
2526 performed according to the IEC/IEEE Standard for Binary Floating-Point
2527 Arithmetic.
2528 -------------------------------------------------------------------------------
2529 */
2530 float128 float64_to_float128( float64 a )
2531 {
2532     flag aSign;
2533     int16 aExp;
2534     bits64 aSig, zSig0, zSig1;
2535 
2536     aSig = extractFloat64Frac( a );
2537     aExp = extractFloat64Exp( a );
2538     aSign = extractFloat64Sign( a );
2539     if ( aExp == 0x7FF ) {
2540         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2541         return packFloat128( aSign, 0x7FFF, 0, 0 );
2542     }
2543     if ( aExp == 0 ) {
2544         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2545         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2546         --aExp;
2547     }
2548     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2549     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2550 
2551 }
2552 
2553 #endif
2554 
2555 #ifndef SOFTFLOAT_FOR_GCC
2556 /*
2557 -------------------------------------------------------------------------------
2558 Rounds the double-precision floating-point value `a' to an integer, and
2559 returns the result as a double-precision floating-point value.  The
2560 operation is performed according to the IEC/IEEE Standard for Binary
2561 Floating-Point Arithmetic.
2562 -------------------------------------------------------------------------------
2563 */
2564 float64 float64_round_to_int( float64 a )
2565 {
2566     flag aSign;
2567     int16 aExp;
2568     bits64 lastBitMask, roundBitsMask;
2569     int8 roundingMode;
2570     float64 z;
2571 
2572     aExp = extractFloat64Exp( a );
2573     if ( 0x433 <= aExp ) {
2574         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2575             return propagateFloat64NaN( a, a );
2576         }
2577         return a;
2578     }
2579     if ( aExp < 0x3FF ) {
2580         if ( (bits64) ( a<<1 ) == 0 ) return a;
2581         float_set_inexact();
2582         aSign = extractFloat64Sign( a );
2583         switch ( float_rounding_mode() ) {
2584          case float_round_nearest_even:
2585             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2586                 return packFloat64( aSign, 0x3FF, 0 );
2587             }
2588             break;
2589          case float_round_down:
2590             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2591          case float_round_up:
2592             return
2593             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2594         }
2595         return packFloat64( aSign, 0, 0 );
2596     }
2597     lastBitMask = 1;
2598     lastBitMask <<= 0x433 - aExp;
2599     roundBitsMask = lastBitMask - 1;
2600     z = a;
2601     roundingMode = float_rounding_mode();
2602     if ( roundingMode == float_round_nearest_even ) {
2603         z += lastBitMask>>1;
2604         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2605     }
2606     else if ( roundingMode != float_round_to_zero ) {
2607         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2608             z += roundBitsMask;
2609         }
2610     }
2611     z &= ~ roundBitsMask;
2612     if ( z != a ) float_set_inexact();
2613     return z;
2614 
2615 }
2616 #endif
2617 
2618 /*
2619 -------------------------------------------------------------------------------
2620 Returns the result of adding the absolute values of the double-precision
2621 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2622 before being returned.  `zSign' is ignored if the result is a NaN.
2623 The addition is performed according to the IEC/IEEE Standard for Binary
2624 Floating-Point Arithmetic.
2625 -------------------------------------------------------------------------------
2626 */
2627 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2628 {
2629     int16 aExp, bExp, zExp;
2630     bits64 aSig, bSig, zSig;
2631     int16 expDiff;
2632 
2633     aSig = extractFloat64Frac( a );
2634     aExp = extractFloat64Exp( a );
2635     bSig = extractFloat64Frac( b );
2636     bExp = extractFloat64Exp( b );
2637     expDiff = aExp - bExp;
2638     aSig <<= 9;
2639     bSig <<= 9;
2640     if ( 0 < expDiff ) {
2641         if ( aExp == 0x7FF ) {
2642             if ( aSig ) return propagateFloat64NaN( a, b );
2643             return a;
2644         }
2645         if ( bExp == 0 ) {
2646             --expDiff;
2647         }
2648         else {
2649             bSig |= LIT64( 0x2000000000000000 );
2650         }
2651         shift64RightJamming( bSig, expDiff, &bSig );
2652         zExp = aExp;
2653     }
2654     else if ( expDiff < 0 ) {
2655         if ( bExp == 0x7FF ) {
2656             if ( bSig ) return propagateFloat64NaN( a, b );
2657             return packFloat64( zSign, 0x7FF, 0 );
2658         }
2659         if ( aExp == 0 ) {
2660             ++expDiff;
2661         }
2662         else {
2663             aSig |= LIT64( 0x2000000000000000 );
2664         }
2665         shift64RightJamming( aSig, - expDiff, &aSig );
2666         zExp = bExp;
2667     }
2668     else {
2669         if ( aExp == 0x7FF ) {
2670             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2671             return a;
2672         }
2673         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2674         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2675         zExp = aExp;
2676         goto roundAndPack;
2677     }
2678     aSig |= LIT64( 0x2000000000000000 );
2679     zSig = ( aSig + bSig )<<1;
2680     --zExp;
2681     if ( (sbits64) zSig < 0 ) {
2682         zSig = aSig + bSig;
2683         ++zExp;
2684     }
2685  roundAndPack:
2686     return roundAndPackFloat64( zSign, zExp, zSig );
2687 
2688 }
2689 
2690 /*
2691 -------------------------------------------------------------------------------
2692 Returns the result of subtracting the absolute values of the double-
2693 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2694 difference is negated before being returned.  `zSign' is ignored if the
2695 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2696 Standard for Binary Floating-Point Arithmetic.
2697 -------------------------------------------------------------------------------
2698 */
2699 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2700 {
2701     int16 aExp, bExp, zExp;
2702     bits64 aSig, bSig, zSig;
2703     int16 expDiff;
2704 
2705     aSig = extractFloat64Frac( a );
2706     aExp = extractFloat64Exp( a );
2707     bSig = extractFloat64Frac( b );
2708     bExp = extractFloat64Exp( b );
2709     expDiff = aExp - bExp;
2710     aSig <<= 10;
2711     bSig <<= 10;
2712     if ( 0 < expDiff ) goto aExpBigger;
2713     if ( expDiff < 0 ) goto bExpBigger;
2714     if ( aExp == 0x7FF ) {
2715         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2716         float_raise( float_flag_invalid );
2717         return float64_default_nan;
2718     }
2719     if ( aExp == 0 ) {
2720         aExp = 1;
2721         bExp = 1;
2722     }
2723     if ( bSig < aSig ) goto aBigger;
2724     if ( aSig < bSig ) goto bBigger;
2725     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2726  bExpBigger:
2727     if ( bExp == 0x7FF ) {
2728         if ( bSig ) return propagateFloat64NaN( a, b );
2729         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2730     }
2731     if ( aExp == 0 ) {
2732         ++expDiff;
2733     }
2734     else {
2735         aSig |= LIT64( 0x4000000000000000 );
2736     }
2737     shift64RightJamming( aSig, - expDiff, &aSig );
2738     bSig |= LIT64( 0x4000000000000000 );
2739  bBigger:
2740     zSig = bSig - aSig;
2741     zExp = bExp;
2742     zSign ^= 1;
2743     goto normalizeRoundAndPack;
2744  aExpBigger:
2745     if ( aExp == 0x7FF ) {
2746         if ( aSig ) return propagateFloat64NaN( a, b );
2747         return a;
2748     }
2749     if ( bExp == 0 ) {
2750         --expDiff;
2751     }
2752     else {
2753         bSig |= LIT64( 0x4000000000000000 );
2754     }
2755     shift64RightJamming( bSig, expDiff, &bSig );
2756     aSig |= LIT64( 0x4000000000000000 );
2757  aBigger:
2758     zSig = aSig - bSig;
2759     zExp = aExp;
2760  normalizeRoundAndPack:
2761     --zExp;
2762     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2763 
2764 }
2765 
2766 /*
2767 -------------------------------------------------------------------------------
2768 Returns the result of adding the double-precision floating-point values `a'
2769 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2770 Binary Floating-Point Arithmetic.
2771 -------------------------------------------------------------------------------
2772 */
2773 float64 float64_add( float64 a, float64 b )
2774 {
2775     flag aSign, bSign;
2776 
2777     aSign = extractFloat64Sign( a );
2778     bSign = extractFloat64Sign( b );
2779     if ( aSign == bSign ) {
2780         return addFloat64Sigs( a, b, aSign );
2781     }
2782     else {
2783         return subFloat64Sigs( a, b, aSign );
2784     }
2785 
2786 }
2787 
2788 /*
2789 -------------------------------------------------------------------------------
2790 Returns the result of subtracting the double-precision floating-point values
2791 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2792 for Binary Floating-Point Arithmetic.
2793 -------------------------------------------------------------------------------
2794 */
2795 float64 float64_sub( float64 a, float64 b )
2796 {
2797     flag aSign, bSign;
2798 
2799     aSign = extractFloat64Sign( a );
2800     bSign = extractFloat64Sign( b );
2801     if ( aSign == bSign ) {
2802         return subFloat64Sigs( a, b, aSign );
2803     }
2804     else {
2805         return addFloat64Sigs( a, b, aSign );
2806     }
2807 
2808 }
2809 
2810 /*
2811 -------------------------------------------------------------------------------
2812 Returns the result of multiplying the double-precision floating-point values
2813 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2814 for Binary Floating-Point Arithmetic.
2815 -------------------------------------------------------------------------------
2816 */
2817 float64 float64_mul( float64 a, float64 b )
2818 {
2819     flag aSign, bSign, zSign;
2820     int16 aExp, bExp, zExp;
2821     bits64 aSig, bSig, zSig0, zSig1;
2822 
2823     aSig = extractFloat64Frac( a );
2824     aExp = extractFloat64Exp( a );
2825     aSign = extractFloat64Sign( a );
2826     bSig = extractFloat64Frac( b );
2827     bExp = extractFloat64Exp( b );
2828     bSign = extractFloat64Sign( b );
2829     zSign = aSign ^ bSign;
2830     if ( aExp == 0x7FF ) {
2831         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2832             return propagateFloat64NaN( a, b );
2833         }
2834         if ( ( bExp | bSig ) == 0 ) {
2835             float_raise( float_flag_invalid );
2836             return float64_default_nan;
2837         }
2838         return packFloat64( zSign, 0x7FF, 0 );
2839     }
2840     if ( bExp == 0x7FF ) {
2841         if ( bSig ) return propagateFloat64NaN( a, b );
2842         if ( ( aExp | aSig ) == 0 ) {
2843             float_raise( float_flag_invalid );
2844             return float64_default_nan;
2845         }
2846         return packFloat64( zSign, 0x7FF, 0 );
2847     }
2848     if ( aExp == 0 ) {
2849         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2850         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2851     }
2852     if ( bExp == 0 ) {
2853         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2854         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2855     }
2856     zExp = aExp + bExp - 0x3FF;
2857     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2858     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2859     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2860     zSig0 |= ( zSig1 != 0 );
2861     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2862         zSig0 <<= 1;
2863         --zExp;
2864     }
2865     return roundAndPackFloat64( zSign, zExp, zSig0 );
2866 
2867 }
2868 
2869 /*
2870 -------------------------------------------------------------------------------
2871 Returns the result of dividing the double-precision floating-point value `a'
2872 by the corresponding value `b'.  The operation is performed according to
2873 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2874 -------------------------------------------------------------------------------
2875 */
2876 float64 float64_div( float64 a, float64 b )
2877 {
2878     flag aSign, bSign, zSign;
2879     int16 aExp, bExp, zExp;
2880     bits64 aSig, bSig, zSig;
2881     bits64 rem0, rem1;
2882     bits64 term0, term1;
2883 
2884     aSig = extractFloat64Frac( a );
2885     aExp = extractFloat64Exp( a );
2886     aSign = extractFloat64Sign( a );
2887     bSig = extractFloat64Frac( b );
2888     bExp = extractFloat64Exp( b );
2889     bSign = extractFloat64Sign( b );
2890     zSign = aSign ^ bSign;
2891     if ( aExp == 0x7FF ) {
2892         if ( aSig ) return propagateFloat64NaN( a, b );
2893         if ( bExp == 0x7FF ) {
2894             if ( bSig ) return propagateFloat64NaN( a, b );
2895             float_raise( float_flag_invalid );
2896             return float64_default_nan;
2897         }
2898         return packFloat64( zSign, 0x7FF, 0 );
2899     }
2900     if ( bExp == 0x7FF ) {
2901         if ( bSig ) return propagateFloat64NaN( a, b );
2902         return packFloat64( zSign, 0, 0 );
2903     }
2904     if ( bExp == 0 ) {
2905         if ( bSig == 0 ) {
2906             if ( ( aExp | aSig ) == 0 ) {
2907                 float_raise( float_flag_invalid );
2908                 return float64_default_nan;
2909             }
2910             float_raise( float_flag_divbyzero );
2911             return packFloat64( zSign, 0x7FF, 0 );
2912         }
2913         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2914     }
2915     if ( aExp == 0 ) {
2916         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2917         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2918     }
2919     zExp = aExp - bExp + 0x3FD;
2920     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2921     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2922     if ( bSig <= ( aSig + aSig ) ) {
2923         aSig >>= 1;
2924         ++zExp;
2925     }
2926     zSig = estimateDiv128To64( aSig, 0, bSig );
2927     if ( ( zSig & 0x1FF ) <= 2 ) {
2928         mul64To128( bSig, zSig, &term0, &term1 );
2929         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2930         while ( (sbits64) rem0 < 0 ) {
2931             --zSig;
2932             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2933         }
2934         zSig |= ( rem1 != 0 );
2935     }
2936     return roundAndPackFloat64( zSign, zExp, zSig );
2937 
2938 }
2939 
2940 #ifndef SOFTFLOAT_FOR_GCC
2941 /*
2942 -------------------------------------------------------------------------------
2943 Returns the remainder of the double-precision floating-point value `a'
2944 with respect to the corresponding value `b'.  The operation is performed
2945 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2946 -------------------------------------------------------------------------------
2947 */
2948 float64 float64_rem( float64 a, float64 b )
2949 {
2950     flag aSign, bSign, zSign;
2951     int16 aExp, bExp, expDiff;
2952     bits64 aSig, bSig;
2953     bits64 q, alternateASig;
2954     sbits64 sigMean;
2955 
2956     aSig = extractFloat64Frac( a );
2957     aExp = extractFloat64Exp( a );
2958     aSign = extractFloat64Sign( a );
2959     bSig = extractFloat64Frac( b );
2960     bExp = extractFloat64Exp( b );
2961     bSign = extractFloat64Sign( b );
2962     if ( aExp == 0x7FF ) {
2963         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2964             return propagateFloat64NaN( a, b );
2965         }
2966         float_raise( float_flag_invalid );
2967         return float64_default_nan;
2968     }
2969     if ( bExp == 0x7FF ) {
2970         if ( bSig ) return propagateFloat64NaN( a, b );
2971         return a;
2972     }
2973     if ( bExp == 0 ) {
2974         if ( bSig == 0 ) {
2975             float_raise( float_flag_invalid );
2976             return float64_default_nan;
2977         }
2978         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2979     }
2980     if ( aExp == 0 ) {
2981         if ( aSig == 0 ) return a;
2982         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2983     }
2984     expDiff = aExp - bExp;
2985     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2986     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2987     if ( expDiff < 0 ) {
2988         if ( expDiff < -1 ) return a;
2989         aSig >>= 1;
2990     }
2991     q = ( bSig <= aSig );
2992     if ( q ) aSig -= bSig;
2993     expDiff -= 64;
2994     while ( 0 < expDiff ) {
2995         q = estimateDiv128To64( aSig, 0, bSig );
2996         q = ( 2 < q ) ? q - 2 : 0;
2997         aSig = - ( ( bSig>>2 ) * q );
2998         expDiff -= 62;
2999     }
3000     expDiff += 64;
3001     if ( 0 < expDiff ) {
3002         q = estimateDiv128To64( aSig, 0, bSig );
3003         q = ( 2 < q ) ? q - 2 : 0;
3004         q >>= 64 - expDiff;
3005         bSig >>= 2;
3006         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3007     }
3008     else {
3009         aSig >>= 2;
3010         bSig >>= 2;
3011     }
3012     do {
3013         alternateASig = aSig;
3014         ++q;
3015         aSig -= bSig;
3016     } while ( 0 <= (sbits64) aSig );
3017     sigMean = aSig + alternateASig;
3018     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3019         aSig = alternateASig;
3020     }
3021     zSign = ( (sbits64) aSig < 0 );
3022     if ( zSign ) aSig = - aSig;
3023     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3024 
3025 }
3026 
3027 /*
3028 -------------------------------------------------------------------------------
3029 Returns the square root of the double-precision floating-point value `a'.
3030 The operation is performed according to the IEC/IEEE Standard for Binary
3031 Floating-Point Arithmetic.
3032 -------------------------------------------------------------------------------
3033 */
3034 float64 float64_sqrt( float64 a )
3035 {
3036     flag aSign;
3037     int16 aExp, zExp;
3038     bits64 aSig, zSig, doubleZSig;
3039     bits64 rem0, rem1, term0, term1;
3040 
3041     aSig = extractFloat64Frac( a );
3042     aExp = extractFloat64Exp( a );
3043     aSign = extractFloat64Sign( a );
3044     if ( aExp == 0x7FF ) {
3045         if ( aSig ) return propagateFloat64NaN( a, a );
3046         if ( ! aSign ) return a;
3047         float_raise( float_flag_invalid );
3048         return float64_default_nan;
3049     }
3050     if ( aSign ) {
3051         if ( ( aExp | aSig ) == 0 ) return a;
3052         float_raise( float_flag_invalid );
3053         return float64_default_nan;
3054     }
3055     if ( aExp == 0 ) {
3056         if ( aSig == 0 ) return 0;
3057         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3058     }
3059     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3060     aSig |= LIT64( 0x0010000000000000 );
3061     zSig = estimateSqrt32( aExp, aSig>>21 );
3062     aSig <<= 9 - ( aExp & 1 );
3063     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3064     if ( ( zSig & 0x1FF ) <= 5 ) {
3065         doubleZSig = zSig<<1;
3066         mul64To128( zSig, zSig, &term0, &term1 );
3067         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3068         while ( (sbits64) rem0 < 0 ) {
3069             --zSig;
3070             doubleZSig -= 2;
3071             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3072         }
3073         zSig |= ( ( rem0 | rem1 ) != 0 );
3074     }
3075     return roundAndPackFloat64( 0, zExp, zSig );
3076 
3077 }
3078 #endif
3079 
3080 /*
3081 -------------------------------------------------------------------------------
3082 Returns 1 if the double-precision floating-point value `a' is equal to the
3083 corresponding value `b', and 0 otherwise.  The comparison is performed
3084 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3085 -------------------------------------------------------------------------------
3086 */
3087 flag float64_eq( float64 a, float64 b )
3088 {
3089 
3090     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3091          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3092        ) {
3093         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3094             float_raise( float_flag_invalid );
3095         }
3096         return 0;
3097     }
3098     return ( a == b ) ||
3099 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3100 
3101 }
3102 
3103 /*
3104 -------------------------------------------------------------------------------
3105 Returns 1 if the double-precision floating-point value `a' is less than or
3106 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3107 performed according to the IEC/IEEE Standard for Binary Floating-Point
3108 Arithmetic.
3109 -------------------------------------------------------------------------------
3110 */
3111 flag float64_le( float64 a, float64 b )
3112 {
3113     flag aSign, bSign;
3114 
3115     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3116          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3117        ) {
3118         float_raise( float_flag_invalid );
3119         return 0;
3120     }
3121     aSign = extractFloat64Sign( a );
3122     bSign = extractFloat64Sign( b );
3123     if ( aSign != bSign )
3124 	return aSign ||
3125 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3126 	      0 );
3127     return ( a == b ) ||
3128 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3129 
3130 }
3131 
3132 /*
3133 -------------------------------------------------------------------------------
3134 Returns 1 if the double-precision floating-point value `a' is less than
3135 the corresponding value `b', and 0 otherwise.  The comparison is performed
3136 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3137 -------------------------------------------------------------------------------
3138 */
3139 flag float64_lt( float64 a, float64 b )
3140 {
3141     flag aSign, bSign;
3142 
3143     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3144          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3145        ) {
3146         float_raise( float_flag_invalid );
3147         return 0;
3148     }
3149     aSign = extractFloat64Sign( a );
3150     bSign = extractFloat64Sign( b );
3151     if ( aSign != bSign )
3152 	return aSign &&
3153 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3154 	      0 );
3155     return ( a != b ) &&
3156 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3157 
3158 }
3159 
3160 #ifndef SOFTFLOAT_FOR_GCC
3161 /*
3162 -------------------------------------------------------------------------------
3163 Returns 1 if the double-precision floating-point value `a' is equal to the
3164 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3165 if either operand is a NaN.  Otherwise, the comparison is performed
3166 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3167 -------------------------------------------------------------------------------
3168 */
3169 flag float64_eq_signaling( float64 a, float64 b )
3170 {
3171 
3172     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3173          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3174        ) {
3175         float_raise( float_flag_invalid );
3176         return 0;
3177     }
3178     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3179 
3180 }
3181 
3182 /*
3183 -------------------------------------------------------------------------------
3184 Returns 1 if the double-precision floating-point value `a' is less than or
3185 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3186 cause an exception.  Otherwise, the comparison is performed according to the
3187 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3188 -------------------------------------------------------------------------------
3189 */
3190 flag float64_le_quiet( float64 a, float64 b )
3191 {
3192     flag aSign, bSign;
3193 
3194     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3195          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3196        ) {
3197         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3198             float_raise( float_flag_invalid );
3199         }
3200         return 0;
3201     }
3202     aSign = extractFloat64Sign( a );
3203     bSign = extractFloat64Sign( b );
3204     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3205     return ( a == b ) || ( aSign ^ ( a < b ) );
3206 
3207 }
3208 
3209 /*
3210 -------------------------------------------------------------------------------
3211 Returns 1 if the double-precision floating-point value `a' is less than
3212 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3213 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3214 Standard for Binary Floating-Point Arithmetic.
3215 -------------------------------------------------------------------------------
3216 */
3217 flag float64_lt_quiet( float64 a, float64 b )
3218 {
3219     flag aSign, bSign;
3220 
3221     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3222          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3223        ) {
3224         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3225             float_raise( float_flag_invalid );
3226         }
3227         return 0;
3228     }
3229     aSign = extractFloat64Sign( a );
3230     bSign = extractFloat64Sign( b );
3231     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3232     return ( a != b ) && ( aSign ^ ( a < b ) );
3233 
3234 }
3235 #endif
3236 
3237 #ifdef FLOATX80
3238 
3239 /*
3240 -------------------------------------------------------------------------------
3241 Returns the result of converting the extended double-precision floating-
3242 point value `a' to the 32-bit two's complement integer format.  The
3243 conversion is performed according to the IEC/IEEE Standard for Binary
3244 Floating-Point Arithmetic---which means in particular that the conversion
3245 is rounded according to the current rounding mode.  If `a' is a NaN, the
3246 largest positive integer is returned.  Otherwise, if the conversion
3247 overflows, the largest integer with the same sign as `a' is returned.
3248 -------------------------------------------------------------------------------
3249 */
3250 int32 floatx80_to_int32( floatx80 a )
3251 {
3252     flag aSign;
3253     int32 aExp, shiftCount;
3254     bits64 aSig;
3255 
3256     aSig = extractFloatx80Frac( a );
3257     aExp = extractFloatx80Exp( a );
3258     aSign = extractFloatx80Sign( a );
3259     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3260     shiftCount = 0x4037 - aExp;
3261     if ( shiftCount <= 0 ) shiftCount = 1;
3262     shift64RightJamming( aSig, shiftCount, &aSig );
3263     return roundAndPackInt32( aSign, aSig );
3264 
3265 }
3266 
3267 /*
3268 -------------------------------------------------------------------------------
3269 Returns the result of converting the extended double-precision floating-
3270 point value `a' to the 32-bit two's complement integer format.  The
3271 conversion is performed according to the IEC/IEEE Standard for Binary
3272 Floating-Point Arithmetic, except that the conversion is always rounded
3273 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3274 Otherwise, if the conversion overflows, the largest integer with the same
3275 sign as `a' is returned.
3276 -------------------------------------------------------------------------------
3277 */
3278 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3279 {
3280     flag aSign;
3281     int32 aExp, shiftCount;
3282     bits64 aSig, savedASig;
3283     int32 z;
3284 
3285     aSig = extractFloatx80Frac( a );
3286     aExp = extractFloatx80Exp( a );
3287     aSign = extractFloatx80Sign( a );
3288     if ( 0x401E < aExp ) {
3289         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3290         goto invalid;
3291     }
3292     else if ( aExp < 0x3FFF ) {
3293         if ( aExp || aSig ) float_set_inexact();
3294         return 0;
3295     }
3296     shiftCount = 0x403E - aExp;
3297     savedASig = aSig;
3298     aSig >>= shiftCount;
3299     z = aSig;
3300     if ( aSign ) z = - z;
3301     if ( ( z < 0 ) ^ aSign ) {
3302  invalid:
3303         float_raise( float_flag_invalid );
3304         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3305     }
3306     if ( ( aSig<<shiftCount ) != savedASig ) {
3307         float_set_inexact();
3308     }
3309     return z;
3310 
3311 }
3312 
3313 /*
3314 -------------------------------------------------------------------------------
3315 Returns the result of converting the extended double-precision floating-
3316 point value `a' to the 64-bit two's complement integer format.  The
3317 conversion is performed according to the IEC/IEEE Standard for Binary
3318 Floating-Point Arithmetic---which means in particular that the conversion
3319 is rounded according to the current rounding mode.  If `a' is a NaN,
3320 the largest positive integer is returned.  Otherwise, if the conversion
3321 overflows, the largest integer with the same sign as `a' is returned.
3322 -------------------------------------------------------------------------------
3323 */
3324 int64 floatx80_to_int64( floatx80 a )
3325 {
3326     flag aSign;
3327     int32 aExp, shiftCount;
3328     bits64 aSig, aSigExtra;
3329 
3330     aSig = extractFloatx80Frac( a );
3331     aExp = extractFloatx80Exp( a );
3332     aSign = extractFloatx80Sign( a );
3333     shiftCount = 0x403E - aExp;
3334     if ( shiftCount <= 0 ) {
3335         if ( shiftCount ) {
3336             float_raise( float_flag_invalid );
3337             if (    ! aSign
3338                  || (    ( aExp == 0x7FFF )
3339                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3340                ) {
3341                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3342             }
3343             return (sbits64) LIT64( 0x8000000000000000 );
3344         }
3345         aSigExtra = 0;
3346     }
3347     else {
3348         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3349     }
3350     return roundAndPackInt64( aSign, aSig, aSigExtra );
3351 
3352 }
3353 
3354 /*
3355 -------------------------------------------------------------------------------
3356 Returns the result of converting the extended double-precision floating-
3357 point value `a' to the 64-bit two's complement integer format.  The
3358 conversion is performed according to the IEC/IEEE Standard for Binary
3359 Floating-Point Arithmetic, except that the conversion is always rounded
3360 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3361 Otherwise, if the conversion overflows, the largest integer with the same
3362 sign as `a' is returned.
3363 -------------------------------------------------------------------------------
3364 */
3365 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3366 {
3367     flag aSign;
3368     int32 aExp, shiftCount;
3369     bits64 aSig;
3370     int64 z;
3371 
3372     aSig = extractFloatx80Frac( a );
3373     aExp = extractFloatx80Exp( a );
3374     aSign = extractFloatx80Sign( a );
3375     shiftCount = aExp - 0x403E;
3376     if ( 0 <= shiftCount ) {
3377         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3378         if ( ( a.high != 0xC03E ) || aSig ) {
3379             float_raise( float_flag_invalid );
3380             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3381                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3382             }
3383         }
3384         return (sbits64) LIT64( 0x8000000000000000 );
3385     }
3386     else if ( aExp < 0x3FFF ) {
3387         if ( aExp | aSig ) float_set_inexact();
3388         return 0;
3389     }
3390     z = aSig>>( - shiftCount );
3391     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3392         float_set_inexact();
3393     }
3394     if ( aSign ) z = - z;
3395     return z;
3396 
3397 }
3398 
3399 /*
3400 -------------------------------------------------------------------------------
3401 Returns the result of converting the extended double-precision floating-
3402 point value `a' to the single-precision floating-point format.  The
3403 conversion is performed according to the IEC/IEEE Standard for Binary
3404 Floating-Point Arithmetic.
3405 -------------------------------------------------------------------------------
3406 */
3407 float32 floatx80_to_float32( floatx80 a )
3408 {
3409     flag aSign;
3410     int32 aExp;
3411     bits64 aSig;
3412 
3413     aSig = extractFloatx80Frac( a );
3414     aExp = extractFloatx80Exp( a );
3415     aSign = extractFloatx80Sign( a );
3416     if ( aExp == 0x7FFF ) {
3417         if ( (bits64) ( aSig<<1 ) ) {
3418             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3419         }
3420         return packFloat32( aSign, 0xFF, 0 );
3421     }
3422     shift64RightJamming( aSig, 33, &aSig );
3423     if ( aExp || aSig ) aExp -= 0x3F81;
3424     return roundAndPackFloat32( aSign, aExp, aSig );
3425 
3426 }
3427 
3428 /*
3429 -------------------------------------------------------------------------------
3430 Returns the result of converting the extended double-precision floating-
3431 point value `a' to the double-precision floating-point format.  The
3432 conversion is performed according to the IEC/IEEE Standard for Binary
3433 Floating-Point Arithmetic.
3434 -------------------------------------------------------------------------------
3435 */
3436 float64 floatx80_to_float64( floatx80 a )
3437 {
3438     flag aSign;
3439     int32 aExp;
3440     bits64 aSig, zSig;
3441 
3442     aSig = extractFloatx80Frac( a );
3443     aExp = extractFloatx80Exp( a );
3444     aSign = extractFloatx80Sign( a );
3445     if ( aExp == 0x7FFF ) {
3446         if ( (bits64) ( aSig<<1 ) ) {
3447             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3448         }
3449         return packFloat64( aSign, 0x7FF, 0 );
3450     }
3451     shift64RightJamming( aSig, 1, &zSig );
3452     if ( aExp || aSig ) aExp -= 0x3C01;
3453     return roundAndPackFloat64( aSign, aExp, zSig );
3454 
3455 }
3456 
3457 #ifdef FLOAT128
3458 
3459 /*
3460 -------------------------------------------------------------------------------
3461 Returns the result of converting the extended double-precision floating-
3462 point value `a' to the quadruple-precision floating-point format.  The
3463 conversion is performed according to the IEC/IEEE Standard for Binary
3464 Floating-Point Arithmetic.
3465 -------------------------------------------------------------------------------
3466 */
3467 float128 floatx80_to_float128( floatx80 a )
3468 {
3469     flag aSign;
3470     int16 aExp;
3471     bits64 aSig, zSig0, zSig1;
3472 
3473     aSig = extractFloatx80Frac( a );
3474     aExp = extractFloatx80Exp( a );
3475     aSign = extractFloatx80Sign( a );
3476     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3477         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3478     }
3479     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3480     return packFloat128( aSign, aExp, zSig0, zSig1 );
3481 
3482 }
3483 
3484 #endif
3485 
3486 /*
3487 -------------------------------------------------------------------------------
3488 Rounds the extended double-precision floating-point value `a' to an integer,
3489 and returns the result as an extended quadruple-precision floating-point
3490 value.  The operation is performed according to the IEC/IEEE Standard for
3491 Binary Floating-Point Arithmetic.
3492 -------------------------------------------------------------------------------
3493 */
3494 floatx80 floatx80_round_to_int( floatx80 a )
3495 {
3496     flag aSign;
3497     int32 aExp;
3498     bits64 lastBitMask, roundBitsMask;
3499     int8 roundingMode;
3500     floatx80 z;
3501 
3502     aExp = extractFloatx80Exp( a );
3503     if ( 0x403E <= aExp ) {
3504         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3505             return propagateFloatx80NaN( a, a );
3506         }
3507         return a;
3508     }
3509     if ( aExp < 0x3FFF ) {
3510         if (    ( aExp == 0 )
3511              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3512             return a;
3513         }
3514         float_set_inexact();
3515         aSign = extractFloatx80Sign( a );
3516         switch ( float_rounding_mode() ) {
3517          case float_round_nearest_even:
3518             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3519                ) {
3520                 return
3521                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3522             }
3523             break;
3524          case float_round_down:
3525             return
3526                   aSign ?
3527                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3528                 : packFloatx80( 0, 0, 0 );
3529          case float_round_up:
3530             return
3531                   aSign ? packFloatx80( 1, 0, 0 )
3532                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3533         }
3534         return packFloatx80( aSign, 0, 0 );
3535     }
3536     lastBitMask = 1;
3537     lastBitMask <<= 0x403E - aExp;
3538     roundBitsMask = lastBitMask - 1;
3539     z = a;
3540     roundingMode = float_rounding_mode();
3541     if ( roundingMode == float_round_nearest_even ) {
3542         z.low += lastBitMask>>1;
3543         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3544     }
3545     else if ( roundingMode != float_round_to_zero ) {
3546         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3547             z.low += roundBitsMask;
3548         }
3549     }
3550     z.low &= ~ roundBitsMask;
3551     if ( z.low == 0 ) {
3552         ++z.high;
3553         z.low = LIT64( 0x8000000000000000 );
3554     }
3555     if ( z.low != a.low ) float_set_inexact();
3556     return z;
3557 
3558 }
3559 
3560 /*
3561 -------------------------------------------------------------------------------
3562 Returns the result of adding the absolute values of the extended double-
3563 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3564 negated before being returned.  `zSign' is ignored if the result is a NaN.
3565 The addition is performed according to the IEC/IEEE Standard for Binary
3566 Floating-Point Arithmetic.
3567 -------------------------------------------------------------------------------
3568 */
3569 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3570 {
3571     int32 aExp, bExp, zExp;
3572     bits64 aSig, bSig, zSig0, zSig1;
3573     int32 expDiff;
3574 
3575     aSig = extractFloatx80Frac( a );
3576     aExp = extractFloatx80Exp( a );
3577     bSig = extractFloatx80Frac( b );
3578     bExp = extractFloatx80Exp( b );
3579     expDiff = aExp - bExp;
3580     if ( 0 < expDiff ) {
3581         if ( aExp == 0x7FFF ) {
3582             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3583             return a;
3584         }
3585         if ( bExp == 0 ) --expDiff;
3586         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3587         zExp = aExp;
3588     }
3589     else if ( expDiff < 0 ) {
3590         if ( bExp == 0x7FFF ) {
3591             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3592             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3593         }
3594         if ( aExp == 0 ) ++expDiff;
3595         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3596         zExp = bExp;
3597     }
3598     else {
3599         if ( aExp == 0x7FFF ) {
3600             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3601                 return propagateFloatx80NaN( a, b );
3602             }
3603             return a;
3604         }
3605         zSig1 = 0;
3606         zSig0 = aSig + bSig;
3607         if ( aExp == 0 ) {
3608             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3609             goto roundAndPack;
3610         }
3611         zExp = aExp;
3612         goto shiftRight1;
3613     }
3614     zSig0 = aSig + bSig;
3615     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3616  shiftRight1:
3617     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3618     zSig0 |= LIT64( 0x8000000000000000 );
3619     ++zExp;
3620  roundAndPack:
3621     return
3622         roundAndPackFloatx80(
3623             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3624 
3625 }
3626 
3627 /*
3628 -------------------------------------------------------------------------------
3629 Returns the result of subtracting the absolute values of the extended
3630 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3631 difference is negated before being returned.  `zSign' is ignored if the
3632 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3633 Standard for Binary Floating-Point Arithmetic.
3634 -------------------------------------------------------------------------------
3635 */
3636 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3637 {
3638     int32 aExp, bExp, zExp;
3639     bits64 aSig, bSig, zSig0, zSig1;
3640     int32 expDiff;
3641     floatx80 z;
3642 
3643     aSig = extractFloatx80Frac( a );
3644     aExp = extractFloatx80Exp( a );
3645     bSig = extractFloatx80Frac( b );
3646     bExp = extractFloatx80Exp( b );
3647     expDiff = aExp - bExp;
3648     if ( 0 < expDiff ) goto aExpBigger;
3649     if ( expDiff < 0 ) goto bExpBigger;
3650     if ( aExp == 0x7FFF ) {
3651         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3652             return propagateFloatx80NaN( a, b );
3653         }
3654         float_raise( float_flag_invalid );
3655         z.low = floatx80_default_nan_low;
3656         z.high = floatx80_default_nan_high;
3657         return z;
3658     }
3659     if ( aExp == 0 ) {
3660         aExp = 1;
3661         bExp = 1;
3662     }
3663     zSig1 = 0;
3664     if ( bSig < aSig ) goto aBigger;
3665     if ( aSig < bSig ) goto bBigger;
3666     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3667  bExpBigger:
3668     if ( bExp == 0x7FFF ) {
3669         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3670         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3671     }
3672     if ( aExp == 0 ) ++expDiff;
3673     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3674  bBigger:
3675     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3676     zExp = bExp;
3677     zSign ^= 1;
3678     goto normalizeRoundAndPack;
3679  aExpBigger:
3680     if ( aExp == 0x7FFF ) {
3681         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3682         return a;
3683     }
3684     if ( bExp == 0 ) --expDiff;
3685     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3686  aBigger:
3687     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3688     zExp = aExp;
3689  normalizeRoundAndPack:
3690     return
3691         normalizeRoundAndPackFloatx80(
3692             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3693 
3694 }
3695 
3696 /*
3697 -------------------------------------------------------------------------------
3698 Returns the result of adding the extended double-precision floating-point
3699 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3700 Standard for Binary Floating-Point Arithmetic.
3701 -------------------------------------------------------------------------------
3702 */
3703 floatx80 floatx80_add( floatx80 a, floatx80 b )
3704 {
3705     flag aSign, bSign;
3706 
3707     aSign = extractFloatx80Sign( a );
3708     bSign = extractFloatx80Sign( b );
3709     if ( aSign == bSign ) {
3710         return addFloatx80Sigs( a, b, aSign );
3711     }
3712     else {
3713         return subFloatx80Sigs( a, b, aSign );
3714     }
3715 
3716 }
3717 
3718 /*
3719 -------------------------------------------------------------------------------
3720 Returns the result of subtracting the extended double-precision floating-
3721 point values `a' and `b'.  The operation is performed according to the
3722 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3723 -------------------------------------------------------------------------------
3724 */
3725 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3726 {
3727     flag aSign, bSign;
3728 
3729     aSign = extractFloatx80Sign( a );
3730     bSign = extractFloatx80Sign( b );
3731     if ( aSign == bSign ) {
3732         return subFloatx80Sigs( a, b, aSign );
3733     }
3734     else {
3735         return addFloatx80Sigs( a, b, aSign );
3736     }
3737 
3738 }
3739 
3740 /*
3741 -------------------------------------------------------------------------------
3742 Returns the result of multiplying the extended double-precision floating-
3743 point values `a' and `b'.  The operation is performed according to the
3744 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3745 -------------------------------------------------------------------------------
3746 */
3747 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3748 {
3749     flag aSign, bSign, zSign;
3750     int32 aExp, bExp, zExp;
3751     bits64 aSig, bSig, zSig0, zSig1;
3752     floatx80 z;
3753 
3754     aSig = extractFloatx80Frac( a );
3755     aExp = extractFloatx80Exp( a );
3756     aSign = extractFloatx80Sign( a );
3757     bSig = extractFloatx80Frac( b );
3758     bExp = extractFloatx80Exp( b );
3759     bSign = extractFloatx80Sign( b );
3760     zSign = aSign ^ bSign;
3761     if ( aExp == 0x7FFF ) {
3762         if (    (bits64) ( aSig<<1 )
3763              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3764             return propagateFloatx80NaN( a, b );
3765         }
3766         if ( ( bExp | bSig ) == 0 ) goto invalid;
3767         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3768     }
3769     if ( bExp == 0x7FFF ) {
3770         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3771         if ( ( aExp | aSig ) == 0 ) {
3772  invalid:
3773             float_raise( float_flag_invalid );
3774             z.low = floatx80_default_nan_low;
3775             z.high = floatx80_default_nan_high;
3776             return z;
3777         }
3778         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3779     }
3780     if ( aExp == 0 ) {
3781         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3782         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3783     }
3784     if ( bExp == 0 ) {
3785         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3786         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3787     }
3788     zExp = aExp + bExp - 0x3FFE;
3789     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3790     if ( 0 < (sbits64) zSig0 ) {
3791         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3792         --zExp;
3793     }
3794     return
3795         roundAndPackFloatx80(
3796             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3797 
3798 }
3799 
3800 /*
3801 -------------------------------------------------------------------------------
3802 Returns the result of dividing the extended double-precision floating-point
3803 value `a' by the corresponding value `b'.  The operation is performed
3804 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3805 -------------------------------------------------------------------------------
3806 */
3807 floatx80 floatx80_div( floatx80 a, floatx80 b )
3808 {
3809     flag aSign, bSign, zSign;
3810     int32 aExp, bExp, zExp;
3811     bits64 aSig, bSig, zSig0, zSig1;
3812     bits64 rem0, rem1, rem2, term0, term1, term2;
3813     floatx80 z;
3814 
3815     aSig = extractFloatx80Frac( a );
3816     aExp = extractFloatx80Exp( a );
3817     aSign = extractFloatx80Sign( a );
3818     bSig = extractFloatx80Frac( b );
3819     bExp = extractFloatx80Exp( b );
3820     bSign = extractFloatx80Sign( b );
3821     zSign = aSign ^ bSign;
3822     if ( aExp == 0x7FFF ) {
3823         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3824         if ( bExp == 0x7FFF ) {
3825             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3826             goto invalid;
3827         }
3828         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3829     }
3830     if ( bExp == 0x7FFF ) {
3831         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3832         return packFloatx80( zSign, 0, 0 );
3833     }
3834     if ( bExp == 0 ) {
3835         if ( bSig == 0 ) {
3836             if ( ( aExp | aSig ) == 0 ) {
3837  invalid:
3838                 float_raise( float_flag_invalid );
3839                 z.low = floatx80_default_nan_low;
3840                 z.high = floatx80_default_nan_high;
3841                 return z;
3842             }
3843             float_raise( float_flag_divbyzero );
3844             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3845         }
3846         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3847     }
3848     if ( aExp == 0 ) {
3849         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3850         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3851     }
3852     zExp = aExp - bExp + 0x3FFE;
3853     rem1 = 0;
3854     if ( bSig <= aSig ) {
3855         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3856         ++zExp;
3857     }
3858     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3859     mul64To128( bSig, zSig0, &term0, &term1 );
3860     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3861     while ( (sbits64) rem0 < 0 ) {
3862         --zSig0;
3863         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3864     }
3865     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3866     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3867         mul64To128( bSig, zSig1, &term1, &term2 );
3868         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3869         while ( (sbits64) rem1 < 0 ) {
3870             --zSig1;
3871             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3872         }
3873         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3874     }
3875     return
3876         roundAndPackFloatx80(
3877             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3878 
3879 }
3880 
3881 /*
3882 -------------------------------------------------------------------------------
3883 Returns the remainder of the extended double-precision floating-point value
3884 `a' with respect to the corresponding value `b'.  The operation is performed
3885 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3886 -------------------------------------------------------------------------------
3887 */
3888 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3889 {
3890     flag aSign, bSign, zSign;
3891     int32 aExp, bExp, expDiff;
3892     bits64 aSig0, aSig1, bSig;
3893     bits64 q, term0, term1, alternateASig0, alternateASig1;
3894     floatx80 z;
3895 
3896     aSig0 = extractFloatx80Frac( a );
3897     aExp = extractFloatx80Exp( a );
3898     aSign = extractFloatx80Sign( a );
3899     bSig = extractFloatx80Frac( b );
3900     bExp = extractFloatx80Exp( b );
3901     bSign = extractFloatx80Sign( b );
3902     if ( aExp == 0x7FFF ) {
3903         if (    (bits64) ( aSig0<<1 )
3904              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3905             return propagateFloatx80NaN( a, b );
3906         }
3907         goto invalid;
3908     }
3909     if ( bExp == 0x7FFF ) {
3910         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3911         return a;
3912     }
3913     if ( bExp == 0 ) {
3914         if ( bSig == 0 ) {
3915  invalid:
3916             float_raise( float_flag_invalid );
3917             z.low = floatx80_default_nan_low;
3918             z.high = floatx80_default_nan_high;
3919             return z;
3920         }
3921         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3922     }
3923     if ( aExp == 0 ) {
3924         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3925         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3926     }
3927     bSig |= LIT64( 0x8000000000000000 );
3928     zSign = aSign;
3929     expDiff = aExp - bExp;
3930     aSig1 = 0;
3931     if ( expDiff < 0 ) {
3932         if ( expDiff < -1 ) return a;
3933         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3934         expDiff = 0;
3935     }
3936     q = ( bSig <= aSig0 );
3937     if ( q ) aSig0 -= bSig;
3938     expDiff -= 64;
3939     while ( 0 < expDiff ) {
3940         q = estimateDiv128To64( aSig0, aSig1, bSig );
3941         q = ( 2 < q ) ? q - 2 : 0;
3942         mul64To128( bSig, q, &term0, &term1 );
3943         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3944         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3945         expDiff -= 62;
3946     }
3947     expDiff += 64;
3948     if ( 0 < expDiff ) {
3949         q = estimateDiv128To64( aSig0, aSig1, bSig );
3950         q = ( 2 < q ) ? q - 2 : 0;
3951         q >>= 64 - expDiff;
3952         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3953         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3954         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3955         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3956             ++q;
3957             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3958         }
3959     }
3960     else {
3961         term1 = 0;
3962         term0 = bSig;
3963     }
3964     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3965     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3966          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3967               && ( q & 1 ) )
3968        ) {
3969         aSig0 = alternateASig0;
3970         aSig1 = alternateASig1;
3971         zSign = ! zSign;
3972     }
3973     return
3974         normalizeRoundAndPackFloatx80(
3975             80, zSign, bExp + expDiff, aSig0, aSig1 );
3976 
3977 }
3978 
3979 /*
3980 -------------------------------------------------------------------------------
3981 Returns the square root of the extended double-precision floating-point
3982 value `a'.  The operation is performed according to the IEC/IEEE Standard
3983 for Binary Floating-Point Arithmetic.
3984 -------------------------------------------------------------------------------
3985 */
3986 floatx80 floatx80_sqrt( floatx80 a )
3987 {
3988     flag aSign;
3989     int32 aExp, zExp;
3990     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3991     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3992     floatx80 z;
3993 
3994     aSig0 = extractFloatx80Frac( a );
3995     aExp = extractFloatx80Exp( a );
3996     aSign = extractFloatx80Sign( a );
3997     if ( aExp == 0x7FFF ) {
3998         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3999         if ( ! aSign ) return a;
4000         goto invalid;
4001     }
4002     if ( aSign ) {
4003         if ( ( aExp | aSig0 ) == 0 ) return a;
4004  invalid:
4005         float_raise( float_flag_invalid );
4006         z.low = floatx80_default_nan_low;
4007         z.high = floatx80_default_nan_high;
4008         return z;
4009     }
4010     if ( aExp == 0 ) {
4011         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4012         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4013     }
4014     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4015     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4016     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4017     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4018     doubleZSig0 = zSig0<<1;
4019     mul64To128( zSig0, zSig0, &term0, &term1 );
4020     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4021     while ( (sbits64) rem0 < 0 ) {
4022         --zSig0;
4023         doubleZSig0 -= 2;
4024         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4025     }
4026     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4027     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4028         if ( zSig1 == 0 ) zSig1 = 1;
4029         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4030         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4031         mul64To128( zSig1, zSig1, &term2, &term3 );
4032         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4033         while ( (sbits64) rem1 < 0 ) {
4034             --zSig1;
4035             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4036             term3 |= 1;
4037             term2 |= doubleZSig0;
4038             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4039         }
4040         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4041     }
4042     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4043     zSig0 |= doubleZSig0;
4044     return
4045         roundAndPackFloatx80(
4046             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4047 
4048 }
4049 
4050 /*
4051 -------------------------------------------------------------------------------
4052 Returns 1 if the extended double-precision floating-point value `a' is
4053 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4054 performed according to the IEC/IEEE Standard for Binary Floating-Point
4055 Arithmetic.
4056 -------------------------------------------------------------------------------
4057 */
4058 flag floatx80_eq( floatx80 a, floatx80 b )
4059 {
4060 
4061     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4062               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4063          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4064               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4065        ) {
4066         if (    floatx80_is_signaling_nan( a )
4067              || floatx80_is_signaling_nan( b ) ) {
4068             float_raise( float_flag_invalid );
4069         }
4070         return 0;
4071     }
4072     return
4073            ( a.low == b.low )
4074         && (    ( a.high == b.high )
4075              || (    ( a.low == 0 )
4076                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4077            );
4078 
4079 }
4080 
4081 /*
4082 -------------------------------------------------------------------------------
4083 Returns 1 if the extended double-precision floating-point value `a' is
4084 less than or equal to the corresponding value `b', and 0 otherwise.  The
4085 comparison is performed according to the IEC/IEEE Standard for Binary
4086 Floating-Point Arithmetic.
4087 -------------------------------------------------------------------------------
4088 */
4089 flag floatx80_le( floatx80 a, floatx80 b )
4090 {
4091     flag aSign, bSign;
4092 
4093     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4094               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4095          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4096               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4097        ) {
4098         float_raise( float_flag_invalid );
4099         return 0;
4100     }
4101     aSign = extractFloatx80Sign( a );
4102     bSign = extractFloatx80Sign( b );
4103     if ( aSign != bSign ) {
4104         return
4105                aSign
4106             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4107                  == 0 );
4108     }
4109     return
4110           aSign ? le128( b.high, b.low, a.high, a.low )
4111         : le128( a.high, a.low, b.high, b.low );
4112 
4113 }
4114 
4115 /*
4116 -------------------------------------------------------------------------------
4117 Returns 1 if the extended double-precision floating-point value `a' is
4118 less than the corresponding value `b', and 0 otherwise.  The comparison
4119 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4120 Arithmetic.
4121 -------------------------------------------------------------------------------
4122 */
4123 flag floatx80_lt( floatx80 a, floatx80 b )
4124 {
4125     flag aSign, bSign;
4126 
4127     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4128               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4129          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4130               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4131        ) {
4132         float_raise( float_flag_invalid );
4133         return 0;
4134     }
4135     aSign = extractFloatx80Sign( a );
4136     bSign = extractFloatx80Sign( b );
4137     if ( aSign != bSign ) {
4138         return
4139                aSign
4140             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4141                  != 0 );
4142     }
4143     return
4144           aSign ? lt128( b.high, b.low, a.high, a.low )
4145         : lt128( a.high, a.low, b.high, b.low );
4146 
4147 }
4148 
4149 /*
4150 -------------------------------------------------------------------------------
4151 Returns 1 if the extended double-precision floating-point value `a' is equal
4152 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4153 raised if either operand is a NaN.  Otherwise, the comparison is performed
4154 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4155 -------------------------------------------------------------------------------
4156 */
4157 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4158 {
4159 
4160     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4161               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4162          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4163               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4164        ) {
4165         float_raise( float_flag_invalid );
4166         return 0;
4167     }
4168     return
4169            ( a.low == b.low )
4170         && (    ( a.high == b.high )
4171              || (    ( a.low == 0 )
4172                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4173            );
4174 
4175 }
4176 
4177 /*
4178 -------------------------------------------------------------------------------
4179 Returns 1 if the extended double-precision floating-point value `a' is less
4180 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4181 do not cause an exception.  Otherwise, the comparison is performed according
4182 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4183 -------------------------------------------------------------------------------
4184 */
4185 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4186 {
4187     flag aSign, bSign;
4188 
4189     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4190               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4191          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4192               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4193        ) {
4194         if (    floatx80_is_signaling_nan( a )
4195              || floatx80_is_signaling_nan( b ) ) {
4196             float_raise( float_flag_invalid );
4197         }
4198         return 0;
4199     }
4200     aSign = extractFloatx80Sign( a );
4201     bSign = extractFloatx80Sign( b );
4202     if ( aSign != bSign ) {
4203         return
4204                aSign
4205             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4206                  == 0 );
4207     }
4208     return
4209           aSign ? le128( b.high, b.low, a.high, a.low )
4210         : le128( a.high, a.low, b.high, b.low );
4211 
4212 }
4213 
4214 /*
4215 -------------------------------------------------------------------------------
4216 Returns 1 if the extended double-precision floating-point value `a' is less
4217 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4218 an exception.  Otherwise, the comparison is performed according to the
4219 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4220 -------------------------------------------------------------------------------
4221 */
4222 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4223 {
4224     flag aSign, bSign;
4225 
4226     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4227               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4228          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4229               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4230        ) {
4231         if (    floatx80_is_signaling_nan( a )
4232              || floatx80_is_signaling_nan( b ) ) {
4233             float_raise( float_flag_invalid );
4234         }
4235         return 0;
4236     }
4237     aSign = extractFloatx80Sign( a );
4238     bSign = extractFloatx80Sign( b );
4239     if ( aSign != bSign ) {
4240         return
4241                aSign
4242             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4243                  != 0 );
4244     }
4245     return
4246           aSign ? lt128( b.high, b.low, a.high, a.low )
4247         : lt128( a.high, a.low, b.high, b.low );
4248 
4249 }
4250 
4251 #endif
4252 
4253 #ifdef FLOAT128
4254 
4255 /*
4256 -------------------------------------------------------------------------------
4257 Returns the result of converting the quadruple-precision floating-point
4258 value `a' to the 32-bit two's complement integer format.  The conversion
4259 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4260 Arithmetic---which means in particular that the conversion is rounded
4261 according to the current rounding mode.  If `a' is a NaN, the largest
4262 positive integer is returned.  Otherwise, if the conversion overflows, the
4263 largest integer with the same sign as `a' is returned.
4264 -------------------------------------------------------------------------------
4265 */
4266 int32 float128_to_int32( float128 a )
4267 {
4268     flag aSign;
4269     int32 aExp, shiftCount;
4270     bits64 aSig0, aSig1;
4271 
4272     aSig1 = extractFloat128Frac1( a );
4273     aSig0 = extractFloat128Frac0( a );
4274     aExp = extractFloat128Exp( a );
4275     aSign = extractFloat128Sign( a );
4276     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4277     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4278     aSig0 |= ( aSig1 != 0 );
4279     shiftCount = 0x4028 - aExp;
4280     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4281     return roundAndPackInt32( aSign, aSig0 );
4282 
4283 }
4284 
4285 /*
4286 -------------------------------------------------------------------------------
4287 Returns the result of converting the quadruple-precision floating-point
4288 value `a' to the 32-bit two's complement integer format.  The conversion
4289 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4290 Arithmetic, except that the conversion is always rounded toward zero.  If
4291 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4292 conversion overflows, the largest integer with the same sign as `a' is
4293 returned.
4294 -------------------------------------------------------------------------------
4295 */
4296 int32 float128_to_int32_round_to_zero( float128 a )
4297 {
4298     flag aSign;
4299     int32 aExp, shiftCount;
4300     bits64 aSig0, aSig1, savedASig;
4301     int32 z;
4302 
4303     aSig1 = extractFloat128Frac1( a );
4304     aSig0 = extractFloat128Frac0( a );
4305     aExp = extractFloat128Exp( a );
4306     aSign = extractFloat128Sign( a );
4307     aSig0 |= ( aSig1 != 0 );
4308     if ( 0x401E < aExp ) {
4309         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4310         goto invalid;
4311     }
4312     else if ( aExp < 0x3FFF ) {
4313         if ( aExp || aSig0 ) float_set_inexact();
4314         return 0;
4315     }
4316     aSig0 |= LIT64( 0x0001000000000000 );
4317     shiftCount = 0x402F - aExp;
4318     savedASig = aSig0;
4319     aSig0 >>= shiftCount;
4320     z = aSig0;
4321     if ( aSign ) z = - z;
4322     if ( ( z < 0 ) ^ aSign ) {
4323  invalid:
4324         float_raise( float_flag_invalid );
4325         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4326     }
4327     if ( ( aSig0<<shiftCount ) != savedASig ) {
4328         float_set_inexact();
4329     }
4330     return z;
4331 
4332 }
4333 
4334 /*
4335 -------------------------------------------------------------------------------
4336 Returns the result of converting the quadruple-precision floating-point
4337 value `a' to the 64-bit two's complement integer format.  The conversion
4338 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4339 Arithmetic---which means in particular that the conversion is rounded
4340 according to the current rounding mode.  If `a' is a NaN, the largest
4341 positive integer is returned.  Otherwise, if the conversion overflows, the
4342 largest integer with the same sign as `a' is returned.
4343 -------------------------------------------------------------------------------
4344 */
4345 int64 float128_to_int64( float128 a )
4346 {
4347     flag aSign;
4348     int32 aExp, shiftCount;
4349     bits64 aSig0, aSig1;
4350 
4351     aSig1 = extractFloat128Frac1( a );
4352     aSig0 = extractFloat128Frac0( a );
4353     aExp = extractFloat128Exp( a );
4354     aSign = extractFloat128Sign( a );
4355     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4356     shiftCount = 0x402F - aExp;
4357     if ( shiftCount <= 0 ) {
4358         if ( 0x403E < aExp ) {
4359             float_raise( float_flag_invalid );
4360             if (    ! aSign
4361                  || (    ( aExp == 0x7FFF )
4362                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4363                     )
4364                ) {
4365                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4366             }
4367             return (sbits64) LIT64( 0x8000000000000000 );
4368         }
4369         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4370     }
4371     else {
4372         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4373     }
4374     return roundAndPackInt64( aSign, aSig0, aSig1 );
4375 
4376 }
4377 
4378 /*
4379 -------------------------------------------------------------------------------
4380 Returns the result of converting the quadruple-precision floating-point
4381 value `a' to the 64-bit two's complement integer format.  The conversion
4382 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4383 Arithmetic, except that the conversion is always rounded toward zero.
4384 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4385 the conversion overflows, the largest integer with the same sign as `a' is
4386 returned.
4387 -------------------------------------------------------------------------------
4388 */
4389 int64 float128_to_int64_round_to_zero( float128 a )
4390 {
4391     flag aSign;
4392     int32 aExp, shiftCount;
4393     bits64 aSig0, aSig1;
4394     int64 z;
4395 
4396     aSig1 = extractFloat128Frac1( a );
4397     aSig0 = extractFloat128Frac0( a );
4398     aExp = extractFloat128Exp( a );
4399     aSign = extractFloat128Sign( a );
4400     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4401     shiftCount = aExp - 0x402F;
4402     if ( 0 < shiftCount ) {
4403         if ( 0x403E <= aExp ) {
4404             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4405             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4406                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4407                 if ( aSig1 ) float_set_inexact();
4408             }
4409             else {
4410                 float_raise( float_flag_invalid );
4411                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4412                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4413                 }
4414             }
4415             return (sbits64) LIT64( 0x8000000000000000 );
4416         }
4417         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4418         if ( (bits64) ( aSig1<<shiftCount ) ) {
4419             float_set_inexact();
4420         }
4421     }
4422     else {
4423         if ( aExp < 0x3FFF ) {
4424             if ( aExp | aSig0 | aSig1 ) {
4425                 float_set_inexact();
4426             }
4427             return 0;
4428         }
4429         z = aSig0>>( - shiftCount );
4430         if (    aSig1
4431              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4432             float_set_inexact();
4433         }
4434     }
4435     if ( aSign ) z = - z;
4436     return z;
4437 
4438 }
4439 
4440 /*
4441 -------------------------------------------------------------------------------
4442 Returns the result of converting the quadruple-precision floating-point
4443 value `a' to the single-precision floating-point format.  The conversion
4444 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4445 Arithmetic.
4446 -------------------------------------------------------------------------------
4447 */
4448 float32 float128_to_float32( float128 a )
4449 {
4450     flag aSign;
4451     int32 aExp;
4452     bits64 aSig0, aSig1;
4453     bits32 zSig;
4454 
4455     aSig1 = extractFloat128Frac1( a );
4456     aSig0 = extractFloat128Frac0( a );
4457     aExp = extractFloat128Exp( a );
4458     aSign = extractFloat128Sign( a );
4459     if ( aExp == 0x7FFF ) {
4460         if ( aSig0 | aSig1 ) {
4461             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4462         }
4463         return packFloat32( aSign, 0xFF, 0 );
4464     }
4465     aSig0 |= ( aSig1 != 0 );
4466     shift64RightJamming( aSig0, 18, &aSig0 );
4467     zSig = aSig0;
4468     if ( aExp || zSig ) {
4469         zSig |= 0x40000000;
4470         aExp -= 0x3F81;
4471     }
4472     return roundAndPackFloat32( aSign, aExp, zSig );
4473 
4474 }
4475 
4476 /*
4477 -------------------------------------------------------------------------------
4478 Returns the result of converting the quadruple-precision floating-point
4479 value `a' to the double-precision floating-point format.  The conversion
4480 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4481 Arithmetic.
4482 -------------------------------------------------------------------------------
4483 */
4484 float64 float128_to_float64( float128 a )
4485 {
4486     flag aSign;
4487     int32 aExp;
4488     bits64 aSig0, aSig1;
4489 
4490     aSig1 = extractFloat128Frac1( a );
4491     aSig0 = extractFloat128Frac0( a );
4492     aExp = extractFloat128Exp( a );
4493     aSign = extractFloat128Sign( a );
4494     if ( aExp == 0x7FFF ) {
4495         if ( aSig0 | aSig1 ) {
4496             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4497         }
4498         return packFloat64( aSign, 0x7FF, 0 );
4499     }
4500     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4501     aSig0 |= ( aSig1 != 0 );
4502     if ( aExp || aSig0 ) {
4503         aSig0 |= LIT64( 0x4000000000000000 );
4504         aExp -= 0x3C01;
4505     }
4506     return roundAndPackFloat64( aSign, aExp, aSig0 );
4507 
4508 }
4509 
4510 #ifdef FLOATX80
4511 
4512 /*
4513 -------------------------------------------------------------------------------
4514 Returns the result of converting the quadruple-precision floating-point
4515 value `a' to the extended double-precision floating-point format.  The
4516 conversion is performed according to the IEC/IEEE Standard for Binary
4517 Floating-Point Arithmetic.
4518 -------------------------------------------------------------------------------
4519 */
4520 floatx80 float128_to_floatx80( float128 a )
4521 {
4522     flag aSign;
4523     int32 aExp;
4524     bits64 aSig0, aSig1;
4525 
4526     aSig1 = extractFloat128Frac1( a );
4527     aSig0 = extractFloat128Frac0( a );
4528     aExp = extractFloat128Exp( a );
4529     aSign = extractFloat128Sign( a );
4530     if ( aExp == 0x7FFF ) {
4531         if ( aSig0 | aSig1 ) {
4532             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4533         }
4534         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4535     }
4536     if ( aExp == 0 ) {
4537         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4538         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4539     }
4540     else {
4541         aSig0 |= LIT64( 0x0001000000000000 );
4542     }
4543     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4544     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4545 
4546 }
4547 
4548 #endif
4549 
4550 /*
4551 -------------------------------------------------------------------------------
4552 Rounds the quadruple-precision floating-point value `a' to an integer, and
4553 returns the result as a quadruple-precision floating-point value.  The
4554 operation is performed according to the IEC/IEEE Standard for Binary
4555 Floating-Point Arithmetic.
4556 -------------------------------------------------------------------------------
4557 */
4558 float128 float128_round_to_int( float128 a )
4559 {
4560     flag aSign;
4561     int32 aExp;
4562     bits64 lastBitMask, roundBitsMask;
4563     int8 roundingMode;
4564     float128 z;
4565 
4566     aExp = extractFloat128Exp( a );
4567     if ( 0x402F <= aExp ) {
4568         if ( 0x406F <= aExp ) {
4569             if (    ( aExp == 0x7FFF )
4570                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4571                ) {
4572                 return propagateFloat128NaN( a, a );
4573             }
4574             return a;
4575         }
4576         lastBitMask = 1;
4577         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4578         roundBitsMask = lastBitMask - 1;
4579         z = a;
4580         roundingMode = float_rounding_mode();
4581         if ( roundingMode == float_round_nearest_even ) {
4582             if ( lastBitMask ) {
4583                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4584                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4585             }
4586             else {
4587                 if ( (sbits64) z.low < 0 ) {
4588                     ++z.high;
4589                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4590                 }
4591             }
4592         }
4593         else if ( roundingMode != float_round_to_zero ) {
4594             if (   extractFloat128Sign( z )
4595                  ^ ( roundingMode == float_round_up ) ) {
4596                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4597             }
4598         }
4599         z.low &= ~ roundBitsMask;
4600     }
4601     else {
4602         if ( aExp < 0x3FFF ) {
4603             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4604             float_set_inexact();
4605             aSign = extractFloat128Sign( a );
4606             switch ( float_rounding_mode() ) {
4607              case float_round_nearest_even:
4608                 if (    ( aExp == 0x3FFE )
4609                      && (   extractFloat128Frac0( a )
4610                           | extractFloat128Frac1( a ) )
4611                    ) {
4612                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4613                 }
4614                 break;
4615              case float_round_down:
4616                 return
4617                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4618                     : packFloat128( 0, 0, 0, 0 );
4619              case float_round_up:
4620                 return
4621                       aSign ? packFloat128( 1, 0, 0, 0 )
4622                     : packFloat128( 0, 0x3FFF, 0, 0 );
4623             }
4624             return packFloat128( aSign, 0, 0, 0 );
4625         }
4626         lastBitMask = 1;
4627         lastBitMask <<= 0x402F - aExp;
4628         roundBitsMask = lastBitMask - 1;
4629         z.low = 0;
4630         z.high = a.high;
4631         roundingMode = float_rounding_mode();
4632         if ( roundingMode == float_round_nearest_even ) {
4633             z.high += lastBitMask>>1;
4634             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4635                 z.high &= ~ lastBitMask;
4636             }
4637         }
4638         else if ( roundingMode != float_round_to_zero ) {
4639             if (   extractFloat128Sign( z )
4640                  ^ ( roundingMode == float_round_up ) ) {
4641                 z.high |= ( a.low != 0 );
4642                 z.high += roundBitsMask;
4643             }
4644         }
4645         z.high &= ~ roundBitsMask;
4646     }
4647     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4648         float_set_inexact();
4649     }
4650     return z;
4651 
4652 }
4653 
4654 /*
4655 -------------------------------------------------------------------------------
4656 Returns the result of adding the absolute values of the quadruple-precision
4657 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4658 before being returned.  `zSign' is ignored if the result is a NaN.
4659 The addition is performed according to the IEC/IEEE Standard for Binary
4660 Floating-Point Arithmetic.
4661 -------------------------------------------------------------------------------
4662 */
4663 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4664 {
4665     int32 aExp, bExp, zExp;
4666     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4667     int32 expDiff;
4668 
4669     aSig1 = extractFloat128Frac1( a );
4670     aSig0 = extractFloat128Frac0( a );
4671     aExp = extractFloat128Exp( a );
4672     bSig1 = extractFloat128Frac1( b );
4673     bSig0 = extractFloat128Frac0( b );
4674     bExp = extractFloat128Exp( b );
4675     expDiff = aExp - bExp;
4676     if ( 0 < expDiff ) {
4677         if ( aExp == 0x7FFF ) {
4678             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4679             return a;
4680         }
4681         if ( bExp == 0 ) {
4682             --expDiff;
4683         }
4684         else {
4685             bSig0 |= LIT64( 0x0001000000000000 );
4686         }
4687         shift128ExtraRightJamming(
4688             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4689         zExp = aExp;
4690     }
4691     else if ( expDiff < 0 ) {
4692         if ( bExp == 0x7FFF ) {
4693             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4694             return packFloat128( zSign, 0x7FFF, 0, 0 );
4695         }
4696         if ( aExp == 0 ) {
4697             ++expDiff;
4698         }
4699         else {
4700             aSig0 |= LIT64( 0x0001000000000000 );
4701         }
4702         shift128ExtraRightJamming(
4703             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4704         zExp = bExp;
4705     }
4706     else {
4707         if ( aExp == 0x7FFF ) {
4708             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4709                 return propagateFloat128NaN( a, b );
4710             }
4711             return a;
4712         }
4713         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4714         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4715         zSig2 = 0;
4716         zSig0 |= LIT64( 0x0002000000000000 );
4717         zExp = aExp;
4718         goto shiftRight1;
4719     }
4720     aSig0 |= LIT64( 0x0001000000000000 );
4721     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4722     --zExp;
4723     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4724     ++zExp;
4725  shiftRight1:
4726     shift128ExtraRightJamming(
4727         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4728  roundAndPack:
4729     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4730 
4731 }
4732 
4733 /*
4734 -------------------------------------------------------------------------------
4735 Returns the result of subtracting the absolute values of the quadruple-
4736 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4737 difference is negated before being returned.  `zSign' is ignored if the
4738 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4739 Standard for Binary Floating-Point Arithmetic.
4740 -------------------------------------------------------------------------------
4741 */
4742 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4743 {
4744     int32 aExp, bExp, zExp;
4745     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4746     int32 expDiff;
4747     float128 z;
4748 
4749     aSig1 = extractFloat128Frac1( a );
4750     aSig0 = extractFloat128Frac0( a );
4751     aExp = extractFloat128Exp( a );
4752     bSig1 = extractFloat128Frac1( b );
4753     bSig0 = extractFloat128Frac0( b );
4754     bExp = extractFloat128Exp( b );
4755     expDiff = aExp - bExp;
4756     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4757     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4758     if ( 0 < expDiff ) goto aExpBigger;
4759     if ( expDiff < 0 ) goto bExpBigger;
4760     if ( aExp == 0x7FFF ) {
4761         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4762             return propagateFloat128NaN( a, b );
4763         }
4764         float_raise( float_flag_invalid );
4765         z.low = float128_default_nan_low;
4766         z.high = float128_default_nan_high;
4767         return z;
4768     }
4769     if ( aExp == 0 ) {
4770         aExp = 1;
4771         bExp = 1;
4772     }
4773     if ( bSig0 < aSig0 ) goto aBigger;
4774     if ( aSig0 < bSig0 ) goto bBigger;
4775     if ( bSig1 < aSig1 ) goto aBigger;
4776     if ( aSig1 < bSig1 ) goto bBigger;
4777     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4778  bExpBigger:
4779     if ( bExp == 0x7FFF ) {
4780         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4781         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4782     }
4783     if ( aExp == 0 ) {
4784         ++expDiff;
4785     }
4786     else {
4787         aSig0 |= LIT64( 0x4000000000000000 );
4788     }
4789     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4790     bSig0 |= LIT64( 0x4000000000000000 );
4791  bBigger:
4792     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4793     zExp = bExp;
4794     zSign ^= 1;
4795     goto normalizeRoundAndPack;
4796  aExpBigger:
4797     if ( aExp == 0x7FFF ) {
4798         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4799         return a;
4800     }
4801     if ( bExp == 0 ) {
4802         --expDiff;
4803     }
4804     else {
4805         bSig0 |= LIT64( 0x4000000000000000 );
4806     }
4807     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4808     aSig0 |= LIT64( 0x4000000000000000 );
4809  aBigger:
4810     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4811     zExp = aExp;
4812  normalizeRoundAndPack:
4813     --zExp;
4814     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4815 
4816 }
4817 
4818 /*
4819 -------------------------------------------------------------------------------
4820 Returns the result of adding the quadruple-precision floating-point values
4821 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4822 for Binary Floating-Point Arithmetic.
4823 -------------------------------------------------------------------------------
4824 */
4825 float128 float128_add( float128 a, float128 b )
4826 {
4827     flag aSign, bSign;
4828 
4829     aSign = extractFloat128Sign( a );
4830     bSign = extractFloat128Sign( b );
4831     if ( aSign == bSign ) {
4832         return addFloat128Sigs( a, b, aSign );
4833     }
4834     else {
4835         return subFloat128Sigs( a, b, aSign );
4836     }
4837 
4838 }
4839 
4840 /*
4841 -------------------------------------------------------------------------------
4842 Returns the result of subtracting the quadruple-precision floating-point
4843 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4844 Standard for Binary Floating-Point Arithmetic.
4845 -------------------------------------------------------------------------------
4846 */
4847 float128 float128_sub( float128 a, float128 b )
4848 {
4849     flag aSign, bSign;
4850 
4851     aSign = extractFloat128Sign( a );
4852     bSign = extractFloat128Sign( b );
4853     if ( aSign == bSign ) {
4854         return subFloat128Sigs( a, b, aSign );
4855     }
4856     else {
4857         return addFloat128Sigs( a, b, aSign );
4858     }
4859 
4860 }
4861 
4862 /*
4863 -------------------------------------------------------------------------------
4864 Returns the result of multiplying the quadruple-precision floating-point
4865 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4866 Standard for Binary Floating-Point Arithmetic.
4867 -------------------------------------------------------------------------------
4868 */
4869 float128 float128_mul( float128 a, float128 b )
4870 {
4871     flag aSign, bSign, zSign;
4872     int32 aExp, bExp, zExp;
4873     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4874     float128 z;
4875 
4876     aSig1 = extractFloat128Frac1( a );
4877     aSig0 = extractFloat128Frac0( a );
4878     aExp = extractFloat128Exp( a );
4879     aSign = extractFloat128Sign( a );
4880     bSig1 = extractFloat128Frac1( b );
4881     bSig0 = extractFloat128Frac0( b );
4882     bExp = extractFloat128Exp( b );
4883     bSign = extractFloat128Sign( b );
4884     zSign = aSign ^ bSign;
4885     if ( aExp == 0x7FFF ) {
4886         if (    ( aSig0 | aSig1 )
4887              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4888             return propagateFloat128NaN( a, b );
4889         }
4890         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4891         return packFloat128( zSign, 0x7FFF, 0, 0 );
4892     }
4893     if ( bExp == 0x7FFF ) {
4894         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4895         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4896  invalid:
4897             float_raise( float_flag_invalid );
4898             z.low = float128_default_nan_low;
4899             z.high = float128_default_nan_high;
4900             return z;
4901         }
4902         return packFloat128( zSign, 0x7FFF, 0, 0 );
4903     }
4904     if ( aExp == 0 ) {
4905         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4906         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4907     }
4908     if ( bExp == 0 ) {
4909         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4910         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4911     }
4912     zExp = aExp + bExp - 0x4000;
4913     aSig0 |= LIT64( 0x0001000000000000 );
4914     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4915     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4916     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4917     zSig2 |= ( zSig3 != 0 );
4918     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4919         shift128ExtraRightJamming(
4920             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4921         ++zExp;
4922     }
4923     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4924 
4925 }
4926 
4927 /*
4928 -------------------------------------------------------------------------------
4929 Returns the result of dividing the quadruple-precision floating-point value
4930 `a' by the corresponding value `b'.  The operation is performed according to
4931 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4932 -------------------------------------------------------------------------------
4933 */
4934 float128 float128_div( float128 a, float128 b )
4935 {
4936     flag aSign, bSign, zSign;
4937     int32 aExp, bExp, zExp;
4938     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4939     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4940     float128 z;
4941 
4942     aSig1 = extractFloat128Frac1( a );
4943     aSig0 = extractFloat128Frac0( a );
4944     aExp = extractFloat128Exp( a );
4945     aSign = extractFloat128Sign( a );
4946     bSig1 = extractFloat128Frac1( b );
4947     bSig0 = extractFloat128Frac0( b );
4948     bExp = extractFloat128Exp( b );
4949     bSign = extractFloat128Sign( b );
4950     zSign = aSign ^ bSign;
4951     if ( aExp == 0x7FFF ) {
4952         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4953         if ( bExp == 0x7FFF ) {
4954             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4955             goto invalid;
4956         }
4957         return packFloat128( zSign, 0x7FFF, 0, 0 );
4958     }
4959     if ( bExp == 0x7FFF ) {
4960         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4961         return packFloat128( zSign, 0, 0, 0 );
4962     }
4963     if ( bExp == 0 ) {
4964         if ( ( bSig0 | bSig1 ) == 0 ) {
4965             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4966  invalid:
4967                 float_raise( float_flag_invalid );
4968                 z.low = float128_default_nan_low;
4969                 z.high = float128_default_nan_high;
4970                 return z;
4971             }
4972             float_raise( float_flag_divbyzero );
4973             return packFloat128( zSign, 0x7FFF, 0, 0 );
4974         }
4975         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4976     }
4977     if ( aExp == 0 ) {
4978         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4979         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4980     }
4981     zExp = aExp - bExp + 0x3FFD;
4982     shortShift128Left(
4983         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4984     shortShift128Left(
4985         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4986     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4987         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4988         ++zExp;
4989     }
4990     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4991     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4992     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4993     while ( (sbits64) rem0 < 0 ) {
4994         --zSig0;
4995         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4996     }
4997     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4998     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4999         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5000         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5001         while ( (sbits64) rem1 < 0 ) {
5002             --zSig1;
5003             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5004         }
5005         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5006     }
5007     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5008     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5009 
5010 }
5011 
5012 /*
5013 -------------------------------------------------------------------------------
5014 Returns the remainder of the quadruple-precision floating-point value `a'
5015 with respect to the corresponding value `b'.  The operation is performed
5016 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5017 -------------------------------------------------------------------------------
5018 */
5019 float128 float128_rem( float128 a, float128 b )
5020 {
5021     flag aSign, bSign, zSign;
5022     int32 aExp, bExp, expDiff;
5023     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5024     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5025     sbits64 sigMean0;
5026     float128 z;
5027 
5028     aSig1 = extractFloat128Frac1( a );
5029     aSig0 = extractFloat128Frac0( a );
5030     aExp = extractFloat128Exp( a );
5031     aSign = extractFloat128Sign( a );
5032     bSig1 = extractFloat128Frac1( b );
5033     bSig0 = extractFloat128Frac0( b );
5034     bExp = extractFloat128Exp( b );
5035     bSign = extractFloat128Sign( b );
5036     if ( aExp == 0x7FFF ) {
5037         if (    ( aSig0 | aSig1 )
5038              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5039             return propagateFloat128NaN( a, b );
5040         }
5041         goto invalid;
5042     }
5043     if ( bExp == 0x7FFF ) {
5044         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5045         return a;
5046     }
5047     if ( bExp == 0 ) {
5048         if ( ( bSig0 | bSig1 ) == 0 ) {
5049  invalid:
5050             float_raise( float_flag_invalid );
5051             z.low = float128_default_nan_low;
5052             z.high = float128_default_nan_high;
5053             return z;
5054         }
5055         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5056     }
5057     if ( aExp == 0 ) {
5058         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5059         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5060     }
5061     expDiff = aExp - bExp;
5062     if ( expDiff < -1 ) return a;
5063     shortShift128Left(
5064         aSig0 | LIT64( 0x0001000000000000 ),
5065         aSig1,
5066         15 - ( expDiff < 0 ),
5067         &aSig0,
5068         &aSig1
5069     );
5070     shortShift128Left(
5071         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5072     q = le128( bSig0, bSig1, aSig0, aSig1 );
5073     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5074     expDiff -= 64;
5075     while ( 0 < expDiff ) {
5076         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5077         q = ( 4 < q ) ? q - 4 : 0;
5078         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5079         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5080         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5081         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5082         expDiff -= 61;
5083     }
5084     if ( -64 < expDiff ) {
5085         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5086         q = ( 4 < q ) ? q - 4 : 0;
5087         q >>= - expDiff;
5088         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5089         expDiff += 52;
5090         if ( expDiff < 0 ) {
5091             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5092         }
5093         else {
5094             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5095         }
5096         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5097         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5098     }
5099     else {
5100         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5101         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5102     }
5103     do {
5104         alternateASig0 = aSig0;
5105         alternateASig1 = aSig1;
5106         ++q;
5107         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5108     } while ( 0 <= (sbits64) aSig0 );
5109     add128(
5110         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5111     if (    ( sigMean0 < 0 )
5112          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5113         aSig0 = alternateASig0;
5114         aSig1 = alternateASig1;
5115     }
5116     zSign = ( (sbits64) aSig0 < 0 );
5117     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5118     return
5119         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5120 
5121 }
5122 
5123 /*
5124 -------------------------------------------------------------------------------
5125 Returns the square root of the quadruple-precision floating-point value `a'.
5126 The operation is performed according to the IEC/IEEE Standard for Binary
5127 Floating-Point Arithmetic.
5128 -------------------------------------------------------------------------------
5129 */
5130 float128 float128_sqrt( float128 a )
5131 {
5132     flag aSign;
5133     int32 aExp, zExp;
5134     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5135     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5136     float128 z;
5137 
5138     aSig1 = extractFloat128Frac1( a );
5139     aSig0 = extractFloat128Frac0( a );
5140     aExp = extractFloat128Exp( a );
5141     aSign = extractFloat128Sign( a );
5142     if ( aExp == 0x7FFF ) {
5143         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5144         if ( ! aSign ) return a;
5145         goto invalid;
5146     }
5147     if ( aSign ) {
5148         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5149  invalid:
5150         float_raise( float_flag_invalid );
5151         z.low = float128_default_nan_low;
5152         z.high = float128_default_nan_high;
5153         return z;
5154     }
5155     if ( aExp == 0 ) {
5156         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5157         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5158     }
5159     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5160     aSig0 |= LIT64( 0x0001000000000000 );
5161     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5162     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5163     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5164     doubleZSig0 = zSig0<<1;
5165     mul64To128( zSig0, zSig0, &term0, &term1 );
5166     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5167     while ( (sbits64) rem0 < 0 ) {
5168         --zSig0;
5169         doubleZSig0 -= 2;
5170         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5171     }
5172     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5173     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5174         if ( zSig1 == 0 ) zSig1 = 1;
5175         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5176         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5177         mul64To128( zSig1, zSig1, &term2, &term3 );
5178         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5179         while ( (sbits64) rem1 < 0 ) {
5180             --zSig1;
5181             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5182             term3 |= 1;
5183             term2 |= doubleZSig0;
5184             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5185         }
5186         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5187     }
5188     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5189     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5190 
5191 }
5192 
5193 /*
5194 -------------------------------------------------------------------------------
5195 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5196 the corresponding value `b', and 0 otherwise.  The comparison is performed
5197 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5198 -------------------------------------------------------------------------------
5199 */
5200 flag float128_eq( float128 a, float128 b )
5201 {
5202 
5203     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5204               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5205          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5206               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5207        ) {
5208         if (    float128_is_signaling_nan( a )
5209              || float128_is_signaling_nan( b ) ) {
5210             float_raise( float_flag_invalid );
5211         }
5212         return 0;
5213     }
5214     return
5215            ( a.low == b.low )
5216         && (    ( a.high == b.high )
5217              || (    ( a.low == 0 )
5218                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5219            );
5220 
5221 }
5222 
5223 /*
5224 -------------------------------------------------------------------------------
5225 Returns 1 if the quadruple-precision floating-point value `a' is less than
5226 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5227 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5228 Arithmetic.
5229 -------------------------------------------------------------------------------
5230 */
5231 flag float128_le( float128 a, float128 b )
5232 {
5233     flag aSign, bSign;
5234 
5235     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5236               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5237          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5238               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5239        ) {
5240         float_raise( float_flag_invalid );
5241         return 0;
5242     }
5243     aSign = extractFloat128Sign( a );
5244     bSign = extractFloat128Sign( b );
5245     if ( aSign != bSign ) {
5246         return
5247                aSign
5248             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5249                  == 0 );
5250     }
5251     return
5252           aSign ? le128( b.high, b.low, a.high, a.low )
5253         : le128( a.high, a.low, b.high, b.low );
5254 
5255 }
5256 
5257 /*
5258 -------------------------------------------------------------------------------
5259 Returns 1 if the quadruple-precision floating-point value `a' is less than
5260 the corresponding value `b', and 0 otherwise.  The comparison is performed
5261 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5262 -------------------------------------------------------------------------------
5263 */
5264 flag float128_lt( float128 a, float128 b )
5265 {
5266     flag aSign, bSign;
5267 
5268     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5269               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5270          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5271               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5272        ) {
5273         float_raise( float_flag_invalid );
5274         return 0;
5275     }
5276     aSign = extractFloat128Sign( a );
5277     bSign = extractFloat128Sign( b );
5278     if ( aSign != bSign ) {
5279         return
5280                aSign
5281             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5282                  != 0 );
5283     }
5284     return
5285           aSign ? lt128( b.high, b.low, a.high, a.low )
5286         : lt128( a.high, a.low, b.high, b.low );
5287 
5288 }
5289 
5290 /*
5291 -------------------------------------------------------------------------------
5292 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5293 the corresponding value `b', and 0 otherwise.  The invalid exception is
5294 raised if either operand is a NaN.  Otherwise, the comparison is performed
5295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5296 -------------------------------------------------------------------------------
5297 */
5298 flag float128_eq_signaling( float128 a, float128 b )
5299 {
5300 
5301     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5302               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5303          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5304               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5305        ) {
5306         float_raise( float_flag_invalid );
5307         return 0;
5308     }
5309     return
5310            ( a.low == b.low )
5311         && (    ( a.high == b.high )
5312              || (    ( a.low == 0 )
5313                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5314            );
5315 
5316 }
5317 
5318 /*
5319 -------------------------------------------------------------------------------
5320 Returns 1 if the quadruple-precision floating-point value `a' is less than
5321 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5322 cause an exception.  Otherwise, the comparison is performed according to the
5323 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5324 -------------------------------------------------------------------------------
5325 */
5326 flag float128_le_quiet( float128 a, float128 b )
5327 {
5328     flag aSign, bSign;
5329 
5330     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5331               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5332          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5333               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5334        ) {
5335         if (    float128_is_signaling_nan( a )
5336              || float128_is_signaling_nan( b ) ) {
5337             float_raise( float_flag_invalid );
5338         }
5339         return 0;
5340     }
5341     aSign = extractFloat128Sign( a );
5342     bSign = extractFloat128Sign( b );
5343     if ( aSign != bSign ) {
5344         return
5345                aSign
5346             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5347                  == 0 );
5348     }
5349     return
5350           aSign ? le128( b.high, b.low, a.high, a.low )
5351         : le128( a.high, a.low, b.high, b.low );
5352 
5353 }
5354 
5355 /*
5356 -------------------------------------------------------------------------------
5357 Returns 1 if the quadruple-precision floating-point value `a' is less than
5358 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5359 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5360 Standard for Binary Floating-Point Arithmetic.
5361 -------------------------------------------------------------------------------
5362 */
5363 flag float128_lt_quiet( float128 a, float128 b )
5364 {
5365     flag aSign, bSign;
5366 
5367     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5368               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5369          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5370               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5371        ) {
5372         if (    float128_is_signaling_nan( a )
5373              || float128_is_signaling_nan( b ) ) {
5374             float_raise( float_flag_invalid );
5375         }
5376         return 0;
5377     }
5378     aSign = extractFloat128Sign( a );
5379     bSign = extractFloat128Sign( b );
5380     if ( aSign != bSign ) {
5381         return
5382                aSign
5383             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5384                  != 0 );
5385     }
5386     return
5387           aSign ? lt128( b.high, b.low, a.high, a.low )
5388         : lt128( a.high, a.low, b.high, b.low );
5389 
5390 }
5391 
5392 #endif
5393 
5394 
5395 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5396 
5397 /*
5398  * These two routines are not part of the original softfloat distribution.
5399  *
5400  * They are based on the corresponding conversions to integer but return
5401  * unsigned numbers instead since these functions are required by GCC.
5402  *
5403  * Added by Mark Brinicombe <mark@netbsd.org>	27/09/97
5404  *
5405  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5406  */
5407 
5408 /*
5409 -------------------------------------------------------------------------------
5410 Returns the result of converting the double-precision floating-point value
5411 `a' to the 32-bit unsigned integer format.  The conversion is
5412 performed according to the IEC/IEEE Standard for Binary Floating-point
5413 Arithmetic, except that the conversion is always rounded toward zero.  If
5414 `a' is a NaN, the largest positive integer is returned.  If the conversion
5415 overflows, the largest integer positive is returned.
5416 -------------------------------------------------------------------------------
5417 */
5418 uint32 float64_to_uint32_round_to_zero( float64 a )
5419 {
5420     flag aSign;
5421     int16 aExp, shiftCount;
5422     bits64 aSig, savedASig;
5423     uint32 z;
5424 
5425     aSig = extractFloat64Frac( a );
5426     aExp = extractFloat64Exp( a );
5427     aSign = extractFloat64Sign( a );
5428 
5429     if (aSign) {
5430         float_raise( float_flag_invalid );
5431     	return(0);
5432     }
5433 
5434     if ( 0x41E < aExp ) {
5435         float_raise( float_flag_invalid );
5436         return 0xffffffff;
5437     }
5438     else if ( aExp < 0x3FF ) {
5439         if ( aExp || aSig ) float_set_inexact();
5440         return 0;
5441     }
5442     aSig |= LIT64( 0x0010000000000000 );
5443     shiftCount = 0x433 - aExp;
5444     savedASig = aSig;
5445     aSig >>= shiftCount;
5446     z = aSig;
5447     if ( ( aSig<<shiftCount ) != savedASig ) {
5448         float_set_inexact();
5449     }
5450     return z;
5451 
5452 }
5453 
5454 /*
5455 -------------------------------------------------------------------------------
5456 Returns the result of converting the single-precision floating-point value
5457 `a' to the 32-bit unsigned integer format.  The conversion is
5458 performed according to the IEC/IEEE Standard for Binary Floating-point
5459 Arithmetic, except that the conversion is always rounded toward zero.  If
5460 `a' is a NaN, the largest positive integer is returned.  If the conversion
5461 overflows, the largest positive integer is returned.
5462 -------------------------------------------------------------------------------
5463 */
5464 uint32 float32_to_uint32_round_to_zero( float32 a )
5465 {
5466     flag aSign;
5467     int16 aExp, shiftCount;
5468     bits32 aSig;
5469     uint32 z;
5470 
5471     aSig = extractFloat32Frac( a );
5472     aExp = extractFloat32Exp( a );
5473     aSign = extractFloat32Sign( a );
5474     shiftCount = aExp - 0x9E;
5475 
5476     if (aSign) {
5477         float_raise( float_flag_invalid );
5478     	return(0);
5479     }
5480     if ( 0 < shiftCount ) {
5481         float_raise( float_flag_invalid );
5482         return 0xFFFFFFFF;
5483     }
5484     else if ( aExp <= 0x7E ) {
5485         if ( aExp | aSig ) float_set_inexact();
5486         return 0;
5487     }
5488     aSig = ( aSig | 0x800000 )<<8;
5489     z = aSig>>( - shiftCount );
5490     if ( aSig<<( shiftCount & 31 ) ) {
5491         float_set_inexact();
5492     }
5493     return z;
5494 
5495 }
5496 
5497 #endif
5498