xref: /netbsd/lib/libc/softfloat/bits64/softfloat.c (revision bf9ec67e)
1 /* $NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:08 bjh21 Exp $ */
2 
3 /*
4  * This version hacked for use with gcc -msoft-float by bjh21.
5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6  *  itself).
7  */
8 
9 /*
10  * Things you may want to define:
11  *
12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14  *   properly renamed.
15  */
16 
17 /*
18 ===============================================================================
19 
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 #include <sys/cdefs.h>
48 #if defined(LIBC_SCCS) && !defined(lint)
49 __RCSID("$NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:08 bjh21 Exp $");
50 #endif /* LIBC_SCCS and not lint */
51 
52 #ifdef SOFTFLOAT_FOR_GCC
53 #include "softfloat-for-gcc.h"
54 #endif
55 
56 #include "milieu.h"
57 #include "softfloat.h"
58 
59 /*
60  * Conversions between floats as stored in memory and floats as
61  * SoftFloat uses them
62  */
63 #ifndef FLOAT64_DEMANGLE
64 #define FLOAT64_DEMANGLE(a)	(a)
65 #endif
66 #ifndef FLOAT64_MANGLE
67 #define FLOAT64_MANGLE(a)	(a)
68 #endif
69 
70 /*
71 -------------------------------------------------------------------------------
72 Floating-point rounding mode, extended double-precision rounding precision,
73 and exception flags.
74 -------------------------------------------------------------------------------
75 */
76 fp_rnd float_rounding_mode = float_round_nearest_even;
77 fp_except float_exception_flags = 0;
78 #ifdef FLOATX80
79 int8 floatx80_rounding_precision = 80;
80 #endif
81 
82 /*
83 -------------------------------------------------------------------------------
84 Primitive arithmetic functions, including multi-word arithmetic, and
85 division and square root approximations.  (Can be specialized to target if
86 desired.)
87 -------------------------------------------------------------------------------
88 */
89 #include "softfloat-macros"
90 
91 /*
92 -------------------------------------------------------------------------------
93 Functions and definitions to determine:  (1) whether tininess for underflow
94 is detected before or after rounding by default, (2) what (if anything)
95 happens when exceptions are raised, (3) how signaling NaNs are distinguished
96 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
97 are propagated from function inputs to output.  These details are target-
98 specific.
99 -------------------------------------------------------------------------------
100 */
101 #include "softfloat-specialize"
102 
103 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
104 /*
105 -------------------------------------------------------------------------------
106 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
107 and 7, and returns the properly rounded 32-bit integer corresponding to the
108 input.  If `zSign' is 1, the input is negated before being converted to an
109 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
110 is simply rounded to an integer, with the inexact exception raised if the
111 input cannot be represented exactly as an integer.  However, if the fixed-
112 point input is too large, the invalid exception is raised and the largest
113 positive or negative integer is returned.
114 -------------------------------------------------------------------------------
115 */
116 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
117 {
118     int8 roundingMode;
119     flag roundNearestEven;
120     int8 roundIncrement, roundBits;
121     int32 z;
122 
123     roundingMode = float_rounding_mode;
124     roundNearestEven = ( roundingMode == float_round_nearest_even );
125     roundIncrement = 0x40;
126     if ( ! roundNearestEven ) {
127         if ( roundingMode == float_round_to_zero ) {
128             roundIncrement = 0;
129         }
130         else {
131             roundIncrement = 0x7F;
132             if ( zSign ) {
133                 if ( roundingMode == float_round_up ) roundIncrement = 0;
134             }
135             else {
136                 if ( roundingMode == float_round_down ) roundIncrement = 0;
137             }
138         }
139     }
140     roundBits = absZ & 0x7F;
141     absZ = ( absZ + roundIncrement )>>7;
142     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
143     z = absZ;
144     if ( zSign ) z = - z;
145     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
146         float_raise( float_flag_invalid );
147         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
148     }
149     if ( roundBits ) float_exception_flags |= float_flag_inexact;
150     return z;
151 
152 }
153 
154 /*
155 -------------------------------------------------------------------------------
156 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
157 `absZ1', with binary point between bits 63 and 64 (between the input words),
158 and returns the properly rounded 64-bit integer corresponding to the input.
159 If `zSign' is 1, the input is negated before being converted to an integer.
160 Ordinarily, the fixed-point input is simply rounded to an integer, with
161 the inexact exception raised if the input cannot be represented exactly as
162 an integer.  However, if the fixed-point input is too large, the invalid
163 exception is raised and the largest positive or negative integer is
164 returned.
165 -------------------------------------------------------------------------------
166 */
167 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
168 {
169     int8 roundingMode;
170     flag roundNearestEven, increment;
171     int64 z;
172 
173     roundingMode = float_rounding_mode;
174     roundNearestEven = ( roundingMode == float_round_nearest_even );
175     increment = ( (sbits64) absZ1 < 0 );
176     if ( ! roundNearestEven ) {
177         if ( roundingMode == float_round_to_zero ) {
178             increment = 0;
179         }
180         else {
181             if ( zSign ) {
182                 increment = ( roundingMode == float_round_down ) && absZ1;
183             }
184             else {
185                 increment = ( roundingMode == float_round_up ) && absZ1;
186             }
187         }
188     }
189     if ( increment ) {
190         ++absZ0;
191         if ( absZ0 == 0 ) goto overflow;
192         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
193     }
194     z = absZ0;
195     if ( zSign ) z = - z;
196     if ( z && ( ( z < 0 ) ^ zSign ) ) {
197  overflow:
198         float_raise( float_flag_invalid );
199         return
200               zSign ? (sbits64) LIT64( 0x8000000000000000 )
201             : LIT64( 0x7FFFFFFFFFFFFFFF );
202     }
203     if ( absZ1 ) float_exception_flags |= float_flag_inexact;
204     return z;
205 
206 }
207 #endif
208 
209 /*
210 -------------------------------------------------------------------------------
211 Returns the fraction bits of the single-precision floating-point value `a'.
212 -------------------------------------------------------------------------------
213 */
214 INLINE bits32 extractFloat32Frac( float32 a )
215 {
216 
217     return a & 0x007FFFFF;
218 
219 }
220 
221 /*
222 -------------------------------------------------------------------------------
223 Returns the exponent bits of the single-precision floating-point value `a'.
224 -------------------------------------------------------------------------------
225 */
226 INLINE int16 extractFloat32Exp( float32 a )
227 {
228 
229     return ( a>>23 ) & 0xFF;
230 
231 }
232 
233 /*
234 -------------------------------------------------------------------------------
235 Returns the sign bit of the single-precision floating-point value `a'.
236 -------------------------------------------------------------------------------
237 */
238 INLINE flag extractFloat32Sign( float32 a )
239 {
240 
241     return a>>31;
242 
243 }
244 
245 /*
246 -------------------------------------------------------------------------------
247 Normalizes the subnormal single-precision floating-point value represented
248 by the denormalized significand `aSig'.  The normalized exponent and
249 significand are stored at the locations pointed to by `zExpPtr' and
250 `zSigPtr', respectively.
251 -------------------------------------------------------------------------------
252 */
253 static void
254  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
255 {
256     int8 shiftCount;
257 
258     shiftCount = countLeadingZeros32( aSig ) - 8;
259     *zSigPtr = aSig<<shiftCount;
260     *zExpPtr = 1 - shiftCount;
261 
262 }
263 
264 /*
265 -------------------------------------------------------------------------------
266 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
267 single-precision floating-point value, returning the result.  After being
268 shifted into the proper positions, the three fields are simply added
269 together to form the result.  This means that any integer portion of `zSig'
270 will be added into the exponent.  Since a properly normalized significand
271 will have an integer portion equal to 1, the `zExp' input should be 1 less
272 than the desired result exponent whenever `zSig' is a complete, normalized
273 significand.
274 -------------------------------------------------------------------------------
275 */
276 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
277 {
278 
279     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
280 
281 }
282 
283 /*
284 -------------------------------------------------------------------------------
285 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
286 and significand `zSig', and returns the proper single-precision floating-
287 point value corresponding to the abstract input.  Ordinarily, the abstract
288 value is simply rounded and packed into the single-precision format, with
289 the inexact exception raised if the abstract input cannot be represented
290 exactly.  However, if the abstract value is too large, the overflow and
291 inexact exceptions are raised and an infinity or maximal finite value is
292 returned.  If the abstract value is too small, the input value is rounded to
293 a subnormal number, and the underflow and inexact exceptions are raised if
294 the abstract input cannot be represented exactly as a subnormal single-
295 precision floating-point number.
296     The input significand `zSig' has its binary point between bits 30
297 and 29, which is 7 bits to the left of the usual location.  This shifted
298 significand must be normalized or smaller.  If `zSig' is not normalized,
299 `zExp' must be 0; in that case, the result returned is a subnormal number,
300 and it must not require rounding.  In the usual case that `zSig' is
301 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
302 The handling of underflow and overflow follows the IEC/IEEE Standard for
303 Binary Floating-Point Arithmetic.
304 -------------------------------------------------------------------------------
305 */
306 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
307 {
308     int8 roundingMode;
309     flag roundNearestEven;
310     int8 roundIncrement, roundBits;
311     flag isTiny;
312 
313     roundingMode = float_rounding_mode;
314     roundNearestEven = ( roundingMode == float_round_nearest_even );
315     roundIncrement = 0x40;
316     if ( ! roundNearestEven ) {
317         if ( roundingMode == float_round_to_zero ) {
318             roundIncrement = 0;
319         }
320         else {
321             roundIncrement = 0x7F;
322             if ( zSign ) {
323                 if ( roundingMode == float_round_up ) roundIncrement = 0;
324             }
325             else {
326                 if ( roundingMode == float_round_down ) roundIncrement = 0;
327             }
328         }
329     }
330     roundBits = zSig & 0x7F;
331     if ( 0xFD <= (bits16) zExp ) {
332         if (    ( 0xFD < zExp )
333              || (    ( zExp == 0xFD )
334                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
335            ) {
336             float_raise( float_flag_overflow | float_flag_inexact );
337             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
338         }
339         if ( zExp < 0 ) {
340             isTiny =
341                    ( float_detect_tininess == float_tininess_before_rounding )
342                 || ( zExp < -1 )
343                 || ( zSig + roundIncrement < 0x80000000 );
344             shift32RightJamming( zSig, - zExp, &zSig );
345             zExp = 0;
346             roundBits = zSig & 0x7F;
347             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
348         }
349     }
350     if ( roundBits ) float_exception_flags |= float_flag_inexact;
351     zSig = ( zSig + roundIncrement )>>7;
352     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
353     if ( zSig == 0 ) zExp = 0;
354     return packFloat32( zSign, zExp, zSig );
355 
356 }
357 
358 /*
359 -------------------------------------------------------------------------------
360 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
361 and significand `zSig', and returns the proper single-precision floating-
362 point value corresponding to the abstract input.  This routine is just like
363 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
364 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
365 floating-point exponent.
366 -------------------------------------------------------------------------------
367 */
368 static float32
369  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
370 {
371     int8 shiftCount;
372 
373     shiftCount = countLeadingZeros32( zSig ) - 1;
374     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
375 
376 }
377 
378 /*
379 -------------------------------------------------------------------------------
380 Returns the fraction bits of the double-precision floating-point value `a'.
381 -------------------------------------------------------------------------------
382 */
383 INLINE bits64 extractFloat64Frac( float64 a )
384 {
385 
386     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
387 
388 }
389 
390 /*
391 -------------------------------------------------------------------------------
392 Returns the exponent bits of the double-precision floating-point value `a'.
393 -------------------------------------------------------------------------------
394 */
395 INLINE int16 extractFloat64Exp( float64 a )
396 {
397 
398     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
399 
400 }
401 
402 /*
403 -------------------------------------------------------------------------------
404 Returns the sign bit of the double-precision floating-point value `a'.
405 -------------------------------------------------------------------------------
406 */
407 INLINE flag extractFloat64Sign( float64 a )
408 {
409 
410     return FLOAT64_DEMANGLE(a)>>63;
411 
412 }
413 
414 /*
415 -------------------------------------------------------------------------------
416 Normalizes the subnormal double-precision floating-point value represented
417 by the denormalized significand `aSig'.  The normalized exponent and
418 significand are stored at the locations pointed to by `zExpPtr' and
419 `zSigPtr', respectively.
420 -------------------------------------------------------------------------------
421 */
422 static void
423  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
424 {
425     int8 shiftCount;
426 
427     shiftCount = countLeadingZeros64( aSig ) - 11;
428     *zSigPtr = aSig<<shiftCount;
429     *zExpPtr = 1 - shiftCount;
430 
431 }
432 
433 /*
434 -------------------------------------------------------------------------------
435 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
436 double-precision floating-point value, returning the result.  After being
437 shifted into the proper positions, the three fields are simply added
438 together to form the result.  This means that any integer portion of `zSig'
439 will be added into the exponent.  Since a properly normalized significand
440 will have an integer portion equal to 1, the `zExp' input should be 1 less
441 than the desired result exponent whenever `zSig' is a complete, normalized
442 significand.
443 -------------------------------------------------------------------------------
444 */
445 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
446 {
447 
448     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
449 			   ( ( (bits64) zExp )<<52 ) + zSig );
450 
451 }
452 
453 /*
454 -------------------------------------------------------------------------------
455 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
456 and significand `zSig', and returns the proper double-precision floating-
457 point value corresponding to the abstract input.  Ordinarily, the abstract
458 value is simply rounded and packed into the double-precision format, with
459 the inexact exception raised if the abstract input cannot be represented
460 exactly.  However, if the abstract value is too large, the overflow and
461 inexact exceptions are raised and an infinity or maximal finite value is
462 returned.  If the abstract value is too small, the input value is rounded to
463 a subnormal number, and the underflow and inexact exceptions are raised if
464 the abstract input cannot be represented exactly as a subnormal double-
465 precision floating-point number.
466     The input significand `zSig' has its binary point between bits 62
467 and 61, which is 10 bits to the left of the usual location.  This shifted
468 significand must be normalized or smaller.  If `zSig' is not normalized,
469 `zExp' must be 0; in that case, the result returned is a subnormal number,
470 and it must not require rounding.  In the usual case that `zSig' is
471 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
472 The handling of underflow and overflow follows the IEC/IEEE Standard for
473 Binary Floating-Point Arithmetic.
474 -------------------------------------------------------------------------------
475 */
476 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
477 {
478     int8 roundingMode;
479     flag roundNearestEven;
480     int16 roundIncrement, roundBits;
481     flag isTiny;
482 
483     roundingMode = float_rounding_mode;
484     roundNearestEven = ( roundingMode == float_round_nearest_even );
485     roundIncrement = 0x200;
486     if ( ! roundNearestEven ) {
487         if ( roundingMode == float_round_to_zero ) {
488             roundIncrement = 0;
489         }
490         else {
491             roundIncrement = 0x3FF;
492             if ( zSign ) {
493                 if ( roundingMode == float_round_up ) roundIncrement = 0;
494             }
495             else {
496                 if ( roundingMode == float_round_down ) roundIncrement = 0;
497             }
498         }
499     }
500     roundBits = zSig & 0x3FF;
501     if ( 0x7FD <= (bits16) zExp ) {
502         if (    ( 0x7FD < zExp )
503              || (    ( zExp == 0x7FD )
504                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
505            ) {
506             float_raise( float_flag_overflow | float_flag_inexact );
507             return FLOAT64_MANGLE(
508 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
509 		( roundIncrement == 0 ));
510         }
511         if ( zExp < 0 ) {
512             isTiny =
513                    ( float_detect_tininess == float_tininess_before_rounding )
514                 || ( zExp < -1 )
515                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
516             shift64RightJamming( zSig, - zExp, &zSig );
517             zExp = 0;
518             roundBits = zSig & 0x3FF;
519             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
520         }
521     }
522     if ( roundBits ) float_exception_flags |= float_flag_inexact;
523     zSig = ( zSig + roundIncrement )>>10;
524     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
525     if ( zSig == 0 ) zExp = 0;
526     return packFloat64( zSign, zExp, zSig );
527 
528 }
529 
530 /*
531 -------------------------------------------------------------------------------
532 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
533 and significand `zSig', and returns the proper double-precision floating-
534 point value corresponding to the abstract input.  This routine is just like
535 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
536 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
537 floating-point exponent.
538 -------------------------------------------------------------------------------
539 */
540 static float64
541  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
542 {
543     int8 shiftCount;
544 
545     shiftCount = countLeadingZeros64( zSig ) - 1;
546     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
547 
548 }
549 
550 #ifdef FLOATX80
551 
552 /*
553 -------------------------------------------------------------------------------
554 Returns the fraction bits of the extended double-precision floating-point
555 value `a'.
556 -------------------------------------------------------------------------------
557 */
558 INLINE bits64 extractFloatx80Frac( floatx80 a )
559 {
560 
561     return a.low;
562 
563 }
564 
565 /*
566 -------------------------------------------------------------------------------
567 Returns the exponent bits of the extended double-precision floating-point
568 value `a'.
569 -------------------------------------------------------------------------------
570 */
571 INLINE int32 extractFloatx80Exp( floatx80 a )
572 {
573 
574     return a.high & 0x7FFF;
575 
576 }
577 
578 /*
579 -------------------------------------------------------------------------------
580 Returns the sign bit of the extended double-precision floating-point value
581 `a'.
582 -------------------------------------------------------------------------------
583 */
584 INLINE flag extractFloatx80Sign( floatx80 a )
585 {
586 
587     return a.high>>15;
588 
589 }
590 
591 /*
592 -------------------------------------------------------------------------------
593 Normalizes the subnormal extended double-precision floating-point value
594 represented by the denormalized significand `aSig'.  The normalized exponent
595 and significand are stored at the locations pointed to by `zExpPtr' and
596 `zSigPtr', respectively.
597 -------------------------------------------------------------------------------
598 */
599 static void
600  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
601 {
602     int8 shiftCount;
603 
604     shiftCount = countLeadingZeros64( aSig );
605     *zSigPtr = aSig<<shiftCount;
606     *zExpPtr = 1 - shiftCount;
607 
608 }
609 
610 /*
611 -------------------------------------------------------------------------------
612 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
613 extended double-precision floating-point value, returning the result.
614 -------------------------------------------------------------------------------
615 */
616 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
617 {
618     floatx80 z;
619 
620     z.low = zSig;
621     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
622     return z;
623 
624 }
625 
626 /*
627 -------------------------------------------------------------------------------
628 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
629 and extended significand formed by the concatenation of `zSig0' and `zSig1',
630 and returns the proper extended double-precision floating-point value
631 corresponding to the abstract input.  Ordinarily, the abstract value is
632 rounded and packed into the extended double-precision format, with the
633 inexact exception raised if the abstract input cannot be represented
634 exactly.  However, if the abstract value is too large, the overflow and
635 inexact exceptions are raised and an infinity or maximal finite value is
636 returned.  If the abstract value is too small, the input value is rounded to
637 a subnormal number, and the underflow and inexact exceptions are raised if
638 the abstract input cannot be represented exactly as a subnormal extended
639 double-precision floating-point number.
640     If `roundingPrecision' is 32 or 64, the result is rounded to the same
641 number of bits as single or double precision, respectively.  Otherwise, the
642 result is rounded to the full precision of the extended double-precision
643 format.
644     The input significand must be normalized or smaller.  If the input
645 significand is not normalized, `zExp' must be 0; in that case, the result
646 returned is a subnormal number, and it must not require rounding.  The
647 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
648 Floating-Point Arithmetic.
649 -------------------------------------------------------------------------------
650 */
651 static floatx80
652  roundAndPackFloatx80(
653      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
654  )
655 {
656     int8 roundingMode;
657     flag roundNearestEven, increment, isTiny;
658     int64 roundIncrement, roundMask, roundBits;
659 
660     roundingMode = float_rounding_mode;
661     roundNearestEven = ( roundingMode == float_round_nearest_even );
662     if ( roundingPrecision == 80 ) goto precision80;
663     if ( roundingPrecision == 64 ) {
664         roundIncrement = LIT64( 0x0000000000000400 );
665         roundMask = LIT64( 0x00000000000007FF );
666     }
667     else if ( roundingPrecision == 32 ) {
668         roundIncrement = LIT64( 0x0000008000000000 );
669         roundMask = LIT64( 0x000000FFFFFFFFFF );
670     }
671     else {
672         goto precision80;
673     }
674     zSig0 |= ( zSig1 != 0 );
675     if ( ! roundNearestEven ) {
676         if ( roundingMode == float_round_to_zero ) {
677             roundIncrement = 0;
678         }
679         else {
680             roundIncrement = roundMask;
681             if ( zSign ) {
682                 if ( roundingMode == float_round_up ) roundIncrement = 0;
683             }
684             else {
685                 if ( roundingMode == float_round_down ) roundIncrement = 0;
686             }
687         }
688     }
689     roundBits = zSig0 & roundMask;
690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691         if (    ( 0x7FFE < zExp )
692              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
693            ) {
694             goto overflow;
695         }
696         if ( zExp <= 0 ) {
697             isTiny =
698                    ( float_detect_tininess == float_tininess_before_rounding )
699                 || ( zExp < 0 )
700                 || ( zSig0 <= zSig0 + roundIncrement );
701             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
702             zExp = 0;
703             roundBits = zSig0 & roundMask;
704             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
705             if ( roundBits ) float_exception_flags |= float_flag_inexact;
706             zSig0 += roundIncrement;
707             if ( (sbits64) zSig0 < 0 ) zExp = 1;
708             roundIncrement = roundMask + 1;
709             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
710                 roundMask |= roundIncrement;
711             }
712             zSig0 &= ~ roundMask;
713             return packFloatx80( zSign, zExp, zSig0 );
714         }
715     }
716     if ( roundBits ) float_exception_flags |= float_flag_inexact;
717     zSig0 += roundIncrement;
718     if ( zSig0 < roundIncrement ) {
719         ++zExp;
720         zSig0 = LIT64( 0x8000000000000000 );
721     }
722     roundIncrement = roundMask + 1;
723     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
724         roundMask |= roundIncrement;
725     }
726     zSig0 &= ~ roundMask;
727     if ( zSig0 == 0 ) zExp = 0;
728     return packFloatx80( zSign, zExp, zSig0 );
729  precision80:
730     increment = ( (sbits64) zSig1 < 0 );
731     if ( ! roundNearestEven ) {
732         if ( roundingMode == float_round_to_zero ) {
733             increment = 0;
734         }
735         else {
736             if ( zSign ) {
737                 increment = ( roundingMode == float_round_down ) && zSig1;
738             }
739             else {
740                 increment = ( roundingMode == float_round_up ) && zSig1;
741             }
742         }
743     }
744     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
745         if (    ( 0x7FFE < zExp )
746              || (    ( zExp == 0x7FFE )
747                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
748                   && increment
749                 )
750            ) {
751             roundMask = 0;
752  overflow:
753             float_raise( float_flag_overflow | float_flag_inexact );
754             if (    ( roundingMode == float_round_to_zero )
755                  || ( zSign && ( roundingMode == float_round_up ) )
756                  || ( ! zSign && ( roundingMode == float_round_down ) )
757                ) {
758                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
759             }
760             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
761         }
762         if ( zExp <= 0 ) {
763             isTiny =
764                    ( float_detect_tininess == float_tininess_before_rounding )
765                 || ( zExp < 0 )
766                 || ! increment
767                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
768             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
769             zExp = 0;
770             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
771             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
772             if ( roundNearestEven ) {
773                 increment = ( (sbits64) zSig1 < 0 );
774             }
775             else {
776                 if ( zSign ) {
777                     increment = ( roundingMode == float_round_down ) && zSig1;
778                 }
779                 else {
780                     increment = ( roundingMode == float_round_up ) && zSig1;
781                 }
782             }
783             if ( increment ) {
784                 ++zSig0;
785                 zSig0 &=
786                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
787                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
788             }
789             return packFloatx80( zSign, zExp, zSig0 );
790         }
791     }
792     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
793     if ( increment ) {
794         ++zSig0;
795         if ( zSig0 == 0 ) {
796             ++zExp;
797             zSig0 = LIT64( 0x8000000000000000 );
798         }
799         else {
800             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
801         }
802     }
803     else {
804         if ( zSig0 == 0 ) zExp = 0;
805     }
806     return packFloatx80( zSign, zExp, zSig0 );
807 
808 }
809 
810 /*
811 -------------------------------------------------------------------------------
812 Takes an abstract floating-point value having sign `zSign', exponent
813 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
814 and returns the proper extended double-precision floating-point value
815 corresponding to the abstract input.  This routine is just like
816 `roundAndPackFloatx80' except that the input significand does not have to be
817 normalized.
818 -------------------------------------------------------------------------------
819 */
820 static floatx80
821  normalizeRoundAndPackFloatx80(
822      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
823  )
824 {
825     int8 shiftCount;
826 
827     if ( zSig0 == 0 ) {
828         zSig0 = zSig1;
829         zSig1 = 0;
830         zExp -= 64;
831     }
832     shiftCount = countLeadingZeros64( zSig0 );
833     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
834     zExp -= shiftCount;
835     return
836         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
837 
838 }
839 
840 #endif
841 
842 #ifdef FLOAT128
843 
844 /*
845 -------------------------------------------------------------------------------
846 Returns the least-significant 64 fraction bits of the quadruple-precision
847 floating-point value `a'.
848 -------------------------------------------------------------------------------
849 */
850 INLINE bits64 extractFloat128Frac1( float128 a )
851 {
852 
853     return a.low;
854 
855 }
856 
857 /*
858 -------------------------------------------------------------------------------
859 Returns the most-significant 48 fraction bits of the quadruple-precision
860 floating-point value `a'.
861 -------------------------------------------------------------------------------
862 */
863 INLINE bits64 extractFloat128Frac0( float128 a )
864 {
865 
866     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
867 
868 }
869 
870 /*
871 -------------------------------------------------------------------------------
872 Returns the exponent bits of the quadruple-precision floating-point value
873 `a'.
874 -------------------------------------------------------------------------------
875 */
876 INLINE int32 extractFloat128Exp( float128 a )
877 {
878 
879     return ( a.high>>48 ) & 0x7FFF;
880 
881 }
882 
883 /*
884 -------------------------------------------------------------------------------
885 Returns the sign bit of the quadruple-precision floating-point value `a'.
886 -------------------------------------------------------------------------------
887 */
888 INLINE flag extractFloat128Sign( float128 a )
889 {
890 
891     return a.high>>63;
892 
893 }
894 
895 /*
896 -------------------------------------------------------------------------------
897 Normalizes the subnormal quadruple-precision floating-point value
898 represented by the denormalized significand formed by the concatenation of
899 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
900 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
901 significand are stored at the location pointed to by `zSig0Ptr', and the
902 least significant 64 bits of the normalized significand are stored at the
903 location pointed to by `zSig1Ptr'.
904 -------------------------------------------------------------------------------
905 */
906 static void
907  normalizeFloat128Subnormal(
908      bits64 aSig0,
909      bits64 aSig1,
910      int32 *zExpPtr,
911      bits64 *zSig0Ptr,
912      bits64 *zSig1Ptr
913  )
914 {
915     int8 shiftCount;
916 
917     if ( aSig0 == 0 ) {
918         shiftCount = countLeadingZeros64( aSig1 ) - 15;
919         if ( shiftCount < 0 ) {
920             *zSig0Ptr = aSig1>>( - shiftCount );
921             *zSig1Ptr = aSig1<<( shiftCount & 63 );
922         }
923         else {
924             *zSig0Ptr = aSig1<<shiftCount;
925             *zSig1Ptr = 0;
926         }
927         *zExpPtr = - shiftCount - 63;
928     }
929     else {
930         shiftCount = countLeadingZeros64( aSig0 ) - 15;
931         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
932         *zExpPtr = 1 - shiftCount;
933     }
934 
935 }
936 
937 /*
938 -------------------------------------------------------------------------------
939 Packs the sign `zSign', the exponent `zExp', and the significand formed
940 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
941 floating-point value, returning the result.  After being shifted into the
942 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
943 added together to form the most significant 32 bits of the result.  This
944 means that any integer portion of `zSig0' will be added into the exponent.
945 Since a properly normalized significand will have an integer portion equal
946 to 1, the `zExp' input should be 1 less than the desired result exponent
947 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
948 significand.
949 -------------------------------------------------------------------------------
950 */
951 INLINE float128
952  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
953 {
954     float128 z;
955 
956     z.low = zSig1;
957     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
958     return z;
959 
960 }
961 
962 /*
963 -------------------------------------------------------------------------------
964 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
965 and extended significand formed by the concatenation of `zSig0', `zSig1',
966 and `zSig2', and returns the proper quadruple-precision floating-point value
967 corresponding to the abstract input.  Ordinarily, the abstract value is
968 simply rounded and packed into the quadruple-precision format, with the
969 inexact exception raised if the abstract input cannot be represented
970 exactly.  However, if the abstract value is too large, the overflow and
971 inexact exceptions are raised and an infinity or maximal finite value is
972 returned.  If the abstract value is too small, the input value is rounded to
973 a subnormal number, and the underflow and inexact exceptions are raised if
974 the abstract input cannot be represented exactly as a subnormal quadruple-
975 precision floating-point number.
976     The input significand must be normalized or smaller.  If the input
977 significand is not normalized, `zExp' must be 0; in that case, the result
978 returned is a subnormal number, and it must not require rounding.  In the
979 usual case that the input significand is normalized, `zExp' must be 1 less
980 than the ``true'' floating-point exponent.  The handling of underflow and
981 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
982 -------------------------------------------------------------------------------
983 */
984 static float128
985  roundAndPackFloat128(
986      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
987 {
988     int8 roundingMode;
989     flag roundNearestEven, increment, isTiny;
990 
991     roundingMode = float_rounding_mode;
992     roundNearestEven = ( roundingMode == float_round_nearest_even );
993     increment = ( (sbits64) zSig2 < 0 );
994     if ( ! roundNearestEven ) {
995         if ( roundingMode == float_round_to_zero ) {
996             increment = 0;
997         }
998         else {
999             if ( zSign ) {
1000                 increment = ( roundingMode == float_round_down ) && zSig2;
1001             }
1002             else {
1003                 increment = ( roundingMode == float_round_up ) && zSig2;
1004             }
1005         }
1006     }
1007     if ( 0x7FFD <= (bits32) zExp ) {
1008         if (    ( 0x7FFD < zExp )
1009              || (    ( zExp == 0x7FFD )
1010                   && eq128(
1011                          LIT64( 0x0001FFFFFFFFFFFF ),
1012                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1013                          zSig0,
1014                          zSig1
1015                      )
1016                   && increment
1017                 )
1018            ) {
1019             float_raise( float_flag_overflow | float_flag_inexact );
1020             if (    ( roundingMode == float_round_to_zero )
1021                  || ( zSign && ( roundingMode == float_round_up ) )
1022                  || ( ! zSign && ( roundingMode == float_round_down ) )
1023                ) {
1024                 return
1025                     packFloat128(
1026                         zSign,
1027                         0x7FFE,
1028                         LIT64( 0x0000FFFFFFFFFFFF ),
1029                         LIT64( 0xFFFFFFFFFFFFFFFF )
1030                     );
1031             }
1032             return packFloat128( zSign, 0x7FFF, 0, 0 );
1033         }
1034         if ( zExp < 0 ) {
1035             isTiny =
1036                    ( float_detect_tininess == float_tininess_before_rounding )
1037                 || ( zExp < -1 )
1038                 || ! increment
1039                 || lt128(
1040                        zSig0,
1041                        zSig1,
1042                        LIT64( 0x0001FFFFFFFFFFFF ),
1043                        LIT64( 0xFFFFFFFFFFFFFFFF )
1044                    );
1045             shift128ExtraRightJamming(
1046                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1047             zExp = 0;
1048             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1049             if ( roundNearestEven ) {
1050                 increment = ( (sbits64) zSig2 < 0 );
1051             }
1052             else {
1053                 if ( zSign ) {
1054                     increment = ( roundingMode == float_round_down ) && zSig2;
1055                 }
1056                 else {
1057                     increment = ( roundingMode == float_round_up ) && zSig2;
1058                 }
1059             }
1060         }
1061     }
1062     if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1063     if ( increment ) {
1064         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1065         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1066     }
1067     else {
1068         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1069     }
1070     return packFloat128( zSign, zExp, zSig0, zSig1 );
1071 
1072 }
1073 
1074 /*
1075 -------------------------------------------------------------------------------
1076 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1077 and significand formed by the concatenation of `zSig0' and `zSig1', and
1078 returns the proper quadruple-precision floating-point value corresponding
1079 to the abstract input.  This routine is just like `roundAndPackFloat128'
1080 except that the input significand has fewer bits and does not have to be
1081 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1082 point exponent.
1083 -------------------------------------------------------------------------------
1084 */
1085 static float128
1086  normalizeRoundAndPackFloat128(
1087      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1088 {
1089     int8 shiftCount;
1090     bits64 zSig2;
1091 
1092     if ( zSig0 == 0 ) {
1093         zSig0 = zSig1;
1094         zSig1 = 0;
1095         zExp -= 64;
1096     }
1097     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1098     if ( 0 <= shiftCount ) {
1099         zSig2 = 0;
1100         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1101     }
1102     else {
1103         shift128ExtraRightJamming(
1104             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1105     }
1106     zExp -= shiftCount;
1107     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1108 
1109 }
1110 
1111 #endif
1112 
1113 /*
1114 -------------------------------------------------------------------------------
1115 Returns the result of converting the 32-bit two's complement integer `a'
1116 to the single-precision floating-point format.  The conversion is performed
1117 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1118 -------------------------------------------------------------------------------
1119 */
1120 float32 int32_to_float32( int32 a )
1121 {
1122     flag zSign;
1123 
1124     if ( a == 0 ) return 0;
1125     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1126     zSign = ( a < 0 );
1127     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1128 
1129 }
1130 
1131 /*
1132 -------------------------------------------------------------------------------
1133 Returns the result of converting the 32-bit two's complement integer `a'
1134 to the double-precision floating-point format.  The conversion is performed
1135 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1136 -------------------------------------------------------------------------------
1137 */
1138 float64 int32_to_float64( int32 a )
1139 {
1140     flag zSign;
1141     uint32 absA;
1142     int8 shiftCount;
1143     bits64 zSig;
1144 
1145     if ( a == 0 ) return 0;
1146     zSign = ( a < 0 );
1147     absA = zSign ? - a : a;
1148     shiftCount = countLeadingZeros32( absA ) + 21;
1149     zSig = absA;
1150     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1151 
1152 }
1153 
1154 #ifdef FLOATX80
1155 
1156 /*
1157 -------------------------------------------------------------------------------
1158 Returns the result of converting the 32-bit two's complement integer `a'
1159 to the extended double-precision floating-point format.  The conversion
1160 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161 Arithmetic.
1162 -------------------------------------------------------------------------------
1163 */
1164 floatx80 int32_to_floatx80( int32 a )
1165 {
1166     flag zSign;
1167     uint32 absA;
1168     int8 shiftCount;
1169     bits64 zSig;
1170 
1171     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172     zSign = ( a < 0 );
1173     absA = zSign ? - a : a;
1174     shiftCount = countLeadingZeros32( absA ) + 32;
1175     zSig = absA;
1176     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1177 
1178 }
1179 
1180 #endif
1181 
1182 #ifdef FLOAT128
1183 
1184 /*
1185 -------------------------------------------------------------------------------
1186 Returns the result of converting the 32-bit two's complement integer `a' to
1187 the quadruple-precision floating-point format.  The conversion is performed
1188 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1189 -------------------------------------------------------------------------------
1190 */
1191 float128 int32_to_float128( int32 a )
1192 {
1193     flag zSign;
1194     uint32 absA;
1195     int8 shiftCount;
1196     bits64 zSig0;
1197 
1198     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1199     zSign = ( a < 0 );
1200     absA = zSign ? - a : a;
1201     shiftCount = countLeadingZeros32( absA ) + 17;
1202     zSig0 = absA;
1203     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1204 
1205 }
1206 
1207 #endif
1208 
1209 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1210 /*
1211 -------------------------------------------------------------------------------
1212 Returns the result of converting the 64-bit two's complement integer `a'
1213 to the single-precision floating-point format.  The conversion is performed
1214 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1215 -------------------------------------------------------------------------------
1216 */
1217 float32 int64_to_float32( int64 a )
1218 {
1219     flag zSign;
1220     uint64 absA;
1221     int8 shiftCount;
1222 
1223     if ( a == 0 ) return 0;
1224     zSign = ( a < 0 );
1225     absA = zSign ? - a : a;
1226     shiftCount = countLeadingZeros64( absA ) - 40;
1227     if ( 0 <= shiftCount ) {
1228         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1229     }
1230     else {
1231         shiftCount += 7;
1232         if ( shiftCount < 0 ) {
1233             shift64RightJamming( absA, - shiftCount, &absA );
1234         }
1235         else {
1236             absA <<= shiftCount;
1237         }
1238         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1239     }
1240 
1241 }
1242 
1243 /*
1244 -------------------------------------------------------------------------------
1245 Returns the result of converting the 64-bit two's complement integer `a'
1246 to the double-precision floating-point format.  The conversion is performed
1247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1248 -------------------------------------------------------------------------------
1249 */
1250 float64 int64_to_float64( int64 a )
1251 {
1252     flag zSign;
1253 
1254     if ( a == 0 ) return 0;
1255     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1256         return packFloat64( 1, 0x43E, 0 );
1257     }
1258     zSign = ( a < 0 );
1259     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1260 
1261 }
1262 
1263 #ifdef FLOATX80
1264 
1265 /*
1266 -------------------------------------------------------------------------------
1267 Returns the result of converting the 64-bit two's complement integer `a'
1268 to the extended double-precision floating-point format.  The conversion
1269 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1270 Arithmetic.
1271 -------------------------------------------------------------------------------
1272 */
1273 floatx80 int64_to_floatx80( int64 a )
1274 {
1275     flag zSign;
1276     uint64 absA;
1277     int8 shiftCount;
1278 
1279     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1280     zSign = ( a < 0 );
1281     absA = zSign ? - a : a;
1282     shiftCount = countLeadingZeros64( absA );
1283     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1284 
1285 }
1286 
1287 #endif
1288 
1289 #endif /* !SOFTFLOAT_FOR_GCC */
1290 
1291 #ifdef FLOAT128
1292 
1293 /*
1294 -------------------------------------------------------------------------------
1295 Returns the result of converting the 64-bit two's complement integer `a' to
1296 the quadruple-precision floating-point format.  The conversion is performed
1297 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1298 -------------------------------------------------------------------------------
1299 */
1300 float128 int64_to_float128( int64 a )
1301 {
1302     flag zSign;
1303     uint64 absA;
1304     int8 shiftCount;
1305     int32 zExp;
1306     bits64 zSig0, zSig1;
1307 
1308     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1309     zSign = ( a < 0 );
1310     absA = zSign ? - a : a;
1311     shiftCount = countLeadingZeros64( absA ) + 49;
1312     zExp = 0x406E - shiftCount;
1313     if ( 64 <= shiftCount ) {
1314         zSig1 = 0;
1315         zSig0 = absA;
1316         shiftCount -= 64;
1317     }
1318     else {
1319         zSig1 = absA;
1320         zSig0 = 0;
1321     }
1322     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1323     return packFloat128( zSign, zExp, zSig0, zSig1 );
1324 
1325 }
1326 
1327 #endif
1328 
1329 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1330 /*
1331 -------------------------------------------------------------------------------
1332 Returns the result of converting the single-precision floating-point value
1333 `a' to the 32-bit two's complement integer format.  The conversion is
1334 performed according to the IEC/IEEE Standard for Binary Floating-Point
1335 Arithmetic---which means in particular that the conversion is rounded
1336 according to the current rounding mode.  If `a' is a NaN, the largest
1337 positive integer is returned.  Otherwise, if the conversion overflows, the
1338 largest integer with the same sign as `a' is returned.
1339 -------------------------------------------------------------------------------
1340 */
1341 int32 float32_to_int32( float32 a )
1342 {
1343     flag aSign;
1344     int16 aExp, shiftCount;
1345     bits32 aSig;
1346     bits64 aSig64;
1347 
1348     aSig = extractFloat32Frac( a );
1349     aExp = extractFloat32Exp( a );
1350     aSign = extractFloat32Sign( a );
1351     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1352     if ( aExp ) aSig |= 0x00800000;
1353     shiftCount = 0xAF - aExp;
1354     aSig64 = aSig;
1355     aSig64 <<= 32;
1356     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1357     return roundAndPackInt32( aSign, aSig64 );
1358 
1359 }
1360 #endif /* !SOFTFLOAT_FOR_GCC */
1361 
1362 /*
1363 -------------------------------------------------------------------------------
1364 Returns the result of converting the single-precision floating-point value
1365 `a' to the 32-bit two's complement integer format.  The conversion is
1366 performed according to the IEC/IEEE Standard for Binary Floating-Point
1367 Arithmetic, except that the conversion is always rounded toward zero.
1368 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1369 the conversion overflows, the largest integer with the same sign as `a' is
1370 returned.
1371 -------------------------------------------------------------------------------
1372 */
1373 int32 float32_to_int32_round_to_zero( float32 a )
1374 {
1375     flag aSign;
1376     int16 aExp, shiftCount;
1377     bits32 aSig;
1378     int32 z;
1379 
1380     aSig = extractFloat32Frac( a );
1381     aExp = extractFloat32Exp( a );
1382     aSign = extractFloat32Sign( a );
1383     shiftCount = aExp - 0x9E;
1384     if ( 0 <= shiftCount ) {
1385         if ( a != 0xCF000000 ) {
1386             float_raise( float_flag_invalid );
1387             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1388         }
1389         return (sbits32) 0x80000000;
1390     }
1391     else if ( aExp <= 0x7E ) {
1392         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1393         return 0;
1394     }
1395     aSig = ( aSig | 0x00800000 )<<8;
1396     z = aSig>>( - shiftCount );
1397     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1398         float_exception_flags |= float_flag_inexact;
1399     }
1400     if ( aSign ) z = - z;
1401     return z;
1402 
1403 }
1404 
1405 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1406 /*
1407 -------------------------------------------------------------------------------
1408 Returns the result of converting the single-precision floating-point value
1409 `a' to the 64-bit two's complement integer format.  The conversion is
1410 performed according to the IEC/IEEE Standard for Binary Floating-Point
1411 Arithmetic---which means in particular that the conversion is rounded
1412 according to the current rounding mode.  If `a' is a NaN, the largest
1413 positive integer is returned.  Otherwise, if the conversion overflows, the
1414 largest integer with the same sign as `a' is returned.
1415 -------------------------------------------------------------------------------
1416 */
1417 int64 float32_to_int64( float32 a )
1418 {
1419     flag aSign;
1420     int16 aExp, shiftCount;
1421     bits32 aSig;
1422     bits64 aSig64, aSigExtra;
1423 
1424     aSig = extractFloat32Frac( a );
1425     aExp = extractFloat32Exp( a );
1426     aSign = extractFloat32Sign( a );
1427     shiftCount = 0xBE - aExp;
1428     if ( shiftCount < 0 ) {
1429         float_raise( float_flag_invalid );
1430         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1431             return LIT64( 0x7FFFFFFFFFFFFFFF );
1432         }
1433         return (sbits64) LIT64( 0x8000000000000000 );
1434     }
1435     if ( aExp ) aSig |= 0x00800000;
1436     aSig64 = aSig;
1437     aSig64 <<= 40;
1438     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1439     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1440 
1441 }
1442 
1443 /*
1444 -------------------------------------------------------------------------------
1445 Returns the result of converting the single-precision floating-point value
1446 `a' to the 64-bit two's complement integer format.  The conversion is
1447 performed according to the IEC/IEEE Standard for Binary Floating-Point
1448 Arithmetic, except that the conversion is always rounded toward zero.  If
1449 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1450 conversion overflows, the largest integer with the same sign as `a' is
1451 returned.
1452 -------------------------------------------------------------------------------
1453 */
1454 int64 float32_to_int64_round_to_zero( float32 a )
1455 {
1456     flag aSign;
1457     int16 aExp, shiftCount;
1458     bits32 aSig;
1459     bits64 aSig64;
1460     int64 z;
1461 
1462     aSig = extractFloat32Frac( a );
1463     aExp = extractFloat32Exp( a );
1464     aSign = extractFloat32Sign( a );
1465     shiftCount = aExp - 0xBE;
1466     if ( 0 <= shiftCount ) {
1467         if ( a != 0xDF000000 ) {
1468             float_raise( float_flag_invalid );
1469             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1470                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1471             }
1472         }
1473         return (sbits64) LIT64( 0x8000000000000000 );
1474     }
1475     else if ( aExp <= 0x7E ) {
1476         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1477         return 0;
1478     }
1479     aSig64 = aSig | 0x00800000;
1480     aSig64 <<= 40;
1481     z = aSig64>>( - shiftCount );
1482     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1483         float_exception_flags |= float_flag_inexact;
1484     }
1485     if ( aSign ) z = - z;
1486     return z;
1487 
1488 }
1489 #endif /* !SOFTFLOAT_FOR_GCC */
1490 
1491 /*
1492 -------------------------------------------------------------------------------
1493 Returns the result of converting the single-precision floating-point value
1494 `a' to the double-precision floating-point format.  The conversion is
1495 performed according to the IEC/IEEE Standard for Binary Floating-Point
1496 Arithmetic.
1497 -------------------------------------------------------------------------------
1498 */
1499 float64 float32_to_float64( float32 a )
1500 {
1501     flag aSign;
1502     int16 aExp;
1503     bits32 aSig;
1504 
1505     aSig = extractFloat32Frac( a );
1506     aExp = extractFloat32Exp( a );
1507     aSign = extractFloat32Sign( a );
1508     if ( aExp == 0xFF ) {
1509         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1510         return packFloat64( aSign, 0x7FF, 0 );
1511     }
1512     if ( aExp == 0 ) {
1513         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1514         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1515         --aExp;
1516     }
1517     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1518 
1519 }
1520 
1521 #ifdef FLOATX80
1522 
1523 /*
1524 -------------------------------------------------------------------------------
1525 Returns the result of converting the single-precision floating-point value
1526 `a' to the extended double-precision floating-point format.  The conversion
1527 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1528 Arithmetic.
1529 -------------------------------------------------------------------------------
1530 */
1531 floatx80 float32_to_floatx80( float32 a )
1532 {
1533     flag aSign;
1534     int16 aExp;
1535     bits32 aSig;
1536 
1537     aSig = extractFloat32Frac( a );
1538     aExp = extractFloat32Exp( a );
1539     aSign = extractFloat32Sign( a );
1540     if ( aExp == 0xFF ) {
1541         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1542         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1543     }
1544     if ( aExp == 0 ) {
1545         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1546         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1547     }
1548     aSig |= 0x00800000;
1549     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1550 
1551 }
1552 
1553 #endif
1554 
1555 #ifdef FLOAT128
1556 
1557 /*
1558 -------------------------------------------------------------------------------
1559 Returns the result of converting the single-precision floating-point value
1560 `a' to the double-precision floating-point format.  The conversion is
1561 performed according to the IEC/IEEE Standard for Binary Floating-Point
1562 Arithmetic.
1563 -------------------------------------------------------------------------------
1564 */
1565 float128 float32_to_float128( float32 a )
1566 {
1567     flag aSign;
1568     int16 aExp;
1569     bits32 aSig;
1570 
1571     aSig = extractFloat32Frac( a );
1572     aExp = extractFloat32Exp( a );
1573     aSign = extractFloat32Sign( a );
1574     if ( aExp == 0xFF ) {
1575         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1576         return packFloat128( aSign, 0x7FFF, 0, 0 );
1577     }
1578     if ( aExp == 0 ) {
1579         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1580         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1581         --aExp;
1582     }
1583     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1584 
1585 }
1586 
1587 #endif
1588 
1589 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1590 /*
1591 -------------------------------------------------------------------------------
1592 Rounds the single-precision floating-point value `a' to an integer, and
1593 returns the result as a single-precision floating-point value.  The
1594 operation is performed according to the IEC/IEEE Standard for Binary
1595 Floating-Point Arithmetic.
1596 -------------------------------------------------------------------------------
1597 */
1598 float32 float32_round_to_int( float32 a )
1599 {
1600     flag aSign;
1601     int16 aExp;
1602     bits32 lastBitMask, roundBitsMask;
1603     int8 roundingMode;
1604     float32 z;
1605 
1606     aExp = extractFloat32Exp( a );
1607     if ( 0x96 <= aExp ) {
1608         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1609             return propagateFloat32NaN( a, a );
1610         }
1611         return a;
1612     }
1613     if ( aExp <= 0x7E ) {
1614         if ( (bits32) ( a<<1 ) == 0 ) return a;
1615         float_exception_flags |= float_flag_inexact;
1616         aSign = extractFloat32Sign( a );
1617         switch ( float_rounding_mode ) {
1618          case float_round_nearest_even:
1619             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1620                 return packFloat32( aSign, 0x7F, 0 );
1621             }
1622             break;
1623 	 case float_round_to_zero:
1624 	    break;
1625          case float_round_down:
1626             return aSign ? 0xBF800000 : 0;
1627          case float_round_up:
1628             return aSign ? 0x80000000 : 0x3F800000;
1629         }
1630         return packFloat32( aSign, 0, 0 );
1631     }
1632     lastBitMask = 1;
1633     lastBitMask <<= 0x96 - aExp;
1634     roundBitsMask = lastBitMask - 1;
1635     z = a;
1636     roundingMode = float_rounding_mode;
1637     if ( roundingMode == float_round_nearest_even ) {
1638         z += lastBitMask>>1;
1639         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1640     }
1641     else if ( roundingMode != float_round_to_zero ) {
1642         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1643             z += roundBitsMask;
1644         }
1645     }
1646     z &= ~ roundBitsMask;
1647     if ( z != a ) float_exception_flags |= float_flag_inexact;
1648     return z;
1649 
1650 }
1651 #endif /* !SOFTFLOAT_FOR_GCC */
1652 
1653 /*
1654 -------------------------------------------------------------------------------
1655 Returns the result of adding the absolute values of the single-precision
1656 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1657 before being returned.  `zSign' is ignored if the result is a NaN.
1658 The addition is performed according to the IEC/IEEE Standard for Binary
1659 Floating-Point Arithmetic.
1660 -------------------------------------------------------------------------------
1661 */
1662 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1663 {
1664     int16 aExp, bExp, zExp;
1665     bits32 aSig, bSig, zSig;
1666     int16 expDiff;
1667 
1668     aSig = extractFloat32Frac( a );
1669     aExp = extractFloat32Exp( a );
1670     bSig = extractFloat32Frac( b );
1671     bExp = extractFloat32Exp( b );
1672     expDiff = aExp - bExp;
1673     aSig <<= 6;
1674     bSig <<= 6;
1675     if ( 0 < expDiff ) {
1676         if ( aExp == 0xFF ) {
1677             if ( aSig ) return propagateFloat32NaN( a, b );
1678             return a;
1679         }
1680         if ( bExp == 0 ) {
1681             --expDiff;
1682         }
1683         else {
1684             bSig |= 0x20000000;
1685         }
1686         shift32RightJamming( bSig, expDiff, &bSig );
1687         zExp = aExp;
1688     }
1689     else if ( expDiff < 0 ) {
1690         if ( bExp == 0xFF ) {
1691             if ( bSig ) return propagateFloat32NaN( a, b );
1692             return packFloat32( zSign, 0xFF, 0 );
1693         }
1694         if ( aExp == 0 ) {
1695             ++expDiff;
1696         }
1697         else {
1698             aSig |= 0x20000000;
1699         }
1700         shift32RightJamming( aSig, - expDiff, &aSig );
1701         zExp = bExp;
1702     }
1703     else {
1704         if ( aExp == 0xFF ) {
1705             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1706             return a;
1707         }
1708         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1709         zSig = 0x40000000 + aSig + bSig;
1710         zExp = aExp;
1711         goto roundAndPack;
1712     }
1713     aSig |= 0x20000000;
1714     zSig = ( aSig + bSig )<<1;
1715     --zExp;
1716     if ( (sbits32) zSig < 0 ) {
1717         zSig = aSig + bSig;
1718         ++zExp;
1719     }
1720  roundAndPack:
1721     return roundAndPackFloat32( zSign, zExp, zSig );
1722 
1723 }
1724 
1725 /*
1726 -------------------------------------------------------------------------------
1727 Returns the result of subtracting the absolute values of the single-
1728 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1729 difference is negated before being returned.  `zSign' is ignored if the
1730 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1731 Standard for Binary Floating-Point Arithmetic.
1732 -------------------------------------------------------------------------------
1733 */
1734 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1735 {
1736     int16 aExp, bExp, zExp;
1737     bits32 aSig, bSig, zSig;
1738     int16 expDiff;
1739 
1740     aSig = extractFloat32Frac( a );
1741     aExp = extractFloat32Exp( a );
1742     bSig = extractFloat32Frac( b );
1743     bExp = extractFloat32Exp( b );
1744     expDiff = aExp - bExp;
1745     aSig <<= 7;
1746     bSig <<= 7;
1747     if ( 0 < expDiff ) goto aExpBigger;
1748     if ( expDiff < 0 ) goto bExpBigger;
1749     if ( aExp == 0xFF ) {
1750         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1751         float_raise( float_flag_invalid );
1752         return float32_default_nan;
1753     }
1754     if ( aExp == 0 ) {
1755         aExp = 1;
1756         bExp = 1;
1757     }
1758     if ( bSig < aSig ) goto aBigger;
1759     if ( aSig < bSig ) goto bBigger;
1760     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1761  bExpBigger:
1762     if ( bExp == 0xFF ) {
1763         if ( bSig ) return propagateFloat32NaN( a, b );
1764         return packFloat32( zSign ^ 1, 0xFF, 0 );
1765     }
1766     if ( aExp == 0 ) {
1767         ++expDiff;
1768     }
1769     else {
1770         aSig |= 0x40000000;
1771     }
1772     shift32RightJamming( aSig, - expDiff, &aSig );
1773     bSig |= 0x40000000;
1774  bBigger:
1775     zSig = bSig - aSig;
1776     zExp = bExp;
1777     zSign ^= 1;
1778     goto normalizeRoundAndPack;
1779  aExpBigger:
1780     if ( aExp == 0xFF ) {
1781         if ( aSig ) return propagateFloat32NaN( a, b );
1782         return a;
1783     }
1784     if ( bExp == 0 ) {
1785         --expDiff;
1786     }
1787     else {
1788         bSig |= 0x40000000;
1789     }
1790     shift32RightJamming( bSig, expDiff, &bSig );
1791     aSig |= 0x40000000;
1792  aBigger:
1793     zSig = aSig - bSig;
1794     zExp = aExp;
1795  normalizeRoundAndPack:
1796     --zExp;
1797     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1798 
1799 }
1800 
1801 /*
1802 -------------------------------------------------------------------------------
1803 Returns the result of adding the single-precision floating-point values `a'
1804 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1805 Binary Floating-Point Arithmetic.
1806 -------------------------------------------------------------------------------
1807 */
1808 float32 float32_add( float32 a, float32 b )
1809 {
1810     flag aSign, bSign;
1811 
1812     aSign = extractFloat32Sign( a );
1813     bSign = extractFloat32Sign( b );
1814     if ( aSign == bSign ) {
1815         return addFloat32Sigs( a, b, aSign );
1816     }
1817     else {
1818         return subFloat32Sigs( a, b, aSign );
1819     }
1820 
1821 }
1822 
1823 /*
1824 -------------------------------------------------------------------------------
1825 Returns the result of subtracting the single-precision floating-point values
1826 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1827 for Binary Floating-Point Arithmetic.
1828 -------------------------------------------------------------------------------
1829 */
1830 float32 float32_sub( float32 a, float32 b )
1831 {
1832     flag aSign, bSign;
1833 
1834     aSign = extractFloat32Sign( a );
1835     bSign = extractFloat32Sign( b );
1836     if ( aSign == bSign ) {
1837         return subFloat32Sigs( a, b, aSign );
1838     }
1839     else {
1840         return addFloat32Sigs( a, b, aSign );
1841     }
1842 
1843 }
1844 
1845 /*
1846 -------------------------------------------------------------------------------
1847 Returns the result of multiplying the single-precision floating-point values
1848 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1849 for Binary Floating-Point Arithmetic.
1850 -------------------------------------------------------------------------------
1851 */
1852 float32 float32_mul( float32 a, float32 b )
1853 {
1854     flag aSign, bSign, zSign;
1855     int16 aExp, bExp, zExp;
1856     bits32 aSig, bSig;
1857     bits64 zSig64;
1858     bits32 zSig;
1859 
1860     aSig = extractFloat32Frac( a );
1861     aExp = extractFloat32Exp( a );
1862     aSign = extractFloat32Sign( a );
1863     bSig = extractFloat32Frac( b );
1864     bExp = extractFloat32Exp( b );
1865     bSign = extractFloat32Sign( b );
1866     zSign = aSign ^ bSign;
1867     if ( aExp == 0xFF ) {
1868         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1869             return propagateFloat32NaN( a, b );
1870         }
1871         if ( ( bExp | bSig ) == 0 ) {
1872             float_raise( float_flag_invalid );
1873             return float32_default_nan;
1874         }
1875         return packFloat32( zSign, 0xFF, 0 );
1876     }
1877     if ( bExp == 0xFF ) {
1878         if ( bSig ) return propagateFloat32NaN( a, b );
1879         if ( ( aExp | aSig ) == 0 ) {
1880             float_raise( float_flag_invalid );
1881             return float32_default_nan;
1882         }
1883         return packFloat32( zSign, 0xFF, 0 );
1884     }
1885     if ( aExp == 0 ) {
1886         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1887         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1888     }
1889     if ( bExp == 0 ) {
1890         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1891         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1892     }
1893     zExp = aExp + bExp - 0x7F;
1894     aSig = ( aSig | 0x00800000 )<<7;
1895     bSig = ( bSig | 0x00800000 )<<8;
1896     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1897     zSig = zSig64;
1898     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1899         zSig <<= 1;
1900         --zExp;
1901     }
1902     return roundAndPackFloat32( zSign, zExp, zSig );
1903 
1904 }
1905 
1906 /*
1907 -------------------------------------------------------------------------------
1908 Returns the result of dividing the single-precision floating-point value `a'
1909 by the corresponding value `b'.  The operation is performed according to the
1910 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1911 -------------------------------------------------------------------------------
1912 */
1913 float32 float32_div( float32 a, float32 b )
1914 {
1915     flag aSign, bSign, zSign;
1916     int16 aExp, bExp, zExp;
1917     bits32 aSig, bSig, zSig;
1918 
1919     aSig = extractFloat32Frac( a );
1920     aExp = extractFloat32Exp( a );
1921     aSign = extractFloat32Sign( a );
1922     bSig = extractFloat32Frac( b );
1923     bExp = extractFloat32Exp( b );
1924     bSign = extractFloat32Sign( b );
1925     zSign = aSign ^ bSign;
1926     if ( aExp == 0xFF ) {
1927         if ( aSig ) return propagateFloat32NaN( a, b );
1928         if ( bExp == 0xFF ) {
1929             if ( bSig ) return propagateFloat32NaN( a, b );
1930             float_raise( float_flag_invalid );
1931             return float32_default_nan;
1932         }
1933         return packFloat32( zSign, 0xFF, 0 );
1934     }
1935     if ( bExp == 0xFF ) {
1936         if ( bSig ) return propagateFloat32NaN( a, b );
1937         return packFloat32( zSign, 0, 0 );
1938     }
1939     if ( bExp == 0 ) {
1940         if ( bSig == 0 ) {
1941             if ( ( aExp | aSig ) == 0 ) {
1942                 float_raise( float_flag_invalid );
1943                 return float32_default_nan;
1944             }
1945             float_raise( float_flag_divbyzero );
1946             return packFloat32( zSign, 0xFF, 0 );
1947         }
1948         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1949     }
1950     if ( aExp == 0 ) {
1951         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1952         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1953     }
1954     zExp = aExp - bExp + 0x7D;
1955     aSig = ( aSig | 0x00800000 )<<7;
1956     bSig = ( bSig | 0x00800000 )<<8;
1957     if ( bSig <= ( aSig + aSig ) ) {
1958         aSig >>= 1;
1959         ++zExp;
1960     }
1961     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1962     if ( ( zSig & 0x3F ) == 0 ) {
1963         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1964     }
1965     return roundAndPackFloat32( zSign, zExp, zSig );
1966 
1967 }
1968 
1969 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1970 /*
1971 -------------------------------------------------------------------------------
1972 Returns the remainder of the single-precision floating-point value `a'
1973 with respect to the corresponding value `b'.  The operation is performed
1974 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1975 -------------------------------------------------------------------------------
1976 */
1977 float32 float32_rem( float32 a, float32 b )
1978 {
1979     flag aSign, bSign, zSign;
1980     int16 aExp, bExp, expDiff;
1981     bits32 aSig, bSig;
1982     bits32 q;
1983     bits64 aSig64, bSig64, q64;
1984     bits32 alternateASig;
1985     sbits32 sigMean;
1986 
1987     aSig = extractFloat32Frac( a );
1988     aExp = extractFloat32Exp( a );
1989     aSign = extractFloat32Sign( a );
1990     bSig = extractFloat32Frac( b );
1991     bExp = extractFloat32Exp( b );
1992     bSign = extractFloat32Sign( b );
1993     if ( aExp == 0xFF ) {
1994         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1995             return propagateFloat32NaN( a, b );
1996         }
1997         float_raise( float_flag_invalid );
1998         return float32_default_nan;
1999     }
2000     if ( bExp == 0xFF ) {
2001         if ( bSig ) return propagateFloat32NaN( a, b );
2002         return a;
2003     }
2004     if ( bExp == 0 ) {
2005         if ( bSig == 0 ) {
2006             float_raise( float_flag_invalid );
2007             return float32_default_nan;
2008         }
2009         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2010     }
2011     if ( aExp == 0 ) {
2012         if ( aSig == 0 ) return a;
2013         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2014     }
2015     expDiff = aExp - bExp;
2016     aSig |= 0x00800000;
2017     bSig |= 0x00800000;
2018     if ( expDiff < 32 ) {
2019         aSig <<= 8;
2020         bSig <<= 8;
2021         if ( expDiff < 0 ) {
2022             if ( expDiff < -1 ) return a;
2023             aSig >>= 1;
2024         }
2025         q = ( bSig <= aSig );
2026         if ( q ) aSig -= bSig;
2027         if ( 0 < expDiff ) {
2028             q = ( ( (bits64) aSig )<<32 ) / bSig;
2029             q >>= 32 - expDiff;
2030             bSig >>= 2;
2031             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2032         }
2033         else {
2034             aSig >>= 2;
2035             bSig >>= 2;
2036         }
2037     }
2038     else {
2039         if ( bSig <= aSig ) aSig -= bSig;
2040         aSig64 = ( (bits64) aSig )<<40;
2041         bSig64 = ( (bits64) bSig )<<40;
2042         expDiff -= 64;
2043         while ( 0 < expDiff ) {
2044             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2045             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2046             aSig64 = - ( ( bSig * q64 )<<38 );
2047             expDiff -= 62;
2048         }
2049         expDiff += 64;
2050         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2051         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2052         q = q64>>( 64 - expDiff );
2053         bSig <<= 6;
2054         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2055     }
2056     do {
2057         alternateASig = aSig;
2058         ++q;
2059         aSig -= bSig;
2060     } while ( 0 <= (sbits32) aSig );
2061     sigMean = aSig + alternateASig;
2062     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2063         aSig = alternateASig;
2064     }
2065     zSign = ( (sbits32) aSig < 0 );
2066     if ( zSign ) aSig = - aSig;
2067     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2068 
2069 }
2070 #endif /* !SOFTFLOAT_FOR_GCC */
2071 
2072 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2073 /*
2074 -------------------------------------------------------------------------------
2075 Returns the square root of the single-precision floating-point value `a'.
2076 The operation is performed according to the IEC/IEEE Standard for Binary
2077 Floating-Point Arithmetic.
2078 -------------------------------------------------------------------------------
2079 */
2080 float32 float32_sqrt( float32 a )
2081 {
2082     flag aSign;
2083     int16 aExp, zExp;
2084     bits32 aSig, zSig;
2085     bits64 rem, term;
2086 
2087     aSig = extractFloat32Frac( a );
2088     aExp = extractFloat32Exp( a );
2089     aSign = extractFloat32Sign( a );
2090     if ( aExp == 0xFF ) {
2091         if ( aSig ) return propagateFloat32NaN( a, 0 );
2092         if ( ! aSign ) return a;
2093         float_raise( float_flag_invalid );
2094         return float32_default_nan;
2095     }
2096     if ( aSign ) {
2097         if ( ( aExp | aSig ) == 0 ) return a;
2098         float_raise( float_flag_invalid );
2099         return float32_default_nan;
2100     }
2101     if ( aExp == 0 ) {
2102         if ( aSig == 0 ) return 0;
2103         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2104     }
2105     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2106     aSig = ( aSig | 0x00800000 )<<8;
2107     zSig = estimateSqrt32( aExp, aSig ) + 2;
2108     if ( ( zSig & 0x7F ) <= 5 ) {
2109         if ( zSig < 2 ) {
2110             zSig = 0x7FFFFFFF;
2111             goto roundAndPack;
2112         }
2113         aSig >>= aExp & 1;
2114         term = ( (bits64) zSig ) * zSig;
2115         rem = ( ( (bits64) aSig )<<32 ) - term;
2116         while ( (sbits64) rem < 0 ) {
2117             --zSig;
2118             rem += ( ( (bits64) zSig )<<1 ) | 1;
2119         }
2120         zSig |= ( rem != 0 );
2121     }
2122     shift32RightJamming( zSig, 1, &zSig );
2123  roundAndPack:
2124     return roundAndPackFloat32( 0, zExp, zSig );
2125 
2126 }
2127 #endif /* !SOFTFLOAT_FOR_GCC */
2128 
2129 /*
2130 -------------------------------------------------------------------------------
2131 Returns 1 if the single-precision floating-point value `a' is equal to
2132 the corresponding value `b', and 0 otherwise.  The comparison is performed
2133 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2134 -------------------------------------------------------------------------------
2135 */
2136 flag float32_eq( float32 a, float32 b )
2137 {
2138 
2139     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2140          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2141        ) {
2142         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2143             float_raise( float_flag_invalid );
2144         }
2145         return 0;
2146     }
2147     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2148 
2149 }
2150 
2151 /*
2152 -------------------------------------------------------------------------------
2153 Returns 1 if the single-precision floating-point value `a' is less than
2154 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2155 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2156 Arithmetic.
2157 -------------------------------------------------------------------------------
2158 */
2159 flag float32_le( float32 a, float32 b )
2160 {
2161     flag aSign, bSign;
2162 
2163     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2164          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2165        ) {
2166         float_raise( float_flag_invalid );
2167         return 0;
2168     }
2169     aSign = extractFloat32Sign( a );
2170     bSign = extractFloat32Sign( b );
2171     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2172     return ( a == b ) || ( aSign ^ ( a < b ) );
2173 
2174 }
2175 
2176 /*
2177 -------------------------------------------------------------------------------
2178 Returns 1 if the single-precision floating-point value `a' is less than
2179 the corresponding value `b', and 0 otherwise.  The comparison is performed
2180 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2181 -------------------------------------------------------------------------------
2182 */
2183 flag float32_lt( float32 a, float32 b )
2184 {
2185     flag aSign, bSign;
2186 
2187     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2188          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2189        ) {
2190         float_raise( float_flag_invalid );
2191         return 0;
2192     }
2193     aSign = extractFloat32Sign( a );
2194     bSign = extractFloat32Sign( b );
2195     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2196     return ( a != b ) && ( aSign ^ ( a < b ) );
2197 
2198 }
2199 
2200 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2201 /*
2202 -------------------------------------------------------------------------------
2203 Returns 1 if the single-precision floating-point value `a' is equal to
2204 the corresponding value `b', and 0 otherwise.  The invalid exception is
2205 raised if either operand is a NaN.  Otherwise, the comparison is performed
2206 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2207 -------------------------------------------------------------------------------
2208 */
2209 flag float32_eq_signaling( float32 a, float32 b )
2210 {
2211 
2212     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2213          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2214        ) {
2215         float_raise( float_flag_invalid );
2216         return 0;
2217     }
2218     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2219 
2220 }
2221 
2222 /*
2223 -------------------------------------------------------------------------------
2224 Returns 1 if the single-precision floating-point value `a' is less than or
2225 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2226 cause an exception.  Otherwise, the comparison is performed according to the
2227 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2228 -------------------------------------------------------------------------------
2229 */
2230 flag float32_le_quiet( float32 a, float32 b )
2231 {
2232     flag aSign, bSign;
2233 
2234     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2235          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2236        ) {
2237         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2238             float_raise( float_flag_invalid );
2239         }
2240         return 0;
2241     }
2242     aSign = extractFloat32Sign( a );
2243     bSign = extractFloat32Sign( b );
2244     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2245     return ( a == b ) || ( aSign ^ ( a < b ) );
2246 
2247 }
2248 
2249 /*
2250 -------------------------------------------------------------------------------
2251 Returns 1 if the single-precision floating-point value `a' is less than
2252 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2253 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2254 Standard for Binary Floating-Point Arithmetic.
2255 -------------------------------------------------------------------------------
2256 */
2257 flag float32_lt_quiet( float32 a, float32 b )
2258 {
2259     flag aSign, bSign;
2260 
2261     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2262          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2263        ) {
2264         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2265             float_raise( float_flag_invalid );
2266         }
2267         return 0;
2268     }
2269     aSign = extractFloat32Sign( a );
2270     bSign = extractFloat32Sign( b );
2271     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2272     return ( a != b ) && ( aSign ^ ( a < b ) );
2273 
2274 }
2275 #endif /* !SOFTFLOAT_FOR_GCC */
2276 
2277 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2278 /*
2279 -------------------------------------------------------------------------------
2280 Returns the result of converting the double-precision floating-point value
2281 `a' to the 32-bit two's complement integer format.  The conversion is
2282 performed according to the IEC/IEEE Standard for Binary Floating-Point
2283 Arithmetic---which means in particular that the conversion is rounded
2284 according to the current rounding mode.  If `a' is a NaN, the largest
2285 positive integer is returned.  Otherwise, if the conversion overflows, the
2286 largest integer with the same sign as `a' is returned.
2287 -------------------------------------------------------------------------------
2288 */
2289 int32 float64_to_int32( float64 a )
2290 {
2291     flag aSign;
2292     int16 aExp, shiftCount;
2293     bits64 aSig;
2294 
2295     aSig = extractFloat64Frac( a );
2296     aExp = extractFloat64Exp( a );
2297     aSign = extractFloat64Sign( a );
2298     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2299     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2300     shiftCount = 0x42C - aExp;
2301     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2302     return roundAndPackInt32( aSign, aSig );
2303 
2304 }
2305 #endif /* !SOFTFLOAT_FOR_GCC */
2306 
2307 /*
2308 -------------------------------------------------------------------------------
2309 Returns the result of converting the double-precision floating-point value
2310 `a' to the 32-bit two's complement integer format.  The conversion is
2311 performed according to the IEC/IEEE Standard for Binary Floating-Point
2312 Arithmetic, except that the conversion is always rounded toward zero.
2313 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2314 the conversion overflows, the largest integer with the same sign as `a' is
2315 returned.
2316 -------------------------------------------------------------------------------
2317 */
2318 int32 float64_to_int32_round_to_zero( float64 a )
2319 {
2320     flag aSign;
2321     int16 aExp, shiftCount;
2322     bits64 aSig, savedASig;
2323     int32 z;
2324 
2325     aSig = extractFloat64Frac( a );
2326     aExp = extractFloat64Exp( a );
2327     aSign = extractFloat64Sign( a );
2328     if ( 0x41E < aExp ) {
2329         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2330         goto invalid;
2331     }
2332     else if ( aExp < 0x3FF ) {
2333         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2334         return 0;
2335     }
2336     aSig |= LIT64( 0x0010000000000000 );
2337     shiftCount = 0x433 - aExp;
2338     savedASig = aSig;
2339     aSig >>= shiftCount;
2340     z = aSig;
2341     if ( aSign ) z = - z;
2342     if ( ( z < 0 ) ^ aSign ) {
2343  invalid:
2344         float_raise( float_flag_invalid );
2345         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2346     }
2347     if ( ( aSig<<shiftCount ) != savedASig ) {
2348         float_exception_flags |= float_flag_inexact;
2349     }
2350     return z;
2351 
2352 }
2353 
2354 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2355 /*
2356 -------------------------------------------------------------------------------
2357 Returns the result of converting the double-precision floating-point value
2358 `a' to the 64-bit two's complement integer format.  The conversion is
2359 performed according to the IEC/IEEE Standard for Binary Floating-Point
2360 Arithmetic---which means in particular that the conversion is rounded
2361 according to the current rounding mode.  If `a' is a NaN, the largest
2362 positive integer is returned.  Otherwise, if the conversion overflows, the
2363 largest integer with the same sign as `a' is returned.
2364 -------------------------------------------------------------------------------
2365 */
2366 int64 float64_to_int64( float64 a )
2367 {
2368     flag aSign;
2369     int16 aExp, shiftCount;
2370     bits64 aSig, aSigExtra;
2371 
2372     aSig = extractFloat64Frac( a );
2373     aExp = extractFloat64Exp( a );
2374     aSign = extractFloat64Sign( a );
2375     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2376     shiftCount = 0x433 - aExp;
2377     if ( shiftCount <= 0 ) {
2378         if ( 0x43E < aExp ) {
2379             float_raise( float_flag_invalid );
2380             if (    ! aSign
2381                  || (    ( aExp == 0x7FF )
2382                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2383                ) {
2384                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2385             }
2386             return (sbits64) LIT64( 0x8000000000000000 );
2387         }
2388         aSigExtra = 0;
2389         aSig <<= - shiftCount;
2390     }
2391     else {
2392         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2393     }
2394     return roundAndPackInt64( aSign, aSig, aSigExtra );
2395 
2396 }
2397 
2398 /*
2399 -------------------------------------------------------------------------------
2400 Returns the result of converting the double-precision floating-point value
2401 `a' to the 64-bit two's complement integer format.  The conversion is
2402 performed according to the IEC/IEEE Standard for Binary Floating-Point
2403 Arithmetic, except that the conversion is always rounded toward zero.
2404 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2405 the conversion overflows, the largest integer with the same sign as `a' is
2406 returned.
2407 -------------------------------------------------------------------------------
2408 */
2409 int64 float64_to_int64_round_to_zero( float64 a )
2410 {
2411     flag aSign;
2412     int16 aExp, shiftCount;
2413     bits64 aSig;
2414     int64 z;
2415 
2416     aSig = extractFloat64Frac( a );
2417     aExp = extractFloat64Exp( a );
2418     aSign = extractFloat64Sign( a );
2419     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2420     shiftCount = aExp - 0x433;
2421     if ( 0 <= shiftCount ) {
2422         if ( 0x43E <= aExp ) {
2423             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2424                 float_raise( float_flag_invalid );
2425                 if (    ! aSign
2426                      || (    ( aExp == 0x7FF )
2427                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2428                    ) {
2429                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2430                 }
2431             }
2432             return (sbits64) LIT64( 0x8000000000000000 );
2433         }
2434         z = aSig<<shiftCount;
2435     }
2436     else {
2437         if ( aExp < 0x3FE ) {
2438             if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2439             return 0;
2440         }
2441         z = aSig>>( - shiftCount );
2442         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2443             float_exception_flags |= float_flag_inexact;
2444         }
2445     }
2446     if ( aSign ) z = - z;
2447     return z;
2448 
2449 }
2450 #endif /* !SOFTFLOAT_FOR_GCC */
2451 
2452 /*
2453 -------------------------------------------------------------------------------
2454 Returns the result of converting the double-precision floating-point value
2455 `a' to the single-precision floating-point format.  The conversion is
2456 performed according to the IEC/IEEE Standard for Binary Floating-Point
2457 Arithmetic.
2458 -------------------------------------------------------------------------------
2459 */
2460 float32 float64_to_float32( float64 a )
2461 {
2462     flag aSign;
2463     int16 aExp;
2464     bits64 aSig;
2465     bits32 zSig;
2466 
2467     aSig = extractFloat64Frac( a );
2468     aExp = extractFloat64Exp( a );
2469     aSign = extractFloat64Sign( a );
2470     if ( aExp == 0x7FF ) {
2471         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2472         return packFloat32( aSign, 0xFF, 0 );
2473     }
2474     shift64RightJamming( aSig, 22, &aSig );
2475     zSig = aSig;
2476     if ( aExp || zSig ) {
2477         zSig |= 0x40000000;
2478         aExp -= 0x381;
2479     }
2480     return roundAndPackFloat32( aSign, aExp, zSig );
2481 
2482 }
2483 
2484 #ifdef FLOATX80
2485 
2486 /*
2487 -------------------------------------------------------------------------------
2488 Returns the result of converting the double-precision floating-point value
2489 `a' to the extended double-precision floating-point format.  The conversion
2490 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2491 Arithmetic.
2492 -------------------------------------------------------------------------------
2493 */
2494 floatx80 float64_to_floatx80( float64 a )
2495 {
2496     flag aSign;
2497     int16 aExp;
2498     bits64 aSig;
2499 
2500     aSig = extractFloat64Frac( a );
2501     aExp = extractFloat64Exp( a );
2502     aSign = extractFloat64Sign( a );
2503     if ( aExp == 0x7FF ) {
2504         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2505         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2506     }
2507     if ( aExp == 0 ) {
2508         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2509         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2510     }
2511     return
2512         packFloatx80(
2513             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2514 
2515 }
2516 
2517 #endif
2518 
2519 #ifdef FLOAT128
2520 
2521 /*
2522 -------------------------------------------------------------------------------
2523 Returns the result of converting the double-precision floating-point value
2524 `a' to the quadruple-precision floating-point format.  The conversion is
2525 performed according to the IEC/IEEE Standard for Binary Floating-Point
2526 Arithmetic.
2527 -------------------------------------------------------------------------------
2528 */
2529 float128 float64_to_float128( float64 a )
2530 {
2531     flag aSign;
2532     int16 aExp;
2533     bits64 aSig, zSig0, zSig1;
2534 
2535     aSig = extractFloat64Frac( a );
2536     aExp = extractFloat64Exp( a );
2537     aSign = extractFloat64Sign( a );
2538     if ( aExp == 0x7FF ) {
2539         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2540         return packFloat128( aSign, 0x7FFF, 0, 0 );
2541     }
2542     if ( aExp == 0 ) {
2543         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2544         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2545         --aExp;
2546     }
2547     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2548     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2549 
2550 }
2551 
2552 #endif
2553 
2554 #ifndef SOFTFLOAT_FOR_GCC
2555 /*
2556 -------------------------------------------------------------------------------
2557 Rounds the double-precision floating-point value `a' to an integer, and
2558 returns the result as a double-precision floating-point value.  The
2559 operation is performed according to the IEC/IEEE Standard for Binary
2560 Floating-Point Arithmetic.
2561 -------------------------------------------------------------------------------
2562 */
2563 float64 float64_round_to_int( float64 a )
2564 {
2565     flag aSign;
2566     int16 aExp;
2567     bits64 lastBitMask, roundBitsMask;
2568     int8 roundingMode;
2569     float64 z;
2570 
2571     aExp = extractFloat64Exp( a );
2572     if ( 0x433 <= aExp ) {
2573         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2574             return propagateFloat64NaN( a, a );
2575         }
2576         return a;
2577     }
2578     if ( aExp < 0x3FF ) {
2579         if ( (bits64) ( a<<1 ) == 0 ) return a;
2580         float_exception_flags |= float_flag_inexact;
2581         aSign = extractFloat64Sign( a );
2582         switch ( float_rounding_mode ) {
2583          case float_round_nearest_even:
2584             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2585                 return packFloat64( aSign, 0x3FF, 0 );
2586             }
2587             break;
2588 	 case float_round_to_zero:
2589 	    break;
2590          case float_round_down:
2591             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2592          case float_round_up:
2593             return
2594             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2595         }
2596         return packFloat64( aSign, 0, 0 );
2597     }
2598     lastBitMask = 1;
2599     lastBitMask <<= 0x433 - aExp;
2600     roundBitsMask = lastBitMask - 1;
2601     z = a;
2602     roundingMode = float_rounding_mode;
2603     if ( roundingMode == float_round_nearest_even ) {
2604         z += lastBitMask>>1;
2605         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2606     }
2607     else if ( roundingMode != float_round_to_zero ) {
2608         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2609             z += roundBitsMask;
2610         }
2611     }
2612     z &= ~ roundBitsMask;
2613     if ( z != a ) float_exception_flags |= float_flag_inexact;
2614     return z;
2615 
2616 }
2617 #endif
2618 
2619 /*
2620 -------------------------------------------------------------------------------
2621 Returns the result of adding the absolute values of the double-precision
2622 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2623 before being returned.  `zSign' is ignored if the result is a NaN.
2624 The addition is performed according to the IEC/IEEE Standard for Binary
2625 Floating-Point Arithmetic.
2626 -------------------------------------------------------------------------------
2627 */
2628 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2629 {
2630     int16 aExp, bExp, zExp;
2631     bits64 aSig, bSig, zSig;
2632     int16 expDiff;
2633 
2634     aSig = extractFloat64Frac( a );
2635     aExp = extractFloat64Exp( a );
2636     bSig = extractFloat64Frac( b );
2637     bExp = extractFloat64Exp( b );
2638     expDiff = aExp - bExp;
2639     aSig <<= 9;
2640     bSig <<= 9;
2641     if ( 0 < expDiff ) {
2642         if ( aExp == 0x7FF ) {
2643             if ( aSig ) return propagateFloat64NaN( a, b );
2644             return a;
2645         }
2646         if ( bExp == 0 ) {
2647             --expDiff;
2648         }
2649         else {
2650             bSig |= LIT64( 0x2000000000000000 );
2651         }
2652         shift64RightJamming( bSig, expDiff, &bSig );
2653         zExp = aExp;
2654     }
2655     else if ( expDiff < 0 ) {
2656         if ( bExp == 0x7FF ) {
2657             if ( bSig ) return propagateFloat64NaN( a, b );
2658             return packFloat64( zSign, 0x7FF, 0 );
2659         }
2660         if ( aExp == 0 ) {
2661             ++expDiff;
2662         }
2663         else {
2664             aSig |= LIT64( 0x2000000000000000 );
2665         }
2666         shift64RightJamming( aSig, - expDiff, &aSig );
2667         zExp = bExp;
2668     }
2669     else {
2670         if ( aExp == 0x7FF ) {
2671             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2672             return a;
2673         }
2674         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2675         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2676         zExp = aExp;
2677         goto roundAndPack;
2678     }
2679     aSig |= LIT64( 0x2000000000000000 );
2680     zSig = ( aSig + bSig )<<1;
2681     --zExp;
2682     if ( (sbits64) zSig < 0 ) {
2683         zSig = aSig + bSig;
2684         ++zExp;
2685     }
2686  roundAndPack:
2687     return roundAndPackFloat64( zSign, zExp, zSig );
2688 
2689 }
2690 
2691 /*
2692 -------------------------------------------------------------------------------
2693 Returns the result of subtracting the absolute values of the double-
2694 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2695 difference is negated before being returned.  `zSign' is ignored if the
2696 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2697 Standard for Binary Floating-Point Arithmetic.
2698 -------------------------------------------------------------------------------
2699 */
2700 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2701 {
2702     int16 aExp, bExp, zExp;
2703     bits64 aSig, bSig, zSig;
2704     int16 expDiff;
2705 
2706     aSig = extractFloat64Frac( a );
2707     aExp = extractFloat64Exp( a );
2708     bSig = extractFloat64Frac( b );
2709     bExp = extractFloat64Exp( b );
2710     expDiff = aExp - bExp;
2711     aSig <<= 10;
2712     bSig <<= 10;
2713     if ( 0 < expDiff ) goto aExpBigger;
2714     if ( expDiff < 0 ) goto bExpBigger;
2715     if ( aExp == 0x7FF ) {
2716         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2717         float_raise( float_flag_invalid );
2718         return float64_default_nan;
2719     }
2720     if ( aExp == 0 ) {
2721         aExp = 1;
2722         bExp = 1;
2723     }
2724     if ( bSig < aSig ) goto aBigger;
2725     if ( aSig < bSig ) goto bBigger;
2726     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2727  bExpBigger:
2728     if ( bExp == 0x7FF ) {
2729         if ( bSig ) return propagateFloat64NaN( a, b );
2730         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2731     }
2732     if ( aExp == 0 ) {
2733         ++expDiff;
2734     }
2735     else {
2736         aSig |= LIT64( 0x4000000000000000 );
2737     }
2738     shift64RightJamming( aSig, - expDiff, &aSig );
2739     bSig |= LIT64( 0x4000000000000000 );
2740  bBigger:
2741     zSig = bSig - aSig;
2742     zExp = bExp;
2743     zSign ^= 1;
2744     goto normalizeRoundAndPack;
2745  aExpBigger:
2746     if ( aExp == 0x7FF ) {
2747         if ( aSig ) return propagateFloat64NaN( a, b );
2748         return a;
2749     }
2750     if ( bExp == 0 ) {
2751         --expDiff;
2752     }
2753     else {
2754         bSig |= LIT64( 0x4000000000000000 );
2755     }
2756     shift64RightJamming( bSig, expDiff, &bSig );
2757     aSig |= LIT64( 0x4000000000000000 );
2758  aBigger:
2759     zSig = aSig - bSig;
2760     zExp = aExp;
2761  normalizeRoundAndPack:
2762     --zExp;
2763     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2764 
2765 }
2766 
2767 /*
2768 -------------------------------------------------------------------------------
2769 Returns the result of adding the double-precision floating-point values `a'
2770 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2771 Binary Floating-Point Arithmetic.
2772 -------------------------------------------------------------------------------
2773 */
2774 float64 float64_add( float64 a, float64 b )
2775 {
2776     flag aSign, bSign;
2777 
2778     aSign = extractFloat64Sign( a );
2779     bSign = extractFloat64Sign( b );
2780     if ( aSign == bSign ) {
2781         return addFloat64Sigs( a, b, aSign );
2782     }
2783     else {
2784         return subFloat64Sigs( a, b, aSign );
2785     }
2786 
2787 }
2788 
2789 /*
2790 -------------------------------------------------------------------------------
2791 Returns the result of subtracting the double-precision floating-point values
2792 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2793 for Binary Floating-Point Arithmetic.
2794 -------------------------------------------------------------------------------
2795 */
2796 float64 float64_sub( float64 a, float64 b )
2797 {
2798     flag aSign, bSign;
2799 
2800     aSign = extractFloat64Sign( a );
2801     bSign = extractFloat64Sign( b );
2802     if ( aSign == bSign ) {
2803         return subFloat64Sigs( a, b, aSign );
2804     }
2805     else {
2806         return addFloat64Sigs( a, b, aSign );
2807     }
2808 
2809 }
2810 
2811 /*
2812 -------------------------------------------------------------------------------
2813 Returns the result of multiplying the double-precision floating-point values
2814 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2815 for Binary Floating-Point Arithmetic.
2816 -------------------------------------------------------------------------------
2817 */
2818 float64 float64_mul( float64 a, float64 b )
2819 {
2820     flag aSign, bSign, zSign;
2821     int16 aExp, bExp, zExp;
2822     bits64 aSig, bSig, zSig0, zSig1;
2823 
2824     aSig = extractFloat64Frac( a );
2825     aExp = extractFloat64Exp( a );
2826     aSign = extractFloat64Sign( a );
2827     bSig = extractFloat64Frac( b );
2828     bExp = extractFloat64Exp( b );
2829     bSign = extractFloat64Sign( b );
2830     zSign = aSign ^ bSign;
2831     if ( aExp == 0x7FF ) {
2832         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2833             return propagateFloat64NaN( a, b );
2834         }
2835         if ( ( bExp | bSig ) == 0 ) {
2836             float_raise( float_flag_invalid );
2837             return float64_default_nan;
2838         }
2839         return packFloat64( zSign, 0x7FF, 0 );
2840     }
2841     if ( bExp == 0x7FF ) {
2842         if ( bSig ) return propagateFloat64NaN( a, b );
2843         if ( ( aExp | aSig ) == 0 ) {
2844             float_raise( float_flag_invalid );
2845             return float64_default_nan;
2846         }
2847         return packFloat64( zSign, 0x7FF, 0 );
2848     }
2849     if ( aExp == 0 ) {
2850         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2851         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2852     }
2853     if ( bExp == 0 ) {
2854         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2855         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2856     }
2857     zExp = aExp + bExp - 0x3FF;
2858     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2859     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2860     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2861     zSig0 |= ( zSig1 != 0 );
2862     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2863         zSig0 <<= 1;
2864         --zExp;
2865     }
2866     return roundAndPackFloat64( zSign, zExp, zSig0 );
2867 
2868 }
2869 
2870 /*
2871 -------------------------------------------------------------------------------
2872 Returns the result of dividing the double-precision floating-point value `a'
2873 by the corresponding value `b'.  The operation is performed according to
2874 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2875 -------------------------------------------------------------------------------
2876 */
2877 float64 float64_div( float64 a, float64 b )
2878 {
2879     flag aSign, bSign, zSign;
2880     int16 aExp, bExp, zExp;
2881     bits64 aSig, bSig, zSig;
2882     bits64 rem0, rem1;
2883     bits64 term0, term1;
2884 
2885     aSig = extractFloat64Frac( a );
2886     aExp = extractFloat64Exp( a );
2887     aSign = extractFloat64Sign( a );
2888     bSig = extractFloat64Frac( b );
2889     bExp = extractFloat64Exp( b );
2890     bSign = extractFloat64Sign( b );
2891     zSign = aSign ^ bSign;
2892     if ( aExp == 0x7FF ) {
2893         if ( aSig ) return propagateFloat64NaN( a, b );
2894         if ( bExp == 0x7FF ) {
2895             if ( bSig ) return propagateFloat64NaN( a, b );
2896             float_raise( float_flag_invalid );
2897             return float64_default_nan;
2898         }
2899         return packFloat64( zSign, 0x7FF, 0 );
2900     }
2901     if ( bExp == 0x7FF ) {
2902         if ( bSig ) return propagateFloat64NaN( a, b );
2903         return packFloat64( zSign, 0, 0 );
2904     }
2905     if ( bExp == 0 ) {
2906         if ( bSig == 0 ) {
2907             if ( ( aExp | aSig ) == 0 ) {
2908                 float_raise( float_flag_invalid );
2909                 return float64_default_nan;
2910             }
2911             float_raise( float_flag_divbyzero );
2912             return packFloat64( zSign, 0x7FF, 0 );
2913         }
2914         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2915     }
2916     if ( aExp == 0 ) {
2917         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2918         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2919     }
2920     zExp = aExp - bExp + 0x3FD;
2921     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2922     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2923     if ( bSig <= ( aSig + aSig ) ) {
2924         aSig >>= 1;
2925         ++zExp;
2926     }
2927     zSig = estimateDiv128To64( aSig, 0, bSig );
2928     if ( ( zSig & 0x1FF ) <= 2 ) {
2929         mul64To128( bSig, zSig, &term0, &term1 );
2930         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2931         while ( (sbits64) rem0 < 0 ) {
2932             --zSig;
2933             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2934         }
2935         zSig |= ( rem1 != 0 );
2936     }
2937     return roundAndPackFloat64( zSign, zExp, zSig );
2938 
2939 }
2940 
2941 #ifndef SOFTFLOAT_FOR_GCC
2942 /*
2943 -------------------------------------------------------------------------------
2944 Returns the remainder of the double-precision floating-point value `a'
2945 with respect to the corresponding value `b'.  The operation is performed
2946 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2947 -------------------------------------------------------------------------------
2948 */
2949 float64 float64_rem( float64 a, float64 b )
2950 {
2951     flag aSign, bSign, zSign;
2952     int16 aExp, bExp, expDiff;
2953     bits64 aSig, bSig;
2954     bits64 q, alternateASig;
2955     sbits64 sigMean;
2956 
2957     aSig = extractFloat64Frac( a );
2958     aExp = extractFloat64Exp( a );
2959     aSign = extractFloat64Sign( a );
2960     bSig = extractFloat64Frac( b );
2961     bExp = extractFloat64Exp( b );
2962     bSign = extractFloat64Sign( b );
2963     if ( aExp == 0x7FF ) {
2964         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2965             return propagateFloat64NaN( a, b );
2966         }
2967         float_raise( float_flag_invalid );
2968         return float64_default_nan;
2969     }
2970     if ( bExp == 0x7FF ) {
2971         if ( bSig ) return propagateFloat64NaN( a, b );
2972         return a;
2973     }
2974     if ( bExp == 0 ) {
2975         if ( bSig == 0 ) {
2976             float_raise( float_flag_invalid );
2977             return float64_default_nan;
2978         }
2979         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2980     }
2981     if ( aExp == 0 ) {
2982         if ( aSig == 0 ) return a;
2983         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2984     }
2985     expDiff = aExp - bExp;
2986     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2987     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2988     if ( expDiff < 0 ) {
2989         if ( expDiff < -1 ) return a;
2990         aSig >>= 1;
2991     }
2992     q = ( bSig <= aSig );
2993     if ( q ) aSig -= bSig;
2994     expDiff -= 64;
2995     while ( 0 < expDiff ) {
2996         q = estimateDiv128To64( aSig, 0, bSig );
2997         q = ( 2 < q ) ? q - 2 : 0;
2998         aSig = - ( ( bSig>>2 ) * q );
2999         expDiff -= 62;
3000     }
3001     expDiff += 64;
3002     if ( 0 < expDiff ) {
3003         q = estimateDiv128To64( aSig, 0, bSig );
3004         q = ( 2 < q ) ? q - 2 : 0;
3005         q >>= 64 - expDiff;
3006         bSig >>= 2;
3007         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3008     }
3009     else {
3010         aSig >>= 2;
3011         bSig >>= 2;
3012     }
3013     do {
3014         alternateASig = aSig;
3015         ++q;
3016         aSig -= bSig;
3017     } while ( 0 <= (sbits64) aSig );
3018     sigMean = aSig + alternateASig;
3019     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3020         aSig = alternateASig;
3021     }
3022     zSign = ( (sbits64) aSig < 0 );
3023     if ( zSign ) aSig = - aSig;
3024     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3025 
3026 }
3027 
3028 /*
3029 -------------------------------------------------------------------------------
3030 Returns the square root of the double-precision floating-point value `a'.
3031 The operation is performed according to the IEC/IEEE Standard for Binary
3032 Floating-Point Arithmetic.
3033 -------------------------------------------------------------------------------
3034 */
3035 float64 float64_sqrt( float64 a )
3036 {
3037     flag aSign;
3038     int16 aExp, zExp;
3039     bits64 aSig, zSig, doubleZSig;
3040     bits64 rem0, rem1, term0, term1;
3041 
3042     aSig = extractFloat64Frac( a );
3043     aExp = extractFloat64Exp( a );
3044     aSign = extractFloat64Sign( a );
3045     if ( aExp == 0x7FF ) {
3046         if ( aSig ) return propagateFloat64NaN( a, a );
3047         if ( ! aSign ) return a;
3048         float_raise( float_flag_invalid );
3049         return float64_default_nan;
3050     }
3051     if ( aSign ) {
3052         if ( ( aExp | aSig ) == 0 ) return a;
3053         float_raise( float_flag_invalid );
3054         return float64_default_nan;
3055     }
3056     if ( aExp == 0 ) {
3057         if ( aSig == 0 ) return 0;
3058         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3059     }
3060     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3061     aSig |= LIT64( 0x0010000000000000 );
3062     zSig = estimateSqrt32( aExp, aSig>>21 );
3063     aSig <<= 9 - ( aExp & 1 );
3064     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3065     if ( ( zSig & 0x1FF ) <= 5 ) {
3066         doubleZSig = zSig<<1;
3067         mul64To128( zSig, zSig, &term0, &term1 );
3068         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3069         while ( (sbits64) rem0 < 0 ) {
3070             --zSig;
3071             doubleZSig -= 2;
3072             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3073         }
3074         zSig |= ( ( rem0 | rem1 ) != 0 );
3075     }
3076     return roundAndPackFloat64( 0, zExp, zSig );
3077 
3078 }
3079 #endif
3080 
3081 /*
3082 -------------------------------------------------------------------------------
3083 Returns 1 if the double-precision floating-point value `a' is equal to the
3084 corresponding value `b', and 0 otherwise.  The comparison is performed
3085 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3086 -------------------------------------------------------------------------------
3087 */
3088 flag float64_eq( float64 a, float64 b )
3089 {
3090 
3091     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3092          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3093        ) {
3094         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3095             float_raise( float_flag_invalid );
3096         }
3097         return 0;
3098     }
3099     return ( a == b ) ||
3100 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3101 
3102 }
3103 
3104 /*
3105 -------------------------------------------------------------------------------
3106 Returns 1 if the double-precision floating-point value `a' is less than or
3107 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3108 performed according to the IEC/IEEE Standard for Binary Floating-Point
3109 Arithmetic.
3110 -------------------------------------------------------------------------------
3111 */
3112 flag float64_le( float64 a, float64 b )
3113 {
3114     flag aSign, bSign;
3115 
3116     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3117          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3118        ) {
3119         float_raise( float_flag_invalid );
3120         return 0;
3121     }
3122     aSign = extractFloat64Sign( a );
3123     bSign = extractFloat64Sign( b );
3124     if ( aSign != bSign )
3125 	return aSign ||
3126 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3127 	      0 );
3128     return ( a == b ) ||
3129 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3130 
3131 }
3132 
3133 /*
3134 -------------------------------------------------------------------------------
3135 Returns 1 if the double-precision floating-point value `a' is less than
3136 the corresponding value `b', and 0 otherwise.  The comparison is performed
3137 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3138 -------------------------------------------------------------------------------
3139 */
3140 flag float64_lt( float64 a, float64 b )
3141 {
3142     flag aSign, bSign;
3143 
3144     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3145          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3146        ) {
3147         float_raise( float_flag_invalid );
3148         return 0;
3149     }
3150     aSign = extractFloat64Sign( a );
3151     bSign = extractFloat64Sign( b );
3152     if ( aSign != bSign )
3153 	return aSign &&
3154 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3155 	      0 );
3156     return ( a != b ) &&
3157 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3158 
3159 }
3160 
3161 #ifndef SOFTFLOAT_FOR_GCC
3162 /*
3163 -------------------------------------------------------------------------------
3164 Returns 1 if the double-precision floating-point value `a' is equal to the
3165 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3166 if either operand is a NaN.  Otherwise, the comparison is performed
3167 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3168 -------------------------------------------------------------------------------
3169 */
3170 flag float64_eq_signaling( float64 a, float64 b )
3171 {
3172 
3173     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3174          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3175        ) {
3176         float_raise( float_flag_invalid );
3177         return 0;
3178     }
3179     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3180 
3181 }
3182 
3183 /*
3184 -------------------------------------------------------------------------------
3185 Returns 1 if the double-precision floating-point value `a' is less than or
3186 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3187 cause an exception.  Otherwise, the comparison is performed according to the
3188 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3189 -------------------------------------------------------------------------------
3190 */
3191 flag float64_le_quiet( float64 a, float64 b )
3192 {
3193     flag aSign, bSign;
3194 
3195     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3196          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3197        ) {
3198         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3199             float_raise( float_flag_invalid );
3200         }
3201         return 0;
3202     }
3203     aSign = extractFloat64Sign( a );
3204     bSign = extractFloat64Sign( b );
3205     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3206     return ( a == b ) || ( aSign ^ ( a < b ) );
3207 
3208 }
3209 
3210 /*
3211 -------------------------------------------------------------------------------
3212 Returns 1 if the double-precision floating-point value `a' is less than
3213 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3214 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3215 Standard for Binary Floating-Point Arithmetic.
3216 -------------------------------------------------------------------------------
3217 */
3218 flag float64_lt_quiet( float64 a, float64 b )
3219 {
3220     flag aSign, bSign;
3221 
3222     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3223          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3224        ) {
3225         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3226             float_raise( float_flag_invalid );
3227         }
3228         return 0;
3229     }
3230     aSign = extractFloat64Sign( a );
3231     bSign = extractFloat64Sign( b );
3232     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3233     return ( a != b ) && ( aSign ^ ( a < b ) );
3234 
3235 }
3236 #endif
3237 
3238 #ifdef FLOATX80
3239 
3240 /*
3241 -------------------------------------------------------------------------------
3242 Returns the result of converting the extended double-precision floating-
3243 point value `a' to the 32-bit two's complement integer format.  The
3244 conversion is performed according to the IEC/IEEE Standard for Binary
3245 Floating-Point Arithmetic---which means in particular that the conversion
3246 is rounded according to the current rounding mode.  If `a' is a NaN, the
3247 largest positive integer is returned.  Otherwise, if the conversion
3248 overflows, the largest integer with the same sign as `a' is returned.
3249 -------------------------------------------------------------------------------
3250 */
3251 int32 floatx80_to_int32( floatx80 a )
3252 {
3253     flag aSign;
3254     int32 aExp, shiftCount;
3255     bits64 aSig;
3256 
3257     aSig = extractFloatx80Frac( a );
3258     aExp = extractFloatx80Exp( a );
3259     aSign = extractFloatx80Sign( a );
3260     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3261     shiftCount = 0x4037 - aExp;
3262     if ( shiftCount <= 0 ) shiftCount = 1;
3263     shift64RightJamming( aSig, shiftCount, &aSig );
3264     return roundAndPackInt32( aSign, aSig );
3265 
3266 }
3267 
3268 /*
3269 -------------------------------------------------------------------------------
3270 Returns the result of converting the extended double-precision floating-
3271 point value `a' to the 32-bit two's complement integer format.  The
3272 conversion is performed according to the IEC/IEEE Standard for Binary
3273 Floating-Point Arithmetic, except that the conversion is always rounded
3274 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3275 Otherwise, if the conversion overflows, the largest integer with the same
3276 sign as `a' is returned.
3277 -------------------------------------------------------------------------------
3278 */
3279 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3280 {
3281     flag aSign;
3282     int32 aExp, shiftCount;
3283     bits64 aSig, savedASig;
3284     int32 z;
3285 
3286     aSig = extractFloatx80Frac( a );
3287     aExp = extractFloatx80Exp( a );
3288     aSign = extractFloatx80Sign( a );
3289     if ( 0x401E < aExp ) {
3290         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3291         goto invalid;
3292     }
3293     else if ( aExp < 0x3FFF ) {
3294         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3295         return 0;
3296     }
3297     shiftCount = 0x403E - aExp;
3298     savedASig = aSig;
3299     aSig >>= shiftCount;
3300     z = aSig;
3301     if ( aSign ) z = - z;
3302     if ( ( z < 0 ) ^ aSign ) {
3303  invalid:
3304         float_raise( float_flag_invalid );
3305         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3306     }
3307     if ( ( aSig<<shiftCount ) != savedASig ) {
3308         float_exception_flags |= float_flag_inexact;
3309     }
3310     return z;
3311 
3312 }
3313 
3314 /*
3315 -------------------------------------------------------------------------------
3316 Returns the result of converting the extended double-precision floating-
3317 point value `a' to the 64-bit two's complement integer format.  The
3318 conversion is performed according to the IEC/IEEE Standard for Binary
3319 Floating-Point Arithmetic---which means in particular that the conversion
3320 is rounded according to the current rounding mode.  If `a' is a NaN,
3321 the largest positive integer is returned.  Otherwise, if the conversion
3322 overflows, the largest integer with the same sign as `a' is returned.
3323 -------------------------------------------------------------------------------
3324 */
3325 int64 floatx80_to_int64( floatx80 a )
3326 {
3327     flag aSign;
3328     int32 aExp, shiftCount;
3329     bits64 aSig, aSigExtra;
3330 
3331     aSig = extractFloatx80Frac( a );
3332     aExp = extractFloatx80Exp( a );
3333     aSign = extractFloatx80Sign( a );
3334     shiftCount = 0x403E - aExp;
3335     if ( shiftCount <= 0 ) {
3336         if ( shiftCount ) {
3337             float_raise( float_flag_invalid );
3338             if (    ! aSign
3339                  || (    ( aExp == 0x7FFF )
3340                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3341                ) {
3342                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3343             }
3344             return (sbits64) LIT64( 0x8000000000000000 );
3345         }
3346         aSigExtra = 0;
3347     }
3348     else {
3349         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3350     }
3351     return roundAndPackInt64( aSign, aSig, aSigExtra );
3352 
3353 }
3354 
3355 /*
3356 -------------------------------------------------------------------------------
3357 Returns the result of converting the extended double-precision floating-
3358 point value `a' to the 64-bit two's complement integer format.  The
3359 conversion is performed according to the IEC/IEEE Standard for Binary
3360 Floating-Point Arithmetic, except that the conversion is always rounded
3361 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3362 Otherwise, if the conversion overflows, the largest integer with the same
3363 sign as `a' is returned.
3364 -------------------------------------------------------------------------------
3365 */
3366 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3367 {
3368     flag aSign;
3369     int32 aExp, shiftCount;
3370     bits64 aSig;
3371     int64 z;
3372 
3373     aSig = extractFloatx80Frac( a );
3374     aExp = extractFloatx80Exp( a );
3375     aSign = extractFloatx80Sign( a );
3376     shiftCount = aExp - 0x403E;
3377     if ( 0 <= shiftCount ) {
3378         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3379         if ( ( a.high != 0xC03E ) || aSig ) {
3380             float_raise( float_flag_invalid );
3381             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3382                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3383             }
3384         }
3385         return (sbits64) LIT64( 0x8000000000000000 );
3386     }
3387     else if ( aExp < 0x3FFF ) {
3388         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3389         return 0;
3390     }
3391     z = aSig>>( - shiftCount );
3392     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3393         float_exception_flags |= float_flag_inexact;
3394     }
3395     if ( aSign ) z = - z;
3396     return z;
3397 
3398 }
3399 
3400 /*
3401 -------------------------------------------------------------------------------
3402 Returns the result of converting the extended double-precision floating-
3403 point value `a' to the single-precision floating-point format.  The
3404 conversion is performed according to the IEC/IEEE Standard for Binary
3405 Floating-Point Arithmetic.
3406 -------------------------------------------------------------------------------
3407 */
3408 float32 floatx80_to_float32( floatx80 a )
3409 {
3410     flag aSign;
3411     int32 aExp;
3412     bits64 aSig;
3413 
3414     aSig = extractFloatx80Frac( a );
3415     aExp = extractFloatx80Exp( a );
3416     aSign = extractFloatx80Sign( a );
3417     if ( aExp == 0x7FFF ) {
3418         if ( (bits64) ( aSig<<1 ) ) {
3419             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3420         }
3421         return packFloat32( aSign, 0xFF, 0 );
3422     }
3423     shift64RightJamming( aSig, 33, &aSig );
3424     if ( aExp || aSig ) aExp -= 0x3F81;
3425     return roundAndPackFloat32( aSign, aExp, aSig );
3426 
3427 }
3428 
3429 /*
3430 -------------------------------------------------------------------------------
3431 Returns the result of converting the extended double-precision floating-
3432 point value `a' to the double-precision floating-point format.  The
3433 conversion is performed according to the IEC/IEEE Standard for Binary
3434 Floating-Point Arithmetic.
3435 -------------------------------------------------------------------------------
3436 */
3437 float64 floatx80_to_float64( floatx80 a )
3438 {
3439     flag aSign;
3440     int32 aExp;
3441     bits64 aSig, zSig;
3442 
3443     aSig = extractFloatx80Frac( a );
3444     aExp = extractFloatx80Exp( a );
3445     aSign = extractFloatx80Sign( a );
3446     if ( aExp == 0x7FFF ) {
3447         if ( (bits64) ( aSig<<1 ) ) {
3448             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3449         }
3450         return packFloat64( aSign, 0x7FF, 0 );
3451     }
3452     shift64RightJamming( aSig, 1, &zSig );
3453     if ( aExp || aSig ) aExp -= 0x3C01;
3454     return roundAndPackFloat64( aSign, aExp, zSig );
3455 
3456 }
3457 
3458 #ifdef FLOAT128
3459 
3460 /*
3461 -------------------------------------------------------------------------------
3462 Returns the result of converting the extended double-precision floating-
3463 point value `a' to the quadruple-precision floating-point format.  The
3464 conversion is performed according to the IEC/IEEE Standard for Binary
3465 Floating-Point Arithmetic.
3466 -------------------------------------------------------------------------------
3467 */
3468 float128 floatx80_to_float128( floatx80 a )
3469 {
3470     flag aSign;
3471     int16 aExp;
3472     bits64 aSig, zSig0, zSig1;
3473 
3474     aSig = extractFloatx80Frac( a );
3475     aExp = extractFloatx80Exp( a );
3476     aSign = extractFloatx80Sign( a );
3477     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3478         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3479     }
3480     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3481     return packFloat128( aSign, aExp, zSig0, zSig1 );
3482 
3483 }
3484 
3485 #endif
3486 
3487 /*
3488 -------------------------------------------------------------------------------
3489 Rounds the extended double-precision floating-point value `a' to an integer,
3490 and returns the result as an extended quadruple-precision floating-point
3491 value.  The operation is performed according to the IEC/IEEE Standard for
3492 Binary Floating-Point Arithmetic.
3493 -------------------------------------------------------------------------------
3494 */
3495 floatx80 floatx80_round_to_int( floatx80 a )
3496 {
3497     flag aSign;
3498     int32 aExp;
3499     bits64 lastBitMask, roundBitsMask;
3500     int8 roundingMode;
3501     floatx80 z;
3502 
3503     aExp = extractFloatx80Exp( a );
3504     if ( 0x403E <= aExp ) {
3505         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3506             return propagateFloatx80NaN( a, a );
3507         }
3508         return a;
3509     }
3510     if ( aExp < 0x3FFF ) {
3511         if (    ( aExp == 0 )
3512              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3513             return a;
3514         }
3515         float_exception_flags |= float_flag_inexact;
3516         aSign = extractFloatx80Sign( a );
3517         switch ( float_rounding_mode ) {
3518          case float_round_nearest_even:
3519             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3520                ) {
3521                 return
3522                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3523             }
3524             break;
3525 	 case float_round_to_zero:
3526 	    break;
3527          case float_round_down:
3528             return
3529                   aSign ?
3530                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3531                 : packFloatx80( 0, 0, 0 );
3532          case float_round_up:
3533             return
3534                   aSign ? packFloatx80( 1, 0, 0 )
3535                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3536         }
3537         return packFloatx80( aSign, 0, 0 );
3538     }
3539     lastBitMask = 1;
3540     lastBitMask <<= 0x403E - aExp;
3541     roundBitsMask = lastBitMask - 1;
3542     z = a;
3543     roundingMode = float_rounding_mode;
3544     if ( roundingMode == float_round_nearest_even ) {
3545         z.low += lastBitMask>>1;
3546         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3547     }
3548     else if ( roundingMode != float_round_to_zero ) {
3549         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3550             z.low += roundBitsMask;
3551         }
3552     }
3553     z.low &= ~ roundBitsMask;
3554     if ( z.low == 0 ) {
3555         ++z.high;
3556         z.low = LIT64( 0x8000000000000000 );
3557     }
3558     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3559     return z;
3560 
3561 }
3562 
3563 /*
3564 -------------------------------------------------------------------------------
3565 Returns the result of adding the absolute values of the extended double-
3566 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3567 negated before being returned.  `zSign' is ignored if the result is a NaN.
3568 The addition is performed according to the IEC/IEEE Standard for Binary
3569 Floating-Point Arithmetic.
3570 -------------------------------------------------------------------------------
3571 */
3572 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3573 {
3574     int32 aExp, bExp, zExp;
3575     bits64 aSig, bSig, zSig0, zSig1;
3576     int32 expDiff;
3577 
3578     aSig = extractFloatx80Frac( a );
3579     aExp = extractFloatx80Exp( a );
3580     bSig = extractFloatx80Frac( b );
3581     bExp = extractFloatx80Exp( b );
3582     expDiff = aExp - bExp;
3583     if ( 0 < expDiff ) {
3584         if ( aExp == 0x7FFF ) {
3585             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3586             return a;
3587         }
3588         if ( bExp == 0 ) --expDiff;
3589         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3590         zExp = aExp;
3591     }
3592     else if ( expDiff < 0 ) {
3593         if ( bExp == 0x7FFF ) {
3594             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3595             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3596         }
3597         if ( aExp == 0 ) ++expDiff;
3598         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3599         zExp = bExp;
3600     }
3601     else {
3602         if ( aExp == 0x7FFF ) {
3603             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3604                 return propagateFloatx80NaN( a, b );
3605             }
3606             return a;
3607         }
3608         zSig1 = 0;
3609         zSig0 = aSig + bSig;
3610         if ( aExp == 0 ) {
3611             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3612             goto roundAndPack;
3613         }
3614         zExp = aExp;
3615         goto shiftRight1;
3616     }
3617     zSig0 = aSig + bSig;
3618     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3619  shiftRight1:
3620     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3621     zSig0 |= LIT64( 0x8000000000000000 );
3622     ++zExp;
3623  roundAndPack:
3624     return
3625         roundAndPackFloatx80(
3626             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3627 
3628 }
3629 
3630 /*
3631 -------------------------------------------------------------------------------
3632 Returns the result of subtracting the absolute values of the extended
3633 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3634 difference is negated before being returned.  `zSign' is ignored if the
3635 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3636 Standard for Binary Floating-Point Arithmetic.
3637 -------------------------------------------------------------------------------
3638 */
3639 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3640 {
3641     int32 aExp, bExp, zExp;
3642     bits64 aSig, bSig, zSig0, zSig1;
3643     int32 expDiff;
3644     floatx80 z;
3645 
3646     aSig = extractFloatx80Frac( a );
3647     aExp = extractFloatx80Exp( a );
3648     bSig = extractFloatx80Frac( b );
3649     bExp = extractFloatx80Exp( b );
3650     expDiff = aExp - bExp;
3651     if ( 0 < expDiff ) goto aExpBigger;
3652     if ( expDiff < 0 ) goto bExpBigger;
3653     if ( aExp == 0x7FFF ) {
3654         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3655             return propagateFloatx80NaN( a, b );
3656         }
3657         float_raise( float_flag_invalid );
3658         z.low = floatx80_default_nan_low;
3659         z.high = floatx80_default_nan_high;
3660         return z;
3661     }
3662     if ( aExp == 0 ) {
3663         aExp = 1;
3664         bExp = 1;
3665     }
3666     zSig1 = 0;
3667     if ( bSig < aSig ) goto aBigger;
3668     if ( aSig < bSig ) goto bBigger;
3669     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3670  bExpBigger:
3671     if ( bExp == 0x7FFF ) {
3672         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3673         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3674     }
3675     if ( aExp == 0 ) ++expDiff;
3676     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3677  bBigger:
3678     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3679     zExp = bExp;
3680     zSign ^= 1;
3681     goto normalizeRoundAndPack;
3682  aExpBigger:
3683     if ( aExp == 0x7FFF ) {
3684         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3685         return a;
3686     }
3687     if ( bExp == 0 ) --expDiff;
3688     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3689  aBigger:
3690     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3691     zExp = aExp;
3692  normalizeRoundAndPack:
3693     return
3694         normalizeRoundAndPackFloatx80(
3695             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3696 
3697 }
3698 
3699 /*
3700 -------------------------------------------------------------------------------
3701 Returns the result of adding the extended double-precision floating-point
3702 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3703 Standard for Binary Floating-Point Arithmetic.
3704 -------------------------------------------------------------------------------
3705 */
3706 floatx80 floatx80_add( floatx80 a, floatx80 b )
3707 {
3708     flag aSign, bSign;
3709 
3710     aSign = extractFloatx80Sign( a );
3711     bSign = extractFloatx80Sign( b );
3712     if ( aSign == bSign ) {
3713         return addFloatx80Sigs( a, b, aSign );
3714     }
3715     else {
3716         return subFloatx80Sigs( a, b, aSign );
3717     }
3718 
3719 }
3720 
3721 /*
3722 -------------------------------------------------------------------------------
3723 Returns the result of subtracting the extended double-precision floating-
3724 point values `a' and `b'.  The operation is performed according to the
3725 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3726 -------------------------------------------------------------------------------
3727 */
3728 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3729 {
3730     flag aSign, bSign;
3731 
3732     aSign = extractFloatx80Sign( a );
3733     bSign = extractFloatx80Sign( b );
3734     if ( aSign == bSign ) {
3735         return subFloatx80Sigs( a, b, aSign );
3736     }
3737     else {
3738         return addFloatx80Sigs( a, b, aSign );
3739     }
3740 
3741 }
3742 
3743 /*
3744 -------------------------------------------------------------------------------
3745 Returns the result of multiplying the extended double-precision floating-
3746 point values `a' and `b'.  The operation is performed according to the
3747 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3748 -------------------------------------------------------------------------------
3749 */
3750 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3751 {
3752     flag aSign, bSign, zSign;
3753     int32 aExp, bExp, zExp;
3754     bits64 aSig, bSig, zSig0, zSig1;
3755     floatx80 z;
3756 
3757     aSig = extractFloatx80Frac( a );
3758     aExp = extractFloatx80Exp( a );
3759     aSign = extractFloatx80Sign( a );
3760     bSig = extractFloatx80Frac( b );
3761     bExp = extractFloatx80Exp( b );
3762     bSign = extractFloatx80Sign( b );
3763     zSign = aSign ^ bSign;
3764     if ( aExp == 0x7FFF ) {
3765         if (    (bits64) ( aSig<<1 )
3766              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3767             return propagateFloatx80NaN( a, b );
3768         }
3769         if ( ( bExp | bSig ) == 0 ) goto invalid;
3770         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3771     }
3772     if ( bExp == 0x7FFF ) {
3773         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3774         if ( ( aExp | aSig ) == 0 ) {
3775  invalid:
3776             float_raise( float_flag_invalid );
3777             z.low = floatx80_default_nan_low;
3778             z.high = floatx80_default_nan_high;
3779             return z;
3780         }
3781         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3782     }
3783     if ( aExp == 0 ) {
3784         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3785         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3786     }
3787     if ( bExp == 0 ) {
3788         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3789         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3790     }
3791     zExp = aExp + bExp - 0x3FFE;
3792     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3793     if ( 0 < (sbits64) zSig0 ) {
3794         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3795         --zExp;
3796     }
3797     return
3798         roundAndPackFloatx80(
3799             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3800 
3801 }
3802 
3803 /*
3804 -------------------------------------------------------------------------------
3805 Returns the result of dividing the extended double-precision floating-point
3806 value `a' by the corresponding value `b'.  The operation is performed
3807 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3808 -------------------------------------------------------------------------------
3809 */
3810 floatx80 floatx80_div( floatx80 a, floatx80 b )
3811 {
3812     flag aSign, bSign, zSign;
3813     int32 aExp, bExp, zExp;
3814     bits64 aSig, bSig, zSig0, zSig1;
3815     bits64 rem0, rem1, rem2, term0, term1, term2;
3816     floatx80 z;
3817 
3818     aSig = extractFloatx80Frac( a );
3819     aExp = extractFloatx80Exp( a );
3820     aSign = extractFloatx80Sign( a );
3821     bSig = extractFloatx80Frac( b );
3822     bExp = extractFloatx80Exp( b );
3823     bSign = extractFloatx80Sign( b );
3824     zSign = aSign ^ bSign;
3825     if ( aExp == 0x7FFF ) {
3826         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3827         if ( bExp == 0x7FFF ) {
3828             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3829             goto invalid;
3830         }
3831         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3832     }
3833     if ( bExp == 0x7FFF ) {
3834         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3835         return packFloatx80( zSign, 0, 0 );
3836     }
3837     if ( bExp == 0 ) {
3838         if ( bSig == 0 ) {
3839             if ( ( aExp | aSig ) == 0 ) {
3840  invalid:
3841                 float_raise( float_flag_invalid );
3842                 z.low = floatx80_default_nan_low;
3843                 z.high = floatx80_default_nan_high;
3844                 return z;
3845             }
3846             float_raise( float_flag_divbyzero );
3847             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3848         }
3849         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3850     }
3851     if ( aExp == 0 ) {
3852         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3853         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3854     }
3855     zExp = aExp - bExp + 0x3FFE;
3856     rem1 = 0;
3857     if ( bSig <= aSig ) {
3858         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3859         ++zExp;
3860     }
3861     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3862     mul64To128( bSig, zSig0, &term0, &term1 );
3863     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3864     while ( (sbits64) rem0 < 0 ) {
3865         --zSig0;
3866         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3867     }
3868     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3869     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3870         mul64To128( bSig, zSig1, &term1, &term2 );
3871         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3872         while ( (sbits64) rem1 < 0 ) {
3873             --zSig1;
3874             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3875         }
3876         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3877     }
3878     return
3879         roundAndPackFloatx80(
3880             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3881 
3882 }
3883 
3884 /*
3885 -------------------------------------------------------------------------------
3886 Returns the remainder of the extended double-precision floating-point value
3887 `a' with respect to the corresponding value `b'.  The operation is performed
3888 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3889 -------------------------------------------------------------------------------
3890 */
3891 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3892 {
3893     flag aSign, bSign, zSign;
3894     int32 aExp, bExp, expDiff;
3895     bits64 aSig0, aSig1, bSig;
3896     bits64 q, term0, term1, alternateASig0, alternateASig1;
3897     floatx80 z;
3898 
3899     aSig0 = extractFloatx80Frac( a );
3900     aExp = extractFloatx80Exp( a );
3901     aSign = extractFloatx80Sign( a );
3902     bSig = extractFloatx80Frac( b );
3903     bExp = extractFloatx80Exp( b );
3904     bSign = extractFloatx80Sign( b );
3905     if ( aExp == 0x7FFF ) {
3906         if (    (bits64) ( aSig0<<1 )
3907              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3908             return propagateFloatx80NaN( a, b );
3909         }
3910         goto invalid;
3911     }
3912     if ( bExp == 0x7FFF ) {
3913         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3914         return a;
3915     }
3916     if ( bExp == 0 ) {
3917         if ( bSig == 0 ) {
3918  invalid:
3919             float_raise( float_flag_invalid );
3920             z.low = floatx80_default_nan_low;
3921             z.high = floatx80_default_nan_high;
3922             return z;
3923         }
3924         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3925     }
3926     if ( aExp == 0 ) {
3927         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3928         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3929     }
3930     bSig |= LIT64( 0x8000000000000000 );
3931     zSign = aSign;
3932     expDiff = aExp - bExp;
3933     aSig1 = 0;
3934     if ( expDiff < 0 ) {
3935         if ( expDiff < -1 ) return a;
3936         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3937         expDiff = 0;
3938     }
3939     q = ( bSig <= aSig0 );
3940     if ( q ) aSig0 -= bSig;
3941     expDiff -= 64;
3942     while ( 0 < expDiff ) {
3943         q = estimateDiv128To64( aSig0, aSig1, bSig );
3944         q = ( 2 < q ) ? q - 2 : 0;
3945         mul64To128( bSig, q, &term0, &term1 );
3946         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3947         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3948         expDiff -= 62;
3949     }
3950     expDiff += 64;
3951     if ( 0 < expDiff ) {
3952         q = estimateDiv128To64( aSig0, aSig1, bSig );
3953         q = ( 2 < q ) ? q - 2 : 0;
3954         q >>= 64 - expDiff;
3955         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3956         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3957         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3958         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3959             ++q;
3960             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3961         }
3962     }
3963     else {
3964         term1 = 0;
3965         term0 = bSig;
3966     }
3967     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3968     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3969          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3970               && ( q & 1 ) )
3971        ) {
3972         aSig0 = alternateASig0;
3973         aSig1 = alternateASig1;
3974         zSign = ! zSign;
3975     }
3976     return
3977         normalizeRoundAndPackFloatx80(
3978             80, zSign, bExp + expDiff, aSig0, aSig1 );
3979 
3980 }
3981 
3982 /*
3983 -------------------------------------------------------------------------------
3984 Returns the square root of the extended double-precision floating-point
3985 value `a'.  The operation is performed according to the IEC/IEEE Standard
3986 for Binary Floating-Point Arithmetic.
3987 -------------------------------------------------------------------------------
3988 */
3989 floatx80 floatx80_sqrt( floatx80 a )
3990 {
3991     flag aSign;
3992     int32 aExp, zExp;
3993     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3994     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3995     floatx80 z;
3996 
3997     aSig0 = extractFloatx80Frac( a );
3998     aExp = extractFloatx80Exp( a );
3999     aSign = extractFloatx80Sign( a );
4000     if ( aExp == 0x7FFF ) {
4001         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4002         if ( ! aSign ) return a;
4003         goto invalid;
4004     }
4005     if ( aSign ) {
4006         if ( ( aExp | aSig0 ) == 0 ) return a;
4007  invalid:
4008         float_raise( float_flag_invalid );
4009         z.low = floatx80_default_nan_low;
4010         z.high = floatx80_default_nan_high;
4011         return z;
4012     }
4013     if ( aExp == 0 ) {
4014         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4015         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4016     }
4017     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4018     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4019     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4020     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4021     doubleZSig0 = zSig0<<1;
4022     mul64To128( zSig0, zSig0, &term0, &term1 );
4023     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4024     while ( (sbits64) rem0 < 0 ) {
4025         --zSig0;
4026         doubleZSig0 -= 2;
4027         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4028     }
4029     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4030     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4031         if ( zSig1 == 0 ) zSig1 = 1;
4032         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4033         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4034         mul64To128( zSig1, zSig1, &term2, &term3 );
4035         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4036         while ( (sbits64) rem1 < 0 ) {
4037             --zSig1;
4038             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4039             term3 |= 1;
4040             term2 |= doubleZSig0;
4041             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4042         }
4043         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4044     }
4045     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4046     zSig0 |= doubleZSig0;
4047     return
4048         roundAndPackFloatx80(
4049             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4050 
4051 }
4052 
4053 /*
4054 -------------------------------------------------------------------------------
4055 Returns 1 if the extended double-precision floating-point value `a' is
4056 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4057 performed according to the IEC/IEEE Standard for Binary Floating-Point
4058 Arithmetic.
4059 -------------------------------------------------------------------------------
4060 */
4061 flag floatx80_eq( floatx80 a, floatx80 b )
4062 {
4063 
4064     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4065               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4066          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4067               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4068        ) {
4069         if (    floatx80_is_signaling_nan( a )
4070              || floatx80_is_signaling_nan( b ) ) {
4071             float_raise( float_flag_invalid );
4072         }
4073         return 0;
4074     }
4075     return
4076            ( a.low == b.low )
4077         && (    ( a.high == b.high )
4078              || (    ( a.low == 0 )
4079                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4080            );
4081 
4082 }
4083 
4084 /*
4085 -------------------------------------------------------------------------------
4086 Returns 1 if the extended double-precision floating-point value `a' is
4087 less than or equal to the corresponding value `b', and 0 otherwise.  The
4088 comparison is performed according to the IEC/IEEE Standard for Binary
4089 Floating-Point Arithmetic.
4090 -------------------------------------------------------------------------------
4091 */
4092 flag floatx80_le( floatx80 a, floatx80 b )
4093 {
4094     flag aSign, bSign;
4095 
4096     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4097               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4098          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4099               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4100        ) {
4101         float_raise( float_flag_invalid );
4102         return 0;
4103     }
4104     aSign = extractFloatx80Sign( a );
4105     bSign = extractFloatx80Sign( b );
4106     if ( aSign != bSign ) {
4107         return
4108                aSign
4109             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4110                  == 0 );
4111     }
4112     return
4113           aSign ? le128( b.high, b.low, a.high, a.low )
4114         : le128( a.high, a.low, b.high, b.low );
4115 
4116 }
4117 
4118 /*
4119 -------------------------------------------------------------------------------
4120 Returns 1 if the extended double-precision floating-point value `a' is
4121 less than the corresponding value `b', and 0 otherwise.  The comparison
4122 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4123 Arithmetic.
4124 -------------------------------------------------------------------------------
4125 */
4126 flag floatx80_lt( floatx80 a, floatx80 b )
4127 {
4128     flag aSign, bSign;
4129 
4130     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4131               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4132          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4133               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4134        ) {
4135         float_raise( float_flag_invalid );
4136         return 0;
4137     }
4138     aSign = extractFloatx80Sign( a );
4139     bSign = extractFloatx80Sign( b );
4140     if ( aSign != bSign ) {
4141         return
4142                aSign
4143             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4144                  != 0 );
4145     }
4146     return
4147           aSign ? lt128( b.high, b.low, a.high, a.low )
4148         : lt128( a.high, a.low, b.high, b.low );
4149 
4150 }
4151 
4152 /*
4153 -------------------------------------------------------------------------------
4154 Returns 1 if the extended double-precision floating-point value `a' is equal
4155 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4156 raised if either operand is a NaN.  Otherwise, the comparison is performed
4157 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4158 -------------------------------------------------------------------------------
4159 */
4160 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4161 {
4162 
4163     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4164               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4165          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4166               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4167        ) {
4168         float_raise( float_flag_invalid );
4169         return 0;
4170     }
4171     return
4172            ( a.low == b.low )
4173         && (    ( a.high == b.high )
4174              || (    ( a.low == 0 )
4175                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4176            );
4177 
4178 }
4179 
4180 /*
4181 -------------------------------------------------------------------------------
4182 Returns 1 if the extended double-precision floating-point value `a' is less
4183 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4184 do not cause an exception.  Otherwise, the comparison is performed according
4185 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4186 -------------------------------------------------------------------------------
4187 */
4188 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4189 {
4190     flag aSign, bSign;
4191 
4192     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4193               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4194          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4195               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4196        ) {
4197         if (    floatx80_is_signaling_nan( a )
4198              || floatx80_is_signaling_nan( b ) ) {
4199             float_raise( float_flag_invalid );
4200         }
4201         return 0;
4202     }
4203     aSign = extractFloatx80Sign( a );
4204     bSign = extractFloatx80Sign( b );
4205     if ( aSign != bSign ) {
4206         return
4207                aSign
4208             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4209                  == 0 );
4210     }
4211     return
4212           aSign ? le128( b.high, b.low, a.high, a.low )
4213         : le128( a.high, a.low, b.high, b.low );
4214 
4215 }
4216 
4217 /*
4218 -------------------------------------------------------------------------------
4219 Returns 1 if the extended double-precision floating-point value `a' is less
4220 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4221 an exception.  Otherwise, the comparison is performed according to the
4222 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4223 -------------------------------------------------------------------------------
4224 */
4225 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4226 {
4227     flag aSign, bSign;
4228 
4229     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4230               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4231          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4232               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4233        ) {
4234         if (    floatx80_is_signaling_nan( a )
4235              || floatx80_is_signaling_nan( b ) ) {
4236             float_raise( float_flag_invalid );
4237         }
4238         return 0;
4239     }
4240     aSign = extractFloatx80Sign( a );
4241     bSign = extractFloatx80Sign( b );
4242     if ( aSign != bSign ) {
4243         return
4244                aSign
4245             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4246                  != 0 );
4247     }
4248     return
4249           aSign ? lt128( b.high, b.low, a.high, a.low )
4250         : lt128( a.high, a.low, b.high, b.low );
4251 
4252 }
4253 
4254 #endif
4255 
4256 #ifdef FLOAT128
4257 
4258 /*
4259 -------------------------------------------------------------------------------
4260 Returns the result of converting the quadruple-precision floating-point
4261 value `a' to the 32-bit two's complement integer format.  The conversion
4262 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4263 Arithmetic---which means in particular that the conversion is rounded
4264 according to the current rounding mode.  If `a' is a NaN, the largest
4265 positive integer is returned.  Otherwise, if the conversion overflows, the
4266 largest integer with the same sign as `a' is returned.
4267 -------------------------------------------------------------------------------
4268 */
4269 int32 float128_to_int32( float128 a )
4270 {
4271     flag aSign;
4272     int32 aExp, shiftCount;
4273     bits64 aSig0, aSig1;
4274 
4275     aSig1 = extractFloat128Frac1( a );
4276     aSig0 = extractFloat128Frac0( a );
4277     aExp = extractFloat128Exp( a );
4278     aSign = extractFloat128Sign( a );
4279     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4280     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4281     aSig0 |= ( aSig1 != 0 );
4282     shiftCount = 0x4028 - aExp;
4283     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4284     return roundAndPackInt32( aSign, aSig0 );
4285 
4286 }
4287 
4288 /*
4289 -------------------------------------------------------------------------------
4290 Returns the result of converting the quadruple-precision floating-point
4291 value `a' to the 32-bit two's complement integer format.  The conversion
4292 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4293 Arithmetic, except that the conversion is always rounded toward zero.  If
4294 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4295 conversion overflows, the largest integer with the same sign as `a' is
4296 returned.
4297 -------------------------------------------------------------------------------
4298 */
4299 int32 float128_to_int32_round_to_zero( float128 a )
4300 {
4301     flag aSign;
4302     int32 aExp, shiftCount;
4303     bits64 aSig0, aSig1, savedASig;
4304     int32 z;
4305 
4306     aSig1 = extractFloat128Frac1( a );
4307     aSig0 = extractFloat128Frac0( a );
4308     aExp = extractFloat128Exp( a );
4309     aSign = extractFloat128Sign( a );
4310     aSig0 |= ( aSig1 != 0 );
4311     if ( 0x401E < aExp ) {
4312         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4313         goto invalid;
4314     }
4315     else if ( aExp < 0x3FFF ) {
4316         if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4317         return 0;
4318     }
4319     aSig0 |= LIT64( 0x0001000000000000 );
4320     shiftCount = 0x402F - aExp;
4321     savedASig = aSig0;
4322     aSig0 >>= shiftCount;
4323     z = aSig0;
4324     if ( aSign ) z = - z;
4325     if ( ( z < 0 ) ^ aSign ) {
4326  invalid:
4327         float_raise( float_flag_invalid );
4328         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4329     }
4330     if ( ( aSig0<<shiftCount ) != savedASig ) {
4331         float_exception_flags |= float_flag_inexact;
4332     }
4333     return z;
4334 
4335 }
4336 
4337 /*
4338 -------------------------------------------------------------------------------
4339 Returns the result of converting the quadruple-precision floating-point
4340 value `a' to the 64-bit two's complement integer format.  The conversion
4341 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4342 Arithmetic---which means in particular that the conversion is rounded
4343 according to the current rounding mode.  If `a' is a NaN, the largest
4344 positive integer is returned.  Otherwise, if the conversion overflows, the
4345 largest integer with the same sign as `a' is returned.
4346 -------------------------------------------------------------------------------
4347 */
4348 int64 float128_to_int64( float128 a )
4349 {
4350     flag aSign;
4351     int32 aExp, shiftCount;
4352     bits64 aSig0, aSig1;
4353 
4354     aSig1 = extractFloat128Frac1( a );
4355     aSig0 = extractFloat128Frac0( a );
4356     aExp = extractFloat128Exp( a );
4357     aSign = extractFloat128Sign( a );
4358     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4359     shiftCount = 0x402F - aExp;
4360     if ( shiftCount <= 0 ) {
4361         if ( 0x403E < aExp ) {
4362             float_raise( float_flag_invalid );
4363             if (    ! aSign
4364                  || (    ( aExp == 0x7FFF )
4365                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4366                     )
4367                ) {
4368                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4369             }
4370             return (sbits64) LIT64( 0x8000000000000000 );
4371         }
4372         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4373     }
4374     else {
4375         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4376     }
4377     return roundAndPackInt64( aSign, aSig0, aSig1 );
4378 
4379 }
4380 
4381 /*
4382 -------------------------------------------------------------------------------
4383 Returns the result of converting the quadruple-precision floating-point
4384 value `a' to the 64-bit two's complement integer format.  The conversion
4385 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4386 Arithmetic, except that the conversion is always rounded toward zero.
4387 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4388 the conversion overflows, the largest integer with the same sign as `a' is
4389 returned.
4390 -------------------------------------------------------------------------------
4391 */
4392 int64 float128_to_int64_round_to_zero( float128 a )
4393 {
4394     flag aSign;
4395     int32 aExp, shiftCount;
4396     bits64 aSig0, aSig1;
4397     int64 z;
4398 
4399     aSig1 = extractFloat128Frac1( a );
4400     aSig0 = extractFloat128Frac0( a );
4401     aExp = extractFloat128Exp( a );
4402     aSign = extractFloat128Sign( a );
4403     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4404     shiftCount = aExp - 0x402F;
4405     if ( 0 < shiftCount ) {
4406         if ( 0x403E <= aExp ) {
4407             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4408             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4409                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4410                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4411             }
4412             else {
4413                 float_raise( float_flag_invalid );
4414                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4415                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4416                 }
4417             }
4418             return (sbits64) LIT64( 0x8000000000000000 );
4419         }
4420         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4421         if ( (bits64) ( aSig1<<shiftCount ) ) {
4422             float_exception_flags |= float_flag_inexact;
4423         }
4424     }
4425     else {
4426         if ( aExp < 0x3FFF ) {
4427             if ( aExp | aSig0 | aSig1 ) {
4428                 float_exception_flags |= float_flag_inexact;
4429             }
4430             return 0;
4431         }
4432         z = aSig0>>( - shiftCount );
4433         if (    aSig1
4434              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4435             float_exception_flags |= float_flag_inexact;
4436         }
4437     }
4438     if ( aSign ) z = - z;
4439     return z;
4440 
4441 }
4442 
4443 /*
4444 -------------------------------------------------------------------------------
4445 Returns the result of converting the quadruple-precision floating-point
4446 value `a' to the single-precision floating-point format.  The conversion
4447 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4448 Arithmetic.
4449 -------------------------------------------------------------------------------
4450 */
4451 float32 float128_to_float32( float128 a )
4452 {
4453     flag aSign;
4454     int32 aExp;
4455     bits64 aSig0, aSig1;
4456     bits32 zSig;
4457 
4458     aSig1 = extractFloat128Frac1( a );
4459     aSig0 = extractFloat128Frac0( a );
4460     aExp = extractFloat128Exp( a );
4461     aSign = extractFloat128Sign( a );
4462     if ( aExp == 0x7FFF ) {
4463         if ( aSig0 | aSig1 ) {
4464             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4465         }
4466         return packFloat32( aSign, 0xFF, 0 );
4467     }
4468     aSig0 |= ( aSig1 != 0 );
4469     shift64RightJamming( aSig0, 18, &aSig0 );
4470     zSig = aSig0;
4471     if ( aExp || zSig ) {
4472         zSig |= 0x40000000;
4473         aExp -= 0x3F81;
4474     }
4475     return roundAndPackFloat32( aSign, aExp, zSig );
4476 
4477 }
4478 
4479 /*
4480 -------------------------------------------------------------------------------
4481 Returns the result of converting the quadruple-precision floating-point
4482 value `a' to the double-precision floating-point format.  The conversion
4483 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4484 Arithmetic.
4485 -------------------------------------------------------------------------------
4486 */
4487 float64 float128_to_float64( float128 a )
4488 {
4489     flag aSign;
4490     int32 aExp;
4491     bits64 aSig0, aSig1;
4492 
4493     aSig1 = extractFloat128Frac1( a );
4494     aSig0 = extractFloat128Frac0( a );
4495     aExp = extractFloat128Exp( a );
4496     aSign = extractFloat128Sign( a );
4497     if ( aExp == 0x7FFF ) {
4498         if ( aSig0 | aSig1 ) {
4499             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4500         }
4501         return packFloat64( aSign, 0x7FF, 0 );
4502     }
4503     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4504     aSig0 |= ( aSig1 != 0 );
4505     if ( aExp || aSig0 ) {
4506         aSig0 |= LIT64( 0x4000000000000000 );
4507         aExp -= 0x3C01;
4508     }
4509     return roundAndPackFloat64( aSign, aExp, aSig0 );
4510 
4511 }
4512 
4513 #ifdef FLOATX80
4514 
4515 /*
4516 -------------------------------------------------------------------------------
4517 Returns the result of converting the quadruple-precision floating-point
4518 value `a' to the extended double-precision floating-point format.  The
4519 conversion is performed according to the IEC/IEEE Standard for Binary
4520 Floating-Point Arithmetic.
4521 -------------------------------------------------------------------------------
4522 */
4523 floatx80 float128_to_floatx80( float128 a )
4524 {
4525     flag aSign;
4526     int32 aExp;
4527     bits64 aSig0, aSig1;
4528 
4529     aSig1 = extractFloat128Frac1( a );
4530     aSig0 = extractFloat128Frac0( a );
4531     aExp = extractFloat128Exp( a );
4532     aSign = extractFloat128Sign( a );
4533     if ( aExp == 0x7FFF ) {
4534         if ( aSig0 | aSig1 ) {
4535             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4536         }
4537         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4538     }
4539     if ( aExp == 0 ) {
4540         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4541         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4542     }
4543     else {
4544         aSig0 |= LIT64( 0x0001000000000000 );
4545     }
4546     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4547     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4548 
4549 }
4550 
4551 #endif
4552 
4553 /*
4554 -------------------------------------------------------------------------------
4555 Rounds the quadruple-precision floating-point value `a' to an integer, and
4556 returns the result as a quadruple-precision floating-point value.  The
4557 operation is performed according to the IEC/IEEE Standard for Binary
4558 Floating-Point Arithmetic.
4559 -------------------------------------------------------------------------------
4560 */
4561 float128 float128_round_to_int( float128 a )
4562 {
4563     flag aSign;
4564     int32 aExp;
4565     bits64 lastBitMask, roundBitsMask;
4566     int8 roundingMode;
4567     float128 z;
4568 
4569     aExp = extractFloat128Exp( a );
4570     if ( 0x402F <= aExp ) {
4571         if ( 0x406F <= aExp ) {
4572             if (    ( aExp == 0x7FFF )
4573                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4574                ) {
4575                 return propagateFloat128NaN( a, a );
4576             }
4577             return a;
4578         }
4579         lastBitMask = 1;
4580         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4581         roundBitsMask = lastBitMask - 1;
4582         z = a;
4583         roundingMode = float_rounding_mode;
4584         if ( roundingMode == float_round_nearest_even ) {
4585             if ( lastBitMask ) {
4586                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4587                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4588             }
4589             else {
4590                 if ( (sbits64) z.low < 0 ) {
4591                     ++z.high;
4592                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4593                 }
4594             }
4595         }
4596         else if ( roundingMode != float_round_to_zero ) {
4597             if (   extractFloat128Sign( z )
4598                  ^ ( roundingMode == float_round_up ) ) {
4599                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4600             }
4601         }
4602         z.low &= ~ roundBitsMask;
4603     }
4604     else {
4605         if ( aExp < 0x3FFF ) {
4606             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4607             float_exception_flags |= float_flag_inexact;
4608             aSign = extractFloat128Sign( a );
4609             switch ( float_rounding_mode ) {
4610              case float_round_nearest_even:
4611                 if (    ( aExp == 0x3FFE )
4612                      && (   extractFloat128Frac0( a )
4613                           | extractFloat128Frac1( a ) )
4614                    ) {
4615                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4616                 }
4617                 break;
4618 	     case float_round_to_zero:
4619 		break;
4620              case float_round_down:
4621                 return
4622                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4623                     : packFloat128( 0, 0, 0, 0 );
4624              case float_round_up:
4625                 return
4626                       aSign ? packFloat128( 1, 0, 0, 0 )
4627                     : packFloat128( 0, 0x3FFF, 0, 0 );
4628             }
4629             return packFloat128( aSign, 0, 0, 0 );
4630         }
4631         lastBitMask = 1;
4632         lastBitMask <<= 0x402F - aExp;
4633         roundBitsMask = lastBitMask - 1;
4634         z.low = 0;
4635         z.high = a.high;
4636         roundingMode = float_rounding_mode;
4637         if ( roundingMode == float_round_nearest_even ) {
4638             z.high += lastBitMask>>1;
4639             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4640                 z.high &= ~ lastBitMask;
4641             }
4642         }
4643         else if ( roundingMode != float_round_to_zero ) {
4644             if (   extractFloat128Sign( z )
4645                  ^ ( roundingMode == float_round_up ) ) {
4646                 z.high |= ( a.low != 0 );
4647                 z.high += roundBitsMask;
4648             }
4649         }
4650         z.high &= ~ roundBitsMask;
4651     }
4652     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4653         float_exception_flags |= float_flag_inexact;
4654     }
4655     return z;
4656 
4657 }
4658 
4659 /*
4660 -------------------------------------------------------------------------------
4661 Returns the result of adding the absolute values of the quadruple-precision
4662 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4663 before being returned.  `zSign' is ignored if the result is a NaN.
4664 The addition is performed according to the IEC/IEEE Standard for Binary
4665 Floating-Point Arithmetic.
4666 -------------------------------------------------------------------------------
4667 */
4668 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4669 {
4670     int32 aExp, bExp, zExp;
4671     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4672     int32 expDiff;
4673 
4674     aSig1 = extractFloat128Frac1( a );
4675     aSig0 = extractFloat128Frac0( a );
4676     aExp = extractFloat128Exp( a );
4677     bSig1 = extractFloat128Frac1( b );
4678     bSig0 = extractFloat128Frac0( b );
4679     bExp = extractFloat128Exp( b );
4680     expDiff = aExp - bExp;
4681     if ( 0 < expDiff ) {
4682         if ( aExp == 0x7FFF ) {
4683             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4684             return a;
4685         }
4686         if ( bExp == 0 ) {
4687             --expDiff;
4688         }
4689         else {
4690             bSig0 |= LIT64( 0x0001000000000000 );
4691         }
4692         shift128ExtraRightJamming(
4693             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4694         zExp = aExp;
4695     }
4696     else if ( expDiff < 0 ) {
4697         if ( bExp == 0x7FFF ) {
4698             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4699             return packFloat128( zSign, 0x7FFF, 0, 0 );
4700         }
4701         if ( aExp == 0 ) {
4702             ++expDiff;
4703         }
4704         else {
4705             aSig0 |= LIT64( 0x0001000000000000 );
4706         }
4707         shift128ExtraRightJamming(
4708             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4709         zExp = bExp;
4710     }
4711     else {
4712         if ( aExp == 0x7FFF ) {
4713             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4714                 return propagateFloat128NaN( a, b );
4715             }
4716             return a;
4717         }
4718         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4719         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4720         zSig2 = 0;
4721         zSig0 |= LIT64( 0x0002000000000000 );
4722         zExp = aExp;
4723         goto shiftRight1;
4724     }
4725     aSig0 |= LIT64( 0x0001000000000000 );
4726     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4727     --zExp;
4728     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4729     ++zExp;
4730  shiftRight1:
4731     shift128ExtraRightJamming(
4732         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4733  roundAndPack:
4734     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4735 
4736 }
4737 
4738 /*
4739 -------------------------------------------------------------------------------
4740 Returns the result of subtracting the absolute values of the quadruple-
4741 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4742 difference is negated before being returned.  `zSign' is ignored if the
4743 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4744 Standard for Binary Floating-Point Arithmetic.
4745 -------------------------------------------------------------------------------
4746 */
4747 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4748 {
4749     int32 aExp, bExp, zExp;
4750     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4751     int32 expDiff;
4752     float128 z;
4753 
4754     aSig1 = extractFloat128Frac1( a );
4755     aSig0 = extractFloat128Frac0( a );
4756     aExp = extractFloat128Exp( a );
4757     bSig1 = extractFloat128Frac1( b );
4758     bSig0 = extractFloat128Frac0( b );
4759     bExp = extractFloat128Exp( b );
4760     expDiff = aExp - bExp;
4761     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4762     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4763     if ( 0 < expDiff ) goto aExpBigger;
4764     if ( expDiff < 0 ) goto bExpBigger;
4765     if ( aExp == 0x7FFF ) {
4766         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4767             return propagateFloat128NaN( a, b );
4768         }
4769         float_raise( float_flag_invalid );
4770         z.low = float128_default_nan_low;
4771         z.high = float128_default_nan_high;
4772         return z;
4773     }
4774     if ( aExp == 0 ) {
4775         aExp = 1;
4776         bExp = 1;
4777     }
4778     if ( bSig0 < aSig0 ) goto aBigger;
4779     if ( aSig0 < bSig0 ) goto bBigger;
4780     if ( bSig1 < aSig1 ) goto aBigger;
4781     if ( aSig1 < bSig1 ) goto bBigger;
4782     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4783  bExpBigger:
4784     if ( bExp == 0x7FFF ) {
4785         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4786         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4787     }
4788     if ( aExp == 0 ) {
4789         ++expDiff;
4790     }
4791     else {
4792         aSig0 |= LIT64( 0x4000000000000000 );
4793     }
4794     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4795     bSig0 |= LIT64( 0x4000000000000000 );
4796  bBigger:
4797     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4798     zExp = bExp;
4799     zSign ^= 1;
4800     goto normalizeRoundAndPack;
4801  aExpBigger:
4802     if ( aExp == 0x7FFF ) {
4803         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4804         return a;
4805     }
4806     if ( bExp == 0 ) {
4807         --expDiff;
4808     }
4809     else {
4810         bSig0 |= LIT64( 0x4000000000000000 );
4811     }
4812     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4813     aSig0 |= LIT64( 0x4000000000000000 );
4814  aBigger:
4815     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4816     zExp = aExp;
4817  normalizeRoundAndPack:
4818     --zExp;
4819     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4820 
4821 }
4822 
4823 /*
4824 -------------------------------------------------------------------------------
4825 Returns the result of adding the quadruple-precision floating-point values
4826 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4827 for Binary Floating-Point Arithmetic.
4828 -------------------------------------------------------------------------------
4829 */
4830 float128 float128_add( float128 a, float128 b )
4831 {
4832     flag aSign, bSign;
4833 
4834     aSign = extractFloat128Sign( a );
4835     bSign = extractFloat128Sign( b );
4836     if ( aSign == bSign ) {
4837         return addFloat128Sigs( a, b, aSign );
4838     }
4839     else {
4840         return subFloat128Sigs( a, b, aSign );
4841     }
4842 
4843 }
4844 
4845 /*
4846 -------------------------------------------------------------------------------
4847 Returns the result of subtracting the quadruple-precision floating-point
4848 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4849 Standard for Binary Floating-Point Arithmetic.
4850 -------------------------------------------------------------------------------
4851 */
4852 float128 float128_sub( float128 a, float128 b )
4853 {
4854     flag aSign, bSign;
4855 
4856     aSign = extractFloat128Sign( a );
4857     bSign = extractFloat128Sign( b );
4858     if ( aSign == bSign ) {
4859         return subFloat128Sigs( a, b, aSign );
4860     }
4861     else {
4862         return addFloat128Sigs( a, b, aSign );
4863     }
4864 
4865 }
4866 
4867 /*
4868 -------------------------------------------------------------------------------
4869 Returns the result of multiplying the quadruple-precision floating-point
4870 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4871 Standard for Binary Floating-Point Arithmetic.
4872 -------------------------------------------------------------------------------
4873 */
4874 float128 float128_mul( float128 a, float128 b )
4875 {
4876     flag aSign, bSign, zSign;
4877     int32 aExp, bExp, zExp;
4878     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4879     float128 z;
4880 
4881     aSig1 = extractFloat128Frac1( a );
4882     aSig0 = extractFloat128Frac0( a );
4883     aExp = extractFloat128Exp( a );
4884     aSign = extractFloat128Sign( a );
4885     bSig1 = extractFloat128Frac1( b );
4886     bSig0 = extractFloat128Frac0( b );
4887     bExp = extractFloat128Exp( b );
4888     bSign = extractFloat128Sign( b );
4889     zSign = aSign ^ bSign;
4890     if ( aExp == 0x7FFF ) {
4891         if (    ( aSig0 | aSig1 )
4892              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4893             return propagateFloat128NaN( a, b );
4894         }
4895         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4896         return packFloat128( zSign, 0x7FFF, 0, 0 );
4897     }
4898     if ( bExp == 0x7FFF ) {
4899         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4900         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4901  invalid:
4902             float_raise( float_flag_invalid );
4903             z.low = float128_default_nan_low;
4904             z.high = float128_default_nan_high;
4905             return z;
4906         }
4907         return packFloat128( zSign, 0x7FFF, 0, 0 );
4908     }
4909     if ( aExp == 0 ) {
4910         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4911         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4912     }
4913     if ( bExp == 0 ) {
4914         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4915         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4916     }
4917     zExp = aExp + bExp - 0x4000;
4918     aSig0 |= LIT64( 0x0001000000000000 );
4919     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4920     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4921     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4922     zSig2 |= ( zSig3 != 0 );
4923     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4924         shift128ExtraRightJamming(
4925             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4926         ++zExp;
4927     }
4928     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4929 
4930 }
4931 
4932 /*
4933 -------------------------------------------------------------------------------
4934 Returns the result of dividing the quadruple-precision floating-point value
4935 `a' by the corresponding value `b'.  The operation is performed according to
4936 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4937 -------------------------------------------------------------------------------
4938 */
4939 float128 float128_div( float128 a, float128 b )
4940 {
4941     flag aSign, bSign, zSign;
4942     int32 aExp, bExp, zExp;
4943     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4944     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4945     float128 z;
4946 
4947     aSig1 = extractFloat128Frac1( a );
4948     aSig0 = extractFloat128Frac0( a );
4949     aExp = extractFloat128Exp( a );
4950     aSign = extractFloat128Sign( a );
4951     bSig1 = extractFloat128Frac1( b );
4952     bSig0 = extractFloat128Frac0( b );
4953     bExp = extractFloat128Exp( b );
4954     bSign = extractFloat128Sign( b );
4955     zSign = aSign ^ bSign;
4956     if ( aExp == 0x7FFF ) {
4957         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4958         if ( bExp == 0x7FFF ) {
4959             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4960             goto invalid;
4961         }
4962         return packFloat128( zSign, 0x7FFF, 0, 0 );
4963     }
4964     if ( bExp == 0x7FFF ) {
4965         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4966         return packFloat128( zSign, 0, 0, 0 );
4967     }
4968     if ( bExp == 0 ) {
4969         if ( ( bSig0 | bSig1 ) == 0 ) {
4970             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4971  invalid:
4972                 float_raise( float_flag_invalid );
4973                 z.low = float128_default_nan_low;
4974                 z.high = float128_default_nan_high;
4975                 return z;
4976             }
4977             float_raise( float_flag_divbyzero );
4978             return packFloat128( zSign, 0x7FFF, 0, 0 );
4979         }
4980         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4981     }
4982     if ( aExp == 0 ) {
4983         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4984         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4985     }
4986     zExp = aExp - bExp + 0x3FFD;
4987     shortShift128Left(
4988         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4989     shortShift128Left(
4990         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4991     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4992         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4993         ++zExp;
4994     }
4995     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4996     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4997     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4998     while ( (sbits64) rem0 < 0 ) {
4999         --zSig0;
5000         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5001     }
5002     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5003     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5004         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5005         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5006         while ( (sbits64) rem1 < 0 ) {
5007             --zSig1;
5008             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5009         }
5010         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5011     }
5012     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5013     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5014 
5015 }
5016 
5017 /*
5018 -------------------------------------------------------------------------------
5019 Returns the remainder of the quadruple-precision floating-point value `a'
5020 with respect to the corresponding value `b'.  The operation is performed
5021 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5022 -------------------------------------------------------------------------------
5023 */
5024 float128 float128_rem( float128 a, float128 b )
5025 {
5026     flag aSign, bSign, zSign;
5027     int32 aExp, bExp, expDiff;
5028     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5029     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5030     sbits64 sigMean0;
5031     float128 z;
5032 
5033     aSig1 = extractFloat128Frac1( a );
5034     aSig0 = extractFloat128Frac0( a );
5035     aExp = extractFloat128Exp( a );
5036     aSign = extractFloat128Sign( a );
5037     bSig1 = extractFloat128Frac1( b );
5038     bSig0 = extractFloat128Frac0( b );
5039     bExp = extractFloat128Exp( b );
5040     bSign = extractFloat128Sign( b );
5041     if ( aExp == 0x7FFF ) {
5042         if (    ( aSig0 | aSig1 )
5043              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5044             return propagateFloat128NaN( a, b );
5045         }
5046         goto invalid;
5047     }
5048     if ( bExp == 0x7FFF ) {
5049         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5050         return a;
5051     }
5052     if ( bExp == 0 ) {
5053         if ( ( bSig0 | bSig1 ) == 0 ) {
5054  invalid:
5055             float_raise( float_flag_invalid );
5056             z.low = float128_default_nan_low;
5057             z.high = float128_default_nan_high;
5058             return z;
5059         }
5060         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5061     }
5062     if ( aExp == 0 ) {
5063         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5064         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5065     }
5066     expDiff = aExp - bExp;
5067     if ( expDiff < -1 ) return a;
5068     shortShift128Left(
5069         aSig0 | LIT64( 0x0001000000000000 ),
5070         aSig1,
5071         15 - ( expDiff < 0 ),
5072         &aSig0,
5073         &aSig1
5074     );
5075     shortShift128Left(
5076         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5077     q = le128( bSig0, bSig1, aSig0, aSig1 );
5078     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5079     expDiff -= 64;
5080     while ( 0 < expDiff ) {
5081         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5082         q = ( 4 < q ) ? q - 4 : 0;
5083         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5084         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5085         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5086         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5087         expDiff -= 61;
5088     }
5089     if ( -64 < expDiff ) {
5090         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5091         q = ( 4 < q ) ? q - 4 : 0;
5092         q >>= - expDiff;
5093         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5094         expDiff += 52;
5095         if ( expDiff < 0 ) {
5096             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5097         }
5098         else {
5099             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5100         }
5101         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5102         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5103     }
5104     else {
5105         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5106         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5107     }
5108     do {
5109         alternateASig0 = aSig0;
5110         alternateASig1 = aSig1;
5111         ++q;
5112         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5113     } while ( 0 <= (sbits64) aSig0 );
5114     add128(
5115         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5116     if (    ( sigMean0 < 0 )
5117          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5118         aSig0 = alternateASig0;
5119         aSig1 = alternateASig1;
5120     }
5121     zSign = ( (sbits64) aSig0 < 0 );
5122     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5123     return
5124         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5125 
5126 }
5127 
5128 /*
5129 -------------------------------------------------------------------------------
5130 Returns the square root of the quadruple-precision floating-point value `a'.
5131 The operation is performed according to the IEC/IEEE Standard for Binary
5132 Floating-Point Arithmetic.
5133 -------------------------------------------------------------------------------
5134 */
5135 float128 float128_sqrt( float128 a )
5136 {
5137     flag aSign;
5138     int32 aExp, zExp;
5139     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5140     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5141     float128 z;
5142 
5143     aSig1 = extractFloat128Frac1( a );
5144     aSig0 = extractFloat128Frac0( a );
5145     aExp = extractFloat128Exp( a );
5146     aSign = extractFloat128Sign( a );
5147     if ( aExp == 0x7FFF ) {
5148         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5149         if ( ! aSign ) return a;
5150         goto invalid;
5151     }
5152     if ( aSign ) {
5153         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5154  invalid:
5155         float_raise( float_flag_invalid );
5156         z.low = float128_default_nan_low;
5157         z.high = float128_default_nan_high;
5158         return z;
5159     }
5160     if ( aExp == 0 ) {
5161         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5162         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5163     }
5164     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5165     aSig0 |= LIT64( 0x0001000000000000 );
5166     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5167     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5168     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5169     doubleZSig0 = zSig0<<1;
5170     mul64To128( zSig0, zSig0, &term0, &term1 );
5171     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5172     while ( (sbits64) rem0 < 0 ) {
5173         --zSig0;
5174         doubleZSig0 -= 2;
5175         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5176     }
5177     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5178     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5179         if ( zSig1 == 0 ) zSig1 = 1;
5180         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5181         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5182         mul64To128( zSig1, zSig1, &term2, &term3 );
5183         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5184         while ( (sbits64) rem1 < 0 ) {
5185             --zSig1;
5186             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5187             term3 |= 1;
5188             term2 |= doubleZSig0;
5189             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5190         }
5191         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5192     }
5193     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5194     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5195 
5196 }
5197 
5198 /*
5199 -------------------------------------------------------------------------------
5200 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5201 the corresponding value `b', and 0 otherwise.  The comparison is performed
5202 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5203 -------------------------------------------------------------------------------
5204 */
5205 flag float128_eq( float128 a, float128 b )
5206 {
5207 
5208     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5209               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5210          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5211               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5212        ) {
5213         if (    float128_is_signaling_nan( a )
5214              || float128_is_signaling_nan( b ) ) {
5215             float_raise( float_flag_invalid );
5216         }
5217         return 0;
5218     }
5219     return
5220            ( a.low == b.low )
5221         && (    ( a.high == b.high )
5222              || (    ( a.low == 0 )
5223                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5224            );
5225 
5226 }
5227 
5228 /*
5229 -------------------------------------------------------------------------------
5230 Returns 1 if the quadruple-precision floating-point value `a' is less than
5231 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5232 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5233 Arithmetic.
5234 -------------------------------------------------------------------------------
5235 */
5236 flag float128_le( float128 a, float128 b )
5237 {
5238     flag aSign, bSign;
5239 
5240     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5241               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5242          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5243               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5244        ) {
5245         float_raise( float_flag_invalid );
5246         return 0;
5247     }
5248     aSign = extractFloat128Sign( a );
5249     bSign = extractFloat128Sign( b );
5250     if ( aSign != bSign ) {
5251         return
5252                aSign
5253             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5254                  == 0 );
5255     }
5256     return
5257           aSign ? le128( b.high, b.low, a.high, a.low )
5258         : le128( a.high, a.low, b.high, b.low );
5259 
5260 }
5261 
5262 /*
5263 -------------------------------------------------------------------------------
5264 Returns 1 if the quadruple-precision floating-point value `a' is less than
5265 the corresponding value `b', and 0 otherwise.  The comparison is performed
5266 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5267 -------------------------------------------------------------------------------
5268 */
5269 flag float128_lt( float128 a, float128 b )
5270 {
5271     flag aSign, bSign;
5272 
5273     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5274               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5275          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5276               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5277        ) {
5278         float_raise( float_flag_invalid );
5279         return 0;
5280     }
5281     aSign = extractFloat128Sign( a );
5282     bSign = extractFloat128Sign( b );
5283     if ( aSign != bSign ) {
5284         return
5285                aSign
5286             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5287                  != 0 );
5288     }
5289     return
5290           aSign ? lt128( b.high, b.low, a.high, a.low )
5291         : lt128( a.high, a.low, b.high, b.low );
5292 
5293 }
5294 
5295 /*
5296 -------------------------------------------------------------------------------
5297 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5298 the corresponding value `b', and 0 otherwise.  The invalid exception is
5299 raised if either operand is a NaN.  Otherwise, the comparison is performed
5300 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5301 -------------------------------------------------------------------------------
5302 */
5303 flag float128_eq_signaling( float128 a, float128 b )
5304 {
5305 
5306     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5307               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5308          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5309               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5310        ) {
5311         float_raise( float_flag_invalid );
5312         return 0;
5313     }
5314     return
5315            ( a.low == b.low )
5316         && (    ( a.high == b.high )
5317              || (    ( a.low == 0 )
5318                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5319            );
5320 
5321 }
5322 
5323 /*
5324 -------------------------------------------------------------------------------
5325 Returns 1 if the quadruple-precision floating-point value `a' is less than
5326 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5327 cause an exception.  Otherwise, the comparison is performed according to the
5328 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5329 -------------------------------------------------------------------------------
5330 */
5331 flag float128_le_quiet( float128 a, float128 b )
5332 {
5333     flag aSign, bSign;
5334 
5335     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5336               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5337          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5338               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5339        ) {
5340         if (    float128_is_signaling_nan( a )
5341              || float128_is_signaling_nan( b ) ) {
5342             float_raise( float_flag_invalid );
5343         }
5344         return 0;
5345     }
5346     aSign = extractFloat128Sign( a );
5347     bSign = extractFloat128Sign( b );
5348     if ( aSign != bSign ) {
5349         return
5350                aSign
5351             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5352                  == 0 );
5353     }
5354     return
5355           aSign ? le128( b.high, b.low, a.high, a.low )
5356         : le128( a.high, a.low, b.high, b.low );
5357 
5358 }
5359 
5360 /*
5361 -------------------------------------------------------------------------------
5362 Returns 1 if the quadruple-precision floating-point value `a' is less than
5363 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5364 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5365 Standard for Binary Floating-Point Arithmetic.
5366 -------------------------------------------------------------------------------
5367 */
5368 flag float128_lt_quiet( float128 a, float128 b )
5369 {
5370     flag aSign, bSign;
5371 
5372     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5373               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5374          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5375               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5376        ) {
5377         if (    float128_is_signaling_nan( a )
5378              || float128_is_signaling_nan( b ) ) {
5379             float_raise( float_flag_invalid );
5380         }
5381         return 0;
5382     }
5383     aSign = extractFloat128Sign( a );
5384     bSign = extractFloat128Sign( b );
5385     if ( aSign != bSign ) {
5386         return
5387                aSign
5388             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5389                  != 0 );
5390     }
5391     return
5392           aSign ? lt128( b.high, b.low, a.high, a.low )
5393         : lt128( a.high, a.low, b.high, b.low );
5394 
5395 }
5396 
5397 #endif
5398 
5399 
5400 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5401 
5402 /*
5403  * These two routines are not part of the original softfloat distribution.
5404  *
5405  * They are based on the corresponding conversions to integer but return
5406  * unsigned numbers instead since these functions are required by GCC.
5407  *
5408  * Added by Mark Brinicombe <mark@netbsd.org>	27/09/97
5409  *
5410  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5411  */
5412 
5413 /*
5414 -------------------------------------------------------------------------------
5415 Returns the result of converting the double-precision floating-point value
5416 `a' to the 32-bit unsigned integer format.  The conversion is
5417 performed according to the IEC/IEEE Standard for Binary Floating-point
5418 Arithmetic, except that the conversion is always rounded toward zero.  If
5419 `a' is a NaN, the largest positive integer is returned.  If the conversion
5420 overflows, the largest integer positive is returned.
5421 -------------------------------------------------------------------------------
5422 */
5423 uint32 float64_to_uint32_round_to_zero( float64 a )
5424 {
5425     flag aSign;
5426     int16 aExp, shiftCount;
5427     bits64 aSig, savedASig;
5428     uint32 z;
5429 
5430     aSig = extractFloat64Frac( a );
5431     aExp = extractFloat64Exp( a );
5432     aSign = extractFloat64Sign( a );
5433 
5434     if (aSign) {
5435         float_raise( float_flag_invalid );
5436     	return(0);
5437     }
5438 
5439     if ( 0x41E < aExp ) {
5440         float_raise( float_flag_invalid );
5441         return 0xffffffff;
5442     }
5443     else if ( aExp < 0x3FF ) {
5444         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5445         return 0;
5446     }
5447     aSig |= LIT64( 0x0010000000000000 );
5448     shiftCount = 0x433 - aExp;
5449     savedASig = aSig;
5450     aSig >>= shiftCount;
5451     z = aSig;
5452     if ( ( aSig<<shiftCount ) != savedASig ) {
5453         float_exception_flags |= float_flag_inexact;
5454     }
5455     return z;
5456 
5457 }
5458 
5459 /*
5460 -------------------------------------------------------------------------------
5461 Returns the result of converting the single-precision floating-point value
5462 `a' to the 32-bit unsigned integer format.  The conversion is
5463 performed according to the IEC/IEEE Standard for Binary Floating-point
5464 Arithmetic, except that the conversion is always rounded toward zero.  If
5465 `a' is a NaN, the largest positive integer is returned.  If the conversion
5466 overflows, the largest positive integer is returned.
5467 -------------------------------------------------------------------------------
5468 */
5469 uint32 float32_to_uint32_round_to_zero( float32 a )
5470 {
5471     flag aSign;
5472     int16 aExp, shiftCount;
5473     bits32 aSig;
5474     uint32 z;
5475 
5476     aSig = extractFloat32Frac( a );
5477     aExp = extractFloat32Exp( a );
5478     aSign = extractFloat32Sign( a );
5479     shiftCount = aExp - 0x9E;
5480 
5481     if (aSign) {
5482         float_raise( float_flag_invalid );
5483     	return(0);
5484     }
5485     if ( 0 < shiftCount ) {
5486         float_raise( float_flag_invalid );
5487         return 0xFFFFFFFF;
5488     }
5489     else if ( aExp <= 0x7E ) {
5490         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5491         return 0;
5492     }
5493     aSig = ( aSig | 0x800000 )<<8;
5494     z = aSig>>( - shiftCount );
5495     if ( aSig<<( shiftCount & 31 ) ) {
5496         float_exception_flags |= float_flag_inexact;
5497     }
5498     return z;
5499 
5500 }
5501 
5502 #endif
5503