xref: /qemu/fpu/softfloat.c (revision 814bb12a)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
138 | input.  If `zSign' is 1, the input is negated before being converted to an
139 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
140 | is simply rounded to an integer, with the inexact exception raised if the
141 | input cannot be represented exactly as an integer.  However, if the fixed-
142 | point input is too large, the invalid exception is raised and the largest
143 | positive or negative integer is returned.
144 *----------------------------------------------------------------------------*/
145 
146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
147 {
148     int8_t roundingMode;
149     flag roundNearestEven;
150     int8_t roundIncrement, roundBits;
151     int32_t z;
152 
153     roundingMode = status->float_rounding_mode;
154     roundNearestEven = ( roundingMode == float_round_nearest_even );
155     switch (roundingMode) {
156     case float_round_nearest_even:
157     case float_round_ties_away:
158         roundIncrement = 0x40;
159         break;
160     case float_round_to_zero:
161         roundIncrement = 0;
162         break;
163     case float_round_up:
164         roundIncrement = zSign ? 0 : 0x7f;
165         break;
166     case float_round_down:
167         roundIncrement = zSign ? 0x7f : 0;
168         break;
169     default:
170         abort();
171     }
172     roundBits = absZ & 0x7F;
173     absZ = ( absZ + roundIncrement )>>7;
174     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
175     z = absZ;
176     if ( zSign ) z = - z;
177     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
178         float_raise(float_flag_invalid, status);
179         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
180     }
181     if (roundBits) {
182         status->float_exception_flags |= float_flag_inexact;
183     }
184     return z;
185 
186 }
187 
188 /*----------------------------------------------------------------------------
189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
191 | and returns the properly rounded 64-bit integer corresponding to the input.
192 | If `zSign' is 1, the input is negated before being converted to an integer.
193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
194 | the inexact exception raised if the input cannot be represented exactly as
195 | an integer.  However, if the fixed-point input is too large, the invalid
196 | exception is raised and the largest positive or negative integer is
197 | returned.
198 *----------------------------------------------------------------------------*/
199 
200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
201                                float_status *status)
202 {
203     int8_t roundingMode;
204     flag roundNearestEven, increment;
205     int64_t z;
206 
207     roundingMode = status->float_rounding_mode;
208     roundNearestEven = ( roundingMode == float_round_nearest_even );
209     switch (roundingMode) {
210     case float_round_nearest_even:
211     case float_round_ties_away:
212         increment = ((int64_t) absZ1 < 0);
213         break;
214     case float_round_to_zero:
215         increment = 0;
216         break;
217     case float_round_up:
218         increment = !zSign && absZ1;
219         break;
220     case float_round_down:
221         increment = zSign && absZ1;
222         break;
223     default:
224         abort();
225     }
226     if ( increment ) {
227         ++absZ0;
228         if ( absZ0 == 0 ) goto overflow;
229         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
230     }
231     z = absZ0;
232     if ( zSign ) z = - z;
233     if ( z && ( ( z < 0 ) ^ zSign ) ) {
234  overflow:
235         float_raise(float_flag_invalid, status);
236         return
237               zSign ? (int64_t) LIT64( 0x8000000000000000 )
238             : LIT64( 0x7FFFFFFFFFFFFFFF );
239     }
240     if (absZ1) {
241         status->float_exception_flags |= float_flag_inexact;
242     }
243     return z;
244 
245 }
246 
247 /*----------------------------------------------------------------------------
248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
251 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
252 | with the inexact exception raised if the input cannot be represented exactly
253 | as an integer.  However, if the fixed-point input is too large, the invalid
254 | exception is raised and the largest unsigned integer is returned.
255 *----------------------------------------------------------------------------*/
256 
257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
258                                 uint64_t absZ1, float_status *status)
259 {
260     int8_t roundingMode;
261     flag roundNearestEven, increment;
262 
263     roundingMode = status->float_rounding_mode;
264     roundNearestEven = (roundingMode == float_round_nearest_even);
265     switch (roundingMode) {
266     case float_round_nearest_even:
267     case float_round_ties_away:
268         increment = ((int64_t)absZ1 < 0);
269         break;
270     case float_round_to_zero:
271         increment = 0;
272         break;
273     case float_round_up:
274         increment = !zSign && absZ1;
275         break;
276     case float_round_down:
277         increment = zSign && absZ1;
278         break;
279     default:
280         abort();
281     }
282     if (increment) {
283         ++absZ0;
284         if (absZ0 == 0) {
285             float_raise(float_flag_invalid, status);
286             return LIT64(0xFFFFFFFFFFFFFFFF);
287         }
288         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
289     }
290 
291     if (zSign && absZ0) {
292         float_raise(float_flag_invalid, status);
293         return 0;
294     }
295 
296     if (absZ1) {
297         status->float_exception_flags |= float_flag_inexact;
298     }
299     return absZ0;
300 }
301 
302 /*----------------------------------------------------------------------------
303 | Returns the fraction bits of the single-precision floating-point value `a'.
304 *----------------------------------------------------------------------------*/
305 
306 static inline uint32_t extractFloat32Frac( float32 a )
307 {
308 
309     return float32_val(a) & 0x007FFFFF;
310 
311 }
312 
313 /*----------------------------------------------------------------------------
314 | Returns the exponent bits of the single-precision floating-point value `a'.
315 *----------------------------------------------------------------------------*/
316 
317 static inline int extractFloat32Exp(float32 a)
318 {
319 
320     return ( float32_val(a)>>23 ) & 0xFF;
321 
322 }
323 
324 /*----------------------------------------------------------------------------
325 | Returns the sign bit of the single-precision floating-point value `a'.
326 *----------------------------------------------------------------------------*/
327 
328 static inline flag extractFloat32Sign( float32 a )
329 {
330 
331     return float32_val(a)>>31;
332 
333 }
334 
335 /*----------------------------------------------------------------------------
336 | If `a' is denormal and we are in flush-to-zero mode then set the
337 | input-denormal exception and return zero. Otherwise just return the value.
338 *----------------------------------------------------------------------------*/
339 float32 float32_squash_input_denormal(float32 a, float_status *status)
340 {
341     if (status->flush_inputs_to_zero) {
342         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
343             float_raise(float_flag_input_denormal, status);
344             return make_float32(float32_val(a) & 0x80000000);
345         }
346     }
347     return a;
348 }
349 
350 /*----------------------------------------------------------------------------
351 | Normalizes the subnormal single-precision floating-point value represented
352 | by the denormalized significand `aSig'.  The normalized exponent and
353 | significand are stored at the locations pointed to by `zExpPtr' and
354 | `zSigPtr', respectively.
355 *----------------------------------------------------------------------------*/
356 
357 static void
358  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
359 {
360     int8_t shiftCount;
361 
362     shiftCount = countLeadingZeros32( aSig ) - 8;
363     *zSigPtr = aSig<<shiftCount;
364     *zExpPtr = 1 - shiftCount;
365 
366 }
367 
368 /*----------------------------------------------------------------------------
369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370 | single-precision floating-point value, returning the result.  After being
371 | shifted into the proper positions, the three fields are simply added
372 | together to form the result.  This means that any integer portion of `zSig'
373 | will be added into the exponent.  Since a properly normalized significand
374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
375 | than the desired result exponent whenever `zSig' is a complete, normalized
376 | significand.
377 *----------------------------------------------------------------------------*/
378 
379 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
380 {
381 
382     return make_float32(
383           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
384 
385 }
386 
387 /*----------------------------------------------------------------------------
388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389 | and significand `zSig', and returns the proper single-precision floating-
390 | point value corresponding to the abstract input.  Ordinarily, the abstract
391 | value is simply rounded and packed into the single-precision format, with
392 | the inexact exception raised if the abstract input cannot be represented
393 | exactly.  However, if the abstract value is too large, the overflow and
394 | inexact exceptions are raised and an infinity or maximal finite value is
395 | returned.  If the abstract value is too small, the input value is rounded to
396 | a subnormal number, and the underflow and inexact exceptions are raised if
397 | the abstract input cannot be represented exactly as a subnormal single-
398 | precision floating-point number.
399 |     The input significand `zSig' has its binary point between bits 30
400 | and 29, which is 7 bits to the left of the usual location.  This shifted
401 | significand must be normalized or smaller.  If `zSig' is not normalized,
402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
403 | and it must not require rounding.  In the usual case that `zSig' is
404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
406 | Binary Floating-Point Arithmetic.
407 *----------------------------------------------------------------------------*/
408 
409 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
410                                    float_status *status)
411 {
412     int8_t roundingMode;
413     flag roundNearestEven;
414     int8_t roundIncrement, roundBits;
415     flag isTiny;
416 
417     roundingMode = status->float_rounding_mode;
418     roundNearestEven = ( roundingMode == float_round_nearest_even );
419     switch (roundingMode) {
420     case float_round_nearest_even:
421     case float_round_ties_away:
422         roundIncrement = 0x40;
423         break;
424     case float_round_to_zero:
425         roundIncrement = 0;
426         break;
427     case float_round_up:
428         roundIncrement = zSign ? 0 : 0x7f;
429         break;
430     case float_round_down:
431         roundIncrement = zSign ? 0x7f : 0;
432         break;
433     default:
434         abort();
435         break;
436     }
437     roundBits = zSig & 0x7F;
438     if ( 0xFD <= (uint16_t) zExp ) {
439         if (    ( 0xFD < zExp )
440              || (    ( zExp == 0xFD )
441                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
442            ) {
443             float_raise(float_flag_overflow | float_flag_inexact, status);
444             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
445         }
446         if ( zExp < 0 ) {
447             if (status->flush_to_zero) {
448                 float_raise(float_flag_output_denormal, status);
449                 return packFloat32(zSign, 0, 0);
450             }
451             isTiny =
452                 (status->float_detect_tininess
453                  == float_tininess_before_rounding)
454                 || ( zExp < -1 )
455                 || ( zSig + roundIncrement < 0x80000000 );
456             shift32RightJamming( zSig, - zExp, &zSig );
457             zExp = 0;
458             roundBits = zSig & 0x7F;
459             if (isTiny && roundBits) {
460                 float_raise(float_flag_underflow, status);
461             }
462         }
463     }
464     if (roundBits) {
465         status->float_exception_flags |= float_flag_inexact;
466     }
467     zSig = ( zSig + roundIncrement )>>7;
468     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
469     if ( zSig == 0 ) zExp = 0;
470     return packFloat32( zSign, zExp, zSig );
471 
472 }
473 
474 /*----------------------------------------------------------------------------
475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476 | and significand `zSig', and returns the proper single-precision floating-
477 | point value corresponding to the abstract input.  This routine is just like
478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480 | floating-point exponent.
481 *----------------------------------------------------------------------------*/
482 
483 static float32
484  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
485                               float_status *status)
486 {
487     int8_t shiftCount;
488 
489     shiftCount = countLeadingZeros32( zSig ) - 1;
490     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
491                                status);
492 
493 }
494 
495 /*----------------------------------------------------------------------------
496 | Returns the fraction bits of the double-precision floating-point value `a'.
497 *----------------------------------------------------------------------------*/
498 
499 static inline uint64_t extractFloat64Frac( float64 a )
500 {
501 
502     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
503 
504 }
505 
506 /*----------------------------------------------------------------------------
507 | Returns the exponent bits of the double-precision floating-point value `a'.
508 *----------------------------------------------------------------------------*/
509 
510 static inline int extractFloat64Exp(float64 a)
511 {
512 
513     return ( float64_val(a)>>52 ) & 0x7FF;
514 
515 }
516 
517 /*----------------------------------------------------------------------------
518 | Returns the sign bit of the double-precision floating-point value `a'.
519 *----------------------------------------------------------------------------*/
520 
521 static inline flag extractFloat64Sign( float64 a )
522 {
523 
524     return float64_val(a)>>63;
525 
526 }
527 
528 /*----------------------------------------------------------------------------
529 | If `a' is denormal and we are in flush-to-zero mode then set the
530 | input-denormal exception and return zero. Otherwise just return the value.
531 *----------------------------------------------------------------------------*/
532 float64 float64_squash_input_denormal(float64 a, float_status *status)
533 {
534     if (status->flush_inputs_to_zero) {
535         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
536             float_raise(float_flag_input_denormal, status);
537             return make_float64(float64_val(a) & (1ULL << 63));
538         }
539     }
540     return a;
541 }
542 
543 /*----------------------------------------------------------------------------
544 | Normalizes the subnormal double-precision floating-point value represented
545 | by the denormalized significand `aSig'.  The normalized exponent and
546 | significand are stored at the locations pointed to by `zExpPtr' and
547 | `zSigPtr', respectively.
548 *----------------------------------------------------------------------------*/
549 
550 static void
551  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
552 {
553     int8_t shiftCount;
554 
555     shiftCount = countLeadingZeros64( aSig ) - 11;
556     *zSigPtr = aSig<<shiftCount;
557     *zExpPtr = 1 - shiftCount;
558 
559 }
560 
561 /*----------------------------------------------------------------------------
562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563 | double-precision floating-point value, returning the result.  After being
564 | shifted into the proper positions, the three fields are simply added
565 | together to form the result.  This means that any integer portion of `zSig'
566 | will be added into the exponent.  Since a properly normalized significand
567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
568 | than the desired result exponent whenever `zSig' is a complete, normalized
569 | significand.
570 *----------------------------------------------------------------------------*/
571 
572 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
573 {
574 
575     return make_float64(
576         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
577 
578 }
579 
580 /*----------------------------------------------------------------------------
581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582 | and significand `zSig', and returns the proper double-precision floating-
583 | point value corresponding to the abstract input.  Ordinarily, the abstract
584 | value is simply rounded and packed into the double-precision format, with
585 | the inexact exception raised if the abstract input cannot be represented
586 | exactly.  However, if the abstract value is too large, the overflow and
587 | inexact exceptions are raised and an infinity or maximal finite value is
588 | returned.  If the abstract value is too small, the input value is rounded to
589 | a subnormal number, and the underflow and inexact exceptions are raised if
590 | the abstract input cannot be represented exactly as a subnormal double-
591 | precision floating-point number.
592 |     The input significand `zSig' has its binary point between bits 62
593 | and 61, which is 10 bits to the left of the usual location.  This shifted
594 | significand must be normalized or smaller.  If `zSig' is not normalized,
595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
596 | and it must not require rounding.  In the usual case that `zSig' is
597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
599 | Binary Floating-Point Arithmetic.
600 *----------------------------------------------------------------------------*/
601 
602 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
603                                    float_status *status)
604 {
605     int8_t roundingMode;
606     flag roundNearestEven;
607     int roundIncrement, roundBits;
608     flag isTiny;
609 
610     roundingMode = status->float_rounding_mode;
611     roundNearestEven = ( roundingMode == float_round_nearest_even );
612     switch (roundingMode) {
613     case float_round_nearest_even:
614     case float_round_ties_away:
615         roundIncrement = 0x200;
616         break;
617     case float_round_to_zero:
618         roundIncrement = 0;
619         break;
620     case float_round_up:
621         roundIncrement = zSign ? 0 : 0x3ff;
622         break;
623     case float_round_down:
624         roundIncrement = zSign ? 0x3ff : 0;
625         break;
626     default:
627         abort();
628     }
629     roundBits = zSig & 0x3FF;
630     if ( 0x7FD <= (uint16_t) zExp ) {
631         if (    ( 0x7FD < zExp )
632              || (    ( zExp == 0x7FD )
633                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
634            ) {
635             float_raise(float_flag_overflow | float_flag_inexact, status);
636             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
637         }
638         if ( zExp < 0 ) {
639             if (status->flush_to_zero) {
640                 float_raise(float_flag_output_denormal, status);
641                 return packFloat64(zSign, 0, 0);
642             }
643             isTiny =
644                    (status->float_detect_tininess
645                     == float_tininess_before_rounding)
646                 || ( zExp < -1 )
647                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
648             shift64RightJamming( zSig, - zExp, &zSig );
649             zExp = 0;
650             roundBits = zSig & 0x3FF;
651             if (isTiny && roundBits) {
652                 float_raise(float_flag_underflow, status);
653             }
654         }
655     }
656     if (roundBits) {
657         status->float_exception_flags |= float_flag_inexact;
658     }
659     zSig = ( zSig + roundIncrement )>>10;
660     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
661     if ( zSig == 0 ) zExp = 0;
662     return packFloat64( zSign, zExp, zSig );
663 
664 }
665 
666 /*----------------------------------------------------------------------------
667 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
668 | and significand `zSig', and returns the proper double-precision floating-
669 | point value corresponding to the abstract input.  This routine is just like
670 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
671 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
672 | floating-point exponent.
673 *----------------------------------------------------------------------------*/
674 
675 static float64
676  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
677                               float_status *status)
678 {
679     int8_t shiftCount;
680 
681     shiftCount = countLeadingZeros64( zSig ) - 1;
682     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
683                                status);
684 
685 }
686 
687 /*----------------------------------------------------------------------------
688 | Returns the fraction bits of the extended double-precision floating-point
689 | value `a'.
690 *----------------------------------------------------------------------------*/
691 
692 static inline uint64_t extractFloatx80Frac( floatx80 a )
693 {
694 
695     return a.low;
696 
697 }
698 
699 /*----------------------------------------------------------------------------
700 | Returns the exponent bits of the extended double-precision floating-point
701 | value `a'.
702 *----------------------------------------------------------------------------*/
703 
704 static inline int32_t extractFloatx80Exp( floatx80 a )
705 {
706 
707     return a.high & 0x7FFF;
708 
709 }
710 
711 /*----------------------------------------------------------------------------
712 | Returns the sign bit of the extended double-precision floating-point value
713 | `a'.
714 *----------------------------------------------------------------------------*/
715 
716 static inline flag extractFloatx80Sign( floatx80 a )
717 {
718 
719     return a.high>>15;
720 
721 }
722 
723 /*----------------------------------------------------------------------------
724 | Normalizes the subnormal extended double-precision floating-point value
725 | represented by the denormalized significand `aSig'.  The normalized exponent
726 | and significand are stored at the locations pointed to by `zExpPtr' and
727 | `zSigPtr', respectively.
728 *----------------------------------------------------------------------------*/
729 
730 static void
731  normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
732 {
733     int8_t shiftCount;
734 
735     shiftCount = countLeadingZeros64( aSig );
736     *zSigPtr = aSig<<shiftCount;
737     *zExpPtr = 1 - shiftCount;
738 
739 }
740 
741 /*----------------------------------------------------------------------------
742 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
743 | extended double-precision floating-point value, returning the result.
744 *----------------------------------------------------------------------------*/
745 
746 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
747 {
748     floatx80 z;
749 
750     z.low = zSig;
751     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
752     return z;
753 
754 }
755 
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
758 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input.  Ordinarily, the abstract value is
761 | rounded and packed into the extended double-precision format, with the
762 | inexact exception raised if the abstract input cannot be represented
763 | exactly.  However, if the abstract value is too large, the overflow and
764 | inexact exceptions are raised and an infinity or maximal finite value is
765 | returned.  If the abstract value is too small, the input value is rounded to
766 | a subnormal number, and the underflow and inexact exceptions are raised if
767 | the abstract input cannot be represented exactly as a subnormal extended
768 | double-precision floating-point number.
769 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
770 | number of bits as single or double precision, respectively.  Otherwise, the
771 | result is rounded to the full precision of the extended double-precision
772 | format.
773 |     The input significand must be normalized or smaller.  If the input
774 | significand is not normalized, `zExp' must be 0; in that case, the result
775 | returned is a subnormal number, and it must not require rounding.  The
776 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
777 | Floating-Point Arithmetic.
778 *----------------------------------------------------------------------------*/
779 
780 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
781                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
782                                      float_status *status)
783 {
784     int8_t roundingMode;
785     flag roundNearestEven, increment, isTiny;
786     int64_t roundIncrement, roundMask, roundBits;
787 
788     roundingMode = status->float_rounding_mode;
789     roundNearestEven = ( roundingMode == float_round_nearest_even );
790     if ( roundingPrecision == 80 ) goto precision80;
791     if ( roundingPrecision == 64 ) {
792         roundIncrement = LIT64( 0x0000000000000400 );
793         roundMask = LIT64( 0x00000000000007FF );
794     }
795     else if ( roundingPrecision == 32 ) {
796         roundIncrement = LIT64( 0x0000008000000000 );
797         roundMask = LIT64( 0x000000FFFFFFFFFF );
798     }
799     else {
800         goto precision80;
801     }
802     zSig0 |= ( zSig1 != 0 );
803     switch (roundingMode) {
804     case float_round_nearest_even:
805     case float_round_ties_away:
806         break;
807     case float_round_to_zero:
808         roundIncrement = 0;
809         break;
810     case float_round_up:
811         roundIncrement = zSign ? 0 : roundMask;
812         break;
813     case float_round_down:
814         roundIncrement = zSign ? roundMask : 0;
815         break;
816     default:
817         abort();
818     }
819     roundBits = zSig0 & roundMask;
820     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
821         if (    ( 0x7FFE < zExp )
822              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
823            ) {
824             goto overflow;
825         }
826         if ( zExp <= 0 ) {
827             if (status->flush_to_zero) {
828                 float_raise(float_flag_output_denormal, status);
829                 return packFloatx80(zSign, 0, 0);
830             }
831             isTiny =
832                    (status->float_detect_tininess
833                     == float_tininess_before_rounding)
834                 || ( zExp < 0 )
835                 || ( zSig0 <= zSig0 + roundIncrement );
836             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
837             zExp = 0;
838             roundBits = zSig0 & roundMask;
839             if (isTiny && roundBits) {
840                 float_raise(float_flag_underflow, status);
841             }
842             if (roundBits) {
843                 status->float_exception_flags |= float_flag_inexact;
844             }
845             zSig0 += roundIncrement;
846             if ( (int64_t) zSig0 < 0 ) zExp = 1;
847             roundIncrement = roundMask + 1;
848             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
849                 roundMask |= roundIncrement;
850             }
851             zSig0 &= ~ roundMask;
852             return packFloatx80( zSign, zExp, zSig0 );
853         }
854     }
855     if (roundBits) {
856         status->float_exception_flags |= float_flag_inexact;
857     }
858     zSig0 += roundIncrement;
859     if ( zSig0 < roundIncrement ) {
860         ++zExp;
861         zSig0 = LIT64( 0x8000000000000000 );
862     }
863     roundIncrement = roundMask + 1;
864     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
865         roundMask |= roundIncrement;
866     }
867     zSig0 &= ~ roundMask;
868     if ( zSig0 == 0 ) zExp = 0;
869     return packFloatx80( zSign, zExp, zSig0 );
870  precision80:
871     switch (roundingMode) {
872     case float_round_nearest_even:
873     case float_round_ties_away:
874         increment = ((int64_t)zSig1 < 0);
875         break;
876     case float_round_to_zero:
877         increment = 0;
878         break;
879     case float_round_up:
880         increment = !zSign && zSig1;
881         break;
882     case float_round_down:
883         increment = zSign && zSig1;
884         break;
885     default:
886         abort();
887     }
888     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
889         if (    ( 0x7FFE < zExp )
890              || (    ( zExp == 0x7FFE )
891                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
892                   && increment
893                 )
894            ) {
895             roundMask = 0;
896  overflow:
897             float_raise(float_flag_overflow | float_flag_inexact, status);
898             if (    ( roundingMode == float_round_to_zero )
899                  || ( zSign && ( roundingMode == float_round_up ) )
900                  || ( ! zSign && ( roundingMode == float_round_down ) )
901                ) {
902                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
903             }
904             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
905         }
906         if ( zExp <= 0 ) {
907             isTiny =
908                    (status->float_detect_tininess
909                     == float_tininess_before_rounding)
910                 || ( zExp < 0 )
911                 || ! increment
912                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
913             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
914             zExp = 0;
915             if (isTiny && zSig1) {
916                 float_raise(float_flag_underflow, status);
917             }
918             if (zSig1) {
919                 status->float_exception_flags |= float_flag_inexact;
920             }
921             switch (roundingMode) {
922             case float_round_nearest_even:
923             case float_round_ties_away:
924                 increment = ((int64_t)zSig1 < 0);
925                 break;
926             case float_round_to_zero:
927                 increment = 0;
928                 break;
929             case float_round_up:
930                 increment = !zSign && zSig1;
931                 break;
932             case float_round_down:
933                 increment = zSign && zSig1;
934                 break;
935             default:
936                 abort();
937             }
938             if ( increment ) {
939                 ++zSig0;
940                 zSig0 &=
941                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
942                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
943             }
944             return packFloatx80( zSign, zExp, zSig0 );
945         }
946     }
947     if (zSig1) {
948         status->float_exception_flags |= float_flag_inexact;
949     }
950     if ( increment ) {
951         ++zSig0;
952         if ( zSig0 == 0 ) {
953             ++zExp;
954             zSig0 = LIT64( 0x8000000000000000 );
955         }
956         else {
957             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
958         }
959     }
960     else {
961         if ( zSig0 == 0 ) zExp = 0;
962     }
963     return packFloatx80( zSign, zExp, zSig0 );
964 
965 }
966 
967 /*----------------------------------------------------------------------------
968 | Takes an abstract floating-point value having sign `zSign', exponent
969 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
970 | and returns the proper extended double-precision floating-point value
971 | corresponding to the abstract input.  This routine is just like
972 | `roundAndPackFloatx80' except that the input significand does not have to be
973 | normalized.
974 *----------------------------------------------------------------------------*/
975 
976 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
977                                               flag zSign, int32_t zExp,
978                                               uint64_t zSig0, uint64_t zSig1,
979                                               float_status *status)
980 {
981     int8_t shiftCount;
982 
983     if ( zSig0 == 0 ) {
984         zSig0 = zSig1;
985         zSig1 = 0;
986         zExp -= 64;
987     }
988     shiftCount = countLeadingZeros64( zSig0 );
989     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
990     zExp -= shiftCount;
991     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
992                                 zSig0, zSig1, status);
993 
994 }
995 
996 /*----------------------------------------------------------------------------
997 | Returns the least-significant 64 fraction bits of the quadruple-precision
998 | floating-point value `a'.
999 *----------------------------------------------------------------------------*/
1000 
1001 static inline uint64_t extractFloat128Frac1( float128 a )
1002 {
1003 
1004     return a.low;
1005 
1006 }
1007 
1008 /*----------------------------------------------------------------------------
1009 | Returns the most-significant 48 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1012 
1013 static inline uint64_t extractFloat128Frac0( float128 a )
1014 {
1015 
1016     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1017 
1018 }
1019 
1020 /*----------------------------------------------------------------------------
1021 | Returns the exponent bits of the quadruple-precision floating-point value
1022 | `a'.
1023 *----------------------------------------------------------------------------*/
1024 
1025 static inline int32_t extractFloat128Exp( float128 a )
1026 {
1027 
1028     return ( a.high>>48 ) & 0x7FFF;
1029 
1030 }
1031 
1032 /*----------------------------------------------------------------------------
1033 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1034 *----------------------------------------------------------------------------*/
1035 
1036 static inline flag extractFloat128Sign( float128 a )
1037 {
1038 
1039     return a.high>>63;
1040 
1041 }
1042 
1043 /*----------------------------------------------------------------------------
1044 | Normalizes the subnormal quadruple-precision floating-point value
1045 | represented by the denormalized significand formed by the concatenation of
1046 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
1047 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
1048 | significand are stored at the location pointed to by `zSig0Ptr', and the
1049 | least significant 64 bits of the normalized significand are stored at the
1050 | location pointed to by `zSig1Ptr'.
1051 *----------------------------------------------------------------------------*/
1052 
1053 static void
1054  normalizeFloat128Subnormal(
1055      uint64_t aSig0,
1056      uint64_t aSig1,
1057      int32_t *zExpPtr,
1058      uint64_t *zSig0Ptr,
1059      uint64_t *zSig1Ptr
1060  )
1061 {
1062     int8_t shiftCount;
1063 
1064     if ( aSig0 == 0 ) {
1065         shiftCount = countLeadingZeros64( aSig1 ) - 15;
1066         if ( shiftCount < 0 ) {
1067             *zSig0Ptr = aSig1>>( - shiftCount );
1068             *zSig1Ptr = aSig1<<( shiftCount & 63 );
1069         }
1070         else {
1071             *zSig0Ptr = aSig1<<shiftCount;
1072             *zSig1Ptr = 0;
1073         }
1074         *zExpPtr = - shiftCount - 63;
1075     }
1076     else {
1077         shiftCount = countLeadingZeros64( aSig0 ) - 15;
1078         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1079         *zExpPtr = 1 - shiftCount;
1080     }
1081 
1082 }
1083 
1084 /*----------------------------------------------------------------------------
1085 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1086 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1087 | floating-point value, returning the result.  After being shifted into the
1088 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1089 | added together to form the most significant 32 bits of the result.  This
1090 | means that any integer portion of `zSig0' will be added into the exponent.
1091 | Since a properly normalized significand will have an integer portion equal
1092 | to 1, the `zExp' input should be 1 less than the desired result exponent
1093 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1094 | significand.
1095 *----------------------------------------------------------------------------*/
1096 
1097 static inline float128
1098  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1099 {
1100     float128 z;
1101 
1102     z.low = zSig1;
1103     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1104     return z;
1105 
1106 }
1107 
1108 /*----------------------------------------------------------------------------
1109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1110 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1111 | and `zSig2', and returns the proper quadruple-precision floating-point value
1112 | corresponding to the abstract input.  Ordinarily, the abstract value is
1113 | simply rounded and packed into the quadruple-precision format, with the
1114 | inexact exception raised if the abstract input cannot be represented
1115 | exactly.  However, if the abstract value is too large, the overflow and
1116 | inexact exceptions are raised and an infinity or maximal finite value is
1117 | returned.  If the abstract value is too small, the input value is rounded to
1118 | a subnormal number, and the underflow and inexact exceptions are raised if
1119 | the abstract input cannot be represented exactly as a subnormal quadruple-
1120 | precision floating-point number.
1121 |     The input significand must be normalized or smaller.  If the input
1122 | significand is not normalized, `zExp' must be 0; in that case, the result
1123 | returned is a subnormal number, and it must not require rounding.  In the
1124 | usual case that the input significand is normalized, `zExp' must be 1 less
1125 | than the ``true'' floating-point exponent.  The handling of underflow and
1126 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1128 
1129 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1130                                      uint64_t zSig0, uint64_t zSig1,
1131                                      uint64_t zSig2, float_status *status)
1132 {
1133     int8_t roundingMode;
1134     flag roundNearestEven, increment, isTiny;
1135 
1136     roundingMode = status->float_rounding_mode;
1137     roundNearestEven = ( roundingMode == float_round_nearest_even );
1138     switch (roundingMode) {
1139     case float_round_nearest_even:
1140     case float_round_ties_away:
1141         increment = ((int64_t)zSig2 < 0);
1142         break;
1143     case float_round_to_zero:
1144         increment = 0;
1145         break;
1146     case float_round_up:
1147         increment = !zSign && zSig2;
1148         break;
1149     case float_round_down:
1150         increment = zSign && zSig2;
1151         break;
1152     default:
1153         abort();
1154     }
1155     if ( 0x7FFD <= (uint32_t) zExp ) {
1156         if (    ( 0x7FFD < zExp )
1157              || (    ( zExp == 0x7FFD )
1158                   && eq128(
1159                          LIT64( 0x0001FFFFFFFFFFFF ),
1160                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1161                          zSig0,
1162                          zSig1
1163                      )
1164                   && increment
1165                 )
1166            ) {
1167             float_raise(float_flag_overflow | float_flag_inexact, status);
1168             if (    ( roundingMode == float_round_to_zero )
1169                  || ( zSign && ( roundingMode == float_round_up ) )
1170                  || ( ! zSign && ( roundingMode == float_round_down ) )
1171                ) {
1172                 return
1173                     packFloat128(
1174                         zSign,
1175                         0x7FFE,
1176                         LIT64( 0x0000FFFFFFFFFFFF ),
1177                         LIT64( 0xFFFFFFFFFFFFFFFF )
1178                     );
1179             }
1180             return packFloat128( zSign, 0x7FFF, 0, 0 );
1181         }
1182         if ( zExp < 0 ) {
1183             if (status->flush_to_zero) {
1184                 float_raise(float_flag_output_denormal, status);
1185                 return packFloat128(zSign, 0, 0, 0);
1186             }
1187             isTiny =
1188                    (status->float_detect_tininess
1189                     == float_tininess_before_rounding)
1190                 || ( zExp < -1 )
1191                 || ! increment
1192                 || lt128(
1193                        zSig0,
1194                        zSig1,
1195                        LIT64( 0x0001FFFFFFFFFFFF ),
1196                        LIT64( 0xFFFFFFFFFFFFFFFF )
1197                    );
1198             shift128ExtraRightJamming(
1199                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1200             zExp = 0;
1201             if (isTiny && zSig2) {
1202                 float_raise(float_flag_underflow, status);
1203             }
1204             switch (roundingMode) {
1205             case float_round_nearest_even:
1206             case float_round_ties_away:
1207                 increment = ((int64_t)zSig2 < 0);
1208                 break;
1209             case float_round_to_zero:
1210                 increment = 0;
1211                 break;
1212             case float_round_up:
1213                 increment = !zSign && zSig2;
1214                 break;
1215             case float_round_down:
1216                 increment = zSign && zSig2;
1217                 break;
1218             default:
1219                 abort();
1220             }
1221         }
1222     }
1223     if (zSig2) {
1224         status->float_exception_flags |= float_flag_inexact;
1225     }
1226     if ( increment ) {
1227         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1228         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1229     }
1230     else {
1231         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1232     }
1233     return packFloat128( zSign, zExp, zSig0, zSig1 );
1234 
1235 }
1236 
1237 /*----------------------------------------------------------------------------
1238 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1239 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1240 | returns the proper quadruple-precision floating-point value corresponding
1241 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1242 | except that the input significand has fewer bits and does not have to be
1243 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1244 | point exponent.
1245 *----------------------------------------------------------------------------*/
1246 
1247 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1248                                               uint64_t zSig0, uint64_t zSig1,
1249                                               float_status *status)
1250 {
1251     int8_t shiftCount;
1252     uint64_t zSig2;
1253 
1254     if ( zSig0 == 0 ) {
1255         zSig0 = zSig1;
1256         zSig1 = 0;
1257         zExp -= 64;
1258     }
1259     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1260     if ( 0 <= shiftCount ) {
1261         zSig2 = 0;
1262         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1263     }
1264     else {
1265         shift128ExtraRightJamming(
1266             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1267     }
1268     zExp -= shiftCount;
1269     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1270 
1271 }
1272 
1273 /*----------------------------------------------------------------------------
1274 | Returns the result of converting the 32-bit two's complement integer `a'
1275 | to the single-precision floating-point format.  The conversion is performed
1276 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1277 *----------------------------------------------------------------------------*/
1278 
1279 float32 int32_to_float32(int32_t a, float_status *status)
1280 {
1281     flag zSign;
1282 
1283     if ( a == 0 ) return float32_zero;
1284     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1285     zSign = ( a < 0 );
1286     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1287 }
1288 
1289 /*----------------------------------------------------------------------------
1290 | Returns the result of converting the 32-bit two's complement integer `a'
1291 | to the double-precision floating-point format.  The conversion is performed
1292 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293 *----------------------------------------------------------------------------*/
1294 
1295 float64 int32_to_float64(int32_t a, float_status *status)
1296 {
1297     flag zSign;
1298     uint32_t absA;
1299     int8_t shiftCount;
1300     uint64_t zSig;
1301 
1302     if ( a == 0 ) return float64_zero;
1303     zSign = ( a < 0 );
1304     absA = zSign ? - a : a;
1305     shiftCount = countLeadingZeros32( absA ) + 21;
1306     zSig = absA;
1307     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1308 
1309 }
1310 
1311 /*----------------------------------------------------------------------------
1312 | Returns the result of converting the 32-bit two's complement integer `a'
1313 | to the extended double-precision floating-point format.  The conversion
1314 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1315 | Arithmetic.
1316 *----------------------------------------------------------------------------*/
1317 
1318 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1319 {
1320     flag zSign;
1321     uint32_t absA;
1322     int8_t shiftCount;
1323     uint64_t zSig;
1324 
1325     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1326     zSign = ( a < 0 );
1327     absA = zSign ? - a : a;
1328     shiftCount = countLeadingZeros32( absA ) + 32;
1329     zSig = absA;
1330     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1331 
1332 }
1333 
1334 /*----------------------------------------------------------------------------
1335 | Returns the result of converting the 32-bit two's complement integer `a' to
1336 | the quadruple-precision floating-point format.  The conversion is performed
1337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338 *----------------------------------------------------------------------------*/
1339 
1340 float128 int32_to_float128(int32_t a, float_status *status)
1341 {
1342     flag zSign;
1343     uint32_t absA;
1344     int8_t shiftCount;
1345     uint64_t zSig0;
1346 
1347     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1348     zSign = ( a < 0 );
1349     absA = zSign ? - a : a;
1350     shiftCount = countLeadingZeros32( absA ) + 17;
1351     zSig0 = absA;
1352     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1353 
1354 }
1355 
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the 64-bit two's complement integer `a'
1358 | to the single-precision floating-point format.  The conversion is performed
1359 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1360 *----------------------------------------------------------------------------*/
1361 
1362 float32 int64_to_float32(int64_t a, float_status *status)
1363 {
1364     flag zSign;
1365     uint64_t absA;
1366     int8_t shiftCount;
1367 
1368     if ( a == 0 ) return float32_zero;
1369     zSign = ( a < 0 );
1370     absA = zSign ? - a : a;
1371     shiftCount = countLeadingZeros64( absA ) - 40;
1372     if ( 0 <= shiftCount ) {
1373         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1374     }
1375     else {
1376         shiftCount += 7;
1377         if ( shiftCount < 0 ) {
1378             shift64RightJamming( absA, - shiftCount, &absA );
1379         }
1380         else {
1381             absA <<= shiftCount;
1382         }
1383         return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1384     }
1385 
1386 }
1387 
1388 /*----------------------------------------------------------------------------
1389 | Returns the result of converting the 64-bit two's complement integer `a'
1390 | to the double-precision floating-point format.  The conversion is performed
1391 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1392 *----------------------------------------------------------------------------*/
1393 
1394 float64 int64_to_float64(int64_t a, float_status *status)
1395 {
1396     flag zSign;
1397 
1398     if ( a == 0 ) return float64_zero;
1399     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1400         return packFloat64( 1, 0x43E, 0 );
1401     }
1402     zSign = ( a < 0 );
1403     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1404 }
1405 
1406 /*----------------------------------------------------------------------------
1407 | Returns the result of converting the 64-bit two's complement integer `a'
1408 | to the extended double-precision floating-point format.  The conversion
1409 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1410 | Arithmetic.
1411 *----------------------------------------------------------------------------*/
1412 
1413 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1414 {
1415     flag zSign;
1416     uint64_t absA;
1417     int8_t shiftCount;
1418 
1419     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1420     zSign = ( a < 0 );
1421     absA = zSign ? - a : a;
1422     shiftCount = countLeadingZeros64( absA );
1423     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1424 
1425 }
1426 
1427 /*----------------------------------------------------------------------------
1428 | Returns the result of converting the 64-bit two's complement integer `a' to
1429 | the quadruple-precision floating-point format.  The conversion is performed
1430 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1431 *----------------------------------------------------------------------------*/
1432 
1433 float128 int64_to_float128(int64_t a, float_status *status)
1434 {
1435     flag zSign;
1436     uint64_t absA;
1437     int8_t shiftCount;
1438     int32_t zExp;
1439     uint64_t zSig0, zSig1;
1440 
1441     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1442     zSign = ( a < 0 );
1443     absA = zSign ? - a : a;
1444     shiftCount = countLeadingZeros64( absA ) + 49;
1445     zExp = 0x406E - shiftCount;
1446     if ( 64 <= shiftCount ) {
1447         zSig1 = 0;
1448         zSig0 = absA;
1449         shiftCount -= 64;
1450     }
1451     else {
1452         zSig1 = absA;
1453         zSig0 = 0;
1454     }
1455     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1456     return packFloat128( zSign, zExp, zSig0, zSig1 );
1457 
1458 }
1459 
1460 /*----------------------------------------------------------------------------
1461 | Returns the result of converting the 64-bit unsigned integer `a'
1462 | to the single-precision floating-point format.  The conversion is performed
1463 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1464 *----------------------------------------------------------------------------*/
1465 
1466 float32 uint64_to_float32(uint64_t a, float_status *status)
1467 {
1468     int shiftcount;
1469 
1470     if (a == 0) {
1471         return float32_zero;
1472     }
1473 
1474     /* Determine (left) shift needed to put first set bit into bit posn 23
1475      * (since packFloat32() expects the binary point between bits 23 and 22);
1476      * this is the fast case for smallish numbers.
1477      */
1478     shiftcount = countLeadingZeros64(a) - 40;
1479     if (shiftcount >= 0) {
1480         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1481     }
1482     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1483      * expects the binary point between bits 30 and 29, hence the + 7.
1484      */
1485     shiftcount += 7;
1486     if (shiftcount < 0) {
1487         shift64RightJamming(a, -shiftcount, &a);
1488     } else {
1489         a <<= shiftcount;
1490     }
1491 
1492     return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1493 }
1494 
1495 /*----------------------------------------------------------------------------
1496 | Returns the result of converting the 64-bit unsigned integer `a'
1497 | to the double-precision floating-point format.  The conversion is performed
1498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1499 *----------------------------------------------------------------------------*/
1500 
1501 float64 uint64_to_float64(uint64_t a, float_status *status)
1502 {
1503     int exp = 0x43C;
1504     int shiftcount;
1505 
1506     if (a == 0) {
1507         return float64_zero;
1508     }
1509 
1510     shiftcount = countLeadingZeros64(a) - 1;
1511     if (shiftcount < 0) {
1512         shift64RightJamming(a, -shiftcount, &a);
1513     } else {
1514         a <<= shiftcount;
1515     }
1516     return roundAndPackFloat64(0, exp - shiftcount, a, status);
1517 }
1518 
1519 /*----------------------------------------------------------------------------
1520 | Returns the result of converting the 64-bit unsigned integer `a'
1521 | to the quadruple-precision floating-point format.  The conversion is performed
1522 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1523 *----------------------------------------------------------------------------*/
1524 
1525 float128 uint64_to_float128(uint64_t a, float_status *status)
1526 {
1527     if (a == 0) {
1528         return float128_zero;
1529     }
1530     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1531 }
1532 
1533 /*----------------------------------------------------------------------------
1534 | Returns the result of converting the single-precision floating-point value
1535 | `a' to the 32-bit two's complement integer format.  The conversion is
1536 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1537 | Arithmetic---which means in particular that the conversion is rounded
1538 | according to the current rounding mode.  If `a' is a NaN, the largest
1539 | positive integer is returned.  Otherwise, if the conversion overflows, the
1540 | largest integer with the same sign as `a' is returned.
1541 *----------------------------------------------------------------------------*/
1542 
1543 int32_t float32_to_int32(float32 a, float_status *status)
1544 {
1545     flag aSign;
1546     int aExp;
1547     int shiftCount;
1548     uint32_t aSig;
1549     uint64_t aSig64;
1550 
1551     a = float32_squash_input_denormal(a, status);
1552     aSig = extractFloat32Frac( a );
1553     aExp = extractFloat32Exp( a );
1554     aSign = extractFloat32Sign( a );
1555     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1556     if ( aExp ) aSig |= 0x00800000;
1557     shiftCount = 0xAF - aExp;
1558     aSig64 = aSig;
1559     aSig64 <<= 32;
1560     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1561     return roundAndPackInt32(aSign, aSig64, status);
1562 
1563 }
1564 
1565 /*----------------------------------------------------------------------------
1566 | Returns the result of converting the single-precision floating-point value
1567 | `a' to the 32-bit two's complement integer format.  The conversion is
1568 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1569 | Arithmetic, except that the conversion is always rounded toward zero.
1570 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1571 | the conversion overflows, the largest integer with the same sign as `a' is
1572 | returned.
1573 *----------------------------------------------------------------------------*/
1574 
1575 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1576 {
1577     flag aSign;
1578     int aExp;
1579     int shiftCount;
1580     uint32_t aSig;
1581     int32_t z;
1582     a = float32_squash_input_denormal(a, status);
1583 
1584     aSig = extractFloat32Frac( a );
1585     aExp = extractFloat32Exp( a );
1586     aSign = extractFloat32Sign( a );
1587     shiftCount = aExp - 0x9E;
1588     if ( 0 <= shiftCount ) {
1589         if ( float32_val(a) != 0xCF000000 ) {
1590             float_raise(float_flag_invalid, status);
1591             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1592         }
1593         return (int32_t) 0x80000000;
1594     }
1595     else if ( aExp <= 0x7E ) {
1596         if (aExp | aSig) {
1597             status->float_exception_flags |= float_flag_inexact;
1598         }
1599         return 0;
1600     }
1601     aSig = ( aSig | 0x00800000 )<<8;
1602     z = aSig>>( - shiftCount );
1603     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1604         status->float_exception_flags |= float_flag_inexact;
1605     }
1606     if ( aSign ) z = - z;
1607     return z;
1608 
1609 }
1610 
1611 /*----------------------------------------------------------------------------
1612 | Returns the result of converting the single-precision floating-point value
1613 | `a' to the 16-bit two's complement integer format.  The conversion is
1614 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1615 | Arithmetic, except that the conversion is always rounded toward zero.
1616 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1617 | the conversion overflows, the largest integer with the same sign as `a' is
1618 | returned.
1619 *----------------------------------------------------------------------------*/
1620 
1621 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1622 {
1623     flag aSign;
1624     int aExp;
1625     int shiftCount;
1626     uint32_t aSig;
1627     int32_t z;
1628 
1629     aSig = extractFloat32Frac( a );
1630     aExp = extractFloat32Exp( a );
1631     aSign = extractFloat32Sign( a );
1632     shiftCount = aExp - 0x8E;
1633     if ( 0 <= shiftCount ) {
1634         if ( float32_val(a) != 0xC7000000 ) {
1635             float_raise(float_flag_invalid, status);
1636             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1637                 return 0x7FFF;
1638             }
1639         }
1640         return (int32_t) 0xffff8000;
1641     }
1642     else if ( aExp <= 0x7E ) {
1643         if ( aExp | aSig ) {
1644             status->float_exception_flags |= float_flag_inexact;
1645         }
1646         return 0;
1647     }
1648     shiftCount -= 0x10;
1649     aSig = ( aSig | 0x00800000 )<<8;
1650     z = aSig>>( - shiftCount );
1651     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1652         status->float_exception_flags |= float_flag_inexact;
1653     }
1654     if ( aSign ) {
1655         z = - z;
1656     }
1657     return z;
1658 
1659 }
1660 
1661 /*----------------------------------------------------------------------------
1662 | Returns the result of converting the single-precision floating-point value
1663 | `a' to the 64-bit two's complement integer format.  The conversion is
1664 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1665 | Arithmetic---which means in particular that the conversion is rounded
1666 | according to the current rounding mode.  If `a' is a NaN, the largest
1667 | positive integer is returned.  Otherwise, if the conversion overflows, the
1668 | largest integer with the same sign as `a' is returned.
1669 *----------------------------------------------------------------------------*/
1670 
1671 int64_t float32_to_int64(float32 a, float_status *status)
1672 {
1673     flag aSign;
1674     int aExp;
1675     int shiftCount;
1676     uint32_t aSig;
1677     uint64_t aSig64, aSigExtra;
1678     a = float32_squash_input_denormal(a, status);
1679 
1680     aSig = extractFloat32Frac( a );
1681     aExp = extractFloat32Exp( a );
1682     aSign = extractFloat32Sign( a );
1683     shiftCount = 0xBE - aExp;
1684     if ( shiftCount < 0 ) {
1685         float_raise(float_flag_invalid, status);
1686         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1687             return LIT64( 0x7FFFFFFFFFFFFFFF );
1688         }
1689         return (int64_t) LIT64( 0x8000000000000000 );
1690     }
1691     if ( aExp ) aSig |= 0x00800000;
1692     aSig64 = aSig;
1693     aSig64 <<= 40;
1694     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1695     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1696 
1697 }
1698 
1699 /*----------------------------------------------------------------------------
1700 | Returns the result of converting the single-precision floating-point value
1701 | `a' to the 64-bit unsigned integer format.  The conversion is
1702 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1703 | Arithmetic---which means in particular that the conversion is rounded
1704 | according to the current rounding mode.  If `a' is a NaN, the largest
1705 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1706 | largest unsigned integer is returned.  If the 'a' is negative, the result
1707 | is rounded and zero is returned; values that do not round to zero will
1708 | raise the inexact exception flag.
1709 *----------------------------------------------------------------------------*/
1710 
1711 uint64_t float32_to_uint64(float32 a, float_status *status)
1712 {
1713     flag aSign;
1714     int aExp;
1715     int shiftCount;
1716     uint32_t aSig;
1717     uint64_t aSig64, aSigExtra;
1718     a = float32_squash_input_denormal(a, status);
1719 
1720     aSig = extractFloat32Frac(a);
1721     aExp = extractFloat32Exp(a);
1722     aSign = extractFloat32Sign(a);
1723     if ((aSign) && (aExp > 126)) {
1724         float_raise(float_flag_invalid, status);
1725         if (float32_is_any_nan(a)) {
1726             return LIT64(0xFFFFFFFFFFFFFFFF);
1727         } else {
1728             return 0;
1729         }
1730     }
1731     shiftCount = 0xBE - aExp;
1732     if (aExp) {
1733         aSig |= 0x00800000;
1734     }
1735     if (shiftCount < 0) {
1736         float_raise(float_flag_invalid, status);
1737         return LIT64(0xFFFFFFFFFFFFFFFF);
1738     }
1739 
1740     aSig64 = aSig;
1741     aSig64 <<= 40;
1742     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1743     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1744 }
1745 
1746 /*----------------------------------------------------------------------------
1747 | Returns the result of converting the single-precision floating-point value
1748 | `a' to the 64-bit unsigned integer format.  The conversion is
1749 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1750 | Arithmetic, except that the conversion is always rounded toward zero.  If
1751 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1752 | conversion overflows, the largest unsigned integer is returned.  If the
1753 | 'a' is negative, the result is rounded and zero is returned; values that do
1754 | not round to zero will raise the inexact flag.
1755 *----------------------------------------------------------------------------*/
1756 
1757 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1758 {
1759     signed char current_rounding_mode = status->float_rounding_mode;
1760     set_float_rounding_mode(float_round_to_zero, status);
1761     int64_t v = float32_to_uint64(a, status);
1762     set_float_rounding_mode(current_rounding_mode, status);
1763     return v;
1764 }
1765 
1766 /*----------------------------------------------------------------------------
1767 | Returns the result of converting the single-precision floating-point value
1768 | `a' to the 64-bit two's complement integer format.  The conversion is
1769 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1770 | Arithmetic, except that the conversion is always rounded toward zero.  If
1771 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1772 | conversion overflows, the largest integer with the same sign as `a' is
1773 | returned.
1774 *----------------------------------------------------------------------------*/
1775 
1776 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1777 {
1778     flag aSign;
1779     int aExp;
1780     int shiftCount;
1781     uint32_t aSig;
1782     uint64_t aSig64;
1783     int64_t z;
1784     a = float32_squash_input_denormal(a, status);
1785 
1786     aSig = extractFloat32Frac( a );
1787     aExp = extractFloat32Exp( a );
1788     aSign = extractFloat32Sign( a );
1789     shiftCount = aExp - 0xBE;
1790     if ( 0 <= shiftCount ) {
1791         if ( float32_val(a) != 0xDF000000 ) {
1792             float_raise(float_flag_invalid, status);
1793             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1794                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1795             }
1796         }
1797         return (int64_t) LIT64( 0x8000000000000000 );
1798     }
1799     else if ( aExp <= 0x7E ) {
1800         if (aExp | aSig) {
1801             status->float_exception_flags |= float_flag_inexact;
1802         }
1803         return 0;
1804     }
1805     aSig64 = aSig | 0x00800000;
1806     aSig64 <<= 40;
1807     z = aSig64>>( - shiftCount );
1808     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1809         status->float_exception_flags |= float_flag_inexact;
1810     }
1811     if ( aSign ) z = - z;
1812     return z;
1813 
1814 }
1815 
1816 /*----------------------------------------------------------------------------
1817 | Returns the result of converting the single-precision floating-point value
1818 | `a' to the double-precision floating-point format.  The conversion is
1819 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1820 | Arithmetic.
1821 *----------------------------------------------------------------------------*/
1822 
1823 float64 float32_to_float64(float32 a, float_status *status)
1824 {
1825     flag aSign;
1826     int aExp;
1827     uint32_t aSig;
1828     a = float32_squash_input_denormal(a, status);
1829 
1830     aSig = extractFloat32Frac( a );
1831     aExp = extractFloat32Exp( a );
1832     aSign = extractFloat32Sign( a );
1833     if ( aExp == 0xFF ) {
1834         if (aSig) {
1835             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1836         }
1837         return packFloat64( aSign, 0x7FF, 0 );
1838     }
1839     if ( aExp == 0 ) {
1840         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1841         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1842         --aExp;
1843     }
1844     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1845 
1846 }
1847 
1848 /*----------------------------------------------------------------------------
1849 | Returns the result of converting the single-precision floating-point value
1850 | `a' to the extended double-precision floating-point format.  The conversion
1851 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1852 | Arithmetic.
1853 *----------------------------------------------------------------------------*/
1854 
1855 floatx80 float32_to_floatx80(float32 a, float_status *status)
1856 {
1857     flag aSign;
1858     int aExp;
1859     uint32_t aSig;
1860 
1861     a = float32_squash_input_denormal(a, status);
1862     aSig = extractFloat32Frac( a );
1863     aExp = extractFloat32Exp( a );
1864     aSign = extractFloat32Sign( a );
1865     if ( aExp == 0xFF ) {
1866         if (aSig) {
1867             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1868         }
1869         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1870     }
1871     if ( aExp == 0 ) {
1872         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1873         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1874     }
1875     aSig |= 0x00800000;
1876     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1877 
1878 }
1879 
1880 /*----------------------------------------------------------------------------
1881 | Returns the result of converting the single-precision floating-point value
1882 | `a' to the double-precision floating-point format.  The conversion is
1883 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1884 | Arithmetic.
1885 *----------------------------------------------------------------------------*/
1886 
1887 float128 float32_to_float128(float32 a, float_status *status)
1888 {
1889     flag aSign;
1890     int aExp;
1891     uint32_t aSig;
1892 
1893     a = float32_squash_input_denormal(a, status);
1894     aSig = extractFloat32Frac( a );
1895     aExp = extractFloat32Exp( a );
1896     aSign = extractFloat32Sign( a );
1897     if ( aExp == 0xFF ) {
1898         if (aSig) {
1899             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1900         }
1901         return packFloat128( aSign, 0x7FFF, 0, 0 );
1902     }
1903     if ( aExp == 0 ) {
1904         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1905         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1906         --aExp;
1907     }
1908     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1909 
1910 }
1911 
1912 /*----------------------------------------------------------------------------
1913 | Rounds the single-precision floating-point value `a' to an integer, and
1914 | returns the result as a single-precision floating-point value.  The
1915 | operation is performed according to the IEC/IEEE Standard for Binary
1916 | Floating-Point Arithmetic.
1917 *----------------------------------------------------------------------------*/
1918 
1919 float32 float32_round_to_int(float32 a, float_status *status)
1920 {
1921     flag aSign;
1922     int aExp;
1923     uint32_t lastBitMask, roundBitsMask;
1924     uint32_t z;
1925     a = float32_squash_input_denormal(a, status);
1926 
1927     aExp = extractFloat32Exp( a );
1928     if ( 0x96 <= aExp ) {
1929         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1930             return propagateFloat32NaN(a, a, status);
1931         }
1932         return a;
1933     }
1934     if ( aExp <= 0x7E ) {
1935         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1936         status->float_exception_flags |= float_flag_inexact;
1937         aSign = extractFloat32Sign( a );
1938         switch (status->float_rounding_mode) {
1939          case float_round_nearest_even:
1940             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1941                 return packFloat32( aSign, 0x7F, 0 );
1942             }
1943             break;
1944         case float_round_ties_away:
1945             if (aExp == 0x7E) {
1946                 return packFloat32(aSign, 0x7F, 0);
1947             }
1948             break;
1949          case float_round_down:
1950             return make_float32(aSign ? 0xBF800000 : 0);
1951          case float_round_up:
1952             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1953         }
1954         return packFloat32( aSign, 0, 0 );
1955     }
1956     lastBitMask = 1;
1957     lastBitMask <<= 0x96 - aExp;
1958     roundBitsMask = lastBitMask - 1;
1959     z = float32_val(a);
1960     switch (status->float_rounding_mode) {
1961     case float_round_nearest_even:
1962         z += lastBitMask>>1;
1963         if ((z & roundBitsMask) == 0) {
1964             z &= ~lastBitMask;
1965         }
1966         break;
1967     case float_round_ties_away:
1968         z += lastBitMask >> 1;
1969         break;
1970     case float_round_to_zero:
1971         break;
1972     case float_round_up:
1973         if (!extractFloat32Sign(make_float32(z))) {
1974             z += roundBitsMask;
1975         }
1976         break;
1977     case float_round_down:
1978         if (extractFloat32Sign(make_float32(z))) {
1979             z += roundBitsMask;
1980         }
1981         break;
1982     default:
1983         abort();
1984     }
1985     z &= ~ roundBitsMask;
1986     if (z != float32_val(a)) {
1987         status->float_exception_flags |= float_flag_inexact;
1988     }
1989     return make_float32(z);
1990 
1991 }
1992 
1993 /*----------------------------------------------------------------------------
1994 | Returns the result of adding the absolute values of the single-precision
1995 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1996 | before being returned.  `zSign' is ignored if the result is a NaN.
1997 | The addition is performed according to the IEC/IEEE Standard for Binary
1998 | Floating-Point Arithmetic.
1999 *----------------------------------------------------------------------------*/
2000 
2001 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2002                               float_status *status)
2003 {
2004     int aExp, bExp, zExp;
2005     uint32_t aSig, bSig, zSig;
2006     int expDiff;
2007 
2008     aSig = extractFloat32Frac( a );
2009     aExp = extractFloat32Exp( a );
2010     bSig = extractFloat32Frac( b );
2011     bExp = extractFloat32Exp( b );
2012     expDiff = aExp - bExp;
2013     aSig <<= 6;
2014     bSig <<= 6;
2015     if ( 0 < expDiff ) {
2016         if ( aExp == 0xFF ) {
2017             if (aSig) {
2018                 return propagateFloat32NaN(a, b, status);
2019             }
2020             return a;
2021         }
2022         if ( bExp == 0 ) {
2023             --expDiff;
2024         }
2025         else {
2026             bSig |= 0x20000000;
2027         }
2028         shift32RightJamming( bSig, expDiff, &bSig );
2029         zExp = aExp;
2030     }
2031     else if ( expDiff < 0 ) {
2032         if ( bExp == 0xFF ) {
2033             if (bSig) {
2034                 return propagateFloat32NaN(a, b, status);
2035             }
2036             return packFloat32( zSign, 0xFF, 0 );
2037         }
2038         if ( aExp == 0 ) {
2039             ++expDiff;
2040         }
2041         else {
2042             aSig |= 0x20000000;
2043         }
2044         shift32RightJamming( aSig, - expDiff, &aSig );
2045         zExp = bExp;
2046     }
2047     else {
2048         if ( aExp == 0xFF ) {
2049             if (aSig | bSig) {
2050                 return propagateFloat32NaN(a, b, status);
2051             }
2052             return a;
2053         }
2054         if ( aExp == 0 ) {
2055             if (status->flush_to_zero) {
2056                 if (aSig | bSig) {
2057                     float_raise(float_flag_output_denormal, status);
2058                 }
2059                 return packFloat32(zSign, 0, 0);
2060             }
2061             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2062         }
2063         zSig = 0x40000000 + aSig + bSig;
2064         zExp = aExp;
2065         goto roundAndPack;
2066     }
2067     aSig |= 0x20000000;
2068     zSig = ( aSig + bSig )<<1;
2069     --zExp;
2070     if ( (int32_t) zSig < 0 ) {
2071         zSig = aSig + bSig;
2072         ++zExp;
2073     }
2074  roundAndPack:
2075     return roundAndPackFloat32(zSign, zExp, zSig, status);
2076 
2077 }
2078 
2079 /*----------------------------------------------------------------------------
2080 | Returns the result of subtracting the absolute values of the single-
2081 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2082 | difference is negated before being returned.  `zSign' is ignored if the
2083 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2084 | Standard for Binary Floating-Point Arithmetic.
2085 *----------------------------------------------------------------------------*/
2086 
2087 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2088                               float_status *status)
2089 {
2090     int aExp, bExp, zExp;
2091     uint32_t aSig, bSig, zSig;
2092     int expDiff;
2093 
2094     aSig = extractFloat32Frac( a );
2095     aExp = extractFloat32Exp( a );
2096     bSig = extractFloat32Frac( b );
2097     bExp = extractFloat32Exp( b );
2098     expDiff = aExp - bExp;
2099     aSig <<= 7;
2100     bSig <<= 7;
2101     if ( 0 < expDiff ) goto aExpBigger;
2102     if ( expDiff < 0 ) goto bExpBigger;
2103     if ( aExp == 0xFF ) {
2104         if (aSig | bSig) {
2105             return propagateFloat32NaN(a, b, status);
2106         }
2107         float_raise(float_flag_invalid, status);
2108         return float32_default_nan(status);
2109     }
2110     if ( aExp == 0 ) {
2111         aExp = 1;
2112         bExp = 1;
2113     }
2114     if ( bSig < aSig ) goto aBigger;
2115     if ( aSig < bSig ) goto bBigger;
2116     return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2117  bExpBigger:
2118     if ( bExp == 0xFF ) {
2119         if (bSig) {
2120             return propagateFloat32NaN(a, b, status);
2121         }
2122         return packFloat32( zSign ^ 1, 0xFF, 0 );
2123     }
2124     if ( aExp == 0 ) {
2125         ++expDiff;
2126     }
2127     else {
2128         aSig |= 0x40000000;
2129     }
2130     shift32RightJamming( aSig, - expDiff, &aSig );
2131     bSig |= 0x40000000;
2132  bBigger:
2133     zSig = bSig - aSig;
2134     zExp = bExp;
2135     zSign ^= 1;
2136     goto normalizeRoundAndPack;
2137  aExpBigger:
2138     if ( aExp == 0xFF ) {
2139         if (aSig) {
2140             return propagateFloat32NaN(a, b, status);
2141         }
2142         return a;
2143     }
2144     if ( bExp == 0 ) {
2145         --expDiff;
2146     }
2147     else {
2148         bSig |= 0x40000000;
2149     }
2150     shift32RightJamming( bSig, expDiff, &bSig );
2151     aSig |= 0x40000000;
2152  aBigger:
2153     zSig = aSig - bSig;
2154     zExp = aExp;
2155  normalizeRoundAndPack:
2156     --zExp;
2157     return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2158 
2159 }
2160 
2161 /*----------------------------------------------------------------------------
2162 | Returns the result of adding the single-precision floating-point values `a'
2163 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2164 | Binary Floating-Point Arithmetic.
2165 *----------------------------------------------------------------------------*/
2166 
2167 float32 float32_add(float32 a, float32 b, float_status *status)
2168 {
2169     flag aSign, bSign;
2170     a = float32_squash_input_denormal(a, status);
2171     b = float32_squash_input_denormal(b, status);
2172 
2173     aSign = extractFloat32Sign( a );
2174     bSign = extractFloat32Sign( b );
2175     if ( aSign == bSign ) {
2176         return addFloat32Sigs(a, b, aSign, status);
2177     }
2178     else {
2179         return subFloat32Sigs(a, b, aSign, status);
2180     }
2181 
2182 }
2183 
2184 /*----------------------------------------------------------------------------
2185 | Returns the result of subtracting the single-precision floating-point values
2186 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2187 | for Binary Floating-Point Arithmetic.
2188 *----------------------------------------------------------------------------*/
2189 
2190 float32 float32_sub(float32 a, float32 b, float_status *status)
2191 {
2192     flag aSign, bSign;
2193     a = float32_squash_input_denormal(a, status);
2194     b = float32_squash_input_denormal(b, status);
2195 
2196     aSign = extractFloat32Sign( a );
2197     bSign = extractFloat32Sign( b );
2198     if ( aSign == bSign ) {
2199         return subFloat32Sigs(a, b, aSign, status);
2200     }
2201     else {
2202         return addFloat32Sigs(a, b, aSign, status);
2203     }
2204 
2205 }
2206 
2207 /*----------------------------------------------------------------------------
2208 | Returns the result of multiplying the single-precision floating-point values
2209 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2210 | for Binary Floating-Point Arithmetic.
2211 *----------------------------------------------------------------------------*/
2212 
2213 float32 float32_mul(float32 a, float32 b, float_status *status)
2214 {
2215     flag aSign, bSign, zSign;
2216     int aExp, bExp, zExp;
2217     uint32_t aSig, bSig;
2218     uint64_t zSig64;
2219     uint32_t zSig;
2220 
2221     a = float32_squash_input_denormal(a, status);
2222     b = float32_squash_input_denormal(b, status);
2223 
2224     aSig = extractFloat32Frac( a );
2225     aExp = extractFloat32Exp( a );
2226     aSign = extractFloat32Sign( a );
2227     bSig = extractFloat32Frac( b );
2228     bExp = extractFloat32Exp( b );
2229     bSign = extractFloat32Sign( b );
2230     zSign = aSign ^ bSign;
2231     if ( aExp == 0xFF ) {
2232         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2233             return propagateFloat32NaN(a, b, status);
2234         }
2235         if ( ( bExp | bSig ) == 0 ) {
2236             float_raise(float_flag_invalid, status);
2237             return float32_default_nan(status);
2238         }
2239         return packFloat32( zSign, 0xFF, 0 );
2240     }
2241     if ( bExp == 0xFF ) {
2242         if (bSig) {
2243             return propagateFloat32NaN(a, b, status);
2244         }
2245         if ( ( aExp | aSig ) == 0 ) {
2246             float_raise(float_flag_invalid, status);
2247             return float32_default_nan(status);
2248         }
2249         return packFloat32( zSign, 0xFF, 0 );
2250     }
2251     if ( aExp == 0 ) {
2252         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2253         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2254     }
2255     if ( bExp == 0 ) {
2256         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2257         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2258     }
2259     zExp = aExp + bExp - 0x7F;
2260     aSig = ( aSig | 0x00800000 )<<7;
2261     bSig = ( bSig | 0x00800000 )<<8;
2262     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2263     zSig = zSig64;
2264     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2265         zSig <<= 1;
2266         --zExp;
2267     }
2268     return roundAndPackFloat32(zSign, zExp, zSig, status);
2269 
2270 }
2271 
2272 /*----------------------------------------------------------------------------
2273 | Returns the result of dividing the single-precision floating-point value `a'
2274 | by the corresponding value `b'.  The operation is performed according to the
2275 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276 *----------------------------------------------------------------------------*/
2277 
2278 float32 float32_div(float32 a, float32 b, float_status *status)
2279 {
2280     flag aSign, bSign, zSign;
2281     int aExp, bExp, zExp;
2282     uint32_t aSig, bSig, zSig;
2283     a = float32_squash_input_denormal(a, status);
2284     b = float32_squash_input_denormal(b, status);
2285 
2286     aSig = extractFloat32Frac( a );
2287     aExp = extractFloat32Exp( a );
2288     aSign = extractFloat32Sign( a );
2289     bSig = extractFloat32Frac( b );
2290     bExp = extractFloat32Exp( b );
2291     bSign = extractFloat32Sign( b );
2292     zSign = aSign ^ bSign;
2293     if ( aExp == 0xFF ) {
2294         if (aSig) {
2295             return propagateFloat32NaN(a, b, status);
2296         }
2297         if ( bExp == 0xFF ) {
2298             if (bSig) {
2299                 return propagateFloat32NaN(a, b, status);
2300             }
2301             float_raise(float_flag_invalid, status);
2302             return float32_default_nan(status);
2303         }
2304         return packFloat32( zSign, 0xFF, 0 );
2305     }
2306     if ( bExp == 0xFF ) {
2307         if (bSig) {
2308             return propagateFloat32NaN(a, b, status);
2309         }
2310         return packFloat32( zSign, 0, 0 );
2311     }
2312     if ( bExp == 0 ) {
2313         if ( bSig == 0 ) {
2314             if ( ( aExp | aSig ) == 0 ) {
2315                 float_raise(float_flag_invalid, status);
2316                 return float32_default_nan(status);
2317             }
2318             float_raise(float_flag_divbyzero, status);
2319             return packFloat32( zSign, 0xFF, 0 );
2320         }
2321         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2322     }
2323     if ( aExp == 0 ) {
2324         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2325         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2326     }
2327     zExp = aExp - bExp + 0x7D;
2328     aSig = ( aSig | 0x00800000 )<<7;
2329     bSig = ( bSig | 0x00800000 )<<8;
2330     if ( bSig <= ( aSig + aSig ) ) {
2331         aSig >>= 1;
2332         ++zExp;
2333     }
2334     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2335     if ( ( zSig & 0x3F ) == 0 ) {
2336         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2337     }
2338     return roundAndPackFloat32(zSign, zExp, zSig, status);
2339 
2340 }
2341 
2342 /*----------------------------------------------------------------------------
2343 | Returns the remainder of the single-precision floating-point value `a'
2344 | with respect to the corresponding value `b'.  The operation is performed
2345 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2346 *----------------------------------------------------------------------------*/
2347 
2348 float32 float32_rem(float32 a, float32 b, float_status *status)
2349 {
2350     flag aSign, zSign;
2351     int aExp, bExp, expDiff;
2352     uint32_t aSig, bSig;
2353     uint32_t q;
2354     uint64_t aSig64, bSig64, q64;
2355     uint32_t alternateASig;
2356     int32_t sigMean;
2357     a = float32_squash_input_denormal(a, status);
2358     b = float32_squash_input_denormal(b, status);
2359 
2360     aSig = extractFloat32Frac( a );
2361     aExp = extractFloat32Exp( a );
2362     aSign = extractFloat32Sign( a );
2363     bSig = extractFloat32Frac( b );
2364     bExp = extractFloat32Exp( b );
2365     if ( aExp == 0xFF ) {
2366         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2367             return propagateFloat32NaN(a, b, status);
2368         }
2369         float_raise(float_flag_invalid, status);
2370         return float32_default_nan(status);
2371     }
2372     if ( bExp == 0xFF ) {
2373         if (bSig) {
2374             return propagateFloat32NaN(a, b, status);
2375         }
2376         return a;
2377     }
2378     if ( bExp == 0 ) {
2379         if ( bSig == 0 ) {
2380             float_raise(float_flag_invalid, status);
2381             return float32_default_nan(status);
2382         }
2383         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2384     }
2385     if ( aExp == 0 ) {
2386         if ( aSig == 0 ) return a;
2387         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2388     }
2389     expDiff = aExp - bExp;
2390     aSig |= 0x00800000;
2391     bSig |= 0x00800000;
2392     if ( expDiff < 32 ) {
2393         aSig <<= 8;
2394         bSig <<= 8;
2395         if ( expDiff < 0 ) {
2396             if ( expDiff < -1 ) return a;
2397             aSig >>= 1;
2398         }
2399         q = ( bSig <= aSig );
2400         if ( q ) aSig -= bSig;
2401         if ( 0 < expDiff ) {
2402             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2403             q >>= 32 - expDiff;
2404             bSig >>= 2;
2405             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2406         }
2407         else {
2408             aSig >>= 2;
2409             bSig >>= 2;
2410         }
2411     }
2412     else {
2413         if ( bSig <= aSig ) aSig -= bSig;
2414         aSig64 = ( (uint64_t) aSig )<<40;
2415         bSig64 = ( (uint64_t) bSig )<<40;
2416         expDiff -= 64;
2417         while ( 0 < expDiff ) {
2418             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2419             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2420             aSig64 = - ( ( bSig * q64 )<<38 );
2421             expDiff -= 62;
2422         }
2423         expDiff += 64;
2424         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2425         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2426         q = q64>>( 64 - expDiff );
2427         bSig <<= 6;
2428         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2429     }
2430     do {
2431         alternateASig = aSig;
2432         ++q;
2433         aSig -= bSig;
2434     } while ( 0 <= (int32_t) aSig );
2435     sigMean = aSig + alternateASig;
2436     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2437         aSig = alternateASig;
2438     }
2439     zSign = ( (int32_t) aSig < 0 );
2440     if ( zSign ) aSig = - aSig;
2441     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2442 }
2443 
2444 /*----------------------------------------------------------------------------
2445 | Returns the result of multiplying the single-precision floating-point values
2446 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2447 | multiplication.  The operation is performed according to the IEC/IEEE
2448 | Standard for Binary Floating-Point Arithmetic 754-2008.
2449 | The flags argument allows the caller to select negation of the
2450 | addend, the intermediate product, or the final result. (The difference
2451 | between this and having the caller do a separate negation is that negating
2452 | externally will flip the sign bit on NaNs.)
2453 *----------------------------------------------------------------------------*/
2454 
2455 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2456                        float_status *status)
2457 {
2458     flag aSign, bSign, cSign, zSign;
2459     int aExp, bExp, cExp, pExp, zExp, expDiff;
2460     uint32_t aSig, bSig, cSig;
2461     flag pInf, pZero, pSign;
2462     uint64_t pSig64, cSig64, zSig64;
2463     uint32_t pSig;
2464     int shiftcount;
2465     flag signflip, infzero;
2466 
2467     a = float32_squash_input_denormal(a, status);
2468     b = float32_squash_input_denormal(b, status);
2469     c = float32_squash_input_denormal(c, status);
2470     aSig = extractFloat32Frac(a);
2471     aExp = extractFloat32Exp(a);
2472     aSign = extractFloat32Sign(a);
2473     bSig = extractFloat32Frac(b);
2474     bExp = extractFloat32Exp(b);
2475     bSign = extractFloat32Sign(b);
2476     cSig = extractFloat32Frac(c);
2477     cExp = extractFloat32Exp(c);
2478     cSign = extractFloat32Sign(c);
2479 
2480     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2481                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2482 
2483     /* It is implementation-defined whether the cases of (0,inf,qnan)
2484      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2485      * they return if they do), so we have to hand this information
2486      * off to the target-specific pick-a-NaN routine.
2487      */
2488     if (((aExp == 0xff) && aSig) ||
2489         ((bExp == 0xff) && bSig) ||
2490         ((cExp == 0xff) && cSig)) {
2491         return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2492     }
2493 
2494     if (infzero) {
2495         float_raise(float_flag_invalid, status);
2496         return float32_default_nan(status);
2497     }
2498 
2499     if (flags & float_muladd_negate_c) {
2500         cSign ^= 1;
2501     }
2502 
2503     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2504 
2505     /* Work out the sign and type of the product */
2506     pSign = aSign ^ bSign;
2507     if (flags & float_muladd_negate_product) {
2508         pSign ^= 1;
2509     }
2510     pInf = (aExp == 0xff) || (bExp == 0xff);
2511     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2512 
2513     if (cExp == 0xff) {
2514         if (pInf && (pSign ^ cSign)) {
2515             /* addition of opposite-signed infinities => InvalidOperation */
2516             float_raise(float_flag_invalid, status);
2517             return float32_default_nan(status);
2518         }
2519         /* Otherwise generate an infinity of the same sign */
2520         return packFloat32(cSign ^ signflip, 0xff, 0);
2521     }
2522 
2523     if (pInf) {
2524         return packFloat32(pSign ^ signflip, 0xff, 0);
2525     }
2526 
2527     if (pZero) {
2528         if (cExp == 0) {
2529             if (cSig == 0) {
2530                 /* Adding two exact zeroes */
2531                 if (pSign == cSign) {
2532                     zSign = pSign;
2533                 } else if (status->float_rounding_mode == float_round_down) {
2534                     zSign = 1;
2535                 } else {
2536                     zSign = 0;
2537                 }
2538                 return packFloat32(zSign ^ signflip, 0, 0);
2539             }
2540             /* Exact zero plus a denorm */
2541             if (status->flush_to_zero) {
2542                 float_raise(float_flag_output_denormal, status);
2543                 return packFloat32(cSign ^ signflip, 0, 0);
2544             }
2545         }
2546         /* Zero plus something non-zero : just return the something */
2547         if (flags & float_muladd_halve_result) {
2548             if (cExp == 0) {
2549                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2550             }
2551             /* Subtract one to halve, and one again because roundAndPackFloat32
2552              * wants one less than the true exponent.
2553              */
2554             cExp -= 2;
2555             cSig = (cSig | 0x00800000) << 7;
2556             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2557         }
2558         return packFloat32(cSign ^ signflip, cExp, cSig);
2559     }
2560 
2561     if (aExp == 0) {
2562         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2563     }
2564     if (bExp == 0) {
2565         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2566     }
2567 
2568     /* Calculate the actual result a * b + c */
2569 
2570     /* Multiply first; this is easy. */
2571     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2572      * because we want the true exponent, not the "one-less-than"
2573      * flavour that roundAndPackFloat32() takes.
2574      */
2575     pExp = aExp + bExp - 0x7e;
2576     aSig = (aSig | 0x00800000) << 7;
2577     bSig = (bSig | 0x00800000) << 8;
2578     pSig64 = (uint64_t)aSig * bSig;
2579     if ((int64_t)(pSig64 << 1) >= 0) {
2580         pSig64 <<= 1;
2581         pExp--;
2582     }
2583 
2584     zSign = pSign ^ signflip;
2585 
2586     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2587      * position 62.
2588      */
2589     if (cExp == 0) {
2590         if (!cSig) {
2591             /* Throw out the special case of c being an exact zero now */
2592             shift64RightJamming(pSig64, 32, &pSig64);
2593             pSig = pSig64;
2594             if (flags & float_muladd_halve_result) {
2595                 pExp--;
2596             }
2597             return roundAndPackFloat32(zSign, pExp - 1,
2598                                        pSig, status);
2599         }
2600         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2601     }
2602 
2603     cSig64 = (uint64_t)cSig << (62 - 23);
2604     cSig64 |= LIT64(0x4000000000000000);
2605     expDiff = pExp - cExp;
2606 
2607     if (pSign == cSign) {
2608         /* Addition */
2609         if (expDiff > 0) {
2610             /* scale c to match p */
2611             shift64RightJamming(cSig64, expDiff, &cSig64);
2612             zExp = pExp;
2613         } else if (expDiff < 0) {
2614             /* scale p to match c */
2615             shift64RightJamming(pSig64, -expDiff, &pSig64);
2616             zExp = cExp;
2617         } else {
2618             /* no scaling needed */
2619             zExp = cExp;
2620         }
2621         /* Add significands and make sure explicit bit ends up in posn 62 */
2622         zSig64 = pSig64 + cSig64;
2623         if ((int64_t)zSig64 < 0) {
2624             shift64RightJamming(zSig64, 1, &zSig64);
2625         } else {
2626             zExp--;
2627         }
2628     } else {
2629         /* Subtraction */
2630         if (expDiff > 0) {
2631             shift64RightJamming(cSig64, expDiff, &cSig64);
2632             zSig64 = pSig64 - cSig64;
2633             zExp = pExp;
2634         } else if (expDiff < 0) {
2635             shift64RightJamming(pSig64, -expDiff, &pSig64);
2636             zSig64 = cSig64 - pSig64;
2637             zExp = cExp;
2638             zSign ^= 1;
2639         } else {
2640             zExp = pExp;
2641             if (cSig64 < pSig64) {
2642                 zSig64 = pSig64 - cSig64;
2643             } else if (pSig64 < cSig64) {
2644                 zSig64 = cSig64 - pSig64;
2645                 zSign ^= 1;
2646             } else {
2647                 /* Exact zero */
2648                 zSign = signflip;
2649                 if (status->float_rounding_mode == float_round_down) {
2650                     zSign ^= 1;
2651                 }
2652                 return packFloat32(zSign, 0, 0);
2653             }
2654         }
2655         --zExp;
2656         /* Normalize to put the explicit bit back into bit 62. */
2657         shiftcount = countLeadingZeros64(zSig64) - 1;
2658         zSig64 <<= shiftcount;
2659         zExp -= shiftcount;
2660     }
2661     if (flags & float_muladd_halve_result) {
2662         zExp--;
2663     }
2664 
2665     shift64RightJamming(zSig64, 32, &zSig64);
2666     return roundAndPackFloat32(zSign, zExp, zSig64, status);
2667 }
2668 
2669 
2670 /*----------------------------------------------------------------------------
2671 | Returns the square root of the single-precision floating-point value `a'.
2672 | The operation is performed according to the IEC/IEEE Standard for Binary
2673 | Floating-Point Arithmetic.
2674 *----------------------------------------------------------------------------*/
2675 
2676 float32 float32_sqrt(float32 a, float_status *status)
2677 {
2678     flag aSign;
2679     int aExp, zExp;
2680     uint32_t aSig, zSig;
2681     uint64_t rem, term;
2682     a = float32_squash_input_denormal(a, status);
2683 
2684     aSig = extractFloat32Frac( a );
2685     aExp = extractFloat32Exp( a );
2686     aSign = extractFloat32Sign( a );
2687     if ( aExp == 0xFF ) {
2688         if (aSig) {
2689             return propagateFloat32NaN(a, float32_zero, status);
2690         }
2691         if ( ! aSign ) return a;
2692         float_raise(float_flag_invalid, status);
2693         return float32_default_nan(status);
2694     }
2695     if ( aSign ) {
2696         if ( ( aExp | aSig ) == 0 ) return a;
2697         float_raise(float_flag_invalid, status);
2698         return float32_default_nan(status);
2699     }
2700     if ( aExp == 0 ) {
2701         if ( aSig == 0 ) return float32_zero;
2702         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2703     }
2704     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2705     aSig = ( aSig | 0x00800000 )<<8;
2706     zSig = estimateSqrt32( aExp, aSig ) + 2;
2707     if ( ( zSig & 0x7F ) <= 5 ) {
2708         if ( zSig < 2 ) {
2709             zSig = 0x7FFFFFFF;
2710             goto roundAndPack;
2711         }
2712         aSig >>= aExp & 1;
2713         term = ( (uint64_t) zSig ) * zSig;
2714         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2715         while ( (int64_t) rem < 0 ) {
2716             --zSig;
2717             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2718         }
2719         zSig |= ( rem != 0 );
2720     }
2721     shift32RightJamming( zSig, 1, &zSig );
2722  roundAndPack:
2723     return roundAndPackFloat32(0, zExp, zSig, status);
2724 
2725 }
2726 
2727 /*----------------------------------------------------------------------------
2728 | Returns the binary exponential of the single-precision floating-point value
2729 | `a'. The operation is performed according to the IEC/IEEE Standard for
2730 | Binary Floating-Point Arithmetic.
2731 |
2732 | Uses the following identities:
2733 |
2734 | 1. -------------------------------------------------------------------------
2735 |      x    x*ln(2)
2736 |     2  = e
2737 |
2738 | 2. -------------------------------------------------------------------------
2739 |                      2     3     4     5           n
2740 |      x        x     x     x     x     x           x
2741 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2742 |               1!    2!    3!    4!    5!          n!
2743 *----------------------------------------------------------------------------*/
2744 
2745 static const float64 float32_exp2_coefficients[15] =
2746 {
2747     const_float64( 0x3ff0000000000000ll ), /*  1 */
2748     const_float64( 0x3fe0000000000000ll ), /*  2 */
2749     const_float64( 0x3fc5555555555555ll ), /*  3 */
2750     const_float64( 0x3fa5555555555555ll ), /*  4 */
2751     const_float64( 0x3f81111111111111ll ), /*  5 */
2752     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2753     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2754     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2755     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2756     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2757     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2758     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2759     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2760     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2761     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2762 };
2763 
2764 float32 float32_exp2(float32 a, float_status *status)
2765 {
2766     flag aSign;
2767     int aExp;
2768     uint32_t aSig;
2769     float64 r, x, xn;
2770     int i;
2771     a = float32_squash_input_denormal(a, status);
2772 
2773     aSig = extractFloat32Frac( a );
2774     aExp = extractFloat32Exp( a );
2775     aSign = extractFloat32Sign( a );
2776 
2777     if ( aExp == 0xFF) {
2778         if (aSig) {
2779             return propagateFloat32NaN(a, float32_zero, status);
2780         }
2781         return (aSign) ? float32_zero : a;
2782     }
2783     if (aExp == 0) {
2784         if (aSig == 0) return float32_one;
2785     }
2786 
2787     float_raise(float_flag_inexact, status);
2788 
2789     /* ******************************* */
2790     /* using float64 for approximation */
2791     /* ******************************* */
2792     x = float32_to_float64(a, status);
2793     x = float64_mul(x, float64_ln2, status);
2794 
2795     xn = x;
2796     r = float64_one;
2797     for (i = 0 ; i < 15 ; i++) {
2798         float64 f;
2799 
2800         f = float64_mul(xn, float32_exp2_coefficients[i], status);
2801         r = float64_add(r, f, status);
2802 
2803         xn = float64_mul(xn, x, status);
2804     }
2805 
2806     return float64_to_float32(r, status);
2807 }
2808 
2809 /*----------------------------------------------------------------------------
2810 | Returns the binary log of the single-precision floating-point value `a'.
2811 | The operation is performed according to the IEC/IEEE Standard for Binary
2812 | Floating-Point Arithmetic.
2813 *----------------------------------------------------------------------------*/
2814 float32 float32_log2(float32 a, float_status *status)
2815 {
2816     flag aSign, zSign;
2817     int aExp;
2818     uint32_t aSig, zSig, i;
2819 
2820     a = float32_squash_input_denormal(a, status);
2821     aSig = extractFloat32Frac( a );
2822     aExp = extractFloat32Exp( a );
2823     aSign = extractFloat32Sign( a );
2824 
2825     if ( aExp == 0 ) {
2826         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2827         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2828     }
2829     if ( aSign ) {
2830         float_raise(float_flag_invalid, status);
2831         return float32_default_nan(status);
2832     }
2833     if ( aExp == 0xFF ) {
2834         if (aSig) {
2835             return propagateFloat32NaN(a, float32_zero, status);
2836         }
2837         return a;
2838     }
2839 
2840     aExp -= 0x7F;
2841     aSig |= 0x00800000;
2842     zSign = aExp < 0;
2843     zSig = aExp << 23;
2844 
2845     for (i = 1 << 22; i > 0; i >>= 1) {
2846         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2847         if ( aSig & 0x01000000 ) {
2848             aSig >>= 1;
2849             zSig |= i;
2850         }
2851     }
2852 
2853     if ( zSign )
2854         zSig = -zSig;
2855 
2856     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2857 }
2858 
2859 /*----------------------------------------------------------------------------
2860 | Returns 1 if the single-precision floating-point value `a' is equal to
2861 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2862 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2863 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2865 
2866 int float32_eq(float32 a, float32 b, float_status *status)
2867 {
2868     uint32_t av, bv;
2869     a = float32_squash_input_denormal(a, status);
2870     b = float32_squash_input_denormal(b, status);
2871 
2872     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2873          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2874        ) {
2875         float_raise(float_flag_invalid, status);
2876         return 0;
2877     }
2878     av = float32_val(a);
2879     bv = float32_val(b);
2880     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2881 }
2882 
2883 /*----------------------------------------------------------------------------
2884 | Returns 1 if the single-precision floating-point value `a' is less than
2885 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2886 | exception is raised if either operand is a NaN.  The comparison is performed
2887 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2888 *----------------------------------------------------------------------------*/
2889 
2890 int float32_le(float32 a, float32 b, float_status *status)
2891 {
2892     flag aSign, bSign;
2893     uint32_t av, bv;
2894     a = float32_squash_input_denormal(a, status);
2895     b = float32_squash_input_denormal(b, status);
2896 
2897     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2898          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2899        ) {
2900         float_raise(float_flag_invalid, status);
2901         return 0;
2902     }
2903     aSign = extractFloat32Sign( a );
2904     bSign = extractFloat32Sign( b );
2905     av = float32_val(a);
2906     bv = float32_val(b);
2907     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2908     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2909 
2910 }
2911 
2912 /*----------------------------------------------------------------------------
2913 | Returns 1 if the single-precision floating-point value `a' is less than
2914 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2915 | raised if either operand is a NaN.  The comparison is performed according
2916 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2917 *----------------------------------------------------------------------------*/
2918 
2919 int float32_lt(float32 a, float32 b, float_status *status)
2920 {
2921     flag aSign, bSign;
2922     uint32_t av, bv;
2923     a = float32_squash_input_denormal(a, status);
2924     b = float32_squash_input_denormal(b, status);
2925 
2926     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2927          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2928        ) {
2929         float_raise(float_flag_invalid, status);
2930         return 0;
2931     }
2932     aSign = extractFloat32Sign( a );
2933     bSign = extractFloat32Sign( b );
2934     av = float32_val(a);
2935     bv = float32_val(b);
2936     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2937     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2938 
2939 }
2940 
2941 /*----------------------------------------------------------------------------
2942 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2943 | be compared, and 0 otherwise.  The invalid exception is raised if either
2944 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2945 | Standard for Binary Floating-Point Arithmetic.
2946 *----------------------------------------------------------------------------*/
2947 
2948 int float32_unordered(float32 a, float32 b, float_status *status)
2949 {
2950     a = float32_squash_input_denormal(a, status);
2951     b = float32_squash_input_denormal(b, status);
2952 
2953     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2954          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2955        ) {
2956         float_raise(float_flag_invalid, status);
2957         return 1;
2958     }
2959     return 0;
2960 }
2961 
2962 /*----------------------------------------------------------------------------
2963 | Returns 1 if the single-precision floating-point value `a' is equal to
2964 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2965 | exception.  The comparison is performed according to the IEC/IEEE Standard
2966 | for Binary Floating-Point Arithmetic.
2967 *----------------------------------------------------------------------------*/
2968 
2969 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2970 {
2971     a = float32_squash_input_denormal(a, status);
2972     b = float32_squash_input_denormal(b, status);
2973 
2974     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2975          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2976        ) {
2977         if (float32_is_signaling_nan(a, status)
2978          || float32_is_signaling_nan(b, status)) {
2979             float_raise(float_flag_invalid, status);
2980         }
2981         return 0;
2982     }
2983     return ( float32_val(a) == float32_val(b) ) ||
2984             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2985 }
2986 
2987 /*----------------------------------------------------------------------------
2988 | Returns 1 if the single-precision floating-point value `a' is less than or
2989 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2990 | cause an exception.  Otherwise, the comparison is performed according to the
2991 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2992 *----------------------------------------------------------------------------*/
2993 
2994 int float32_le_quiet(float32 a, float32 b, float_status *status)
2995 {
2996     flag aSign, bSign;
2997     uint32_t av, bv;
2998     a = float32_squash_input_denormal(a, status);
2999     b = float32_squash_input_denormal(b, status);
3000 
3001     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3002          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3003        ) {
3004         if (float32_is_signaling_nan(a, status)
3005          || float32_is_signaling_nan(b, status)) {
3006             float_raise(float_flag_invalid, status);
3007         }
3008         return 0;
3009     }
3010     aSign = extractFloat32Sign( a );
3011     bSign = extractFloat32Sign( b );
3012     av = float32_val(a);
3013     bv = float32_val(b);
3014     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3015     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3016 
3017 }
3018 
3019 /*----------------------------------------------------------------------------
3020 | Returns 1 if the single-precision floating-point value `a' is less than
3021 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3022 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3023 | Standard for Binary Floating-Point Arithmetic.
3024 *----------------------------------------------------------------------------*/
3025 
3026 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3027 {
3028     flag aSign, bSign;
3029     uint32_t av, bv;
3030     a = float32_squash_input_denormal(a, status);
3031     b = float32_squash_input_denormal(b, status);
3032 
3033     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3034          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3035        ) {
3036         if (float32_is_signaling_nan(a, status)
3037          || float32_is_signaling_nan(b, status)) {
3038             float_raise(float_flag_invalid, status);
3039         }
3040         return 0;
3041     }
3042     aSign = extractFloat32Sign( a );
3043     bSign = extractFloat32Sign( b );
3044     av = float32_val(a);
3045     bv = float32_val(b);
3046     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3047     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3048 
3049 }
3050 
3051 /*----------------------------------------------------------------------------
3052 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3053 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3054 | comparison is performed according to the IEC/IEEE Standard for Binary
3055 | Floating-Point Arithmetic.
3056 *----------------------------------------------------------------------------*/
3057 
3058 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3059 {
3060     a = float32_squash_input_denormal(a, status);
3061     b = float32_squash_input_denormal(b, status);
3062 
3063     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3064          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3065        ) {
3066         if (float32_is_signaling_nan(a, status)
3067          || float32_is_signaling_nan(b, status)) {
3068             float_raise(float_flag_invalid, status);
3069         }
3070         return 1;
3071     }
3072     return 0;
3073 }
3074 
3075 /*----------------------------------------------------------------------------
3076 | Returns the result of converting the double-precision floating-point value
3077 | `a' to the 32-bit two's complement integer format.  The conversion is
3078 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3079 | Arithmetic---which means in particular that the conversion is rounded
3080 | according to the current rounding mode.  If `a' is a NaN, the largest
3081 | positive integer is returned.  Otherwise, if the conversion overflows, the
3082 | largest integer with the same sign as `a' is returned.
3083 *----------------------------------------------------------------------------*/
3084 
3085 int32_t float64_to_int32(float64 a, float_status *status)
3086 {
3087     flag aSign;
3088     int aExp;
3089     int shiftCount;
3090     uint64_t aSig;
3091     a = float64_squash_input_denormal(a, status);
3092 
3093     aSig = extractFloat64Frac( a );
3094     aExp = extractFloat64Exp( a );
3095     aSign = extractFloat64Sign( a );
3096     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3097     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3098     shiftCount = 0x42C - aExp;
3099     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3100     return roundAndPackInt32(aSign, aSig, status);
3101 
3102 }
3103 
3104 /*----------------------------------------------------------------------------
3105 | Returns the result of converting the double-precision floating-point value
3106 | `a' to the 32-bit two's complement integer format.  The conversion is
3107 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3108 | Arithmetic, except that the conversion is always rounded toward zero.
3109 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3110 | the conversion overflows, the largest integer with the same sign as `a' is
3111 | returned.
3112 *----------------------------------------------------------------------------*/
3113 
3114 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3115 {
3116     flag aSign;
3117     int aExp;
3118     int shiftCount;
3119     uint64_t aSig, savedASig;
3120     int32_t z;
3121     a = float64_squash_input_denormal(a, status);
3122 
3123     aSig = extractFloat64Frac( a );
3124     aExp = extractFloat64Exp( a );
3125     aSign = extractFloat64Sign( a );
3126     if ( 0x41E < aExp ) {
3127         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3128         goto invalid;
3129     }
3130     else if ( aExp < 0x3FF ) {
3131         if (aExp || aSig) {
3132             status->float_exception_flags |= float_flag_inexact;
3133         }
3134         return 0;
3135     }
3136     aSig |= LIT64( 0x0010000000000000 );
3137     shiftCount = 0x433 - aExp;
3138     savedASig = aSig;
3139     aSig >>= shiftCount;
3140     z = aSig;
3141     if ( aSign ) z = - z;
3142     if ( ( z < 0 ) ^ aSign ) {
3143  invalid:
3144         float_raise(float_flag_invalid, status);
3145         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3146     }
3147     if ( ( aSig<<shiftCount ) != savedASig ) {
3148         status->float_exception_flags |= float_flag_inexact;
3149     }
3150     return z;
3151 
3152 }
3153 
3154 /*----------------------------------------------------------------------------
3155 | Returns the result of converting the double-precision floating-point value
3156 | `a' to the 16-bit two's complement integer format.  The conversion is
3157 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3158 | Arithmetic, except that the conversion is always rounded toward zero.
3159 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3160 | the conversion overflows, the largest integer with the same sign as `a' is
3161 | returned.
3162 *----------------------------------------------------------------------------*/
3163 
3164 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3165 {
3166     flag aSign;
3167     int aExp;
3168     int shiftCount;
3169     uint64_t aSig, savedASig;
3170     int32_t z;
3171 
3172     aSig = extractFloat64Frac( a );
3173     aExp = extractFloat64Exp( a );
3174     aSign = extractFloat64Sign( a );
3175     if ( 0x40E < aExp ) {
3176         if ( ( aExp == 0x7FF ) && aSig ) {
3177             aSign = 0;
3178         }
3179         goto invalid;
3180     }
3181     else if ( aExp < 0x3FF ) {
3182         if ( aExp || aSig ) {
3183             status->float_exception_flags |= float_flag_inexact;
3184         }
3185         return 0;
3186     }
3187     aSig |= LIT64( 0x0010000000000000 );
3188     shiftCount = 0x433 - aExp;
3189     savedASig = aSig;
3190     aSig >>= shiftCount;
3191     z = aSig;
3192     if ( aSign ) {
3193         z = - z;
3194     }
3195     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3196  invalid:
3197         float_raise(float_flag_invalid, status);
3198         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3199     }
3200     if ( ( aSig<<shiftCount ) != savedASig ) {
3201         status->float_exception_flags |= float_flag_inexact;
3202     }
3203     return z;
3204 }
3205 
3206 /*----------------------------------------------------------------------------
3207 | Returns the result of converting the double-precision floating-point value
3208 | `a' to the 64-bit two's complement integer format.  The conversion is
3209 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3210 | Arithmetic---which means in particular that the conversion is rounded
3211 | according to the current rounding mode.  If `a' is a NaN, the largest
3212 | positive integer is returned.  Otherwise, if the conversion overflows, the
3213 | largest integer with the same sign as `a' is returned.
3214 *----------------------------------------------------------------------------*/
3215 
3216 int64_t float64_to_int64(float64 a, float_status *status)
3217 {
3218     flag aSign;
3219     int aExp;
3220     int shiftCount;
3221     uint64_t aSig, aSigExtra;
3222     a = float64_squash_input_denormal(a, status);
3223 
3224     aSig = extractFloat64Frac( a );
3225     aExp = extractFloat64Exp( a );
3226     aSign = extractFloat64Sign( a );
3227     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3228     shiftCount = 0x433 - aExp;
3229     if ( shiftCount <= 0 ) {
3230         if ( 0x43E < aExp ) {
3231             float_raise(float_flag_invalid, status);
3232             if (    ! aSign
3233                  || (    ( aExp == 0x7FF )
3234                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3235                ) {
3236                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3237             }
3238             return (int64_t) LIT64( 0x8000000000000000 );
3239         }
3240         aSigExtra = 0;
3241         aSig <<= - shiftCount;
3242     }
3243     else {
3244         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3245     }
3246     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3247 
3248 }
3249 
3250 /*----------------------------------------------------------------------------
3251 | Returns the result of converting the double-precision floating-point value
3252 | `a' to the 64-bit two's complement integer format.  The conversion is
3253 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3254 | Arithmetic, except that the conversion is always rounded toward zero.
3255 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3256 | the conversion overflows, the largest integer with the same sign as `a' is
3257 | returned.
3258 *----------------------------------------------------------------------------*/
3259 
3260 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3261 {
3262     flag aSign;
3263     int aExp;
3264     int shiftCount;
3265     uint64_t aSig;
3266     int64_t z;
3267     a = float64_squash_input_denormal(a, status);
3268 
3269     aSig = extractFloat64Frac( a );
3270     aExp = extractFloat64Exp( a );
3271     aSign = extractFloat64Sign( a );
3272     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3273     shiftCount = aExp - 0x433;
3274     if ( 0 <= shiftCount ) {
3275         if ( 0x43E <= aExp ) {
3276             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3277                 float_raise(float_flag_invalid, status);
3278                 if (    ! aSign
3279                      || (    ( aExp == 0x7FF )
3280                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3281                    ) {
3282                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3283                 }
3284             }
3285             return (int64_t) LIT64( 0x8000000000000000 );
3286         }
3287         z = aSig<<shiftCount;
3288     }
3289     else {
3290         if ( aExp < 0x3FE ) {
3291             if (aExp | aSig) {
3292                 status->float_exception_flags |= float_flag_inexact;
3293             }
3294             return 0;
3295         }
3296         z = aSig>>( - shiftCount );
3297         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3298             status->float_exception_flags |= float_flag_inexact;
3299         }
3300     }
3301     if ( aSign ) z = - z;
3302     return z;
3303 
3304 }
3305 
3306 /*----------------------------------------------------------------------------
3307 | Returns the result of converting the double-precision floating-point value
3308 | `a' to the single-precision floating-point format.  The conversion is
3309 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3310 | Arithmetic.
3311 *----------------------------------------------------------------------------*/
3312 
3313 float32 float64_to_float32(float64 a, float_status *status)
3314 {
3315     flag aSign;
3316     int aExp;
3317     uint64_t aSig;
3318     uint32_t zSig;
3319     a = float64_squash_input_denormal(a, status);
3320 
3321     aSig = extractFloat64Frac( a );
3322     aExp = extractFloat64Exp( a );
3323     aSign = extractFloat64Sign( a );
3324     if ( aExp == 0x7FF ) {
3325         if (aSig) {
3326             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3327         }
3328         return packFloat32( aSign, 0xFF, 0 );
3329     }
3330     shift64RightJamming( aSig, 22, &aSig );
3331     zSig = aSig;
3332     if ( aExp || zSig ) {
3333         zSig |= 0x40000000;
3334         aExp -= 0x381;
3335     }
3336     return roundAndPackFloat32(aSign, aExp, zSig, status);
3337 
3338 }
3339 
3340 
3341 /*----------------------------------------------------------------------------
3342 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3343 | half-precision floating-point value, returning the result.  After being
3344 | shifted into the proper positions, the three fields are simply added
3345 | together to form the result.  This means that any integer portion of `zSig'
3346 | will be added into the exponent.  Since a properly normalized significand
3347 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3348 | than the desired result exponent whenever `zSig' is a complete, normalized
3349 | significand.
3350 *----------------------------------------------------------------------------*/
3351 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3352 {
3353     return make_float16(
3354         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3355 }
3356 
3357 /*----------------------------------------------------------------------------
3358 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3359 | and significand `zSig', and returns the proper half-precision floating-
3360 | point value corresponding to the abstract input.  Ordinarily, the abstract
3361 | value is simply rounded and packed into the half-precision format, with
3362 | the inexact exception raised if the abstract input cannot be represented
3363 | exactly.  However, if the abstract value is too large, the overflow and
3364 | inexact exceptions are raised and an infinity or maximal finite value is
3365 | returned.  If the abstract value is too small, the input value is rounded to
3366 | a subnormal number, and the underflow and inexact exceptions are raised if
3367 | the abstract input cannot be represented exactly as a subnormal half-
3368 | precision floating-point number.
3369 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3370 | ARM-style "alternative representation", which omits the NaN and Inf
3371 | encodings in order to raise the maximum representable exponent by one.
3372 |     The input significand `zSig' has its binary point between bits 22
3373 | and 23, which is 13 bits to the left of the usual location.  This shifted
3374 | significand must be normalized or smaller.  If `zSig' is not normalized,
3375 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3376 | and it must not require rounding.  In the usual case that `zSig' is
3377 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3378 | Note the slightly odd position of the binary point in zSig compared with the
3379 | other roundAndPackFloat functions. This should probably be fixed if we
3380 | need to implement more float16 routines than just conversion.
3381 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3382 | Binary Floating-Point Arithmetic.
3383 *----------------------------------------------------------------------------*/
3384 
3385 static float16 roundAndPackFloat16(flag zSign, int zExp,
3386                                    uint32_t zSig, flag ieee,
3387                                    float_status *status)
3388 {
3389     int maxexp = ieee ? 29 : 30;
3390     uint32_t mask;
3391     uint32_t increment;
3392     bool rounding_bumps_exp;
3393     bool is_tiny = false;
3394 
3395     /* Calculate the mask of bits of the mantissa which are not
3396      * representable in half-precision and will be lost.
3397      */
3398     if (zExp < 1) {
3399         /* Will be denormal in halfprec */
3400         mask = 0x00ffffff;
3401         if (zExp >= -11) {
3402             mask >>= 11 + zExp;
3403         }
3404     } else {
3405         /* Normal number in halfprec */
3406         mask = 0x00001fff;
3407     }
3408 
3409     switch (status->float_rounding_mode) {
3410     case float_round_nearest_even:
3411         increment = (mask + 1) >> 1;
3412         if ((zSig & mask) == increment) {
3413             increment = zSig & (increment << 1);
3414         }
3415         break;
3416     case float_round_ties_away:
3417         increment = (mask + 1) >> 1;
3418         break;
3419     case float_round_up:
3420         increment = zSign ? 0 : mask;
3421         break;
3422     case float_round_down:
3423         increment = zSign ? mask : 0;
3424         break;
3425     default: /* round_to_zero */
3426         increment = 0;
3427         break;
3428     }
3429 
3430     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3431 
3432     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3433         if (ieee) {
3434             float_raise(float_flag_overflow | float_flag_inexact, status);
3435             return packFloat16(zSign, 0x1f, 0);
3436         } else {
3437             float_raise(float_flag_invalid, status);
3438             return packFloat16(zSign, 0x1f, 0x3ff);
3439         }
3440     }
3441 
3442     if (zExp < 0) {
3443         /* Note that flush-to-zero does not affect half-precision results */
3444         is_tiny =
3445             (status->float_detect_tininess == float_tininess_before_rounding)
3446             || (zExp < -1)
3447             || (!rounding_bumps_exp);
3448     }
3449     if (zSig & mask) {
3450         float_raise(float_flag_inexact, status);
3451         if (is_tiny) {
3452             float_raise(float_flag_underflow, status);
3453         }
3454     }
3455 
3456     zSig += increment;
3457     if (rounding_bumps_exp) {
3458         zSig >>= 1;
3459         zExp++;
3460     }
3461 
3462     if (zExp < -10) {
3463         return packFloat16(zSign, 0, 0);
3464     }
3465     if (zExp < 0) {
3466         zSig >>= -zExp;
3467         zExp = 0;
3468     }
3469     return packFloat16(zSign, zExp, zSig >> 13);
3470 }
3471 
3472 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3473                                       uint32_t *zSigPtr)
3474 {
3475     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3476     *zSigPtr = aSig << shiftCount;
3477     *zExpPtr = 1 - shiftCount;
3478 }
3479 
3480 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3481    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3482 
3483 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3484 {
3485     flag aSign;
3486     int aExp;
3487     uint32_t aSig;
3488 
3489     aSign = extractFloat16Sign(a);
3490     aExp = extractFloat16Exp(a);
3491     aSig = extractFloat16Frac(a);
3492 
3493     if (aExp == 0x1f && ieee) {
3494         if (aSig) {
3495             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3496         }
3497         return packFloat32(aSign, 0xff, 0);
3498     }
3499     if (aExp == 0) {
3500         if (aSig == 0) {
3501             return packFloat32(aSign, 0, 0);
3502         }
3503 
3504         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3505         aExp--;
3506     }
3507     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3508 }
3509 
3510 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3511 {
3512     flag aSign;
3513     int aExp;
3514     uint32_t aSig;
3515 
3516     a = float32_squash_input_denormal(a, status);
3517 
3518     aSig = extractFloat32Frac( a );
3519     aExp = extractFloat32Exp( a );
3520     aSign = extractFloat32Sign( a );
3521     if ( aExp == 0xFF ) {
3522         if (aSig) {
3523             /* Input is a NaN */
3524             if (!ieee) {
3525                 float_raise(float_flag_invalid, status);
3526                 return packFloat16(aSign, 0, 0);
3527             }
3528             return commonNaNToFloat16(
3529                 float32ToCommonNaN(a, status), status);
3530         }
3531         /* Infinity */
3532         if (!ieee) {
3533             float_raise(float_flag_invalid, status);
3534             return packFloat16(aSign, 0x1f, 0x3ff);
3535         }
3536         return packFloat16(aSign, 0x1f, 0);
3537     }
3538     if (aExp == 0 && aSig == 0) {
3539         return packFloat16(aSign, 0, 0);
3540     }
3541     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3542      * even if the input is denormal; however this is harmless because
3543      * the largest possible single-precision denormal is still smaller
3544      * than the smallest representable half-precision denormal, and so we
3545      * will end up ignoring aSig and returning via the "always return zero"
3546      * codepath.
3547      */
3548     aSig |= 0x00800000;
3549     aExp -= 0x71;
3550 
3551     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3552 }
3553 
3554 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3555 {
3556     flag aSign;
3557     int aExp;
3558     uint32_t aSig;
3559 
3560     aSign = extractFloat16Sign(a);
3561     aExp = extractFloat16Exp(a);
3562     aSig = extractFloat16Frac(a);
3563 
3564     if (aExp == 0x1f && ieee) {
3565         if (aSig) {
3566             return commonNaNToFloat64(
3567                 float16ToCommonNaN(a, status), status);
3568         }
3569         return packFloat64(aSign, 0x7ff, 0);
3570     }
3571     if (aExp == 0) {
3572         if (aSig == 0) {
3573             return packFloat64(aSign, 0, 0);
3574         }
3575 
3576         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3577         aExp--;
3578     }
3579     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3580 }
3581 
3582 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3583 {
3584     flag aSign;
3585     int aExp;
3586     uint64_t aSig;
3587     uint32_t zSig;
3588 
3589     a = float64_squash_input_denormal(a, status);
3590 
3591     aSig = extractFloat64Frac(a);
3592     aExp = extractFloat64Exp(a);
3593     aSign = extractFloat64Sign(a);
3594     if (aExp == 0x7FF) {
3595         if (aSig) {
3596             /* Input is a NaN */
3597             if (!ieee) {
3598                 float_raise(float_flag_invalid, status);
3599                 return packFloat16(aSign, 0, 0);
3600             }
3601             return commonNaNToFloat16(
3602                 float64ToCommonNaN(a, status), status);
3603         }
3604         /* Infinity */
3605         if (!ieee) {
3606             float_raise(float_flag_invalid, status);
3607             return packFloat16(aSign, 0x1f, 0x3ff);
3608         }
3609         return packFloat16(aSign, 0x1f, 0);
3610     }
3611     shift64RightJamming(aSig, 29, &aSig);
3612     zSig = aSig;
3613     if (aExp == 0 && zSig == 0) {
3614         return packFloat16(aSign, 0, 0);
3615     }
3616     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3617      * even if the input is denormal; however this is harmless because
3618      * the largest possible single-precision denormal is still smaller
3619      * than the smallest representable half-precision denormal, and so we
3620      * will end up ignoring aSig and returning via the "always return zero"
3621      * codepath.
3622      */
3623     zSig |= 0x00800000;
3624     aExp -= 0x3F1;
3625 
3626     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3627 }
3628 
3629 /*----------------------------------------------------------------------------
3630 | Returns the result of converting the double-precision floating-point value
3631 | `a' to the extended double-precision floating-point format.  The conversion
3632 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3633 | Arithmetic.
3634 *----------------------------------------------------------------------------*/
3635 
3636 floatx80 float64_to_floatx80(float64 a, float_status *status)
3637 {
3638     flag aSign;
3639     int aExp;
3640     uint64_t aSig;
3641 
3642     a = float64_squash_input_denormal(a, status);
3643     aSig = extractFloat64Frac( a );
3644     aExp = extractFloat64Exp( a );
3645     aSign = extractFloat64Sign( a );
3646     if ( aExp == 0x7FF ) {
3647         if (aSig) {
3648             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3649         }
3650         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3651     }
3652     if ( aExp == 0 ) {
3653         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3654         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3655     }
3656     return
3657         packFloatx80(
3658             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3659 
3660 }
3661 
3662 /*----------------------------------------------------------------------------
3663 | Returns the result of converting the double-precision floating-point value
3664 | `a' to the quadruple-precision floating-point format.  The conversion is
3665 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3666 | Arithmetic.
3667 *----------------------------------------------------------------------------*/
3668 
3669 float128 float64_to_float128(float64 a, float_status *status)
3670 {
3671     flag aSign;
3672     int aExp;
3673     uint64_t aSig, zSig0, zSig1;
3674 
3675     a = float64_squash_input_denormal(a, status);
3676     aSig = extractFloat64Frac( a );
3677     aExp = extractFloat64Exp( a );
3678     aSign = extractFloat64Sign( a );
3679     if ( aExp == 0x7FF ) {
3680         if (aSig) {
3681             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3682         }
3683         return packFloat128( aSign, 0x7FFF, 0, 0 );
3684     }
3685     if ( aExp == 0 ) {
3686         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3687         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3688         --aExp;
3689     }
3690     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3691     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3692 
3693 }
3694 
3695 /*----------------------------------------------------------------------------
3696 | Rounds the double-precision floating-point value `a' to an integer, and
3697 | returns the result as a double-precision floating-point value.  The
3698 | operation is performed according to the IEC/IEEE Standard for Binary
3699 | Floating-Point Arithmetic.
3700 *----------------------------------------------------------------------------*/
3701 
3702 float64 float64_round_to_int(float64 a, float_status *status)
3703 {
3704     flag aSign;
3705     int aExp;
3706     uint64_t lastBitMask, roundBitsMask;
3707     uint64_t z;
3708     a = float64_squash_input_denormal(a, status);
3709 
3710     aExp = extractFloat64Exp( a );
3711     if ( 0x433 <= aExp ) {
3712         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3713             return propagateFloat64NaN(a, a, status);
3714         }
3715         return a;
3716     }
3717     if ( aExp < 0x3FF ) {
3718         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3719         status->float_exception_flags |= float_flag_inexact;
3720         aSign = extractFloat64Sign( a );
3721         switch (status->float_rounding_mode) {
3722          case float_round_nearest_even:
3723             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3724                 return packFloat64( aSign, 0x3FF, 0 );
3725             }
3726             break;
3727         case float_round_ties_away:
3728             if (aExp == 0x3FE) {
3729                 return packFloat64(aSign, 0x3ff, 0);
3730             }
3731             break;
3732          case float_round_down:
3733             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3734          case float_round_up:
3735             return make_float64(
3736             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3737         }
3738         return packFloat64( aSign, 0, 0 );
3739     }
3740     lastBitMask = 1;
3741     lastBitMask <<= 0x433 - aExp;
3742     roundBitsMask = lastBitMask - 1;
3743     z = float64_val(a);
3744     switch (status->float_rounding_mode) {
3745     case float_round_nearest_even:
3746         z += lastBitMask >> 1;
3747         if ((z & roundBitsMask) == 0) {
3748             z &= ~lastBitMask;
3749         }
3750         break;
3751     case float_round_ties_away:
3752         z += lastBitMask >> 1;
3753         break;
3754     case float_round_to_zero:
3755         break;
3756     case float_round_up:
3757         if (!extractFloat64Sign(make_float64(z))) {
3758             z += roundBitsMask;
3759         }
3760         break;
3761     case float_round_down:
3762         if (extractFloat64Sign(make_float64(z))) {
3763             z += roundBitsMask;
3764         }
3765         break;
3766     default:
3767         abort();
3768     }
3769     z &= ~ roundBitsMask;
3770     if (z != float64_val(a)) {
3771         status->float_exception_flags |= float_flag_inexact;
3772     }
3773     return make_float64(z);
3774 
3775 }
3776 
3777 float64 float64_trunc_to_int(float64 a, float_status *status)
3778 {
3779     int oldmode;
3780     float64 res;
3781     oldmode = status->float_rounding_mode;
3782     status->float_rounding_mode = float_round_to_zero;
3783     res = float64_round_to_int(a, status);
3784     status->float_rounding_mode = oldmode;
3785     return res;
3786 }
3787 
3788 /*----------------------------------------------------------------------------
3789 | Returns the result of adding the absolute values of the double-precision
3790 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3791 | before being returned.  `zSign' is ignored if the result is a NaN.
3792 | The addition is performed according to the IEC/IEEE Standard for Binary
3793 | Floating-Point Arithmetic.
3794 *----------------------------------------------------------------------------*/
3795 
3796 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3797                               float_status *status)
3798 {
3799     int aExp, bExp, zExp;
3800     uint64_t aSig, bSig, zSig;
3801     int expDiff;
3802 
3803     aSig = extractFloat64Frac( a );
3804     aExp = extractFloat64Exp( a );
3805     bSig = extractFloat64Frac( b );
3806     bExp = extractFloat64Exp( b );
3807     expDiff = aExp - bExp;
3808     aSig <<= 9;
3809     bSig <<= 9;
3810     if ( 0 < expDiff ) {
3811         if ( aExp == 0x7FF ) {
3812             if (aSig) {
3813                 return propagateFloat64NaN(a, b, status);
3814             }
3815             return a;
3816         }
3817         if ( bExp == 0 ) {
3818             --expDiff;
3819         }
3820         else {
3821             bSig |= LIT64( 0x2000000000000000 );
3822         }
3823         shift64RightJamming( bSig, expDiff, &bSig );
3824         zExp = aExp;
3825     }
3826     else if ( expDiff < 0 ) {
3827         if ( bExp == 0x7FF ) {
3828             if (bSig) {
3829                 return propagateFloat64NaN(a, b, status);
3830             }
3831             return packFloat64( zSign, 0x7FF, 0 );
3832         }
3833         if ( aExp == 0 ) {
3834             ++expDiff;
3835         }
3836         else {
3837             aSig |= LIT64( 0x2000000000000000 );
3838         }
3839         shift64RightJamming( aSig, - expDiff, &aSig );
3840         zExp = bExp;
3841     }
3842     else {
3843         if ( aExp == 0x7FF ) {
3844             if (aSig | bSig) {
3845                 return propagateFloat64NaN(a, b, status);
3846             }
3847             return a;
3848         }
3849         if ( aExp == 0 ) {
3850             if (status->flush_to_zero) {
3851                 if (aSig | bSig) {
3852                     float_raise(float_flag_output_denormal, status);
3853                 }
3854                 return packFloat64(zSign, 0, 0);
3855             }
3856             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3857         }
3858         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3859         zExp = aExp;
3860         goto roundAndPack;
3861     }
3862     aSig |= LIT64( 0x2000000000000000 );
3863     zSig = ( aSig + bSig )<<1;
3864     --zExp;
3865     if ( (int64_t) zSig < 0 ) {
3866         zSig = aSig + bSig;
3867         ++zExp;
3868     }
3869  roundAndPack:
3870     return roundAndPackFloat64(zSign, zExp, zSig, status);
3871 
3872 }
3873 
3874 /*----------------------------------------------------------------------------
3875 | Returns the result of subtracting the absolute values of the double-
3876 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3877 | difference is negated before being returned.  `zSign' is ignored if the
3878 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3879 | Standard for Binary Floating-Point Arithmetic.
3880 *----------------------------------------------------------------------------*/
3881 
3882 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3883                               float_status *status)
3884 {
3885     int aExp, bExp, zExp;
3886     uint64_t aSig, bSig, zSig;
3887     int expDiff;
3888 
3889     aSig = extractFloat64Frac( a );
3890     aExp = extractFloat64Exp( a );
3891     bSig = extractFloat64Frac( b );
3892     bExp = extractFloat64Exp( b );
3893     expDiff = aExp - bExp;
3894     aSig <<= 10;
3895     bSig <<= 10;
3896     if ( 0 < expDiff ) goto aExpBigger;
3897     if ( expDiff < 0 ) goto bExpBigger;
3898     if ( aExp == 0x7FF ) {
3899         if (aSig | bSig) {
3900             return propagateFloat64NaN(a, b, status);
3901         }
3902         float_raise(float_flag_invalid, status);
3903         return float64_default_nan(status);
3904     }
3905     if ( aExp == 0 ) {
3906         aExp = 1;
3907         bExp = 1;
3908     }
3909     if ( bSig < aSig ) goto aBigger;
3910     if ( aSig < bSig ) goto bBigger;
3911     return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3912  bExpBigger:
3913     if ( bExp == 0x7FF ) {
3914         if (bSig) {
3915             return propagateFloat64NaN(a, b, status);
3916         }
3917         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3918     }
3919     if ( aExp == 0 ) {
3920         ++expDiff;
3921     }
3922     else {
3923         aSig |= LIT64( 0x4000000000000000 );
3924     }
3925     shift64RightJamming( aSig, - expDiff, &aSig );
3926     bSig |= LIT64( 0x4000000000000000 );
3927  bBigger:
3928     zSig = bSig - aSig;
3929     zExp = bExp;
3930     zSign ^= 1;
3931     goto normalizeRoundAndPack;
3932  aExpBigger:
3933     if ( aExp == 0x7FF ) {
3934         if (aSig) {
3935             return propagateFloat64NaN(a, b, status);
3936         }
3937         return a;
3938     }
3939     if ( bExp == 0 ) {
3940         --expDiff;
3941     }
3942     else {
3943         bSig |= LIT64( 0x4000000000000000 );
3944     }
3945     shift64RightJamming( bSig, expDiff, &bSig );
3946     aSig |= LIT64( 0x4000000000000000 );
3947  aBigger:
3948     zSig = aSig - bSig;
3949     zExp = aExp;
3950  normalizeRoundAndPack:
3951     --zExp;
3952     return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3953 
3954 }
3955 
3956 /*----------------------------------------------------------------------------
3957 | Returns the result of adding the double-precision floating-point values `a'
3958 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3959 | Binary Floating-Point Arithmetic.
3960 *----------------------------------------------------------------------------*/
3961 
3962 float64 float64_add(float64 a, float64 b, float_status *status)
3963 {
3964     flag aSign, bSign;
3965     a = float64_squash_input_denormal(a, status);
3966     b = float64_squash_input_denormal(b, status);
3967 
3968     aSign = extractFloat64Sign( a );
3969     bSign = extractFloat64Sign( b );
3970     if ( aSign == bSign ) {
3971         return addFloat64Sigs(a, b, aSign, status);
3972     }
3973     else {
3974         return subFloat64Sigs(a, b, aSign, status);
3975     }
3976 
3977 }
3978 
3979 /*----------------------------------------------------------------------------
3980 | Returns the result of subtracting the double-precision floating-point values
3981 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3982 | for Binary Floating-Point Arithmetic.
3983 *----------------------------------------------------------------------------*/
3984 
3985 float64 float64_sub(float64 a, float64 b, float_status *status)
3986 {
3987     flag aSign, bSign;
3988     a = float64_squash_input_denormal(a, status);
3989     b = float64_squash_input_denormal(b, status);
3990 
3991     aSign = extractFloat64Sign( a );
3992     bSign = extractFloat64Sign( b );
3993     if ( aSign == bSign ) {
3994         return subFloat64Sigs(a, b, aSign, status);
3995     }
3996     else {
3997         return addFloat64Sigs(a, b, aSign, status);
3998     }
3999 
4000 }
4001 
4002 /*----------------------------------------------------------------------------
4003 | Returns the result of multiplying the double-precision floating-point values
4004 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4005 | for Binary Floating-Point Arithmetic.
4006 *----------------------------------------------------------------------------*/
4007 
4008 float64 float64_mul(float64 a, float64 b, float_status *status)
4009 {
4010     flag aSign, bSign, zSign;
4011     int aExp, bExp, zExp;
4012     uint64_t aSig, bSig, zSig0, zSig1;
4013 
4014     a = float64_squash_input_denormal(a, status);
4015     b = float64_squash_input_denormal(b, status);
4016 
4017     aSig = extractFloat64Frac( a );
4018     aExp = extractFloat64Exp( a );
4019     aSign = extractFloat64Sign( a );
4020     bSig = extractFloat64Frac( b );
4021     bExp = extractFloat64Exp( b );
4022     bSign = extractFloat64Sign( b );
4023     zSign = aSign ^ bSign;
4024     if ( aExp == 0x7FF ) {
4025         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4026             return propagateFloat64NaN(a, b, status);
4027         }
4028         if ( ( bExp | bSig ) == 0 ) {
4029             float_raise(float_flag_invalid, status);
4030             return float64_default_nan(status);
4031         }
4032         return packFloat64( zSign, 0x7FF, 0 );
4033     }
4034     if ( bExp == 0x7FF ) {
4035         if (bSig) {
4036             return propagateFloat64NaN(a, b, status);
4037         }
4038         if ( ( aExp | aSig ) == 0 ) {
4039             float_raise(float_flag_invalid, status);
4040             return float64_default_nan(status);
4041         }
4042         return packFloat64( zSign, 0x7FF, 0 );
4043     }
4044     if ( aExp == 0 ) {
4045         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4046         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4047     }
4048     if ( bExp == 0 ) {
4049         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4050         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4051     }
4052     zExp = aExp + bExp - 0x3FF;
4053     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4054     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4055     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4056     zSig0 |= ( zSig1 != 0 );
4057     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4058         zSig0 <<= 1;
4059         --zExp;
4060     }
4061     return roundAndPackFloat64(zSign, zExp, zSig0, status);
4062 
4063 }
4064 
4065 /*----------------------------------------------------------------------------
4066 | Returns the result of dividing the double-precision floating-point value `a'
4067 | by the corresponding value `b'.  The operation is performed according to
4068 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4069 *----------------------------------------------------------------------------*/
4070 
4071 float64 float64_div(float64 a, float64 b, float_status *status)
4072 {
4073     flag aSign, bSign, zSign;
4074     int aExp, bExp, zExp;
4075     uint64_t aSig, bSig, zSig;
4076     uint64_t rem0, rem1;
4077     uint64_t term0, term1;
4078     a = float64_squash_input_denormal(a, status);
4079     b = float64_squash_input_denormal(b, status);
4080 
4081     aSig = extractFloat64Frac( a );
4082     aExp = extractFloat64Exp( a );
4083     aSign = extractFloat64Sign( a );
4084     bSig = extractFloat64Frac( b );
4085     bExp = extractFloat64Exp( b );
4086     bSign = extractFloat64Sign( b );
4087     zSign = aSign ^ bSign;
4088     if ( aExp == 0x7FF ) {
4089         if (aSig) {
4090             return propagateFloat64NaN(a, b, status);
4091         }
4092         if ( bExp == 0x7FF ) {
4093             if (bSig) {
4094                 return propagateFloat64NaN(a, b, status);
4095             }
4096             float_raise(float_flag_invalid, status);
4097             return float64_default_nan(status);
4098         }
4099         return packFloat64( zSign, 0x7FF, 0 );
4100     }
4101     if ( bExp == 0x7FF ) {
4102         if (bSig) {
4103             return propagateFloat64NaN(a, b, status);
4104         }
4105         return packFloat64( zSign, 0, 0 );
4106     }
4107     if ( bExp == 0 ) {
4108         if ( bSig == 0 ) {
4109             if ( ( aExp | aSig ) == 0 ) {
4110                 float_raise(float_flag_invalid, status);
4111                 return float64_default_nan(status);
4112             }
4113             float_raise(float_flag_divbyzero, status);
4114             return packFloat64( zSign, 0x7FF, 0 );
4115         }
4116         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4117     }
4118     if ( aExp == 0 ) {
4119         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4120         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4121     }
4122     zExp = aExp - bExp + 0x3FD;
4123     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4124     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4125     if ( bSig <= ( aSig + aSig ) ) {
4126         aSig >>= 1;
4127         ++zExp;
4128     }
4129     zSig = estimateDiv128To64( aSig, 0, bSig );
4130     if ( ( zSig & 0x1FF ) <= 2 ) {
4131         mul64To128( bSig, zSig, &term0, &term1 );
4132         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4133         while ( (int64_t) rem0 < 0 ) {
4134             --zSig;
4135             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4136         }
4137         zSig |= ( rem1 != 0 );
4138     }
4139     return roundAndPackFloat64(zSign, zExp, zSig, status);
4140 
4141 }
4142 
4143 /*----------------------------------------------------------------------------
4144 | Returns the remainder of the double-precision floating-point value `a'
4145 | with respect to the corresponding value `b'.  The operation is performed
4146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4147 *----------------------------------------------------------------------------*/
4148 
4149 float64 float64_rem(float64 a, float64 b, float_status *status)
4150 {
4151     flag aSign, zSign;
4152     int aExp, bExp, expDiff;
4153     uint64_t aSig, bSig;
4154     uint64_t q, alternateASig;
4155     int64_t sigMean;
4156 
4157     a = float64_squash_input_denormal(a, status);
4158     b = float64_squash_input_denormal(b, status);
4159     aSig = extractFloat64Frac( a );
4160     aExp = extractFloat64Exp( a );
4161     aSign = extractFloat64Sign( a );
4162     bSig = extractFloat64Frac( b );
4163     bExp = extractFloat64Exp( b );
4164     if ( aExp == 0x7FF ) {
4165         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4166             return propagateFloat64NaN(a, b, status);
4167         }
4168         float_raise(float_flag_invalid, status);
4169         return float64_default_nan(status);
4170     }
4171     if ( bExp == 0x7FF ) {
4172         if (bSig) {
4173             return propagateFloat64NaN(a, b, status);
4174         }
4175         return a;
4176     }
4177     if ( bExp == 0 ) {
4178         if ( bSig == 0 ) {
4179             float_raise(float_flag_invalid, status);
4180             return float64_default_nan(status);
4181         }
4182         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4183     }
4184     if ( aExp == 0 ) {
4185         if ( aSig == 0 ) return a;
4186         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4187     }
4188     expDiff = aExp - bExp;
4189     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4190     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4191     if ( expDiff < 0 ) {
4192         if ( expDiff < -1 ) return a;
4193         aSig >>= 1;
4194     }
4195     q = ( bSig <= aSig );
4196     if ( q ) aSig -= bSig;
4197     expDiff -= 64;
4198     while ( 0 < expDiff ) {
4199         q = estimateDiv128To64( aSig, 0, bSig );
4200         q = ( 2 < q ) ? q - 2 : 0;
4201         aSig = - ( ( bSig>>2 ) * q );
4202         expDiff -= 62;
4203     }
4204     expDiff += 64;
4205     if ( 0 < expDiff ) {
4206         q = estimateDiv128To64( aSig, 0, bSig );
4207         q = ( 2 < q ) ? q - 2 : 0;
4208         q >>= 64 - expDiff;
4209         bSig >>= 2;
4210         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4211     }
4212     else {
4213         aSig >>= 2;
4214         bSig >>= 2;
4215     }
4216     do {
4217         alternateASig = aSig;
4218         ++q;
4219         aSig -= bSig;
4220     } while ( 0 <= (int64_t) aSig );
4221     sigMean = aSig + alternateASig;
4222     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4223         aSig = alternateASig;
4224     }
4225     zSign = ( (int64_t) aSig < 0 );
4226     if ( zSign ) aSig = - aSig;
4227     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4228 
4229 }
4230 
4231 /*----------------------------------------------------------------------------
4232 | Returns the result of multiplying the double-precision floating-point values
4233 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4234 | multiplication.  The operation is performed according to the IEC/IEEE
4235 | Standard for Binary Floating-Point Arithmetic 754-2008.
4236 | The flags argument allows the caller to select negation of the
4237 | addend, the intermediate product, or the final result. (The difference
4238 | between this and having the caller do a separate negation is that negating
4239 | externally will flip the sign bit on NaNs.)
4240 *----------------------------------------------------------------------------*/
4241 
4242 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4243                        float_status *status)
4244 {
4245     flag aSign, bSign, cSign, zSign;
4246     int aExp, bExp, cExp, pExp, zExp, expDiff;
4247     uint64_t aSig, bSig, cSig;
4248     flag pInf, pZero, pSign;
4249     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4250     int shiftcount;
4251     flag signflip, infzero;
4252 
4253     a = float64_squash_input_denormal(a, status);
4254     b = float64_squash_input_denormal(b, status);
4255     c = float64_squash_input_denormal(c, status);
4256     aSig = extractFloat64Frac(a);
4257     aExp = extractFloat64Exp(a);
4258     aSign = extractFloat64Sign(a);
4259     bSig = extractFloat64Frac(b);
4260     bExp = extractFloat64Exp(b);
4261     bSign = extractFloat64Sign(b);
4262     cSig = extractFloat64Frac(c);
4263     cExp = extractFloat64Exp(c);
4264     cSign = extractFloat64Sign(c);
4265 
4266     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4267                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4268 
4269     /* It is implementation-defined whether the cases of (0,inf,qnan)
4270      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4271      * they return if they do), so we have to hand this information
4272      * off to the target-specific pick-a-NaN routine.
4273      */
4274     if (((aExp == 0x7ff) && aSig) ||
4275         ((bExp == 0x7ff) && bSig) ||
4276         ((cExp == 0x7ff) && cSig)) {
4277         return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4278     }
4279 
4280     if (infzero) {
4281         float_raise(float_flag_invalid, status);
4282         return float64_default_nan(status);
4283     }
4284 
4285     if (flags & float_muladd_negate_c) {
4286         cSign ^= 1;
4287     }
4288 
4289     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4290 
4291     /* Work out the sign and type of the product */
4292     pSign = aSign ^ bSign;
4293     if (flags & float_muladd_negate_product) {
4294         pSign ^= 1;
4295     }
4296     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4297     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4298 
4299     if (cExp == 0x7ff) {
4300         if (pInf && (pSign ^ cSign)) {
4301             /* addition of opposite-signed infinities => InvalidOperation */
4302             float_raise(float_flag_invalid, status);
4303             return float64_default_nan(status);
4304         }
4305         /* Otherwise generate an infinity of the same sign */
4306         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4307     }
4308 
4309     if (pInf) {
4310         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4311     }
4312 
4313     if (pZero) {
4314         if (cExp == 0) {
4315             if (cSig == 0) {
4316                 /* Adding two exact zeroes */
4317                 if (pSign == cSign) {
4318                     zSign = pSign;
4319                 } else if (status->float_rounding_mode == float_round_down) {
4320                     zSign = 1;
4321                 } else {
4322                     zSign = 0;
4323                 }
4324                 return packFloat64(zSign ^ signflip, 0, 0);
4325             }
4326             /* Exact zero plus a denorm */
4327             if (status->flush_to_zero) {
4328                 float_raise(float_flag_output_denormal, status);
4329                 return packFloat64(cSign ^ signflip, 0, 0);
4330             }
4331         }
4332         /* Zero plus something non-zero : just return the something */
4333         if (flags & float_muladd_halve_result) {
4334             if (cExp == 0) {
4335                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4336             }
4337             /* Subtract one to halve, and one again because roundAndPackFloat64
4338              * wants one less than the true exponent.
4339              */
4340             cExp -= 2;
4341             cSig = (cSig | 0x0010000000000000ULL) << 10;
4342             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4343         }
4344         return packFloat64(cSign ^ signflip, cExp, cSig);
4345     }
4346 
4347     if (aExp == 0) {
4348         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4349     }
4350     if (bExp == 0) {
4351         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4352     }
4353 
4354     /* Calculate the actual result a * b + c */
4355 
4356     /* Multiply first; this is easy. */
4357     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4358      * because we want the true exponent, not the "one-less-than"
4359      * flavour that roundAndPackFloat64() takes.
4360      */
4361     pExp = aExp + bExp - 0x3fe;
4362     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4363     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4364     mul64To128(aSig, bSig, &pSig0, &pSig1);
4365     if ((int64_t)(pSig0 << 1) >= 0) {
4366         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4367         pExp--;
4368     }
4369 
4370     zSign = pSign ^ signflip;
4371 
4372     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4373      * bit in position 126.
4374      */
4375     if (cExp == 0) {
4376         if (!cSig) {
4377             /* Throw out the special case of c being an exact zero now */
4378             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4379             if (flags & float_muladd_halve_result) {
4380                 pExp--;
4381             }
4382             return roundAndPackFloat64(zSign, pExp - 1,
4383                                        pSig1, status);
4384         }
4385         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4386     }
4387 
4388     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4389      * significand of the addend, with the explicit bit in position 126.
4390      */
4391     cSig0 = cSig << (126 - 64 - 52);
4392     cSig1 = 0;
4393     cSig0 |= LIT64(0x4000000000000000);
4394     expDiff = pExp - cExp;
4395 
4396     if (pSign == cSign) {
4397         /* Addition */
4398         if (expDiff > 0) {
4399             /* scale c to match p */
4400             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4401             zExp = pExp;
4402         } else if (expDiff < 0) {
4403             /* scale p to match c */
4404             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4405             zExp = cExp;
4406         } else {
4407             /* no scaling needed */
4408             zExp = cExp;
4409         }
4410         /* Add significands and make sure explicit bit ends up in posn 126 */
4411         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4412         if ((int64_t)zSig0 < 0) {
4413             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4414         } else {
4415             zExp--;
4416         }
4417         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4418         if (flags & float_muladd_halve_result) {
4419             zExp--;
4420         }
4421         return roundAndPackFloat64(zSign, zExp, zSig1, status);
4422     } else {
4423         /* Subtraction */
4424         if (expDiff > 0) {
4425             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4426             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4427             zExp = pExp;
4428         } else if (expDiff < 0) {
4429             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4430             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4431             zExp = cExp;
4432             zSign ^= 1;
4433         } else {
4434             zExp = pExp;
4435             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4436                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4437             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4438                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4439                 zSign ^= 1;
4440             } else {
4441                 /* Exact zero */
4442                 zSign = signflip;
4443                 if (status->float_rounding_mode == float_round_down) {
4444                     zSign ^= 1;
4445                 }
4446                 return packFloat64(zSign, 0, 0);
4447             }
4448         }
4449         --zExp;
4450         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4451          * starting with the significand in a pair of uint64_t.
4452          */
4453         if (zSig0) {
4454             shiftcount = countLeadingZeros64(zSig0) - 1;
4455             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4456             if (zSig1) {
4457                 zSig0 |= 1;
4458             }
4459             zExp -= shiftcount;
4460         } else {
4461             shiftcount = countLeadingZeros64(zSig1);
4462             if (shiftcount == 0) {
4463                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4464                 zExp -= 63;
4465             } else {
4466                 shiftcount--;
4467                 zSig0 = zSig1 << shiftcount;
4468                 zExp -= (shiftcount + 64);
4469             }
4470         }
4471         if (flags & float_muladd_halve_result) {
4472             zExp--;
4473         }
4474         return roundAndPackFloat64(zSign, zExp, zSig0, status);
4475     }
4476 }
4477 
4478 /*----------------------------------------------------------------------------
4479 | Returns the square root of the double-precision floating-point value `a'.
4480 | The operation is performed according to the IEC/IEEE Standard for Binary
4481 | Floating-Point Arithmetic.
4482 *----------------------------------------------------------------------------*/
4483 
4484 float64 float64_sqrt(float64 a, float_status *status)
4485 {
4486     flag aSign;
4487     int aExp, zExp;
4488     uint64_t aSig, zSig, doubleZSig;
4489     uint64_t rem0, rem1, term0, term1;
4490     a = float64_squash_input_denormal(a, status);
4491 
4492     aSig = extractFloat64Frac( a );
4493     aExp = extractFloat64Exp( a );
4494     aSign = extractFloat64Sign( a );
4495     if ( aExp == 0x7FF ) {
4496         if (aSig) {
4497             return propagateFloat64NaN(a, a, status);
4498         }
4499         if ( ! aSign ) return a;
4500         float_raise(float_flag_invalid, status);
4501         return float64_default_nan(status);
4502     }
4503     if ( aSign ) {
4504         if ( ( aExp | aSig ) == 0 ) return a;
4505         float_raise(float_flag_invalid, status);
4506         return float64_default_nan(status);
4507     }
4508     if ( aExp == 0 ) {
4509         if ( aSig == 0 ) return float64_zero;
4510         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4511     }
4512     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4513     aSig |= LIT64( 0x0010000000000000 );
4514     zSig = estimateSqrt32( aExp, aSig>>21 );
4515     aSig <<= 9 - ( aExp & 1 );
4516     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4517     if ( ( zSig & 0x1FF ) <= 5 ) {
4518         doubleZSig = zSig<<1;
4519         mul64To128( zSig, zSig, &term0, &term1 );
4520         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4521         while ( (int64_t) rem0 < 0 ) {
4522             --zSig;
4523             doubleZSig -= 2;
4524             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4525         }
4526         zSig |= ( ( rem0 | rem1 ) != 0 );
4527     }
4528     return roundAndPackFloat64(0, zExp, zSig, status);
4529 
4530 }
4531 
4532 /*----------------------------------------------------------------------------
4533 | Returns the binary log of the double-precision floating-point value `a'.
4534 | The operation is performed according to the IEC/IEEE Standard for Binary
4535 | Floating-Point Arithmetic.
4536 *----------------------------------------------------------------------------*/
4537 float64 float64_log2(float64 a, float_status *status)
4538 {
4539     flag aSign, zSign;
4540     int aExp;
4541     uint64_t aSig, aSig0, aSig1, zSig, i;
4542     a = float64_squash_input_denormal(a, status);
4543 
4544     aSig = extractFloat64Frac( a );
4545     aExp = extractFloat64Exp( a );
4546     aSign = extractFloat64Sign( a );
4547 
4548     if ( aExp == 0 ) {
4549         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4550         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4551     }
4552     if ( aSign ) {
4553         float_raise(float_flag_invalid, status);
4554         return float64_default_nan(status);
4555     }
4556     if ( aExp == 0x7FF ) {
4557         if (aSig) {
4558             return propagateFloat64NaN(a, float64_zero, status);
4559         }
4560         return a;
4561     }
4562 
4563     aExp -= 0x3FF;
4564     aSig |= LIT64( 0x0010000000000000 );
4565     zSign = aExp < 0;
4566     zSig = (uint64_t)aExp << 52;
4567     for (i = 1LL << 51; i > 0; i >>= 1) {
4568         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4569         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4570         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4571             aSig >>= 1;
4572             zSig |= i;
4573         }
4574     }
4575 
4576     if ( zSign )
4577         zSig = -zSig;
4578     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4579 }
4580 
4581 /*----------------------------------------------------------------------------
4582 | Returns 1 if the double-precision floating-point value `a' is equal to the
4583 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4584 | if either operand is a NaN.  Otherwise, the comparison is performed
4585 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4586 *----------------------------------------------------------------------------*/
4587 
4588 int float64_eq(float64 a, float64 b, float_status *status)
4589 {
4590     uint64_t av, bv;
4591     a = float64_squash_input_denormal(a, status);
4592     b = float64_squash_input_denormal(b, status);
4593 
4594     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4595          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4596        ) {
4597         float_raise(float_flag_invalid, status);
4598         return 0;
4599     }
4600     av = float64_val(a);
4601     bv = float64_val(b);
4602     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4603 
4604 }
4605 
4606 /*----------------------------------------------------------------------------
4607 | Returns 1 if the double-precision floating-point value `a' is less than or
4608 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4609 | exception is raised if either operand is a NaN.  The comparison is performed
4610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4611 *----------------------------------------------------------------------------*/
4612 
4613 int float64_le(float64 a, float64 b, float_status *status)
4614 {
4615     flag aSign, bSign;
4616     uint64_t av, bv;
4617     a = float64_squash_input_denormal(a, status);
4618     b = float64_squash_input_denormal(b, status);
4619 
4620     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4621          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4622        ) {
4623         float_raise(float_flag_invalid, status);
4624         return 0;
4625     }
4626     aSign = extractFloat64Sign( a );
4627     bSign = extractFloat64Sign( b );
4628     av = float64_val(a);
4629     bv = float64_val(b);
4630     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4631     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4632 
4633 }
4634 
4635 /*----------------------------------------------------------------------------
4636 | Returns 1 if the double-precision floating-point value `a' is less than
4637 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4638 | raised if either operand is a NaN.  The comparison is performed according
4639 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4640 *----------------------------------------------------------------------------*/
4641 
4642 int float64_lt(float64 a, float64 b, float_status *status)
4643 {
4644     flag aSign, bSign;
4645     uint64_t av, bv;
4646 
4647     a = float64_squash_input_denormal(a, status);
4648     b = float64_squash_input_denormal(b, status);
4649     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4650          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4651        ) {
4652         float_raise(float_flag_invalid, status);
4653         return 0;
4654     }
4655     aSign = extractFloat64Sign( a );
4656     bSign = extractFloat64Sign( b );
4657     av = float64_val(a);
4658     bv = float64_val(b);
4659     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4660     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4661 
4662 }
4663 
4664 /*----------------------------------------------------------------------------
4665 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4666 | be compared, and 0 otherwise.  The invalid exception is raised if either
4667 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4668 | Standard for Binary Floating-Point Arithmetic.
4669 *----------------------------------------------------------------------------*/
4670 
4671 int float64_unordered(float64 a, float64 b, float_status *status)
4672 {
4673     a = float64_squash_input_denormal(a, status);
4674     b = float64_squash_input_denormal(b, status);
4675 
4676     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4677          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4678        ) {
4679         float_raise(float_flag_invalid, status);
4680         return 1;
4681     }
4682     return 0;
4683 }
4684 
4685 /*----------------------------------------------------------------------------
4686 | Returns 1 if the double-precision floating-point value `a' is equal to the
4687 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4688 | exception.The comparison is performed according to the IEC/IEEE Standard
4689 | for Binary Floating-Point Arithmetic.
4690 *----------------------------------------------------------------------------*/
4691 
4692 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4693 {
4694     uint64_t av, bv;
4695     a = float64_squash_input_denormal(a, status);
4696     b = float64_squash_input_denormal(b, status);
4697 
4698     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4699          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4700        ) {
4701         if (float64_is_signaling_nan(a, status)
4702          || float64_is_signaling_nan(b, status)) {
4703             float_raise(float_flag_invalid, status);
4704         }
4705         return 0;
4706     }
4707     av = float64_val(a);
4708     bv = float64_val(b);
4709     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4710 
4711 }
4712 
4713 /*----------------------------------------------------------------------------
4714 | Returns 1 if the double-precision floating-point value `a' is less than or
4715 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4716 | cause an exception.  Otherwise, the comparison is performed according to the
4717 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4718 *----------------------------------------------------------------------------*/
4719 
4720 int float64_le_quiet(float64 a, float64 b, float_status *status)
4721 {
4722     flag aSign, bSign;
4723     uint64_t av, bv;
4724     a = float64_squash_input_denormal(a, status);
4725     b = float64_squash_input_denormal(b, status);
4726 
4727     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4728          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4729        ) {
4730         if (float64_is_signaling_nan(a, status)
4731          || float64_is_signaling_nan(b, status)) {
4732             float_raise(float_flag_invalid, status);
4733         }
4734         return 0;
4735     }
4736     aSign = extractFloat64Sign( a );
4737     bSign = extractFloat64Sign( b );
4738     av = float64_val(a);
4739     bv = float64_val(b);
4740     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4741     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4742 
4743 }
4744 
4745 /*----------------------------------------------------------------------------
4746 | Returns 1 if the double-precision floating-point value `a' is less than
4747 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4748 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4749 | Standard for Binary Floating-Point Arithmetic.
4750 *----------------------------------------------------------------------------*/
4751 
4752 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4753 {
4754     flag aSign, bSign;
4755     uint64_t av, bv;
4756     a = float64_squash_input_denormal(a, status);
4757     b = float64_squash_input_denormal(b, status);
4758 
4759     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4760          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4761        ) {
4762         if (float64_is_signaling_nan(a, status)
4763          || float64_is_signaling_nan(b, status)) {
4764             float_raise(float_flag_invalid, status);
4765         }
4766         return 0;
4767     }
4768     aSign = extractFloat64Sign( a );
4769     bSign = extractFloat64Sign( b );
4770     av = float64_val(a);
4771     bv = float64_val(b);
4772     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4773     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4774 
4775 }
4776 
4777 /*----------------------------------------------------------------------------
4778 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4779 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4780 | comparison is performed according to the IEC/IEEE Standard for Binary
4781 | Floating-Point Arithmetic.
4782 *----------------------------------------------------------------------------*/
4783 
4784 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4785 {
4786     a = float64_squash_input_denormal(a, status);
4787     b = float64_squash_input_denormal(b, status);
4788 
4789     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4790          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4791        ) {
4792         if (float64_is_signaling_nan(a, status)
4793          || float64_is_signaling_nan(b, status)) {
4794             float_raise(float_flag_invalid, status);
4795         }
4796         return 1;
4797     }
4798     return 0;
4799 }
4800 
4801 /*----------------------------------------------------------------------------
4802 | Returns the result of converting the extended double-precision floating-
4803 | point value `a' to the 32-bit two's complement integer format.  The
4804 | conversion is performed according to the IEC/IEEE Standard for Binary
4805 | Floating-Point Arithmetic---which means in particular that the conversion
4806 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4807 | largest positive integer is returned.  Otherwise, if the conversion
4808 | overflows, the largest integer with the same sign as `a' is returned.
4809 *----------------------------------------------------------------------------*/
4810 
4811 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4812 {
4813     flag aSign;
4814     int32_t aExp, shiftCount;
4815     uint64_t aSig;
4816 
4817     if (floatx80_invalid_encoding(a)) {
4818         float_raise(float_flag_invalid, status);
4819         return 1 << 31;
4820     }
4821     aSig = extractFloatx80Frac( a );
4822     aExp = extractFloatx80Exp( a );
4823     aSign = extractFloatx80Sign( a );
4824     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4825     shiftCount = 0x4037 - aExp;
4826     if ( shiftCount <= 0 ) shiftCount = 1;
4827     shift64RightJamming( aSig, shiftCount, &aSig );
4828     return roundAndPackInt32(aSign, aSig, status);
4829 
4830 }
4831 
4832 /*----------------------------------------------------------------------------
4833 | Returns the result of converting the extended double-precision floating-
4834 | point value `a' to the 32-bit two's complement integer format.  The
4835 | conversion is performed according to the IEC/IEEE Standard for Binary
4836 | Floating-Point Arithmetic, except that the conversion is always rounded
4837 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4838 | Otherwise, if the conversion overflows, the largest integer with the same
4839 | sign as `a' is returned.
4840 *----------------------------------------------------------------------------*/
4841 
4842 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4843 {
4844     flag aSign;
4845     int32_t aExp, shiftCount;
4846     uint64_t aSig, savedASig;
4847     int32_t z;
4848 
4849     if (floatx80_invalid_encoding(a)) {
4850         float_raise(float_flag_invalid, status);
4851         return 1 << 31;
4852     }
4853     aSig = extractFloatx80Frac( a );
4854     aExp = extractFloatx80Exp( a );
4855     aSign = extractFloatx80Sign( a );
4856     if ( 0x401E < aExp ) {
4857         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4858         goto invalid;
4859     }
4860     else if ( aExp < 0x3FFF ) {
4861         if (aExp || aSig) {
4862             status->float_exception_flags |= float_flag_inexact;
4863         }
4864         return 0;
4865     }
4866     shiftCount = 0x403E - aExp;
4867     savedASig = aSig;
4868     aSig >>= shiftCount;
4869     z = aSig;
4870     if ( aSign ) z = - z;
4871     if ( ( z < 0 ) ^ aSign ) {
4872  invalid:
4873         float_raise(float_flag_invalid, status);
4874         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4875     }
4876     if ( ( aSig<<shiftCount ) != savedASig ) {
4877         status->float_exception_flags |= float_flag_inexact;
4878     }
4879     return z;
4880 
4881 }
4882 
4883 /*----------------------------------------------------------------------------
4884 | Returns the result of converting the extended double-precision floating-
4885 | point value `a' to the 64-bit two's complement integer format.  The
4886 | conversion is performed according to the IEC/IEEE Standard for Binary
4887 | Floating-Point Arithmetic---which means in particular that the conversion
4888 | is rounded according to the current rounding mode.  If `a' is a NaN,
4889 | the largest positive integer is returned.  Otherwise, if the conversion
4890 | overflows, the largest integer with the same sign as `a' is returned.
4891 *----------------------------------------------------------------------------*/
4892 
4893 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4894 {
4895     flag aSign;
4896     int32_t aExp, shiftCount;
4897     uint64_t aSig, aSigExtra;
4898 
4899     if (floatx80_invalid_encoding(a)) {
4900         float_raise(float_flag_invalid, status);
4901         return 1ULL << 63;
4902     }
4903     aSig = extractFloatx80Frac( a );
4904     aExp = extractFloatx80Exp( a );
4905     aSign = extractFloatx80Sign( a );
4906     shiftCount = 0x403E - aExp;
4907     if ( shiftCount <= 0 ) {
4908         if ( shiftCount ) {
4909             float_raise(float_flag_invalid, status);
4910             if (    ! aSign
4911                  || (    ( aExp == 0x7FFF )
4912                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4913                ) {
4914                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4915             }
4916             return (int64_t) LIT64( 0x8000000000000000 );
4917         }
4918         aSigExtra = 0;
4919     }
4920     else {
4921         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4922     }
4923     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4924 
4925 }
4926 
4927 /*----------------------------------------------------------------------------
4928 | Returns the result of converting the extended double-precision floating-
4929 | point value `a' to the 64-bit two's complement integer format.  The
4930 | conversion is performed according to the IEC/IEEE Standard for Binary
4931 | Floating-Point Arithmetic, except that the conversion is always rounded
4932 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4933 | Otherwise, if the conversion overflows, the largest integer with the same
4934 | sign as `a' is returned.
4935 *----------------------------------------------------------------------------*/
4936 
4937 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4938 {
4939     flag aSign;
4940     int32_t aExp, shiftCount;
4941     uint64_t aSig;
4942     int64_t z;
4943 
4944     if (floatx80_invalid_encoding(a)) {
4945         float_raise(float_flag_invalid, status);
4946         return 1ULL << 63;
4947     }
4948     aSig = extractFloatx80Frac( a );
4949     aExp = extractFloatx80Exp( a );
4950     aSign = extractFloatx80Sign( a );
4951     shiftCount = aExp - 0x403E;
4952     if ( 0 <= shiftCount ) {
4953         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4954         if ( ( a.high != 0xC03E ) || aSig ) {
4955             float_raise(float_flag_invalid, status);
4956             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4957                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4958             }
4959         }
4960         return (int64_t) LIT64( 0x8000000000000000 );
4961     }
4962     else if ( aExp < 0x3FFF ) {
4963         if (aExp | aSig) {
4964             status->float_exception_flags |= float_flag_inexact;
4965         }
4966         return 0;
4967     }
4968     z = aSig>>( - shiftCount );
4969     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4970         status->float_exception_flags |= float_flag_inexact;
4971     }
4972     if ( aSign ) z = - z;
4973     return z;
4974 
4975 }
4976 
4977 /*----------------------------------------------------------------------------
4978 | Returns the result of converting the extended double-precision floating-
4979 | point value `a' to the single-precision floating-point format.  The
4980 | conversion is performed according to the IEC/IEEE Standard for Binary
4981 | Floating-Point Arithmetic.
4982 *----------------------------------------------------------------------------*/
4983 
4984 float32 floatx80_to_float32(floatx80 a, float_status *status)
4985 {
4986     flag aSign;
4987     int32_t aExp;
4988     uint64_t aSig;
4989 
4990     if (floatx80_invalid_encoding(a)) {
4991         float_raise(float_flag_invalid, status);
4992         return float32_default_nan(status);
4993     }
4994     aSig = extractFloatx80Frac( a );
4995     aExp = extractFloatx80Exp( a );
4996     aSign = extractFloatx80Sign( a );
4997     if ( aExp == 0x7FFF ) {
4998         if ( (uint64_t) ( aSig<<1 ) ) {
4999             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5000         }
5001         return packFloat32( aSign, 0xFF, 0 );
5002     }
5003     shift64RightJamming( aSig, 33, &aSig );
5004     if ( aExp || aSig ) aExp -= 0x3F81;
5005     return roundAndPackFloat32(aSign, aExp, aSig, status);
5006 
5007 }
5008 
5009 /*----------------------------------------------------------------------------
5010 | Returns the result of converting the extended double-precision floating-
5011 | point value `a' to the double-precision floating-point format.  The
5012 | conversion is performed according to the IEC/IEEE Standard for Binary
5013 | Floating-Point Arithmetic.
5014 *----------------------------------------------------------------------------*/
5015 
5016 float64 floatx80_to_float64(floatx80 a, float_status *status)
5017 {
5018     flag aSign;
5019     int32_t aExp;
5020     uint64_t aSig, zSig;
5021 
5022     if (floatx80_invalid_encoding(a)) {
5023         float_raise(float_flag_invalid, status);
5024         return float64_default_nan(status);
5025     }
5026     aSig = extractFloatx80Frac( a );
5027     aExp = extractFloatx80Exp( a );
5028     aSign = extractFloatx80Sign( a );
5029     if ( aExp == 0x7FFF ) {
5030         if ( (uint64_t) ( aSig<<1 ) ) {
5031             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5032         }
5033         return packFloat64( aSign, 0x7FF, 0 );
5034     }
5035     shift64RightJamming( aSig, 1, &zSig );
5036     if ( aExp || aSig ) aExp -= 0x3C01;
5037     return roundAndPackFloat64(aSign, aExp, zSig, status);
5038 
5039 }
5040 
5041 /*----------------------------------------------------------------------------
5042 | Returns the result of converting the extended double-precision floating-
5043 | point value `a' to the quadruple-precision floating-point format.  The
5044 | conversion is performed according to the IEC/IEEE Standard for Binary
5045 | Floating-Point Arithmetic.
5046 *----------------------------------------------------------------------------*/
5047 
5048 float128 floatx80_to_float128(floatx80 a, float_status *status)
5049 {
5050     flag aSign;
5051     int aExp;
5052     uint64_t aSig, zSig0, zSig1;
5053 
5054     if (floatx80_invalid_encoding(a)) {
5055         float_raise(float_flag_invalid, status);
5056         return float128_default_nan(status);
5057     }
5058     aSig = extractFloatx80Frac( a );
5059     aExp = extractFloatx80Exp( a );
5060     aSign = extractFloatx80Sign( a );
5061     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5062         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5063     }
5064     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5065     return packFloat128( aSign, aExp, zSig0, zSig1 );
5066 
5067 }
5068 
5069 /*----------------------------------------------------------------------------
5070 | Rounds the extended double-precision floating-point value `a' to an integer,
5071 | and returns the result as an extended quadruple-precision floating-point
5072 | value.  The operation is performed according to the IEC/IEEE Standard for
5073 | Binary Floating-Point Arithmetic.
5074 *----------------------------------------------------------------------------*/
5075 
5076 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5077 {
5078     flag aSign;
5079     int32_t aExp;
5080     uint64_t lastBitMask, roundBitsMask;
5081     floatx80 z;
5082 
5083     if (floatx80_invalid_encoding(a)) {
5084         float_raise(float_flag_invalid, status);
5085         return floatx80_default_nan(status);
5086     }
5087     aExp = extractFloatx80Exp( a );
5088     if ( 0x403E <= aExp ) {
5089         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5090             return propagateFloatx80NaN(a, a, status);
5091         }
5092         return a;
5093     }
5094     if ( aExp < 0x3FFF ) {
5095         if (    ( aExp == 0 )
5096              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5097             return a;
5098         }
5099         status->float_exception_flags |= float_flag_inexact;
5100         aSign = extractFloatx80Sign( a );
5101         switch (status->float_rounding_mode) {
5102          case float_round_nearest_even:
5103             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5104                ) {
5105                 return
5106                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5107             }
5108             break;
5109         case float_round_ties_away:
5110             if (aExp == 0x3FFE) {
5111                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5112             }
5113             break;
5114          case float_round_down:
5115             return
5116                   aSign ?
5117                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5118                 : packFloatx80( 0, 0, 0 );
5119          case float_round_up:
5120             return
5121                   aSign ? packFloatx80( 1, 0, 0 )
5122                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5123         }
5124         return packFloatx80( aSign, 0, 0 );
5125     }
5126     lastBitMask = 1;
5127     lastBitMask <<= 0x403E - aExp;
5128     roundBitsMask = lastBitMask - 1;
5129     z = a;
5130     switch (status->float_rounding_mode) {
5131     case float_round_nearest_even:
5132         z.low += lastBitMask>>1;
5133         if ((z.low & roundBitsMask) == 0) {
5134             z.low &= ~lastBitMask;
5135         }
5136         break;
5137     case float_round_ties_away:
5138         z.low += lastBitMask >> 1;
5139         break;
5140     case float_round_to_zero:
5141         break;
5142     case float_round_up:
5143         if (!extractFloatx80Sign(z)) {
5144             z.low += roundBitsMask;
5145         }
5146         break;
5147     case float_round_down:
5148         if (extractFloatx80Sign(z)) {
5149             z.low += roundBitsMask;
5150         }
5151         break;
5152     default:
5153         abort();
5154     }
5155     z.low &= ~ roundBitsMask;
5156     if ( z.low == 0 ) {
5157         ++z.high;
5158         z.low = LIT64( 0x8000000000000000 );
5159     }
5160     if (z.low != a.low) {
5161         status->float_exception_flags |= float_flag_inexact;
5162     }
5163     return z;
5164 
5165 }
5166 
5167 /*----------------------------------------------------------------------------
5168 | Returns the result of adding the absolute values of the extended double-
5169 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5170 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5171 | The addition is performed according to the IEC/IEEE Standard for Binary
5172 | Floating-Point Arithmetic.
5173 *----------------------------------------------------------------------------*/
5174 
5175 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5176                                 float_status *status)
5177 {
5178     int32_t aExp, bExp, zExp;
5179     uint64_t aSig, bSig, zSig0, zSig1;
5180     int32_t expDiff;
5181 
5182     aSig = extractFloatx80Frac( a );
5183     aExp = extractFloatx80Exp( a );
5184     bSig = extractFloatx80Frac( b );
5185     bExp = extractFloatx80Exp( b );
5186     expDiff = aExp - bExp;
5187     if ( 0 < expDiff ) {
5188         if ( aExp == 0x7FFF ) {
5189             if ((uint64_t)(aSig << 1)) {
5190                 return propagateFloatx80NaN(a, b, status);
5191             }
5192             return a;
5193         }
5194         if ( bExp == 0 ) --expDiff;
5195         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5196         zExp = aExp;
5197     }
5198     else if ( expDiff < 0 ) {
5199         if ( bExp == 0x7FFF ) {
5200             if ((uint64_t)(bSig << 1)) {
5201                 return propagateFloatx80NaN(a, b, status);
5202             }
5203             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5204         }
5205         if ( aExp == 0 ) ++expDiff;
5206         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5207         zExp = bExp;
5208     }
5209     else {
5210         if ( aExp == 0x7FFF ) {
5211             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5212                 return propagateFloatx80NaN(a, b, status);
5213             }
5214             return a;
5215         }
5216         zSig1 = 0;
5217         zSig0 = aSig + bSig;
5218         if ( aExp == 0 ) {
5219             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5220             goto roundAndPack;
5221         }
5222         zExp = aExp;
5223         goto shiftRight1;
5224     }
5225     zSig0 = aSig + bSig;
5226     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5227  shiftRight1:
5228     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5229     zSig0 |= LIT64( 0x8000000000000000 );
5230     ++zExp;
5231  roundAndPack:
5232     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5233                                 zSign, zExp, zSig0, zSig1, status);
5234 }
5235 
5236 /*----------------------------------------------------------------------------
5237 | Returns the result of subtracting the absolute values of the extended
5238 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5239 | difference is negated before being returned.  `zSign' is ignored if the
5240 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5241 | Standard for Binary Floating-Point Arithmetic.
5242 *----------------------------------------------------------------------------*/
5243 
5244 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5245                                 float_status *status)
5246 {
5247     int32_t aExp, bExp, zExp;
5248     uint64_t aSig, bSig, zSig0, zSig1;
5249     int32_t expDiff;
5250 
5251     aSig = extractFloatx80Frac( a );
5252     aExp = extractFloatx80Exp( a );
5253     bSig = extractFloatx80Frac( b );
5254     bExp = extractFloatx80Exp( b );
5255     expDiff = aExp - bExp;
5256     if ( 0 < expDiff ) goto aExpBigger;
5257     if ( expDiff < 0 ) goto bExpBigger;
5258     if ( aExp == 0x7FFF ) {
5259         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5260             return propagateFloatx80NaN(a, b, status);
5261         }
5262         float_raise(float_flag_invalid, status);
5263         return floatx80_default_nan(status);
5264     }
5265     if ( aExp == 0 ) {
5266         aExp = 1;
5267         bExp = 1;
5268     }
5269     zSig1 = 0;
5270     if ( bSig < aSig ) goto aBigger;
5271     if ( aSig < bSig ) goto bBigger;
5272     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5273  bExpBigger:
5274     if ( bExp == 0x7FFF ) {
5275         if ((uint64_t)(bSig << 1)) {
5276             return propagateFloatx80NaN(a, b, status);
5277         }
5278         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5279     }
5280     if ( aExp == 0 ) ++expDiff;
5281     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5282  bBigger:
5283     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5284     zExp = bExp;
5285     zSign ^= 1;
5286     goto normalizeRoundAndPack;
5287  aExpBigger:
5288     if ( aExp == 0x7FFF ) {
5289         if ((uint64_t)(aSig << 1)) {
5290             return propagateFloatx80NaN(a, b, status);
5291         }
5292         return a;
5293     }
5294     if ( bExp == 0 ) --expDiff;
5295     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5296  aBigger:
5297     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5298     zExp = aExp;
5299  normalizeRoundAndPack:
5300     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5301                                          zSign, zExp, zSig0, zSig1, status);
5302 }
5303 
5304 /*----------------------------------------------------------------------------
5305 | Returns the result of adding the extended double-precision floating-point
5306 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5307 | Standard for Binary Floating-Point Arithmetic.
5308 *----------------------------------------------------------------------------*/
5309 
5310 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5311 {
5312     flag aSign, bSign;
5313 
5314     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5315         float_raise(float_flag_invalid, status);
5316         return floatx80_default_nan(status);
5317     }
5318     aSign = extractFloatx80Sign( a );
5319     bSign = extractFloatx80Sign( b );
5320     if ( aSign == bSign ) {
5321         return addFloatx80Sigs(a, b, aSign, status);
5322     }
5323     else {
5324         return subFloatx80Sigs(a, b, aSign, status);
5325     }
5326 
5327 }
5328 
5329 /*----------------------------------------------------------------------------
5330 | Returns the result of subtracting the extended double-precision floating-
5331 | point values `a' and `b'.  The operation is performed according to the
5332 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5333 *----------------------------------------------------------------------------*/
5334 
5335 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5336 {
5337     flag aSign, bSign;
5338 
5339     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5340         float_raise(float_flag_invalid, status);
5341         return floatx80_default_nan(status);
5342     }
5343     aSign = extractFloatx80Sign( a );
5344     bSign = extractFloatx80Sign( b );
5345     if ( aSign == bSign ) {
5346         return subFloatx80Sigs(a, b, aSign, status);
5347     }
5348     else {
5349         return addFloatx80Sigs(a, b, aSign, status);
5350     }
5351 
5352 }
5353 
5354 /*----------------------------------------------------------------------------
5355 | Returns the result of multiplying the extended double-precision floating-
5356 | point values `a' and `b'.  The operation is performed according to the
5357 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5358 *----------------------------------------------------------------------------*/
5359 
5360 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5361 {
5362     flag aSign, bSign, zSign;
5363     int32_t aExp, bExp, zExp;
5364     uint64_t aSig, bSig, zSig0, zSig1;
5365 
5366     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5367         float_raise(float_flag_invalid, status);
5368         return floatx80_default_nan(status);
5369     }
5370     aSig = extractFloatx80Frac( a );
5371     aExp = extractFloatx80Exp( a );
5372     aSign = extractFloatx80Sign( a );
5373     bSig = extractFloatx80Frac( b );
5374     bExp = extractFloatx80Exp( b );
5375     bSign = extractFloatx80Sign( b );
5376     zSign = aSign ^ bSign;
5377     if ( aExp == 0x7FFF ) {
5378         if (    (uint64_t) ( aSig<<1 )
5379              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5380             return propagateFloatx80NaN(a, b, status);
5381         }
5382         if ( ( bExp | bSig ) == 0 ) goto invalid;
5383         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5384     }
5385     if ( bExp == 0x7FFF ) {
5386         if ((uint64_t)(bSig << 1)) {
5387             return propagateFloatx80NaN(a, b, status);
5388         }
5389         if ( ( aExp | aSig ) == 0 ) {
5390  invalid:
5391             float_raise(float_flag_invalid, status);
5392             return floatx80_default_nan(status);
5393         }
5394         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5395     }
5396     if ( aExp == 0 ) {
5397         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5398         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5399     }
5400     if ( bExp == 0 ) {
5401         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5402         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5403     }
5404     zExp = aExp + bExp - 0x3FFE;
5405     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5406     if ( 0 < (int64_t) zSig0 ) {
5407         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5408         --zExp;
5409     }
5410     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5411                                 zSign, zExp, zSig0, zSig1, status);
5412 }
5413 
5414 /*----------------------------------------------------------------------------
5415 | Returns the result of dividing the extended double-precision floating-point
5416 | value `a' by the corresponding value `b'.  The operation is performed
5417 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5418 *----------------------------------------------------------------------------*/
5419 
5420 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5421 {
5422     flag aSign, bSign, zSign;
5423     int32_t aExp, bExp, zExp;
5424     uint64_t aSig, bSig, zSig0, zSig1;
5425     uint64_t rem0, rem1, rem2, term0, term1, term2;
5426 
5427     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5428         float_raise(float_flag_invalid, status);
5429         return floatx80_default_nan(status);
5430     }
5431     aSig = extractFloatx80Frac( a );
5432     aExp = extractFloatx80Exp( a );
5433     aSign = extractFloatx80Sign( a );
5434     bSig = extractFloatx80Frac( b );
5435     bExp = extractFloatx80Exp( b );
5436     bSign = extractFloatx80Sign( b );
5437     zSign = aSign ^ bSign;
5438     if ( aExp == 0x7FFF ) {
5439         if ((uint64_t)(aSig << 1)) {
5440             return propagateFloatx80NaN(a, b, status);
5441         }
5442         if ( bExp == 0x7FFF ) {
5443             if ((uint64_t)(bSig << 1)) {
5444                 return propagateFloatx80NaN(a, b, status);
5445             }
5446             goto invalid;
5447         }
5448         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5449     }
5450     if ( bExp == 0x7FFF ) {
5451         if ((uint64_t)(bSig << 1)) {
5452             return propagateFloatx80NaN(a, b, status);
5453         }
5454         return packFloatx80( zSign, 0, 0 );
5455     }
5456     if ( bExp == 0 ) {
5457         if ( bSig == 0 ) {
5458             if ( ( aExp | aSig ) == 0 ) {
5459  invalid:
5460                 float_raise(float_flag_invalid, status);
5461                 return floatx80_default_nan(status);
5462             }
5463             float_raise(float_flag_divbyzero, status);
5464             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5465         }
5466         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5467     }
5468     if ( aExp == 0 ) {
5469         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5470         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5471     }
5472     zExp = aExp - bExp + 0x3FFE;
5473     rem1 = 0;
5474     if ( bSig <= aSig ) {
5475         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5476         ++zExp;
5477     }
5478     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5479     mul64To128( bSig, zSig0, &term0, &term1 );
5480     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5481     while ( (int64_t) rem0 < 0 ) {
5482         --zSig0;
5483         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5484     }
5485     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5486     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5487         mul64To128( bSig, zSig1, &term1, &term2 );
5488         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5489         while ( (int64_t) rem1 < 0 ) {
5490             --zSig1;
5491             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5492         }
5493         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5494     }
5495     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5496                                 zSign, zExp, zSig0, zSig1, status);
5497 }
5498 
5499 /*----------------------------------------------------------------------------
5500 | Returns the remainder of the extended double-precision floating-point value
5501 | `a' with respect to the corresponding value `b'.  The operation is performed
5502 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5503 *----------------------------------------------------------------------------*/
5504 
5505 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5506 {
5507     flag aSign, zSign;
5508     int32_t aExp, bExp, expDiff;
5509     uint64_t aSig0, aSig1, bSig;
5510     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5511 
5512     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5513         float_raise(float_flag_invalid, status);
5514         return floatx80_default_nan(status);
5515     }
5516     aSig0 = extractFloatx80Frac( a );
5517     aExp = extractFloatx80Exp( a );
5518     aSign = extractFloatx80Sign( a );
5519     bSig = extractFloatx80Frac( b );
5520     bExp = extractFloatx80Exp( b );
5521     if ( aExp == 0x7FFF ) {
5522         if (    (uint64_t) ( aSig0<<1 )
5523              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5524             return propagateFloatx80NaN(a, b, status);
5525         }
5526         goto invalid;
5527     }
5528     if ( bExp == 0x7FFF ) {
5529         if ((uint64_t)(bSig << 1)) {
5530             return propagateFloatx80NaN(a, b, status);
5531         }
5532         return a;
5533     }
5534     if ( bExp == 0 ) {
5535         if ( bSig == 0 ) {
5536  invalid:
5537             float_raise(float_flag_invalid, status);
5538             return floatx80_default_nan(status);
5539         }
5540         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5541     }
5542     if ( aExp == 0 ) {
5543         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5544         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5545     }
5546     bSig |= LIT64( 0x8000000000000000 );
5547     zSign = aSign;
5548     expDiff = aExp - bExp;
5549     aSig1 = 0;
5550     if ( expDiff < 0 ) {
5551         if ( expDiff < -1 ) return a;
5552         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5553         expDiff = 0;
5554     }
5555     q = ( bSig <= aSig0 );
5556     if ( q ) aSig0 -= bSig;
5557     expDiff -= 64;
5558     while ( 0 < expDiff ) {
5559         q = estimateDiv128To64( aSig0, aSig1, bSig );
5560         q = ( 2 < q ) ? q - 2 : 0;
5561         mul64To128( bSig, q, &term0, &term1 );
5562         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5563         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5564         expDiff -= 62;
5565     }
5566     expDiff += 64;
5567     if ( 0 < expDiff ) {
5568         q = estimateDiv128To64( aSig0, aSig1, bSig );
5569         q = ( 2 < q ) ? q - 2 : 0;
5570         q >>= 64 - expDiff;
5571         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5572         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5573         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5574         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5575             ++q;
5576             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5577         }
5578     }
5579     else {
5580         term1 = 0;
5581         term0 = bSig;
5582     }
5583     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5584     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5585          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5586               && ( q & 1 ) )
5587        ) {
5588         aSig0 = alternateASig0;
5589         aSig1 = alternateASig1;
5590         zSign = ! zSign;
5591     }
5592     return
5593         normalizeRoundAndPackFloatx80(
5594             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5595 
5596 }
5597 
5598 /*----------------------------------------------------------------------------
5599 | Returns the square root of the extended double-precision floating-point
5600 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5601 | for Binary Floating-Point Arithmetic.
5602 *----------------------------------------------------------------------------*/
5603 
5604 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5605 {
5606     flag aSign;
5607     int32_t aExp, zExp;
5608     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5609     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5610 
5611     if (floatx80_invalid_encoding(a)) {
5612         float_raise(float_flag_invalid, status);
5613         return floatx80_default_nan(status);
5614     }
5615     aSig0 = extractFloatx80Frac( a );
5616     aExp = extractFloatx80Exp( a );
5617     aSign = extractFloatx80Sign( a );
5618     if ( aExp == 0x7FFF ) {
5619         if ((uint64_t)(aSig0 << 1)) {
5620             return propagateFloatx80NaN(a, a, status);
5621         }
5622         if ( ! aSign ) return a;
5623         goto invalid;
5624     }
5625     if ( aSign ) {
5626         if ( ( aExp | aSig0 ) == 0 ) return a;
5627  invalid:
5628         float_raise(float_flag_invalid, status);
5629         return floatx80_default_nan(status);
5630     }
5631     if ( aExp == 0 ) {
5632         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5633         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5634     }
5635     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5636     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5637     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5638     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5639     doubleZSig0 = zSig0<<1;
5640     mul64To128( zSig0, zSig0, &term0, &term1 );
5641     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5642     while ( (int64_t) rem0 < 0 ) {
5643         --zSig0;
5644         doubleZSig0 -= 2;
5645         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5646     }
5647     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5648     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5649         if ( zSig1 == 0 ) zSig1 = 1;
5650         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5651         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5652         mul64To128( zSig1, zSig1, &term2, &term3 );
5653         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5654         while ( (int64_t) rem1 < 0 ) {
5655             --zSig1;
5656             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5657             term3 |= 1;
5658             term2 |= doubleZSig0;
5659             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5660         }
5661         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5662     }
5663     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5664     zSig0 |= doubleZSig0;
5665     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5666                                 0, zExp, zSig0, zSig1, status);
5667 }
5668 
5669 /*----------------------------------------------------------------------------
5670 | Returns 1 if the extended double-precision floating-point value `a' is equal
5671 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5672 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5673 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5674 *----------------------------------------------------------------------------*/
5675 
5676 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5677 {
5678 
5679     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5680         || (extractFloatx80Exp(a) == 0x7FFF
5681             && (uint64_t) (extractFloatx80Frac(a) << 1))
5682         || (extractFloatx80Exp(b) == 0x7FFF
5683             && (uint64_t) (extractFloatx80Frac(b) << 1))
5684        ) {
5685         float_raise(float_flag_invalid, status);
5686         return 0;
5687     }
5688     return
5689            ( a.low == b.low )
5690         && (    ( a.high == b.high )
5691              || (    ( a.low == 0 )
5692                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5693            );
5694 
5695 }
5696 
5697 /*----------------------------------------------------------------------------
5698 | Returns 1 if the extended double-precision floating-point value `a' is
5699 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5700 | invalid exception is raised if either operand is a NaN.  The comparison is
5701 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5702 | Arithmetic.
5703 *----------------------------------------------------------------------------*/
5704 
5705 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5706 {
5707     flag aSign, bSign;
5708 
5709     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5710         || (extractFloatx80Exp(a) == 0x7FFF
5711             && (uint64_t) (extractFloatx80Frac(a) << 1))
5712         || (extractFloatx80Exp(b) == 0x7FFF
5713             && (uint64_t) (extractFloatx80Frac(b) << 1))
5714        ) {
5715         float_raise(float_flag_invalid, status);
5716         return 0;
5717     }
5718     aSign = extractFloatx80Sign( a );
5719     bSign = extractFloatx80Sign( b );
5720     if ( aSign != bSign ) {
5721         return
5722                aSign
5723             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5724                  == 0 );
5725     }
5726     return
5727           aSign ? le128( b.high, b.low, a.high, a.low )
5728         : le128( a.high, a.low, b.high, b.low );
5729 
5730 }
5731 
5732 /*----------------------------------------------------------------------------
5733 | Returns 1 if the extended double-precision floating-point value `a' is
5734 | less than the corresponding value `b', and 0 otherwise.  The invalid
5735 | exception is raised if either operand is a NaN.  The comparison is performed
5736 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5737 *----------------------------------------------------------------------------*/
5738 
5739 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5740 {
5741     flag aSign, bSign;
5742 
5743     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5744         || (extractFloatx80Exp(a) == 0x7FFF
5745             && (uint64_t) (extractFloatx80Frac(a) << 1))
5746         || (extractFloatx80Exp(b) == 0x7FFF
5747             && (uint64_t) (extractFloatx80Frac(b) << 1))
5748        ) {
5749         float_raise(float_flag_invalid, status);
5750         return 0;
5751     }
5752     aSign = extractFloatx80Sign( a );
5753     bSign = extractFloatx80Sign( b );
5754     if ( aSign != bSign ) {
5755         return
5756                aSign
5757             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5758                  != 0 );
5759     }
5760     return
5761           aSign ? lt128( b.high, b.low, a.high, a.low )
5762         : lt128( a.high, a.low, b.high, b.low );
5763 
5764 }
5765 
5766 /*----------------------------------------------------------------------------
5767 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5768 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5769 | either operand is a NaN.   The comparison is performed according to the
5770 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5771 *----------------------------------------------------------------------------*/
5772 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5773 {
5774     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5775         || (extractFloatx80Exp(a) == 0x7FFF
5776             && (uint64_t) (extractFloatx80Frac(a) << 1))
5777         || (extractFloatx80Exp(b) == 0x7FFF
5778             && (uint64_t) (extractFloatx80Frac(b) << 1))
5779        ) {
5780         float_raise(float_flag_invalid, status);
5781         return 1;
5782     }
5783     return 0;
5784 }
5785 
5786 /*----------------------------------------------------------------------------
5787 | Returns 1 if the extended double-precision floating-point value `a' is
5788 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5789 | cause an exception.  The comparison is performed according to the IEC/IEEE
5790 | Standard for Binary Floating-Point Arithmetic.
5791 *----------------------------------------------------------------------------*/
5792 
5793 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5794 {
5795 
5796     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5797         float_raise(float_flag_invalid, status);
5798         return 0;
5799     }
5800     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5801               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5802          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5803               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5804        ) {
5805         if (floatx80_is_signaling_nan(a, status)
5806          || floatx80_is_signaling_nan(b, status)) {
5807             float_raise(float_flag_invalid, status);
5808         }
5809         return 0;
5810     }
5811     return
5812            ( a.low == b.low )
5813         && (    ( a.high == b.high )
5814              || (    ( a.low == 0 )
5815                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5816            );
5817 
5818 }
5819 
5820 /*----------------------------------------------------------------------------
5821 | Returns 1 if the extended double-precision floating-point value `a' is less
5822 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5823 | do not cause an exception.  Otherwise, the comparison is performed according
5824 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5825 *----------------------------------------------------------------------------*/
5826 
5827 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5828 {
5829     flag aSign, bSign;
5830 
5831     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5832         float_raise(float_flag_invalid, status);
5833         return 0;
5834     }
5835     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5836               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5837          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5838               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5839        ) {
5840         if (floatx80_is_signaling_nan(a, status)
5841          || floatx80_is_signaling_nan(b, status)) {
5842             float_raise(float_flag_invalid, status);
5843         }
5844         return 0;
5845     }
5846     aSign = extractFloatx80Sign( a );
5847     bSign = extractFloatx80Sign( b );
5848     if ( aSign != bSign ) {
5849         return
5850                aSign
5851             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5852                  == 0 );
5853     }
5854     return
5855           aSign ? le128( b.high, b.low, a.high, a.low )
5856         : le128( a.high, a.low, b.high, b.low );
5857 
5858 }
5859 
5860 /*----------------------------------------------------------------------------
5861 | Returns 1 if the extended double-precision floating-point value `a' is less
5862 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5863 | an exception.  Otherwise, the comparison is performed according to the
5864 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5865 *----------------------------------------------------------------------------*/
5866 
5867 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5868 {
5869     flag aSign, bSign;
5870 
5871     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5872         float_raise(float_flag_invalid, status);
5873         return 0;
5874     }
5875     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5876               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5877          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5878               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5879        ) {
5880         if (floatx80_is_signaling_nan(a, status)
5881          || floatx80_is_signaling_nan(b, status)) {
5882             float_raise(float_flag_invalid, status);
5883         }
5884         return 0;
5885     }
5886     aSign = extractFloatx80Sign( a );
5887     bSign = extractFloatx80Sign( b );
5888     if ( aSign != bSign ) {
5889         return
5890                aSign
5891             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5892                  != 0 );
5893     }
5894     return
5895           aSign ? lt128( b.high, b.low, a.high, a.low )
5896         : lt128( a.high, a.low, b.high, b.low );
5897 
5898 }
5899 
5900 /*----------------------------------------------------------------------------
5901 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5902 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5903 | The comparison is performed according to the IEC/IEEE Standard for Binary
5904 | Floating-Point Arithmetic.
5905 *----------------------------------------------------------------------------*/
5906 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5907 {
5908     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5909         float_raise(float_flag_invalid, status);
5910         return 1;
5911     }
5912     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5913               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5914          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5915               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5916        ) {
5917         if (floatx80_is_signaling_nan(a, status)
5918          || floatx80_is_signaling_nan(b, status)) {
5919             float_raise(float_flag_invalid, status);
5920         }
5921         return 1;
5922     }
5923     return 0;
5924 }
5925 
5926 /*----------------------------------------------------------------------------
5927 | Returns the result of converting the quadruple-precision floating-point
5928 | value `a' to the 32-bit two's complement integer format.  The conversion
5929 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5930 | Arithmetic---which means in particular that the conversion is rounded
5931 | according to the current rounding mode.  If `a' is a NaN, the largest
5932 | positive integer is returned.  Otherwise, if the conversion overflows, the
5933 | largest integer with the same sign as `a' is returned.
5934 *----------------------------------------------------------------------------*/
5935 
5936 int32_t float128_to_int32(float128 a, float_status *status)
5937 {
5938     flag aSign;
5939     int32_t aExp, shiftCount;
5940     uint64_t aSig0, aSig1;
5941 
5942     aSig1 = extractFloat128Frac1( a );
5943     aSig0 = extractFloat128Frac0( a );
5944     aExp = extractFloat128Exp( a );
5945     aSign = extractFloat128Sign( a );
5946     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5947     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5948     aSig0 |= ( aSig1 != 0 );
5949     shiftCount = 0x4028 - aExp;
5950     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5951     return roundAndPackInt32(aSign, aSig0, status);
5952 
5953 }
5954 
5955 /*----------------------------------------------------------------------------
5956 | Returns the result of converting the quadruple-precision floating-point
5957 | value `a' to the 32-bit two's complement integer format.  The conversion
5958 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5959 | Arithmetic, except that the conversion is always rounded toward zero.  If
5960 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5961 | conversion overflows, the largest integer with the same sign as `a' is
5962 | returned.
5963 *----------------------------------------------------------------------------*/
5964 
5965 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5966 {
5967     flag aSign;
5968     int32_t aExp, shiftCount;
5969     uint64_t aSig0, aSig1, savedASig;
5970     int32_t z;
5971 
5972     aSig1 = extractFloat128Frac1( a );
5973     aSig0 = extractFloat128Frac0( a );
5974     aExp = extractFloat128Exp( a );
5975     aSign = extractFloat128Sign( a );
5976     aSig0 |= ( aSig1 != 0 );
5977     if ( 0x401E < aExp ) {
5978         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5979         goto invalid;
5980     }
5981     else if ( aExp < 0x3FFF ) {
5982         if (aExp || aSig0) {
5983             status->float_exception_flags |= float_flag_inexact;
5984         }
5985         return 0;
5986     }
5987     aSig0 |= LIT64( 0x0001000000000000 );
5988     shiftCount = 0x402F - aExp;
5989     savedASig = aSig0;
5990     aSig0 >>= shiftCount;
5991     z = aSig0;
5992     if ( aSign ) z = - z;
5993     if ( ( z < 0 ) ^ aSign ) {
5994  invalid:
5995         float_raise(float_flag_invalid, status);
5996         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5997     }
5998     if ( ( aSig0<<shiftCount ) != savedASig ) {
5999         status->float_exception_flags |= float_flag_inexact;
6000     }
6001     return z;
6002 
6003 }
6004 
6005 /*----------------------------------------------------------------------------
6006 | Returns the result of converting the quadruple-precision floating-point
6007 | value `a' to the 64-bit two's complement integer format.  The conversion
6008 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6009 | Arithmetic---which means in particular that the conversion is rounded
6010 | according to the current rounding mode.  If `a' is a NaN, the largest
6011 | positive integer is returned.  Otherwise, if the conversion overflows, the
6012 | largest integer with the same sign as `a' is returned.
6013 *----------------------------------------------------------------------------*/
6014 
6015 int64_t float128_to_int64(float128 a, float_status *status)
6016 {
6017     flag aSign;
6018     int32_t aExp, shiftCount;
6019     uint64_t aSig0, aSig1;
6020 
6021     aSig1 = extractFloat128Frac1( a );
6022     aSig0 = extractFloat128Frac0( a );
6023     aExp = extractFloat128Exp( a );
6024     aSign = extractFloat128Sign( a );
6025     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6026     shiftCount = 0x402F - aExp;
6027     if ( shiftCount <= 0 ) {
6028         if ( 0x403E < aExp ) {
6029             float_raise(float_flag_invalid, status);
6030             if (    ! aSign
6031                  || (    ( aExp == 0x7FFF )
6032                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6033                     )
6034                ) {
6035                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6036             }
6037             return (int64_t) LIT64( 0x8000000000000000 );
6038         }
6039         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6040     }
6041     else {
6042         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6043     }
6044     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6045 
6046 }
6047 
6048 /*----------------------------------------------------------------------------
6049 | Returns the result of converting the quadruple-precision floating-point
6050 | value `a' to the 64-bit two's complement integer format.  The conversion
6051 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6052 | Arithmetic, except that the conversion is always rounded toward zero.
6053 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6054 | the conversion overflows, the largest integer with the same sign as `a' is
6055 | returned.
6056 *----------------------------------------------------------------------------*/
6057 
6058 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6059 {
6060     flag aSign;
6061     int32_t aExp, shiftCount;
6062     uint64_t aSig0, aSig1;
6063     int64_t z;
6064 
6065     aSig1 = extractFloat128Frac1( a );
6066     aSig0 = extractFloat128Frac0( a );
6067     aExp = extractFloat128Exp( a );
6068     aSign = extractFloat128Sign( a );
6069     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6070     shiftCount = aExp - 0x402F;
6071     if ( 0 < shiftCount ) {
6072         if ( 0x403E <= aExp ) {
6073             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6074             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6075                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6076                 if (aSig1) {
6077                     status->float_exception_flags |= float_flag_inexact;
6078                 }
6079             }
6080             else {
6081                 float_raise(float_flag_invalid, status);
6082                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6083                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6084                 }
6085             }
6086             return (int64_t) LIT64( 0x8000000000000000 );
6087         }
6088         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6089         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6090             status->float_exception_flags |= float_flag_inexact;
6091         }
6092     }
6093     else {
6094         if ( aExp < 0x3FFF ) {
6095             if ( aExp | aSig0 | aSig1 ) {
6096                 status->float_exception_flags |= float_flag_inexact;
6097             }
6098             return 0;
6099         }
6100         z = aSig0>>( - shiftCount );
6101         if (    aSig1
6102              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6103             status->float_exception_flags |= float_flag_inexact;
6104         }
6105     }
6106     if ( aSign ) z = - z;
6107     return z;
6108 
6109 }
6110 
6111 /*----------------------------------------------------------------------------
6112 | Returns the result of converting the quadruple-precision floating-point
6113 | value `a' to the single-precision floating-point format.  The conversion
6114 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6115 | Arithmetic.
6116 *----------------------------------------------------------------------------*/
6117 
6118 float32 float128_to_float32(float128 a, float_status *status)
6119 {
6120     flag aSign;
6121     int32_t aExp;
6122     uint64_t aSig0, aSig1;
6123     uint32_t zSig;
6124 
6125     aSig1 = extractFloat128Frac1( a );
6126     aSig0 = extractFloat128Frac0( a );
6127     aExp = extractFloat128Exp( a );
6128     aSign = extractFloat128Sign( a );
6129     if ( aExp == 0x7FFF ) {
6130         if ( aSig0 | aSig1 ) {
6131             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6132         }
6133         return packFloat32( aSign, 0xFF, 0 );
6134     }
6135     aSig0 |= ( aSig1 != 0 );
6136     shift64RightJamming( aSig0, 18, &aSig0 );
6137     zSig = aSig0;
6138     if ( aExp || zSig ) {
6139         zSig |= 0x40000000;
6140         aExp -= 0x3F81;
6141     }
6142     return roundAndPackFloat32(aSign, aExp, zSig, status);
6143 
6144 }
6145 
6146 /*----------------------------------------------------------------------------
6147 | Returns the result of converting the quadruple-precision floating-point
6148 | value `a' to the double-precision floating-point format.  The conversion
6149 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6150 | Arithmetic.
6151 *----------------------------------------------------------------------------*/
6152 
6153 float64 float128_to_float64(float128 a, float_status *status)
6154 {
6155     flag aSign;
6156     int32_t aExp;
6157     uint64_t aSig0, aSig1;
6158 
6159     aSig1 = extractFloat128Frac1( a );
6160     aSig0 = extractFloat128Frac0( a );
6161     aExp = extractFloat128Exp( a );
6162     aSign = extractFloat128Sign( a );
6163     if ( aExp == 0x7FFF ) {
6164         if ( aSig0 | aSig1 ) {
6165             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6166         }
6167         return packFloat64( aSign, 0x7FF, 0 );
6168     }
6169     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6170     aSig0 |= ( aSig1 != 0 );
6171     if ( aExp || aSig0 ) {
6172         aSig0 |= LIT64( 0x4000000000000000 );
6173         aExp -= 0x3C01;
6174     }
6175     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6176 
6177 }
6178 
6179 /*----------------------------------------------------------------------------
6180 | Returns the result of converting the quadruple-precision floating-point
6181 | value `a' to the extended double-precision floating-point format.  The
6182 | conversion is performed according to the IEC/IEEE Standard for Binary
6183 | Floating-Point Arithmetic.
6184 *----------------------------------------------------------------------------*/
6185 
6186 floatx80 float128_to_floatx80(float128 a, float_status *status)
6187 {
6188     flag aSign;
6189     int32_t aExp;
6190     uint64_t aSig0, aSig1;
6191 
6192     aSig1 = extractFloat128Frac1( a );
6193     aSig0 = extractFloat128Frac0( a );
6194     aExp = extractFloat128Exp( a );
6195     aSign = extractFloat128Sign( a );
6196     if ( aExp == 0x7FFF ) {
6197         if ( aSig0 | aSig1 ) {
6198             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6199         }
6200         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6201     }
6202     if ( aExp == 0 ) {
6203         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6204         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6205     }
6206     else {
6207         aSig0 |= LIT64( 0x0001000000000000 );
6208     }
6209     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6210     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6211 
6212 }
6213 
6214 /*----------------------------------------------------------------------------
6215 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6216 | returns the result as a quadruple-precision floating-point value.  The
6217 | operation is performed according to the IEC/IEEE Standard for Binary
6218 | Floating-Point Arithmetic.
6219 *----------------------------------------------------------------------------*/
6220 
6221 float128 float128_round_to_int(float128 a, float_status *status)
6222 {
6223     flag aSign;
6224     int32_t aExp;
6225     uint64_t lastBitMask, roundBitsMask;
6226     float128 z;
6227 
6228     aExp = extractFloat128Exp( a );
6229     if ( 0x402F <= aExp ) {
6230         if ( 0x406F <= aExp ) {
6231             if (    ( aExp == 0x7FFF )
6232                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6233                ) {
6234                 return propagateFloat128NaN(a, a, status);
6235             }
6236             return a;
6237         }
6238         lastBitMask = 1;
6239         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6240         roundBitsMask = lastBitMask - 1;
6241         z = a;
6242         switch (status->float_rounding_mode) {
6243         case float_round_nearest_even:
6244             if ( lastBitMask ) {
6245                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6246                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6247             }
6248             else {
6249                 if ( (int64_t) z.low < 0 ) {
6250                     ++z.high;
6251                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6252                 }
6253             }
6254             break;
6255         case float_round_ties_away:
6256             if (lastBitMask) {
6257                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6258             } else {
6259                 if ((int64_t) z.low < 0) {
6260                     ++z.high;
6261                 }
6262             }
6263             break;
6264         case float_round_to_zero:
6265             break;
6266         case float_round_up:
6267             if (!extractFloat128Sign(z)) {
6268                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6269             }
6270             break;
6271         case float_round_down:
6272             if (extractFloat128Sign(z)) {
6273                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6274             }
6275             break;
6276         default:
6277             abort();
6278         }
6279         z.low &= ~ roundBitsMask;
6280     }
6281     else {
6282         if ( aExp < 0x3FFF ) {
6283             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6284             status->float_exception_flags |= float_flag_inexact;
6285             aSign = extractFloat128Sign( a );
6286             switch (status->float_rounding_mode) {
6287              case float_round_nearest_even:
6288                 if (    ( aExp == 0x3FFE )
6289                      && (   extractFloat128Frac0( a )
6290                           | extractFloat128Frac1( a ) )
6291                    ) {
6292                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6293                 }
6294                 break;
6295             case float_round_ties_away:
6296                 if (aExp == 0x3FFE) {
6297                     return packFloat128(aSign, 0x3FFF, 0, 0);
6298                 }
6299                 break;
6300              case float_round_down:
6301                 return
6302                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6303                     : packFloat128( 0, 0, 0, 0 );
6304              case float_round_up:
6305                 return
6306                       aSign ? packFloat128( 1, 0, 0, 0 )
6307                     : packFloat128( 0, 0x3FFF, 0, 0 );
6308             }
6309             return packFloat128( aSign, 0, 0, 0 );
6310         }
6311         lastBitMask = 1;
6312         lastBitMask <<= 0x402F - aExp;
6313         roundBitsMask = lastBitMask - 1;
6314         z.low = 0;
6315         z.high = a.high;
6316         switch (status->float_rounding_mode) {
6317         case float_round_nearest_even:
6318             z.high += lastBitMask>>1;
6319             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6320                 z.high &= ~ lastBitMask;
6321             }
6322             break;
6323         case float_round_ties_away:
6324             z.high += lastBitMask>>1;
6325             break;
6326         case float_round_to_zero:
6327             break;
6328         case float_round_up:
6329             if (!extractFloat128Sign(z)) {
6330                 z.high |= ( a.low != 0 );
6331                 z.high += roundBitsMask;
6332             }
6333             break;
6334         case float_round_down:
6335             if (extractFloat128Sign(z)) {
6336                 z.high |= (a.low != 0);
6337                 z.high += roundBitsMask;
6338             }
6339             break;
6340         default:
6341             abort();
6342         }
6343         z.high &= ~ roundBitsMask;
6344     }
6345     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6346         status->float_exception_flags |= float_flag_inexact;
6347     }
6348     return z;
6349 
6350 }
6351 
6352 /*----------------------------------------------------------------------------
6353 | Returns the result of adding the absolute values of the quadruple-precision
6354 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6355 | before being returned.  `zSign' is ignored if the result is a NaN.
6356 | The addition is performed according to the IEC/IEEE Standard for Binary
6357 | Floating-Point Arithmetic.
6358 *----------------------------------------------------------------------------*/
6359 
6360 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6361                                 float_status *status)
6362 {
6363     int32_t aExp, bExp, zExp;
6364     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6365     int32_t expDiff;
6366 
6367     aSig1 = extractFloat128Frac1( a );
6368     aSig0 = extractFloat128Frac0( a );
6369     aExp = extractFloat128Exp( a );
6370     bSig1 = extractFloat128Frac1( b );
6371     bSig0 = extractFloat128Frac0( b );
6372     bExp = extractFloat128Exp( b );
6373     expDiff = aExp - bExp;
6374     if ( 0 < expDiff ) {
6375         if ( aExp == 0x7FFF ) {
6376             if (aSig0 | aSig1) {
6377                 return propagateFloat128NaN(a, b, status);
6378             }
6379             return a;
6380         }
6381         if ( bExp == 0 ) {
6382             --expDiff;
6383         }
6384         else {
6385             bSig0 |= LIT64( 0x0001000000000000 );
6386         }
6387         shift128ExtraRightJamming(
6388             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6389         zExp = aExp;
6390     }
6391     else if ( expDiff < 0 ) {
6392         if ( bExp == 0x7FFF ) {
6393             if (bSig0 | bSig1) {
6394                 return propagateFloat128NaN(a, b, status);
6395             }
6396             return packFloat128( zSign, 0x7FFF, 0, 0 );
6397         }
6398         if ( aExp == 0 ) {
6399             ++expDiff;
6400         }
6401         else {
6402             aSig0 |= LIT64( 0x0001000000000000 );
6403         }
6404         shift128ExtraRightJamming(
6405             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6406         zExp = bExp;
6407     }
6408     else {
6409         if ( aExp == 0x7FFF ) {
6410             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6411                 return propagateFloat128NaN(a, b, status);
6412             }
6413             return a;
6414         }
6415         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6416         if ( aExp == 0 ) {
6417             if (status->flush_to_zero) {
6418                 if (zSig0 | zSig1) {
6419                     float_raise(float_flag_output_denormal, status);
6420                 }
6421                 return packFloat128(zSign, 0, 0, 0);
6422             }
6423             return packFloat128( zSign, 0, zSig0, zSig1 );
6424         }
6425         zSig2 = 0;
6426         zSig0 |= LIT64( 0x0002000000000000 );
6427         zExp = aExp;
6428         goto shiftRight1;
6429     }
6430     aSig0 |= LIT64( 0x0001000000000000 );
6431     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6432     --zExp;
6433     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6434     ++zExp;
6435  shiftRight1:
6436     shift128ExtraRightJamming(
6437         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6438  roundAndPack:
6439     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6440 
6441 }
6442 
6443 /*----------------------------------------------------------------------------
6444 | Returns the result of subtracting the absolute values of the quadruple-
6445 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6446 | difference is negated before being returned.  `zSign' is ignored if the
6447 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6448 | Standard for Binary Floating-Point Arithmetic.
6449 *----------------------------------------------------------------------------*/
6450 
6451 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6452                                 float_status *status)
6453 {
6454     int32_t aExp, bExp, zExp;
6455     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6456     int32_t expDiff;
6457 
6458     aSig1 = extractFloat128Frac1( a );
6459     aSig0 = extractFloat128Frac0( a );
6460     aExp = extractFloat128Exp( a );
6461     bSig1 = extractFloat128Frac1( b );
6462     bSig0 = extractFloat128Frac0( b );
6463     bExp = extractFloat128Exp( b );
6464     expDiff = aExp - bExp;
6465     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6466     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6467     if ( 0 < expDiff ) goto aExpBigger;
6468     if ( expDiff < 0 ) goto bExpBigger;
6469     if ( aExp == 0x7FFF ) {
6470         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6471             return propagateFloat128NaN(a, b, status);
6472         }
6473         float_raise(float_flag_invalid, status);
6474         return float128_default_nan(status);
6475     }
6476     if ( aExp == 0 ) {
6477         aExp = 1;
6478         bExp = 1;
6479     }
6480     if ( bSig0 < aSig0 ) goto aBigger;
6481     if ( aSig0 < bSig0 ) goto bBigger;
6482     if ( bSig1 < aSig1 ) goto aBigger;
6483     if ( aSig1 < bSig1 ) goto bBigger;
6484     return packFloat128(status->float_rounding_mode == float_round_down,
6485                         0, 0, 0);
6486  bExpBigger:
6487     if ( bExp == 0x7FFF ) {
6488         if (bSig0 | bSig1) {
6489             return propagateFloat128NaN(a, b, status);
6490         }
6491         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6492     }
6493     if ( aExp == 0 ) {
6494         ++expDiff;
6495     }
6496     else {
6497         aSig0 |= LIT64( 0x4000000000000000 );
6498     }
6499     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6500     bSig0 |= LIT64( 0x4000000000000000 );
6501  bBigger:
6502     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6503     zExp = bExp;
6504     zSign ^= 1;
6505     goto normalizeRoundAndPack;
6506  aExpBigger:
6507     if ( aExp == 0x7FFF ) {
6508         if (aSig0 | aSig1) {
6509             return propagateFloat128NaN(a, b, status);
6510         }
6511         return a;
6512     }
6513     if ( bExp == 0 ) {
6514         --expDiff;
6515     }
6516     else {
6517         bSig0 |= LIT64( 0x4000000000000000 );
6518     }
6519     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6520     aSig0 |= LIT64( 0x4000000000000000 );
6521  aBigger:
6522     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6523     zExp = aExp;
6524  normalizeRoundAndPack:
6525     --zExp;
6526     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6527                                          status);
6528 
6529 }
6530 
6531 /*----------------------------------------------------------------------------
6532 | Returns the result of adding the quadruple-precision floating-point values
6533 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6534 | for Binary Floating-Point Arithmetic.
6535 *----------------------------------------------------------------------------*/
6536 
6537 float128 float128_add(float128 a, float128 b, float_status *status)
6538 {
6539     flag aSign, bSign;
6540 
6541     aSign = extractFloat128Sign( a );
6542     bSign = extractFloat128Sign( b );
6543     if ( aSign == bSign ) {
6544         return addFloat128Sigs(a, b, aSign, status);
6545     }
6546     else {
6547         return subFloat128Sigs(a, b, aSign, status);
6548     }
6549 
6550 }
6551 
6552 /*----------------------------------------------------------------------------
6553 | Returns the result of subtracting the quadruple-precision floating-point
6554 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6555 | Standard for Binary Floating-Point Arithmetic.
6556 *----------------------------------------------------------------------------*/
6557 
6558 float128 float128_sub(float128 a, float128 b, float_status *status)
6559 {
6560     flag aSign, bSign;
6561 
6562     aSign = extractFloat128Sign( a );
6563     bSign = extractFloat128Sign( b );
6564     if ( aSign == bSign ) {
6565         return subFloat128Sigs(a, b, aSign, status);
6566     }
6567     else {
6568         return addFloat128Sigs(a, b, aSign, status);
6569     }
6570 
6571 }
6572 
6573 /*----------------------------------------------------------------------------
6574 | Returns the result of multiplying the quadruple-precision floating-point
6575 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6576 | Standard for Binary Floating-Point Arithmetic.
6577 *----------------------------------------------------------------------------*/
6578 
6579 float128 float128_mul(float128 a, float128 b, float_status *status)
6580 {
6581     flag aSign, bSign, zSign;
6582     int32_t aExp, bExp, zExp;
6583     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6584 
6585     aSig1 = extractFloat128Frac1( a );
6586     aSig0 = extractFloat128Frac0( a );
6587     aExp = extractFloat128Exp( a );
6588     aSign = extractFloat128Sign( a );
6589     bSig1 = extractFloat128Frac1( b );
6590     bSig0 = extractFloat128Frac0( b );
6591     bExp = extractFloat128Exp( b );
6592     bSign = extractFloat128Sign( b );
6593     zSign = aSign ^ bSign;
6594     if ( aExp == 0x7FFF ) {
6595         if (    ( aSig0 | aSig1 )
6596              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6597             return propagateFloat128NaN(a, b, status);
6598         }
6599         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6600         return packFloat128( zSign, 0x7FFF, 0, 0 );
6601     }
6602     if ( bExp == 0x7FFF ) {
6603         if (bSig0 | bSig1) {
6604             return propagateFloat128NaN(a, b, status);
6605         }
6606         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6607  invalid:
6608             float_raise(float_flag_invalid, status);
6609             return float128_default_nan(status);
6610         }
6611         return packFloat128( zSign, 0x7FFF, 0, 0 );
6612     }
6613     if ( aExp == 0 ) {
6614         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6615         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6616     }
6617     if ( bExp == 0 ) {
6618         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6619         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6620     }
6621     zExp = aExp + bExp - 0x4000;
6622     aSig0 |= LIT64( 0x0001000000000000 );
6623     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6624     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6625     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6626     zSig2 |= ( zSig3 != 0 );
6627     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6628         shift128ExtraRightJamming(
6629             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6630         ++zExp;
6631     }
6632     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6633 
6634 }
6635 
6636 /*----------------------------------------------------------------------------
6637 | Returns the result of dividing the quadruple-precision floating-point value
6638 | `a' by the corresponding value `b'.  The operation is performed according to
6639 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6640 *----------------------------------------------------------------------------*/
6641 
6642 float128 float128_div(float128 a, float128 b, float_status *status)
6643 {
6644     flag aSign, bSign, zSign;
6645     int32_t aExp, bExp, zExp;
6646     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6647     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6648 
6649     aSig1 = extractFloat128Frac1( a );
6650     aSig0 = extractFloat128Frac0( a );
6651     aExp = extractFloat128Exp( a );
6652     aSign = extractFloat128Sign( a );
6653     bSig1 = extractFloat128Frac1( b );
6654     bSig0 = extractFloat128Frac0( b );
6655     bExp = extractFloat128Exp( b );
6656     bSign = extractFloat128Sign( b );
6657     zSign = aSign ^ bSign;
6658     if ( aExp == 0x7FFF ) {
6659         if (aSig0 | aSig1) {
6660             return propagateFloat128NaN(a, b, status);
6661         }
6662         if ( bExp == 0x7FFF ) {
6663             if (bSig0 | bSig1) {
6664                 return propagateFloat128NaN(a, b, status);
6665             }
6666             goto invalid;
6667         }
6668         return packFloat128( zSign, 0x7FFF, 0, 0 );
6669     }
6670     if ( bExp == 0x7FFF ) {
6671         if (bSig0 | bSig1) {
6672             return propagateFloat128NaN(a, b, status);
6673         }
6674         return packFloat128( zSign, 0, 0, 0 );
6675     }
6676     if ( bExp == 0 ) {
6677         if ( ( bSig0 | bSig1 ) == 0 ) {
6678             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6679  invalid:
6680                 float_raise(float_flag_invalid, status);
6681                 return float128_default_nan(status);
6682             }
6683             float_raise(float_flag_divbyzero, status);
6684             return packFloat128( zSign, 0x7FFF, 0, 0 );
6685         }
6686         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6687     }
6688     if ( aExp == 0 ) {
6689         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6690         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6691     }
6692     zExp = aExp - bExp + 0x3FFD;
6693     shortShift128Left(
6694         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6695     shortShift128Left(
6696         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6697     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6698         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6699         ++zExp;
6700     }
6701     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6702     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6703     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6704     while ( (int64_t) rem0 < 0 ) {
6705         --zSig0;
6706         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6707     }
6708     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6709     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6710         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6711         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6712         while ( (int64_t) rem1 < 0 ) {
6713             --zSig1;
6714             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6715         }
6716         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6717     }
6718     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6719     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6720 
6721 }
6722 
6723 /*----------------------------------------------------------------------------
6724 | Returns the remainder of the quadruple-precision floating-point value `a'
6725 | with respect to the corresponding value `b'.  The operation is performed
6726 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6727 *----------------------------------------------------------------------------*/
6728 
6729 float128 float128_rem(float128 a, float128 b, float_status *status)
6730 {
6731     flag aSign, zSign;
6732     int32_t aExp, bExp, expDiff;
6733     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6734     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6735     int64_t sigMean0;
6736 
6737     aSig1 = extractFloat128Frac1( a );
6738     aSig0 = extractFloat128Frac0( a );
6739     aExp = extractFloat128Exp( a );
6740     aSign = extractFloat128Sign( a );
6741     bSig1 = extractFloat128Frac1( b );
6742     bSig0 = extractFloat128Frac0( b );
6743     bExp = extractFloat128Exp( b );
6744     if ( aExp == 0x7FFF ) {
6745         if (    ( aSig0 | aSig1 )
6746              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6747             return propagateFloat128NaN(a, b, status);
6748         }
6749         goto invalid;
6750     }
6751     if ( bExp == 0x7FFF ) {
6752         if (bSig0 | bSig1) {
6753             return propagateFloat128NaN(a, b, status);
6754         }
6755         return a;
6756     }
6757     if ( bExp == 0 ) {
6758         if ( ( bSig0 | bSig1 ) == 0 ) {
6759  invalid:
6760             float_raise(float_flag_invalid, status);
6761             return float128_default_nan(status);
6762         }
6763         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6764     }
6765     if ( aExp == 0 ) {
6766         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6767         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6768     }
6769     expDiff = aExp - bExp;
6770     if ( expDiff < -1 ) return a;
6771     shortShift128Left(
6772         aSig0 | LIT64( 0x0001000000000000 ),
6773         aSig1,
6774         15 - ( expDiff < 0 ),
6775         &aSig0,
6776         &aSig1
6777     );
6778     shortShift128Left(
6779         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6780     q = le128( bSig0, bSig1, aSig0, aSig1 );
6781     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6782     expDiff -= 64;
6783     while ( 0 < expDiff ) {
6784         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6785         q = ( 4 < q ) ? q - 4 : 0;
6786         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6787         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6788         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6789         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6790         expDiff -= 61;
6791     }
6792     if ( -64 < expDiff ) {
6793         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6794         q = ( 4 < q ) ? q - 4 : 0;
6795         q >>= - expDiff;
6796         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6797         expDiff += 52;
6798         if ( expDiff < 0 ) {
6799             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6800         }
6801         else {
6802             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6803         }
6804         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6805         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6806     }
6807     else {
6808         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6809         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6810     }
6811     do {
6812         alternateASig0 = aSig0;
6813         alternateASig1 = aSig1;
6814         ++q;
6815         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6816     } while ( 0 <= (int64_t) aSig0 );
6817     add128(
6818         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6819     if (    ( sigMean0 < 0 )
6820          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6821         aSig0 = alternateASig0;
6822         aSig1 = alternateASig1;
6823     }
6824     zSign = ( (int64_t) aSig0 < 0 );
6825     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6826     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6827                                          status);
6828 }
6829 
6830 /*----------------------------------------------------------------------------
6831 | Returns the square root of the quadruple-precision floating-point value `a'.
6832 | The operation is performed according to the IEC/IEEE Standard for Binary
6833 | Floating-Point Arithmetic.
6834 *----------------------------------------------------------------------------*/
6835 
6836 float128 float128_sqrt(float128 a, float_status *status)
6837 {
6838     flag aSign;
6839     int32_t aExp, zExp;
6840     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6841     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6842 
6843     aSig1 = extractFloat128Frac1( a );
6844     aSig0 = extractFloat128Frac0( a );
6845     aExp = extractFloat128Exp( a );
6846     aSign = extractFloat128Sign( a );
6847     if ( aExp == 0x7FFF ) {
6848         if (aSig0 | aSig1) {
6849             return propagateFloat128NaN(a, a, status);
6850         }
6851         if ( ! aSign ) return a;
6852         goto invalid;
6853     }
6854     if ( aSign ) {
6855         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6856  invalid:
6857         float_raise(float_flag_invalid, status);
6858         return float128_default_nan(status);
6859     }
6860     if ( aExp == 0 ) {
6861         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6862         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6863     }
6864     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6865     aSig0 |= LIT64( 0x0001000000000000 );
6866     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6867     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6868     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6869     doubleZSig0 = zSig0<<1;
6870     mul64To128( zSig0, zSig0, &term0, &term1 );
6871     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6872     while ( (int64_t) rem0 < 0 ) {
6873         --zSig0;
6874         doubleZSig0 -= 2;
6875         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6876     }
6877     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6878     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6879         if ( zSig1 == 0 ) zSig1 = 1;
6880         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6881         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6882         mul64To128( zSig1, zSig1, &term2, &term3 );
6883         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6884         while ( (int64_t) rem1 < 0 ) {
6885             --zSig1;
6886             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6887             term3 |= 1;
6888             term2 |= doubleZSig0;
6889             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6890         }
6891         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6892     }
6893     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6894     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6895 
6896 }
6897 
6898 /*----------------------------------------------------------------------------
6899 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6900 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6901 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6902 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6903 *----------------------------------------------------------------------------*/
6904 
6905 int float128_eq(float128 a, float128 b, float_status *status)
6906 {
6907 
6908     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6909               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6910          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6911               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6912        ) {
6913         float_raise(float_flag_invalid, status);
6914         return 0;
6915     }
6916     return
6917            ( a.low == b.low )
6918         && (    ( a.high == b.high )
6919              || (    ( a.low == 0 )
6920                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6921            );
6922 
6923 }
6924 
6925 /*----------------------------------------------------------------------------
6926 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6927 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6928 | exception is raised if either operand is a NaN.  The comparison is performed
6929 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6930 *----------------------------------------------------------------------------*/
6931 
6932 int float128_le(float128 a, float128 b, float_status *status)
6933 {
6934     flag aSign, bSign;
6935 
6936     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6937               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6938          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6939               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6940        ) {
6941         float_raise(float_flag_invalid, status);
6942         return 0;
6943     }
6944     aSign = extractFloat128Sign( a );
6945     bSign = extractFloat128Sign( b );
6946     if ( aSign != bSign ) {
6947         return
6948                aSign
6949             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6950                  == 0 );
6951     }
6952     return
6953           aSign ? le128( b.high, b.low, a.high, a.low )
6954         : le128( a.high, a.low, b.high, b.low );
6955 
6956 }
6957 
6958 /*----------------------------------------------------------------------------
6959 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6960 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6961 | raised if either operand is a NaN.  The comparison is performed according
6962 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6963 *----------------------------------------------------------------------------*/
6964 
6965 int float128_lt(float128 a, float128 b, float_status *status)
6966 {
6967     flag aSign, bSign;
6968 
6969     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6970               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6971          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6972               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6973        ) {
6974         float_raise(float_flag_invalid, status);
6975         return 0;
6976     }
6977     aSign = extractFloat128Sign( a );
6978     bSign = extractFloat128Sign( b );
6979     if ( aSign != bSign ) {
6980         return
6981                aSign
6982             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6983                  != 0 );
6984     }
6985     return
6986           aSign ? lt128( b.high, b.low, a.high, a.low )
6987         : lt128( a.high, a.low, b.high, b.low );
6988 
6989 }
6990 
6991 /*----------------------------------------------------------------------------
6992 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6993 | be compared, and 0 otherwise.  The invalid exception is raised if either
6994 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6995 | Standard for Binary Floating-Point Arithmetic.
6996 *----------------------------------------------------------------------------*/
6997 
6998 int float128_unordered(float128 a, float128 b, float_status *status)
6999 {
7000     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7001               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7002          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7003               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7004        ) {
7005         float_raise(float_flag_invalid, status);
7006         return 1;
7007     }
7008     return 0;
7009 }
7010 
7011 /*----------------------------------------------------------------------------
7012 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7013 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7014 | exception.  The comparison is performed according to the IEC/IEEE Standard
7015 | for Binary Floating-Point Arithmetic.
7016 *----------------------------------------------------------------------------*/
7017 
7018 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7019 {
7020 
7021     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7022               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7023          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7024               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7025        ) {
7026         if (float128_is_signaling_nan(a, status)
7027          || float128_is_signaling_nan(b, status)) {
7028             float_raise(float_flag_invalid, status);
7029         }
7030         return 0;
7031     }
7032     return
7033            ( a.low == b.low )
7034         && (    ( a.high == b.high )
7035              || (    ( a.low == 0 )
7036                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7037            );
7038 
7039 }
7040 
7041 /*----------------------------------------------------------------------------
7042 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7043 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7044 | cause an exception.  Otherwise, the comparison is performed according to the
7045 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7046 *----------------------------------------------------------------------------*/
7047 
7048 int float128_le_quiet(float128 a, float128 b, float_status *status)
7049 {
7050     flag aSign, bSign;
7051 
7052     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7053               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7054          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7055               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7056        ) {
7057         if (float128_is_signaling_nan(a, status)
7058          || float128_is_signaling_nan(b, status)) {
7059             float_raise(float_flag_invalid, status);
7060         }
7061         return 0;
7062     }
7063     aSign = extractFloat128Sign( a );
7064     bSign = extractFloat128Sign( b );
7065     if ( aSign != bSign ) {
7066         return
7067                aSign
7068             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7069                  == 0 );
7070     }
7071     return
7072           aSign ? le128( b.high, b.low, a.high, a.low )
7073         : le128( a.high, a.low, b.high, b.low );
7074 
7075 }
7076 
7077 /*----------------------------------------------------------------------------
7078 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7079 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7080 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7081 | Standard for Binary Floating-Point Arithmetic.
7082 *----------------------------------------------------------------------------*/
7083 
7084 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7085 {
7086     flag aSign, bSign;
7087 
7088     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7089               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7090          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7091               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7092        ) {
7093         if (float128_is_signaling_nan(a, status)
7094          || float128_is_signaling_nan(b, status)) {
7095             float_raise(float_flag_invalid, status);
7096         }
7097         return 0;
7098     }
7099     aSign = extractFloat128Sign( a );
7100     bSign = extractFloat128Sign( b );
7101     if ( aSign != bSign ) {
7102         return
7103                aSign
7104             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7105                  != 0 );
7106     }
7107     return
7108           aSign ? lt128( b.high, b.low, a.high, a.low )
7109         : lt128( a.high, a.low, b.high, b.low );
7110 
7111 }
7112 
7113 /*----------------------------------------------------------------------------
7114 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7115 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7116 | comparison is performed according to the IEC/IEEE Standard for Binary
7117 | Floating-Point Arithmetic.
7118 *----------------------------------------------------------------------------*/
7119 
7120 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7121 {
7122     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7123               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7124          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7125               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7126        ) {
7127         if (float128_is_signaling_nan(a, status)
7128          || float128_is_signaling_nan(b, status)) {
7129             float_raise(float_flag_invalid, status);
7130         }
7131         return 1;
7132     }
7133     return 0;
7134 }
7135 
7136 /* misc functions */
7137 float32 uint32_to_float32(uint32_t a, float_status *status)
7138 {
7139     return int64_to_float32(a, status);
7140 }
7141 
7142 float64 uint32_to_float64(uint32_t a, float_status *status)
7143 {
7144     return int64_to_float64(a, status);
7145 }
7146 
7147 uint32_t float32_to_uint32(float32 a, float_status *status)
7148 {
7149     int64_t v;
7150     uint32_t res;
7151     int old_exc_flags = get_float_exception_flags(status);
7152 
7153     v = float32_to_int64(a, status);
7154     if (v < 0) {
7155         res = 0;
7156     } else if (v > 0xffffffff) {
7157         res = 0xffffffff;
7158     } else {
7159         return v;
7160     }
7161     set_float_exception_flags(old_exc_flags, status);
7162     float_raise(float_flag_invalid, status);
7163     return res;
7164 }
7165 
7166 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7167 {
7168     int64_t v;
7169     uint32_t res;
7170     int old_exc_flags = get_float_exception_flags(status);
7171 
7172     v = float32_to_int64_round_to_zero(a, status);
7173     if (v < 0) {
7174         res = 0;
7175     } else if (v > 0xffffffff) {
7176         res = 0xffffffff;
7177     } else {
7178         return v;
7179     }
7180     set_float_exception_flags(old_exc_flags, status);
7181     float_raise(float_flag_invalid, status);
7182     return res;
7183 }
7184 
7185 int16_t float32_to_int16(float32 a, float_status *status)
7186 {
7187     int32_t v;
7188     int16_t res;
7189     int old_exc_flags = get_float_exception_flags(status);
7190 
7191     v = float32_to_int32(a, status);
7192     if (v < -0x8000) {
7193         res = -0x8000;
7194     } else if (v > 0x7fff) {
7195         res = 0x7fff;
7196     } else {
7197         return v;
7198     }
7199 
7200     set_float_exception_flags(old_exc_flags, status);
7201     float_raise(float_flag_invalid, status);
7202     return res;
7203 }
7204 
7205 uint16_t float32_to_uint16(float32 a, float_status *status)
7206 {
7207     int32_t v;
7208     uint16_t res;
7209     int old_exc_flags = get_float_exception_flags(status);
7210 
7211     v = float32_to_int32(a, status);
7212     if (v < 0) {
7213         res = 0;
7214     } else if (v > 0xffff) {
7215         res = 0xffff;
7216     } else {
7217         return v;
7218     }
7219 
7220     set_float_exception_flags(old_exc_flags, status);
7221     float_raise(float_flag_invalid, status);
7222     return res;
7223 }
7224 
7225 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7226 {
7227     int64_t v;
7228     uint16_t res;
7229     int old_exc_flags = get_float_exception_flags(status);
7230 
7231     v = float32_to_int64_round_to_zero(a, status);
7232     if (v < 0) {
7233         res = 0;
7234     } else if (v > 0xffff) {
7235         res = 0xffff;
7236     } else {
7237         return v;
7238     }
7239     set_float_exception_flags(old_exc_flags, status);
7240     float_raise(float_flag_invalid, status);
7241     return res;
7242 }
7243 
7244 uint32_t float64_to_uint32(float64 a, float_status *status)
7245 {
7246     uint64_t v;
7247     uint32_t res;
7248     int old_exc_flags = get_float_exception_flags(status);
7249 
7250     v = float64_to_uint64(a, status);
7251     if (v > 0xffffffff) {
7252         res = 0xffffffff;
7253     } else {
7254         return v;
7255     }
7256     set_float_exception_flags(old_exc_flags, status);
7257     float_raise(float_flag_invalid, status);
7258     return res;
7259 }
7260 
7261 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7262 {
7263     uint64_t v;
7264     uint32_t res;
7265     int old_exc_flags = get_float_exception_flags(status);
7266 
7267     v = float64_to_uint64_round_to_zero(a, status);
7268     if (v > 0xffffffff) {
7269         res = 0xffffffff;
7270     } else {
7271         return v;
7272     }
7273     set_float_exception_flags(old_exc_flags, status);
7274     float_raise(float_flag_invalid, status);
7275     return res;
7276 }
7277 
7278 int16_t float64_to_int16(float64 a, float_status *status)
7279 {
7280     int64_t v;
7281     int16_t res;
7282     int old_exc_flags = get_float_exception_flags(status);
7283 
7284     v = float64_to_int32(a, status);
7285     if (v < -0x8000) {
7286         res = -0x8000;
7287     } else if (v > 0x7fff) {
7288         res = 0x7fff;
7289     } else {
7290         return v;
7291     }
7292 
7293     set_float_exception_flags(old_exc_flags, status);
7294     float_raise(float_flag_invalid, status);
7295     return res;
7296 }
7297 
7298 uint16_t float64_to_uint16(float64 a, float_status *status)
7299 {
7300     int64_t v;
7301     uint16_t res;
7302     int old_exc_flags = get_float_exception_flags(status);
7303 
7304     v = float64_to_int32(a, status);
7305     if (v < 0) {
7306         res = 0;
7307     } else if (v > 0xffff) {
7308         res = 0xffff;
7309     } else {
7310         return v;
7311     }
7312 
7313     set_float_exception_flags(old_exc_flags, status);
7314     float_raise(float_flag_invalid, status);
7315     return res;
7316 }
7317 
7318 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7319 {
7320     int64_t v;
7321     uint16_t res;
7322     int old_exc_flags = get_float_exception_flags(status);
7323 
7324     v = float64_to_int64_round_to_zero(a, status);
7325     if (v < 0) {
7326         res = 0;
7327     } else if (v > 0xffff) {
7328         res = 0xffff;
7329     } else {
7330         return v;
7331     }
7332     set_float_exception_flags(old_exc_flags, status);
7333     float_raise(float_flag_invalid, status);
7334     return res;
7335 }
7336 
7337 /*----------------------------------------------------------------------------
7338 | Returns the result of converting the double-precision floating-point value
7339 | `a' to the 64-bit unsigned integer format.  The conversion is
7340 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7341 | Arithmetic---which means in particular that the conversion is rounded
7342 | according to the current rounding mode.  If `a' is a NaN, the largest
7343 | positive integer is returned.  If the conversion overflows, the
7344 | largest unsigned integer is returned.  If 'a' is negative, the value is
7345 | rounded and zero is returned; negative values that do not round to zero
7346 | will raise the inexact exception.
7347 *----------------------------------------------------------------------------*/
7348 
7349 uint64_t float64_to_uint64(float64 a, float_status *status)
7350 {
7351     flag aSign;
7352     int aExp;
7353     int shiftCount;
7354     uint64_t aSig, aSigExtra;
7355     a = float64_squash_input_denormal(a, status);
7356 
7357     aSig = extractFloat64Frac(a);
7358     aExp = extractFloat64Exp(a);
7359     aSign = extractFloat64Sign(a);
7360     if (aSign && (aExp > 1022)) {
7361         float_raise(float_flag_invalid, status);
7362         if (float64_is_any_nan(a)) {
7363             return LIT64(0xFFFFFFFFFFFFFFFF);
7364         } else {
7365             return 0;
7366         }
7367     }
7368     if (aExp) {
7369         aSig |= LIT64(0x0010000000000000);
7370     }
7371     shiftCount = 0x433 - aExp;
7372     if (shiftCount <= 0) {
7373         if (0x43E < aExp) {
7374             float_raise(float_flag_invalid, status);
7375             return LIT64(0xFFFFFFFFFFFFFFFF);
7376         }
7377         aSigExtra = 0;
7378         aSig <<= -shiftCount;
7379     } else {
7380         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7381     }
7382     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7383 }
7384 
7385 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7386 {
7387     signed char current_rounding_mode = status->float_rounding_mode;
7388     set_float_rounding_mode(float_round_to_zero, status);
7389     int64_t v = float64_to_uint64(a, status);
7390     set_float_rounding_mode(current_rounding_mode, status);
7391     return v;
7392 }
7393 
7394 #define COMPARE(s, nan_exp)                                                  \
7395 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7396                                       int is_quiet, float_status *status)    \
7397 {                                                                            \
7398     flag aSign, bSign;                                                       \
7399     uint ## s ## _t av, bv;                                                  \
7400     a = float ## s ## _squash_input_denormal(a, status);                     \
7401     b = float ## s ## _squash_input_denormal(b, status);                     \
7402                                                                              \
7403     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7404          extractFloat ## s ## Frac( a ) ) ||                                 \
7405         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7406           extractFloat ## s ## Frac( b ) )) {                                \
7407         if (!is_quiet ||                                                     \
7408             float ## s ## _is_signaling_nan(a, status) ||                  \
7409             float ## s ## _is_signaling_nan(b, status)) {                 \
7410             float_raise(float_flag_invalid, status);                         \
7411         }                                                                    \
7412         return float_relation_unordered;                                     \
7413     }                                                                        \
7414     aSign = extractFloat ## s ## Sign( a );                                  \
7415     bSign = extractFloat ## s ## Sign( b );                                  \
7416     av = float ## s ## _val(a);                                              \
7417     bv = float ## s ## _val(b);                                              \
7418     if ( aSign != bSign ) {                                                  \
7419         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7420             /* zero case */                                                  \
7421             return float_relation_equal;                                     \
7422         } else {                                                             \
7423             return 1 - (2 * aSign);                                          \
7424         }                                                                    \
7425     } else {                                                                 \
7426         if (av == bv) {                                                      \
7427             return float_relation_equal;                                     \
7428         } else {                                                             \
7429             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7430         }                                                                    \
7431     }                                                                        \
7432 }                                                                            \
7433                                                                              \
7434 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7435 {                                                                            \
7436     return float ## s ## _compare_internal(a, b, 0, status);                 \
7437 }                                                                            \
7438                                                                              \
7439 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7440                                  float_status *status)                       \
7441 {                                                                            \
7442     return float ## s ## _compare_internal(a, b, 1, status);                 \
7443 }
7444 
7445 COMPARE(32, 0xff)
7446 COMPARE(64, 0x7ff)
7447 
7448 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7449                                             int is_quiet, float_status *status)
7450 {
7451     flag aSign, bSign;
7452 
7453     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7454         float_raise(float_flag_invalid, status);
7455         return float_relation_unordered;
7456     }
7457     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7458           ( extractFloatx80Frac( a )<<1 ) ) ||
7459         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7460           ( extractFloatx80Frac( b )<<1 ) )) {
7461         if (!is_quiet ||
7462             floatx80_is_signaling_nan(a, status) ||
7463             floatx80_is_signaling_nan(b, status)) {
7464             float_raise(float_flag_invalid, status);
7465         }
7466         return float_relation_unordered;
7467     }
7468     aSign = extractFloatx80Sign( a );
7469     bSign = extractFloatx80Sign( b );
7470     if ( aSign != bSign ) {
7471 
7472         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7473              ( ( a.low | b.low ) == 0 ) ) {
7474             /* zero case */
7475             return float_relation_equal;
7476         } else {
7477             return 1 - (2 * aSign);
7478         }
7479     } else {
7480         if (a.low == b.low && a.high == b.high) {
7481             return float_relation_equal;
7482         } else {
7483             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7484         }
7485     }
7486 }
7487 
7488 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7489 {
7490     return floatx80_compare_internal(a, b, 0, status);
7491 }
7492 
7493 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7494 {
7495     return floatx80_compare_internal(a, b, 1, status);
7496 }
7497 
7498 static inline int float128_compare_internal(float128 a, float128 b,
7499                                             int is_quiet, float_status *status)
7500 {
7501     flag aSign, bSign;
7502 
7503     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7504           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7505         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7506           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7507         if (!is_quiet ||
7508             float128_is_signaling_nan(a, status) ||
7509             float128_is_signaling_nan(b, status)) {
7510             float_raise(float_flag_invalid, status);
7511         }
7512         return float_relation_unordered;
7513     }
7514     aSign = extractFloat128Sign( a );
7515     bSign = extractFloat128Sign( b );
7516     if ( aSign != bSign ) {
7517         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7518             /* zero case */
7519             return float_relation_equal;
7520         } else {
7521             return 1 - (2 * aSign);
7522         }
7523     } else {
7524         if (a.low == b.low && a.high == b.high) {
7525             return float_relation_equal;
7526         } else {
7527             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7528         }
7529     }
7530 }
7531 
7532 int float128_compare(float128 a, float128 b, float_status *status)
7533 {
7534     return float128_compare_internal(a, b, 0, status);
7535 }
7536 
7537 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7538 {
7539     return float128_compare_internal(a, b, 1, status);
7540 }
7541 
7542 /* min() and max() functions. These can't be implemented as
7543  * 'compare and pick one input' because that would mishandle
7544  * NaNs and +0 vs -0.
7545  *
7546  * minnum() and maxnum() functions. These are similar to the min()
7547  * and max() functions but if one of the arguments is a QNaN and
7548  * the other is numerical then the numerical argument is returned.
7549  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7550  * and maxNum() operations. min() and max() are the typical min/max
7551  * semantics provided by many CPUs which predate that specification.
7552  *
7553  * minnummag() and maxnummag() functions correspond to minNumMag()
7554  * and minNumMag() from the IEEE-754 2008.
7555  */
7556 #define MINMAX(s)                                                       \
7557 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7558                                                int ismin, int isieee,   \
7559                                                int ismag,               \
7560                                                float_status *status)    \
7561 {                                                                       \
7562     flag aSign, bSign;                                                  \
7563     uint ## s ## _t av, bv, aav, abv;                                   \
7564     a = float ## s ## _squash_input_denormal(a, status);                \
7565     b = float ## s ## _squash_input_denormal(b, status);                \
7566     if (float ## s ## _is_any_nan(a) ||                                 \
7567         float ## s ## _is_any_nan(b)) {                                 \
7568         if (isieee) {                                                   \
7569             if (float ## s ## _is_quiet_nan(a, status) &&               \
7570                 !float ## s ##_is_any_nan(b)) {                         \
7571                 return b;                                               \
7572             } else if (float ## s ## _is_quiet_nan(b, status) &&        \
7573                        !float ## s ## _is_any_nan(a)) {                \
7574                 return a;                                               \
7575             }                                                           \
7576         }                                                               \
7577         return propagateFloat ## s ## NaN(a, b, status);                \
7578     }                                                                   \
7579     aSign = extractFloat ## s ## Sign(a);                               \
7580     bSign = extractFloat ## s ## Sign(b);                               \
7581     av = float ## s ## _val(a);                                         \
7582     bv = float ## s ## _val(b);                                         \
7583     if (ismag) {                                                        \
7584         aav = float ## s ## _abs(av);                                   \
7585         abv = float ## s ## _abs(bv);                                   \
7586         if (aav != abv) {                                               \
7587             if (ismin) {                                                \
7588                 return (aav < abv) ? a : b;                             \
7589             } else {                                                    \
7590                 return (aav < abv) ? b : a;                             \
7591             }                                                           \
7592         }                                                               \
7593     }                                                                   \
7594     if (aSign != bSign) {                                               \
7595         if (ismin) {                                                    \
7596             return aSign ? a : b;                                       \
7597         } else {                                                        \
7598             return aSign ? b : a;                                       \
7599         }                                                               \
7600     } else {                                                            \
7601         if (ismin) {                                                    \
7602             return (aSign ^ (av < bv)) ? a : b;                         \
7603         } else {                                                        \
7604             return (aSign ^ (av < bv)) ? b : a;                         \
7605         }                                                               \
7606     }                                                                   \
7607 }                                                                       \
7608                                                                         \
7609 float ## s float ## s ## _min(float ## s a, float ## s b,               \
7610                               float_status *status)                     \
7611 {                                                                       \
7612     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7613 }                                                                       \
7614                                                                         \
7615 float ## s float ## s ## _max(float ## s a, float ## s b,               \
7616                               float_status *status)                     \
7617 {                                                                       \
7618     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7619 }                                                                       \
7620                                                                         \
7621 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7622                                  float_status *status)                  \
7623 {                                                                       \
7624     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7625 }                                                                       \
7626                                                                         \
7627 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7628                                  float_status *status)                  \
7629 {                                                                       \
7630     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7631 }                                                                       \
7632                                                                         \
7633 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7634                                     float_status *status)               \
7635 {                                                                       \
7636     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7637 }                                                                       \
7638                                                                         \
7639 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7640                                     float_status *status)               \
7641 {                                                                       \
7642     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7643 }
7644 
7645 MINMAX(32)
7646 MINMAX(64)
7647 
7648 
7649 /* Multiply A by 2 raised to the power N.  */
7650 float32 float32_scalbn(float32 a, int n, float_status *status)
7651 {
7652     flag aSign;
7653     int16_t aExp;
7654     uint32_t aSig;
7655 
7656     a = float32_squash_input_denormal(a, status);
7657     aSig = extractFloat32Frac( a );
7658     aExp = extractFloat32Exp( a );
7659     aSign = extractFloat32Sign( a );
7660 
7661     if ( aExp == 0xFF ) {
7662         if ( aSig ) {
7663             return propagateFloat32NaN(a, a, status);
7664         }
7665         return a;
7666     }
7667     if (aExp != 0) {
7668         aSig |= 0x00800000;
7669     } else if (aSig == 0) {
7670         return a;
7671     } else {
7672         aExp++;
7673     }
7674 
7675     if (n > 0x200) {
7676         n = 0x200;
7677     } else if (n < -0x200) {
7678         n = -0x200;
7679     }
7680 
7681     aExp += n - 1;
7682     aSig <<= 7;
7683     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7684 }
7685 
7686 float64 float64_scalbn(float64 a, int n, float_status *status)
7687 {
7688     flag aSign;
7689     int16_t aExp;
7690     uint64_t aSig;
7691 
7692     a = float64_squash_input_denormal(a, status);
7693     aSig = extractFloat64Frac( a );
7694     aExp = extractFloat64Exp( a );
7695     aSign = extractFloat64Sign( a );
7696 
7697     if ( aExp == 0x7FF ) {
7698         if ( aSig ) {
7699             return propagateFloat64NaN(a, a, status);
7700         }
7701         return a;
7702     }
7703     if (aExp != 0) {
7704         aSig |= LIT64( 0x0010000000000000 );
7705     } else if (aSig == 0) {
7706         return a;
7707     } else {
7708         aExp++;
7709     }
7710 
7711     if (n > 0x1000) {
7712         n = 0x1000;
7713     } else if (n < -0x1000) {
7714         n = -0x1000;
7715     }
7716 
7717     aExp += n - 1;
7718     aSig <<= 10;
7719     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7720 }
7721 
7722 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7723 {
7724     flag aSign;
7725     int32_t aExp;
7726     uint64_t aSig;
7727 
7728     if (floatx80_invalid_encoding(a)) {
7729         float_raise(float_flag_invalid, status);
7730         return floatx80_default_nan(status);
7731     }
7732     aSig = extractFloatx80Frac( a );
7733     aExp = extractFloatx80Exp( a );
7734     aSign = extractFloatx80Sign( a );
7735 
7736     if ( aExp == 0x7FFF ) {
7737         if ( aSig<<1 ) {
7738             return propagateFloatx80NaN(a, a, status);
7739         }
7740         return a;
7741     }
7742 
7743     if (aExp == 0) {
7744         if (aSig == 0) {
7745             return a;
7746         }
7747         aExp++;
7748     }
7749 
7750     if (n > 0x10000) {
7751         n = 0x10000;
7752     } else if (n < -0x10000) {
7753         n = -0x10000;
7754     }
7755 
7756     aExp += n;
7757     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7758                                          aSign, aExp, aSig, 0, status);
7759 }
7760 
7761 float128 float128_scalbn(float128 a, int n, float_status *status)
7762 {
7763     flag aSign;
7764     int32_t aExp;
7765     uint64_t aSig0, aSig1;
7766 
7767     aSig1 = extractFloat128Frac1( a );
7768     aSig0 = extractFloat128Frac0( a );
7769     aExp = extractFloat128Exp( a );
7770     aSign = extractFloat128Sign( a );
7771     if ( aExp == 0x7FFF ) {
7772         if ( aSig0 | aSig1 ) {
7773             return propagateFloat128NaN(a, a, status);
7774         }
7775         return a;
7776     }
7777     if (aExp != 0) {
7778         aSig0 |= LIT64( 0x0001000000000000 );
7779     } else if (aSig0 == 0 && aSig1 == 0) {
7780         return a;
7781     } else {
7782         aExp++;
7783     }
7784 
7785     if (n > 0x10000) {
7786         n = 0x10000;
7787     } else if (n < -0x10000) {
7788         n = -0x10000;
7789     }
7790 
7791     aExp += n - 1;
7792     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7793                                          , status);
7794 
7795 }
7796