xref: /qemu/fpu/softfloat.c (revision 7a4e543d)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int_fast16_t extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
138 | input.  If `zSign' is 1, the input is negated before being converted to an
139 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
140 | is simply rounded to an integer, with the inexact exception raised if the
141 | input cannot be represented exactly as an integer.  However, if the fixed-
142 | point input is too large, the invalid exception is raised and the largest
143 | positive or negative integer is returned.
144 *----------------------------------------------------------------------------*/
145 
146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
147 {
148     int8_t roundingMode;
149     flag roundNearestEven;
150     int8_t roundIncrement, roundBits;
151     int32_t z;
152 
153     roundingMode = status->float_rounding_mode;
154     roundNearestEven = ( roundingMode == float_round_nearest_even );
155     switch (roundingMode) {
156     case float_round_nearest_even:
157     case float_round_ties_away:
158         roundIncrement = 0x40;
159         break;
160     case float_round_to_zero:
161         roundIncrement = 0;
162         break;
163     case float_round_up:
164         roundIncrement = zSign ? 0 : 0x7f;
165         break;
166     case float_round_down:
167         roundIncrement = zSign ? 0x7f : 0;
168         break;
169     default:
170         abort();
171     }
172     roundBits = absZ & 0x7F;
173     absZ = ( absZ + roundIncrement )>>7;
174     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
175     z = absZ;
176     if ( zSign ) z = - z;
177     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
178         float_raise(float_flag_invalid, status);
179         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
180     }
181     if (roundBits) {
182         status->float_exception_flags |= float_flag_inexact;
183     }
184     return z;
185 
186 }
187 
188 /*----------------------------------------------------------------------------
189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
191 | and returns the properly rounded 64-bit integer corresponding to the input.
192 | If `zSign' is 1, the input is negated before being converted to an integer.
193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
194 | the inexact exception raised if the input cannot be represented exactly as
195 | an integer.  However, if the fixed-point input is too large, the invalid
196 | exception is raised and the largest positive or negative integer is
197 | returned.
198 *----------------------------------------------------------------------------*/
199 
200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
201                                float_status *status)
202 {
203     int8_t roundingMode;
204     flag roundNearestEven, increment;
205     int64_t z;
206 
207     roundingMode = status->float_rounding_mode;
208     roundNearestEven = ( roundingMode == float_round_nearest_even );
209     switch (roundingMode) {
210     case float_round_nearest_even:
211     case float_round_ties_away:
212         increment = ((int64_t) absZ1 < 0);
213         break;
214     case float_round_to_zero:
215         increment = 0;
216         break;
217     case float_round_up:
218         increment = !zSign && absZ1;
219         break;
220     case float_round_down:
221         increment = zSign && absZ1;
222         break;
223     default:
224         abort();
225     }
226     if ( increment ) {
227         ++absZ0;
228         if ( absZ0 == 0 ) goto overflow;
229         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
230     }
231     z = absZ0;
232     if ( zSign ) z = - z;
233     if ( z && ( ( z < 0 ) ^ zSign ) ) {
234  overflow:
235         float_raise(float_flag_invalid, status);
236         return
237               zSign ? (int64_t) LIT64( 0x8000000000000000 )
238             : LIT64( 0x7FFFFFFFFFFFFFFF );
239     }
240     if (absZ1) {
241         status->float_exception_flags |= float_flag_inexact;
242     }
243     return z;
244 
245 }
246 
247 /*----------------------------------------------------------------------------
248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
251 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
252 | with the inexact exception raised if the input cannot be represented exactly
253 | as an integer.  However, if the fixed-point input is too large, the invalid
254 | exception is raised and the largest unsigned integer is returned.
255 *----------------------------------------------------------------------------*/
256 
257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
258                                 uint64_t absZ1, float_status *status)
259 {
260     int8_t roundingMode;
261     flag roundNearestEven, increment;
262 
263     roundingMode = status->float_rounding_mode;
264     roundNearestEven = (roundingMode == float_round_nearest_even);
265     switch (roundingMode) {
266     case float_round_nearest_even:
267     case float_round_ties_away:
268         increment = ((int64_t)absZ1 < 0);
269         break;
270     case float_round_to_zero:
271         increment = 0;
272         break;
273     case float_round_up:
274         increment = !zSign && absZ1;
275         break;
276     case float_round_down:
277         increment = zSign && absZ1;
278         break;
279     default:
280         abort();
281     }
282     if (increment) {
283         ++absZ0;
284         if (absZ0 == 0) {
285             float_raise(float_flag_invalid, status);
286             return LIT64(0xFFFFFFFFFFFFFFFF);
287         }
288         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
289     }
290 
291     if (zSign && absZ0) {
292         float_raise(float_flag_invalid, status);
293         return 0;
294     }
295 
296     if (absZ1) {
297         status->float_exception_flags |= float_flag_inexact;
298     }
299     return absZ0;
300 }
301 
302 /*----------------------------------------------------------------------------
303 | Returns the fraction bits of the single-precision floating-point value `a'.
304 *----------------------------------------------------------------------------*/
305 
306 static inline uint32_t extractFloat32Frac( float32 a )
307 {
308 
309     return float32_val(a) & 0x007FFFFF;
310 
311 }
312 
313 /*----------------------------------------------------------------------------
314 | Returns the exponent bits of the single-precision floating-point value `a'.
315 *----------------------------------------------------------------------------*/
316 
317 static inline int_fast16_t extractFloat32Exp(float32 a)
318 {
319 
320     return ( float32_val(a)>>23 ) & 0xFF;
321 
322 }
323 
324 /*----------------------------------------------------------------------------
325 | Returns the sign bit of the single-precision floating-point value `a'.
326 *----------------------------------------------------------------------------*/
327 
328 static inline flag extractFloat32Sign( float32 a )
329 {
330 
331     return float32_val(a)>>31;
332 
333 }
334 
335 /*----------------------------------------------------------------------------
336 | If `a' is denormal and we are in flush-to-zero mode then set the
337 | input-denormal exception and return zero. Otherwise just return the value.
338 *----------------------------------------------------------------------------*/
339 float32 float32_squash_input_denormal(float32 a, float_status *status)
340 {
341     if (status->flush_inputs_to_zero) {
342         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
343             float_raise(float_flag_input_denormal, status);
344             return make_float32(float32_val(a) & 0x80000000);
345         }
346     }
347     return a;
348 }
349 
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'.  The significand is shifted left
| until exactly 8 leading zero bits remain (leading 1 at bit 23) and stored
| through `zSigPtr'; the matching exponent, 1 minus the shift distance, is
| stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    const int8_t shift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
367 
368 /*----------------------------------------------------------------------------
369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370 | single-precision floating-point value, returning the result.  After being
371 | shifted into the proper positions, the three fields are simply added
372 | together to form the result.  This means that any integer portion of `zSig'
373 | will be added into the exponent.  Since a properly normalized significand
374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
375 | than the desired result exponent whenever `zSig' is a complete, normalized
376 | significand.
377 *----------------------------------------------------------------------------*/
378 
379 static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
380 {
381 
382     return make_float32(
383           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
384 
385 }
386 
387 /*----------------------------------------------------------------------------
388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389 | and significand `zSig', and returns the proper single-precision floating-
390 | point value corresponding to the abstract input.  Ordinarily, the abstract
391 | value is simply rounded and packed into the single-precision format, with
392 | the inexact exception raised if the abstract input cannot be represented
393 | exactly.  However, if the abstract value is too large, the overflow and
394 | inexact exceptions are raised and an infinity or maximal finite value is
395 | returned.  If the abstract value is too small, the input value is rounded to
396 | a subnormal number, and the underflow and inexact exceptions are raised if
397 | the abstract input cannot be represented exactly as a subnormal single-
398 | precision floating-point number.
399 |     The input significand `zSig' has its binary point between bits 30
400 | and 29, which is 7 bits to the left of the usual location.  This shifted
401 | significand must be normalized or smaller.  If `zSig' is not normalized,
402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
403 | and it must not require rounding.  In the usual case that `zSig' is
404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
406 | Binary Floating-Point Arithmetic.
407 *----------------------------------------------------------------------------*/
408 
409 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
410                                    float_status *status)
411 {
412     int8_t roundingMode;
413     flag roundNearestEven;
414     int8_t roundIncrement, roundBits;
415     flag isTiny;
416 
417     roundingMode = status->float_rounding_mode;
418     roundNearestEven = ( roundingMode == float_round_nearest_even );
419     switch (roundingMode) {
420     case float_round_nearest_even:
421     case float_round_ties_away:
422         roundIncrement = 0x40;
423         break;
424     case float_round_to_zero:
425         roundIncrement = 0;
426         break;
427     case float_round_up:
428         roundIncrement = zSign ? 0 : 0x7f;
429         break;
430     case float_round_down:
431         roundIncrement = zSign ? 0x7f : 0;
432         break;
433     default:
434         abort();
435         break;
436     }
437     roundBits = zSig & 0x7F;
438     if ( 0xFD <= (uint16_t) zExp ) {
439         if (    ( 0xFD < zExp )
440              || (    ( zExp == 0xFD )
441                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
442            ) {
443             float_raise(float_flag_overflow | float_flag_inexact, status);
444             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
445         }
446         if ( zExp < 0 ) {
447             if (status->flush_to_zero) {
448                 float_raise(float_flag_output_denormal, status);
449                 return packFloat32(zSign, 0, 0);
450             }
451             isTiny =
452                 (status->float_detect_tininess
453                  == float_tininess_before_rounding)
454                 || ( zExp < -1 )
455                 || ( zSig + roundIncrement < 0x80000000 );
456             shift32RightJamming( zSig, - zExp, &zSig );
457             zExp = 0;
458             roundBits = zSig & 0x7F;
459             if (isTiny && roundBits) {
460                 float_raise(float_flag_underflow, status);
461             }
462         }
463     }
464     if (roundBits) {
465         status->float_exception_flags |= float_flag_inexact;
466     }
467     zSig = ( zSig + roundIncrement )>>7;
468     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
469     if ( zSig == 0 ) zExp = 0;
470     return packFloat32( zSign, zExp, zSig );
471 
472 }
473 
474 /*----------------------------------------------------------------------------
475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476 | and significand `zSig', and returns the proper single-precision floating-
477 | point value corresponding to the abstract input.  This routine is just like
478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480 | floating-point exponent.
481 *----------------------------------------------------------------------------*/
482 
483 static float32
484  normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
485                               float_status *status)
486 {
487     int8_t shiftCount;
488 
489     shiftCount = countLeadingZeros32( zSig ) - 1;
490     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
491                                status);
492 
493 }
494 
495 /*----------------------------------------------------------------------------
496 | Returns the fraction bits of the double-precision floating-point value `a'.
497 *----------------------------------------------------------------------------*/
498 
499 static inline uint64_t extractFloat64Frac( float64 a )
500 {
501 
502     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
503 
504 }
505 
506 /*----------------------------------------------------------------------------
507 | Returns the exponent bits of the double-precision floating-point value `a'.
508 *----------------------------------------------------------------------------*/
509 
510 static inline int_fast16_t extractFloat64Exp(float64 a)
511 {
512 
513     return ( float64_val(a)>>52 ) & 0x7FF;
514 
515 }
516 
517 /*----------------------------------------------------------------------------
518 | Returns the sign bit of the double-precision floating-point value `a'.
519 *----------------------------------------------------------------------------*/
520 
521 static inline flag extractFloat64Sign( float64 a )
522 {
523 
524     return float64_val(a)>>63;
525 
526 }
527 
528 /*----------------------------------------------------------------------------
529 | If `a' is denormal and we are in flush-to-zero mode then set the
530 | input-denormal exception and return zero. Otherwise just return the value.
531 *----------------------------------------------------------------------------*/
532 float64 float64_squash_input_denormal(float64 a, float_status *status)
533 {
534     if (status->flush_inputs_to_zero) {
535         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
536             float_raise(float_flag_input_denormal, status);
537             return make_float64(float64_val(a) & (1ULL << 63));
538         }
539     }
540     return a;
541 }
542 
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value represented
| by the denormalized significand `aSig'.  The significand is shifted left
| until exactly 11 leading zero bits remain (leading 1 at bit 52) and stored
| through `zSigPtr'; the matching exponent, 1 minus the shift distance, is
| stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr,
                                      uint64_t *zSigPtr)
{
    const int8_t shift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
560 
561 /*----------------------------------------------------------------------------
562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563 | double-precision floating-point value, returning the result.  After being
564 | shifted into the proper positions, the three fields are simply added
565 | together to form the result.  This means that any integer portion of `zSig'
566 | will be added into the exponent.  Since a properly normalized significand
567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
568 | than the desired result exponent whenever `zSig' is a complete, normalized
569 | significand.
570 *----------------------------------------------------------------------------*/
571 
572 static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
573 {
574 
575     return make_float64(
576         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
577 
578 }
579 
580 /*----------------------------------------------------------------------------
581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582 | and significand `zSig', and returns the proper double-precision floating-
583 | point value corresponding to the abstract input.  Ordinarily, the abstract
584 | value is simply rounded and packed into the double-precision format, with
585 | the inexact exception raised if the abstract input cannot be represented
586 | exactly.  However, if the abstract value is too large, the overflow and
587 | inexact exceptions are raised and an infinity or maximal finite value is
588 | returned.  If the abstract value is too small, the input value is rounded to
589 | a subnormal number, and the underflow and inexact exceptions are raised if
590 | the abstract input cannot be represented exactly as a subnormal double-
591 | precision floating-point number.
592 |     The input significand `zSig' has its binary point between bits 62
593 | and 61, which is 10 bits to the left of the usual location.  This shifted
594 | significand must be normalized or smaller.  If `zSig' is not normalized,
595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
596 | and it must not require rounding.  In the usual case that `zSig' is
597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
599 | Binary Floating-Point Arithmetic.
600 *----------------------------------------------------------------------------*/
601 
602 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
603                                    float_status *status)
604 {
605     int8_t roundingMode;
606     flag roundNearestEven;
607     int_fast16_t roundIncrement, roundBits;
608     flag isTiny;
609 
610     roundingMode = status->float_rounding_mode;
611     roundNearestEven = ( roundingMode == float_round_nearest_even );
612     switch (roundingMode) {
613     case float_round_nearest_even:
614     case float_round_ties_away:
615         roundIncrement = 0x200;
616         break;
617     case float_round_to_zero:
618         roundIncrement = 0;
619         break;
620     case float_round_up:
621         roundIncrement = zSign ? 0 : 0x3ff;
622         break;
623     case float_round_down:
624         roundIncrement = zSign ? 0x3ff : 0;
625         break;
626     default:
627         abort();
628     }
629     roundBits = zSig & 0x3FF;
630     if ( 0x7FD <= (uint16_t) zExp ) {
631         if (    ( 0x7FD < zExp )
632              || (    ( zExp == 0x7FD )
633                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
634            ) {
635             float_raise(float_flag_overflow | float_flag_inexact, status);
636             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
637         }
638         if ( zExp < 0 ) {
639             if (status->flush_to_zero) {
640                 float_raise(float_flag_output_denormal, status);
641                 return packFloat64(zSign, 0, 0);
642             }
643             isTiny =
644                    (status->float_detect_tininess
645                     == float_tininess_before_rounding)
646                 || ( zExp < -1 )
647                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
648             shift64RightJamming( zSig, - zExp, &zSig );
649             zExp = 0;
650             roundBits = zSig & 0x3FF;
651             if (isTiny && roundBits) {
652                 float_raise(float_flag_underflow, status);
653             }
654         }
655     }
656     if (roundBits) {
657         status->float_exception_flags |= float_flag_inexact;
658     }
659     zSig = ( zSig + roundIncrement )>>10;
660     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
661     if ( zSig == 0 ) zExp = 0;
662     return packFloat64( zSign, zExp, zSig );
663 
664 }
665 
666 /*----------------------------------------------------------------------------
667 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
668 | and significand `zSig', and returns the proper double-precision floating-
669 | point value corresponding to the abstract input.  This routine is just like
670 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
671 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
672 | floating-point exponent.
673 *----------------------------------------------------------------------------*/
674 
675 static float64
676  normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
677                               float_status *status)
678 {
679     int8_t shiftCount;
680 
681     shiftCount = countLeadingZeros64( zSig ) - 1;
682     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
683                                status);
684 
685 }
686 
687 /*----------------------------------------------------------------------------
688 | Returns the fraction bits of the extended double-precision floating-point
689 | value `a'.
690 *----------------------------------------------------------------------------*/
691 
692 static inline uint64_t extractFloatx80Frac( floatx80 a )
693 {
694 
695     return a.low;
696 
697 }
698 
699 /*----------------------------------------------------------------------------
700 | Returns the exponent bits of the extended double-precision floating-point
701 | value `a'.
702 *----------------------------------------------------------------------------*/
703 
704 static inline int32_t extractFloatx80Exp( floatx80 a )
705 {
706 
707     return a.high & 0x7FFF;
708 
709 }
710 
711 /*----------------------------------------------------------------------------
712 | Returns the sign bit of the extended double-precision floating-point value
713 | `a'.
714 *----------------------------------------------------------------------------*/
715 
716 static inline flag extractFloatx80Sign( floatx80 a )
717 {
718 
719     return a.high>>15;
720 
721 }
722 
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The significand is
| shifted left until no leading zero bits remain (leading 1 at bit 63) and
| stored through `zSigPtr'; the matching exponent, 1 minus the shift distance,
| is stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                       uint64_t *zSigPtr)
{
    const int8_t shift = countLeadingZeros64(aSig);

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
740 
741 /*----------------------------------------------------------------------------
742 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
743 | extended double-precision floating-point value, returning the result.
744 *----------------------------------------------------------------------------*/
745 
746 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
747 {
748     floatx80 z;
749 
750     z.low = zSig;
751     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
752     return z;
753 
754 }
755 
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
758 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input.  Ordinarily, the abstract value is
761 | rounded and packed into the extended double-precision format, with the
762 | inexact exception raised if the abstract input cannot be represented
763 | exactly.  However, if the abstract value is too large, the overflow and
764 | inexact exceptions are raised and an infinity or maximal finite value is
765 | returned.  If the abstract value is too small, the input value is rounded to
766 | a subnormal number, and the underflow and inexact exceptions are raised if
767 | the abstract input cannot be represented exactly as a subnormal extended
768 | double-precision floating-point number.
769 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
770 | number of bits as single or double precision, respectively.  Otherwise, the
771 | result is rounded to the full precision of the extended double-precision
772 | format.
773 |     The input significand must be normalized or smaller.  If the input
774 | significand is not normalized, `zExp' must be 0; in that case, the result
775 | returned is a subnormal number, and it must not require rounding.  The
776 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
777 | Floating-Point Arithmetic.
778 *----------------------------------------------------------------------------*/
779 
780 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
781                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
782                                      float_status *status)
783 {
784     int8_t roundingMode;
785     flag roundNearestEven, increment, isTiny;
786     int64_t roundIncrement, roundMask, roundBits;
787 
788     roundingMode = status->float_rounding_mode;
789     roundNearestEven = ( roundingMode == float_round_nearest_even );
790     if ( roundingPrecision == 80 ) goto precision80;
791     if ( roundingPrecision == 64 ) {
792         roundIncrement = LIT64( 0x0000000000000400 );
793         roundMask = LIT64( 0x00000000000007FF );
794     }
795     else if ( roundingPrecision == 32 ) {
796         roundIncrement = LIT64( 0x0000008000000000 );
797         roundMask = LIT64( 0x000000FFFFFFFFFF );
798     }
799     else {
800         goto precision80;
801     }
802     zSig0 |= ( zSig1 != 0 );
803     switch (roundingMode) {
804     case float_round_nearest_even:
805     case float_round_ties_away:
806         break;
807     case float_round_to_zero:
808         roundIncrement = 0;
809         break;
810     case float_round_up:
811         roundIncrement = zSign ? 0 : roundMask;
812         break;
813     case float_round_down:
814         roundIncrement = zSign ? roundMask : 0;
815         break;
816     default:
817         abort();
818     }
819     roundBits = zSig0 & roundMask;
820     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
821         if (    ( 0x7FFE < zExp )
822              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
823            ) {
824             goto overflow;
825         }
826         if ( zExp <= 0 ) {
827             if (status->flush_to_zero) {
828                 float_raise(float_flag_output_denormal, status);
829                 return packFloatx80(zSign, 0, 0);
830             }
831             isTiny =
832                    (status->float_detect_tininess
833                     == float_tininess_before_rounding)
834                 || ( zExp < 0 )
835                 || ( zSig0 <= zSig0 + roundIncrement );
836             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
837             zExp = 0;
838             roundBits = zSig0 & roundMask;
839             if (isTiny && roundBits) {
840                 float_raise(float_flag_underflow, status);
841             }
842             if (roundBits) {
843                 status->float_exception_flags |= float_flag_inexact;
844             }
845             zSig0 += roundIncrement;
846             if ( (int64_t) zSig0 < 0 ) zExp = 1;
847             roundIncrement = roundMask + 1;
848             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
849                 roundMask |= roundIncrement;
850             }
851             zSig0 &= ~ roundMask;
852             return packFloatx80( zSign, zExp, zSig0 );
853         }
854     }
855     if (roundBits) {
856         status->float_exception_flags |= float_flag_inexact;
857     }
858     zSig0 += roundIncrement;
859     if ( zSig0 < roundIncrement ) {
860         ++zExp;
861         zSig0 = LIT64( 0x8000000000000000 );
862     }
863     roundIncrement = roundMask + 1;
864     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
865         roundMask |= roundIncrement;
866     }
867     zSig0 &= ~ roundMask;
868     if ( zSig0 == 0 ) zExp = 0;
869     return packFloatx80( zSign, zExp, zSig0 );
870  precision80:
871     switch (roundingMode) {
872     case float_round_nearest_even:
873     case float_round_ties_away:
874         increment = ((int64_t)zSig1 < 0);
875         break;
876     case float_round_to_zero:
877         increment = 0;
878         break;
879     case float_round_up:
880         increment = !zSign && zSig1;
881         break;
882     case float_round_down:
883         increment = zSign && zSig1;
884         break;
885     default:
886         abort();
887     }
888     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
889         if (    ( 0x7FFE < zExp )
890              || (    ( zExp == 0x7FFE )
891                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
892                   && increment
893                 )
894            ) {
895             roundMask = 0;
896  overflow:
897             float_raise(float_flag_overflow | float_flag_inexact, status);
898             if (    ( roundingMode == float_round_to_zero )
899                  || ( zSign && ( roundingMode == float_round_up ) )
900                  || ( ! zSign && ( roundingMode == float_round_down ) )
901                ) {
902                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
903             }
904             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
905         }
906         if ( zExp <= 0 ) {
907             isTiny =
908                    (status->float_detect_tininess
909                     == float_tininess_before_rounding)
910                 || ( zExp < 0 )
911                 || ! increment
912                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
913             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
914             zExp = 0;
915             if (isTiny && zSig1) {
916                 float_raise(float_flag_underflow, status);
917             }
918             if (zSig1) {
919                 status->float_exception_flags |= float_flag_inexact;
920             }
921             switch (roundingMode) {
922             case float_round_nearest_even:
923             case float_round_ties_away:
924                 increment = ((int64_t)zSig1 < 0);
925                 break;
926             case float_round_to_zero:
927                 increment = 0;
928                 break;
929             case float_round_up:
930                 increment = !zSign && zSig1;
931                 break;
932             case float_round_down:
933                 increment = zSign && zSig1;
934                 break;
935             default:
936                 abort();
937             }
938             if ( increment ) {
939                 ++zSig0;
940                 zSig0 &=
941                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
942                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
943             }
944             return packFloatx80( zSign, zExp, zSig0 );
945         }
946     }
947     if (zSig1) {
948         status->float_exception_flags |= float_flag_inexact;
949     }
950     if ( increment ) {
951         ++zSig0;
952         if ( zSig0 == 0 ) {
953             ++zExp;
954             zSig0 = LIT64( 0x8000000000000000 );
955         }
956         else {
957             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
958         }
959     }
960     else {
961         if ( zSig0 == 0 ) zExp = 0;
962     }
963     return packFloatx80( zSign, zExp, zSig0 );
964 
965 }
966 
967 /*----------------------------------------------------------------------------
968 | Takes an abstract floating-point value having sign `zSign', exponent
969 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
970 | and returns the proper extended double-precision floating-point value
971 | corresponding to the abstract input.  This routine is just like
972 | `roundAndPackFloatx80' except that the input significand does not have to be
973 | normalized.
974 *----------------------------------------------------------------------------*/
975 
976 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
977                                               flag zSign, int32_t zExp,
978                                               uint64_t zSig0, uint64_t zSig1,
979                                               float_status *status)
980 {
981     int8_t shiftCount;
982 
983     if ( zSig0 == 0 ) {
984         zSig0 = zSig1;
985         zSig1 = 0;
986         zExp -= 64;
987     }
988     shiftCount = countLeadingZeros64( zSig0 );
989     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
990     zExp -= shiftCount;
991     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
992                                 zSig0, zSig1, status);
993 
994 }
995 
996 /*----------------------------------------------------------------------------
997 | Returns the least-significant 64 fraction bits of the quadruple-precision
998 | floating-point value `a'.
999 *----------------------------------------------------------------------------*/
1000 
1001 static inline uint64_t extractFloat128Frac1( float128 a )
1002 {
1003 
1004     return a.low;
1005 
1006 }
1007 
1008 /*----------------------------------------------------------------------------
1009 | Returns the most-significant 48 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1012 
1013 static inline uint64_t extractFloat128Frac0( float128 a )
1014 {
1015 
1016     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1017 
1018 }
1019 
1020 /*----------------------------------------------------------------------------
1021 | Returns the exponent bits of the quadruple-precision floating-point value
1022 | `a'.
1023 *----------------------------------------------------------------------------*/
1024 
1025 static inline int32_t extractFloat128Exp( float128 a )
1026 {
1027 
1028     return ( a.high>>48 ) & 0x7FFF;
1029 
1030 }
1031 
1032 /*----------------------------------------------------------------------------
1033 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1034 *----------------------------------------------------------------------------*/
1035 
1036 static inline flag extractFloat128Sign( float128 a )
1037 {
1038 
1039     return a.high>>63;
1040 
1041 }
1042 
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value whose
| denormalized significand is the concatenation of `aSig0' and `aSig1'.  The
| significand is shifted so that its leading 1 lands in bit 48 of the upper
| word.  The upper word of the result is stored through `zSig0Ptr', the lower
| 64 bits through `zSig1Ptr', and the matching exponent through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading 1 is in the upper word: a plain 128-bit left shift. */
        shift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* Upper word empty: treat aSig1 as already moved up by 64 bits. */
    shift = countLeadingZeros64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* Leading 1 is above bit 48: split aSig1 across both words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
1083 
1084 /*----------------------------------------------------------------------------
1085 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1086 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1087 | floating-point value, returning the result.  After being shifted into the
1088 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1089 | added together to form the most significant 32 bits of the result.  This
1090 | means that any integer portion of `zSig0' will be added into the exponent.
1091 | Since a properly normalized significand will have an integer portion equal
1092 | to 1, the `zExp' input should be 1 less than the desired result exponent
1093 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1094 | significand.
1095 *----------------------------------------------------------------------------*/
1096 
1097 static inline float128
1098  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1099 {
1100     float128 z;
1101 
1102     z.low = zSig1;
1103     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1104     return z;
1105 
1106 }
1107 
1108 /*----------------------------------------------------------------------------
1109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1110 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1111 | and `zSig2', and returns the proper quadruple-precision floating-point value
1112 | corresponding to the abstract input.  Ordinarily, the abstract value is
1113 | simply rounded and packed into the quadruple-precision format, with the
1114 | inexact exception raised if the abstract input cannot be represented
1115 | exactly.  However, if the abstract value is too large, the overflow and
1116 | inexact exceptions are raised and an infinity or maximal finite value is
1117 | returned.  If the abstract value is too small, the input value is rounded to
1118 | a subnormal number, and the underflow and inexact exceptions are raised if
1119 | the abstract input cannot be represented exactly as a subnormal quadruple-
1120 | precision floating-point number.
1121 |     The input significand must be normalized or smaller.  If the input
1122 | significand is not normalized, `zExp' must be 0; in that case, the result
1123 | returned is a subnormal number, and it must not require rounding.  In the
1124 | usual case that the input significand is normalized, `zExp' must be 1 less
1125 | than the ``true'' floating-point exponent.  The handling of underflow and
1126 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1128 
1129 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1130                                      uint64_t zSig0, uint64_t zSig1,
1131                                      uint64_t zSig2, float_status *status)
1132 {
1133     int8_t roundingMode;
1134     flag roundNearestEven, increment, isTiny;
1135 
1136     roundingMode = status->float_rounding_mode;
1137     roundNearestEven = ( roundingMode == float_round_nearest_even );
1138     switch (roundingMode) {
1139     case float_round_nearest_even:
1140     case float_round_ties_away:
1141         increment = ((int64_t)zSig2 < 0);
1142         break;
1143     case float_round_to_zero:
1144         increment = 0;
1145         break;
1146     case float_round_up:
1147         increment = !zSign && zSig2;
1148         break;
1149     case float_round_down:
1150         increment = zSign && zSig2;
1151         break;
1152     default:
1153         abort();
1154     }
1155     if ( 0x7FFD <= (uint32_t) zExp ) {
1156         if (    ( 0x7FFD < zExp )
1157              || (    ( zExp == 0x7FFD )
1158                   && eq128(
1159                          LIT64( 0x0001FFFFFFFFFFFF ),
1160                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1161                          zSig0,
1162                          zSig1
1163                      )
1164                   && increment
1165                 )
1166            ) {
1167             float_raise(float_flag_overflow | float_flag_inexact, status);
1168             if (    ( roundingMode == float_round_to_zero )
1169                  || ( zSign && ( roundingMode == float_round_up ) )
1170                  || ( ! zSign && ( roundingMode == float_round_down ) )
1171                ) {
1172                 return
1173                     packFloat128(
1174                         zSign,
1175                         0x7FFE,
1176                         LIT64( 0x0000FFFFFFFFFFFF ),
1177                         LIT64( 0xFFFFFFFFFFFFFFFF )
1178                     );
1179             }
1180             return packFloat128( zSign, 0x7FFF, 0, 0 );
1181         }
1182         if ( zExp < 0 ) {
1183             if (status->flush_to_zero) {
1184                 float_raise(float_flag_output_denormal, status);
1185                 return packFloat128(zSign, 0, 0, 0);
1186             }
1187             isTiny =
1188                    (status->float_detect_tininess
1189                     == float_tininess_before_rounding)
1190                 || ( zExp < -1 )
1191                 || ! increment
1192                 || lt128(
1193                        zSig0,
1194                        zSig1,
1195                        LIT64( 0x0001FFFFFFFFFFFF ),
1196                        LIT64( 0xFFFFFFFFFFFFFFFF )
1197                    );
1198             shift128ExtraRightJamming(
1199                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1200             zExp = 0;
1201             if (isTiny && zSig2) {
1202                 float_raise(float_flag_underflow, status);
1203             }
1204             switch (roundingMode) {
1205             case float_round_nearest_even:
1206             case float_round_ties_away:
1207                 increment = ((int64_t)zSig2 < 0);
1208                 break;
1209             case float_round_to_zero:
1210                 increment = 0;
1211                 break;
1212             case float_round_up:
1213                 increment = !zSign && zSig2;
1214                 break;
1215             case float_round_down:
1216                 increment = zSign && zSig2;
1217                 break;
1218             default:
1219                 abort();
1220             }
1221         }
1222     }
1223     if (zSig2) {
1224         status->float_exception_flags |= float_flag_inexact;
1225     }
1226     if ( increment ) {
1227         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1228         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1229     }
1230     else {
1231         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1232     }
1233     return packFloat128( zSign, zExp, zSig0, zSig1 );
1234 
1235 }
1236 
1237 /*----------------------------------------------------------------------------
1238 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1239 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1240 | returns the proper quadruple-precision floating-point value corresponding
1241 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1242 | except that the input significand has fewer bits and does not have to be
1243 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1244 | point exponent.
1245 *----------------------------------------------------------------------------*/
1246 
1247 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1248                                               uint64_t zSig0, uint64_t zSig1,
1249                                               float_status *status)
1250 {
1251     int8_t shiftCount;
1252     uint64_t zSig2;
1253 
1254     if ( zSig0 == 0 ) {
1255         zSig0 = zSig1;
1256         zSig1 = 0;
1257         zExp -= 64;
1258     }
1259     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1260     if ( 0 <= shiftCount ) {
1261         zSig2 = 0;
1262         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1263     }
1264     else {
1265         shift128ExtraRightJamming(
1266             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1267     }
1268     zExp -= shiftCount;
1269     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1270 
1271 }
1272 
1273 /*----------------------------------------------------------------------------
1274 | Returns the result of converting the 32-bit two's complement integer `a'
1275 | to the single-precision floating-point format.  The conversion is performed
1276 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1277 *----------------------------------------------------------------------------*/
1278 
1279 float32 int32_to_float32(int32_t a, float_status *status)
1280 {
1281     flag zSign;
1282 
1283     if ( a == 0 ) return float32_zero;
1284     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1285     zSign = ( a < 0 );
1286     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1287 }
1288 
1289 /*----------------------------------------------------------------------------
1290 | Returns the result of converting the 32-bit two's complement integer `a'
1291 | to the double-precision floating-point format.  The conversion is performed
1292 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293 *----------------------------------------------------------------------------*/
1294 
1295 float64 int32_to_float64(int32_t a, float_status *status)
1296 {
1297     flag zSign;
1298     uint32_t absA;
1299     int8_t shiftCount;
1300     uint64_t zSig;
1301 
1302     if ( a == 0 ) return float64_zero;
1303     zSign = ( a < 0 );
1304     absA = zSign ? - a : a;
1305     shiftCount = countLeadingZeros32( absA ) + 21;
1306     zSig = absA;
1307     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1308 
1309 }
1310 
1311 /*----------------------------------------------------------------------------
1312 | Returns the result of converting the 32-bit two's complement integer `a'
1313 | to the extended double-precision floating-point format.  The conversion
1314 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1315 | Arithmetic.
1316 *----------------------------------------------------------------------------*/
1317 
1318 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1319 {
1320     flag zSign;
1321     uint32_t absA;
1322     int8_t shiftCount;
1323     uint64_t zSig;
1324 
1325     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1326     zSign = ( a < 0 );
1327     absA = zSign ? - a : a;
1328     shiftCount = countLeadingZeros32( absA ) + 32;
1329     zSig = absA;
1330     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1331 
1332 }
1333 
1334 /*----------------------------------------------------------------------------
1335 | Returns the result of converting the 32-bit two's complement integer `a' to
1336 | the quadruple-precision floating-point format.  The conversion is performed
1337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338 *----------------------------------------------------------------------------*/
1339 
1340 float128 int32_to_float128(int32_t a, float_status *status)
1341 {
1342     flag zSign;
1343     uint32_t absA;
1344     int8_t shiftCount;
1345     uint64_t zSig0;
1346 
1347     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1348     zSign = ( a < 0 );
1349     absA = zSign ? - a : a;
1350     shiftCount = countLeadingZeros32( absA ) + 17;
1351     zSig0 = absA;
1352     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1353 
1354 }
1355 
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the 64-bit two's complement integer `a'
1358 | to the single-precision floating-point format.  The conversion is performed
1359 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1360 *----------------------------------------------------------------------------*/
1361 
1362 float32 int64_to_float32(int64_t a, float_status *status)
1363 {
1364     flag zSign;
1365     uint64_t absA;
1366     int8_t shiftCount;
1367 
1368     if ( a == 0 ) return float32_zero;
1369     zSign = ( a < 0 );
1370     absA = zSign ? - a : a;
1371     shiftCount = countLeadingZeros64( absA ) - 40;
1372     if ( 0 <= shiftCount ) {
1373         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1374     }
1375     else {
1376         shiftCount += 7;
1377         if ( shiftCount < 0 ) {
1378             shift64RightJamming( absA, - shiftCount, &absA );
1379         }
1380         else {
1381             absA <<= shiftCount;
1382         }
1383         return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1384     }
1385 
1386 }
1387 
1388 /*----------------------------------------------------------------------------
1389 | Returns the result of converting the 64-bit two's complement integer `a'
1390 | to the double-precision floating-point format.  The conversion is performed
1391 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1392 *----------------------------------------------------------------------------*/
1393 
1394 float64 int64_to_float64(int64_t a, float_status *status)
1395 {
1396     flag zSign;
1397 
1398     if ( a == 0 ) return float64_zero;
1399     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1400         return packFloat64( 1, 0x43E, 0 );
1401     }
1402     zSign = ( a < 0 );
1403     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1404 }
1405 
1406 /*----------------------------------------------------------------------------
1407 | Returns the result of converting the 64-bit two's complement integer `a'
1408 | to the extended double-precision floating-point format.  The conversion
1409 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1410 | Arithmetic.
1411 *----------------------------------------------------------------------------*/
1412 
1413 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1414 {
1415     flag zSign;
1416     uint64_t absA;
1417     int8_t shiftCount;
1418 
1419     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1420     zSign = ( a < 0 );
1421     absA = zSign ? - a : a;
1422     shiftCount = countLeadingZeros64( absA );
1423     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1424 
1425 }
1426 
1427 /*----------------------------------------------------------------------------
1428 | Returns the result of converting the 64-bit two's complement integer `a' to
1429 | the quadruple-precision floating-point format.  The conversion is performed
1430 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1431 *----------------------------------------------------------------------------*/
1432 
1433 float128 int64_to_float128(int64_t a, float_status *status)
1434 {
1435     flag zSign;
1436     uint64_t absA;
1437     int8_t shiftCount;
1438     int32_t zExp;
1439     uint64_t zSig0, zSig1;
1440 
1441     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1442     zSign = ( a < 0 );
1443     absA = zSign ? - a : a;
1444     shiftCount = countLeadingZeros64( absA ) + 49;
1445     zExp = 0x406E - shiftCount;
1446     if ( 64 <= shiftCount ) {
1447         zSig1 = 0;
1448         zSig0 = absA;
1449         shiftCount -= 64;
1450     }
1451     else {
1452         zSig1 = absA;
1453         zSig0 = 0;
1454     }
1455     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1456     return packFloat128( zSign, zExp, zSig0, zSig1 );
1457 
1458 }
1459 
1460 /*----------------------------------------------------------------------------
1461 | Returns the result of converting the 64-bit unsigned integer `a'
1462 | to the single-precision floating-point format.  The conversion is performed
1463 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1464 *----------------------------------------------------------------------------*/
1465 
1466 float32 uint64_to_float32(uint64_t a, float_status *status)
1467 {
1468     int shiftcount;
1469 
1470     if (a == 0) {
1471         return float32_zero;
1472     }
1473 
1474     /* Determine (left) shift needed to put first set bit into bit posn 23
1475      * (since packFloat32() expects the binary point between bits 23 and 22);
1476      * this is the fast case for smallish numbers.
1477      */
1478     shiftcount = countLeadingZeros64(a) - 40;
1479     if (shiftcount >= 0) {
1480         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1481     }
1482     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1483      * expects the binary point between bits 30 and 29, hence the + 7.
1484      */
1485     shiftcount += 7;
1486     if (shiftcount < 0) {
1487         shift64RightJamming(a, -shiftcount, &a);
1488     } else {
1489         a <<= shiftcount;
1490     }
1491 
1492     return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1493 }
1494 
1495 /*----------------------------------------------------------------------------
1496 | Returns the result of converting the 64-bit unsigned integer `a'
1497 | to the double-precision floating-point format.  The conversion is performed
1498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1499 *----------------------------------------------------------------------------*/
1500 
1501 float64 uint64_to_float64(uint64_t a, float_status *status)
1502 {
1503     int exp = 0x43C;
1504     int shiftcount;
1505 
1506     if (a == 0) {
1507         return float64_zero;
1508     }
1509 
1510     shiftcount = countLeadingZeros64(a) - 1;
1511     if (shiftcount < 0) {
1512         shift64RightJamming(a, -shiftcount, &a);
1513     } else {
1514         a <<= shiftcount;
1515     }
1516     return roundAndPackFloat64(0, exp - shiftcount, a, status);
1517 }
1518 
1519 /*----------------------------------------------------------------------------
1520 | Returns the result of converting the 64-bit unsigned integer `a'
1521 | to the quadruple-precision floating-point format.  The conversion is performed
1522 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1523 *----------------------------------------------------------------------------*/
1524 
1525 float128 uint64_to_float128(uint64_t a, float_status *status)
1526 {
1527     if (a == 0) {
1528         return float128_zero;
1529     }
1530     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1531 }
1532 
1533 /*----------------------------------------------------------------------------
1534 | Returns the result of converting the single-precision floating-point value
1535 | `a' to the 32-bit two's complement integer format.  The conversion is
1536 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1537 | Arithmetic---which means in particular that the conversion is rounded
1538 | according to the current rounding mode.  If `a' is a NaN, the largest
1539 | positive integer is returned.  Otherwise, if the conversion overflows, the
1540 | largest integer with the same sign as `a' is returned.
1541 *----------------------------------------------------------------------------*/
1542 
1543 int32_t float32_to_int32(float32 a, float_status *status)
1544 {
1545     flag aSign;
1546     int_fast16_t aExp, shiftCount;
1547     uint32_t aSig;
1548     uint64_t aSig64;
1549 
1550     a = float32_squash_input_denormal(a, status);
1551     aSig = extractFloat32Frac( a );
1552     aExp = extractFloat32Exp( a );
1553     aSign = extractFloat32Sign( a );
1554     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1555     if ( aExp ) aSig |= 0x00800000;
1556     shiftCount = 0xAF - aExp;
1557     aSig64 = aSig;
1558     aSig64 <<= 32;
1559     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1560     return roundAndPackInt32(aSign, aSig64, status);
1561 
1562 }
1563 
1564 /*----------------------------------------------------------------------------
1565 | Returns the result of converting the single-precision floating-point value
1566 | `a' to the 32-bit two's complement integer format.  The conversion is
1567 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1568 | Arithmetic, except that the conversion is always rounded toward zero.
1569 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1570 | the conversion overflows, the largest integer with the same sign as `a' is
1571 | returned.
1572 *----------------------------------------------------------------------------*/
1573 
1574 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1575 {
1576     flag aSign;
1577     int_fast16_t aExp, shiftCount;
1578     uint32_t aSig;
1579     int32_t z;
1580     a = float32_squash_input_denormal(a, status);
1581 
1582     aSig = extractFloat32Frac( a );
1583     aExp = extractFloat32Exp( a );
1584     aSign = extractFloat32Sign( a );
1585     shiftCount = aExp - 0x9E;
1586     if ( 0 <= shiftCount ) {
1587         if ( float32_val(a) != 0xCF000000 ) {
1588             float_raise(float_flag_invalid, status);
1589             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1590         }
1591         return (int32_t) 0x80000000;
1592     }
1593     else if ( aExp <= 0x7E ) {
1594         if (aExp | aSig) {
1595             status->float_exception_flags |= float_flag_inexact;
1596         }
1597         return 0;
1598     }
1599     aSig = ( aSig | 0x00800000 )<<8;
1600     z = aSig>>( - shiftCount );
1601     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1602         status->float_exception_flags |= float_flag_inexact;
1603     }
1604     if ( aSign ) z = - z;
1605     return z;
1606 
1607 }
1608 
1609 /*----------------------------------------------------------------------------
1610 | Returns the result of converting the single-precision floating-point value
1611 | `a' to the 16-bit two's complement integer format.  The conversion is
1612 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1613 | Arithmetic, except that the conversion is always rounded toward zero.
1614 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1615 | the conversion overflows, the largest integer with the same sign as `a' is
1616 | returned.
1617 *----------------------------------------------------------------------------*/
1618 
1619 int_fast16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1620 {
1621     flag aSign;
1622     int_fast16_t aExp, shiftCount;
1623     uint32_t aSig;
1624     int32_t z;
1625 
1626     aSig = extractFloat32Frac( a );
1627     aExp = extractFloat32Exp( a );
1628     aSign = extractFloat32Sign( a );
1629     shiftCount = aExp - 0x8E;
1630     if ( 0 <= shiftCount ) {
1631         if ( float32_val(a) != 0xC7000000 ) {
1632             float_raise(float_flag_invalid, status);
1633             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1634                 return 0x7FFF;
1635             }
1636         }
1637         return (int32_t) 0xffff8000;
1638     }
1639     else if ( aExp <= 0x7E ) {
1640         if ( aExp | aSig ) {
1641             status->float_exception_flags |= float_flag_inexact;
1642         }
1643         return 0;
1644     }
1645     shiftCount -= 0x10;
1646     aSig = ( aSig | 0x00800000 )<<8;
1647     z = aSig>>( - shiftCount );
1648     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1649         status->float_exception_flags |= float_flag_inexact;
1650     }
1651     if ( aSign ) {
1652         z = - z;
1653     }
1654     return z;
1655 
1656 }
1657 
1658 /*----------------------------------------------------------------------------
1659 | Returns the result of converting the single-precision floating-point value
1660 | `a' to the 64-bit two's complement integer format.  The conversion is
1661 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1662 | Arithmetic---which means in particular that the conversion is rounded
1663 | according to the current rounding mode.  If `a' is a NaN, the largest
1664 | positive integer is returned.  Otherwise, if the conversion overflows, the
1665 | largest integer with the same sign as `a' is returned.
1666 *----------------------------------------------------------------------------*/
1667 
1668 int64_t float32_to_int64(float32 a, float_status *status)
1669 {
1670     flag aSign;
1671     int_fast16_t aExp, shiftCount;
1672     uint32_t aSig;
1673     uint64_t aSig64, aSigExtra;
1674     a = float32_squash_input_denormal(a, status);
1675 
1676     aSig = extractFloat32Frac( a );
1677     aExp = extractFloat32Exp( a );
1678     aSign = extractFloat32Sign( a );
1679     shiftCount = 0xBE - aExp;
1680     if ( shiftCount < 0 ) {
1681         float_raise(float_flag_invalid, status);
1682         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1683             return LIT64( 0x7FFFFFFFFFFFFFFF );
1684         }
1685         return (int64_t) LIT64( 0x8000000000000000 );
1686     }
1687     if ( aExp ) aSig |= 0x00800000;
1688     aSig64 = aSig;
1689     aSig64 <<= 40;
1690     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1691     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1692 
1693 }
1694 
1695 /*----------------------------------------------------------------------------
1696 | Returns the result of converting the single-precision floating-point value
1697 | `a' to the 64-bit unsigned integer format.  The conversion is
1698 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1699 | Arithmetic---which means in particular that the conversion is rounded
1700 | according to the current rounding mode.  If `a' is a NaN, the largest
1701 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1702 | largest unsigned integer is returned.  If the 'a' is negative, the result
1703 | is rounded and zero is returned; values that do not round to zero will
1704 | raise the inexact exception flag.
1705 *----------------------------------------------------------------------------*/
1706 
1707 uint64_t float32_to_uint64(float32 a, float_status *status)
1708 {
1709     flag aSign;
1710     int_fast16_t aExp, shiftCount;
1711     uint32_t aSig;
1712     uint64_t aSig64, aSigExtra;
1713     a = float32_squash_input_denormal(a, status);
1714 
1715     aSig = extractFloat32Frac(a);
1716     aExp = extractFloat32Exp(a);
1717     aSign = extractFloat32Sign(a);
1718     if ((aSign) && (aExp > 126)) {
1719         float_raise(float_flag_invalid, status);
1720         if (float32_is_any_nan(a)) {
1721             return LIT64(0xFFFFFFFFFFFFFFFF);
1722         } else {
1723             return 0;
1724         }
1725     }
1726     shiftCount = 0xBE - aExp;
1727     if (aExp) {
1728         aSig |= 0x00800000;
1729     }
1730     if (shiftCount < 0) {
1731         float_raise(float_flag_invalid, status);
1732         return LIT64(0xFFFFFFFFFFFFFFFF);
1733     }
1734 
1735     aSig64 = aSig;
1736     aSig64 <<= 40;
1737     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1738     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1739 }
1740 
1741 /*----------------------------------------------------------------------------
1742 | Returns the result of converting the single-precision floating-point value
1743 | `a' to the 64-bit unsigned integer format.  The conversion is
1744 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1745 | Arithmetic, except that the conversion is always rounded toward zero.  If
1746 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1747 | conversion overflows, the largest unsigned integer is returned.  If the
1748 | 'a' is negative, the result is rounded and zero is returned; values that do
1749 | not round to zero will raise the inexact flag.
1750 *----------------------------------------------------------------------------*/
1751 
1752 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1753 {
1754     signed char current_rounding_mode = status->float_rounding_mode;
1755     set_float_rounding_mode(float_round_to_zero, status);
1756     int64_t v = float32_to_uint64(a, status);
1757     set_float_rounding_mode(current_rounding_mode, status);
1758     return v;
1759 }
1760 
1761 /*----------------------------------------------------------------------------
1762 | Returns the result of converting the single-precision floating-point value
1763 | `a' to the 64-bit two's complement integer format.  The conversion is
1764 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1765 | Arithmetic, except that the conversion is always rounded toward zero.  If
1766 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1767 | conversion overflows, the largest integer with the same sign as `a' is
1768 | returned.
1769 *----------------------------------------------------------------------------*/
1770 
1771 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1772 {
1773     flag aSign;
1774     int_fast16_t aExp, shiftCount;
1775     uint32_t aSig;
1776     uint64_t aSig64;
1777     int64_t z;
1778     a = float32_squash_input_denormal(a, status);
1779 
1780     aSig = extractFloat32Frac( a );
1781     aExp = extractFloat32Exp( a );
1782     aSign = extractFloat32Sign( a );
1783     shiftCount = aExp - 0xBE;
1784     if ( 0 <= shiftCount ) {
1785         if ( float32_val(a) != 0xDF000000 ) {
1786             float_raise(float_flag_invalid, status);
1787             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1788                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1789             }
1790         }
1791         return (int64_t) LIT64( 0x8000000000000000 );
1792     }
1793     else if ( aExp <= 0x7E ) {
1794         if (aExp | aSig) {
1795             status->float_exception_flags |= float_flag_inexact;
1796         }
1797         return 0;
1798     }
1799     aSig64 = aSig | 0x00800000;
1800     aSig64 <<= 40;
1801     z = aSig64>>( - shiftCount );
1802     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1803         status->float_exception_flags |= float_flag_inexact;
1804     }
1805     if ( aSign ) z = - z;
1806     return z;
1807 
1808 }
1809 
1810 /*----------------------------------------------------------------------------
1811 | Returns the result of converting the single-precision floating-point value
1812 | `a' to the double-precision floating-point format.  The conversion is
1813 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1814 | Arithmetic.
1815 *----------------------------------------------------------------------------*/
1816 
1817 float64 float32_to_float64(float32 a, float_status *status)
1818 {
1819     flag aSign;
1820     int_fast16_t aExp;
1821     uint32_t aSig;
1822     a = float32_squash_input_denormal(a, status);
1823 
1824     aSig = extractFloat32Frac( a );
1825     aExp = extractFloat32Exp( a );
1826     aSign = extractFloat32Sign( a );
1827     if ( aExp == 0xFF ) {
1828         if (aSig) {
1829             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1830         }
1831         return packFloat64( aSign, 0x7FF, 0 );
1832     }
1833     if ( aExp == 0 ) {
1834         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1835         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1836         --aExp;
1837     }
1838     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1839 
1840 }
1841 
1842 /*----------------------------------------------------------------------------
1843 | Returns the result of converting the single-precision floating-point value
1844 | `a' to the extended double-precision floating-point format.  The conversion
1845 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1846 | Arithmetic.
1847 *----------------------------------------------------------------------------*/
1848 
1849 floatx80 float32_to_floatx80(float32 a, float_status *status)
1850 {
1851     flag aSign;
1852     int_fast16_t aExp;
1853     uint32_t aSig;
1854 
1855     a = float32_squash_input_denormal(a, status);
1856     aSig = extractFloat32Frac( a );
1857     aExp = extractFloat32Exp( a );
1858     aSign = extractFloat32Sign( a );
1859     if ( aExp == 0xFF ) {
1860         if (aSig) {
1861             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1862         }
1863         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1864     }
1865     if ( aExp == 0 ) {
1866         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1867         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1868     }
1869     aSig |= 0x00800000;
1870     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1871 
1872 }
1873 
1874 /*----------------------------------------------------------------------------
1875 | Returns the result of converting the single-precision floating-point value
1876 | `a' to the double-precision floating-point format.  The conversion is
1877 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1878 | Arithmetic.
1879 *----------------------------------------------------------------------------*/
1880 
1881 float128 float32_to_float128(float32 a, float_status *status)
1882 {
1883     flag aSign;
1884     int_fast16_t aExp;
1885     uint32_t aSig;
1886 
1887     a = float32_squash_input_denormal(a, status);
1888     aSig = extractFloat32Frac( a );
1889     aExp = extractFloat32Exp( a );
1890     aSign = extractFloat32Sign( a );
1891     if ( aExp == 0xFF ) {
1892         if (aSig) {
1893             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1894         }
1895         return packFloat128( aSign, 0x7FFF, 0, 0 );
1896     }
1897     if ( aExp == 0 ) {
1898         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1899         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1900         --aExp;
1901     }
1902     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1903 
1904 }
1905 
1906 /*----------------------------------------------------------------------------
1907 | Rounds the single-precision floating-point value `a' to an integer, and
1908 | returns the result as a single-precision floating-point value.  The
1909 | operation is performed according to the IEC/IEEE Standard for Binary
1910 | Floating-Point Arithmetic.
1911 *----------------------------------------------------------------------------*/
1912 
1913 float32 float32_round_to_int(float32 a, float_status *status)
1914 {
1915     flag aSign;
1916     int_fast16_t aExp;
1917     uint32_t lastBitMask, roundBitsMask;
1918     uint32_t z;
1919     a = float32_squash_input_denormal(a, status);
1920 
1921     aExp = extractFloat32Exp( a );
1922     if ( 0x96 <= aExp ) {
1923         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1924             return propagateFloat32NaN(a, a, status);
1925         }
1926         return a;
1927     }
1928     if ( aExp <= 0x7E ) {
1929         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1930         status->float_exception_flags |= float_flag_inexact;
1931         aSign = extractFloat32Sign( a );
1932         switch (status->float_rounding_mode) {
1933          case float_round_nearest_even:
1934             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1935                 return packFloat32( aSign, 0x7F, 0 );
1936             }
1937             break;
1938         case float_round_ties_away:
1939             if (aExp == 0x7E) {
1940                 return packFloat32(aSign, 0x7F, 0);
1941             }
1942             break;
1943          case float_round_down:
1944             return make_float32(aSign ? 0xBF800000 : 0);
1945          case float_round_up:
1946             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1947         }
1948         return packFloat32( aSign, 0, 0 );
1949     }
1950     lastBitMask = 1;
1951     lastBitMask <<= 0x96 - aExp;
1952     roundBitsMask = lastBitMask - 1;
1953     z = float32_val(a);
1954     switch (status->float_rounding_mode) {
1955     case float_round_nearest_even:
1956         z += lastBitMask>>1;
1957         if ((z & roundBitsMask) == 0) {
1958             z &= ~lastBitMask;
1959         }
1960         break;
1961     case float_round_ties_away:
1962         z += lastBitMask >> 1;
1963         break;
1964     case float_round_to_zero:
1965         break;
1966     case float_round_up:
1967         if (!extractFloat32Sign(make_float32(z))) {
1968             z += roundBitsMask;
1969         }
1970         break;
1971     case float_round_down:
1972         if (extractFloat32Sign(make_float32(z))) {
1973             z += roundBitsMask;
1974         }
1975         break;
1976     default:
1977         abort();
1978     }
1979     z &= ~ roundBitsMask;
1980     if (z != float32_val(a)) {
1981         status->float_exception_flags |= float_flag_inexact;
1982     }
1983     return make_float32(z);
1984 
1985 }
1986 
1987 /*----------------------------------------------------------------------------
1988 | Returns the result of adding the absolute values of the single-precision
1989 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1990 | before being returned.  `zSign' is ignored if the result is a NaN.
1991 | The addition is performed according to the IEC/IEEE Standard for Binary
1992 | Floating-Point Arithmetic.
1993 *----------------------------------------------------------------------------*/
1994 
1995 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
1996                               float_status *status)
1997 {
1998     int_fast16_t aExp, bExp, zExp;
1999     uint32_t aSig, bSig, zSig;
2000     int_fast16_t expDiff;
2001 
2002     aSig = extractFloat32Frac( a );
2003     aExp = extractFloat32Exp( a );
2004     bSig = extractFloat32Frac( b );
2005     bExp = extractFloat32Exp( b );
2006     expDiff = aExp - bExp;
2007     aSig <<= 6;
2008     bSig <<= 6;
2009     if ( 0 < expDiff ) {
2010         if ( aExp == 0xFF ) {
2011             if (aSig) {
2012                 return propagateFloat32NaN(a, b, status);
2013             }
2014             return a;
2015         }
2016         if ( bExp == 0 ) {
2017             --expDiff;
2018         }
2019         else {
2020             bSig |= 0x20000000;
2021         }
2022         shift32RightJamming( bSig, expDiff, &bSig );
2023         zExp = aExp;
2024     }
2025     else if ( expDiff < 0 ) {
2026         if ( bExp == 0xFF ) {
2027             if (bSig) {
2028                 return propagateFloat32NaN(a, b, status);
2029             }
2030             return packFloat32( zSign, 0xFF, 0 );
2031         }
2032         if ( aExp == 0 ) {
2033             ++expDiff;
2034         }
2035         else {
2036             aSig |= 0x20000000;
2037         }
2038         shift32RightJamming( aSig, - expDiff, &aSig );
2039         zExp = bExp;
2040     }
2041     else {
2042         if ( aExp == 0xFF ) {
2043             if (aSig | bSig) {
2044                 return propagateFloat32NaN(a, b, status);
2045             }
2046             return a;
2047         }
2048         if ( aExp == 0 ) {
2049             if (status->flush_to_zero) {
2050                 if (aSig | bSig) {
2051                     float_raise(float_flag_output_denormal, status);
2052                 }
2053                 return packFloat32(zSign, 0, 0);
2054             }
2055             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2056         }
2057         zSig = 0x40000000 + aSig + bSig;
2058         zExp = aExp;
2059         goto roundAndPack;
2060     }
2061     aSig |= 0x20000000;
2062     zSig = ( aSig + bSig )<<1;
2063     --zExp;
2064     if ( (int32_t) zSig < 0 ) {
2065         zSig = aSig + bSig;
2066         ++zExp;
2067     }
2068  roundAndPack:
2069     return roundAndPackFloat32(zSign, zExp, zSig, status);
2070 
2071 }
2072 
2073 /*----------------------------------------------------------------------------
2074 | Returns the result of subtracting the absolute values of the single-
2075 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2076 | difference is negated before being returned.  `zSign' is ignored if the
2077 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2078 | Standard for Binary Floating-Point Arithmetic.
2079 *----------------------------------------------------------------------------*/
2080 
2081 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2082                               float_status *status)
2083 {
2084     int_fast16_t aExp, bExp, zExp;
2085     uint32_t aSig, bSig, zSig;
2086     int_fast16_t expDiff;
2087 
2088     aSig = extractFloat32Frac( a );
2089     aExp = extractFloat32Exp( a );
2090     bSig = extractFloat32Frac( b );
2091     bExp = extractFloat32Exp( b );
2092     expDiff = aExp - bExp;
2093     aSig <<= 7;
2094     bSig <<= 7;
2095     if ( 0 < expDiff ) goto aExpBigger;
2096     if ( expDiff < 0 ) goto bExpBigger;
2097     if ( aExp == 0xFF ) {
2098         if (aSig | bSig) {
2099             return propagateFloat32NaN(a, b, status);
2100         }
2101         float_raise(float_flag_invalid, status);
2102         return float32_default_nan;
2103     }
2104     if ( aExp == 0 ) {
2105         aExp = 1;
2106         bExp = 1;
2107     }
2108     if ( bSig < aSig ) goto aBigger;
2109     if ( aSig < bSig ) goto bBigger;
2110     return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2111  bExpBigger:
2112     if ( bExp == 0xFF ) {
2113         if (bSig) {
2114             return propagateFloat32NaN(a, b, status);
2115         }
2116         return packFloat32( zSign ^ 1, 0xFF, 0 );
2117     }
2118     if ( aExp == 0 ) {
2119         ++expDiff;
2120     }
2121     else {
2122         aSig |= 0x40000000;
2123     }
2124     shift32RightJamming( aSig, - expDiff, &aSig );
2125     bSig |= 0x40000000;
2126  bBigger:
2127     zSig = bSig - aSig;
2128     zExp = bExp;
2129     zSign ^= 1;
2130     goto normalizeRoundAndPack;
2131  aExpBigger:
2132     if ( aExp == 0xFF ) {
2133         if (aSig) {
2134             return propagateFloat32NaN(a, b, status);
2135         }
2136         return a;
2137     }
2138     if ( bExp == 0 ) {
2139         --expDiff;
2140     }
2141     else {
2142         bSig |= 0x40000000;
2143     }
2144     shift32RightJamming( bSig, expDiff, &bSig );
2145     aSig |= 0x40000000;
2146  aBigger:
2147     zSig = aSig - bSig;
2148     zExp = aExp;
2149  normalizeRoundAndPack:
2150     --zExp;
2151     return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2152 
2153 }
2154 
2155 /*----------------------------------------------------------------------------
2156 | Returns the result of adding the single-precision floating-point values `a'
2157 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2158 | Binary Floating-Point Arithmetic.
2159 *----------------------------------------------------------------------------*/
2160 
2161 float32 float32_add(float32 a, float32 b, float_status *status)
2162 {
2163     flag aSign, bSign;
2164     a = float32_squash_input_denormal(a, status);
2165     b = float32_squash_input_denormal(b, status);
2166 
2167     aSign = extractFloat32Sign( a );
2168     bSign = extractFloat32Sign( b );
2169     if ( aSign == bSign ) {
2170         return addFloat32Sigs(a, b, aSign, status);
2171     }
2172     else {
2173         return subFloat32Sigs(a, b, aSign, status);
2174     }
2175 
2176 }
2177 
2178 /*----------------------------------------------------------------------------
2179 | Returns the result of subtracting the single-precision floating-point values
2180 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2181 | for Binary Floating-Point Arithmetic.
2182 *----------------------------------------------------------------------------*/
2183 
2184 float32 float32_sub(float32 a, float32 b, float_status *status)
2185 {
2186     flag aSign, bSign;
2187     a = float32_squash_input_denormal(a, status);
2188     b = float32_squash_input_denormal(b, status);
2189 
2190     aSign = extractFloat32Sign( a );
2191     bSign = extractFloat32Sign( b );
2192     if ( aSign == bSign ) {
2193         return subFloat32Sigs(a, b, aSign, status);
2194     }
2195     else {
2196         return addFloat32Sigs(a, b, aSign, status);
2197     }
2198 
2199 }
2200 
2201 /*----------------------------------------------------------------------------
2202 | Returns the result of multiplying the single-precision floating-point values
2203 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2204 | for Binary Floating-Point Arithmetic.
2205 *----------------------------------------------------------------------------*/
2206 
2207 float32 float32_mul(float32 a, float32 b, float_status *status)
2208 {
2209     flag aSign, bSign, zSign;
2210     int_fast16_t aExp, bExp, zExp;
2211     uint32_t aSig, bSig;
2212     uint64_t zSig64;
2213     uint32_t zSig;
2214 
2215     a = float32_squash_input_denormal(a, status);
2216     b = float32_squash_input_denormal(b, status);
2217 
2218     aSig = extractFloat32Frac( a );
2219     aExp = extractFloat32Exp( a );
2220     aSign = extractFloat32Sign( a );
2221     bSig = extractFloat32Frac( b );
2222     bExp = extractFloat32Exp( b );
2223     bSign = extractFloat32Sign( b );
2224     zSign = aSign ^ bSign;
2225     if ( aExp == 0xFF ) {
2226         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2227             return propagateFloat32NaN(a, b, status);
2228         }
2229         if ( ( bExp | bSig ) == 0 ) {
2230             float_raise(float_flag_invalid, status);
2231             return float32_default_nan;
2232         }
2233         return packFloat32( zSign, 0xFF, 0 );
2234     }
2235     if ( bExp == 0xFF ) {
2236         if (bSig) {
2237             return propagateFloat32NaN(a, b, status);
2238         }
2239         if ( ( aExp | aSig ) == 0 ) {
2240             float_raise(float_flag_invalid, status);
2241             return float32_default_nan;
2242         }
2243         return packFloat32( zSign, 0xFF, 0 );
2244     }
2245     if ( aExp == 0 ) {
2246         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2247         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2248     }
2249     if ( bExp == 0 ) {
2250         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2251         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2252     }
2253     zExp = aExp + bExp - 0x7F;
2254     aSig = ( aSig | 0x00800000 )<<7;
2255     bSig = ( bSig | 0x00800000 )<<8;
2256     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2257     zSig = zSig64;
2258     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2259         zSig <<= 1;
2260         --zExp;
2261     }
2262     return roundAndPackFloat32(zSign, zExp, zSig, status);
2263 
2264 }
2265 
2266 /*----------------------------------------------------------------------------
2267 | Returns the result of dividing the single-precision floating-point value `a'
2268 | by the corresponding value `b'.  The operation is performed according to the
2269 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270 *----------------------------------------------------------------------------*/
2271 
2272 float32 float32_div(float32 a, float32 b, float_status *status)
2273 {
2274     flag aSign, bSign, zSign;
2275     int_fast16_t aExp, bExp, zExp;
2276     uint32_t aSig, bSig, zSig;
2277     a = float32_squash_input_denormal(a, status);
2278     b = float32_squash_input_denormal(b, status);
2279 
2280     aSig = extractFloat32Frac( a );
2281     aExp = extractFloat32Exp( a );
2282     aSign = extractFloat32Sign( a );
2283     bSig = extractFloat32Frac( b );
2284     bExp = extractFloat32Exp( b );
2285     bSign = extractFloat32Sign( b );
2286     zSign = aSign ^ bSign;
2287     if ( aExp == 0xFF ) {
2288         if (aSig) {
2289             return propagateFloat32NaN(a, b, status);
2290         }
2291         if ( bExp == 0xFF ) {
2292             if (bSig) {
2293                 return propagateFloat32NaN(a, b, status);
2294             }
2295             float_raise(float_flag_invalid, status);
2296             return float32_default_nan;
2297         }
2298         return packFloat32( zSign, 0xFF, 0 );
2299     }
2300     if ( bExp == 0xFF ) {
2301         if (bSig) {
2302             return propagateFloat32NaN(a, b, status);
2303         }
2304         return packFloat32( zSign, 0, 0 );
2305     }
2306     if ( bExp == 0 ) {
2307         if ( bSig == 0 ) {
2308             if ( ( aExp | aSig ) == 0 ) {
2309                 float_raise(float_flag_invalid, status);
2310                 return float32_default_nan;
2311             }
2312             float_raise(float_flag_divbyzero, status);
2313             return packFloat32( zSign, 0xFF, 0 );
2314         }
2315         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2316     }
2317     if ( aExp == 0 ) {
2318         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2319         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2320     }
2321     zExp = aExp - bExp + 0x7D;
2322     aSig = ( aSig | 0x00800000 )<<7;
2323     bSig = ( bSig | 0x00800000 )<<8;
2324     if ( bSig <= ( aSig + aSig ) ) {
2325         aSig >>= 1;
2326         ++zExp;
2327     }
2328     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2329     if ( ( zSig & 0x3F ) == 0 ) {
2330         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2331     }
2332     return roundAndPackFloat32(zSign, zExp, zSig, status);
2333 
2334 }
2335 
2336 /*----------------------------------------------------------------------------
2337 | Returns the remainder of the single-precision floating-point value `a'
2338 | with respect to the corresponding value `b'.  The operation is performed
2339 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2340 *----------------------------------------------------------------------------*/
2341 
2342 float32 float32_rem(float32 a, float32 b, float_status *status)
2343 {
    /*
     * Outline: peel off the NaN / infinity / zero special cases, align the
     * two significands, reduce a's significand by multiples of b's while
     * keeping the low bits of the quotient in q, and finally choose
     * whichever of the last two candidate remainders lies nearer to zero
     * (a tie goes to the candidate whose quotient is even).  The result is
     * built on b's exponent and handed to normalizeRoundAndPackFloat32().
     */
2344     flag aSign, zSign;
2345     int_fast16_t aExp, bExp, expDiff;
2346     uint32_t aSig, bSig;
2347     uint32_t q;
2348     uint64_t aSig64, bSig64, q64;
2349     uint32_t alternateASig;
2350     int32_t sigMean;
2351     a = float32_squash_input_denormal(a, status);
2352     b = float32_squash_input_denormal(b, status);
2353 
2354     aSig = extractFloat32Frac( a );
2355     aExp = extractFloat32Exp( a );
2356     aSign = extractFloat32Sign( a );
2357     bSig = extractFloat32Frac( b );
2358     bExp = extractFloat32Exp( b );
    /* a is an infinity or a NaN. */
2359     if ( aExp == 0xFF ) {
2360         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2361             return propagateFloat32NaN(a, b, status);
2362         }
        /* a is infinite and b is not a NaN: invalid operation. */
2363         float_raise(float_flag_invalid, status);
2364         return float32_default_nan;
2365     }
    /* b is an infinity or a NaN; a is known to be finite here. */
2366     if ( bExp == 0xFF ) {
2367         if (bSig) {
2368             return propagateFloat32NaN(a, b, status);
2369         }
        /* b is infinite: a is returned unchanged. */
2370         return a;
2371     }
    /* b is a zero (invalid operation) or a subnormal (normalized in place). */
2372     if ( bExp == 0 ) {
2373         if ( bSig == 0 ) {
2374             float_raise(float_flag_invalid, status);
2375             return float32_default_nan;
2376         }
2377         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2378     }
    /* a is a zero (returned as is) or a subnormal (normalized in place). */
2379     if ( aExp == 0 ) {
2380         if ( aSig == 0 ) return a;
2381         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2382     }
2383     expDiff = aExp - bExp;
    /* Make the implicit leading one (bit 23) explicit in both significands. */
2384     aSig |= 0x00800000;
2385     bSig |= 0x00800000;
    /* Small exponent gap: at most one 64-by-32 bit division is needed. */
2386     if ( expDiff < 32 ) {
2387         aSig <<= 8;
2388         bSig <<= 8;
2389         if ( expDiff < 0 ) {
            /* expDiff <= -2 means |a| is below half of |b|, so a already is
             * the remainder. */
2390             if ( expDiff < -1 ) return a;
2391             aSig >>= 1;
2392         }
2393         q = ( bSig <= aSig );
2394         if ( q ) aSig -= bSig;
2395         if ( 0 < expDiff ) {
2396             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2397             q >>= 32 - expDiff;
2398             bSig >>= 2;
2399             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2400         }
2401         else {
2402             aSig >>= 2;
2403             bSig >>= 2;
2404         }
2405     }
    /* Large exponent gap: consume the gap 62 bits per loop iteration. */
2406     else {
2407         if ( bSig <= aSig ) aSig -= bSig;
2408         aSig64 = ( (uint64_t) aSig )<<40;
2409         bSig64 = ( (uint64_t) bSig )<<40;
2410         expDiff -= 64;
2411         while ( 0 < expDiff ) {
2412             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* The estimate is backed off by 2 and clamped at 0 -- presumably
             * because estimateDiv128To64() may overshoot slightly; confirm
             * against that helper. */
2413             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2414             aSig64 = - ( ( bSig * q64 )<<38 );
2415             expDiff -= 62;
2416         }
2417         expDiff += 64;
2418         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2419         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2420         q = q64>>( 64 - expDiff );
2421         bSig <<= 6;
2422         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2423     }
    /* Keep subtracting b's significand until the remainder turns negative
     * (as a signed 32-bit value); alternateASig remembers the last
     * non-negative remainder and q is bumped once per subtraction. */
2424     do {
2425         alternateASig = aSig;
2426         ++q;
2427         aSig -= bSig;
2428     } while ( 0 <= (int32_t) aSig );
    /* aSig is the negative candidate and alternateASig the non-negative one.
     * The sign of their sum tells which has the smaller magnitude; on an
     * exact tie an odd q selects alternateASig, which belongs to the even
     * quotient q - 1. */
2429     sigMean = aSig + alternateASig;
2430     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2431         aSig = alternateASig;
2432     }
    /* A negative remainder is stored as a magnitude and flips the result's
     * sign relative to a's. */
2433     zSign = ( (int32_t) aSig < 0 );
2434     if ( zSign ) aSig = - aSig;
2435     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2436 }
2437 
2438 /*----------------------------------------------------------------------------
2439 | Returns the result of multiplying the single-precision floating-point values
2440 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2441 | multiplication.  The operation is performed according to the IEC/IEEE
2442 | Standard for Binary Floating-Point Arithmetic 754-2008.
2443 | The flags argument allows the caller to select negation of the
2444 | addend, the intermediate product, or the final result. (The difference
2445 | between this and having the caller do a separate negation is that negating
2446 | externally will flip the sign bit on NaNs.)
2447 *----------------------------------------------------------------------------*/
2448 
2449 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2450                        float_status *status)
2451 {
    /*
     * Outline:
     *   1. NaN operands and the 0 * inf case are dealt with first.
     *   2. Infinite addend / infinite product / zero product are returned
     *      directly.
     *   3. Otherwise the product is formed exactly in 64 bits, c is widened
     *      to the same layout, the two are added or subtracted, and a
     *      single rounding is performed by roundAndPackFloat32().
     * `flags' is tested against float_muladd_negate_c,
     * float_muladd_negate_product, float_muladd_negate_result and
     * float_muladd_halve_result.
     */
2452     flag aSign, bSign, cSign, zSign;
2453     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2454     uint32_t aSig, bSig, cSig;
2455     flag pInf, pZero, pSign;
2456     uint64_t pSig64, cSig64, zSig64;
2457     uint32_t pSig;
2458     int shiftcount;
2459     flag signflip, infzero;
2460 
2461     a = float32_squash_input_denormal(a, status);
2462     b = float32_squash_input_denormal(b, status);
2463     c = float32_squash_input_denormal(c, status);
2464     aSig = extractFloat32Frac(a);
2465     aExp = extractFloat32Exp(a);
2466     aSign = extractFloat32Sign(a);
2467     bSig = extractFloat32Frac(b);
2468     bExp = extractFloat32Exp(b);
2469     bSign = extractFloat32Sign(b);
2470     cSig = extractFloat32Frac(c);
2471     cExp = extractFloat32Exp(c);
2472     cSign = extractFloat32Sign(c);
2473 
    /* infzero: one multiplicand is an exact zero and the other an infinity. */
2474     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2475                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2476 
2477     /* It is implementation-defined whether the cases of (0,inf,qnan)
2478      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2479      * they return if they do), so we have to hand this information
2480      * off to the target-specific pick-a-NaN routine.
2481      */
2482     if (((aExp == 0xff) && aSig) ||
2483         ((bExp == 0xff) && bSig) ||
2484         ((cExp == 0xff) && cSig)) {
2485         return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2486     }
2487 
    /* No operand is a NaN from here on, so 0 * inf is simply invalid. */
2488     if (infzero) {
2489         float_raise(float_flag_invalid, status);
2490         return float32_default_nan;
2491     }
2492 
2493     if (flags & float_muladd_negate_c) {
2494         cSign ^= 1;
2495     }
2496 
    /* signflip is XORed into the sign of every non-NaN result below. */
2497     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2498 
2499     /* Work out the sign and type of the product */
2500     pSign = aSign ^ bSign;
2501     if (flags & float_muladd_negate_product) {
2502         pSign ^= 1;
2503     }
2504     pInf = (aExp == 0xff) || (bExp == 0xff);
2505     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2506 
    /* c is an infinity (NaNs were handled above). */
2507     if (cExp == 0xff) {
2508         if (pInf && (pSign ^ cSign)) {
2509             /* addition of opposite-signed infinities => InvalidOperation */
2510             float_raise(float_flag_invalid, status);
2511             return float32_default_nan;
2512         }
2513         /* Otherwise generate an infinity of the same sign */
2514         return packFloat32(cSign ^ signflip, 0xff, 0);
2515     }
2516 
    /* Infinite product with a finite addend: infinity with the product's
     * sign. */
2517     if (pInf) {
2518         return packFloat32(pSign ^ signflip, 0xff, 0);
2519     }
2520 
2521     if (pZero) {
2522         if (cExp == 0) {
2523             if (cSig == 0) {
2524                 /* Adding two exact zeroes */
2525                 if (pSign == cSign) {
2526                     zSign = pSign;
2527                 } else if (status->float_rounding_mode == float_round_down) {
2528                     zSign = 1;
2529                 } else {
2530                     zSign = 0;
2531                 }
2532                 return packFloat32(zSign ^ signflip, 0, 0);
2533             }
2534             /* Exact zero plus a denorm */
2535             if (status->flush_to_zero) {
2536                 float_raise(float_flag_output_denormal, status);
2537                 return packFloat32(cSign ^ signflip, 0, 0);
2538             }
2539         }
2540         /* Zero plus something non-zero : just return the something */
2541         if (flags & float_muladd_halve_result) {
2542             if (cExp == 0) {
2543                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2544             }
2545             /* Subtract one to halve, and one again because roundAndPackFloat32
2546              * wants one less than the true exponent.
2547              */
2548             cExp -= 2;
2549             cSig = (cSig | 0x00800000) << 7;
2550             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2551         }
2552         return packFloat32(cSign ^ signflip, cExp, cSig);
2553     }
2554 
    /* Both multiplicands are non-zero and finite here; a zero exponent
     * therefore means a subnormal, which is normalized in place. */
2555     if (aExp == 0) {
2556         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2557     }
2558     if (bExp == 0) {
2559         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2560     }
2561 
2562     /* Calculate the actual result a * b + c */
2563 
2564     /* Multiply first; this is easy. */
2565     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2566      * because we want the true exponent, not the "one-less-than"
2567      * flavour that roundAndPackFloat32() takes.
2568      */
2569     pExp = aExp + bExp - 0x7e;
2570     aSig = (aSig | 0x00800000) << 7;
2571     bSig = (bSig | 0x00800000) << 8;
2572     pSig64 = (uint64_t)aSig * bSig;
    /* aSig has its leading one at bit 30 and bSig at bit 31, so the
     * product's leading one is at bit 61 or 62.  If bit 62 is clear,
     * shift up by one and compensate in the exponent. */
2573     if ((int64_t)(pSig64 << 1) >= 0) {
2574         pSig64 <<= 1;
2575         pExp--;
2576     }
2577 
2578     zSign = pSign ^ signflip;
2579 
2580     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2581      * position 62.
2582      */
2583     if (cExp == 0) {
2584         if (!cSig) {
2585             /* Throw out the special case of c being an exact zero now */
2586             shift64RightJamming(pSig64, 32, &pSig64);
2587             pSig = pSig64;
2588             if (flags & float_muladd_halve_result) {
2589                 pExp--;
2590             }
2591             return roundAndPackFloat32(zSign, pExp - 1,
2592                                        pSig, status);
2593         }
2594         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2595     }
2596 
    /* Widen c to the same layout as the product: fraction shifted up so
     * that the explicit leading one lands on bit 62. */
2597     cSig64 = (uint64_t)cSig << (62 - 23);
2598     cSig64 |= LIT64(0x4000000000000000);
2599     expDiff = pExp - cExp;
2600 
2601     if (pSign == cSign) {
2602         /* Addition */
2603         if (expDiff > 0) {
2604             /* scale c to match p */
2605             shift64RightJamming(cSig64, expDiff, &cSig64);
2606             zExp = pExp;
2607         } else if (expDiff < 0) {
2608             /* scale p to match c */
2609             shift64RightJamming(pSig64, -expDiff, &pSig64);
2610             zExp = cExp;
2611         } else {
2612             /* no scaling needed */
2613             zExp = cExp;
2614         }
2615         /* Add significands and make sure explicit bit ends up in posn 62 */
2616         zSig64 = pSig64 + cSig64;
2617         if ((int64_t)zSig64 < 0) {
2618             shift64RightJamming(zSig64, 1, &zSig64);
2619         } else {
2620             zExp--;
2621         }
2622     } else {
2623         /* Subtraction */
2624         if (expDiff > 0) {
2625             shift64RightJamming(cSig64, expDiff, &cSig64);
2626             zSig64 = pSig64 - cSig64;
2627             zExp = pExp;
2628         } else if (expDiff < 0) {
2629             shift64RightJamming(pSig64, -expDiff, &pSig64);
2630             zSig64 = cSig64 - pSig64;
2631             zExp = cExp;
            /* c has the larger magnitude, so the result takes c's sign,
             * i.e. the opposite of the product's. */
2632             zSign ^= 1;
2633         } else {
2634             zExp = pExp;
2635             if (cSig64 < pSig64) {
2636                 zSig64 = pSig64 - cSig64;
2637             } else if (pSig64 < cSig64) {
2638                 zSig64 = cSig64 - pSig64;
2639                 zSign ^= 1;
2640             } else {
2641                 /* Exact zero */
2642                 zSign = signflip;
2643                 if (status->float_rounding_mode == float_round_down) {
2644                     zSign ^= 1;
2645                 }
2646                 return packFloat32(zSign, 0, 0);
2647             }
2648         }
2649         --zExp;
2650         /* Normalize to put the explicit bit back into bit 62. */
2651         shiftcount = countLeadingZeros64(zSig64) - 1;
2652         zSig64 <<= shiftcount;
2653         zExp -= shiftcount;
2654     }
2655     if (flags & float_muladd_halve_result) {
2656         zExp--;
2657     }
2658 
    /* Drop the low 32 bits via shift64RightJamming() so that the value
     * passed to roundAndPackFloat32() fits its 32-bit significand argument. */
2659     shift64RightJamming(zSig64, 32, &zSig64);
2660     return roundAndPackFloat32(zSign, zExp, zSig64, status);
2661 }
2662 
2663 
2664 /*----------------------------------------------------------------------------
2665 | Returns the square root of the single-precision floating-point value `a'.
2666 | The operation is performed according to the IEC/IEEE Standard for Binary
2667 | Floating-Point Arithmetic.
2668 *----------------------------------------------------------------------------*/
2669 
2670 float32 float32_sqrt(float32 a, float_status *status)
2671 {
    /*
     * Outline: handle NaN, infinity, negative and zero operands; then take
     * an initial root estimate from estimateSqrt32(), correct it against
     * the exact remainder when the estimate's low bits call for it, and
     * let roundAndPackFloat32() perform the final rounding.  The result of
     * this last step is always packed with a positive sign.
     */
2672     flag aSign;
2673     int_fast16_t aExp, zExp;
2674     uint32_t aSig, zSig;
2675     uint64_t rem, term;
2676     a = float32_squash_input_denormal(a, status);
2677 
2678     aSig = extractFloat32Frac( a );
2679     aExp = extractFloat32Exp( a );
2680     aSign = extractFloat32Sign( a );
    /* a is a NaN or an infinity. */
2681     if ( aExp == 0xFF ) {
2682         if (aSig) {
2683             return propagateFloat32NaN(a, float32_zero, status);
2684         }
        /* +inf is returned unchanged; -inf is an invalid operand. */
2685         if ( ! aSign ) return a;
2686         float_raise(float_flag_invalid, status);
2687         return float32_default_nan;
2688     }
    /* Negative finite operand: a negative zero is returned as is, anything
     * else is invalid. */
2689     if ( aSign ) {
2690         if ( ( aExp | aSig ) == 0 ) return a;
2691         float_raise(float_flag_invalid, status);
2692         return float32_default_nan;
2693     }
    /* Positive zero yields float32_zero; a subnormal is normalized in place. */
2694     if ( aExp == 0 ) {
2695         if ( aSig == 0 ) return float32_zero;
2696         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2697     }
    /* Halve the unbiased exponent and re-bias it with 0x7E. */
2698     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
    /* Make the implicit leading one explicit and move it up to bit 31. */
2699     aSig = ( aSig | 0x00800000 )<<8;
2700     zSig = estimateSqrt32( aExp, aSig ) + 2;
    /* The exact remainder is only computed when the low seven bits of the
     * estimate are 5 or less. */
2701     if ( ( zSig & 0x7F ) <= 5 ) {
        /* A value below 2 here means the "+ 2" above wrapped around 32 bits;
         * the significand is clamped and the final shift is skipped. */
2702         if ( zSig < 2 ) {
2703             zSig = 0x7FFFFFFF;
2704             goto roundAndPack;
2705         }
        /* aSig is halved when the biased exponent is odd. */
2706         aSig >>= aExp & 1;
2707         term = ( (uint64_t) zSig ) * zSig;
2708         rem = ( ( (uint64_t) aSig )<<32 ) - term;
        /* Step zSig down until zSig * zSig no longer exceeds the operand;
         * each step adds (zSig_old^2 - zSig_new^2) = 2 * zSig_new + 1 back
         * to the remainder. */
2709         while ( (int64_t) rem < 0 ) {
2710             --zSig;
2711             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2712         }
        /* A non-zero remainder is recorded in the lowest bit of zSig. */
2713         zSig |= ( rem != 0 );
2714     }
2715     shift32RightJamming( zSig, 1, &zSig );
2716  roundAndPack:
2717     return roundAndPackFloat32(0, zExp, zSig, status);
2718 
2719 }
2720 
2721 /*----------------------------------------------------------------------------
2722 | Returns the binary exponential of the single-precision floating-point value
2723 | `a'. The operation is performed according to the IEC/IEEE Standard for
2724 | Binary Floating-Point Arithmetic.
2725 |
2726 | Uses the following identities:
2727 |
2728 | 1. -------------------------------------------------------------------------
2729 |      x    x*ln(2)
2730 |     2  = e
2731 |
2732 | 2. -------------------------------------------------------------------------
2733 |                      2     3     4     5           n
2734 |      x        x     x     x     x     x           x
2735 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2736 |               1!    2!    3!    4!    5!          n!
2737 *----------------------------------------------------------------------------*/
2738 
/* Taylor-series coefficients for e^x, consumed in order by float32_exp2():
 * entry i holds 1/(i+1)! stored as a float64 bit pattern (1.0, 0.5, 1/6,
 * 1/24, ...).  The trailing comment on each row is the factorial index n.
 */
2739 static const float64 float32_exp2_coefficients[15] =
2740 {
2741     const_float64( 0x3ff0000000000000ll ), /*  1 */
2742     const_float64( 0x3fe0000000000000ll ), /*  2 */
2743     const_float64( 0x3fc5555555555555ll ), /*  3 */
2744     const_float64( 0x3fa5555555555555ll ), /*  4 */
2745     const_float64( 0x3f81111111111111ll ), /*  5 */
2746     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2747     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2748     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2749     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2750     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2751     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2752     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2753     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2754     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2755     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2756 };
2757 
2758 float32 float32_exp2(float32 a, float_status *status)
2759 {
2760     flag aSign;
2761     int_fast16_t aExp;
2762     uint32_t aSig;
2763     float64 r, x, xn;
2764     int i;
2765     a = float32_squash_input_denormal(a, status);
2766 
2767     aSig = extractFloat32Frac( a );
2768     aExp = extractFloat32Exp( a );
2769     aSign = extractFloat32Sign( a );
2770 
2771     if ( aExp == 0xFF) {
2772         if (aSig) {
2773             return propagateFloat32NaN(a, float32_zero, status);
2774         }
2775         return (aSign) ? float32_zero : a;
2776     }
2777     if (aExp == 0) {
2778         if (aSig == 0) return float32_one;
2779     }
2780 
2781     float_raise(float_flag_inexact, status);
2782 
2783     /* ******************************* */
2784     /* using float64 for approximation */
2785     /* ******************************* */
2786     x = float32_to_float64(a, status);
2787     x = float64_mul(x, float64_ln2, status);
2788 
2789     xn = x;
2790     r = float64_one;
2791     for (i = 0 ; i < 15 ; i++) {
2792         float64 f;
2793 
2794         f = float64_mul(xn, float32_exp2_coefficients[i], status);
2795         r = float64_add(r, f, status);
2796 
2797         xn = float64_mul(xn, x, status);
2798     }
2799 
2800     return float64_to_float32(r, status);
2801 }
2802 
2803 /*----------------------------------------------------------------------------
2804 | Returns the binary log of the single-precision floating-point value `a'.
2805 | The operation is performed according to the IEC/IEEE Standard for Binary
2806 | Floating-Point Arithmetic.
2807 *----------------------------------------------------------------------------*/
2808 float32 float32_log2(float32 a, float_status *status)
2809 {
2810     flag aSign, zSign;
2811     int_fast16_t aExp;
2812     uint32_t aSig, zSig, i;
2813 
2814     a = float32_squash_input_denormal(a, status);
2815     aSig = extractFloat32Frac( a );
2816     aExp = extractFloat32Exp( a );
2817     aSign = extractFloat32Sign( a );
2818 
2819     if ( aExp == 0 ) {
2820         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2821         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2822     }
2823     if ( aSign ) {
2824         float_raise(float_flag_invalid, status);
2825         return float32_default_nan;
2826     }
2827     if ( aExp == 0xFF ) {
2828         if (aSig) {
2829             return propagateFloat32NaN(a, float32_zero, status);
2830         }
2831         return a;
2832     }
2833 
2834     aExp -= 0x7F;
2835     aSig |= 0x00800000;
2836     zSign = aExp < 0;
2837     zSig = aExp << 23;
2838 
2839     for (i = 1 << 22; i > 0; i >>= 1) {
2840         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2841         if ( aSig & 0x01000000 ) {
2842             aSig >>= 1;
2843             zSig |= i;
2844         }
2845     }
2846 
2847     if ( zSign )
2848         zSig = -zSig;
2849 
2850     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2851 }
2852 
2853 /*----------------------------------------------------------------------------
2854 | Returns 1 if the single-precision floating-point value `a' is equal to
2855 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2856 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2857 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2858 *----------------------------------------------------------------------------*/
2859 
2860 int float32_eq(float32 a, float32 b, float_status *status)
2861 {
2862     uint32_t av, bv;
2863     a = float32_squash_input_denormal(a, status);
2864     b = float32_squash_input_denormal(b, status);
2865 
2866     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2867          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2868        ) {
2869         float_raise(float_flag_invalid, status);
2870         return 0;
2871     }
2872     av = float32_val(a);
2873     bv = float32_val(b);
2874     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2875 }
2876 
2877 /*----------------------------------------------------------------------------
2878 | Returns 1 if the single-precision floating-point value `a' is less than
2879 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2880 | exception is raised if either operand is a NaN.  The comparison is performed
2881 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2882 *----------------------------------------------------------------------------*/
2883 
2884 int float32_le(float32 a, float32 b, float_status *status)
2885 {
2886     flag aSign, bSign;
2887     uint32_t av, bv;
2888     a = float32_squash_input_denormal(a, status);
2889     b = float32_squash_input_denormal(b, status);
2890 
2891     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2892          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2893        ) {
2894         float_raise(float_flag_invalid, status);
2895         return 0;
2896     }
2897     aSign = extractFloat32Sign( a );
2898     bSign = extractFloat32Sign( b );
2899     av = float32_val(a);
2900     bv = float32_val(b);
2901     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2902     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2903 
2904 }
2905 
2906 /*----------------------------------------------------------------------------
2907 | Returns 1 if the single-precision floating-point value `a' is less than
2908 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2909 | raised if either operand is a NaN.  The comparison is performed according
2910 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2911 *----------------------------------------------------------------------------*/
2912 
2913 int float32_lt(float32 a, float32 b, float_status *status)
2914 {
2915     flag aSign, bSign;
2916     uint32_t av, bv;
2917     a = float32_squash_input_denormal(a, status);
2918     b = float32_squash_input_denormal(b, status);
2919 
2920     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2921          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2922        ) {
2923         float_raise(float_flag_invalid, status);
2924         return 0;
2925     }
2926     aSign = extractFloat32Sign( a );
2927     bSign = extractFloat32Sign( b );
2928     av = float32_val(a);
2929     bv = float32_val(b);
2930     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2931     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2932 
2933 }
2934 
2935 /*----------------------------------------------------------------------------
2936 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2937 | be compared, and 0 otherwise.  The invalid exception is raised if either
2938 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2939 | Standard for Binary Floating-Point Arithmetic.
2940 *----------------------------------------------------------------------------*/
2941 
2942 int float32_unordered(float32 a, float32 b, float_status *status)
2943 {
2944     a = float32_squash_input_denormal(a, status);
2945     b = float32_squash_input_denormal(b, status);
2946 
2947     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2948          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2949        ) {
2950         float_raise(float_flag_invalid, status);
2951         return 1;
2952     }
2953     return 0;
2954 }
2955 
2956 /*----------------------------------------------------------------------------
2957 | Returns 1 if the single-precision floating-point value `a' is equal to
2958 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2959 | exception.  The comparison is performed according to the IEC/IEEE Standard
2960 | for Binary Floating-Point Arithmetic.
2961 *----------------------------------------------------------------------------*/
2962 
2963 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2964 {
2965     a = float32_squash_input_denormal(a, status);
2966     b = float32_squash_input_denormal(b, status);
2967 
2968     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2969          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2970        ) {
2971         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2972             float_raise(float_flag_invalid, status);
2973         }
2974         return 0;
2975     }
2976     return ( float32_val(a) == float32_val(b) ) ||
2977             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2978 }
2979 
2980 /*----------------------------------------------------------------------------
2981 | Returns 1 if the single-precision floating-point value `a' is less than or
2982 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2983 | cause an exception.  Otherwise, the comparison is performed according to the
2984 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2985 *----------------------------------------------------------------------------*/
2986 
2987 int float32_le_quiet(float32 a, float32 b, float_status *status)
2988 {
2989     flag aSign, bSign;
2990     uint32_t av, bv;
2991     a = float32_squash_input_denormal(a, status);
2992     b = float32_squash_input_denormal(b, status);
2993 
2994     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2995          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2996        ) {
2997         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2998             float_raise(float_flag_invalid, status);
2999         }
3000         return 0;
3001     }
3002     aSign = extractFloat32Sign( a );
3003     bSign = extractFloat32Sign( b );
3004     av = float32_val(a);
3005     bv = float32_val(b);
3006     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3007     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3008 
3009 }
3010 
3011 /*----------------------------------------------------------------------------
3012 | Returns 1 if the single-precision floating-point value `a' is less than
3013 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3014 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3015 | Standard for Binary Floating-Point Arithmetic.
3016 *----------------------------------------------------------------------------*/
3017 
3018 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3019 {
3020     flag aSign, bSign;
3021     uint32_t av, bv;
3022     a = float32_squash_input_denormal(a, status);
3023     b = float32_squash_input_denormal(b, status);
3024 
3025     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3026          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3027        ) {
3028         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3029             float_raise(float_flag_invalid, status);
3030         }
3031         return 0;
3032     }
3033     aSign = extractFloat32Sign( a );
3034     bSign = extractFloat32Sign( b );
3035     av = float32_val(a);
3036     bv = float32_val(b);
3037     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3038     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3039 
3040 }
3041 
3042 /*----------------------------------------------------------------------------
3043 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3044 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3045 | comparison is performed according to the IEC/IEEE Standard for Binary
3046 | Floating-Point Arithmetic.
3047 *----------------------------------------------------------------------------*/
3048 
3049 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3050 {
3051     a = float32_squash_input_denormal(a, status);
3052     b = float32_squash_input_denormal(b, status);
3053 
3054     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3055          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3056        ) {
3057         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3058             float_raise(float_flag_invalid, status);
3059         }
3060         return 1;
3061     }
3062     return 0;
3063 }
3064 
3065 /*----------------------------------------------------------------------------
3066 | Returns the result of converting the double-precision floating-point value
3067 | `a' to the 32-bit two's complement integer format.  The conversion is
3068 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3069 | Arithmetic---which means in particular that the conversion is rounded
3070 | according to the current rounding mode.  If `a' is a NaN, the largest
3071 | positive integer is returned.  Otherwise, if the conversion overflows, the
3072 | largest integer with the same sign as `a' is returned.
3073 *----------------------------------------------------------------------------*/
3074 
3075 int32_t float64_to_int32(float64 a, float_status *status)
3076 {
3077     flag aSign;
3078     int_fast16_t aExp, shiftCount;
3079     uint64_t aSig;
3080     a = float64_squash_input_denormal(a, status);
3081 
3082     aSig = extractFloat64Frac( a );
3083     aExp = extractFloat64Exp( a );
3084     aSign = extractFloat64Sign( a );
3085     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3086     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3087     shiftCount = 0x42C - aExp;
3088     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3089     return roundAndPackInt32(aSign, aSig, status);
3090 
3091 }
3092 
3093 /*----------------------------------------------------------------------------
3094 | Returns the result of converting the double-precision floating-point value
3095 | `a' to the 32-bit two's complement integer format.  The conversion is
3096 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3097 | Arithmetic, except that the conversion is always rounded toward zero.
3098 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3099 | the conversion overflows, the largest integer with the same sign as `a' is
3100 | returned.
3101 *----------------------------------------------------------------------------*/
3102 
3103 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3104 {
3105     flag aSign;
3106     int_fast16_t aExp, shiftCount;
3107     uint64_t aSig, savedASig;
3108     int32_t z;
3109     a = float64_squash_input_denormal(a, status);
3110 
3111     aSig = extractFloat64Frac( a );
3112     aExp = extractFloat64Exp( a );
3113     aSign = extractFloat64Sign( a );
3114     if ( 0x41E < aExp ) {
3115         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3116         goto invalid;
3117     }
3118     else if ( aExp < 0x3FF ) {
3119         if (aExp || aSig) {
3120             status->float_exception_flags |= float_flag_inexact;
3121         }
3122         return 0;
3123     }
3124     aSig |= LIT64( 0x0010000000000000 );
3125     shiftCount = 0x433 - aExp;
3126     savedASig = aSig;
3127     aSig >>= shiftCount;
3128     z = aSig;
3129     if ( aSign ) z = - z;
3130     if ( ( z < 0 ) ^ aSign ) {
3131  invalid:
3132         float_raise(float_flag_invalid, status);
3133         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3134     }
3135     if ( ( aSig<<shiftCount ) != savedASig ) {
3136         status->float_exception_flags |= float_flag_inexact;
3137     }
3138     return z;
3139 
3140 }
3141 
3142 /*----------------------------------------------------------------------------
3143 | Returns the result of converting the double-precision floating-point value
3144 | `a' to the 16-bit two's complement integer format.  The conversion is
3145 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3146 | Arithmetic, except that the conversion is always rounded toward zero.
3147 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3148 | the conversion overflows, the largest integer with the same sign as `a' is
3149 | returned.
3150 *----------------------------------------------------------------------------*/
3151 
3152 int_fast16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3153 {
3154     flag aSign;
3155     int_fast16_t aExp, shiftCount;
3156     uint64_t aSig, savedASig;
3157     int32_t z;
3158 
3159     aSig = extractFloat64Frac( a );
3160     aExp = extractFloat64Exp( a );
3161     aSign = extractFloat64Sign( a );
3162     if ( 0x40E < aExp ) {
3163         if ( ( aExp == 0x7FF ) && aSig ) {
3164             aSign = 0;
3165         }
3166         goto invalid;
3167     }
3168     else if ( aExp < 0x3FF ) {
3169         if ( aExp || aSig ) {
3170             status->float_exception_flags |= float_flag_inexact;
3171         }
3172         return 0;
3173     }
3174     aSig |= LIT64( 0x0010000000000000 );
3175     shiftCount = 0x433 - aExp;
3176     savedASig = aSig;
3177     aSig >>= shiftCount;
3178     z = aSig;
3179     if ( aSign ) {
3180         z = - z;
3181     }
3182     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3183  invalid:
3184         float_raise(float_flag_invalid, status);
3185         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3186     }
3187     if ( ( aSig<<shiftCount ) != savedASig ) {
3188         status->float_exception_flags |= float_flag_inexact;
3189     }
3190     return z;
3191 }
3192 
3193 /*----------------------------------------------------------------------------
3194 | Returns the result of converting the double-precision floating-point value
3195 | `a' to the 64-bit two's complement integer format.  The conversion is
3196 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3197 | Arithmetic---which means in particular that the conversion is rounded
3198 | according to the current rounding mode.  If `a' is a NaN, the largest
3199 | positive integer is returned.  Otherwise, if the conversion overflows, the
3200 | largest integer with the same sign as `a' is returned.
3201 *----------------------------------------------------------------------------*/
3202 
3203 int64_t float64_to_int64(float64 a, float_status *status)
3204 {
3205     flag aSign;
3206     int_fast16_t aExp, shiftCount;
3207     uint64_t aSig, aSigExtra;
3208     a = float64_squash_input_denormal(a, status);
3209 
3210     aSig = extractFloat64Frac( a );
3211     aExp = extractFloat64Exp( a );
3212     aSign = extractFloat64Sign( a );
3213     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3214     shiftCount = 0x433 - aExp;
3215     if ( shiftCount <= 0 ) {
3216         if ( 0x43E < aExp ) {
3217             float_raise(float_flag_invalid, status);
3218             if (    ! aSign
3219                  || (    ( aExp == 0x7FF )
3220                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3221                ) {
3222                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3223             }
3224             return (int64_t) LIT64( 0x8000000000000000 );
3225         }
3226         aSigExtra = 0;
3227         aSig <<= - shiftCount;
3228     }
3229     else {
3230         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3231     }
3232     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3233 
3234 }
3235 
3236 /*----------------------------------------------------------------------------
3237 | Returns the result of converting the double-precision floating-point value
3238 | `a' to the 64-bit two's complement integer format.  The conversion is
3239 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3240 | Arithmetic, except that the conversion is always rounded toward zero.
3241 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3242 | the conversion overflows, the largest integer with the same sign as `a' is
3243 | returned.
3244 *----------------------------------------------------------------------------*/
3245 
3246 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3247 {
3248     flag aSign;
3249     int_fast16_t aExp, shiftCount;
3250     uint64_t aSig;
3251     int64_t z;
3252     a = float64_squash_input_denormal(a, status);
3253 
3254     aSig = extractFloat64Frac( a );
3255     aExp = extractFloat64Exp( a );
3256     aSign = extractFloat64Sign( a );
3257     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3258     shiftCount = aExp - 0x433;
3259     if ( 0 <= shiftCount ) {
3260         if ( 0x43E <= aExp ) {
3261             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3262                 float_raise(float_flag_invalid, status);
3263                 if (    ! aSign
3264                      || (    ( aExp == 0x7FF )
3265                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3266                    ) {
3267                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3268                 }
3269             }
3270             return (int64_t) LIT64( 0x8000000000000000 );
3271         }
3272         z = aSig<<shiftCount;
3273     }
3274     else {
3275         if ( aExp < 0x3FE ) {
3276             if (aExp | aSig) {
3277                 status->float_exception_flags |= float_flag_inexact;
3278             }
3279             return 0;
3280         }
3281         z = aSig>>( - shiftCount );
3282         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3283             status->float_exception_flags |= float_flag_inexact;
3284         }
3285     }
3286     if ( aSign ) z = - z;
3287     return z;
3288 
3289 }
3290 
3291 /*----------------------------------------------------------------------------
3292 | Returns the result of converting the double-precision floating-point value
3293 | `a' to the single-precision floating-point format.  The conversion is
3294 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3295 | Arithmetic.
3296 *----------------------------------------------------------------------------*/
3297 
3298 float32 float64_to_float32(float64 a, float_status *status)
3299 {
3300     flag aSign;
3301     int_fast16_t aExp;
3302     uint64_t aSig;
3303     uint32_t zSig;
3304     a = float64_squash_input_denormal(a, status);
3305 
3306     aSig = extractFloat64Frac( a );
3307     aExp = extractFloat64Exp( a );
3308     aSign = extractFloat64Sign( a );
3309     if ( aExp == 0x7FF ) {
3310         if (aSig) {
3311             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3312         }
3313         return packFloat32( aSign, 0xFF, 0 );
3314     }
3315     shift64RightJamming( aSig, 22, &aSig );
3316     zSig = aSig;
3317     if ( aExp || zSig ) {
3318         zSig |= 0x40000000;
3319         aExp -= 0x381;
3320     }
3321     return roundAndPackFloat32(aSign, aExp, zSig, status);
3322 
3323 }
3324 
3325 
3326 /*----------------------------------------------------------------------------
3327 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3328 | half-precision floating-point value, returning the result.  After being
3329 | shifted into the proper positions, the three fields are simply added
3330 | together to form the result.  This means that any integer portion of `zSig'
3331 | will be added into the exponent.  Since a properly normalized significand
3332 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3333 | than the desired result exponent whenever `zSig' is a complete, normalized
3334 | significand.
3335 *----------------------------------------------------------------------------*/
3336 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3337 {
3338     return make_float16(
3339         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3340 }
3341 
3342 /*----------------------------------------------------------------------------
3343 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3344 | and significand `zSig', and returns the proper half-precision floating-
3345 | point value corresponding to the abstract input.  Ordinarily, the abstract
3346 | value is simply rounded and packed into the half-precision format, with
3347 | the inexact exception raised if the abstract input cannot be represented
3348 | exactly.  However, if the abstract value is too large, the overflow and
3349 | inexact exceptions are raised and an infinity or maximal finite value is
3350 | returned.  If the abstract value is too small, the input value is rounded to
3351 | a subnormal number, and the underflow and inexact exceptions are raised if
3352 | the abstract input cannot be represented exactly as a subnormal half-
3353 | precision floating-point number.
3354 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3355 | ARM-style "alternative representation", which omits the NaN and Inf
3356 | encodings in order to raise the maximum representable exponent by one.
3357 |     The input significand `zSig' has its binary point between bits 22
3358 | and 23, which is 13 bits to the left of the usual location.  This shifted
3359 | significand must be normalized or smaller.  If `zSig' is not normalized,
3360 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3361 | and it must not require rounding.  In the usual case that `zSig' is
3362 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3363 | Note the slightly odd position of the binary point in zSig compared with the
3364 | other roundAndPackFloat functions. This should probably be fixed if we
3365 | need to implement more float16 routines than just conversion.
3366 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3367 | Binary Floating-Point Arithmetic.
3368 *----------------------------------------------------------------------------*/
3369 
3370 static float16 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3371                                    uint32_t zSig, flag ieee,
3372                                    float_status *status)
3373 {
3374     int maxexp = ieee ? 29 : 30;
3375     uint32_t mask;
3376     uint32_t increment;
3377     bool rounding_bumps_exp;
3378     bool is_tiny = false;
3379 
3380     /* Calculate the mask of bits of the mantissa which are not
3381      * representable in half-precision and will be lost.
3382      */
3383     if (zExp < 1) {
3384         /* Will be denormal in halfprec */
3385         mask = 0x00ffffff;
3386         if (zExp >= -11) {
3387             mask >>= 11 + zExp;
3388         }
3389     } else {
3390         /* Normal number in halfprec */
3391         mask = 0x00001fff;
3392     }
3393 
3394     switch (status->float_rounding_mode) {
3395     case float_round_nearest_even:
3396         increment = (mask + 1) >> 1;
3397         if ((zSig & mask) == increment) {
3398             increment = zSig & (increment << 1);
3399         }
3400         break;
3401     case float_round_ties_away:
3402         increment = (mask + 1) >> 1;
3403         break;
3404     case float_round_up:
3405         increment = zSign ? 0 : mask;
3406         break;
3407     case float_round_down:
3408         increment = zSign ? mask : 0;
3409         break;
3410     default: /* round_to_zero */
3411         increment = 0;
3412         break;
3413     }
3414 
3415     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3416 
3417     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3418         if (ieee) {
3419             float_raise(float_flag_overflow | float_flag_inexact, status);
3420             return packFloat16(zSign, 0x1f, 0);
3421         } else {
3422             float_raise(float_flag_invalid, status);
3423             return packFloat16(zSign, 0x1f, 0x3ff);
3424         }
3425     }
3426 
3427     if (zExp < 0) {
3428         /* Note that flush-to-zero does not affect half-precision results */
3429         is_tiny =
3430             (status->float_detect_tininess == float_tininess_before_rounding)
3431             || (zExp < -1)
3432             || (!rounding_bumps_exp);
3433     }
3434     if (zSig & mask) {
3435         float_raise(float_flag_inexact, status);
3436         if (is_tiny) {
3437             float_raise(float_flag_underflow, status);
3438         }
3439     }
3440 
3441     zSig += increment;
3442     if (rounding_bumps_exp) {
3443         zSig >>= 1;
3444         zExp++;
3445     }
3446 
3447     if (zExp < -10) {
3448         return packFloat16(zSign, 0, 0);
3449     }
3450     if (zExp < 0) {
3451         zSig >>= -zExp;
3452         zExp = 0;
3453     }
3454     return packFloat16(zSign, zExp, zSig >> 13);
3455 }
3456 
/*----------------------------------------------------------------------------
| Normalizes the nonzero subnormal half-precision significand `aSig'.  The
| significand is shifted left by (leading zero count - 21) bit positions and
| stored at `zSigPtr'; one minus that shift count is stored at `zExpPtr'.
*----------------------------------------------------------------------------*/
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t leftShift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - leftShift;
    *zSigPtr = aSig << leftShift;
}
3464 
3465 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3466    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3467 
3468 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3469 {
3470     flag aSign;
3471     int_fast16_t aExp;
3472     uint32_t aSig;
3473 
3474     aSign = extractFloat16Sign(a);
3475     aExp = extractFloat16Exp(a);
3476     aSig = extractFloat16Frac(a);
3477 
3478     if (aExp == 0x1f && ieee) {
3479         if (aSig) {
3480             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3481         }
3482         return packFloat32(aSign, 0xff, 0);
3483     }
3484     if (aExp == 0) {
3485         if (aSig == 0) {
3486             return packFloat32(aSign, 0, 0);
3487         }
3488 
3489         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3490         aExp--;
3491     }
3492     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3493 }
3494 
3495 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3496 {
3497     flag aSign;
3498     int_fast16_t aExp;
3499     uint32_t aSig;
3500 
3501     a = float32_squash_input_denormal(a, status);
3502 
3503     aSig = extractFloat32Frac( a );
3504     aExp = extractFloat32Exp( a );
3505     aSign = extractFloat32Sign( a );
3506     if ( aExp == 0xFF ) {
3507         if (aSig) {
3508             /* Input is a NaN */
3509             if (!ieee) {
3510                 float_raise(float_flag_invalid, status);
3511                 return packFloat16(aSign, 0, 0);
3512             }
3513             return commonNaNToFloat16(
3514                 float32ToCommonNaN(a, status), status);
3515         }
3516         /* Infinity */
3517         if (!ieee) {
3518             float_raise(float_flag_invalid, status);
3519             return packFloat16(aSign, 0x1f, 0x3ff);
3520         }
3521         return packFloat16(aSign, 0x1f, 0);
3522     }
3523     if (aExp == 0 && aSig == 0) {
3524         return packFloat16(aSign, 0, 0);
3525     }
3526     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3527      * even if the input is denormal; however this is harmless because
3528      * the largest possible single-precision denormal is still smaller
3529      * than the smallest representable half-precision denormal, and so we
3530      * will end up ignoring aSig and returning via the "always return zero"
3531      * codepath.
3532      */
3533     aSig |= 0x00800000;
3534     aExp -= 0x71;
3535 
3536     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3537 }
3538 
3539 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3540 {
3541     flag aSign;
3542     int_fast16_t aExp;
3543     uint32_t aSig;
3544 
3545     aSign = extractFloat16Sign(a);
3546     aExp = extractFloat16Exp(a);
3547     aSig = extractFloat16Frac(a);
3548 
3549     if (aExp == 0x1f && ieee) {
3550         if (aSig) {
3551             return commonNaNToFloat64(
3552                 float16ToCommonNaN(a, status), status);
3553         }
3554         return packFloat64(aSign, 0x7ff, 0);
3555     }
3556     if (aExp == 0) {
3557         if (aSig == 0) {
3558             return packFloat64(aSign, 0, 0);
3559         }
3560 
3561         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3562         aExp--;
3563     }
3564     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3565 }
3566 
3567 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3568 {
3569     flag aSign;
3570     int_fast16_t aExp;
3571     uint64_t aSig;
3572     uint32_t zSig;
3573 
3574     a = float64_squash_input_denormal(a, status);
3575 
3576     aSig = extractFloat64Frac(a);
3577     aExp = extractFloat64Exp(a);
3578     aSign = extractFloat64Sign(a);
3579     if (aExp == 0x7FF) {
3580         if (aSig) {
3581             /* Input is a NaN */
3582             if (!ieee) {
3583                 float_raise(float_flag_invalid, status);
3584                 return packFloat16(aSign, 0, 0);
3585             }
3586             return commonNaNToFloat16(
3587                 float64ToCommonNaN(a, status), status);
3588         }
3589         /* Infinity */
3590         if (!ieee) {
3591             float_raise(float_flag_invalid, status);
3592             return packFloat16(aSign, 0x1f, 0x3ff);
3593         }
3594         return packFloat16(aSign, 0x1f, 0);
3595     }
3596     shift64RightJamming(aSig, 29, &aSig);
3597     zSig = aSig;
3598     if (aExp == 0 && zSig == 0) {
3599         return packFloat16(aSign, 0, 0);
3600     }
3601     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3602      * even if the input is denormal; however this is harmless because
3603      * the largest possible single-precision denormal is still smaller
3604      * than the smallest representable half-precision denormal, and so we
3605      * will end up ignoring aSig and returning via the "always return zero"
3606      * codepath.
3607      */
3608     zSig |= 0x00800000;
3609     aExp -= 0x3F1;
3610 
3611     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3612 }
3613 
3614 /*----------------------------------------------------------------------------
3615 | Returns the result of converting the double-precision floating-point value
3616 | `a' to the extended double-precision floating-point format.  The conversion
3617 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3618 | Arithmetic.
3619 *----------------------------------------------------------------------------*/
3620 
3621 floatx80 float64_to_floatx80(float64 a, float_status *status)
3622 {
3623     flag aSign;
3624     int_fast16_t aExp;
3625     uint64_t aSig;
3626 
3627     a = float64_squash_input_denormal(a, status);
3628     aSig = extractFloat64Frac( a );
3629     aExp = extractFloat64Exp( a );
3630     aSign = extractFloat64Sign( a );
3631     if ( aExp == 0x7FF ) {
3632         if (aSig) {
3633             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3634         }
3635         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3636     }
3637     if ( aExp == 0 ) {
3638         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3639         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3640     }
3641     return
3642         packFloatx80(
3643             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3644 
3645 }
3646 
3647 /*----------------------------------------------------------------------------
3648 | Returns the result of converting the double-precision floating-point value
3649 | `a' to the quadruple-precision floating-point format.  The conversion is
3650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3651 | Arithmetic.
3652 *----------------------------------------------------------------------------*/
3653 
3654 float128 float64_to_float128(float64 a, float_status *status)
3655 {
3656     flag aSign;
3657     int_fast16_t aExp;
3658     uint64_t aSig, zSig0, zSig1;
3659 
3660     a = float64_squash_input_denormal(a, status);
3661     aSig = extractFloat64Frac( a );
3662     aExp = extractFloat64Exp( a );
3663     aSign = extractFloat64Sign( a );
3664     if ( aExp == 0x7FF ) {
3665         if (aSig) {
3666             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3667         }
3668         return packFloat128( aSign, 0x7FFF, 0, 0 );
3669     }
3670     if ( aExp == 0 ) {
3671         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3672         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3673         --aExp;
3674     }
3675     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3676     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3677 
3678 }
3679 
3680 /*----------------------------------------------------------------------------
3681 | Rounds the double-precision floating-point value `a' to an integer, and
3682 | returns the result as a double-precision floating-point value.  The
3683 | operation is performed according to the IEC/IEEE Standard for Binary
3684 | Floating-Point Arithmetic.
3685 *----------------------------------------------------------------------------*/
3686 
3687 float64 float64_round_to_int(float64 a, float_status *status)
3688 {
3689     flag aSign;
3690     int_fast16_t aExp;
3691     uint64_t lastBitMask, roundBitsMask;
3692     uint64_t z;
3693     a = float64_squash_input_denormal(a, status);
3694 
3695     aExp = extractFloat64Exp( a );
3696     if ( 0x433 <= aExp ) {
3697         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3698             return propagateFloat64NaN(a, a, status);
3699         }
3700         return a;
3701     }
3702     if ( aExp < 0x3FF ) {
3703         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3704         status->float_exception_flags |= float_flag_inexact;
3705         aSign = extractFloat64Sign( a );
3706         switch (status->float_rounding_mode) {
3707          case float_round_nearest_even:
3708             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3709                 return packFloat64( aSign, 0x3FF, 0 );
3710             }
3711             break;
3712         case float_round_ties_away:
3713             if (aExp == 0x3FE) {
3714                 return packFloat64(aSign, 0x3ff, 0);
3715             }
3716             break;
3717          case float_round_down:
3718             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3719          case float_round_up:
3720             return make_float64(
3721             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3722         }
3723         return packFloat64( aSign, 0, 0 );
3724     }
3725     lastBitMask = 1;
3726     lastBitMask <<= 0x433 - aExp;
3727     roundBitsMask = lastBitMask - 1;
3728     z = float64_val(a);
3729     switch (status->float_rounding_mode) {
3730     case float_round_nearest_even:
3731         z += lastBitMask >> 1;
3732         if ((z & roundBitsMask) == 0) {
3733             z &= ~lastBitMask;
3734         }
3735         break;
3736     case float_round_ties_away:
3737         z += lastBitMask >> 1;
3738         break;
3739     case float_round_to_zero:
3740         break;
3741     case float_round_up:
3742         if (!extractFloat64Sign(make_float64(z))) {
3743             z += roundBitsMask;
3744         }
3745         break;
3746     case float_round_down:
3747         if (extractFloat64Sign(make_float64(z))) {
3748             z += roundBitsMask;
3749         }
3750         break;
3751     default:
3752         abort();
3753     }
3754     z &= ~ roundBitsMask;
3755     if (z != float64_val(a)) {
3756         status->float_exception_flags |= float_flag_inexact;
3757     }
3758     return make_float64(z);
3759 
3760 }
3761 
3762 float64 float64_trunc_to_int(float64 a, float_status *status)
3763 {
3764     int oldmode;
3765     float64 res;
3766     oldmode = status->float_rounding_mode;
3767     status->float_rounding_mode = float_round_to_zero;
3768     res = float64_round_to_int(a, status);
3769     status->float_rounding_mode = oldmode;
3770     return res;
3771 }
3772 
3773 /*----------------------------------------------------------------------------
3774 | Returns the result of adding the absolute values of the double-precision
3775 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3776 | before being returned.  `zSign' is ignored if the result is a NaN.
3777 | The addition is performed according to the IEC/IEEE Standard for Binary
3778 | Floating-Point Arithmetic.
3779 *----------------------------------------------------------------------------*/
3780 
3781 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3782                               float_status *status)
3783 {
3784     int_fast16_t aExp, bExp, zExp;
3785     uint64_t aSig, bSig, zSig;
3786     int_fast16_t expDiff;
3787 
3788     aSig = extractFloat64Frac( a );
3789     aExp = extractFloat64Exp( a );
3790     bSig = extractFloat64Frac( b );
3791     bExp = extractFloat64Exp( b );
3792     expDiff = aExp - bExp;
3793     aSig <<= 9;
3794     bSig <<= 9;
3795     if ( 0 < expDiff ) {
3796         if ( aExp == 0x7FF ) {
3797             if (aSig) {
3798                 return propagateFloat64NaN(a, b, status);
3799             }
3800             return a;
3801         }
3802         if ( bExp == 0 ) {
3803             --expDiff;
3804         }
3805         else {
3806             bSig |= LIT64( 0x2000000000000000 );
3807         }
3808         shift64RightJamming( bSig, expDiff, &bSig );
3809         zExp = aExp;
3810     }
3811     else if ( expDiff < 0 ) {
3812         if ( bExp == 0x7FF ) {
3813             if (bSig) {
3814                 return propagateFloat64NaN(a, b, status);
3815             }
3816             return packFloat64( zSign, 0x7FF, 0 );
3817         }
3818         if ( aExp == 0 ) {
3819             ++expDiff;
3820         }
3821         else {
3822             aSig |= LIT64( 0x2000000000000000 );
3823         }
3824         shift64RightJamming( aSig, - expDiff, &aSig );
3825         zExp = bExp;
3826     }
3827     else {
3828         if ( aExp == 0x7FF ) {
3829             if (aSig | bSig) {
3830                 return propagateFloat64NaN(a, b, status);
3831             }
3832             return a;
3833         }
3834         if ( aExp == 0 ) {
3835             if (status->flush_to_zero) {
3836                 if (aSig | bSig) {
3837                     float_raise(float_flag_output_denormal, status);
3838                 }
3839                 return packFloat64(zSign, 0, 0);
3840             }
3841             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3842         }
3843         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3844         zExp = aExp;
3845         goto roundAndPack;
3846     }
3847     aSig |= LIT64( 0x2000000000000000 );
3848     zSig = ( aSig + bSig )<<1;
3849     --zExp;
3850     if ( (int64_t) zSig < 0 ) {
3851         zSig = aSig + bSig;
3852         ++zExp;
3853     }
3854  roundAndPack:
3855     return roundAndPackFloat64(zSign, zExp, zSig, status);
3856 
3857 }
3858 
3859 /*----------------------------------------------------------------------------
3860 | Returns the result of subtracting the absolute values of the double-
3861 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3862 | difference is negated before being returned.  `zSign' is ignored if the
3863 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3864 | Standard for Binary Floating-Point Arithmetic.
3865 *----------------------------------------------------------------------------*/
3866 
3867 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3868                               float_status *status)
3869 {
3870     int_fast16_t aExp, bExp, zExp;
3871     uint64_t aSig, bSig, zSig;
3872     int_fast16_t expDiff;
3873 
3874     aSig = extractFloat64Frac( a );
3875     aExp = extractFloat64Exp( a );
3876     bSig = extractFloat64Frac( b );
3877     bExp = extractFloat64Exp( b );
3878     expDiff = aExp - bExp;
3879     aSig <<= 10;
3880     bSig <<= 10;
3881     if ( 0 < expDiff ) goto aExpBigger;
3882     if ( expDiff < 0 ) goto bExpBigger;
3883     if ( aExp == 0x7FF ) {
3884         if (aSig | bSig) {
3885             return propagateFloat64NaN(a, b, status);
3886         }
3887         float_raise(float_flag_invalid, status);
3888         return float64_default_nan;
3889     }
3890     if ( aExp == 0 ) {
3891         aExp = 1;
3892         bExp = 1;
3893     }
3894     if ( bSig < aSig ) goto aBigger;
3895     if ( aSig < bSig ) goto bBigger;
3896     return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3897  bExpBigger:
3898     if ( bExp == 0x7FF ) {
3899         if (bSig) {
3900             return propagateFloat64NaN(a, b, status);
3901         }
3902         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3903     }
3904     if ( aExp == 0 ) {
3905         ++expDiff;
3906     }
3907     else {
3908         aSig |= LIT64( 0x4000000000000000 );
3909     }
3910     shift64RightJamming( aSig, - expDiff, &aSig );
3911     bSig |= LIT64( 0x4000000000000000 );
3912  bBigger:
3913     zSig = bSig - aSig;
3914     zExp = bExp;
3915     zSign ^= 1;
3916     goto normalizeRoundAndPack;
3917  aExpBigger:
3918     if ( aExp == 0x7FF ) {
3919         if (aSig) {
3920             return propagateFloat64NaN(a, b, status);
3921         }
3922         return a;
3923     }
3924     if ( bExp == 0 ) {
3925         --expDiff;
3926     }
3927     else {
3928         bSig |= LIT64( 0x4000000000000000 );
3929     }
3930     shift64RightJamming( bSig, expDiff, &bSig );
3931     aSig |= LIT64( 0x4000000000000000 );
3932  aBigger:
3933     zSig = aSig - bSig;
3934     zExp = aExp;
3935  normalizeRoundAndPack:
3936     --zExp;
3937     return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3938 
3939 }
3940 
3941 /*----------------------------------------------------------------------------
3942 | Returns the result of adding the double-precision floating-point values `a'
3943 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3944 | Binary Floating-Point Arithmetic.
3945 *----------------------------------------------------------------------------*/
3946 
3947 float64 float64_add(float64 a, float64 b, float_status *status)
3948 {
3949     flag aSign, bSign;
3950     a = float64_squash_input_denormal(a, status);
3951     b = float64_squash_input_denormal(b, status);
3952 
3953     aSign = extractFloat64Sign( a );
3954     bSign = extractFloat64Sign( b );
3955     if ( aSign == bSign ) {
3956         return addFloat64Sigs(a, b, aSign, status);
3957     }
3958     else {
3959         return subFloat64Sigs(a, b, aSign, status);
3960     }
3961 
3962 }
3963 
3964 /*----------------------------------------------------------------------------
3965 | Returns the result of subtracting the double-precision floating-point values
3966 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3967 | for Binary Floating-Point Arithmetic.
3968 *----------------------------------------------------------------------------*/
3969 
3970 float64 float64_sub(float64 a, float64 b, float_status *status)
3971 {
3972     flag aSign, bSign;
3973     a = float64_squash_input_denormal(a, status);
3974     b = float64_squash_input_denormal(b, status);
3975 
3976     aSign = extractFloat64Sign( a );
3977     bSign = extractFloat64Sign( b );
3978     if ( aSign == bSign ) {
3979         return subFloat64Sigs(a, b, aSign, status);
3980     }
3981     else {
3982         return addFloat64Sigs(a, b, aSign, status);
3983     }
3984 
3985 }
3986 
3987 /*----------------------------------------------------------------------------
3988 | Returns the result of multiplying the double-precision floating-point values
3989 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3990 | for Binary Floating-Point Arithmetic.
3991 *----------------------------------------------------------------------------*/
3992 
3993 float64 float64_mul(float64 a, float64 b, float_status *status)
3994 {
3995     flag aSign, bSign, zSign;
3996     int_fast16_t aExp, bExp, zExp;
3997     uint64_t aSig, bSig, zSig0, zSig1;
3998 
3999     a = float64_squash_input_denormal(a, status);
4000     b = float64_squash_input_denormal(b, status);
4001 
4002     aSig = extractFloat64Frac( a );
4003     aExp = extractFloat64Exp( a );
4004     aSign = extractFloat64Sign( a );
4005     bSig = extractFloat64Frac( b );
4006     bExp = extractFloat64Exp( b );
4007     bSign = extractFloat64Sign( b );
4008     zSign = aSign ^ bSign;
4009     if ( aExp == 0x7FF ) {
4010         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4011             return propagateFloat64NaN(a, b, status);
4012         }
4013         if ( ( bExp | bSig ) == 0 ) {
4014             float_raise(float_flag_invalid, status);
4015             return float64_default_nan;
4016         }
4017         return packFloat64( zSign, 0x7FF, 0 );
4018     }
4019     if ( bExp == 0x7FF ) {
4020         if (bSig) {
4021             return propagateFloat64NaN(a, b, status);
4022         }
4023         if ( ( aExp | aSig ) == 0 ) {
4024             float_raise(float_flag_invalid, status);
4025             return float64_default_nan;
4026         }
4027         return packFloat64( zSign, 0x7FF, 0 );
4028     }
4029     if ( aExp == 0 ) {
4030         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4031         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4032     }
4033     if ( bExp == 0 ) {
4034         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4035         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4036     }
4037     zExp = aExp + bExp - 0x3FF;
4038     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4039     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4040     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4041     zSig0 |= ( zSig1 != 0 );
4042     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4043         zSig0 <<= 1;
4044         --zExp;
4045     }
4046     return roundAndPackFloat64(zSign, zExp, zSig0, status);
4047 
4048 }
4049 
4050 /*----------------------------------------------------------------------------
4051 | Returns the result of dividing the double-precision floating-point value `a'
4052 | by the corresponding value `b'.  The operation is performed according to
4053 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4054 *----------------------------------------------------------------------------*/
4055 
4056 float64 float64_div(float64 a, float64 b, float_status *status)
4057 {
4058     flag aSign, bSign, zSign;
4059     int_fast16_t aExp, bExp, zExp;
4060     uint64_t aSig, bSig, zSig;
4061     uint64_t rem0, rem1;
4062     uint64_t term0, term1;
4063     a = float64_squash_input_denormal(a, status);
4064     b = float64_squash_input_denormal(b, status);
4065 
4066     aSig = extractFloat64Frac( a );
4067     aExp = extractFloat64Exp( a );
4068     aSign = extractFloat64Sign( a );
4069     bSig = extractFloat64Frac( b );
4070     bExp = extractFloat64Exp( b );
4071     bSign = extractFloat64Sign( b );
4072     zSign = aSign ^ bSign;
4073     if ( aExp == 0x7FF ) {
4074         if (aSig) {
4075             return propagateFloat64NaN(a, b, status);
4076         }
4077         if ( bExp == 0x7FF ) {
4078             if (bSig) {
4079                 return propagateFloat64NaN(a, b, status);
4080             }
4081             float_raise(float_flag_invalid, status);
4082             return float64_default_nan;
4083         }
4084         return packFloat64( zSign, 0x7FF, 0 );
4085     }
4086     if ( bExp == 0x7FF ) {
4087         if (bSig) {
4088             return propagateFloat64NaN(a, b, status);
4089         }
4090         return packFloat64( zSign, 0, 0 );
4091     }
4092     if ( bExp == 0 ) {
4093         if ( bSig == 0 ) {
4094             if ( ( aExp | aSig ) == 0 ) {
4095                 float_raise(float_flag_invalid, status);
4096                 return float64_default_nan;
4097             }
4098             float_raise(float_flag_divbyzero, status);
4099             return packFloat64( zSign, 0x7FF, 0 );
4100         }
4101         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4102     }
4103     if ( aExp == 0 ) {
4104         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4105         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4106     }
4107     zExp = aExp - bExp + 0x3FD;
4108     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4109     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4110     if ( bSig <= ( aSig + aSig ) ) {
4111         aSig >>= 1;
4112         ++zExp;
4113     }
4114     zSig = estimateDiv128To64( aSig, 0, bSig );
4115     if ( ( zSig & 0x1FF ) <= 2 ) {
4116         mul64To128( bSig, zSig, &term0, &term1 );
4117         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4118         while ( (int64_t) rem0 < 0 ) {
4119             --zSig;
4120             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4121         }
4122         zSig |= ( rem1 != 0 );
4123     }
4124     return roundAndPackFloat64(zSign, zExp, zSig, status);
4125 
4126 }
4127 
4128 /*----------------------------------------------------------------------------
4129 | Returns the remainder of the double-precision floating-point value `a'
4130 | with respect to the corresponding value `b'.  The operation is performed
4131 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4132 *----------------------------------------------------------------------------*/
4133 
4134 float64 float64_rem(float64 a, float64 b, float_status *status)
4135 {
4136     flag aSign, zSign;
4137     int_fast16_t aExp, bExp, expDiff;
4138     uint64_t aSig, bSig;
4139     uint64_t q, alternateASig;
4140     int64_t sigMean;
4141 
4142     a = float64_squash_input_denormal(a, status);
4143     b = float64_squash_input_denormal(b, status);
4144     aSig = extractFloat64Frac( a );
4145     aExp = extractFloat64Exp( a );
4146     aSign = extractFloat64Sign( a );
4147     bSig = extractFloat64Frac( b );
4148     bExp = extractFloat64Exp( b );
4149     if ( aExp == 0x7FF ) {
4150         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4151             return propagateFloat64NaN(a, b, status);
4152         }
4153         float_raise(float_flag_invalid, status);
4154         return float64_default_nan;
4155     }
4156     if ( bExp == 0x7FF ) {
4157         if (bSig) {
4158             return propagateFloat64NaN(a, b, status);
4159         }
4160         return a;
4161     }
4162     if ( bExp == 0 ) {
4163         if ( bSig == 0 ) {
4164             float_raise(float_flag_invalid, status);
4165             return float64_default_nan;
4166         }
4167         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4168     }
4169     if ( aExp == 0 ) {
4170         if ( aSig == 0 ) return a;
4171         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4172     }
4173     expDiff = aExp - bExp;
4174     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4175     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4176     if ( expDiff < 0 ) {
4177         if ( expDiff < -1 ) return a;
4178         aSig >>= 1;
4179     }
4180     q = ( bSig <= aSig );
4181     if ( q ) aSig -= bSig;
4182     expDiff -= 64;
4183     while ( 0 < expDiff ) {
4184         q = estimateDiv128To64( aSig, 0, bSig );
4185         q = ( 2 < q ) ? q - 2 : 0;
4186         aSig = - ( ( bSig>>2 ) * q );
4187         expDiff -= 62;
4188     }
4189     expDiff += 64;
4190     if ( 0 < expDiff ) {
4191         q = estimateDiv128To64( aSig, 0, bSig );
4192         q = ( 2 < q ) ? q - 2 : 0;
4193         q >>= 64 - expDiff;
4194         bSig >>= 2;
4195         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4196     }
4197     else {
4198         aSig >>= 2;
4199         bSig >>= 2;
4200     }
4201     do {
4202         alternateASig = aSig;
4203         ++q;
4204         aSig -= bSig;
4205     } while ( 0 <= (int64_t) aSig );
4206     sigMean = aSig + alternateASig;
4207     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4208         aSig = alternateASig;
4209     }
4210     zSign = ( (int64_t) aSig < 0 );
4211     if ( zSign ) aSig = - aSig;
4212     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4213 
4214 }
4215 
4216 /*----------------------------------------------------------------------------
4217 | Returns the result of multiplying the double-precision floating-point values
4218 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4219 | multiplication.  The operation is performed according to the IEC/IEEE
4220 | Standard for Binary Floating-Point Arithmetic 754-2008.
4221 | The flags argument allows the caller to select negation of the
4222 | addend, the intermediate product, or the final result. (The difference
4223 | between this and having the caller do a separate negation is that negating
4224 | externally will flip the sign bit on NaNs.)
4225 *----------------------------------------------------------------------------*/
4226 
4227 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4228                        float_status *status)
4229 {
4230     flag aSign, bSign, cSign, zSign;
4231     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
4232     uint64_t aSig, bSig, cSig;
4233     flag pInf, pZero, pSign;
4234     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4235     int shiftcount;
4236     flag signflip, infzero;
4237 
4238     a = float64_squash_input_denormal(a, status);
4239     b = float64_squash_input_denormal(b, status);
4240     c = float64_squash_input_denormal(c, status);
4241     aSig = extractFloat64Frac(a);
4242     aExp = extractFloat64Exp(a);
4243     aSign = extractFloat64Sign(a);
4244     bSig = extractFloat64Frac(b);
4245     bExp = extractFloat64Exp(b);
4246     bSign = extractFloat64Sign(b);
4247     cSig = extractFloat64Frac(c);
4248     cExp = extractFloat64Exp(c);
4249     cSign = extractFloat64Sign(c);
4250 
4251     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4252                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4253 
4254     /* It is implementation-defined whether the cases of (0,inf,qnan)
4255      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4256      * they return if they do), so we have to hand this information
4257      * off to the target-specific pick-a-NaN routine.
4258      */
4259     if (((aExp == 0x7ff) && aSig) ||
4260         ((bExp == 0x7ff) && bSig) ||
4261         ((cExp == 0x7ff) && cSig)) {
4262         return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4263     }
4264 
4265     if (infzero) {
4266         float_raise(float_flag_invalid, status);
4267         return float64_default_nan;
4268     }
4269 
4270     if (flags & float_muladd_negate_c) {
4271         cSign ^= 1;
4272     }
4273 
4274     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4275 
4276     /* Work out the sign and type of the product */
4277     pSign = aSign ^ bSign;
4278     if (flags & float_muladd_negate_product) {
4279         pSign ^= 1;
4280     }
4281     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4282     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4283 
4284     if (cExp == 0x7ff) {
4285         if (pInf && (pSign ^ cSign)) {
4286             /* addition of opposite-signed infinities => InvalidOperation */
4287             float_raise(float_flag_invalid, status);
4288             return float64_default_nan;
4289         }
4290         /* Otherwise generate an infinity of the same sign */
4291         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4292     }
4293 
4294     if (pInf) {
4295         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4296     }
4297 
4298     if (pZero) {
4299         if (cExp == 0) {
4300             if (cSig == 0) {
4301                 /* Adding two exact zeroes */
4302                 if (pSign == cSign) {
4303                     zSign = pSign;
4304                 } else if (status->float_rounding_mode == float_round_down) {
4305                     zSign = 1;
4306                 } else {
4307                     zSign = 0;
4308                 }
4309                 return packFloat64(zSign ^ signflip, 0, 0);
4310             }
4311             /* Exact zero plus a denorm */
4312             if (status->flush_to_zero) {
4313                 float_raise(float_flag_output_denormal, status);
4314                 return packFloat64(cSign ^ signflip, 0, 0);
4315             }
4316         }
4317         /* Zero plus something non-zero : just return the something */
4318         if (flags & float_muladd_halve_result) {
4319             if (cExp == 0) {
4320                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4321             }
4322             /* Subtract one to halve, and one again because roundAndPackFloat64
4323              * wants one less than the true exponent.
4324              */
4325             cExp -= 2;
4326             cSig = (cSig | 0x0010000000000000ULL) << 10;
4327             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4328         }
4329         return packFloat64(cSign ^ signflip, cExp, cSig);
4330     }
4331 
4332     if (aExp == 0) {
4333         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4334     }
4335     if (bExp == 0) {
4336         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4337     }
4338 
4339     /* Calculate the actual result a * b + c */
4340 
4341     /* Multiply first; this is easy. */
4342     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4343      * because we want the true exponent, not the "one-less-than"
4344      * flavour that roundAndPackFloat64() takes.
4345      */
4346     pExp = aExp + bExp - 0x3fe;
4347     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4348     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4349     mul64To128(aSig, bSig, &pSig0, &pSig1);
4350     if ((int64_t)(pSig0 << 1) >= 0) {
4351         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4352         pExp--;
4353     }
4354 
4355     zSign = pSign ^ signflip;
4356 
4357     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4358      * bit in position 126.
4359      */
4360     if (cExp == 0) {
4361         if (!cSig) {
4362             /* Throw out the special case of c being an exact zero now */
4363             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4364             if (flags & float_muladd_halve_result) {
4365                 pExp--;
4366             }
4367             return roundAndPackFloat64(zSign, pExp - 1,
4368                                        pSig1, status);
4369         }
4370         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4371     }
4372 
4373     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4374      * significand of the addend, with the explicit bit in position 126.
4375      */
4376     cSig0 = cSig << (126 - 64 - 52);
4377     cSig1 = 0;
4378     cSig0 |= LIT64(0x4000000000000000);
4379     expDiff = pExp - cExp;
4380 
4381     if (pSign == cSign) {
4382         /* Addition */
4383         if (expDiff > 0) {
4384             /* scale c to match p */
4385             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4386             zExp = pExp;
4387         } else if (expDiff < 0) {
4388             /* scale p to match c */
4389             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4390             zExp = cExp;
4391         } else {
4392             /* no scaling needed */
4393             zExp = cExp;
4394         }
4395         /* Add significands and make sure explicit bit ends up in posn 126 */
4396         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4397         if ((int64_t)zSig0 < 0) {
4398             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4399         } else {
4400             zExp--;
4401         }
4402         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4403         if (flags & float_muladd_halve_result) {
4404             zExp--;
4405         }
4406         return roundAndPackFloat64(zSign, zExp, zSig1, status);
4407     } else {
4408         /* Subtraction */
4409         if (expDiff > 0) {
4410             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4411             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4412             zExp = pExp;
4413         } else if (expDiff < 0) {
4414             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4415             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4416             zExp = cExp;
4417             zSign ^= 1;
4418         } else {
4419             zExp = pExp;
4420             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4421                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4422             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4423                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4424                 zSign ^= 1;
4425             } else {
4426                 /* Exact zero */
4427                 zSign = signflip;
4428                 if (status->float_rounding_mode == float_round_down) {
4429                     zSign ^= 1;
4430                 }
4431                 return packFloat64(zSign, 0, 0);
4432             }
4433         }
4434         --zExp;
4435         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4436          * starting with the significand in a pair of uint64_t.
4437          */
4438         if (zSig0) {
4439             shiftcount = countLeadingZeros64(zSig0) - 1;
4440             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4441             if (zSig1) {
4442                 zSig0 |= 1;
4443             }
4444             zExp -= shiftcount;
4445         } else {
4446             shiftcount = countLeadingZeros64(zSig1);
4447             if (shiftcount == 0) {
4448                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4449                 zExp -= 63;
4450             } else {
4451                 shiftcount--;
4452                 zSig0 = zSig1 << shiftcount;
4453                 zExp -= (shiftcount + 64);
4454             }
4455         }
4456         if (flags & float_muladd_halve_result) {
4457             zExp--;
4458         }
4459         return roundAndPackFloat64(zSign, zExp, zSig0, status);
4460     }
4461 }
4462 
4463 /*----------------------------------------------------------------------------
4464 | Returns the square root of the double-precision floating-point value `a'.
4465 | The operation is performed according to the IEC/IEEE Standard for Binary
4466 | Floating-Point Arithmetic.
4467 *----------------------------------------------------------------------------*/
4468 
4469 float64 float64_sqrt(float64 a, float_status *status)
4470 {
4471     flag aSign;
4472     int_fast16_t aExp, zExp;
4473     uint64_t aSig, zSig, doubleZSig;
4474     uint64_t rem0, rem1, term0, term1;
4475     a = float64_squash_input_denormal(a, status);
4476 
4477     aSig = extractFloat64Frac( a );
4478     aExp = extractFloat64Exp( a );
4479     aSign = extractFloat64Sign( a );
4480     if ( aExp == 0x7FF ) {
4481         if (aSig) {
4482             return propagateFloat64NaN(a, a, status);
4483         }
4484         if ( ! aSign ) return a;
4485         float_raise(float_flag_invalid, status);
4486         return float64_default_nan;
4487     }
4488     if ( aSign ) {
4489         if ( ( aExp | aSig ) == 0 ) return a;
4490         float_raise(float_flag_invalid, status);
4491         return float64_default_nan;
4492     }
4493     if ( aExp == 0 ) {
4494         if ( aSig == 0 ) return float64_zero;
4495         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4496     }
4497     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4498     aSig |= LIT64( 0x0010000000000000 );
4499     zSig = estimateSqrt32( aExp, aSig>>21 );
4500     aSig <<= 9 - ( aExp & 1 );
4501     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4502     if ( ( zSig & 0x1FF ) <= 5 ) {
4503         doubleZSig = zSig<<1;
4504         mul64To128( zSig, zSig, &term0, &term1 );
4505         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4506         while ( (int64_t) rem0 < 0 ) {
4507             --zSig;
4508             doubleZSig -= 2;
4509             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4510         }
4511         zSig |= ( ( rem0 | rem1 ) != 0 );
4512     }
4513     return roundAndPackFloat64(0, zExp, zSig, status);
4514 
4515 }
4516 
4517 /*----------------------------------------------------------------------------
4518 | Returns the binary log of the double-precision floating-point value `a'.
4519 | The operation is performed according to the IEC/IEEE Standard for Binary
4520 | Floating-Point Arithmetic.
4521 *----------------------------------------------------------------------------*/
4522 float64 float64_log2(float64 a, float_status *status)
4523 {
4524     flag aSign, zSign;
4525     int_fast16_t aExp;
4526     uint64_t aSig, aSig0, aSig1, zSig, i;
4527     a = float64_squash_input_denormal(a, status);
4528 
4529     aSig = extractFloat64Frac( a );
4530     aExp = extractFloat64Exp( a );
4531     aSign = extractFloat64Sign( a );
4532 
4533     if ( aExp == 0 ) {
4534         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4535         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4536     }
4537     if ( aSign ) {
4538         float_raise(float_flag_invalid, status);
4539         return float64_default_nan;
4540     }
4541     if ( aExp == 0x7FF ) {
4542         if (aSig) {
4543             return propagateFloat64NaN(a, float64_zero, status);
4544         }
4545         return a;
4546     }
4547 
4548     aExp -= 0x3FF;
4549     aSig |= LIT64( 0x0010000000000000 );
4550     zSign = aExp < 0;
4551     zSig = (uint64_t)aExp << 52;
4552     for (i = 1LL << 51; i > 0; i >>= 1) {
4553         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4554         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4555         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4556             aSig >>= 1;
4557             zSig |= i;
4558         }
4559     }
4560 
4561     if ( zSign )
4562         zSig = -zSig;
4563     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4564 }
4565 
4566 /*----------------------------------------------------------------------------
4567 | Returns 1 if the double-precision floating-point value `a' is equal to the
4568 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4569 | if either operand is a NaN.  Otherwise, the comparison is performed
4570 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4571 *----------------------------------------------------------------------------*/
4572 
4573 int float64_eq(float64 a, float64 b, float_status *status)
4574 {
4575     uint64_t av, bv;
4576     a = float64_squash_input_denormal(a, status);
4577     b = float64_squash_input_denormal(b, status);
4578 
4579     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4580          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4581        ) {
4582         float_raise(float_flag_invalid, status);
4583         return 0;
4584     }
4585     av = float64_val(a);
4586     bv = float64_val(b);
4587     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4588 
4589 }
4590 
4591 /*----------------------------------------------------------------------------
4592 | Returns 1 if the double-precision floating-point value `a' is less than or
4593 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4594 | exception is raised if either operand is a NaN.  The comparison is performed
4595 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4596 *----------------------------------------------------------------------------*/
4597 
4598 int float64_le(float64 a, float64 b, float_status *status)
4599 {
4600     flag aSign, bSign;
4601     uint64_t av, bv;
4602     a = float64_squash_input_denormal(a, status);
4603     b = float64_squash_input_denormal(b, status);
4604 
4605     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4606          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4607        ) {
4608         float_raise(float_flag_invalid, status);
4609         return 0;
4610     }
4611     aSign = extractFloat64Sign( a );
4612     bSign = extractFloat64Sign( b );
4613     av = float64_val(a);
4614     bv = float64_val(b);
4615     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4616     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4617 
4618 }
4619 
4620 /*----------------------------------------------------------------------------
4621 | Returns 1 if the double-precision floating-point value `a' is less than
4622 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4623 | raised if either operand is a NaN.  The comparison is performed according
4624 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4625 *----------------------------------------------------------------------------*/
4626 
4627 int float64_lt(float64 a, float64 b, float_status *status)
4628 {
4629     flag aSign, bSign;
4630     uint64_t av, bv;
4631 
4632     a = float64_squash_input_denormal(a, status);
4633     b = float64_squash_input_denormal(b, status);
4634     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4635          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4636        ) {
4637         float_raise(float_flag_invalid, status);
4638         return 0;
4639     }
4640     aSign = extractFloat64Sign( a );
4641     bSign = extractFloat64Sign( b );
4642     av = float64_val(a);
4643     bv = float64_val(b);
4644     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4645     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4646 
4647 }
4648 
4649 /*----------------------------------------------------------------------------
4650 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4651 | be compared, and 0 otherwise.  The invalid exception is raised if either
4652 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4653 | Standard for Binary Floating-Point Arithmetic.
4654 *----------------------------------------------------------------------------*/
4655 
4656 int float64_unordered(float64 a, float64 b, float_status *status)
4657 {
4658     a = float64_squash_input_denormal(a, status);
4659     b = float64_squash_input_denormal(b, status);
4660 
4661     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4662          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4663        ) {
4664         float_raise(float_flag_invalid, status);
4665         return 1;
4666     }
4667     return 0;
4668 }
4669 
4670 /*----------------------------------------------------------------------------
4671 | Returns 1 if the double-precision floating-point value `a' is equal to the
4672 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4673 | exception.The comparison is performed according to the IEC/IEEE Standard
4674 | for Binary Floating-Point Arithmetic.
4675 *----------------------------------------------------------------------------*/
4676 
4677 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4678 {
4679     uint64_t av, bv;
4680     a = float64_squash_input_denormal(a, status);
4681     b = float64_squash_input_denormal(b, status);
4682 
4683     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4684          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4685        ) {
4686         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4687             float_raise(float_flag_invalid, status);
4688         }
4689         return 0;
4690     }
4691     av = float64_val(a);
4692     bv = float64_val(b);
4693     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4694 
4695 }
4696 
4697 /*----------------------------------------------------------------------------
4698 | Returns 1 if the double-precision floating-point value `a' is less than or
4699 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4700 | cause an exception.  Otherwise, the comparison is performed according to the
4701 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4702 *----------------------------------------------------------------------------*/
4703 
4704 int float64_le_quiet(float64 a, float64 b, float_status *status)
4705 {
4706     flag aSign, bSign;
4707     uint64_t av, bv;
4708     a = float64_squash_input_denormal(a, status);
4709     b = float64_squash_input_denormal(b, status);
4710 
4711     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4712          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4713        ) {
4714         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4715             float_raise(float_flag_invalid, status);
4716         }
4717         return 0;
4718     }
4719     aSign = extractFloat64Sign( a );
4720     bSign = extractFloat64Sign( b );
4721     av = float64_val(a);
4722     bv = float64_val(b);
4723     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4724     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4725 
4726 }
4727 
4728 /*----------------------------------------------------------------------------
4729 | Returns 1 if the double-precision floating-point value `a' is less than
4730 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4731 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4732 | Standard for Binary Floating-Point Arithmetic.
4733 *----------------------------------------------------------------------------*/
4734 
4735 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4736 {
4737     flag aSign, bSign;
4738     uint64_t av, bv;
4739     a = float64_squash_input_denormal(a, status);
4740     b = float64_squash_input_denormal(b, status);
4741 
4742     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4743          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4744        ) {
4745         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4746             float_raise(float_flag_invalid, status);
4747         }
4748         return 0;
4749     }
4750     aSign = extractFloat64Sign( a );
4751     bSign = extractFloat64Sign( b );
4752     av = float64_val(a);
4753     bv = float64_val(b);
4754     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4755     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4756 
4757 }
4758 
4759 /*----------------------------------------------------------------------------
4760 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4761 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4762 | comparison is performed according to the IEC/IEEE Standard for Binary
4763 | Floating-Point Arithmetic.
4764 *----------------------------------------------------------------------------*/
4765 
4766 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4767 {
4768     a = float64_squash_input_denormal(a, status);
4769     b = float64_squash_input_denormal(b, status);
4770 
4771     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4772          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4773        ) {
4774         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4775             float_raise(float_flag_invalid, status);
4776         }
4777         return 1;
4778     }
4779     return 0;
4780 }
4781 
4782 /*----------------------------------------------------------------------------
4783 | Returns the result of converting the extended double-precision floating-
4784 | point value `a' to the 32-bit two's complement integer format.  The
4785 | conversion is performed according to the IEC/IEEE Standard for Binary
4786 | Floating-Point Arithmetic---which means in particular that the conversion
4787 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4788 | largest positive integer is returned.  Otherwise, if the conversion
4789 | overflows, the largest integer with the same sign as `a' is returned.
4790 *----------------------------------------------------------------------------*/
4791 
4792 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4793 {
4794     flag aSign;
4795     int32_t aExp, shiftCount;
4796     uint64_t aSig;
4797 
4798     aSig = extractFloatx80Frac( a );
4799     aExp = extractFloatx80Exp( a );
4800     aSign = extractFloatx80Sign( a );
4801     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4802     shiftCount = 0x4037 - aExp;
4803     if ( shiftCount <= 0 ) shiftCount = 1;
4804     shift64RightJamming( aSig, shiftCount, &aSig );
4805     return roundAndPackInt32(aSign, aSig, status);
4806 
4807 }
4808 
4809 /*----------------------------------------------------------------------------
4810 | Returns the result of converting the extended double-precision floating-
4811 | point value `a' to the 32-bit two's complement integer format.  The
4812 | conversion is performed according to the IEC/IEEE Standard for Binary
4813 | Floating-Point Arithmetic, except that the conversion is always rounded
4814 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4815 | Otherwise, if the conversion overflows, the largest integer with the same
4816 | sign as `a' is returned.
4817 *----------------------------------------------------------------------------*/
4818 
4819 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4820 {
4821     flag aSign;
4822     int32_t aExp, shiftCount;
4823     uint64_t aSig, savedASig;
4824     int32_t z;
4825 
4826     aSig = extractFloatx80Frac( a );
4827     aExp = extractFloatx80Exp( a );
4828     aSign = extractFloatx80Sign( a );
4829     if ( 0x401E < aExp ) {
4830         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4831         goto invalid;
4832     }
4833     else if ( aExp < 0x3FFF ) {
4834         if (aExp || aSig) {
4835             status->float_exception_flags |= float_flag_inexact;
4836         }
4837         return 0;
4838     }
4839     shiftCount = 0x403E - aExp;
4840     savedASig = aSig;
4841     aSig >>= shiftCount;
4842     z = aSig;
4843     if ( aSign ) z = - z;
4844     if ( ( z < 0 ) ^ aSign ) {
4845  invalid:
4846         float_raise(float_flag_invalid, status);
4847         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4848     }
4849     if ( ( aSig<<shiftCount ) != savedASig ) {
4850         status->float_exception_flags |= float_flag_inexact;
4851     }
4852     return z;
4853 
4854 }
4855 
4856 /*----------------------------------------------------------------------------
4857 | Returns the result of converting the extended double-precision floating-
4858 | point value `a' to the 64-bit two's complement integer format.  The
4859 | conversion is performed according to the IEC/IEEE Standard for Binary
4860 | Floating-Point Arithmetic---which means in particular that the conversion
4861 | is rounded according to the current rounding mode.  If `a' is a NaN,
4862 | the largest positive integer is returned.  Otherwise, if the conversion
4863 | overflows, the largest integer with the same sign as `a' is returned.
4864 *----------------------------------------------------------------------------*/
4865 
4866 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4867 {
4868     flag aSign;
4869     int32_t aExp, shiftCount;
4870     uint64_t aSig, aSigExtra;
4871 
4872     aSig = extractFloatx80Frac( a );
4873     aExp = extractFloatx80Exp( a );
4874     aSign = extractFloatx80Sign( a );
4875     shiftCount = 0x403E - aExp;
4876     if ( shiftCount <= 0 ) {
4877         if ( shiftCount ) {
4878             float_raise(float_flag_invalid, status);
4879             if (    ! aSign
4880                  || (    ( aExp == 0x7FFF )
4881                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4882                ) {
4883                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4884             }
4885             return (int64_t) LIT64( 0x8000000000000000 );
4886         }
4887         aSigExtra = 0;
4888     }
4889     else {
4890         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4891     }
4892     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4893 
4894 }
4895 
4896 /*----------------------------------------------------------------------------
4897 | Returns the result of converting the extended double-precision floating-
4898 | point value `a' to the 64-bit two's complement integer format.  The
4899 | conversion is performed according to the IEC/IEEE Standard for Binary
4900 | Floating-Point Arithmetic, except that the conversion is always rounded
4901 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4902 | Otherwise, if the conversion overflows, the largest integer with the same
4903 | sign as `a' is returned.
4904 *----------------------------------------------------------------------------*/
4905 
4906 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4907 {
4908     flag aSign;
4909     int32_t aExp, shiftCount;
4910     uint64_t aSig;
4911     int64_t z;
4912 
4913     aSig = extractFloatx80Frac( a );
4914     aExp = extractFloatx80Exp( a );
4915     aSign = extractFloatx80Sign( a );
4916     shiftCount = aExp - 0x403E;
4917     if ( 0 <= shiftCount ) {
4918         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4919         if ( ( a.high != 0xC03E ) || aSig ) {
4920             float_raise(float_flag_invalid, status);
4921             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4922                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4923             }
4924         }
4925         return (int64_t) LIT64( 0x8000000000000000 );
4926     }
4927     else if ( aExp < 0x3FFF ) {
4928         if (aExp | aSig) {
4929             status->float_exception_flags |= float_flag_inexact;
4930         }
4931         return 0;
4932     }
4933     z = aSig>>( - shiftCount );
4934     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4935         status->float_exception_flags |= float_flag_inexact;
4936     }
4937     if ( aSign ) z = - z;
4938     return z;
4939 
4940 }
4941 
4942 /*----------------------------------------------------------------------------
4943 | Returns the result of converting the extended double-precision floating-
4944 | point value `a' to the single-precision floating-point format.  The
4945 | conversion is performed according to the IEC/IEEE Standard for Binary
4946 | Floating-Point Arithmetic.
4947 *----------------------------------------------------------------------------*/
4948 
4949 float32 floatx80_to_float32(floatx80 a, float_status *status)
4950 {
4951     flag aSign;
4952     int32_t aExp;
4953     uint64_t aSig;
4954 
4955     aSig = extractFloatx80Frac( a );
4956     aExp = extractFloatx80Exp( a );
4957     aSign = extractFloatx80Sign( a );
4958     if ( aExp == 0x7FFF ) {
4959         if ( (uint64_t) ( aSig<<1 ) ) {
4960             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4961         }
4962         return packFloat32( aSign, 0xFF, 0 );
4963     }
4964     shift64RightJamming( aSig, 33, &aSig );
4965     if ( aExp || aSig ) aExp -= 0x3F81;
4966     return roundAndPackFloat32(aSign, aExp, aSig, status);
4967 
4968 }
4969 
4970 /*----------------------------------------------------------------------------
4971 | Returns the result of converting the extended double-precision floating-
4972 | point value `a' to the double-precision floating-point format.  The
4973 | conversion is performed according to the IEC/IEEE Standard for Binary
4974 | Floating-Point Arithmetic.
4975 *----------------------------------------------------------------------------*/
4976 
4977 float64 floatx80_to_float64(floatx80 a, float_status *status)
4978 {
4979     flag aSign;
4980     int32_t aExp;
4981     uint64_t aSig, zSig;
4982 
4983     aSig = extractFloatx80Frac( a );
4984     aExp = extractFloatx80Exp( a );
4985     aSign = extractFloatx80Sign( a );
4986     if ( aExp == 0x7FFF ) {
4987         if ( (uint64_t) ( aSig<<1 ) ) {
4988             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4989         }
4990         return packFloat64( aSign, 0x7FF, 0 );
4991     }
4992     shift64RightJamming( aSig, 1, &zSig );
4993     if ( aExp || aSig ) aExp -= 0x3C01;
4994     return roundAndPackFloat64(aSign, aExp, zSig, status);
4995 
4996 }
4997 
4998 /*----------------------------------------------------------------------------
4999 | Returns the result of converting the extended double-precision floating-
5000 | point value `a' to the quadruple-precision floating-point format.  The
5001 | conversion is performed according to the IEC/IEEE Standard for Binary
5002 | Floating-Point Arithmetic.
5003 *----------------------------------------------------------------------------*/
5004 
5005 float128 floatx80_to_float128(floatx80 a, float_status *status)
5006 {
5007     flag aSign;
5008     int_fast16_t aExp;
5009     uint64_t aSig, zSig0, zSig1;
5010 
5011     aSig = extractFloatx80Frac( a );
5012     aExp = extractFloatx80Exp( a );
5013     aSign = extractFloatx80Sign( a );
5014     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5015         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5016     }
5017     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5018     return packFloat128( aSign, aExp, zSig0, zSig1 );
5019 
5020 }
5021 
5022 /*----------------------------------------------------------------------------
5023 | Rounds the extended double-precision floating-point value `a' to an integer,
5024 | and returns the result as an extended quadruple-precision floating-point
5025 | value.  The operation is performed according to the IEC/IEEE Standard for
5026 | Binary Floating-Point Arithmetic.
5027 *----------------------------------------------------------------------------*/
5028 
5029 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5030 {
5031     flag aSign;
5032     int32_t aExp;
5033     uint64_t lastBitMask, roundBitsMask;
5034     floatx80 z;
5035 
5036     aExp = extractFloatx80Exp( a );
5037     if ( 0x403E <= aExp ) {
5038         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5039             return propagateFloatx80NaN(a, a, status);
5040         }
5041         return a;
5042     }
5043     if ( aExp < 0x3FFF ) {
5044         if (    ( aExp == 0 )
5045              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5046             return a;
5047         }
5048         status->float_exception_flags |= float_flag_inexact;
5049         aSign = extractFloatx80Sign( a );
5050         switch (status->float_rounding_mode) {
5051          case float_round_nearest_even:
5052             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5053                ) {
5054                 return
5055                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5056             }
5057             break;
5058         case float_round_ties_away:
5059             if (aExp == 0x3FFE) {
5060                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5061             }
5062             break;
5063          case float_round_down:
5064             return
5065                   aSign ?
5066                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5067                 : packFloatx80( 0, 0, 0 );
5068          case float_round_up:
5069             return
5070                   aSign ? packFloatx80( 1, 0, 0 )
5071                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5072         }
5073         return packFloatx80( aSign, 0, 0 );
5074     }
5075     lastBitMask = 1;
5076     lastBitMask <<= 0x403E - aExp;
5077     roundBitsMask = lastBitMask - 1;
5078     z = a;
5079     switch (status->float_rounding_mode) {
5080     case float_round_nearest_even:
5081         z.low += lastBitMask>>1;
5082         if ((z.low & roundBitsMask) == 0) {
5083             z.low &= ~lastBitMask;
5084         }
5085         break;
5086     case float_round_ties_away:
5087         z.low += lastBitMask >> 1;
5088         break;
5089     case float_round_to_zero:
5090         break;
5091     case float_round_up:
5092         if (!extractFloatx80Sign(z)) {
5093             z.low += roundBitsMask;
5094         }
5095         break;
5096     case float_round_down:
5097         if (extractFloatx80Sign(z)) {
5098             z.low += roundBitsMask;
5099         }
5100         break;
5101     default:
5102         abort();
5103     }
5104     z.low &= ~ roundBitsMask;
5105     if ( z.low == 0 ) {
5106         ++z.high;
5107         z.low = LIT64( 0x8000000000000000 );
5108     }
5109     if (z.low != a.low) {
5110         status->float_exception_flags |= float_flag_inexact;
5111     }
5112     return z;
5113 
5114 }
5115 
5116 /*----------------------------------------------------------------------------
5117 | Returns the result of adding the absolute values of the extended double-
5118 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5119 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5120 | The addition is performed according to the IEC/IEEE Standard for Binary
5121 | Floating-Point Arithmetic.
5122 *----------------------------------------------------------------------------*/
5123 
5124 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5125                                 float_status *status)
5126 {
5127     int32_t aExp, bExp, zExp;
5128     uint64_t aSig, bSig, zSig0, zSig1;
5129     int32_t expDiff;
5130 
5131     aSig = extractFloatx80Frac( a );
5132     aExp = extractFloatx80Exp( a );
5133     bSig = extractFloatx80Frac( b );
5134     bExp = extractFloatx80Exp( b );
5135     expDiff = aExp - bExp;
5136     if ( 0 < expDiff ) {
5137         if ( aExp == 0x7FFF ) {
5138             if ((uint64_t)(aSig << 1)) {
5139                 return propagateFloatx80NaN(a, b, status);
5140             }
5141             return a;
5142         }
5143         if ( bExp == 0 ) --expDiff;
5144         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5145         zExp = aExp;
5146     }
5147     else if ( expDiff < 0 ) {
5148         if ( bExp == 0x7FFF ) {
5149             if ((uint64_t)(bSig << 1)) {
5150                 return propagateFloatx80NaN(a, b, status);
5151             }
5152             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5153         }
5154         if ( aExp == 0 ) ++expDiff;
5155         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5156         zExp = bExp;
5157     }
5158     else {
5159         if ( aExp == 0x7FFF ) {
5160             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5161                 return propagateFloatx80NaN(a, b, status);
5162             }
5163             return a;
5164         }
5165         zSig1 = 0;
5166         zSig0 = aSig + bSig;
5167         if ( aExp == 0 ) {
5168             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5169             goto roundAndPack;
5170         }
5171         zExp = aExp;
5172         goto shiftRight1;
5173     }
5174     zSig0 = aSig + bSig;
5175     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5176  shiftRight1:
5177     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5178     zSig0 |= LIT64( 0x8000000000000000 );
5179     ++zExp;
5180  roundAndPack:
5181     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5182                                 zSign, zExp, zSig0, zSig1, status);
5183 }
5184 
5185 /*----------------------------------------------------------------------------
5186 | Returns the result of subtracting the absolute values of the extended
5187 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5188 | difference is negated before being returned.  `zSign' is ignored if the
5189 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5190 | Standard for Binary Floating-Point Arithmetic.
5191 *----------------------------------------------------------------------------*/
5192 
5193 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5194                                 float_status *status)
5195 {
5196     int32_t aExp, bExp, zExp;
5197     uint64_t aSig, bSig, zSig0, zSig1;
5198     int32_t expDiff;
5199     floatx80 z;
5200 
5201     aSig = extractFloatx80Frac( a );
5202     aExp = extractFloatx80Exp( a );
5203     bSig = extractFloatx80Frac( b );
5204     bExp = extractFloatx80Exp( b );
5205     expDiff = aExp - bExp;
5206     if ( 0 < expDiff ) goto aExpBigger;
5207     if ( expDiff < 0 ) goto bExpBigger;
5208     if ( aExp == 0x7FFF ) {
5209         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5210             return propagateFloatx80NaN(a, b, status);
5211         }
5212         float_raise(float_flag_invalid, status);
5213         z.low = floatx80_default_nan_low;
5214         z.high = floatx80_default_nan_high;
5215         return z;
5216     }
5217     if ( aExp == 0 ) {
5218         aExp = 1;
5219         bExp = 1;
5220     }
5221     zSig1 = 0;
5222     if ( bSig < aSig ) goto aBigger;
5223     if ( aSig < bSig ) goto bBigger;
5224     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5225  bExpBigger:
5226     if ( bExp == 0x7FFF ) {
5227         if ((uint64_t)(bSig << 1)) {
5228             return propagateFloatx80NaN(a, b, status);
5229         }
5230         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5231     }
5232     if ( aExp == 0 ) ++expDiff;
5233     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5234  bBigger:
5235     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5236     zExp = bExp;
5237     zSign ^= 1;
5238     goto normalizeRoundAndPack;
5239  aExpBigger:
5240     if ( aExp == 0x7FFF ) {
5241         if ((uint64_t)(aSig << 1)) {
5242             return propagateFloatx80NaN(a, b, status);
5243         }
5244         return a;
5245     }
5246     if ( bExp == 0 ) --expDiff;
5247     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5248  aBigger:
5249     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5250     zExp = aExp;
5251  normalizeRoundAndPack:
5252     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5253                                          zSign, zExp, zSig0, zSig1, status);
5254 }
5255 
5256 /*----------------------------------------------------------------------------
5257 | Returns the result of adding the extended double-precision floating-point
5258 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5259 | Standard for Binary Floating-Point Arithmetic.
5260 *----------------------------------------------------------------------------*/
5261 
5262 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5263 {
5264     flag aSign, bSign;
5265 
5266     aSign = extractFloatx80Sign( a );
5267     bSign = extractFloatx80Sign( b );
5268     if ( aSign == bSign ) {
5269         return addFloatx80Sigs(a, b, aSign, status);
5270     }
5271     else {
5272         return subFloatx80Sigs(a, b, aSign, status);
5273     }
5274 
5275 }
5276 
5277 /*----------------------------------------------------------------------------
5278 | Returns the result of subtracting the extended double-precision floating-
5279 | point values `a' and `b'.  The operation is performed according to the
5280 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5281 *----------------------------------------------------------------------------*/
5282 
5283 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5284 {
5285     flag aSign, bSign;
5286 
5287     aSign = extractFloatx80Sign( a );
5288     bSign = extractFloatx80Sign( b );
5289     if ( aSign == bSign ) {
5290         return subFloatx80Sigs(a, b, aSign, status);
5291     }
5292     else {
5293         return addFloatx80Sigs(a, b, aSign, status);
5294     }
5295 
5296 }
5297 
5298 /*----------------------------------------------------------------------------
5299 | Returns the result of multiplying the extended double-precision floating-
5300 | point values `a' and `b'.  The operation is performed according to the
5301 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5302 *----------------------------------------------------------------------------*/
5303 
5304 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5305 {
5306     flag aSign, bSign, zSign;
5307     int32_t aExp, bExp, zExp;
5308     uint64_t aSig, bSig, zSig0, zSig1;
5309     floatx80 z;
5310 
5311     aSig = extractFloatx80Frac( a );
5312     aExp = extractFloatx80Exp( a );
5313     aSign = extractFloatx80Sign( a );
5314     bSig = extractFloatx80Frac( b );
5315     bExp = extractFloatx80Exp( b );
5316     bSign = extractFloatx80Sign( b );
5317     zSign = aSign ^ bSign;
5318     if ( aExp == 0x7FFF ) {
5319         if (    (uint64_t) ( aSig<<1 )
5320              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5321             return propagateFloatx80NaN(a, b, status);
5322         }
5323         if ( ( bExp | bSig ) == 0 ) goto invalid;
5324         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5325     }
5326     if ( bExp == 0x7FFF ) {
5327         if ((uint64_t)(bSig << 1)) {
5328             return propagateFloatx80NaN(a, b, status);
5329         }
5330         if ( ( aExp | aSig ) == 0 ) {
5331  invalid:
5332             float_raise(float_flag_invalid, status);
5333             z.low = floatx80_default_nan_low;
5334             z.high = floatx80_default_nan_high;
5335             return z;
5336         }
5337         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5338     }
5339     if ( aExp == 0 ) {
5340         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5341         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5342     }
5343     if ( bExp == 0 ) {
5344         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5345         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5346     }
5347     zExp = aExp + bExp - 0x3FFE;
5348     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5349     if ( 0 < (int64_t) zSig0 ) {
5350         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5351         --zExp;
5352     }
5353     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5354                                 zSign, zExp, zSig0, zSig1, status);
5355 }
5356 
5357 /*----------------------------------------------------------------------------
5358 | Returns the result of dividing the extended double-precision floating-point
5359 | value `a' by the corresponding value `b'.  The operation is performed
5360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361 *----------------------------------------------------------------------------*/
5362 
5363 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5364 {
5365     flag aSign, bSign, zSign;
5366     int32_t aExp, bExp, zExp;
5367     uint64_t aSig, bSig, zSig0, zSig1;
5368     uint64_t rem0, rem1, rem2, term0, term1, term2;
5369     floatx80 z;
5370 
5371     aSig = extractFloatx80Frac( a );
5372     aExp = extractFloatx80Exp( a );
5373     aSign = extractFloatx80Sign( a );
5374     bSig = extractFloatx80Frac( b );
5375     bExp = extractFloatx80Exp( b );
5376     bSign = extractFloatx80Sign( b );
5377     zSign = aSign ^ bSign;
5378     if ( aExp == 0x7FFF ) {
5379         if ((uint64_t)(aSig << 1)) {
5380             return propagateFloatx80NaN(a, b, status);
5381         }
5382         if ( bExp == 0x7FFF ) {
5383             if ((uint64_t)(bSig << 1)) {
5384                 return propagateFloatx80NaN(a, b, status);
5385             }
5386             goto invalid;
5387         }
5388         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5389     }
5390     if ( bExp == 0x7FFF ) {
5391         if ((uint64_t)(bSig << 1)) {
5392             return propagateFloatx80NaN(a, b, status);
5393         }
5394         return packFloatx80( zSign, 0, 0 );
5395     }
5396     if ( bExp == 0 ) {
5397         if ( bSig == 0 ) {
5398             if ( ( aExp | aSig ) == 0 ) {
5399  invalid:
5400                 float_raise(float_flag_invalid, status);
5401                 z.low = floatx80_default_nan_low;
5402                 z.high = floatx80_default_nan_high;
5403                 return z;
5404             }
5405             float_raise(float_flag_divbyzero, status);
5406             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5407         }
5408         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5409     }
5410     if ( aExp == 0 ) {
5411         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5412         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5413     }
5414     zExp = aExp - bExp + 0x3FFE;
5415     rem1 = 0;
5416     if ( bSig <= aSig ) {
5417         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5418         ++zExp;
5419     }
5420     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5421     mul64To128( bSig, zSig0, &term0, &term1 );
5422     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5423     while ( (int64_t) rem0 < 0 ) {
5424         --zSig0;
5425         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5426     }
5427     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5428     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5429         mul64To128( bSig, zSig1, &term1, &term2 );
5430         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5431         while ( (int64_t) rem1 < 0 ) {
5432             --zSig1;
5433             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5434         }
5435         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5436     }
5437     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5438                                 zSign, zExp, zSig0, zSig1, status);
5439 }
5440 
5441 /*----------------------------------------------------------------------------
5442 | Returns the remainder of the extended double-precision floating-point value
5443 | `a' with respect to the corresponding value `b'.  The operation is performed
5444 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5445 *----------------------------------------------------------------------------*/
5446 
5447 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5448 {
5449     flag aSign, zSign;
5450     int32_t aExp, bExp, expDiff;
5451     uint64_t aSig0, aSig1, bSig;
5452     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5453     floatx80 z;
5454 
5455     aSig0 = extractFloatx80Frac( a );
5456     aExp = extractFloatx80Exp( a );
5457     aSign = extractFloatx80Sign( a );
5458     bSig = extractFloatx80Frac( b );
5459     bExp = extractFloatx80Exp( b );
5460     if ( aExp == 0x7FFF ) {
5461         if (    (uint64_t) ( aSig0<<1 )
5462              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5463             return propagateFloatx80NaN(a, b, status);
5464         }
5465         goto invalid;
5466     }
5467     if ( bExp == 0x7FFF ) {
5468         if ((uint64_t)(bSig << 1)) {
5469             return propagateFloatx80NaN(a, b, status);
5470         }
5471         return a;
5472     }
5473     if ( bExp == 0 ) {
5474         if ( bSig == 0 ) {
5475  invalid:
5476             float_raise(float_flag_invalid, status);
5477             z.low = floatx80_default_nan_low;
5478             z.high = floatx80_default_nan_high;
5479             return z;
5480         }
5481         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5482     }
5483     if ( aExp == 0 ) {
5484         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5485         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5486     }
5487     bSig |= LIT64( 0x8000000000000000 );
5488     zSign = aSign;
5489     expDiff = aExp - bExp;
5490     aSig1 = 0;
5491     if ( expDiff < 0 ) {
5492         if ( expDiff < -1 ) return a;
5493         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5494         expDiff = 0;
5495     }
5496     q = ( bSig <= aSig0 );
5497     if ( q ) aSig0 -= bSig;
5498     expDiff -= 64;
5499     while ( 0 < expDiff ) {
5500         q = estimateDiv128To64( aSig0, aSig1, bSig );
5501         q = ( 2 < q ) ? q - 2 : 0;
5502         mul64To128( bSig, q, &term0, &term1 );
5503         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5504         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5505         expDiff -= 62;
5506     }
5507     expDiff += 64;
5508     if ( 0 < expDiff ) {
5509         q = estimateDiv128To64( aSig0, aSig1, bSig );
5510         q = ( 2 < q ) ? q - 2 : 0;
5511         q >>= 64 - expDiff;
5512         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5513         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5514         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5515         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5516             ++q;
5517             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5518         }
5519     }
5520     else {
5521         term1 = 0;
5522         term0 = bSig;
5523     }
5524     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5525     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5526          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5527               && ( q & 1 ) )
5528        ) {
5529         aSig0 = alternateASig0;
5530         aSig1 = alternateASig1;
5531         zSign = ! zSign;
5532     }
5533     return
5534         normalizeRoundAndPackFloatx80(
5535             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5536 
5537 }
5538 
5539 /*----------------------------------------------------------------------------
5540 | Returns the square root of the extended double-precision floating-point
5541 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5542 | for Binary Floating-Point Arithmetic.
5543 *----------------------------------------------------------------------------*/
5544 
5545 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5546 {
5547     flag aSign;
5548     int32_t aExp, zExp;
5549     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5550     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5551     floatx80 z;
5552 
5553     aSig0 = extractFloatx80Frac( a );
5554     aExp = extractFloatx80Exp( a );
5555     aSign = extractFloatx80Sign( a );
5556     if ( aExp == 0x7FFF ) {
5557         if ((uint64_t)(aSig0 << 1)) {
5558             return propagateFloatx80NaN(a, a, status);
5559         }
5560         if ( ! aSign ) return a;
5561         goto invalid;
5562     }
5563     if ( aSign ) {
5564         if ( ( aExp | aSig0 ) == 0 ) return a;
5565  invalid:
5566         float_raise(float_flag_invalid, status);
5567         z.low = floatx80_default_nan_low;
5568         z.high = floatx80_default_nan_high;
5569         return z;
5570     }
5571     if ( aExp == 0 ) {
5572         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5573         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5574     }
5575     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5576     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5577     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5578     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5579     doubleZSig0 = zSig0<<1;
5580     mul64To128( zSig0, zSig0, &term0, &term1 );
5581     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5582     while ( (int64_t) rem0 < 0 ) {
5583         --zSig0;
5584         doubleZSig0 -= 2;
5585         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5586     }
5587     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5588     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5589         if ( zSig1 == 0 ) zSig1 = 1;
5590         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5591         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5592         mul64To128( zSig1, zSig1, &term2, &term3 );
5593         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5594         while ( (int64_t) rem1 < 0 ) {
5595             --zSig1;
5596             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5597             term3 |= 1;
5598             term2 |= doubleZSig0;
5599             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5600         }
5601         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5602     }
5603     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5604     zSig0 |= doubleZSig0;
5605     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5606                                 0, zExp, zSig0, zSig1, status);
5607 }
5608 
5609 /*----------------------------------------------------------------------------
5610 | Returns 1 if the extended double-precision floating-point value `a' is equal
5611 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5612 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5613 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5614 *----------------------------------------------------------------------------*/
5615 
5616 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5617 {
5618 
5619     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5620               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5621          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5622               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5623        ) {
5624         float_raise(float_flag_invalid, status);
5625         return 0;
5626     }
5627     return
5628            ( a.low == b.low )
5629         && (    ( a.high == b.high )
5630              || (    ( a.low == 0 )
5631                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5632            );
5633 
5634 }
5635 
5636 /*----------------------------------------------------------------------------
5637 | Returns 1 if the extended double-precision floating-point value `a' is
5638 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5639 | invalid exception is raised if either operand is a NaN.  The comparison is
5640 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5641 | Arithmetic.
5642 *----------------------------------------------------------------------------*/
5643 
5644 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5645 {
5646     flag aSign, bSign;
5647 
5648     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5649               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5650          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5651               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5652        ) {
5653         float_raise(float_flag_invalid, status);
5654         return 0;
5655     }
5656     aSign = extractFloatx80Sign( a );
5657     bSign = extractFloatx80Sign( b );
5658     if ( aSign != bSign ) {
5659         return
5660                aSign
5661             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5662                  == 0 );
5663     }
5664     return
5665           aSign ? le128( b.high, b.low, a.high, a.low )
5666         : le128( a.high, a.low, b.high, b.low );
5667 
5668 }
5669 
5670 /*----------------------------------------------------------------------------
5671 | Returns 1 if the extended double-precision floating-point value `a' is
5672 | less than the corresponding value `b', and 0 otherwise.  The invalid
5673 | exception is raised if either operand is a NaN.  The comparison is performed
5674 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5675 *----------------------------------------------------------------------------*/
5676 
5677 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5678 {
5679     flag aSign, bSign;
5680 
5681     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5682               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5683          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5684               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5685        ) {
5686         float_raise(float_flag_invalid, status);
5687         return 0;
5688     }
5689     aSign = extractFloatx80Sign( a );
5690     bSign = extractFloatx80Sign( b );
5691     if ( aSign != bSign ) {
5692         return
5693                aSign
5694             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5695                  != 0 );
5696     }
5697     return
5698           aSign ? lt128( b.high, b.low, a.high, a.low )
5699         : lt128( a.high, a.low, b.high, b.low );
5700 
5701 }
5702 
5703 /*----------------------------------------------------------------------------
5704 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5705 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5706 | either operand is a NaN.   The comparison is performed according to the
5707 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5708 *----------------------------------------------------------------------------*/
5709 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5710 {
5711     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5712               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5713          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5714               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5715        ) {
5716         float_raise(float_flag_invalid, status);
5717         return 1;
5718     }
5719     return 0;
5720 }
5721 
5722 /*----------------------------------------------------------------------------
5723 | Returns 1 if the extended double-precision floating-point value `a' is
5724 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5725 | cause an exception.  The comparison is performed according to the IEC/IEEE
5726 | Standard for Binary Floating-Point Arithmetic.
5727 *----------------------------------------------------------------------------*/
5728 
5729 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5730 {
5731 
5732     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5733               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5734          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5735               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5736        ) {
5737         if (    floatx80_is_signaling_nan( a )
5738              || floatx80_is_signaling_nan( b ) ) {
5739             float_raise(float_flag_invalid, status);
5740         }
5741         return 0;
5742     }
5743     return
5744            ( a.low == b.low )
5745         && (    ( a.high == b.high )
5746              || (    ( a.low == 0 )
5747                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5748            );
5749 
5750 }
5751 
5752 /*----------------------------------------------------------------------------
5753 | Returns 1 if the extended double-precision floating-point value `a' is less
5754 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5755 | do not cause an exception.  Otherwise, the comparison is performed according
5756 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5757 *----------------------------------------------------------------------------*/
5758 
5759 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5760 {
5761     flag aSign, bSign;
5762 
5763     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5764               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5765          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5766               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5767        ) {
5768         if (    floatx80_is_signaling_nan( a )
5769              || floatx80_is_signaling_nan( b ) ) {
5770             float_raise(float_flag_invalid, status);
5771         }
5772         return 0;
5773     }
5774     aSign = extractFloatx80Sign( a );
5775     bSign = extractFloatx80Sign( b );
5776     if ( aSign != bSign ) {
5777         return
5778                aSign
5779             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5780                  == 0 );
5781     }
5782     return
5783           aSign ? le128( b.high, b.low, a.high, a.low )
5784         : le128( a.high, a.low, b.high, b.low );
5785 
5786 }
5787 
5788 /*----------------------------------------------------------------------------
5789 | Returns 1 if the extended double-precision floating-point value `a' is less
5790 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5791 | an exception.  Otherwise, the comparison is performed according to the
5792 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5793 *----------------------------------------------------------------------------*/
5794 
5795 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5796 {
5797     flag aSign, bSign;
5798 
5799     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5800               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5801          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5802               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5803        ) {
5804         if (    floatx80_is_signaling_nan( a )
5805              || floatx80_is_signaling_nan( b ) ) {
5806             float_raise(float_flag_invalid, status);
5807         }
5808         return 0;
5809     }
5810     aSign = extractFloatx80Sign( a );
5811     bSign = extractFloatx80Sign( b );
5812     if ( aSign != bSign ) {
5813         return
5814                aSign
5815             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5816                  != 0 );
5817     }
5818     return
5819           aSign ? lt128( b.high, b.low, a.high, a.low )
5820         : lt128( a.high, a.low, b.high, b.low );
5821 
5822 }
5823 
5824 /*----------------------------------------------------------------------------
5825 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5826 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5827 | The comparison is performed according to the IEC/IEEE Standard for Binary
5828 | Floating-Point Arithmetic.
5829 *----------------------------------------------------------------------------*/
5830 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5831 {
5832     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5833               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5834          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5835               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5836        ) {
5837         if (    floatx80_is_signaling_nan( a )
5838              || floatx80_is_signaling_nan( b ) ) {
5839             float_raise(float_flag_invalid, status);
5840         }
5841         return 1;
5842     }
5843     return 0;
5844 }
5845 
5846 /*----------------------------------------------------------------------------
5847 | Returns the result of converting the quadruple-precision floating-point
5848 | value `a' to the 32-bit two's complement integer format.  The conversion
5849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5850 | Arithmetic---which means in particular that the conversion is rounded
5851 | according to the current rounding mode.  If `a' is a NaN, the largest
5852 | positive integer is returned.  Otherwise, if the conversion overflows, the
5853 | largest integer with the same sign as `a' is returned.
5854 *----------------------------------------------------------------------------*/
5855 
5856 int32_t float128_to_int32(float128 a, float_status *status)
5857 {
5858     flag aSign;
5859     int32_t aExp, shiftCount;
5860     uint64_t aSig0, aSig1;
5861 
5862     aSig1 = extractFloat128Frac1( a );
5863     aSig0 = extractFloat128Frac0( a );
5864     aExp = extractFloat128Exp( a );
5865     aSign = extractFloat128Sign( a );
5866     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5867     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5868     aSig0 |= ( aSig1 != 0 );
5869     shiftCount = 0x4028 - aExp;
5870     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5871     return roundAndPackInt32(aSign, aSig0, status);
5872 
5873 }
5874 
5875 /*----------------------------------------------------------------------------
5876 | Returns the result of converting the quadruple-precision floating-point
5877 | value `a' to the 32-bit two's complement integer format.  The conversion
5878 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5879 | Arithmetic, except that the conversion is always rounded toward zero.  If
5880 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5881 | conversion overflows, the largest integer with the same sign as `a' is
5882 | returned.
5883 *----------------------------------------------------------------------------*/
5884 
5885 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5886 {
5887     flag aSign;
5888     int32_t aExp, shiftCount;
5889     uint64_t aSig0, aSig1, savedASig;
5890     int32_t z;
5891 
5892     aSig1 = extractFloat128Frac1( a );
5893     aSig0 = extractFloat128Frac0( a );
5894     aExp = extractFloat128Exp( a );
5895     aSign = extractFloat128Sign( a );
5896     aSig0 |= ( aSig1 != 0 );
5897     if ( 0x401E < aExp ) {
5898         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5899         goto invalid;
5900     }
5901     else if ( aExp < 0x3FFF ) {
5902         if (aExp || aSig0) {
5903             status->float_exception_flags |= float_flag_inexact;
5904         }
5905         return 0;
5906     }
5907     aSig0 |= LIT64( 0x0001000000000000 );
5908     shiftCount = 0x402F - aExp;
5909     savedASig = aSig0;
5910     aSig0 >>= shiftCount;
5911     z = aSig0;
5912     if ( aSign ) z = - z;
5913     if ( ( z < 0 ) ^ aSign ) {
5914  invalid:
5915         float_raise(float_flag_invalid, status);
5916         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5917     }
5918     if ( ( aSig0<<shiftCount ) != savedASig ) {
5919         status->float_exception_flags |= float_flag_inexact;
5920     }
5921     return z;
5922 
5923 }
5924 
5925 /*----------------------------------------------------------------------------
5926 | Returns the result of converting the quadruple-precision floating-point
5927 | value `a' to the 64-bit two's complement integer format.  The conversion
5928 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5929 | Arithmetic---which means in particular that the conversion is rounded
5930 | according to the current rounding mode.  If `a' is a NaN, the largest
5931 | positive integer is returned.  Otherwise, if the conversion overflows, the
5932 | largest integer with the same sign as `a' is returned.
5933 *----------------------------------------------------------------------------*/
5934 
5935 int64_t float128_to_int64(float128 a, float_status *status)
5936 {
5937     flag aSign;
5938     int32_t aExp, shiftCount;
5939     uint64_t aSig0, aSig1;
5940 
5941     aSig1 = extractFloat128Frac1( a );
5942     aSig0 = extractFloat128Frac0( a );
5943     aExp = extractFloat128Exp( a );
5944     aSign = extractFloat128Sign( a );
5945     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5946     shiftCount = 0x402F - aExp;
5947     if ( shiftCount <= 0 ) {
5948         if ( 0x403E < aExp ) {
5949             float_raise(float_flag_invalid, status);
5950             if (    ! aSign
5951                  || (    ( aExp == 0x7FFF )
5952                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5953                     )
5954                ) {
5955                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5956             }
5957             return (int64_t) LIT64( 0x8000000000000000 );
5958         }
5959         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5960     }
5961     else {
5962         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5963     }
5964     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5965 
5966 }
5967 
5968 /*----------------------------------------------------------------------------
5969 | Returns the result of converting the quadruple-precision floating-point
5970 | value `a' to the 64-bit two's complement integer format.  The conversion
5971 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5972 | Arithmetic, except that the conversion is always rounded toward zero.
5973 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5974 | the conversion overflows, the largest integer with the same sign as `a' is
5975 | returned.
5976 *----------------------------------------------------------------------------*/
5977 
5978 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5979 {
5980     flag aSign;
5981     int32_t aExp, shiftCount;
5982     uint64_t aSig0, aSig1;
5983     int64_t z;
5984 
5985     aSig1 = extractFloat128Frac1( a );
5986     aSig0 = extractFloat128Frac0( a );
5987     aExp = extractFloat128Exp( a );
5988     aSign = extractFloat128Sign( a );
5989     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5990     shiftCount = aExp - 0x402F;
5991     if ( 0 < shiftCount ) {
5992         if ( 0x403E <= aExp ) {
5993             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5994             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5995                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5996                 if (aSig1) {
5997                     status->float_exception_flags |= float_flag_inexact;
5998                 }
5999             }
6000             else {
6001                 float_raise(float_flag_invalid, status);
6002                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6003                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6004                 }
6005             }
6006             return (int64_t) LIT64( 0x8000000000000000 );
6007         }
6008         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6009         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6010             status->float_exception_flags |= float_flag_inexact;
6011         }
6012     }
6013     else {
6014         if ( aExp < 0x3FFF ) {
6015             if ( aExp | aSig0 | aSig1 ) {
6016                 status->float_exception_flags |= float_flag_inexact;
6017             }
6018             return 0;
6019         }
6020         z = aSig0>>( - shiftCount );
6021         if (    aSig1
6022              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6023             status->float_exception_flags |= float_flag_inexact;
6024         }
6025     }
6026     if ( aSign ) z = - z;
6027     return z;
6028 
6029 }
6030 
6031 /*----------------------------------------------------------------------------
6032 | Returns the result of converting the quadruple-precision floating-point
6033 | value `a' to the single-precision floating-point format.  The conversion
6034 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6035 | Arithmetic.
6036 *----------------------------------------------------------------------------*/
6037 
6038 float32 float128_to_float32(float128 a, float_status *status)
6039 {
6040     flag aSign;
6041     int32_t aExp;
6042     uint64_t aSig0, aSig1;
6043     uint32_t zSig;
6044 
6045     aSig1 = extractFloat128Frac1( a );
6046     aSig0 = extractFloat128Frac0( a );
6047     aExp = extractFloat128Exp( a );
6048     aSign = extractFloat128Sign( a );
6049     if ( aExp == 0x7FFF ) {
6050         if ( aSig0 | aSig1 ) {
6051             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6052         }
6053         return packFloat32( aSign, 0xFF, 0 );
6054     }
6055     aSig0 |= ( aSig1 != 0 );
6056     shift64RightJamming( aSig0, 18, &aSig0 );
6057     zSig = aSig0;
6058     if ( aExp || zSig ) {
6059         zSig |= 0x40000000;
6060         aExp -= 0x3F81;
6061     }
6062     return roundAndPackFloat32(aSign, aExp, zSig, status);
6063 
6064 }
6065 
6066 /*----------------------------------------------------------------------------
6067 | Returns the result of converting the quadruple-precision floating-point
6068 | value `a' to the double-precision floating-point format.  The conversion
6069 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6070 | Arithmetic.
6071 *----------------------------------------------------------------------------*/
6072 
6073 float64 float128_to_float64(float128 a, float_status *status)
6074 {
6075     flag aSign;
6076     int32_t aExp;
6077     uint64_t aSig0, aSig1;
6078 
6079     aSig1 = extractFloat128Frac1( a );
6080     aSig0 = extractFloat128Frac0( a );
6081     aExp = extractFloat128Exp( a );
6082     aSign = extractFloat128Sign( a );
6083     if ( aExp == 0x7FFF ) {
6084         if ( aSig0 | aSig1 ) {
6085             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6086         }
6087         return packFloat64( aSign, 0x7FF, 0 );
6088     }
6089     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6090     aSig0 |= ( aSig1 != 0 );
6091     if ( aExp || aSig0 ) {
6092         aSig0 |= LIT64( 0x4000000000000000 );
6093         aExp -= 0x3C01;
6094     }
6095     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6096 
6097 }
6098 
6099 /*----------------------------------------------------------------------------
6100 | Returns the result of converting the quadruple-precision floating-point
6101 | value `a' to the extended double-precision floating-point format.  The
6102 | conversion is performed according to the IEC/IEEE Standard for Binary
6103 | Floating-Point Arithmetic.
6104 *----------------------------------------------------------------------------*/
6105 
6106 floatx80 float128_to_floatx80(float128 a, float_status *status)
6107 {
6108     flag aSign;
6109     int32_t aExp;
6110     uint64_t aSig0, aSig1;
6111 
6112     aSig1 = extractFloat128Frac1( a );
6113     aSig0 = extractFloat128Frac0( a );
6114     aExp = extractFloat128Exp( a );
6115     aSign = extractFloat128Sign( a );
6116     if ( aExp == 0x7FFF ) {
6117         if ( aSig0 | aSig1 ) {
6118             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6119         }
6120         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6121     }
6122     if ( aExp == 0 ) {
6123         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6124         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6125     }
6126     else {
6127         aSig0 |= LIT64( 0x0001000000000000 );
6128     }
6129     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6130     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6131 
6132 }
6133 
6134 /*----------------------------------------------------------------------------
6135 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6136 | returns the result as a quadruple-precision floating-point value.  The
6137 | operation is performed according to the IEC/IEEE Standard for Binary
6138 | Floating-Point Arithmetic.
6139 *----------------------------------------------------------------------------*/
6140 
6141 float128 float128_round_to_int(float128 a, float_status *status)
6142 {
6143     flag aSign;
6144     int32_t aExp;
6145     uint64_t lastBitMask, roundBitsMask;
6146     float128 z;
6147 
6148     aExp = extractFloat128Exp( a );
6149     if ( 0x402F <= aExp ) {
6150         if ( 0x406F <= aExp ) {
6151             if (    ( aExp == 0x7FFF )
6152                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6153                ) {
6154                 return propagateFloat128NaN(a, a, status);
6155             }
6156             return a;
6157         }
6158         lastBitMask = 1;
6159         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6160         roundBitsMask = lastBitMask - 1;
6161         z = a;
6162         switch (status->float_rounding_mode) {
6163         case float_round_nearest_even:
6164             if ( lastBitMask ) {
6165                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6166                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6167             }
6168             else {
6169                 if ( (int64_t) z.low < 0 ) {
6170                     ++z.high;
6171                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6172                 }
6173             }
6174             break;
6175         case float_round_ties_away:
6176             if (lastBitMask) {
6177                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6178             } else {
6179                 if ((int64_t) z.low < 0) {
6180                     ++z.high;
6181                 }
6182             }
6183             break;
6184         case float_round_to_zero:
6185             break;
6186         case float_round_up:
6187             if (!extractFloat128Sign(z)) {
6188                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6189             }
6190             break;
6191         case float_round_down:
6192             if (extractFloat128Sign(z)) {
6193                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6194             }
6195             break;
6196         default:
6197             abort();
6198         }
6199         z.low &= ~ roundBitsMask;
6200     }
6201     else {
6202         if ( aExp < 0x3FFF ) {
6203             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6204             status->float_exception_flags |= float_flag_inexact;
6205             aSign = extractFloat128Sign( a );
6206             switch (status->float_rounding_mode) {
6207              case float_round_nearest_even:
6208                 if (    ( aExp == 0x3FFE )
6209                      && (   extractFloat128Frac0( a )
6210                           | extractFloat128Frac1( a ) )
6211                    ) {
6212                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6213                 }
6214                 break;
6215             case float_round_ties_away:
6216                 if (aExp == 0x3FFE) {
6217                     return packFloat128(aSign, 0x3FFF, 0, 0);
6218                 }
6219                 break;
6220              case float_round_down:
6221                 return
6222                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6223                     : packFloat128( 0, 0, 0, 0 );
6224              case float_round_up:
6225                 return
6226                       aSign ? packFloat128( 1, 0, 0, 0 )
6227                     : packFloat128( 0, 0x3FFF, 0, 0 );
6228             }
6229             return packFloat128( aSign, 0, 0, 0 );
6230         }
6231         lastBitMask = 1;
6232         lastBitMask <<= 0x402F - aExp;
6233         roundBitsMask = lastBitMask - 1;
6234         z.low = 0;
6235         z.high = a.high;
6236         switch (status->float_rounding_mode) {
6237         case float_round_nearest_even:
6238             z.high += lastBitMask>>1;
6239             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6240                 z.high &= ~ lastBitMask;
6241             }
6242             break;
6243         case float_round_ties_away:
6244             z.high += lastBitMask>>1;
6245             break;
6246         case float_round_to_zero:
6247             break;
6248         case float_round_up:
6249             if (!extractFloat128Sign(z)) {
6250                 z.high |= ( a.low != 0 );
6251                 z.high += roundBitsMask;
6252             }
6253             break;
6254         case float_round_down:
6255             if (extractFloat128Sign(z)) {
6256                 z.high |= (a.low != 0);
6257                 z.high += roundBitsMask;
6258             }
6259             break;
6260         default:
6261             abort();
6262         }
6263         z.high &= ~ roundBitsMask;
6264     }
6265     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6266         status->float_exception_flags |= float_flag_inexact;
6267     }
6268     return z;
6269 
6270 }
6271 
6272 /*----------------------------------------------------------------------------
6273 | Returns the result of adding the absolute values of the quadruple-precision
6274 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6275 | before being returned.  `zSign' is ignored if the result is a NaN.
6276 | The addition is performed according to the IEC/IEEE Standard for Binary
6277 | Floating-Point Arithmetic.
6278 *----------------------------------------------------------------------------*/
6279 
6280 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6281                                 float_status *status)
6282 {
6283     int32_t aExp, bExp, zExp;
6284     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6285     int32_t expDiff;
6286 
6287     aSig1 = extractFloat128Frac1( a );
6288     aSig0 = extractFloat128Frac0( a );
6289     aExp = extractFloat128Exp( a );
6290     bSig1 = extractFloat128Frac1( b );
6291     bSig0 = extractFloat128Frac0( b );
6292     bExp = extractFloat128Exp( b );
6293     expDiff = aExp - bExp;
6294     if ( 0 < expDiff ) {
6295         if ( aExp == 0x7FFF ) {
6296             if (aSig0 | aSig1) {
6297                 return propagateFloat128NaN(a, b, status);
6298             }
6299             return a;
6300         }
6301         if ( bExp == 0 ) {
6302             --expDiff;
6303         }
6304         else {
6305             bSig0 |= LIT64( 0x0001000000000000 );
6306         }
6307         shift128ExtraRightJamming(
6308             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6309         zExp = aExp;
6310     }
6311     else if ( expDiff < 0 ) {
6312         if ( bExp == 0x7FFF ) {
6313             if (bSig0 | bSig1) {
6314                 return propagateFloat128NaN(a, b, status);
6315             }
6316             return packFloat128( zSign, 0x7FFF, 0, 0 );
6317         }
6318         if ( aExp == 0 ) {
6319             ++expDiff;
6320         }
6321         else {
6322             aSig0 |= LIT64( 0x0001000000000000 );
6323         }
6324         shift128ExtraRightJamming(
6325             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6326         zExp = bExp;
6327     }
6328     else {
6329         if ( aExp == 0x7FFF ) {
6330             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6331                 return propagateFloat128NaN(a, b, status);
6332             }
6333             return a;
6334         }
6335         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6336         if ( aExp == 0 ) {
6337             if (status->flush_to_zero) {
6338                 if (zSig0 | zSig1) {
6339                     float_raise(float_flag_output_denormal, status);
6340                 }
6341                 return packFloat128(zSign, 0, 0, 0);
6342             }
6343             return packFloat128( zSign, 0, zSig0, zSig1 );
6344         }
6345         zSig2 = 0;
6346         zSig0 |= LIT64( 0x0002000000000000 );
6347         zExp = aExp;
6348         goto shiftRight1;
6349     }
6350     aSig0 |= LIT64( 0x0001000000000000 );
6351     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6352     --zExp;
6353     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6354     ++zExp;
6355  shiftRight1:
6356     shift128ExtraRightJamming(
6357         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6358  roundAndPack:
6359     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6360 
6361 }
6362 
6363 /*----------------------------------------------------------------------------
6364 | Returns the result of subtracting the absolute values of the quadruple-
6365 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6366 | difference is negated before being returned.  `zSign' is ignored if the
6367 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6368 | Standard for Binary Floating-Point Arithmetic.
6369 *----------------------------------------------------------------------------*/
6370 
6371 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6372                                 float_status *status)
6373 {
6374     int32_t aExp, bExp, zExp;
6375     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6376     int32_t expDiff;
6377     float128 z;
6378 
6379     aSig1 = extractFloat128Frac1( a );
6380     aSig0 = extractFloat128Frac0( a );
6381     aExp = extractFloat128Exp( a );
6382     bSig1 = extractFloat128Frac1( b );
6383     bSig0 = extractFloat128Frac0( b );
6384     bExp = extractFloat128Exp( b );
6385     expDiff = aExp - bExp;
6386     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6387     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6388     if ( 0 < expDiff ) goto aExpBigger;
6389     if ( expDiff < 0 ) goto bExpBigger;
6390     if ( aExp == 0x7FFF ) {
6391         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6392             return propagateFloat128NaN(a, b, status);
6393         }
6394         float_raise(float_flag_invalid, status);
6395         z.low = float128_default_nan_low;
6396         z.high = float128_default_nan_high;
6397         return z;
6398     }
6399     if ( aExp == 0 ) {
6400         aExp = 1;
6401         bExp = 1;
6402     }
6403     if ( bSig0 < aSig0 ) goto aBigger;
6404     if ( aSig0 < bSig0 ) goto bBigger;
6405     if ( bSig1 < aSig1 ) goto aBigger;
6406     if ( aSig1 < bSig1 ) goto bBigger;
6407     return packFloat128(status->float_rounding_mode == float_round_down,
6408                         0, 0, 0);
6409  bExpBigger:
6410     if ( bExp == 0x7FFF ) {
6411         if (bSig0 | bSig1) {
6412             return propagateFloat128NaN(a, b, status);
6413         }
6414         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6415     }
6416     if ( aExp == 0 ) {
6417         ++expDiff;
6418     }
6419     else {
6420         aSig0 |= LIT64( 0x4000000000000000 );
6421     }
6422     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6423     bSig0 |= LIT64( 0x4000000000000000 );
6424  bBigger:
6425     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6426     zExp = bExp;
6427     zSign ^= 1;
6428     goto normalizeRoundAndPack;
6429  aExpBigger:
6430     if ( aExp == 0x7FFF ) {
6431         if (aSig0 | aSig1) {
6432             return propagateFloat128NaN(a, b, status);
6433         }
6434         return a;
6435     }
6436     if ( bExp == 0 ) {
6437         --expDiff;
6438     }
6439     else {
6440         bSig0 |= LIT64( 0x4000000000000000 );
6441     }
6442     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6443     aSig0 |= LIT64( 0x4000000000000000 );
6444  aBigger:
6445     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6446     zExp = aExp;
6447  normalizeRoundAndPack:
6448     --zExp;
6449     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6450                                          status);
6451 
6452 }
6453 
6454 /*----------------------------------------------------------------------------
6455 | Returns the result of adding the quadruple-precision floating-point values
6456 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6457 | for Binary Floating-Point Arithmetic.
6458 *----------------------------------------------------------------------------*/
6459 
6460 float128 float128_add(float128 a, float128 b, float_status *status)
6461 {
6462     flag aSign, bSign;
6463 
6464     aSign = extractFloat128Sign( a );
6465     bSign = extractFloat128Sign( b );
6466     if ( aSign == bSign ) {
6467         return addFloat128Sigs(a, b, aSign, status);
6468     }
6469     else {
6470         return subFloat128Sigs(a, b, aSign, status);
6471     }
6472 
6473 }
6474 
6475 /*----------------------------------------------------------------------------
6476 | Returns the result of subtracting the quadruple-precision floating-point
6477 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6478 | Standard for Binary Floating-Point Arithmetic.
6479 *----------------------------------------------------------------------------*/
6480 
6481 float128 float128_sub(float128 a, float128 b, float_status *status)
6482 {
6483     flag aSign, bSign;
6484 
6485     aSign = extractFloat128Sign( a );
6486     bSign = extractFloat128Sign( b );
6487     if ( aSign == bSign ) {
6488         return subFloat128Sigs(a, b, aSign, status);
6489     }
6490     else {
6491         return addFloat128Sigs(a, b, aSign, status);
6492     }
6493 
6494 }
6495 
6496 /*----------------------------------------------------------------------------
6497 | Returns the result of multiplying the quadruple-precision floating-point
6498 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6499 | Standard for Binary Floating-Point Arithmetic.
6500 *----------------------------------------------------------------------------*/
6501 
6502 float128 float128_mul(float128 a, float128 b, float_status *status)
6503 {
6504     flag aSign, bSign, zSign;
6505     int32_t aExp, bExp, zExp;
6506     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6507     float128 z;
6508 
6509     aSig1 = extractFloat128Frac1( a );
6510     aSig0 = extractFloat128Frac0( a );
6511     aExp = extractFloat128Exp( a );
6512     aSign = extractFloat128Sign( a );
6513     bSig1 = extractFloat128Frac1( b );
6514     bSig0 = extractFloat128Frac0( b );
6515     bExp = extractFloat128Exp( b );
6516     bSign = extractFloat128Sign( b );
6517     zSign = aSign ^ bSign;
6518     if ( aExp == 0x7FFF ) {
6519         if (    ( aSig0 | aSig1 )
6520              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6521             return propagateFloat128NaN(a, b, status);
6522         }
6523         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6524         return packFloat128( zSign, 0x7FFF, 0, 0 );
6525     }
6526     if ( bExp == 0x7FFF ) {
6527         if (bSig0 | bSig1) {
6528             return propagateFloat128NaN(a, b, status);
6529         }
6530         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6531  invalid:
6532             float_raise(float_flag_invalid, status);
6533             z.low = float128_default_nan_low;
6534             z.high = float128_default_nan_high;
6535             return z;
6536         }
6537         return packFloat128( zSign, 0x7FFF, 0, 0 );
6538     }
6539     if ( aExp == 0 ) {
6540         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6541         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6542     }
6543     if ( bExp == 0 ) {
6544         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6545         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6546     }
6547     zExp = aExp + bExp - 0x4000;
6548     aSig0 |= LIT64( 0x0001000000000000 );
6549     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6550     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6551     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6552     zSig2 |= ( zSig3 != 0 );
6553     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6554         shift128ExtraRightJamming(
6555             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6556         ++zExp;
6557     }
6558     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6559 
6560 }
6561 
6562 /*----------------------------------------------------------------------------
6563 | Returns the result of dividing the quadruple-precision floating-point value
6564 | `a' by the corresponding value `b'.  The operation is performed according to
6565 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6566 *----------------------------------------------------------------------------*/
6567 
6568 float128 float128_div(float128 a, float128 b, float_status *status)
6569 {
6570     flag aSign, bSign, zSign;
6571     int32_t aExp, bExp, zExp;
6572     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6573     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6574     float128 z;
6575 
6576     aSig1 = extractFloat128Frac1( a );
6577     aSig0 = extractFloat128Frac0( a );
6578     aExp = extractFloat128Exp( a );
6579     aSign = extractFloat128Sign( a );
6580     bSig1 = extractFloat128Frac1( b );
6581     bSig0 = extractFloat128Frac0( b );
6582     bExp = extractFloat128Exp( b );
6583     bSign = extractFloat128Sign( b );
6584     zSign = aSign ^ bSign;
6585     if ( aExp == 0x7FFF ) {
6586         if (aSig0 | aSig1) {
6587             return propagateFloat128NaN(a, b, status);
6588         }
6589         if ( bExp == 0x7FFF ) {
6590             if (bSig0 | bSig1) {
6591                 return propagateFloat128NaN(a, b, status);
6592             }
6593             goto invalid;
6594         }
6595         return packFloat128( zSign, 0x7FFF, 0, 0 );
6596     }
6597     if ( bExp == 0x7FFF ) {
6598         if (bSig0 | bSig1) {
6599             return propagateFloat128NaN(a, b, status);
6600         }
6601         return packFloat128( zSign, 0, 0, 0 );
6602     }
6603     if ( bExp == 0 ) {
6604         if ( ( bSig0 | bSig1 ) == 0 ) {
6605             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6606  invalid:
6607                 float_raise(float_flag_invalid, status);
6608                 z.low = float128_default_nan_low;
6609                 z.high = float128_default_nan_high;
6610                 return z;
6611             }
6612             float_raise(float_flag_divbyzero, status);
6613             return packFloat128( zSign, 0x7FFF, 0, 0 );
6614         }
6615         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6616     }
6617     if ( aExp == 0 ) {
6618         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6619         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6620     }
6621     zExp = aExp - bExp + 0x3FFD;
6622     shortShift128Left(
6623         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6624     shortShift128Left(
6625         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6626     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6627         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6628         ++zExp;
6629     }
6630     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6631     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6632     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6633     while ( (int64_t) rem0 < 0 ) {
6634         --zSig0;
6635         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6636     }
6637     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6638     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6639         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6640         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6641         while ( (int64_t) rem1 < 0 ) {
6642             --zSig1;
6643             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6644         }
6645         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6646     }
6647     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6648     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6649 
6650 }
6651 
6652 /*----------------------------------------------------------------------------
6653 | Returns the remainder of the quadruple-precision floating-point value `a'
6654 | with respect to the corresponding value `b'.  The operation is performed
6655 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6656 *----------------------------------------------------------------------------*/
6657 
6658 float128 float128_rem(float128 a, float128 b, float_status *status)
6659 {
6660     flag aSign, zSign;
6661     int32_t aExp, bExp, expDiff;
6662     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6663     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6664     int64_t sigMean0;
6665     float128 z;
6666 
6667     aSig1 = extractFloat128Frac1( a );
6668     aSig0 = extractFloat128Frac0( a );
6669     aExp = extractFloat128Exp( a );
6670     aSign = extractFloat128Sign( a );
6671     bSig1 = extractFloat128Frac1( b );
6672     bSig0 = extractFloat128Frac0( b );
6673     bExp = extractFloat128Exp( b );
6674     if ( aExp == 0x7FFF ) {
6675         if (    ( aSig0 | aSig1 )
6676              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6677             return propagateFloat128NaN(a, b, status);
6678         }
6679         goto invalid;
6680     }
6681     if ( bExp == 0x7FFF ) {
6682         if (bSig0 | bSig1) {
6683             return propagateFloat128NaN(a, b, status);
6684         }
6685         return a;
6686     }
6687     if ( bExp == 0 ) {
6688         if ( ( bSig0 | bSig1 ) == 0 ) {
6689  invalid:
6690             float_raise(float_flag_invalid, status);
6691             z.low = float128_default_nan_low;
6692             z.high = float128_default_nan_high;
6693             return z;
6694         }
6695         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6696     }
6697     if ( aExp == 0 ) {
6698         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6699         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6700     }
6701     expDiff = aExp - bExp;
6702     if ( expDiff < -1 ) return a;
6703     shortShift128Left(
6704         aSig0 | LIT64( 0x0001000000000000 ),
6705         aSig1,
6706         15 - ( expDiff < 0 ),
6707         &aSig0,
6708         &aSig1
6709     );
6710     shortShift128Left(
6711         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6712     q = le128( bSig0, bSig1, aSig0, aSig1 );
6713     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6714     expDiff -= 64;
6715     while ( 0 < expDiff ) {
6716         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6717         q = ( 4 < q ) ? q - 4 : 0;
6718         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6719         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6720         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6721         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6722         expDiff -= 61;
6723     }
6724     if ( -64 < expDiff ) {
6725         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6726         q = ( 4 < q ) ? q - 4 : 0;
6727         q >>= - expDiff;
6728         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6729         expDiff += 52;
6730         if ( expDiff < 0 ) {
6731             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6732         }
6733         else {
6734             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6735         }
6736         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6737         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6738     }
6739     else {
6740         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6741         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6742     }
6743     do {
6744         alternateASig0 = aSig0;
6745         alternateASig1 = aSig1;
6746         ++q;
6747         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6748     } while ( 0 <= (int64_t) aSig0 );
6749     add128(
6750         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6751     if (    ( sigMean0 < 0 )
6752          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6753         aSig0 = alternateASig0;
6754         aSig1 = alternateASig1;
6755     }
6756     zSign = ( (int64_t) aSig0 < 0 );
6757     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6758     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6759                                          status);
6760 }
6761 
6762 /*----------------------------------------------------------------------------
6763 | Returns the square root of the quadruple-precision floating-point value `a'.
6764 | The operation is performed according to the IEC/IEEE Standard for Binary
6765 | Floating-Point Arithmetic.
6766 *----------------------------------------------------------------------------*/
6767 
6768 float128 float128_sqrt(float128 a, float_status *status)
6769 {
6770     flag aSign;
6771     int32_t aExp, zExp;
6772     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6773     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6774     float128 z;
6775 
6776     aSig1 = extractFloat128Frac1( a );
6777     aSig0 = extractFloat128Frac0( a );
6778     aExp = extractFloat128Exp( a );
6779     aSign = extractFloat128Sign( a );
6780     if ( aExp == 0x7FFF ) {
6781         if (aSig0 | aSig1) {
6782             return propagateFloat128NaN(a, a, status);
6783         }
6784         if ( ! aSign ) return a;
6785         goto invalid;
6786     }
6787     if ( aSign ) {
6788         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6789  invalid:
6790         float_raise(float_flag_invalid, status);
6791         z.low = float128_default_nan_low;
6792         z.high = float128_default_nan_high;
6793         return z;
6794     }
6795     if ( aExp == 0 ) {
6796         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6797         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6798     }
6799     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6800     aSig0 |= LIT64( 0x0001000000000000 );
6801     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6802     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6803     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6804     doubleZSig0 = zSig0<<1;
6805     mul64To128( zSig0, zSig0, &term0, &term1 );
6806     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6807     while ( (int64_t) rem0 < 0 ) {
6808         --zSig0;
6809         doubleZSig0 -= 2;
6810         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6811     }
6812     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6813     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6814         if ( zSig1 == 0 ) zSig1 = 1;
6815         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6816         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6817         mul64To128( zSig1, zSig1, &term2, &term3 );
6818         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6819         while ( (int64_t) rem1 < 0 ) {
6820             --zSig1;
6821             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6822             term3 |= 1;
6823             term2 |= doubleZSig0;
6824             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6825         }
6826         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6827     }
6828     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6829     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6830 
6831 }
6832 
6833 /*----------------------------------------------------------------------------
6834 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6835 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6836 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6837 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6838 *----------------------------------------------------------------------------*/
6839 
6840 int float128_eq(float128 a, float128 b, float_status *status)
6841 {
6842 
6843     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6844               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6845          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6846               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6847        ) {
6848         float_raise(float_flag_invalid, status);
6849         return 0;
6850     }
6851     return
6852            ( a.low == b.low )
6853         && (    ( a.high == b.high )
6854              || (    ( a.low == 0 )
6855                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6856            );
6857 
6858 }
6859 
6860 /*----------------------------------------------------------------------------
6861 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6862 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6863 | exception is raised if either operand is a NaN.  The comparison is performed
6864 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6865 *----------------------------------------------------------------------------*/
6866 
6867 int float128_le(float128 a, float128 b, float_status *status)
6868 {
6869     flag aSign, bSign;
6870 
6871     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6872               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6873          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6874               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6875        ) {
6876         float_raise(float_flag_invalid, status);
6877         return 0;
6878     }
6879     aSign = extractFloat128Sign( a );
6880     bSign = extractFloat128Sign( b );
6881     if ( aSign != bSign ) {
6882         return
6883                aSign
6884             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6885                  == 0 );
6886     }
6887     return
6888           aSign ? le128( b.high, b.low, a.high, a.low )
6889         : le128( a.high, a.low, b.high, b.low );
6890 
6891 }
6892 
6893 /*----------------------------------------------------------------------------
6894 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6895 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6896 | raised if either operand is a NaN.  The comparison is performed according
6897 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6898 *----------------------------------------------------------------------------*/
6899 
6900 int float128_lt(float128 a, float128 b, float_status *status)
6901 {
6902     flag aSign, bSign;
6903 
6904     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6905               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6906          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6907               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6908        ) {
6909         float_raise(float_flag_invalid, status);
6910         return 0;
6911     }
6912     aSign = extractFloat128Sign( a );
6913     bSign = extractFloat128Sign( b );
6914     if ( aSign != bSign ) {
6915         return
6916                aSign
6917             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6918                  != 0 );
6919     }
6920     return
6921           aSign ? lt128( b.high, b.low, a.high, a.low )
6922         : lt128( a.high, a.low, b.high, b.low );
6923 
6924 }
6925 
6926 /*----------------------------------------------------------------------------
6927 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6928 | be compared, and 0 otherwise.  The invalid exception is raised if either
6929 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6930 | Standard for Binary Floating-Point Arithmetic.
6931 *----------------------------------------------------------------------------*/
6932 
6933 int float128_unordered(float128 a, float128 b, float_status *status)
6934 {
6935     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6936               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6937          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6938               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6939        ) {
6940         float_raise(float_flag_invalid, status);
6941         return 1;
6942     }
6943     return 0;
6944 }
6945 
6946 /*----------------------------------------------------------------------------
6947 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6948 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6949 | exception.  The comparison is performed according to the IEC/IEEE Standard
6950 | for Binary Floating-Point Arithmetic.
6951 *----------------------------------------------------------------------------*/
6952 
6953 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6954 {
6955 
6956     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6957               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6958          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6959               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6960        ) {
6961         if (    float128_is_signaling_nan( a )
6962              || float128_is_signaling_nan( b ) ) {
6963             float_raise(float_flag_invalid, status);
6964         }
6965         return 0;
6966     }
6967     return
6968            ( a.low == b.low )
6969         && (    ( a.high == b.high )
6970              || (    ( a.low == 0 )
6971                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6972            );
6973 
6974 }
6975 
6976 /*----------------------------------------------------------------------------
6977 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6978 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6979 | cause an exception.  Otherwise, the comparison is performed according to the
6980 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6981 *----------------------------------------------------------------------------*/
6982 
6983 int float128_le_quiet(float128 a, float128 b, float_status *status)
6984 {
6985     flag aSign, bSign;
6986 
6987     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6988               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6989          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6990               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6991        ) {
6992         if (    float128_is_signaling_nan( a )
6993              || float128_is_signaling_nan( b ) ) {
6994             float_raise(float_flag_invalid, status);
6995         }
6996         return 0;
6997     }
6998     aSign = extractFloat128Sign( a );
6999     bSign = extractFloat128Sign( b );
7000     if ( aSign != bSign ) {
7001         return
7002                aSign
7003             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7004                  == 0 );
7005     }
7006     return
7007           aSign ? le128( b.high, b.low, a.high, a.low )
7008         : le128( a.high, a.low, b.high, b.low );
7009 
7010 }
7011 
7012 /*----------------------------------------------------------------------------
7013 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7014 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7015 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7016 | Standard for Binary Floating-Point Arithmetic.
7017 *----------------------------------------------------------------------------*/
7018 
7019 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7020 {
7021     flag aSign, bSign;
7022 
7023     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7024               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7025          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7026               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7027        ) {
7028         if (    float128_is_signaling_nan( a )
7029              || float128_is_signaling_nan( b ) ) {
7030             float_raise(float_flag_invalid, status);
7031         }
7032         return 0;
7033     }
7034     aSign = extractFloat128Sign( a );
7035     bSign = extractFloat128Sign( b );
7036     if ( aSign != bSign ) {
7037         return
7038                aSign
7039             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7040                  != 0 );
7041     }
7042     return
7043           aSign ? lt128( b.high, b.low, a.high, a.low )
7044         : lt128( a.high, a.low, b.high, b.low );
7045 
7046 }
7047 
7048 /*----------------------------------------------------------------------------
7049 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7050 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7051 | comparison is performed according to the IEC/IEEE Standard for Binary
7052 | Floating-Point Arithmetic.
7053 *----------------------------------------------------------------------------*/
7054 
7055 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7056 {
7057     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7058               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7059          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7060               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7061        ) {
7062         if (    float128_is_signaling_nan( a )
7063              || float128_is_signaling_nan( b ) ) {
7064             float_raise(float_flag_invalid, status);
7065         }
7066         return 1;
7067     }
7068     return 0;
7069 }
7070 
7071 /* misc functions */
7072 float32 uint32_to_float32(uint32_t a, float_status *status)
7073 {
7074     return int64_to_float32(a, status);
7075 }
7076 
7077 float64 uint32_to_float64(uint32_t a, float_status *status)
7078 {
7079     return int64_to_float64(a, status);
7080 }
7081 
7082 uint32_t float32_to_uint32(float32 a, float_status *status)
7083 {
7084     int64_t v;
7085     uint32_t res;
7086     int old_exc_flags = get_float_exception_flags(status);
7087 
7088     v = float32_to_int64(a, status);
7089     if (v < 0) {
7090         res = 0;
7091     } else if (v > 0xffffffff) {
7092         res = 0xffffffff;
7093     } else {
7094         return v;
7095     }
7096     set_float_exception_flags(old_exc_flags, status);
7097     float_raise(float_flag_invalid, status);
7098     return res;
7099 }
7100 
7101 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7102 {
7103     int64_t v;
7104     uint32_t res;
7105     int old_exc_flags = get_float_exception_flags(status);
7106 
7107     v = float32_to_int64_round_to_zero(a, status);
7108     if (v < 0) {
7109         res = 0;
7110     } else if (v > 0xffffffff) {
7111         res = 0xffffffff;
7112     } else {
7113         return v;
7114     }
7115     set_float_exception_flags(old_exc_flags, status);
7116     float_raise(float_flag_invalid, status);
7117     return res;
7118 }
7119 
7120 int_fast16_t float32_to_int16(float32 a, float_status *status)
7121 {
7122     int32_t v;
7123     int_fast16_t res;
7124     int old_exc_flags = get_float_exception_flags(status);
7125 
7126     v = float32_to_int32(a, status);
7127     if (v < -0x8000) {
7128         res = -0x8000;
7129     } else if (v > 0x7fff) {
7130         res = 0x7fff;
7131     } else {
7132         return v;
7133     }
7134 
7135     set_float_exception_flags(old_exc_flags, status);
7136     float_raise(float_flag_invalid, status);
7137     return res;
7138 }
7139 
7140 uint_fast16_t float32_to_uint16(float32 a, float_status *status)
7141 {
7142     int32_t v;
7143     uint_fast16_t res;
7144     int old_exc_flags = get_float_exception_flags(status);
7145 
7146     v = float32_to_int32(a, status);
7147     if (v < 0) {
7148         res = 0;
7149     } else if (v > 0xffff) {
7150         res = 0xffff;
7151     } else {
7152         return v;
7153     }
7154 
7155     set_float_exception_flags(old_exc_flags, status);
7156     float_raise(float_flag_invalid, status);
7157     return res;
7158 }
7159 
7160 uint_fast16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7161 {
7162     int64_t v;
7163     uint_fast16_t res;
7164     int old_exc_flags = get_float_exception_flags(status);
7165 
7166     v = float32_to_int64_round_to_zero(a, status);
7167     if (v < 0) {
7168         res = 0;
7169     } else if (v > 0xffff) {
7170         res = 0xffff;
7171     } else {
7172         return v;
7173     }
7174     set_float_exception_flags(old_exc_flags, status);
7175     float_raise(float_flag_invalid, status);
7176     return res;
7177 }
7178 
7179 uint32_t float64_to_uint32(float64 a, float_status *status)
7180 {
7181     uint64_t v;
7182     uint32_t res;
7183     int old_exc_flags = get_float_exception_flags(status);
7184 
7185     v = float64_to_uint64(a, status);
7186     if (v > 0xffffffff) {
7187         res = 0xffffffff;
7188     } else {
7189         return v;
7190     }
7191     set_float_exception_flags(old_exc_flags, status);
7192     float_raise(float_flag_invalid, status);
7193     return res;
7194 }
7195 
7196 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7197 {
7198     uint64_t v;
7199     uint32_t res;
7200     int old_exc_flags = get_float_exception_flags(status);
7201 
7202     v = float64_to_uint64_round_to_zero(a, status);
7203     if (v > 0xffffffff) {
7204         res = 0xffffffff;
7205     } else {
7206         return v;
7207     }
7208     set_float_exception_flags(old_exc_flags, status);
7209     float_raise(float_flag_invalid, status);
7210     return res;
7211 }
7212 
7213 int_fast16_t float64_to_int16(float64 a, float_status *status)
7214 {
7215     int64_t v;
7216     int_fast16_t res;
7217     int old_exc_flags = get_float_exception_flags(status);
7218 
7219     v = float64_to_int32(a, status);
7220     if (v < -0x8000) {
7221         res = -0x8000;
7222     } else if (v > 0x7fff) {
7223         res = 0x7fff;
7224     } else {
7225         return v;
7226     }
7227 
7228     set_float_exception_flags(old_exc_flags, status);
7229     float_raise(float_flag_invalid, status);
7230     return res;
7231 }
7232 
7233 uint_fast16_t float64_to_uint16(float64 a, float_status *status)
7234 {
7235     int64_t v;
7236     uint_fast16_t res;
7237     int old_exc_flags = get_float_exception_flags(status);
7238 
7239     v = float64_to_int32(a, status);
7240     if (v < 0) {
7241         res = 0;
7242     } else if (v > 0xffff) {
7243         res = 0xffff;
7244     } else {
7245         return v;
7246     }
7247 
7248     set_float_exception_flags(old_exc_flags, status);
7249     float_raise(float_flag_invalid, status);
7250     return res;
7251 }
7252 
7253 uint_fast16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7254 {
7255     int64_t v;
7256     uint_fast16_t res;
7257     int old_exc_flags = get_float_exception_flags(status);
7258 
7259     v = float64_to_int64_round_to_zero(a, status);
7260     if (v < 0) {
7261         res = 0;
7262     } else if (v > 0xffff) {
7263         res = 0xffff;
7264     } else {
7265         return v;
7266     }
7267     set_float_exception_flags(old_exc_flags, status);
7268     float_raise(float_flag_invalid, status);
7269     return res;
7270 }
7271 
7272 /*----------------------------------------------------------------------------
7273 | Returns the result of converting the double-precision floating-point value
7274 | `a' to the 64-bit unsigned integer format.  The conversion is
7275 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7276 | Arithmetic---which means in particular that the conversion is rounded
7277 | according to the current rounding mode.  If `a' is a NaN, the largest
7278 | positive integer is returned.  If the conversion overflows, the
7279 | largest unsigned integer is returned.  If 'a' is negative, the value is
7280 | rounded and zero is returned; negative values that do not round to zero
7281 | will raise the inexact exception.
7282 *----------------------------------------------------------------------------*/
7283 
7284 uint64_t float64_to_uint64(float64 a, float_status *status)
7285 {
7286     flag aSign;
7287     int_fast16_t aExp, shiftCount;
7288     uint64_t aSig, aSigExtra;
7289     a = float64_squash_input_denormal(a, status);
7290 
7291     aSig = extractFloat64Frac(a);
7292     aExp = extractFloat64Exp(a);
7293     aSign = extractFloat64Sign(a);
7294     if (aSign && (aExp > 1022)) {
7295         float_raise(float_flag_invalid, status);
7296         if (float64_is_any_nan(a)) {
7297             return LIT64(0xFFFFFFFFFFFFFFFF);
7298         } else {
7299             return 0;
7300         }
7301     }
7302     if (aExp) {
7303         aSig |= LIT64(0x0010000000000000);
7304     }
7305     shiftCount = 0x433 - aExp;
7306     if (shiftCount <= 0) {
7307         if (0x43E < aExp) {
7308             float_raise(float_flag_invalid, status);
7309             return LIT64(0xFFFFFFFFFFFFFFFF);
7310         }
7311         aSigExtra = 0;
7312         aSig <<= -shiftCount;
7313     } else {
7314         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7315     }
7316     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7317 }
7318 
7319 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7320 {
7321     signed char current_rounding_mode = status->float_rounding_mode;
7322     set_float_rounding_mode(float_round_to_zero, status);
7323     int64_t v = float64_to_uint64(a, status);
7324     set_float_rounding_mode(current_rounding_mode, status);
7325     return v;
7326 }
7327 
7328 #define COMPARE(s, nan_exp)                                                  \
7329 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7330                                       int is_quiet, float_status *status)    \
7331 {                                                                            \
7332     flag aSign, bSign;                                                       \
7333     uint ## s ## _t av, bv;                                                  \
7334     a = float ## s ## _squash_input_denormal(a, status);                     \
7335     b = float ## s ## _squash_input_denormal(b, status);                     \
7336                                                                              \
7337     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7338          extractFloat ## s ## Frac( a ) ) ||                                 \
7339         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7340           extractFloat ## s ## Frac( b ) )) {                                \
7341         if (!is_quiet ||                                                     \
7342             float ## s ## _is_signaling_nan( a ) ||                          \
7343             float ## s ## _is_signaling_nan( b ) ) {                         \
7344             float_raise(float_flag_invalid, status);                         \
7345         }                                                                    \
7346         return float_relation_unordered;                                     \
7347     }                                                                        \
7348     aSign = extractFloat ## s ## Sign( a );                                  \
7349     bSign = extractFloat ## s ## Sign( b );                                  \
7350     av = float ## s ## _val(a);                                              \
7351     bv = float ## s ## _val(b);                                              \
7352     if ( aSign != bSign ) {                                                  \
7353         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7354             /* zero case */                                                  \
7355             return float_relation_equal;                                     \
7356         } else {                                                             \
7357             return 1 - (2 * aSign);                                          \
7358         }                                                                    \
7359     } else {                                                                 \
7360         if (av == bv) {                                                      \
7361             return float_relation_equal;                                     \
7362         } else {                                                             \
7363             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7364         }                                                                    \
7365     }                                                                        \
7366 }                                                                            \
7367                                                                              \
7368 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7369 {                                                                            \
7370     return float ## s ## _compare_internal(a, b, 0, status);                 \
7371 }                                                                            \
7372                                                                              \
7373 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7374                                  float_status *status)                       \
7375 {                                                                            \
7376     return float ## s ## _compare_internal(a, b, 1, status);                 \
7377 }
7378 
7379 COMPARE(32, 0xff)
7380 COMPARE(64, 0x7ff)
7381 
7382 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7383                                             int is_quiet, float_status *status)
7384 {
7385     flag aSign, bSign;
7386 
7387     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7388           ( extractFloatx80Frac( a )<<1 ) ) ||
7389         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7390           ( extractFloatx80Frac( b )<<1 ) )) {
7391         if (!is_quiet ||
7392             floatx80_is_signaling_nan( a ) ||
7393             floatx80_is_signaling_nan( b ) ) {
7394             float_raise(float_flag_invalid, status);
7395         }
7396         return float_relation_unordered;
7397     }
7398     aSign = extractFloatx80Sign( a );
7399     bSign = extractFloatx80Sign( b );
7400     if ( aSign != bSign ) {
7401 
7402         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7403              ( ( a.low | b.low ) == 0 ) ) {
7404             /* zero case */
7405             return float_relation_equal;
7406         } else {
7407             return 1 - (2 * aSign);
7408         }
7409     } else {
7410         if (a.low == b.low && a.high == b.high) {
7411             return float_relation_equal;
7412         } else {
7413             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7414         }
7415     }
7416 }
7417 
7418 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7419 {
7420     return floatx80_compare_internal(a, b, 0, status);
7421 }
7422 
7423 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7424 {
7425     return floatx80_compare_internal(a, b, 1, status);
7426 }
7427 
7428 static inline int float128_compare_internal(float128 a, float128 b,
7429                                             int is_quiet, float_status *status)
7430 {
7431     flag aSign, bSign;
7432 
7433     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7434           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7435         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7436           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7437         if (!is_quiet ||
7438             float128_is_signaling_nan( a ) ||
7439             float128_is_signaling_nan( b ) ) {
7440             float_raise(float_flag_invalid, status);
7441         }
7442         return float_relation_unordered;
7443     }
7444     aSign = extractFloat128Sign( a );
7445     bSign = extractFloat128Sign( b );
7446     if ( aSign != bSign ) {
7447         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7448             /* zero case */
7449             return float_relation_equal;
7450         } else {
7451             return 1 - (2 * aSign);
7452         }
7453     } else {
7454         if (a.low == b.low && a.high == b.high) {
7455             return float_relation_equal;
7456         } else {
7457             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7458         }
7459     }
7460 }
7461 
7462 int float128_compare(float128 a, float128 b, float_status *status)
7463 {
7464     return float128_compare_internal(a, b, 0, status);
7465 }
7466 
7467 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7468 {
7469     return float128_compare_internal(a, b, 1, status);
7470 }
7471 
7472 /* min() and max() functions. These can't be implemented as
7473  * 'compare and pick one input' because that would mishandle
7474  * NaNs and +0 vs -0.
7475  *
7476  * minnum() and maxnum() functions. These are similar to the min()
7477  * and max() functions but if one of the arguments is a QNaN and
7478  * the other is numerical then the numerical argument is returned.
7479  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7480  * and maxNum() operations. min() and max() are the typical min/max
7481  * semantics provided by many CPUs which predate that specification.
7482  *
 * minnummag() and maxnummag() functions correspond to the minNumMag()
 * and maxNumMag() operations from IEEE 754-2008.
7485  */
7486 #define MINMAX(s)                                                       \
7487 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7488                                                int ismin, int isieee,   \
7489                                                int ismag,               \
7490                                                float_status *status)    \
7491 {                                                                       \
7492     flag aSign, bSign;                                                  \
7493     uint ## s ## _t av, bv, aav, abv;                                   \
7494     a = float ## s ## _squash_input_denormal(a, status);                \
7495     b = float ## s ## _squash_input_denormal(b, status);                \
7496     if (float ## s ## _is_any_nan(a) ||                                 \
7497         float ## s ## _is_any_nan(b)) {                                 \
7498         if (isieee) {                                                   \
7499             if (float ## s ## _is_quiet_nan(a) &&                       \
7500                 !float ## s ##_is_any_nan(b)) {                         \
7501                 return b;                                               \
7502             } else if (float ## s ## _is_quiet_nan(b) &&                \
7503                        !float ## s ## _is_any_nan(a)) {                 \
7504                 return a;                                               \
7505             }                                                           \
7506         }                                                               \
7507         return propagateFloat ## s ## NaN(a, b, status);                \
7508     }                                                                   \
7509     aSign = extractFloat ## s ## Sign(a);                               \
7510     bSign = extractFloat ## s ## Sign(b);                               \
7511     av = float ## s ## _val(a);                                         \
7512     bv = float ## s ## _val(b);                                         \
7513     if (ismag) {                                                        \
7514         aav = float ## s ## _abs(av);                                   \
7515         abv = float ## s ## _abs(bv);                                   \
7516         if (aav != abv) {                                               \
7517             if (ismin) {                                                \
7518                 return (aav < abv) ? a : b;                             \
7519             } else {                                                    \
7520                 return (aav < abv) ? b : a;                             \
7521             }                                                           \
7522         }                                                               \
7523     }                                                                   \
7524     if (aSign != bSign) {                                               \
7525         if (ismin) {                                                    \
7526             return aSign ? a : b;                                       \
7527         } else {                                                        \
7528             return aSign ? b : a;                                       \
7529         }                                                               \
7530     } else {                                                            \
7531         if (ismin) {                                                    \
7532             return (aSign ^ (av < bv)) ? a : b;                         \
7533         } else {                                                        \
7534             return (aSign ^ (av < bv)) ? b : a;                         \
7535         }                                                               \
7536     }                                                                   \
7537 }                                                                       \
7538                                                                         \
7539 float ## s float ## s ## _min(float ## s a, float ## s b,               \
7540                               float_status *status)                     \
7541 {                                                                       \
7542     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7543 }                                                                       \
7544                                                                         \
7545 float ## s float ## s ## _max(float ## s a, float ## s b,               \
7546                               float_status *status)                     \
7547 {                                                                       \
7548     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7549 }                                                                       \
7550                                                                         \
7551 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7552                                  float_status *status)                  \
7553 {                                                                       \
7554     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7555 }                                                                       \
7556                                                                         \
7557 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7558                                  float_status *status)                  \
7559 {                                                                       \
7560     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7561 }                                                                       \
7562                                                                         \
7563 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7564                                     float_status *status)               \
7565 {                                                                       \
7566     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7567 }                                                                       \
7568                                                                         \
7569 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7570                                     float_status *status)               \
7571 {                                                                       \
7572     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7573 }
7574 
7575 MINMAX(32)
7576 MINMAX(64)
7577 
7578 
7579 /* Multiply A by 2 raised to the power N.  */
7580 float32 float32_scalbn(float32 a, int n, float_status *status)
7581 {
7582     flag aSign;
7583     int16_t aExp;
7584     uint32_t aSig;
7585 
7586     a = float32_squash_input_denormal(a, status);
7587     aSig = extractFloat32Frac( a );
7588     aExp = extractFloat32Exp( a );
7589     aSign = extractFloat32Sign( a );
7590 
7591     if ( aExp == 0xFF ) {
7592         if ( aSig ) {
7593             return propagateFloat32NaN(a, a, status);
7594         }
7595         return a;
7596     }
7597     if (aExp != 0) {
7598         aSig |= 0x00800000;
7599     } else if (aSig == 0) {
7600         return a;
7601     } else {
7602         aExp++;
7603     }
7604 
7605     if (n > 0x200) {
7606         n = 0x200;
7607     } else if (n < -0x200) {
7608         n = -0x200;
7609     }
7610 
7611     aExp += n - 1;
7612     aSig <<= 7;
7613     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7614 }
7615 
7616 float64 float64_scalbn(float64 a, int n, float_status *status)
7617 {
7618     flag aSign;
7619     int16_t aExp;
7620     uint64_t aSig;
7621 
7622     a = float64_squash_input_denormal(a, status);
7623     aSig = extractFloat64Frac( a );
7624     aExp = extractFloat64Exp( a );
7625     aSign = extractFloat64Sign( a );
7626 
7627     if ( aExp == 0x7FF ) {
7628         if ( aSig ) {
7629             return propagateFloat64NaN(a, a, status);
7630         }
7631         return a;
7632     }
7633     if (aExp != 0) {
7634         aSig |= LIT64( 0x0010000000000000 );
7635     } else if (aSig == 0) {
7636         return a;
7637     } else {
7638         aExp++;
7639     }
7640 
7641     if (n > 0x1000) {
7642         n = 0x1000;
7643     } else if (n < -0x1000) {
7644         n = -0x1000;
7645     }
7646 
7647     aExp += n - 1;
7648     aSig <<= 10;
7649     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7650 }
7651 
7652 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7653 {
7654     flag aSign;
7655     int32_t aExp;
7656     uint64_t aSig;
7657 
7658     aSig = extractFloatx80Frac( a );
7659     aExp = extractFloatx80Exp( a );
7660     aSign = extractFloatx80Sign( a );
7661 
7662     if ( aExp == 0x7FFF ) {
7663         if ( aSig<<1 ) {
7664             return propagateFloatx80NaN(a, a, status);
7665         }
7666         return a;
7667     }
7668 
7669     if (aExp == 0) {
7670         if (aSig == 0) {
7671             return a;
7672         }
7673         aExp++;
7674     }
7675 
7676     if (n > 0x10000) {
7677         n = 0x10000;
7678     } else if (n < -0x10000) {
7679         n = -0x10000;
7680     }
7681 
7682     aExp += n;
7683     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7684                                          aSign, aExp, aSig, 0, status);
7685 }
7686 
7687 float128 float128_scalbn(float128 a, int n, float_status *status)
7688 {
7689     flag aSign;
7690     int32_t aExp;
7691     uint64_t aSig0, aSig1;
7692 
7693     aSig1 = extractFloat128Frac1( a );
7694     aSig0 = extractFloat128Frac0( a );
7695     aExp = extractFloat128Exp( a );
7696     aSign = extractFloat128Sign( a );
7697     if ( aExp == 0x7FFF ) {
7698         if ( aSig0 | aSig1 ) {
7699             return propagateFloat128NaN(a, a, status);
7700         }
7701         return a;
7702     }
7703     if (aExp != 0) {
7704         aSig0 |= LIT64( 0x0001000000000000 );
7705     } else if (aSig0 == 0 && aSig1 == 0) {
7706         return a;
7707     } else {
7708         aExp++;
7709     }
7710 
7711     if (n > 0x10000) {
7712         n = 0x10000;
7713     } else if (n < -0x10000) {
7714         n = -0x10000;
7715     }
7716 
7717     aExp += n - 1;
7718     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7719                                          , status);
7720 
7721 }
7722