xref: /qemu/fpu/softfloat.c (revision abff1abf)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is neither fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
185 /*
186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
187  * hardfloat functions. Each combination of number of inputs and float size
188  * gets its own value.
189  */
/* The float32 flavors use the float32_* primitives on every host. */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

/* The float64 flavors use fpclassify on x86_64 hosts only. */
#ifdef __x86_64__
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205 
206 /*
207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208  * float{32,64}_is_infinity when !USE_FP.
209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211  */
/* isinf() is enabled on x86_64 and aarch64 hosts only. */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   0
#else
# define QEMU_HARDFLOAT_USE_ISINF   1
#endif
217 
218 /*
219  * Some targets clear the FP flags before most FP operations. This prevents
220  * the use of hardfloat, since hardfloat relies on the inexact flag being
221  * already set.
222  */
#if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
#endif

/*
 * QEMU_NO_HARDFLOAT is 1 for TARGET_PPC builds and for -ffast-math builds,
 * 0 otherwise. Only when hardfloat stays enabled is noinline added to
 * QEMU_SOFTFLOAT_ATTR.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
/*
 * View of one 32-bit value either as the softfloat type (s) or as the
 * host's native float (h). float32_gen2 stores through .s and hands .h to
 * the host operation.
 */
typedef union {
    float32 s;
    float h;
} union_float32;

/* Same as union_float32, for float64 (s) and the host's double (h). */
typedef union {
    float64 s;
    double h;
} union_float64;
265 
/* Operand predicates; used as the pre/post checks of float{32,64}_gen2. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Softfloat implementation of a two-operand operation. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
/* Host implementation of a two-operand operation, on native types. */
typedef float   (*hard_f32_op2_fn)(float a, float b);
typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 static inline float32
343 float32_gen2(float32 xa, float32 xb, float_status *s,
344              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
345              f32_check_fn pre, f32_check_fn post)
346 {
347     union_float32 ua, ub, ur;
348 
349     ua.s = xa;
350     ub.s = xb;
351 
352     if (unlikely(!can_use_fpu(s))) {
353         goto soft;
354     }
355 
356     float32_input_flush2(&ua.s, &ub.s, s);
357     if (unlikely(!pre(ua, ub))) {
358         goto soft;
359     }
360 
361     ur.h = hard(ua.h, ub.h);
362     if (unlikely(f32_is_inf(ur))) {
363         s->float_exception_flags |= float_flag_overflow;
364     } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
365         goto soft;
366     }
367     return ur.s;
368 
369  soft:
370     return soft(ua.s, ub.s, s);
371 }
372 
373 static inline float64
374 float64_gen2(float64 xa, float64 xb, float_status *s,
375              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
376              f64_check_fn pre, f64_check_fn post)
377 {
378     union_float64 ua, ub, ur;
379 
380     ua.s = xa;
381     ub.s = xb;
382 
383     if (unlikely(!can_use_fpu(s))) {
384         goto soft;
385     }
386 
387     float64_input_flush2(&ua.s, &ub.s, s);
388     if (unlikely(!pre(ua, ub))) {
389         goto soft;
390     }
391 
392     ur.h = hard(ua.h, ub.h);
393     if (unlikely(f64_is_inf(ur))) {
394         s->float_exception_flags |= float_flag_overflow;
395     } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
396         goto soft;
397     }
398     return ur.s;
399 
400  soft:
401     return soft(ua.s, ub.s, s);
402 }
403 
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407 
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410     return float32_val(a) & 0x007FFFFF;
411 }
412 
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416 
417 static inline int extractFloat32Exp(float32 a)
418 {
419     return (float32_val(a) >> 23) & 0xFF;
420 }
421 
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425 
426 static inline bool extractFloat32Sign(float32 a)
427 {
428     return float32_val(a) >> 31;
429 }
430 
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434 
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439 
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443 
444 static inline int extractFloat64Exp(float64 a)
445 {
446     return (float64_val(a) >> 52) & 0x7FF;
447 }
448 
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452 
453 static inline bool extractFloat64Sign(float64 a)
454 {
455     return float64_val(a) >> 63;
456 }
457 
/*
 * Classify a floating point number. The NaN classes are deliberately the
 * last enumerators: float_class_qnan and everything after it is a NaN, so
 * cls >= float_class_qnan tests for any NaN (see is_nan()).
 *
 * The enum is packed so that it is stored in the smallest suitable integer
 * type.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,  /* set by unpack_raw, before canonicalization */
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
471 
472 /* Simple helpers for checking if, or what kind of, NaN we have */
473 static inline __attribute__((unused)) bool is_nan(FloatClass c)
474 {
475     return unlikely(c >= float_class_qnan);
476 }
477 
478 static inline __attribute__((unused)) bool is_snan(FloatClass c)
479 {
480     return c == float_class_snan;
481 }
482 
483 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
484 {
485     return c == float_class_qnan;
486 }
487 
488 /*
489  * Structure holding all of the decomposed parts of a float. The
490  * exponent is unbiased and the fraction is normalized. All
491  * calculations are done with a 64 bit fraction and then rounded as
492  * appropriate for the final format.
493  *
494  * Thanks to the packed FloatClass a decent compiler should be able to
495  * fit the whole structure into registers and avoid using the stack
496  * for parameter passing.
497  */
498 
typedef struct {
    /*
     * Fraction. unpack_raw stores the raw field here; sf_canonicalize
     * shifts it up so that normal numbers carry DECOMPOSED_IMPLICIT_BIT.
     */
    uint64_t frac;
    /* Exponent: raw (biased) from unpack_raw, unbiased once canonical. */
    int32_t  exp;
    /* float_class_unclassified until sf_canonicalize sets it. */
    FloatClass cls;
    /* Sign bit of the value. */
    bool sign;
} FloatParts;
505 
/* Bit position of the implicit (integer) bit in a canonical fraction. */
#define DECOMPOSED_BINARY_POINT    (64 - 2)
/* The implicit bit itself: bit 62 of the 64-bit fraction. */
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
/* Bit 63: set when rounding carries out of the implicit bit. */
#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
509 
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 *
 * Instances are normally initialised with FLOAT_PARAMS(), which derives
 * every field except arm_althp from the exponent and fraction sizes.
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;
535 
/*
 * Expand fields based on the size of exponent (E) and fraction (F).
 *
 * Produces the designated initialisers for every FloatFmt field except
 * arm_althp. Both arguments are parenthesised at each use, so an expression
 * argument (e.g. FLOAT_PARAMS(4 + 1, 8 + 2)) yields the same values as the
 * equivalent literal would.
 */
#define FLOAT_PARAMS(E, F)                                             \
    .exp_size       = (E),                                             \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                           \
    .exp_max        = (1 << (E)) - 1,                                  \
    .frac_size      = (F),                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
547 
/* float16 layout: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};
551 
/*
 * Same field sizes as float16 (5 exponent bits, 10 fraction bits), with the
 * ARM Alternative Half Precision modifier set.
 */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};
556 
/* float32 layout: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};
560 
/* float64 layout: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
564 
565 /* Unpack a float to parts, but do not canonicalize.  */
566 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
567 {
568     const int sign_pos = fmt.frac_size + fmt.exp_size;
569 
570     return (FloatParts) {
571         .cls = float_class_unclassified,
572         .sign = extract64(raw, sign_pos, 1),
573         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
574         .frac = extract64(raw, 0, fmt.frac_size),
575     };
576 }
577 
578 static inline FloatParts float16_unpack_raw(float16 f)
579 {
580     return unpack_raw(float16_params, f);
581 }
582 
583 static inline FloatParts float32_unpack_raw(float32 f)
584 {
585     return unpack_raw(float32_params, f);
586 }
587 
588 static inline FloatParts float64_unpack_raw(float64 f)
589 {
590     return unpack_raw(float64_params, f);
591 }
592 
593 /* Pack a float from parts, but do not canonicalize.  */
594 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
595 {
596     const int sign_pos = fmt.frac_size + fmt.exp_size;
597     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
598     return deposit64(ret, sign_pos, 1, p.sign);
599 }
600 
601 static inline float16 float16_pack_raw(FloatParts p)
602 {
603     return make_float16(pack_raw(float16_params, p));
604 }
605 
606 static inline float32 float32_pack_raw(FloatParts p)
607 {
608     return make_float32(pack_raw(float32_params, p));
609 }
610 
611 static inline float64 float64_pack_raw(FloatParts p)
612 {
613     return make_float64(pack_raw(float64_params, p));
614 }
615 
616 /*----------------------------------------------------------------------------
617 | Functions and definitions to determine:  (1) whether tininess for underflow
618 | is detected before or after rounding by default, (2) what (if anything)
619 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
620 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
621 | are propagated from function inputs to output.  These details are target-
622 | specific.
623 *----------------------------------------------------------------------------*/
624 #include "softfloat-specialize.c.inc"
625 
626 /* Canonicalize EXP and FRAC, setting CLS.  */
627 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
628                                   float_status *status)
629 {
630     if (part.exp == parm->exp_max && !parm->arm_althp) {
631         if (part.frac == 0) {
632             part.cls = float_class_inf;
633         } else {
634             part.frac <<= parm->frac_shift;
635             part.cls = (parts_is_snan_frac(part.frac, status)
636                         ? float_class_snan : float_class_qnan);
637         }
638     } else if (part.exp == 0) {
639         if (likely(part.frac == 0)) {
640             part.cls = float_class_zero;
641         } else if (status->flush_inputs_to_zero) {
642             float_raise(float_flag_input_denormal, status);
643             part.cls = float_class_zero;
644             part.frac = 0;
645         } else {
646             int shift = clz64(part.frac) - 1;
647             part.cls = float_class_normal;
648             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
649             part.frac <<= shift;
650         }
651     } else {
652         part.cls = float_class_normal;
653         part.exp -= parm->exp_bias;
654         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
655     }
656     return part;
657 }
658 
/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [0, EXP_MAX-1].
 *
 * The rounding mode, flush_to_zero and tininess_before_rounding are read
 * from @s. Exception flags are accumulated locally and raised once, via
 * float_raise(), just before returning. The returned parts have exp and
 * frac in the format's raw encoding; cls may have been changed to zero or
 * inf by rounding.
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Pick the rounding increment (inc) and whether an overflow yields
         * the largest normal number (overflow_norm) rather than infinity.
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /* An exact tie with an even lsb gets no increment. */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            /* Only increment when the lsb is currently even. */
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Biased exponent is positive: the result is a normal number. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    /* Rounding carried out of the implicit bit. */
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    /* Note: replaces, rather than adds to, earlier flags. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Largest finite value: all-ones fraction. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            /* Would be denormal, but outputs are flushed to zero. */
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Denormal result. It is tiny if tininess is detected before
             * rounding, if the biased exponent is negative, or if rounding
             * at the current position does not carry into the overflow bit.
             */
            bool is_tiny = s->tininess_before_rounding
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            /*
             * NOTE(review): shift64RightJamming is defined outside this
             * chunk; presumably it ORs shifted-out bits into the lsb.
             */
            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                default:
                    /* Other modes keep the increment chosen above. */
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have carried up into the implicit bit. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        /* The alt-HP format has no encoding for infinity. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        /* Nor does the alt-HP format have an encoding for NaN. */
        assert(!parm->arm_althp);
        exp = exp_max;
        /* Move the fraction back down to its field; no rounding is done. */
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
810 
811 /* Explicit FloatFmt version */
812 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
813                                             const FloatFmt *params)
814 {
815     return sf_canonicalize(float16_unpack_raw(f), params, s);
816 }
817 
818 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
819 {
820     return float16a_unpack_canonical(f, s, &float16_params);
821 }
822 
823 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
824                                              const FloatFmt *params)
825 {
826     return float16_pack_raw(round_canonical(p, s, params));
827 }
828 
829 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
830 {
831     return float16a_round_pack_canonical(p, s, &float16_params);
832 }
833 
834 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
835 {
836     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
837 }
838 
839 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
840 {
841     return float32_pack_raw(round_canonical(p, s, &float32_params));
842 }
843 
844 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
845 {
846     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
847 }
848 
849 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
850 {
851     return float64_pack_raw(round_canonical(p, s, &float64_params));
852 }
853 
854 static FloatParts return_nan(FloatParts a, float_status *s)
855 {
856     switch (a.cls) {
857     case float_class_snan:
858         s->float_exception_flags |= float_flag_invalid;
859         a = parts_silence_nan(a, s);
860         /* fall through */
861     case float_class_qnan:
862         if (s->default_nan_mode) {
863             return parts_default_nan(s);
864         }
865         break;
866 
867     default:
868         g_assert_not_reached();
869     }
870     return a;
871 }
872 
873 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
874 {
875     if (is_snan(a.cls) || is_snan(b.cls)) {
876         s->float_exception_flags |= float_flag_invalid;
877     }
878 
879     if (s->default_nan_mode) {
880         return parts_default_nan(s);
881     } else {
882         if (pickNaN(a.cls, b.cls,
883                     a.frac > b.frac ||
884                     (a.frac == b.frac && a.sign < b.sign))) {
885             a = b;
886         }
887         if (is_snan(a.cls)) {
888             return parts_silence_nan(a, s);
889         }
890     }
891     return a;
892 }
893 
894 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
895                                   bool inf_zero, float_status *s)
896 {
897     int which;
898 
899     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
900         s->float_exception_flags |= float_flag_invalid;
901     }
902 
903     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
904 
905     if (s->default_nan_mode) {
906         /* Note that this check is after pickNaNMulAdd so that function
907          * has an opportunity to set the Invalid flag.
908          */
909         which = 3;
910     }
911 
912     switch (which) {
913     case 0:
914         break;
915     case 1:
916         a = b;
917         break;
918     case 2:
919         a = c;
920         break;
921     case 3:
922         return parts_default_nan(s);
923     default:
924         g_assert_not_reached();
925     }
926 
927     if (is_snan(a.cls)) {
928         return parts_silence_nan(a, s);
929     }
930     return a;
931 }
932 
933 /*
934  * Returns the result of adding or subtracting the values of the
935  * floating-point values `a' and `b'. The operation is performed
936  * according to the IEC/IEEE Standard for Binary Floating-Point
937  * Arithmetic.
938  */
939 
940 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
941                                 float_status *s)
942 {
943     bool a_sign = a.sign;
944     bool b_sign = b.sign ^ subtract;
945 
946     if (a_sign != b_sign) {
947         /* Subtraction */
948 
949         if (a.cls == float_class_normal && b.cls == float_class_normal) {
950             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
951                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
952                 a.frac = a.frac - b.frac;
953             } else {
954                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
955                 a.frac = b.frac - a.frac;
956                 a.exp = b.exp;
957                 a_sign ^= 1;
958             }
959 
960             if (a.frac == 0) {
961                 a.cls = float_class_zero;
962                 a.sign = s->float_rounding_mode == float_round_down;
963             } else {
964                 int shift = clz64(a.frac) - 1;
965                 a.frac = a.frac << shift;
966                 a.exp = a.exp - shift;
967                 a.sign = a_sign;
968             }
969             return a;
970         }
971         if (is_nan(a.cls) || is_nan(b.cls)) {
972             return pick_nan(a, b, s);
973         }
974         if (a.cls == float_class_inf) {
975             if (b.cls == float_class_inf) {
976                 float_raise(float_flag_invalid, s);
977                 return parts_default_nan(s);
978             }
979             return a;
980         }
981         if (a.cls == float_class_zero && b.cls == float_class_zero) {
982             a.sign = s->float_rounding_mode == float_round_down;
983             return a;
984         }
985         if (a.cls == float_class_zero || b.cls == float_class_inf) {
986             b.sign = a_sign ^ 1;
987             return b;
988         }
989         if (b.cls == float_class_zero) {
990             return a;
991         }
992     } else {
993         /* Addition */
994         if (a.cls == float_class_normal && b.cls == float_class_normal) {
995             if (a.exp > b.exp) {
996                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
997             } else if (a.exp < b.exp) {
998                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
999                 a.exp = b.exp;
1000             }
1001             a.frac += b.frac;
1002             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1003                 shift64RightJamming(a.frac, 1, &a.frac);
1004                 a.exp += 1;
1005             }
1006             return a;
1007         }
1008         if (is_nan(a.cls) || is_nan(b.cls)) {
1009             return pick_nan(a, b, s);
1010         }
1011         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1012             return a;
1013         }
1014         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1015             b.sign = b_sign;
1016             return b;
1017         }
1018     }
1019     g_assert_not_reached();
1020 }
1021 
1022 /*
1023  * Returns the result of adding or subtracting the floating-point
1024  * values `a' and `b'. The operation is performed according to the
1025  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1026  */
1027 
1028 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1029 {
1030     FloatParts pa = float16_unpack_canonical(a, status);
1031     FloatParts pb = float16_unpack_canonical(b, status);
1032     FloatParts pr = addsub_floats(pa, pb, false, status);
1033 
1034     return float16_round_pack_canonical(pr, status);
1035 }
1036 
1037 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1038 {
1039     FloatParts pa = float16_unpack_canonical(a, status);
1040     FloatParts pb = float16_unpack_canonical(b, status);
1041     FloatParts pr = addsub_floats(pa, pb, true, status);
1042 
1043     return float16_round_pack_canonical(pr, status);
1044 }
1045 
1046 static float32 QEMU_SOFTFLOAT_ATTR
1047 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1048 {
1049     FloatParts pa = float32_unpack_canonical(a, status);
1050     FloatParts pb = float32_unpack_canonical(b, status);
1051     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1052 
1053     return float32_round_pack_canonical(pr, status);
1054 }
1055 
1056 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1057 {
1058     return soft_f32_addsub(a, b, false, status);
1059 }
1060 
1061 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1062 {
1063     return soft_f32_addsub(a, b, true, status);
1064 }
1065 
1066 static float64 QEMU_SOFTFLOAT_ATTR
1067 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1068 {
1069     FloatParts pa = float64_unpack_canonical(a, status);
1070     FloatParts pb = float64_unpack_canonical(b, status);
1071     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1072 
1073     return float64_round_pack_canonical(pr, status);
1074 }
1075 
1076 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1077 {
1078     return soft_f64_addsub(a, b, false, status);
1079 }
1080 
1081 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1082 {
1083     return soft_f64_addsub(a, b, true, status);
1084 }
1085 
/* Host-FPU single-precision addition. */
static float hard_f32_add(float a, float b)
{
    const float sum = a + b;

    return sum;
}
1090 
/* Host-FPU single-precision subtraction (a - b). */
static float hard_f32_sub(float a, float b)
{
    const float diff = a - b;

    return diff;
}
1095 
/* Host-FPU double-precision addition. */
static double hard_f64_add(double a, double b)
{
    const double sum = a + b;

    return sum;
}
1100 
/* Host-FPU double-precision subtraction (a - b). */
static double hard_f64_sub(double a, double b)
{
    const double diff = a - b;

    return diff;
}
1105 
1106 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1107 {
1108     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1109         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1110     }
1111     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1112 }
1113 
1114 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1115 {
1116     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1117         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1118     } else {
1119         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1120     }
1121 }
1122 
1123 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1124                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1125 {
1126     return float32_gen2(a, b, s, hard, soft,
1127                         f32_is_zon2, f32_addsubmul_post);
1128 }
1129 
1130 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1131                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1132 {
1133     return float64_gen2(a, b, s, hard, soft,
1134                         f64_is_zon2, f64_addsubmul_post);
1135 }
1136 
1137 float32 QEMU_FLATTEN
1138 float32_add(float32 a, float32 b, float_status *s)
1139 {
1140     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1141 }
1142 
1143 float32 QEMU_FLATTEN
1144 float32_sub(float32 a, float32 b, float_status *s)
1145 {
1146     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1147 }
1148 
1149 float64 QEMU_FLATTEN
1150 float64_add(float64 a, float64 b, float_status *s)
1151 {
1152     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1153 }
1154 
1155 float64 QEMU_FLATTEN
1156 float64_sub(float64 a, float64 b, float_status *s)
1157 {
1158     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1159 }
1160 
1161 /*
1162  * Returns the result of multiplying the floating-point values `a' and
1163  * `b'. The operation is performed according to the IEC/IEEE Standard
1164  * for Binary Floating-Point Arithmetic.
1165  */
1166 
1167 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1168 {
1169     bool sign = a.sign ^ b.sign;
1170 
1171     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1172         uint64_t hi, lo;
1173         int exp = a.exp + b.exp;
1174 
1175         mul64To128(a.frac, b.frac, &hi, &lo);
1176         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1177         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1178             shift64RightJamming(lo, 1, &lo);
1179             exp += 1;
1180         }
1181 
1182         /* Re-use a */
1183         a.exp = exp;
1184         a.sign = sign;
1185         a.frac = lo;
1186         return a;
1187     }
1188     /* handle all the NaN cases */
1189     if (is_nan(a.cls) || is_nan(b.cls)) {
1190         return pick_nan(a, b, s);
1191     }
1192     /* Inf * Zero == NaN */
1193     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1194         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1195         s->float_exception_flags |= float_flag_invalid;
1196         return parts_default_nan(s);
1197     }
1198     /* Multiply by 0 or Inf */
1199     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1200         a.sign = sign;
1201         return a;
1202     }
1203     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1204         b.sign = sign;
1205         return b;
1206     }
1207     g_assert_not_reached();
1208 }
1209 
1210 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1211 {
1212     FloatParts pa = float16_unpack_canonical(a, status);
1213     FloatParts pb = float16_unpack_canonical(b, status);
1214     FloatParts pr = mul_floats(pa, pb, status);
1215 
1216     return float16_round_pack_canonical(pr, status);
1217 }
1218 
1219 static float32 QEMU_SOFTFLOAT_ATTR
1220 soft_f32_mul(float32 a, float32 b, float_status *status)
1221 {
1222     FloatParts pa = float32_unpack_canonical(a, status);
1223     FloatParts pb = float32_unpack_canonical(b, status);
1224     FloatParts pr = mul_floats(pa, pb, status);
1225 
1226     return float32_round_pack_canonical(pr, status);
1227 }
1228 
1229 static float64 QEMU_SOFTFLOAT_ATTR
1230 soft_f64_mul(float64 a, float64 b, float_status *status)
1231 {
1232     FloatParts pa = float64_unpack_canonical(a, status);
1233     FloatParts pb = float64_unpack_canonical(b, status);
1234     FloatParts pr = mul_floats(pa, pb, status);
1235 
1236     return float64_round_pack_canonical(pr, status);
1237 }
1238 
/* Host-FPU single-precision multiplication. */
static float hard_f32_mul(float a, float b)
{
    const float product = a * b;

    return product;
}
1243 
/* Host-FPU double-precision multiplication. */
static double hard_f64_mul(double a, double b)
{
    const double product = a * b;

    return product;
}
1248 
1249 float32 QEMU_FLATTEN
1250 float32_mul(float32 a, float32 b, float_status *s)
1251 {
1252     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1253                         f32_is_zon2, f32_addsubmul_post);
1254 }
1255 
1256 float64 QEMU_FLATTEN
1257 float64_mul(float64 a, float64 b, float_status *s)
1258 {
1259     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1260                         f64_is_zon2, f64_addsubmul_post);
1261 }
1262 
1263 /*
1264  * Returns the result of multiplying the floating-point values `a' and
1265  * `b' then adding 'c', with no intermediate rounding step after the
1266  * multiplication. The operation is performed according to the
1267  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1268  * The flags argument allows the caller to select negation of the
1269  * addend, the intermediate product, or the final result. (The
1270  * difference between this and having the caller do a separate
1271  * negation is that negating externally will flip the sign bit on
1272  * NaNs.)
1273  */
1274 
1275 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1276                                 int flags, float_status *s)
1277 {
1278     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1279                     ((1 << float_class_inf) | (1 << float_class_zero));
1280     bool p_sign;
1281     bool sign_flip = flags & float_muladd_negate_result;
1282     FloatClass p_class;
1283     uint64_t hi, lo;
1284     int p_exp;
1285 
1286     /* It is implementation-defined whether the cases of (0,inf,qnan)
1287      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1288      * they return if they do), so we have to hand this information
1289      * off to the target-specific pick-a-NaN routine.
1290      */
1291     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1292         return pick_nan_muladd(a, b, c, inf_zero, s);
1293     }
1294 
1295     if (inf_zero) {
1296         s->float_exception_flags |= float_flag_invalid;
1297         return parts_default_nan(s);
1298     }
1299 
1300     if (flags & float_muladd_negate_c) {
1301         c.sign ^= 1;
1302     }
1303 
1304     p_sign = a.sign ^ b.sign;
1305 
1306     if (flags & float_muladd_negate_product) {
1307         p_sign ^= 1;
1308     }
1309 
1310     if (a.cls == float_class_inf || b.cls == float_class_inf) {
1311         p_class = float_class_inf;
1312     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1313         p_class = float_class_zero;
1314     } else {
1315         p_class = float_class_normal;
1316     }
1317 
1318     if (c.cls == float_class_inf) {
1319         if (p_class == float_class_inf && p_sign != c.sign) {
1320             s->float_exception_flags |= float_flag_invalid;
1321             return parts_default_nan(s);
1322         } else {
1323             a.cls = float_class_inf;
1324             a.sign = c.sign ^ sign_flip;
1325             return a;
1326         }
1327     }
1328 
1329     if (p_class == float_class_inf) {
1330         a.cls = float_class_inf;
1331         a.sign = p_sign ^ sign_flip;
1332         return a;
1333     }
1334 
1335     if (p_class == float_class_zero) {
1336         if (c.cls == float_class_zero) {
1337             if (p_sign != c.sign) {
1338                 p_sign = s->float_rounding_mode == float_round_down;
1339             }
1340             c.sign = p_sign;
1341         } else if (flags & float_muladd_halve_result) {
1342             c.exp -= 1;
1343         }
1344         c.sign ^= sign_flip;
1345         return c;
1346     }
1347 
1348     /* a & b should be normals now... */
1349     assert(a.cls == float_class_normal &&
1350            b.cls == float_class_normal);
1351 
1352     p_exp = a.exp + b.exp;
1353 
1354     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1355      * result.
1356      */
1357     mul64To128(a.frac, b.frac, &hi, &lo);
1358     /* binary point now at bit 124 */
1359 
1360     /* check for overflow */
1361     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1362         shift128RightJamming(hi, lo, 1, &hi, &lo);
1363         p_exp += 1;
1364     }
1365 
1366     /* + add/sub */
1367     if (c.cls == float_class_zero) {
1368         /* move binary point back to 62 */
1369         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1370     } else {
1371         int exp_diff = p_exp - c.exp;
1372         if (p_sign == c.sign) {
1373             /* Addition */
1374             if (exp_diff <= 0) {
1375                 shift128RightJamming(hi, lo,
1376                                      DECOMPOSED_BINARY_POINT - exp_diff,
1377                                      &hi, &lo);
1378                 lo += c.frac;
1379                 p_exp = c.exp;
1380             } else {
1381                 uint64_t c_hi, c_lo;
1382                 /* shift c to the same binary point as the product (124) */
1383                 c_hi = c.frac >> 2;
1384                 c_lo = 0;
1385                 shift128RightJamming(c_hi, c_lo,
1386                                      exp_diff,
1387                                      &c_hi, &c_lo);
1388                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1389                 /* move binary point back to 62 */
1390                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1391             }
1392 
1393             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1394                 shift64RightJamming(lo, 1, &lo);
1395                 p_exp += 1;
1396             }
1397 
1398         } else {
1399             /* Subtraction */
1400             uint64_t c_hi, c_lo;
1401             /* make C binary point match product at bit 124 */
1402             c_hi = c.frac >> 2;
1403             c_lo = 0;
1404 
1405             if (exp_diff <= 0) {
1406                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1407                 if (exp_diff == 0
1408                     &&
1409                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1410                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1411                 } else {
1412                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1413                     p_sign ^= 1;
1414                     p_exp = c.exp;
1415                 }
1416             } else {
1417                 shift128RightJamming(c_hi, c_lo,
1418                                      exp_diff,
1419                                      &c_hi, &c_lo);
1420                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1421             }
1422 
1423             if (hi == 0 && lo == 0) {
1424                 a.cls = float_class_zero;
1425                 a.sign = s->float_rounding_mode == float_round_down;
1426                 a.sign ^= sign_flip;
1427                 return a;
1428             } else {
1429                 int shift;
1430                 if (hi != 0) {
1431                     shift = clz64(hi);
1432                 } else {
1433                     shift = clz64(lo) + 64;
1434                 }
1435                 /* Normalizing to a binary point of 124 is the
1436                    correct adjust for the exponent.  However since we're
1437                    shifting, we might as well put the binary point back
1438                    at 62 where we really want it.  Therefore shift as
1439                    if we're leaving 1 bit at the top of the word, but
1440                    adjust the exponent as if we're leaving 3 bits.  */
1441                 shift -= 1;
1442                 if (shift >= 64) {
1443                     lo = lo << (shift - 64);
1444                 } else {
1445                     hi = (hi << shift) | (lo >> (64 - shift));
1446                     lo = hi | ((lo << shift) != 0);
1447                 }
1448                 p_exp -= shift - 2;
1449             }
1450         }
1451     }
1452 
1453     if (flags & float_muladd_halve_result) {
1454         p_exp -= 1;
1455     }
1456 
1457     /* finally prepare our result */
1458     a.cls = float_class_normal;
1459     a.sign = p_sign ^ sign_flip;
1460     a.exp = p_exp;
1461     a.frac = lo;
1462 
1463     return a;
1464 }
1465 
1466 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1467                                                 int flags, float_status *status)
1468 {
1469     FloatParts pa = float16_unpack_canonical(a, status);
1470     FloatParts pb = float16_unpack_canonical(b, status);
1471     FloatParts pc = float16_unpack_canonical(c, status);
1472     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1473 
1474     return float16_round_pack_canonical(pr, status);
1475 }
1476 
1477 static float32 QEMU_SOFTFLOAT_ATTR
1478 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1479                 float_status *status)
1480 {
1481     FloatParts pa = float32_unpack_canonical(a, status);
1482     FloatParts pb = float32_unpack_canonical(b, status);
1483     FloatParts pc = float32_unpack_canonical(c, status);
1484     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1485 
1486     return float32_round_pack_canonical(pr, status);
1487 }
1488 
1489 static float64 QEMU_SOFTFLOAT_ATTR
1490 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1491                 float_status *status)
1492 {
1493     FloatParts pa = float64_unpack_canonical(a, status);
1494     FloatParts pb = float64_unpack_canonical(b, status);
1495     FloatParts pc = float64_unpack_canonical(c, status);
1496     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1497 
1498     return float64_round_pack_canonical(pr, status);
1499 }
1500 
1501 static bool force_soft_fma;
1502 
1503 float32 QEMU_FLATTEN
1504 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1505 {
1506     union_float32 ua, ub, uc, ur;
1507 
1508     ua.s = xa;
1509     ub.s = xb;
1510     uc.s = xc;
1511 
1512     if (unlikely(!can_use_fpu(s))) {
1513         goto soft;
1514     }
1515     if (unlikely(flags & float_muladd_halve_result)) {
1516         goto soft;
1517     }
1518 
1519     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1520     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1521         goto soft;
1522     }
1523 
1524     if (unlikely(force_soft_fma)) {
1525         goto soft;
1526     }
1527 
1528     /*
1529      * When (a || b) == 0, there's no need to check for under/over flow,
1530      * since we know the addend is (normal || 0) and the product is 0.
1531      */
1532     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1533         union_float32 up;
1534         bool prod_sign;
1535 
1536         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1537         prod_sign ^= !!(flags & float_muladd_negate_product);
1538         up.s = float32_set_sign(float32_zero, prod_sign);
1539 
1540         if (flags & float_muladd_negate_c) {
1541             uc.h = -uc.h;
1542         }
1543         ur.h = up.h + uc.h;
1544     } else {
1545         union_float32 ua_orig = ua;
1546         union_float32 uc_orig = uc;
1547 
1548         if (flags & float_muladd_negate_product) {
1549             ua.h = -ua.h;
1550         }
1551         if (flags & float_muladd_negate_c) {
1552             uc.h = -uc.h;
1553         }
1554 
1555         ur.h = fmaf(ua.h, ub.h, uc.h);
1556 
1557         if (unlikely(f32_is_inf(ur))) {
1558             s->float_exception_flags |= float_flag_overflow;
1559         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1560             ua = ua_orig;
1561             uc = uc_orig;
1562             goto soft;
1563         }
1564     }
1565     if (flags & float_muladd_negate_result) {
1566         return float32_chs(ur.s);
1567     }
1568     return ur.s;
1569 
1570  soft:
1571     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1572 }
1573 
1574 float64 QEMU_FLATTEN
1575 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1576 {
1577     union_float64 ua, ub, uc, ur;
1578 
1579     ua.s = xa;
1580     ub.s = xb;
1581     uc.s = xc;
1582 
1583     if (unlikely(!can_use_fpu(s))) {
1584         goto soft;
1585     }
1586     if (unlikely(flags & float_muladd_halve_result)) {
1587         goto soft;
1588     }
1589 
1590     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1591     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1592         goto soft;
1593     }
1594 
1595     if (unlikely(force_soft_fma)) {
1596         goto soft;
1597     }
1598 
1599     /*
1600      * When (a || b) == 0, there's no need to check for under/over flow,
1601      * since we know the addend is (normal || 0) and the product is 0.
1602      */
1603     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1604         union_float64 up;
1605         bool prod_sign;
1606 
1607         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1608         prod_sign ^= !!(flags & float_muladd_negate_product);
1609         up.s = float64_set_sign(float64_zero, prod_sign);
1610 
1611         if (flags & float_muladd_negate_c) {
1612             uc.h = -uc.h;
1613         }
1614         ur.h = up.h + uc.h;
1615     } else {
1616         union_float64 ua_orig = ua;
1617         union_float64 uc_orig = uc;
1618 
1619         if (flags & float_muladd_negate_product) {
1620             ua.h = -ua.h;
1621         }
1622         if (flags & float_muladd_negate_c) {
1623             uc.h = -uc.h;
1624         }
1625 
1626         ur.h = fma(ua.h, ub.h, uc.h);
1627 
1628         if (unlikely(f64_is_inf(ur))) {
1629             s->float_exception_flags |= float_flag_overflow;
1630         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1631             ua = ua_orig;
1632             uc = uc_orig;
1633             goto soft;
1634         }
1635     }
1636     if (flags & float_muladd_negate_result) {
1637         return float64_chs(ur.s);
1638     }
1639     return ur.s;
1640 
1641  soft:
1642     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1643 }
1644 
1645 /*
1646  * Returns the result of dividing the floating-point value `a' by the
1647  * corresponding value `b'. The operation is performed according to
1648  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1649  */
1650 
1651 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1652 {
1653     bool sign = a.sign ^ b.sign;
1654 
1655     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1656         uint64_t n0, n1, q, r;
1657         int exp = a.exp - b.exp;
1658 
1659         /*
1660          * We want a 2*N / N-bit division to produce exactly an N-bit
1661          * result, so that we do not lose any precision and so that we
1662          * do not have to renormalize afterward.  If A.frac < B.frac,
1663          * then division would produce an (N-1)-bit result; shift A left
1664          * by one to produce the an N-bit result, and decrement the
1665          * exponent to match.
1666          *
1667          * The udiv_qrnnd algorithm that we're using requires normalization,
1668          * i.e. the msb of the denominator must be set.  Since we know that
1669          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1670          * by one (more), and the remainder must be shifted right by one.
1671          */
1672         if (a.frac < b.frac) {
1673             exp -= 1;
1674             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1675         } else {
1676             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1677         }
1678         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1679 
1680         /*
1681          * Set lsb if there is a remainder, to set inexact.
1682          * As mentioned above, to find the actual value of the remainder we
1683          * would need to shift right, but (1) we are only concerned about
1684          * non-zero-ness, and (2) the remainder will always be even because
1685          * both inputs to the division primitive are even.
1686          */
1687         a.frac = q | (r != 0);
1688         a.sign = sign;
1689         a.exp = exp;
1690         return a;
1691     }
1692     /* handle all the NaN cases */
1693     if (is_nan(a.cls) || is_nan(b.cls)) {
1694         return pick_nan(a, b, s);
1695     }
1696     /* 0/0 or Inf/Inf */
1697     if (a.cls == b.cls
1698         &&
1699         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1700         s->float_exception_flags |= float_flag_invalid;
1701         return parts_default_nan(s);
1702     }
1703     /* Inf / x or 0 / x */
1704     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1705         a.sign = sign;
1706         return a;
1707     }
1708     /* Div 0 => Inf */
1709     if (b.cls == float_class_zero) {
1710         s->float_exception_flags |= float_flag_divbyzero;
1711         a.cls = float_class_inf;
1712         a.sign = sign;
1713         return a;
1714     }
1715     /* Div by Inf */
1716     if (b.cls == float_class_inf) {
1717         a.cls = float_class_zero;
1718         a.sign = sign;
1719         return a;
1720     }
1721     g_assert_not_reached();
1722 }
1723 
1724 float16 float16_div(float16 a, float16 b, float_status *status)
1725 {
1726     FloatParts pa = float16_unpack_canonical(a, status);
1727     FloatParts pb = float16_unpack_canonical(b, status);
1728     FloatParts pr = div_floats(pa, pb, status);
1729 
1730     return float16_round_pack_canonical(pr, status);
1731 }
1732 
1733 static float32 QEMU_SOFTFLOAT_ATTR
1734 soft_f32_div(float32 a, float32 b, float_status *status)
1735 {
1736     FloatParts pa = float32_unpack_canonical(a, status);
1737     FloatParts pb = float32_unpack_canonical(b, status);
1738     FloatParts pr = div_floats(pa, pb, status);
1739 
1740     return float32_round_pack_canonical(pr, status);
1741 }
1742 
1743 static float64 QEMU_SOFTFLOAT_ATTR
1744 soft_f64_div(float64 a, float64 b, float_status *status)
1745 {
1746     FloatParts pa = float64_unpack_canonical(a, status);
1747     FloatParts pb = float64_unpack_canonical(b, status);
1748     FloatParts pr = div_floats(pa, pb, status);
1749 
1750     return float64_round_pack_canonical(pr, status);
1751 }
1752 
/* Divide two host floats; float32_div() passes this to float32_gen2(). */
static float hard_f32_div(float a, float b)
{
    const float quotient = a / b;

    return quotient;
}
1757 
/* Divide two host doubles; float64_div() passes this to float64_gen2(). */
static double hard_f64_div(double a, double b)
{
    const double quotient = a / b;

    return quotient;
}
1762 
1763 static bool f32_div_pre(union_float32 a, union_float32 b)
1764 {
1765     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1766         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1767                fpclassify(b.h) == FP_NORMAL;
1768     }
1769     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1770 }
1771 
1772 static bool f64_div_pre(union_float64 a, union_float64 b)
1773 {
1774     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1775         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1776                fpclassify(b.h) == FP_NORMAL;
1777     }
1778     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1779 }
1780 
1781 static bool f32_div_post(union_float32 a, union_float32 b)
1782 {
1783     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1784         return fpclassify(a.h) != FP_ZERO;
1785     }
1786     return !float32_is_zero(a.s);
1787 }
1788 
1789 static bool f64_div_post(union_float64 a, union_float64 b)
1790 {
1791     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1792         return fpclassify(a.h) != FP_ZERO;
1793     }
1794     return !float64_is_zero(a.s);
1795 }
1796 
1797 float32 QEMU_FLATTEN
1798 float32_div(float32 a, float32 b, float_status *s)
1799 {
1800     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1801                         f32_div_pre, f32_div_post);
1802 }
1803 
1804 float64 QEMU_FLATTEN
1805 float64_div(float64 a, float64 b, float_status *s)
1806 {
1807     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1808                         f64_div_pre, f64_div_post);
1809 }
1810 
1811 /*
1812  * Float to Float conversions
1813  *
1814  * Returns the result of converting one float format to another. The
1815  * conversion is performed according to the IEC/IEEE Standard for
1816  * Binary Floating-Point Arithmetic.
1817  *
1818  * The float_to_float helper only needs to take care of raising
1819  * invalid exceptions and handling the conversion on NaNs.
1820  */
1821 
1822 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1823                                  float_status *s)
1824 {
1825     if (dstf->arm_althp) {
1826         switch (a.cls) {
1827         case float_class_qnan:
1828         case float_class_snan:
1829             /* There is no NaN in the destination format.  Raise Invalid
1830              * and return a zero with the sign of the input NaN.
1831              */
1832             s->float_exception_flags |= float_flag_invalid;
1833             a.cls = float_class_zero;
1834             a.frac = 0;
1835             a.exp = 0;
1836             break;
1837 
1838         case float_class_inf:
1839             /* There is no Inf in the destination format.  Raise Invalid
1840              * and return the maximum normal with the correct sign.
1841              */
1842             s->float_exception_flags |= float_flag_invalid;
1843             a.cls = float_class_normal;
1844             a.exp = dstf->exp_max;
1845             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1846             break;
1847 
1848         default:
1849             break;
1850         }
1851     } else if (is_nan(a.cls)) {
1852         if (is_snan(a.cls)) {
1853             s->float_exception_flags |= float_flag_invalid;
1854             a = parts_silence_nan(a, s);
1855         }
1856         if (s->default_nan_mode) {
1857             return parts_default_nan(s);
1858         }
1859     }
1860     return a;
1861 }
1862 
1863 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1864 {
1865     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1866     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1867     FloatParts pr = float_to_float(p, &float32_params, s);
1868     return float32_round_pack_canonical(pr, s);
1869 }
1870 
1871 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1872 {
1873     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1874     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1875     FloatParts pr = float_to_float(p, &float64_params, s);
1876     return float64_round_pack_canonical(pr, s);
1877 }
1878 
1879 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1880 {
1881     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1882     FloatParts p = float32_unpack_canonical(a, s);
1883     FloatParts pr = float_to_float(p, fmt16, s);
1884     return float16a_round_pack_canonical(pr, s, fmt16);
1885 }
1886 
1887 static float64 QEMU_SOFTFLOAT_ATTR
1888 soft_float32_to_float64(float32 a, float_status *s)
1889 {
1890     FloatParts p = float32_unpack_canonical(a, s);
1891     FloatParts pr = float_to_float(p, &float64_params, s);
1892     return float64_round_pack_canonical(pr, s);
1893 }
1894 
1895 float64 float32_to_float64(float32 a, float_status *s)
1896 {
1897     if (likely(float32_is_normal(a))) {
1898         /* Widening conversion can never produce inexact results.  */
1899         union_float32 uf;
1900         union_float64 ud;
1901         uf.s = a;
1902         ud.h = uf.h;
1903         return ud.s;
1904     } else if (float32_is_zero(a)) {
1905         return float64_set_sign(float64_zero, float32_is_neg(a));
1906     } else {
1907         return soft_float32_to_float64(a, s);
1908     }
1909 }
1910 
1911 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1912 {
1913     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1914     FloatParts p = float64_unpack_canonical(a, s);
1915     FloatParts pr = float_to_float(p, fmt16, s);
1916     return float16a_round_pack_canonical(pr, s, fmt16);
1917 }
1918 
1919 float32 float64_to_float32(float64 a, float_status *s)
1920 {
1921     FloatParts p = float64_unpack_canonical(a, s);
1922     FloatParts pr = float_to_float(p, &float32_params, s);
1923     return float32_round_pack_canonical(pr, s);
1924 }
1925 
1926 /*
1927  * Rounds the floating-point value `a' to an integer, and returns the
1928  * result as a floating-point value. The operation is performed
1929  * according to the IEC/IEEE Standard for Binary Floating-Point
1930  * Arithmetic.
1931  */
1932 
1933 static FloatParts round_to_int(FloatParts a, FloatRoundMode rmode,
1934                                int scale, float_status *s)
1935 {
1936     switch (a.cls) {
1937     case float_class_qnan:
1938     case float_class_snan:
1939         return return_nan(a, s);
1940 
1941     case float_class_zero:
1942     case float_class_inf:
1943         /* already "integral" */
1944         break;
1945 
1946     case float_class_normal:
1947         scale = MIN(MAX(scale, -0x10000), 0x10000);
1948         a.exp += scale;
1949 
1950         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1951             /* already integral */
1952             break;
1953         }
1954         if (a.exp < 0) {
1955             bool one;
1956             /* all fractional */
1957             s->float_exception_flags |= float_flag_inexact;
1958             switch (rmode) {
1959             case float_round_nearest_even:
1960                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1961                 break;
1962             case float_round_ties_away:
1963                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1964                 break;
1965             case float_round_to_zero:
1966                 one = false;
1967                 break;
1968             case float_round_up:
1969                 one = !a.sign;
1970                 break;
1971             case float_round_down:
1972                 one = a.sign;
1973                 break;
1974             case float_round_to_odd:
1975                 one = true;
1976                 break;
1977             default:
1978                 g_assert_not_reached();
1979             }
1980 
1981             if (one) {
1982                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1983                 a.exp = 0;
1984             } else {
1985                 a.cls = float_class_zero;
1986             }
1987         } else {
1988             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1989             uint64_t frac_lsbm1 = frac_lsb >> 1;
1990             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1991             uint64_t rnd_mask = rnd_even_mask >> 1;
1992             uint64_t inc;
1993 
1994             switch (rmode) {
1995             case float_round_nearest_even:
1996                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1997                 break;
1998             case float_round_ties_away:
1999                 inc = frac_lsbm1;
2000                 break;
2001             case float_round_to_zero:
2002                 inc = 0;
2003                 break;
2004             case float_round_up:
2005                 inc = a.sign ? 0 : rnd_mask;
2006                 break;
2007             case float_round_down:
2008                 inc = a.sign ? rnd_mask : 0;
2009                 break;
2010             case float_round_to_odd:
2011                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2012                 break;
2013             default:
2014                 g_assert_not_reached();
2015             }
2016 
2017             if (a.frac & rnd_mask) {
2018                 s->float_exception_flags |= float_flag_inexact;
2019                 a.frac += inc;
2020                 a.frac &= ~rnd_mask;
2021                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2022                     a.frac >>= 1;
2023                     a.exp++;
2024                 }
2025             }
2026         }
2027         break;
2028     default:
2029         g_assert_not_reached();
2030     }
2031     return a;
2032 }
2033 
2034 float16 float16_round_to_int(float16 a, float_status *s)
2035 {
2036     FloatParts pa = float16_unpack_canonical(a, s);
2037     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2038     return float16_round_pack_canonical(pr, s);
2039 }
2040 
2041 float32 float32_round_to_int(float32 a, float_status *s)
2042 {
2043     FloatParts pa = float32_unpack_canonical(a, s);
2044     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2045     return float32_round_pack_canonical(pr, s);
2046 }
2047 
2048 float64 float64_round_to_int(float64 a, float_status *s)
2049 {
2050     FloatParts pa = float64_unpack_canonical(a, s);
2051     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2052     return float64_round_pack_canonical(pr, s);
2053 }
2054 
2055 /*
2056  * Returns the result of converting the floating-point value `a' to
2057  * the two's complement integer format. The conversion is performed
2058  * according to the IEC/IEEE Standard for Binary Floating-Point
2059  * Arithmetic---which means in particular that the conversion is
2060  * rounded according to the current rounding mode. If `a' is a NaN,
2061  * the largest positive integer is returned. Otherwise, if the
2062  * conversion overflows, the largest integer with the same sign as `a'
2063  * is returned.
2064 */
2065 
2066 static int64_t round_to_int_and_pack(FloatParts in, FloatRoundMode rmode,
2067                                      int scale, int64_t min, int64_t max,
2068                                      float_status *s)
2069 {
2070     uint64_t r;
2071     int orig_flags = get_float_exception_flags(s);
2072     FloatParts p = round_to_int(in, rmode, scale, s);
2073 
2074     switch (p.cls) {
2075     case float_class_snan:
2076     case float_class_qnan:
2077         s->float_exception_flags = orig_flags | float_flag_invalid;
2078         return max;
2079     case float_class_inf:
2080         s->float_exception_flags = orig_flags | float_flag_invalid;
2081         return p.sign ? min : max;
2082     case float_class_zero:
2083         return 0;
2084     case float_class_normal:
2085         if (p.exp < DECOMPOSED_BINARY_POINT) {
2086             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2087         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2088             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2089         } else {
2090             r = UINT64_MAX;
2091         }
2092         if (p.sign) {
2093             if (r <= -(uint64_t) min) {
2094                 return -r;
2095             } else {
2096                 s->float_exception_flags = orig_flags | float_flag_invalid;
2097                 return min;
2098             }
2099         } else {
2100             if (r <= max) {
2101                 return r;
2102             } else {
2103                 s->float_exception_flags = orig_flags | float_flag_invalid;
2104                 return max;
2105             }
2106         }
2107     default:
2108         g_assert_not_reached();
2109     }
2110 }
2111 
2112 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2113                                 float_status *s)
2114 {
2115     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2116                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2117 }
2118 
2119 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2120                                 float_status *s)
2121 {
2122     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2123                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2124 }
2125 
2126 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2127                                 float_status *s)
2128 {
2129     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2130                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2131 }
2132 
2133 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2134                                 float_status *s)
2135 {
2136     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2137                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2138 }
2139 
2140 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2141                                 float_status *s)
2142 {
2143     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2144                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2145 }
2146 
2147 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2148                                 float_status *s)
2149 {
2150     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2151                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2152 }
2153 
2154 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2155                                 float_status *s)
2156 {
2157     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2158                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2159 }
2160 
2161 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2162                                 float_status *s)
2163 {
2164     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2165                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2166 }
2167 
2168 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2169                                 float_status *s)
2170 {
2171     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2172                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2173 }
2174 
2175 int16_t float16_to_int16(float16 a, float_status *s)
2176 {
2177     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2178 }
2179 
2180 int32_t float16_to_int32(float16 a, float_status *s)
2181 {
2182     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2183 }
2184 
2185 int64_t float16_to_int64(float16 a, float_status *s)
2186 {
2187     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2188 }
2189 
2190 int16_t float32_to_int16(float32 a, float_status *s)
2191 {
2192     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2193 }
2194 
2195 int32_t float32_to_int32(float32 a, float_status *s)
2196 {
2197     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2198 }
2199 
2200 int64_t float32_to_int64(float32 a, float_status *s)
2201 {
2202     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2203 }
2204 
2205 int16_t float64_to_int16(float64 a, float_status *s)
2206 {
2207     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2208 }
2209 
2210 int32_t float64_to_int32(float64 a, float_status *s)
2211 {
2212     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2213 }
2214 
2215 int64_t float64_to_int64(float64 a, float_status *s)
2216 {
2217     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2218 }
2219 
2220 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2221 {
2222     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2223 }
2224 
2225 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2226 {
2227     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2228 }
2229 
2230 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2231 {
2232     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2233 }
2234 
2235 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2236 {
2237     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2238 }
2239 
2240 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2241 {
2242     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2243 }
2244 
2245 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2246 {
2247     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2248 }
2249 
2250 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2251 {
2252     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2253 }
2254 
2255 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2256 {
2257     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2258 }
2259 
2260 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2261 {
2262     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2263 }
2264 
/*
 *  Returns the result of converting the floating-point value `a' to
 *  the unsigned integer format. The conversion is performed according
 *  to the IEC/IEEE Standard for Binary Floating-Point
 *  Arithmetic---which means in particular that the conversion is
 *  rounded according to the current rounding mode. If `a' is a NaN,
 *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the result is rounded and zero is returned;
 *  values that do not round to zero will raise the invalid exception
 *  flag.
 */
2277 
2278 static uint64_t round_to_uint_and_pack(FloatParts in, FloatRoundMode rmode,
2279                                        int scale, uint64_t max,
2280                                        float_status *s)
2281 {
2282     int orig_flags = get_float_exception_flags(s);
2283     FloatParts p = round_to_int(in, rmode, scale, s);
2284     uint64_t r;
2285 
2286     switch (p.cls) {
2287     case float_class_snan:
2288     case float_class_qnan:
2289         s->float_exception_flags = orig_flags | float_flag_invalid;
2290         return max;
2291     case float_class_inf:
2292         s->float_exception_flags = orig_flags | float_flag_invalid;
2293         return p.sign ? 0 : max;
2294     case float_class_zero:
2295         return 0;
2296     case float_class_normal:
2297         if (p.sign) {
2298             s->float_exception_flags = orig_flags | float_flag_invalid;
2299             return 0;
2300         }
2301 
2302         if (p.exp < DECOMPOSED_BINARY_POINT) {
2303             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2304         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2305             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2306         } else {
2307             s->float_exception_flags = orig_flags | float_flag_invalid;
2308             return max;
2309         }
2310 
2311         /* For uint64 this will never trip, but if p.exp is too large
2312          * to shift a decomposed fraction we shall have exited via the
2313          * 3rd leg above.
2314          */
2315         if (r > max) {
2316             s->float_exception_flags = orig_flags | float_flag_invalid;
2317             return max;
2318         }
2319         return r;
2320     default:
2321         g_assert_not_reached();
2322     }
2323 }
2324 
2325 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2326                                   float_status *s)
2327 {
2328     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2329                                   rmode, scale, UINT16_MAX, s);
2330 }
2331 
2332 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2333                                   float_status *s)
2334 {
2335     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2336                                   rmode, scale, UINT32_MAX, s);
2337 }
2338 
2339 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2340                                   float_status *s)
2341 {
2342     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2343                                   rmode, scale, UINT64_MAX, s);
2344 }
2345 
2346 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2347                                   float_status *s)
2348 {
2349     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2350                                   rmode, scale, UINT16_MAX, s);
2351 }
2352 
2353 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2354                                   float_status *s)
2355 {
2356     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2357                                   rmode, scale, UINT32_MAX, s);
2358 }
2359 
2360 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2361                                   float_status *s)
2362 {
2363     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2364                                   rmode, scale, UINT64_MAX, s);
2365 }
2366 
2367 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2368                                   float_status *s)
2369 {
2370     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2371                                   rmode, scale, UINT16_MAX, s);
2372 }
2373 
2374 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2375                                   float_status *s)
2376 {
2377     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2378                                   rmode, scale, UINT32_MAX, s);
2379 }
2380 
2381 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2382                                   float_status *s)
2383 {
2384     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2385                                   rmode, scale, UINT64_MAX, s);
2386 }
2387 
2388 uint16_t float16_to_uint16(float16 a, float_status *s)
2389 {
2390     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2391 }
2392 
2393 uint32_t float16_to_uint32(float16 a, float_status *s)
2394 {
2395     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2396 }
2397 
2398 uint64_t float16_to_uint64(float16 a, float_status *s)
2399 {
2400     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2401 }
2402 
2403 uint16_t float32_to_uint16(float32 a, float_status *s)
2404 {
2405     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2406 }
2407 
2408 uint32_t float32_to_uint32(float32 a, float_status *s)
2409 {
2410     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2411 }
2412 
2413 uint64_t float32_to_uint64(float32 a, float_status *s)
2414 {
2415     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2416 }
2417 
2418 uint16_t float64_to_uint16(float64 a, float_status *s)
2419 {
2420     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2421 }
2422 
2423 uint32_t float64_to_uint32(float64 a, float_status *s)
2424 {
2425     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2426 }
2427 
2428 uint64_t float64_to_uint64(float64 a, float_status *s)
2429 {
2430     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2431 }
2432 
2433 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2434 {
2435     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2436 }
2437 
2438 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2439 {
2440     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2441 }
2442 
2443 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2444 {
2445     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2446 }
2447 
2448 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2449 {
2450     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2451 }
2452 
2453 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2454 {
2455     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2456 }
2457 
2458 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2459 {
2460     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2461 }
2462 
2463 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2464 {
2465     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2466 }
2467 
2468 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2469 {
2470     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2471 }
2472 
2473 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2474 {
2475     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2476 }
2477 
2478 /*
2479  * Integer to float conversions
2480  *
2481  * Returns the result of converting the two's complement integer `a'
2482  * to the floating-point format. The conversion is performed according
2483  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2484  */
2485 
2486 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2487 {
2488     FloatParts r = { .sign = false };
2489 
2490     if (a == 0) {
2491         r.cls = float_class_zero;
2492     } else {
2493         uint64_t f = a;
2494         int shift;
2495 
2496         r.cls = float_class_normal;
2497         if (a < 0) {
2498             f = -f;
2499             r.sign = true;
2500         }
2501         shift = clz64(f) - 1;
2502         scale = MIN(MAX(scale, -0x10000), 0x10000);
2503 
2504         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2505         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2506     }
2507 
2508     return r;
2509 }
2510 
2511 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2512 {
2513     FloatParts pa = int_to_float(a, scale, status);
2514     return float16_round_pack_canonical(pa, status);
2515 }
2516 
2517 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2518 {
2519     return int64_to_float16_scalbn(a, scale, status);
2520 }
2521 
2522 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2523 {
2524     return int64_to_float16_scalbn(a, scale, status);
2525 }
2526 
2527 float16 int64_to_float16(int64_t a, float_status *status)
2528 {
2529     return int64_to_float16_scalbn(a, 0, status);
2530 }
2531 
2532 float16 int32_to_float16(int32_t a, float_status *status)
2533 {
2534     return int64_to_float16_scalbn(a, 0, status);
2535 }
2536 
2537 float16 int16_to_float16(int16_t a, float_status *status)
2538 {
2539     return int64_to_float16_scalbn(a, 0, status);
2540 }
2541 
2542 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2543 {
2544     FloatParts pa = int_to_float(a, scale, status);
2545     return float32_round_pack_canonical(pa, status);
2546 }
2547 
2548 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2549 {
2550     return int64_to_float32_scalbn(a, scale, status);
2551 }
2552 
2553 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2554 {
2555     return int64_to_float32_scalbn(a, scale, status);
2556 }
2557 
2558 float32 int64_to_float32(int64_t a, float_status *status)
2559 {
2560     return int64_to_float32_scalbn(a, 0, status);
2561 }
2562 
2563 float32 int32_to_float32(int32_t a, float_status *status)
2564 {
2565     return int64_to_float32_scalbn(a, 0, status);
2566 }
2567 
2568 float32 int16_to_float32(int16_t a, float_status *status)
2569 {
2570     return int64_to_float32_scalbn(a, 0, status);
2571 }
2572 
2573 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2574 {
2575     FloatParts pa = int_to_float(a, scale, status);
2576     return float64_round_pack_canonical(pa, status);
2577 }
2578 
2579 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2580 {
2581     return int64_to_float64_scalbn(a, scale, status);
2582 }
2583 
2584 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2585 {
2586     return int64_to_float64_scalbn(a, scale, status);
2587 }
2588 
2589 float64 int64_to_float64(int64_t a, float_status *status)
2590 {
2591     return int64_to_float64_scalbn(a, 0, status);
2592 }
2593 
2594 float64 int32_to_float64(int32_t a, float_status *status)
2595 {
2596     return int64_to_float64_scalbn(a, 0, status);
2597 }
2598 
2599 float64 int16_to_float64(int16_t a, float_status *status)
2600 {
2601     return int64_to_float64_scalbn(a, 0, status);
2602 }
2603 
2604 
2605 /*
2606  * Unsigned Integer to float conversions
2607  *
2608  * Returns the result of converting the unsigned integer `a' to the
2609  * floating-point format. The conversion is performed according to the
2610  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2611  */
2612 
2613 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2614 {
2615     FloatParts r = { .sign = false };
2616 
2617     if (a == 0) {
2618         r.cls = float_class_zero;
2619     } else {
2620         scale = MIN(MAX(scale, -0x10000), 0x10000);
2621         r.cls = float_class_normal;
2622         if ((int64_t)a < 0) {
2623             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2624             shift64RightJamming(a, 1, &a);
2625             r.frac = a;
2626         } else {
2627             int shift = clz64(a) - 1;
2628             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2629             r.frac = a << shift;
2630         }
2631     }
2632 
2633     return r;
2634 }
2635 
2636 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2637 {
2638     FloatParts pa = uint_to_float(a, scale, status);
2639     return float16_round_pack_canonical(pa, status);
2640 }
2641 
2642 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2643 {
2644     return uint64_to_float16_scalbn(a, scale, status);
2645 }
2646 
2647 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2648 {
2649     return uint64_to_float16_scalbn(a, scale, status);
2650 }
2651 
2652 float16 uint64_to_float16(uint64_t a, float_status *status)
2653 {
2654     return uint64_to_float16_scalbn(a, 0, status);
2655 }
2656 
2657 float16 uint32_to_float16(uint32_t a, float_status *status)
2658 {
2659     return uint64_to_float16_scalbn(a, 0, status);
2660 }
2661 
2662 float16 uint16_to_float16(uint16_t a, float_status *status)
2663 {
2664     return uint64_to_float16_scalbn(a, 0, status);
2665 }
2666 
2667 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2668 {
2669     FloatParts pa = uint_to_float(a, scale, status);
2670     return float32_round_pack_canonical(pa, status);
2671 }
2672 
2673 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2674 {
2675     return uint64_to_float32_scalbn(a, scale, status);
2676 }
2677 
2678 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2679 {
2680     return uint64_to_float32_scalbn(a, scale, status);
2681 }
2682 
2683 float32 uint64_to_float32(uint64_t a, float_status *status)
2684 {
2685     return uint64_to_float32_scalbn(a, 0, status);
2686 }
2687 
2688 float32 uint32_to_float32(uint32_t a, float_status *status)
2689 {
2690     return uint64_to_float32_scalbn(a, 0, status);
2691 }
2692 
2693 float32 uint16_to_float32(uint16_t a, float_status *status)
2694 {
2695     return uint64_to_float32_scalbn(a, 0, status);
2696 }
2697 
2698 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2699 {
2700     FloatParts pa = uint_to_float(a, scale, status);
2701     return float64_round_pack_canonical(pa, status);
2702 }
2703 
2704 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2705 {
2706     return uint64_to_float64_scalbn(a, scale, status);
2707 }
2708 
2709 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2710 {
2711     return uint64_to_float64_scalbn(a, scale, status);
2712 }
2713 
2714 float64 uint64_to_float64(uint64_t a, float_status *status)
2715 {
2716     return uint64_to_float64_scalbn(a, 0, status);
2717 }
2718 
2719 float64 uint32_to_float64(uint32_t a, float_status *status)
2720 {
2721     return uint64_to_float64_scalbn(a, 0, status);
2722 }
2723 
2724 float64 uint16_to_float64(uint16_t a, float_status *status)
2725 {
2726     return uint64_to_float64_scalbn(a, 0, status);
2727 }
2728 
2729 /* Float Min/Max */
2730 /* min() and max() functions. These can't be implemented as
2731  * 'compare and pick one input' because that would mishandle
2732  * NaNs and +0 vs -0.
2733  *
2734  * minnum() and maxnum() functions. These are similar to the min()
2735  * and max() functions but if one of the arguments is a QNaN and
2736  * the other is numerical then the numerical argument is returned.
2737  * SNaNs will get quietened before being returned.
2738  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2739  * and maxNum() operations. min() and max() are the typical min/max
2740  * semantics provided by many CPUs which predate that specification.
2741  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
2744  */
2745 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2746                                 bool ieee, bool ismag, float_status *s)
2747 {
2748     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2749         if (ieee) {
2750             /* Takes two floating-point values `a' and `b', one of
2751              * which is a NaN, and returns the appropriate NaN
2752              * result. If either `a' or `b' is a signaling NaN,
2753              * the invalid exception is raised.
2754              */
2755             if (is_snan(a.cls) || is_snan(b.cls)) {
2756                 return pick_nan(a, b, s);
2757             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2758                 return b;
2759             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2760                 return a;
2761             }
2762         }
2763         return pick_nan(a, b, s);
2764     } else {
2765         int a_exp, b_exp;
2766 
2767         switch (a.cls) {
2768         case float_class_normal:
2769             a_exp = a.exp;
2770             break;
2771         case float_class_inf:
2772             a_exp = INT_MAX;
2773             break;
2774         case float_class_zero:
2775             a_exp = INT_MIN;
2776             break;
2777         default:
2778             g_assert_not_reached();
2779             break;
2780         }
2781         switch (b.cls) {
2782         case float_class_normal:
2783             b_exp = b.exp;
2784             break;
2785         case float_class_inf:
2786             b_exp = INT_MAX;
2787             break;
2788         case float_class_zero:
2789             b_exp = INT_MIN;
2790             break;
2791         default:
2792             g_assert_not_reached();
2793             break;
2794         }
2795 
2796         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2797             bool a_less = a_exp < b_exp;
2798             if (a_exp == b_exp) {
2799                 a_less = a.frac < b.frac;
2800             }
2801             return a_less ^ ismin ? b : a;
2802         }
2803 
2804         if (a.sign == b.sign) {
2805             bool a_less = a_exp < b_exp;
2806             if (a_exp == b_exp) {
2807                 a_less = a.frac < b.frac;
2808             }
2809             return a.sign ^ a_less ^ ismin ? b : a;
2810         } else {
2811             return a.sign ^ ismin ? b : a;
2812         }
2813     }
2814 }
2815 
2816 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2817 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2818                                      float_status *s)                   \
2819 {                                                                       \
2820     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2821     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2822     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2823                                                                         \
2824     return float ## sz ## _round_pack_canonical(pr, s);                 \
2825 }
2826 
2827 MINMAX(16, min, true, false, false)
2828 MINMAX(16, minnum, true, true, false)
2829 MINMAX(16, minnummag, true, true, true)
2830 MINMAX(16, max, false, false, false)
2831 MINMAX(16, maxnum, false, true, false)
2832 MINMAX(16, maxnummag, false, true, true)
2833 
2834 MINMAX(32, min, true, false, false)
2835 MINMAX(32, minnum, true, true, false)
2836 MINMAX(32, minnummag, true, true, true)
2837 MINMAX(32, max, false, false, false)
2838 MINMAX(32, maxnum, false, true, false)
2839 MINMAX(32, maxnummag, false, true, true)
2840 
2841 MINMAX(64, min, true, false, false)
2842 MINMAX(64, minnum, true, true, false)
2843 MINMAX(64, minnummag, true, true, true)
2844 MINMAX(64, max, false, false, false)
2845 MINMAX(64, maxnum, false, true, false)
2846 MINMAX(64, maxnummag, false, true, true)
2847 
2848 #undef MINMAX
2849 
2850 /* Floating point compare */
2851 static FloatRelation compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2852                                     float_status *s)
2853 {
2854     if (is_nan(a.cls) || is_nan(b.cls)) {
2855         if (!is_quiet ||
2856             a.cls == float_class_snan ||
2857             b.cls == float_class_snan) {
2858             s->float_exception_flags |= float_flag_invalid;
2859         }
2860         return float_relation_unordered;
2861     }
2862 
2863     if (a.cls == float_class_zero) {
2864         if (b.cls == float_class_zero) {
2865             return float_relation_equal;
2866         }
2867         return b.sign ? float_relation_greater : float_relation_less;
2868     } else if (b.cls == float_class_zero) {
2869         return a.sign ? float_relation_less : float_relation_greater;
2870     }
2871 
2872     /* The only really important thing about infinity is its sign. If
2873      * both are infinities the sign marks the smallest of the two.
2874      */
2875     if (a.cls == float_class_inf) {
2876         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2877             return float_relation_equal;
2878         }
2879         return a.sign ? float_relation_less : float_relation_greater;
2880     } else if (b.cls == float_class_inf) {
2881         return b.sign ? float_relation_greater : float_relation_less;
2882     }
2883 
2884     if (a.sign != b.sign) {
2885         return a.sign ? float_relation_less : float_relation_greater;
2886     }
2887 
2888     if (a.exp == b.exp) {
2889         if (a.frac == b.frac) {
2890             return float_relation_equal;
2891         }
2892         if (a.sign) {
2893             return a.frac > b.frac ?
2894                 float_relation_less : float_relation_greater;
2895         } else {
2896             return a.frac > b.frac ?
2897                 float_relation_greater : float_relation_less;
2898         }
2899     } else {
2900         if (a.sign) {
2901             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2902         } else {
2903             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2904         }
2905     }
2906 }
2907 
2908 #define COMPARE(name, attr, sz)                                         \
2909 static int attr                                                         \
2910 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
2911 {                                                                       \
2912     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2913     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2914     return compare_floats(pa, pb, is_quiet, s);                         \
2915 }
2916 
2917 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2918 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2919 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2920 
2921 #undef COMPARE
2922 
2923 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
2924 {
2925     return soft_f16_compare(a, b, false, s);
2926 }
2927 
2928 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
2929 {
2930     return soft_f16_compare(a, b, true, s);
2931 }
2932 
2933 static FloatRelation QEMU_FLATTEN
2934 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2935 {
2936     union_float32 ua, ub;
2937 
2938     ua.s = xa;
2939     ub.s = xb;
2940 
2941     if (QEMU_NO_HARDFLOAT) {
2942         goto soft;
2943     }
2944 
2945     float32_input_flush2(&ua.s, &ub.s, s);
2946     if (isgreaterequal(ua.h, ub.h)) {
2947         if (isgreater(ua.h, ub.h)) {
2948             return float_relation_greater;
2949         }
2950         return float_relation_equal;
2951     }
2952     if (likely(isless(ua.h, ub.h))) {
2953         return float_relation_less;
2954     }
2955     /* The only condition remaining is unordered.
2956      * Fall through to set flags.
2957      */
2958  soft:
2959     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2960 }
2961 
2962 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
2963 {
2964     return f32_compare(a, b, false, s);
2965 }
2966 
2967 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
2968 {
2969     return f32_compare(a, b, true, s);
2970 }
2971 
2972 static FloatRelation QEMU_FLATTEN
2973 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
2974 {
2975     union_float64 ua, ub;
2976 
2977     ua.s = xa;
2978     ub.s = xb;
2979 
2980     if (QEMU_NO_HARDFLOAT) {
2981         goto soft;
2982     }
2983 
2984     float64_input_flush2(&ua.s, &ub.s, s);
2985     if (isgreaterequal(ua.h, ub.h)) {
2986         if (isgreater(ua.h, ub.h)) {
2987             return float_relation_greater;
2988         }
2989         return float_relation_equal;
2990     }
2991     if (likely(isless(ua.h, ub.h))) {
2992         return float_relation_less;
2993     }
2994     /* The only condition remaining is unordered.
2995      * Fall through to set flags.
2996      */
2997  soft:
2998     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
2999 }
3000 
3001 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3002 {
3003     return f64_compare(a, b, false, s);
3004 }
3005 
3006 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3007 {
3008     return f64_compare(a, b, true, s);
3009 }
3010 
3011 /* Multiply A by 2 raised to the power N.  */
3012 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3013 {
3014     if (unlikely(is_nan(a.cls))) {
3015         return return_nan(a, s);
3016     }
3017     if (a.cls == float_class_normal) {
3018         /* The largest float type (even though not supported by FloatParts)
3019          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3020          * still allows rounding to infinity, without allowing overflow
3021          * within the int32_t that backs FloatParts.exp.
3022          */
3023         n = MIN(MAX(n, -0x10000), 0x10000);
3024         a.exp += n;
3025     }
3026     return a;
3027 }
3028 
3029 float16 float16_scalbn(float16 a, int n, float_status *status)
3030 {
3031     FloatParts pa = float16_unpack_canonical(a, status);
3032     FloatParts pr = scalbn_decomposed(pa, n, status);
3033     return float16_round_pack_canonical(pr, status);
3034 }
3035 
3036 float32 float32_scalbn(float32 a, int n, float_status *status)
3037 {
3038     FloatParts pa = float32_unpack_canonical(a, status);
3039     FloatParts pr = scalbn_decomposed(pa, n, status);
3040     return float32_round_pack_canonical(pr, status);
3041 }
3042 
3043 float64 float64_scalbn(float64 a, int n, float_status *status)
3044 {
3045     FloatParts pa = float64_unpack_canonical(a, status);
3046     FloatParts pr = scalbn_decomposed(pa, n, status);
3047     return float64_round_pack_canonical(pr, status);
3048 }
3049 
3050 /*
3051  * Square Root
3052  *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
3057  *
3058  * This does mean however the calculation is slower than before,
3059  * especially for 64 bit floats.
3060  */
3061 
3062 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3063 {
3064     uint64_t a_frac, r_frac, s_frac;
3065     int bit, last_bit;
3066 
3067     if (is_nan(a.cls)) {
3068         return return_nan(a, s);
3069     }
3070     if (a.cls == float_class_zero) {
3071         return a;  /* sqrt(+-0) = +-0 */
3072     }
3073     if (a.sign) {
3074         s->float_exception_flags |= float_flag_invalid;
3075         return parts_default_nan(s);
3076     }
3077     if (a.cls == float_class_inf) {
3078         return a;  /* sqrt(+inf) = +inf */
3079     }
3080 
3081     assert(a.cls == float_class_normal);
3082 
3083     /* We need two overflow bits at the top. Adding room for that is a
3084      * right shift. If the exponent is odd, we can discard the low bit
3085      * by multiplying the fraction by 2; that's a left shift. Combine
3086      * those and we shift right if the exponent is even.
3087      */
3088     a_frac = a.frac;
3089     if (!(a.exp & 1)) {
3090         a_frac >>= 1;
3091     }
3092     a.exp >>= 1;
3093 
3094     /* Bit-by-bit computation of sqrt.  */
3095     r_frac = 0;
3096     s_frac = 0;
3097 
3098     /* Iterate from implicit bit down to the 3 extra bits to compute a
3099      * properly rounded result. Remember we've inserted one more bit
3100      * at the top, so these positions are one less.
3101      */
3102     bit = DECOMPOSED_BINARY_POINT - 1;
3103     last_bit = MAX(p->frac_shift - 4, 0);
3104     do {
3105         uint64_t q = 1ULL << bit;
3106         uint64_t t_frac = s_frac + q;
3107         if (t_frac <= a_frac) {
3108             s_frac = t_frac + q;
3109             a_frac -= t_frac;
3110             r_frac += q;
3111         }
3112         a_frac <<= 1;
3113     } while (--bit >= last_bit);
3114 
3115     /* Undo the right shift done above. If there is any remaining
3116      * fraction, the result is inexact. Set the sticky bit.
3117      */
3118     a.frac = (r_frac << 1) + (a_frac != 0);
3119 
3120     return a;
3121 }
3122 
3123 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3124 {
3125     FloatParts pa = float16_unpack_canonical(a, status);
3126     FloatParts pr = sqrt_float(pa, status, &float16_params);
3127     return float16_round_pack_canonical(pr, status);
3128 }
3129 
3130 static float32 QEMU_SOFTFLOAT_ATTR
3131 soft_f32_sqrt(float32 a, float_status *status)
3132 {
3133     FloatParts pa = float32_unpack_canonical(a, status);
3134     FloatParts pr = sqrt_float(pa, status, &float32_params);
3135     return float32_round_pack_canonical(pr, status);
3136 }
3137 
3138 static float64 QEMU_SOFTFLOAT_ATTR
3139 soft_f64_sqrt(float64 a, float_status *status)
3140 {
3141     FloatParts pa = float64_unpack_canonical(a, status);
3142     FloatParts pr = sqrt_float(pa, status, &float64_params);
3143     return float64_round_pack_canonical(pr, status);
3144 }
3145 
3146 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3147 {
3148     union_float32 ua, ur;
3149 
3150     ua.s = xa;
3151     if (unlikely(!can_use_fpu(s))) {
3152         goto soft;
3153     }
3154 
3155     float32_input_flush1(&ua.s, s);
3156     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3157         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3158                        fpclassify(ua.h) == FP_ZERO) ||
3159                      signbit(ua.h))) {
3160             goto soft;
3161         }
3162     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3163                         float32_is_neg(ua.s))) {
3164         goto soft;
3165     }
3166     ur.h = sqrtf(ua.h);
3167     return ur.s;
3168 
3169  soft:
3170     return soft_f32_sqrt(ua.s, s);
3171 }
3172 
3173 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3174 {
3175     union_float64 ua, ur;
3176 
3177     ua.s = xa;
3178     if (unlikely(!can_use_fpu(s))) {
3179         goto soft;
3180     }
3181 
3182     float64_input_flush1(&ua.s, s);
3183     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3184         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3185                        fpclassify(ua.h) == FP_ZERO) ||
3186                      signbit(ua.h))) {
3187             goto soft;
3188         }
3189     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3190                         float64_is_neg(ua.s))) {
3191         goto soft;
3192     }
3193     ur.h = sqrt(ua.h);
3194     return ur.s;
3195 
3196  soft:
3197     return soft_f64_sqrt(ua.s, s);
3198 }
3199 
3200 /*----------------------------------------------------------------------------
3201 | The pattern for a default generated NaN.
3202 *----------------------------------------------------------------------------*/
3203 
3204 float16 float16_default_nan(float_status *status)
3205 {
3206     FloatParts p = parts_default_nan(status);
3207     p.frac >>= float16_params.frac_shift;
3208     return float16_pack_raw(p);
3209 }
3210 
3211 float32 float32_default_nan(float_status *status)
3212 {
3213     FloatParts p = parts_default_nan(status);
3214     p.frac >>= float32_params.frac_shift;
3215     return float32_pack_raw(p);
3216 }
3217 
3218 float64 float64_default_nan(float_status *status)
3219 {
3220     FloatParts p = parts_default_nan(status);
3221     p.frac >>= float64_params.frac_shift;
3222     return float64_pack_raw(p);
3223 }
3224 
3225 float128 float128_default_nan(float_status *status)
3226 {
3227     FloatParts p = parts_default_nan(status);
3228     float128 r;
3229 
3230     /* Extrapolate from the choices made by parts_default_nan to fill
3231      * in the quad-floating format.  If the low bit is set, assume we
3232      * want to set all non-snan bits.
3233      */
3234     r.low = -(p.frac & 1);
3235     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3236     r.high |= UINT64_C(0x7FFF000000000000);
3237     r.high |= (uint64_t)p.sign << 63;
3238 
3239     return r;
3240 }
3241 
3242 /*----------------------------------------------------------------------------
3243 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3244 *----------------------------------------------------------------------------*/
3245 
3246 float16 float16_silence_nan(float16 a, float_status *status)
3247 {
3248     FloatParts p = float16_unpack_raw(a);
3249     p.frac <<= float16_params.frac_shift;
3250     p = parts_silence_nan(p, status);
3251     p.frac >>= float16_params.frac_shift;
3252     return float16_pack_raw(p);
3253 }
3254 
3255 float32 float32_silence_nan(float32 a, float_status *status)
3256 {
3257     FloatParts p = float32_unpack_raw(a);
3258     p.frac <<= float32_params.frac_shift;
3259     p = parts_silence_nan(p, status);
3260     p.frac >>= float32_params.frac_shift;
3261     return float32_pack_raw(p);
3262 }
3263 
3264 float64 float64_silence_nan(float64 a, float_status *status)
3265 {
3266     FloatParts p = float64_unpack_raw(a);
3267     p.frac <<= float64_params.frac_shift;
3268     p = parts_silence_nan(p, status);
3269     p.frac >>= float64_params.frac_shift;
3270     return float64_pack_raw(p);
3271 }
3272 
3273 
3274 /*----------------------------------------------------------------------------
3275 | If `a' is denormal and we are in flush-to-zero mode then set the
3276 | input-denormal exception and return zero. Otherwise just return the value.
3277 *----------------------------------------------------------------------------*/
3278 
3279 static bool parts_squash_denormal(FloatParts p, float_status *status)
3280 {
3281     if (p.exp == 0 && p.frac != 0) {
3282         float_raise(float_flag_input_denormal, status);
3283         return true;
3284     }
3285 
3286     return false;
3287 }
3288 
3289 float16 float16_squash_input_denormal(float16 a, float_status *status)
3290 {
3291     if (status->flush_inputs_to_zero) {
3292         FloatParts p = float16_unpack_raw(a);
3293         if (parts_squash_denormal(p, status)) {
3294             return float16_set_sign(float16_zero, p.sign);
3295         }
3296     }
3297     return a;
3298 }
3299 
3300 float32 float32_squash_input_denormal(float32 a, float_status *status)
3301 {
3302     if (status->flush_inputs_to_zero) {
3303         FloatParts p = float32_unpack_raw(a);
3304         if (parts_squash_denormal(p, status)) {
3305             return float32_set_sign(float32_zero, p.sign);
3306         }
3307     }
3308     return a;
3309 }
3310 
3311 float64 float64_squash_input_denormal(float64 a, float_status *status)
3312 {
3313     if (status->flush_inputs_to_zero) {
3314         FloatParts p = float64_unpack_raw(a);
3315         if (parts_squash_denormal(p, status)) {
3316             return float64_set_sign(float64_zero, p.sign);
3317         }
3318     }
3319     return a;
3320 }
3321 
3322 /*----------------------------------------------------------------------------
3323 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3324 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3325 | input.  If `zSign' is 1, the input is negated before being converted to an
3326 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3327 | is simply rounded to an integer, with the inexact exception raised if the
3328 | input cannot be represented exactly as an integer.  However, if the fixed-
3329 | point input is too large, the invalid exception is raised and the largest
3330 | positive or negative integer is returned.
3331 *----------------------------------------------------------------------------*/
3332 
3333 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
3334                                  float_status *status)
3335 {
3336     int8_t roundingMode;
3337     bool roundNearestEven;
3338     int8_t roundIncrement, roundBits;
3339     int32_t z;
3340 
3341     roundingMode = status->float_rounding_mode;
3342     roundNearestEven = ( roundingMode == float_round_nearest_even );
3343     switch (roundingMode) {
3344     case float_round_nearest_even:
3345     case float_round_ties_away:
3346         roundIncrement = 0x40;
3347         break;
3348     case float_round_to_zero:
3349         roundIncrement = 0;
3350         break;
3351     case float_round_up:
3352         roundIncrement = zSign ? 0 : 0x7f;
3353         break;
3354     case float_round_down:
3355         roundIncrement = zSign ? 0x7f : 0;
3356         break;
3357     case float_round_to_odd:
3358         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3359         break;
3360     default:
3361         abort();
3362     }
3363     roundBits = absZ & 0x7F;
3364     absZ = ( absZ + roundIncrement )>>7;
3365     if (!(roundBits ^ 0x40) && roundNearestEven) {
3366         absZ &= ~1;
3367     }
3368     z = absZ;
3369     if ( zSign ) z = - z;
3370     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3371         float_raise(float_flag_invalid, status);
3372         return zSign ? INT32_MIN : INT32_MAX;
3373     }
3374     if (roundBits) {
3375         status->float_exception_flags |= float_flag_inexact;
3376     }
3377     return z;
3378 
3379 }
3380 
3381 /*----------------------------------------------------------------------------
3382 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3383 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3384 | and returns the properly rounded 64-bit integer corresponding to the input.
3385 | If `zSign' is 1, the input is negated before being converted to an integer.
3386 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3387 | the inexact exception raised if the input cannot be represented exactly as
3388 | an integer.  However, if the fixed-point input is too large, the invalid
3389 | exception is raised and the largest positive or negative integer is
3390 | returned.
3391 *----------------------------------------------------------------------------*/
3392 
3393 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
3394                                float_status *status)
3395 {
3396     int8_t roundingMode;
3397     bool roundNearestEven, increment;
3398     int64_t z;
3399 
3400     roundingMode = status->float_rounding_mode;
3401     roundNearestEven = ( roundingMode == float_round_nearest_even );
3402     switch (roundingMode) {
3403     case float_round_nearest_even:
3404     case float_round_ties_away:
3405         increment = ((int64_t) absZ1 < 0);
3406         break;
3407     case float_round_to_zero:
3408         increment = 0;
3409         break;
3410     case float_round_up:
3411         increment = !zSign && absZ1;
3412         break;
3413     case float_round_down:
3414         increment = zSign && absZ1;
3415         break;
3416     case float_round_to_odd:
3417         increment = !(absZ0 & 1) && absZ1;
3418         break;
3419     default:
3420         abort();
3421     }
3422     if ( increment ) {
3423         ++absZ0;
3424         if ( absZ0 == 0 ) goto overflow;
3425         if (!(absZ1 << 1) && roundNearestEven) {
3426             absZ0 &= ~1;
3427         }
3428     }
3429     z = absZ0;
3430     if ( zSign ) z = - z;
3431     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3432  overflow:
3433         float_raise(float_flag_invalid, status);
3434         return zSign ? INT64_MIN : INT64_MAX;
3435     }
3436     if (absZ1) {
3437         status->float_exception_flags |= float_flag_inexact;
3438     }
3439     return z;
3440 
3441 }
3442 
3443 /*----------------------------------------------------------------------------
3444 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3445 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3446 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3447 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3448 | with the inexact exception raised if the input cannot be represented exactly
3449 | as an integer.  However, if the fixed-point input is too large, the invalid
3450 | exception is raised and the largest unsigned integer is returned.
3451 *----------------------------------------------------------------------------*/
3452 
3453 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
3454                                 uint64_t absZ1, float_status *status)
3455 {
3456     int8_t roundingMode;
3457     bool roundNearestEven, increment;
3458 
3459     roundingMode = status->float_rounding_mode;
3460     roundNearestEven = (roundingMode == float_round_nearest_even);
3461     switch (roundingMode) {
3462     case float_round_nearest_even:
3463     case float_round_ties_away:
3464         increment = ((int64_t)absZ1 < 0);
3465         break;
3466     case float_round_to_zero:
3467         increment = 0;
3468         break;
3469     case float_round_up:
3470         increment = !zSign && absZ1;
3471         break;
3472     case float_round_down:
3473         increment = zSign && absZ1;
3474         break;
3475     case float_round_to_odd:
3476         increment = !(absZ0 & 1) && absZ1;
3477         break;
3478     default:
3479         abort();
3480     }
3481     if (increment) {
3482         ++absZ0;
3483         if (absZ0 == 0) {
3484             float_raise(float_flag_invalid, status);
3485             return UINT64_MAX;
3486         }
3487         if (!(absZ1 << 1) && roundNearestEven) {
3488             absZ0 &= ~1;
3489         }
3490     }
3491 
3492     if (zSign && absZ0) {
3493         float_raise(float_flag_invalid, status);
3494         return 0;
3495     }
3496 
3497     if (absZ1) {
3498         status->float_exception_flags |= float_flag_inexact;
3499     }
3500     return absZ0;
3501 }
3502 
3503 /*----------------------------------------------------------------------------
3504 | Normalizes the subnormal single-precision floating-point value represented
3505 | by the denormalized significand `aSig'.  The normalized exponent and
3506 | significand are stored at the locations pointed to by `zExpPtr' and
3507 | `zSigPtr', respectively.
3508 *----------------------------------------------------------------------------*/
3509 
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Distance the leading one must move left to reach bit 23. */
    const int8_t shift = clz32(aSig) - 8;

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
3520 
3521 /*----------------------------------------------------------------------------
3522 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3523 | and significand `zSig', and returns the proper single-precision floating-
3524 | point value corresponding to the abstract input.  Ordinarily, the abstract
3525 | value is simply rounded and packed into the single-precision format, with
3526 | the inexact exception raised if the abstract input cannot be represented
3527 | exactly.  However, if the abstract value is too large, the overflow and
3528 | inexact exceptions are raised and an infinity or maximal finite value is
3529 | returned.  If the abstract value is too small, the input value is rounded to
3530 | a subnormal number, and the underflow and inexact exceptions are raised if
3531 | the abstract input cannot be represented exactly as a subnormal single-
3532 | precision floating-point number.
3533 |     The input significand `zSig' has its binary point between bits 30
3534 | and 29, which is 7 bits to the left of the usual location.  This shifted
3535 | significand must be normalized or smaller.  If `zSig' is not normalized,
3536 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3537 | and it must not require rounding.  In the usual case that `zSig' is
3538 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3539 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3540 | Binary Floating-Point Arithmetic.
3541 *----------------------------------------------------------------------------*/
3542 
3543 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
3544                                    float_status *status)
3545 {
3546     int8_t roundingMode;
3547     bool roundNearestEven;
3548     int8_t roundIncrement, roundBits;
3549     bool isTiny;
3550 
3551     roundingMode = status->float_rounding_mode;
3552     roundNearestEven = ( roundingMode == float_round_nearest_even );
3553     switch (roundingMode) {
3554     case float_round_nearest_even:
3555     case float_round_ties_away:
3556         roundIncrement = 0x40;
3557         break;
3558     case float_round_to_zero:
3559         roundIncrement = 0;
3560         break;
3561     case float_round_up:
3562         roundIncrement = zSign ? 0 : 0x7f;
3563         break;
3564     case float_round_down:
3565         roundIncrement = zSign ? 0x7f : 0;
3566         break;
3567     case float_round_to_odd:
3568         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3569         break;
3570     default:
3571         abort();
3572         break;
3573     }
3574     roundBits = zSig & 0x7F;
3575     if ( 0xFD <= (uint16_t) zExp ) {
3576         if (    ( 0xFD < zExp )
3577              || (    ( zExp == 0xFD )
3578                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3579            ) {
3580             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3581                                    roundIncrement != 0;
3582             float_raise(float_flag_overflow | float_flag_inexact, status);
3583             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3584         }
3585         if ( zExp < 0 ) {
3586             if (status->flush_to_zero) {
3587                 float_raise(float_flag_output_denormal, status);
3588                 return packFloat32(zSign, 0, 0);
3589             }
3590             isTiny = status->tininess_before_rounding
3591                   || (zExp < -1)
3592                   || (zSig + roundIncrement < 0x80000000);
3593             shift32RightJamming( zSig, - zExp, &zSig );
3594             zExp = 0;
3595             roundBits = zSig & 0x7F;
3596             if (isTiny && roundBits) {
3597                 float_raise(float_flag_underflow, status);
3598             }
3599             if (roundingMode == float_round_to_odd) {
3600                 /*
3601                  * For round-to-odd case, the roundIncrement depends on
3602                  * zSig which just changed.
3603                  */
3604                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3605             }
3606         }
3607     }
3608     if (roundBits) {
3609         status->float_exception_flags |= float_flag_inexact;
3610     }
3611     zSig = ( zSig + roundIncrement )>>7;
3612     if (!(roundBits ^ 0x40) && roundNearestEven) {
3613         zSig &= ~1;
3614     }
3615     if ( zSig == 0 ) zExp = 0;
3616     return packFloat32( zSign, zExp, zSig );
3617 
3618 }
3619 
3620 /*----------------------------------------------------------------------------
3621 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3622 | and significand `zSig', and returns the proper single-precision floating-
3623 | point value corresponding to the abstract input.  This routine is just like
3624 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3625 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3626 | floating-point exponent.
3627 *----------------------------------------------------------------------------*/
3628 
3629 static float32
3630  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
3631                               float_status *status)
3632 {
3633     int8_t shiftCount;
3634 
3635     shiftCount = clz32(zSig) - 1;
3636     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3637                                status);
3638 
3639 }
3640 
/*----------------------------------------------------------------------------
| Normalizes the denormalized double-precision significand `aSig'.  The
| significand is shifted left by `clz64(aSig) - 11' bits and stored at the
| location pointed to by `zSigPtr'; the matching exponent, `1 - shift', is
| stored at the location pointed to by `zExpPtr'.  Assuming clz64() counts
| leading zero bits, the stored significand is left with exactly eleven
| leading zero bits.
*----------------------------------------------------------------------------*/

static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    const int8_t lshift = clz64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3658 
3659 /*----------------------------------------------------------------------------
3660 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3661 | double-precision floating-point value, returning the result.  After being
3662 | shifted into the proper positions, the three fields are simply added
3663 | together to form the result.  This means that any integer portion of `zSig'
3664 | will be added into the exponent.  Since a properly normalized significand
3665 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3666 | than the desired result exponent whenever `zSig' is a complete, normalized
3667 | significand.
3668 *----------------------------------------------------------------------------*/
3669 
3670 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
3671 {
3672 
3673     return make_float64(
3674         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3675 
3676 }
3677 
3678 /*----------------------------------------------------------------------------
3679 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3680 | and significand `zSig', and returns the proper double-precision floating-
3681 | point value corresponding to the abstract input.  Ordinarily, the abstract
3682 | value is simply rounded and packed into the double-precision format, with
3683 | the inexact exception raised if the abstract input cannot be represented
3684 | exactly.  However, if the abstract value is too large, the overflow and
3685 | inexact exceptions are raised and an infinity or maximal finite value is
3686 | returned.  If the abstract value is too small, the input value is rounded to
3687 | a subnormal number, and the underflow and inexact exceptions are raised if
3688 | the abstract input cannot be represented exactly as a subnormal double-
3689 | precision floating-point number.
3690 |     The input significand `zSig' has its binary point between bits 62
3691 | and 61, which is 10 bits to the left of the usual location.  This shifted
3692 | significand must be normalized or smaller.  If `zSig' is not normalized,
3693 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3694 | and it must not require rounding.  In the usual case that `zSig' is
3695 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3696 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3697 | Binary Floating-Point Arithmetic.
3698 *----------------------------------------------------------------------------*/
3699 
3700 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
3701                                    float_status *status)
3702 {
3703     int8_t roundingMode;
3704     bool roundNearestEven;
3705     int roundIncrement, roundBits;
3706     bool isTiny;
3707 
3708     roundingMode = status->float_rounding_mode;
3709     roundNearestEven = ( roundingMode == float_round_nearest_even );
3710     switch (roundingMode) {
3711     case float_round_nearest_even:
3712     case float_round_ties_away:
3713         roundIncrement = 0x200;
3714         break;
3715     case float_round_to_zero:
3716         roundIncrement = 0;
3717         break;
3718     case float_round_up:
3719         roundIncrement = zSign ? 0 : 0x3ff;
3720         break;
3721     case float_round_down:
3722         roundIncrement = zSign ? 0x3ff : 0;
3723         break;
3724     case float_round_to_odd:
3725         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3726         break;
3727     default:
3728         abort();
3729     }
3730     roundBits = zSig & 0x3FF;
3731     if ( 0x7FD <= (uint16_t) zExp ) {
3732         if (    ( 0x7FD < zExp )
3733              || (    ( zExp == 0x7FD )
3734                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3735            ) {
3736             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3737                                    roundIncrement != 0;
3738             float_raise(float_flag_overflow | float_flag_inexact, status);
3739             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3740         }
3741         if ( zExp < 0 ) {
3742             if (status->flush_to_zero) {
3743                 float_raise(float_flag_output_denormal, status);
3744                 return packFloat64(zSign, 0, 0);
3745             }
3746             isTiny = status->tininess_before_rounding
3747                   || (zExp < -1)
3748                   || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
3749             shift64RightJamming( zSig, - zExp, &zSig );
3750             zExp = 0;
3751             roundBits = zSig & 0x3FF;
3752             if (isTiny && roundBits) {
3753                 float_raise(float_flag_underflow, status);
3754             }
3755             if (roundingMode == float_round_to_odd) {
3756                 /*
3757                  * For round-to-odd case, the roundIncrement depends on
3758                  * zSig which just changed.
3759                  */
3760                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3761             }
3762         }
3763     }
3764     if (roundBits) {
3765         status->float_exception_flags |= float_flag_inexact;
3766     }
3767     zSig = ( zSig + roundIncrement )>>10;
3768     if (!(roundBits ^ 0x200) && roundNearestEven) {
3769         zSig &= ~1;
3770     }
3771     if ( zSig == 0 ) zExp = 0;
3772     return packFloat64( zSign, zExp, zSig );
3773 
3774 }
3775 
3776 /*----------------------------------------------------------------------------
3777 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3778 | and significand `zSig', and returns the proper double-precision floating-
3779 | point value corresponding to the abstract input.  This routine is just like
3780 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3781 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3782 | floating-point exponent.
3783 *----------------------------------------------------------------------------*/
3784 
3785 static float64
3786  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
3787                               float_status *status)
3788 {
3789     int8_t shiftCount;
3790 
3791     shiftCount = clz64(zSig) - 1;
3792     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3793                                status);
3794 
3795 }
3796 
/*----------------------------------------------------------------------------
| Normalizes the denormalized extended double-precision significand `aSig'.
| The significand is shifted left by `clz64(aSig)' bits and stored at the
| location pointed to by `zSigPtr'; the matching exponent, `1 - shift', is
| stored at the location pointed to by `zExpPtr'.  Assuming clz64() counts
| leading zero bits, the stored significand has bit 63 set.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t lshift = clz64(aSig);

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3813 
3814 /*----------------------------------------------------------------------------
3815 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3816 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3817 | and returns the proper extended double-precision floating-point value
3818 | corresponding to the abstract input.  Ordinarily, the abstract value is
3819 | rounded and packed into the extended double-precision format, with the
3820 | inexact exception raised if the abstract input cannot be represented
3821 | exactly.  However, if the abstract value is too large, the overflow and
3822 | inexact exceptions are raised and an infinity or maximal finite value is
3823 | returned.  If the abstract value is too small, the input value is rounded to
3824 | a subnormal number, and the underflow and inexact exceptions are raised if
3825 | the abstract input cannot be represented exactly as a subnormal extended
3826 | double-precision floating-point number.
3827 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3828 | number of bits as single or double precision, respectively.  Otherwise, the
3829 | result is rounded to the full precision of the extended double-precision
3830 | format.
3831 |     The input significand must be normalized or smaller.  If the input
3832 | significand is not normalized, `zExp' must be 0; in that case, the result
3833 | returned is a subnormal number, and it must not require rounding.  The
3834 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3835 | Floating-Point Arithmetic.
3836 *----------------------------------------------------------------------------*/
3837 
3838 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
3839                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3840                               float_status *status)
3841 {
3842     int8_t roundingMode;
3843     bool roundNearestEven, increment, isTiny;
3844     int64_t roundIncrement, roundMask, roundBits;
3845 
3846     roundingMode = status->float_rounding_mode;
3847     roundNearestEven = ( roundingMode == float_round_nearest_even );
3848     if ( roundingPrecision == 80 ) goto precision80;
3849     if ( roundingPrecision == 64 ) {
3850         roundIncrement = UINT64_C(0x0000000000000400);
3851         roundMask = UINT64_C(0x00000000000007FF);
3852     }
3853     else if ( roundingPrecision == 32 ) {
3854         roundIncrement = UINT64_C(0x0000008000000000);
3855         roundMask = UINT64_C(0x000000FFFFFFFFFF);
3856     }
3857     else {
3858         goto precision80;
3859     }
3860     zSig0 |= ( zSig1 != 0 );
3861     switch (roundingMode) {
3862     case float_round_nearest_even:
3863     case float_round_ties_away:
3864         break;
3865     case float_round_to_zero:
3866         roundIncrement = 0;
3867         break;
3868     case float_round_up:
3869         roundIncrement = zSign ? 0 : roundMask;
3870         break;
3871     case float_round_down:
3872         roundIncrement = zSign ? roundMask : 0;
3873         break;
3874     default:
3875         abort();
3876     }
3877     roundBits = zSig0 & roundMask;
3878     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3879         if (    ( 0x7FFE < zExp )
3880              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3881            ) {
3882             goto overflow;
3883         }
3884         if ( zExp <= 0 ) {
3885             if (status->flush_to_zero) {
3886                 float_raise(float_flag_output_denormal, status);
3887                 return packFloatx80(zSign, 0, 0);
3888             }
3889             isTiny = status->tininess_before_rounding
3890                   || (zExp < 0 )
3891                   || (zSig0 <= zSig0 + roundIncrement);
3892             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3893             zExp = 0;
3894             roundBits = zSig0 & roundMask;
3895             if (isTiny && roundBits) {
3896                 float_raise(float_flag_underflow, status);
3897             }
3898             if (roundBits) {
3899                 status->float_exception_flags |= float_flag_inexact;
3900             }
3901             zSig0 += roundIncrement;
3902             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3903             roundIncrement = roundMask + 1;
3904             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3905                 roundMask |= roundIncrement;
3906             }
3907             zSig0 &= ~ roundMask;
3908             return packFloatx80( zSign, zExp, zSig0 );
3909         }
3910     }
3911     if (roundBits) {
3912         status->float_exception_flags |= float_flag_inexact;
3913     }
3914     zSig0 += roundIncrement;
3915     if ( zSig0 < roundIncrement ) {
3916         ++zExp;
3917         zSig0 = UINT64_C(0x8000000000000000);
3918     }
3919     roundIncrement = roundMask + 1;
3920     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3921         roundMask |= roundIncrement;
3922     }
3923     zSig0 &= ~ roundMask;
3924     if ( zSig0 == 0 ) zExp = 0;
3925     return packFloatx80( zSign, zExp, zSig0 );
3926  precision80:
3927     switch (roundingMode) {
3928     case float_round_nearest_even:
3929     case float_round_ties_away:
3930         increment = ((int64_t)zSig1 < 0);
3931         break;
3932     case float_round_to_zero:
3933         increment = 0;
3934         break;
3935     case float_round_up:
3936         increment = !zSign && zSig1;
3937         break;
3938     case float_round_down:
3939         increment = zSign && zSig1;
3940         break;
3941     default:
3942         abort();
3943     }
3944     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3945         if (    ( 0x7FFE < zExp )
3946              || (    ( zExp == 0x7FFE )
3947                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
3948                   && increment
3949                 )
3950            ) {
3951             roundMask = 0;
3952  overflow:
3953             float_raise(float_flag_overflow | float_flag_inexact, status);
3954             if (    ( roundingMode == float_round_to_zero )
3955                  || ( zSign && ( roundingMode == float_round_up ) )
3956                  || ( ! zSign && ( roundingMode == float_round_down ) )
3957                ) {
3958                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3959             }
3960             return packFloatx80(zSign,
3961                                 floatx80_infinity_high,
3962                                 floatx80_infinity_low);
3963         }
3964         if ( zExp <= 0 ) {
3965             isTiny = status->tininess_before_rounding
3966                   || (zExp < 0)
3967                   || !increment
3968                   || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
3969             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3970             zExp = 0;
3971             if (isTiny && zSig1) {
3972                 float_raise(float_flag_underflow, status);
3973             }
3974             if (zSig1) {
3975                 status->float_exception_flags |= float_flag_inexact;
3976             }
3977             switch (roundingMode) {
3978             case float_round_nearest_even:
3979             case float_round_ties_away:
3980                 increment = ((int64_t)zSig1 < 0);
3981                 break;
3982             case float_round_to_zero:
3983                 increment = 0;
3984                 break;
3985             case float_round_up:
3986                 increment = !zSign && zSig1;
3987                 break;
3988             case float_round_down:
3989                 increment = zSign && zSig1;
3990                 break;
3991             default:
3992                 abort();
3993             }
3994             if ( increment ) {
3995                 ++zSig0;
3996                 if (!(zSig1 << 1) && roundNearestEven) {
3997                     zSig0 &= ~1;
3998                 }
3999                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4000             }
4001             return packFloatx80( zSign, zExp, zSig0 );
4002         }
4003     }
4004     if (zSig1) {
4005         status->float_exception_flags |= float_flag_inexact;
4006     }
4007     if ( increment ) {
4008         ++zSig0;
4009         if ( zSig0 == 0 ) {
4010             ++zExp;
4011             zSig0 = UINT64_C(0x8000000000000000);
4012         }
4013         else {
4014             if (!(zSig1 << 1) && roundNearestEven) {
4015                 zSig0 &= ~1;
4016             }
4017         }
4018     }
4019     else {
4020         if ( zSig0 == 0 ) zExp = 0;
4021     }
4022     return packFloatx80( zSign, zExp, zSig0 );
4023 
4024 }
4025 
4026 /*----------------------------------------------------------------------------
4027 | Takes an abstract floating-point value having sign `zSign', exponent
4028 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4029 | and returns the proper extended double-precision floating-point value
4030 | corresponding to the abstract input.  This routine is just like
4031 | `roundAndPackFloatx80' except that the input significand does not have to be
4032 | normalized.
4033 *----------------------------------------------------------------------------*/
4034 
4035 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4036                                        bool zSign, int32_t zExp,
4037                                        uint64_t zSig0, uint64_t zSig1,
4038                                        float_status *status)
4039 {
4040     int8_t shiftCount;
4041 
4042     if ( zSig0 == 0 ) {
4043         zSig0 = zSig1;
4044         zSig1 = 0;
4045         zExp -= 64;
4046     }
4047     shiftCount = clz64(zSig0);
4048     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4049     zExp -= shiftCount;
4050     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4051                                 zSig0, zSig1, status);
4052 
4053 }
4054 
4055 /*----------------------------------------------------------------------------
4056 | Returns the least-significant 64 fraction bits of the quadruple-precision
4057 | floating-point value `a'.
4058 *----------------------------------------------------------------------------*/
4059 
4060 static inline uint64_t extractFloat128Frac1( float128 a )
4061 {
4062 
4063     return a.low;
4064 
4065 }
4066 
4067 /*----------------------------------------------------------------------------
4068 | Returns the most-significant 48 fraction bits of the quadruple-precision
4069 | floating-point value `a'.
4070 *----------------------------------------------------------------------------*/
4071 
4072 static inline uint64_t extractFloat128Frac0( float128 a )
4073 {
4074 
4075     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4076 
4077 }
4078 
4079 /*----------------------------------------------------------------------------
4080 | Returns the exponent bits of the quadruple-precision floating-point value
4081 | `a'.
4082 *----------------------------------------------------------------------------*/
4083 
4084 static inline int32_t extractFloat128Exp( float128 a )
4085 {
4086 
4087     return ( a.high>>48 ) & 0x7FFF;
4088 
4089 }
4090 
4091 /*----------------------------------------------------------------------------
4092 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4093 *----------------------------------------------------------------------------*/
4094 
4095 static inline bool extractFloat128Sign(float128 a)
4096 {
4097     return a.high >> 63;
4098 }
4099 
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value
| represented by the denormalized significand formed by the concatenation of
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
| significand are stored at the location pointed to by `zSig0Ptr', and the
| least significant 64 bits of the normalized significand are stored at the
| location pointed to by `zSig1Ptr'.
|     When `aSig0' is zero the significand lives entirely in `aSig1', which is
| moved across the word boundary; otherwise the 128-bit pair is shifted left
| as a unit.
*----------------------------------------------------------------------------*/

static void
normalizeFloat128Subnormal(
    uint64_t aSig0,
    uint64_t aSig1,
    int32_t *zExpPtr,
    uint64_t *zSig0Ptr,
    uint64_t *zSig1Ptr
)
{
    int8_t shift;

    if (aSig0 != 0) {
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* High word is empty: everything comes from aSig1. */
    shift = clz64(aSig1) - 15;
    if (shift >= 0) {
        /* aSig1 fits entirely into the high result word. */
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* aSig1 straddles both result words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
4140 
4141 /*----------------------------------------------------------------------------
4142 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4143 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4144 | floating-point value, returning the result.  After being shifted into the
4145 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4146 | added together to form the most significant 32 bits of the result.  This
4147 | means that any integer portion of `zSig0' will be added into the exponent.
4148 | Since a properly normalized significand will have an integer portion equal
4149 | to 1, the `zExp' input should be 1 less than the desired result exponent
4150 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4151 | significand.
4152 *----------------------------------------------------------------------------*/
4153 
4154 static inline float128
4155 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4156 {
4157     float128 z;
4158 
4159     z.low = zSig1;
4160     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4161     return z;
4162 }
4163 
4164 /*----------------------------------------------------------------------------
4165 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4166 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4167 | and `zSig2', and returns the proper quadruple-precision floating-point value
4168 | corresponding to the abstract input.  Ordinarily, the abstract value is
4169 | simply rounded and packed into the quadruple-precision format, with the
4170 | inexact exception raised if the abstract input cannot be represented
4171 | exactly.  However, if the abstract value is too large, the overflow and
4172 | inexact exceptions are raised and an infinity or maximal finite value is
4173 | returned.  If the abstract value is too small, the input value is rounded to
4174 | a subnormal number, and the underflow and inexact exceptions are raised if
4175 | the abstract input cannot be represented exactly as a subnormal quadruple-
4176 | precision floating-point number.
4177 |     The input significand must be normalized or smaller.  If the input
4178 | significand is not normalized, `zExp' must be 0; in that case, the result
4179 | returned is a subnormal number, and it must not require rounding.  In the
4180 | usual case that the input significand is normalized, `zExp' must be 1 less
4181 | than the ``true'' floating-point exponent.  The handling of underflow and
4182 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4183 *----------------------------------------------------------------------------*/
4184 
4185 static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
4186                                      uint64_t zSig0, uint64_t zSig1,
4187                                      uint64_t zSig2, float_status *status)
4188 {
4189     int8_t roundingMode;
4190     bool roundNearestEven, increment, isTiny;
4191 
4192     roundingMode = status->float_rounding_mode;
4193     roundNearestEven = ( roundingMode == float_round_nearest_even );
4194     switch (roundingMode) {
4195     case float_round_nearest_even:
4196     case float_round_ties_away:
4197         increment = ((int64_t)zSig2 < 0);
4198         break;
4199     case float_round_to_zero:
4200         increment = 0;
4201         break;
4202     case float_round_up:
4203         increment = !zSign && zSig2;
4204         break;
4205     case float_round_down:
4206         increment = zSign && zSig2;
4207         break;
4208     case float_round_to_odd:
4209         increment = !(zSig1 & 0x1) && zSig2;
4210         break;
4211     default:
4212         abort();
4213     }
4214     if ( 0x7FFD <= (uint32_t) zExp ) {
4215         if (    ( 0x7FFD < zExp )
4216              || (    ( zExp == 0x7FFD )
4217                   && eq128(
4218                          UINT64_C(0x0001FFFFFFFFFFFF),
4219                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4220                          zSig0,
4221                          zSig1
4222                      )
4223                   && increment
4224                 )
4225            ) {
4226             float_raise(float_flag_overflow | float_flag_inexact, status);
4227             if (    ( roundingMode == float_round_to_zero )
4228                  || ( zSign && ( roundingMode == float_round_up ) )
4229                  || ( ! zSign && ( roundingMode == float_round_down ) )
4230                  || (roundingMode == float_round_to_odd)
4231                ) {
4232                 return
4233                     packFloat128(
4234                         zSign,
4235                         0x7FFE,
4236                         UINT64_C(0x0000FFFFFFFFFFFF),
4237                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4238                     );
4239             }
4240             return packFloat128( zSign, 0x7FFF, 0, 0 );
4241         }
4242         if ( zExp < 0 ) {
4243             if (status->flush_to_zero) {
4244                 float_raise(float_flag_output_denormal, status);
4245                 return packFloat128(zSign, 0, 0, 0);
4246             }
4247             isTiny = status->tininess_before_rounding
4248                   || (zExp < -1)
4249                   || !increment
4250                   || lt128(zSig0, zSig1,
4251                            UINT64_C(0x0001FFFFFFFFFFFF),
4252                            UINT64_C(0xFFFFFFFFFFFFFFFF));
4253             shift128ExtraRightJamming(
4254                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4255             zExp = 0;
4256             if (isTiny && zSig2) {
4257                 float_raise(float_flag_underflow, status);
4258             }
4259             switch (roundingMode) {
4260             case float_round_nearest_even:
4261             case float_round_ties_away:
4262                 increment = ((int64_t)zSig2 < 0);
4263                 break;
4264             case float_round_to_zero:
4265                 increment = 0;
4266                 break;
4267             case float_round_up:
4268                 increment = !zSign && zSig2;
4269                 break;
4270             case float_round_down:
4271                 increment = zSign && zSig2;
4272                 break;
4273             case float_round_to_odd:
4274                 increment = !(zSig1 & 0x1) && zSig2;
4275                 break;
4276             default:
4277                 abort();
4278             }
4279         }
4280     }
4281     if (zSig2) {
4282         status->float_exception_flags |= float_flag_inexact;
4283     }
4284     if ( increment ) {
4285         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4286         if ((zSig2 + zSig2 == 0) && roundNearestEven) {
4287             zSig1 &= ~1;
4288         }
4289     }
4290     else {
4291         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4292     }
4293     return packFloat128( zSign, zExp, zSig0, zSig1 );
4294 
4295 }
4296 
4297 /*----------------------------------------------------------------------------
4298 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4299 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4300 | returns the proper quadruple-precision floating-point value corresponding
4301 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4302 | except that the input significand has fewer bits and does not have to be
4303 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4304 | point exponent.
4305 *----------------------------------------------------------------------------*/
4306 
4307 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4308                                               uint64_t zSig0, uint64_t zSig1,
4309                                               float_status *status)
4310 {
4311     int8_t shiftCount;
4312     uint64_t zSig2;
4313 
4314     if ( zSig0 == 0 ) {
4315         zSig0 = zSig1;
4316         zSig1 = 0;
4317         zExp -= 64;
4318     }
4319     shiftCount = clz64(zSig0) - 15;
4320     if ( 0 <= shiftCount ) {
4321         zSig2 = 0;
4322         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4323     }
4324     else {
4325         shift128ExtraRightJamming(
4326             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4327     }
4328     zExp -= shiftCount;
4329     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4330 
4331 }
4332 
4333 
4334 /*----------------------------------------------------------------------------
4335 | Returns the result of converting the 32-bit two's complement integer `a'
4336 | to the extended double-precision floating-point format.  The conversion
4337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4338 | Arithmetic.
4339 *----------------------------------------------------------------------------*/
4340 
4341 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4342 {
4343     bool zSign;
4344     uint32_t absA;
4345     int8_t shiftCount;
4346     uint64_t zSig;
4347 
4348     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4349     zSign = ( a < 0 );
4350     absA = zSign ? - a : a;
4351     shiftCount = clz32(absA) + 32;
4352     zSig = absA;
4353     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4354 
4355 }
4356 
4357 /*----------------------------------------------------------------------------
4358 | Returns the result of converting the 32-bit two's complement integer `a' to
4359 | the quadruple-precision floating-point format.  The conversion is performed
4360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4361 *----------------------------------------------------------------------------*/
4362 
4363 float128 int32_to_float128(int32_t a, float_status *status)
4364 {
4365     bool zSign;
4366     uint32_t absA;
4367     int8_t shiftCount;
4368     uint64_t zSig0;
4369 
4370     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4371     zSign = ( a < 0 );
4372     absA = zSign ? - a : a;
4373     shiftCount = clz32(absA) + 17;
4374     zSig0 = absA;
4375     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4376 
4377 }
4378 
4379 /*----------------------------------------------------------------------------
4380 | Returns the result of converting the 64-bit two's complement integer `a'
4381 | to the extended double-precision floating-point format.  The conversion
4382 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4383 | Arithmetic.
4384 *----------------------------------------------------------------------------*/
4385 
4386 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4387 {
4388     bool zSign;
4389     uint64_t absA;
4390     int8_t shiftCount;
4391 
4392     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4393     zSign = ( a < 0 );
4394     absA = zSign ? - a : a;
4395     shiftCount = clz64(absA);
4396     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4397 
4398 }
4399 
4400 /*----------------------------------------------------------------------------
4401 | Returns the result of converting the 64-bit two's complement integer `a' to
4402 | the quadruple-precision floating-point format.  The conversion is performed
4403 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4404 *----------------------------------------------------------------------------*/
4405 
4406 float128 int64_to_float128(int64_t a, float_status *status)
4407 {
4408     bool zSign;
4409     uint64_t absA;
4410     int8_t shiftCount;
4411     int32_t zExp;
4412     uint64_t zSig0, zSig1;
4413 
4414     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4415     zSign = ( a < 0 );
4416     absA = zSign ? - a : a;
4417     shiftCount = clz64(absA) + 49;
4418     zExp = 0x406E - shiftCount;
4419     if ( 64 <= shiftCount ) {
4420         zSig1 = 0;
4421         zSig0 = absA;
4422         shiftCount -= 64;
4423     }
4424     else {
4425         zSig1 = absA;
4426         zSig0 = 0;
4427     }
4428     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4429     return packFloat128( zSign, zExp, zSig0, zSig1 );
4430 
4431 }
4432 
4433 /*----------------------------------------------------------------------------
4434 | Returns the result of converting the 64-bit unsigned integer `a'
4435 | to the quadruple-precision floating-point format.  The conversion is performed
4436 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4437 *----------------------------------------------------------------------------*/
4438 
4439 float128 uint64_to_float128(uint64_t a, float_status *status)
4440 {
4441     if (a == 0) {
4442         return float128_zero;
4443     }
4444     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4445 }
4446 
4447 /*----------------------------------------------------------------------------
4448 | Returns the result of converting the single-precision floating-point value
4449 | `a' to the extended double-precision floating-point format.  The conversion
4450 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4451 | Arithmetic.
4452 *----------------------------------------------------------------------------*/
4453 
4454 floatx80 float32_to_floatx80(float32 a, float_status *status)
4455 {
4456     bool aSign;
4457     int aExp;
4458     uint32_t aSig;
4459 
4460     a = float32_squash_input_denormal(a, status);
4461     aSig = extractFloat32Frac( a );
4462     aExp = extractFloat32Exp( a );
4463     aSign = extractFloat32Sign( a );
4464     if ( aExp == 0xFF ) {
4465         if (aSig) {
4466             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
4467                                                status);
4468             return floatx80_silence_nan(res, status);
4469         }
4470         return packFloatx80(aSign,
4471                             floatx80_infinity_high,
4472                             floatx80_infinity_low);
4473     }
4474     if ( aExp == 0 ) {
4475         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4476         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4477     }
4478     aSig |= 0x00800000;
4479     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4480 
4481 }
4482 
4483 /*----------------------------------------------------------------------------
4484 | Returns the result of converting the single-precision floating-point value
4485 | `a' to the double-precision floating-point format.  The conversion is
4486 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4487 | Arithmetic.
4488 *----------------------------------------------------------------------------*/
4489 
4490 float128 float32_to_float128(float32 a, float_status *status)
4491 {
4492     bool aSign;
4493     int aExp;
4494     uint32_t aSig;
4495 
4496     a = float32_squash_input_denormal(a, status);
4497     aSig = extractFloat32Frac( a );
4498     aExp = extractFloat32Exp( a );
4499     aSign = extractFloat32Sign( a );
4500     if ( aExp == 0xFF ) {
4501         if (aSig) {
4502             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4503         }
4504         return packFloat128( aSign, 0x7FFF, 0, 0 );
4505     }
4506     if ( aExp == 0 ) {
4507         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4508         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4509         --aExp;
4510     }
4511     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4512 
4513 }
4514 
4515 /*----------------------------------------------------------------------------
4516 | Returns the remainder of the single-precision floating-point value `a'
4517 | with respect to the corresponding value `b'.  The operation is performed
4518 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4519 *----------------------------------------------------------------------------*/
4520 
4521 float32 float32_rem(float32 a, float32 b, float_status *status)
4522 {
4523     bool aSign, zSign;
4524     int aExp, bExp, expDiff;
4525     uint32_t aSig, bSig;
4526     uint32_t q;
4527     uint64_t aSig64, bSig64, q64;
4528     uint32_t alternateASig;
4529     int32_t sigMean;
4530     a = float32_squash_input_denormal(a, status);
4531     b = float32_squash_input_denormal(b, status);
4532 
4533     aSig = extractFloat32Frac( a );
4534     aExp = extractFloat32Exp( a );
4535     aSign = extractFloat32Sign( a );
4536     bSig = extractFloat32Frac( b );
4537     bExp = extractFloat32Exp( b );
4538     if ( aExp == 0xFF ) {
4539         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4540             return propagateFloat32NaN(a, b, status);
4541         }
4542         float_raise(float_flag_invalid, status);
4543         return float32_default_nan(status);
4544     }
4545     if ( bExp == 0xFF ) {
4546         if (bSig) {
4547             return propagateFloat32NaN(a, b, status);
4548         }
4549         return a;
4550     }
4551     if ( bExp == 0 ) {
4552         if ( bSig == 0 ) {
4553             float_raise(float_flag_invalid, status);
4554             return float32_default_nan(status);
4555         }
4556         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4557     }
4558     if ( aExp == 0 ) {
4559         if ( aSig == 0 ) return a;
4560         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4561     }
4562     expDiff = aExp - bExp;
4563     aSig |= 0x00800000;
4564     bSig |= 0x00800000;
4565     if ( expDiff < 32 ) {
4566         aSig <<= 8;
4567         bSig <<= 8;
4568         if ( expDiff < 0 ) {
4569             if ( expDiff < -1 ) return a;
4570             aSig >>= 1;
4571         }
4572         q = ( bSig <= aSig );
4573         if ( q ) aSig -= bSig;
4574         if ( 0 < expDiff ) {
4575             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4576             q >>= 32 - expDiff;
4577             bSig >>= 2;
4578             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4579         }
4580         else {
4581             aSig >>= 2;
4582             bSig >>= 2;
4583         }
4584     }
4585     else {
4586         if ( bSig <= aSig ) aSig -= bSig;
4587         aSig64 = ( (uint64_t) aSig )<<40;
4588         bSig64 = ( (uint64_t) bSig )<<40;
4589         expDiff -= 64;
4590         while ( 0 < expDiff ) {
4591             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4592             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4593             aSig64 = - ( ( bSig * q64 )<<38 );
4594             expDiff -= 62;
4595         }
4596         expDiff += 64;
4597         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4598         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4599         q = q64>>( 64 - expDiff );
4600         bSig <<= 6;
4601         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4602     }
4603     do {
4604         alternateASig = aSig;
4605         ++q;
4606         aSig -= bSig;
4607     } while ( 0 <= (int32_t) aSig );
4608     sigMean = aSig + alternateASig;
4609     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4610         aSig = alternateASig;
4611     }
4612     zSign = ( (int32_t) aSig < 0 );
4613     if ( zSign ) aSig = - aSig;
4614     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4615 }
4616 
4617 
4618 
4619 /*----------------------------------------------------------------------------
4620 | Returns the binary exponential of the single-precision floating-point value
4621 | `a'. The operation is performed according to the IEC/IEEE Standard for
4622 | Binary Floating-Point Arithmetic.
4623 |
4624 | Uses the following identities:
4625 |
4626 | 1. -------------------------------------------------------------------------
4627 |      x    x*ln(2)
4628 |     2  = e
4629 |
4630 | 2. -------------------------------------------------------------------------
4631 |                      2     3     4     5           n
4632 |      x        x     x     x     x     x           x
4633 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4634 |               1!    2!    3!    4!    5!          n!
4635 *----------------------------------------------------------------------------*/
4636 
4637 static const float64 float32_exp2_coefficients[15] =
4638 {
4639     const_float64( 0x3ff0000000000000ll ), /*  1 */
4640     const_float64( 0x3fe0000000000000ll ), /*  2 */
4641     const_float64( 0x3fc5555555555555ll ), /*  3 */
4642     const_float64( 0x3fa5555555555555ll ), /*  4 */
4643     const_float64( 0x3f81111111111111ll ), /*  5 */
4644     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4645     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4646     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4647     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4648     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4649     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4650     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4651     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4652     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4653     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4654 };
4655 
4656 float32 float32_exp2(float32 a, float_status *status)
4657 {
4658     bool aSign;
4659     int aExp;
4660     uint32_t aSig;
4661     float64 r, x, xn;
4662     int i;
4663     a = float32_squash_input_denormal(a, status);
4664 
4665     aSig = extractFloat32Frac( a );
4666     aExp = extractFloat32Exp( a );
4667     aSign = extractFloat32Sign( a );
4668 
4669     if ( aExp == 0xFF) {
4670         if (aSig) {
4671             return propagateFloat32NaN(a, float32_zero, status);
4672         }
4673         return (aSign) ? float32_zero : a;
4674     }
4675     if (aExp == 0) {
4676         if (aSig == 0) return float32_one;
4677     }
4678 
4679     float_raise(float_flag_inexact, status);
4680 
4681     /* ******************************* */
4682     /* using float64 for approximation */
4683     /* ******************************* */
4684     x = float32_to_float64(a, status);
4685     x = float64_mul(x, float64_ln2, status);
4686 
4687     xn = x;
4688     r = float64_one;
4689     for (i = 0 ; i < 15 ; i++) {
4690         float64 f;
4691 
4692         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4693         r = float64_add(r, f, status);
4694 
4695         xn = float64_mul(xn, x, status);
4696     }
4697 
4698     return float64_to_float32(r, status);
4699 }
4700 
4701 /*----------------------------------------------------------------------------
4702 | Returns the binary log of the single-precision floating-point value `a'.
4703 | The operation is performed according to the IEC/IEEE Standard for Binary
4704 | Floating-Point Arithmetic.
4705 *----------------------------------------------------------------------------*/
4706 float32 float32_log2(float32 a, float_status *status)
4707 {
4708     bool aSign, zSign;
4709     int aExp;
4710     uint32_t aSig, zSig, i;
4711 
4712     a = float32_squash_input_denormal(a, status);
4713     aSig = extractFloat32Frac( a );
4714     aExp = extractFloat32Exp( a );
4715     aSign = extractFloat32Sign( a );
4716 
4717     if ( aExp == 0 ) {
4718         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4719         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4720     }
4721     if ( aSign ) {
4722         float_raise(float_flag_invalid, status);
4723         return float32_default_nan(status);
4724     }
4725     if ( aExp == 0xFF ) {
4726         if (aSig) {
4727             return propagateFloat32NaN(a, float32_zero, status);
4728         }
4729         return a;
4730     }
4731 
4732     aExp -= 0x7F;
4733     aSig |= 0x00800000;
4734     zSign = aExp < 0;
4735     zSig = aExp << 23;
4736 
4737     for (i = 1 << 22; i > 0; i >>= 1) {
4738         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4739         if ( aSig & 0x01000000 ) {
4740             aSig >>= 1;
4741             zSig |= i;
4742         }
4743     }
4744 
4745     if ( zSign )
4746         zSig = -zSig;
4747 
4748     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4749 }
4750 
4751 /*----------------------------------------------------------------------------
4752 | Returns the result of converting the double-precision floating-point value
4753 | `a' to the extended double-precision floating-point format.  The conversion
4754 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4755 | Arithmetic.
4756 *----------------------------------------------------------------------------*/
4757 
4758 floatx80 float64_to_floatx80(float64 a, float_status *status)
4759 {
4760     bool aSign;
4761     int aExp;
4762     uint64_t aSig;
4763 
4764     a = float64_squash_input_denormal(a, status);
4765     aSig = extractFloat64Frac( a );
4766     aExp = extractFloat64Exp( a );
4767     aSign = extractFloat64Sign( a );
4768     if ( aExp == 0x7FF ) {
4769         if (aSig) {
4770             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
4771                                                status);
4772             return floatx80_silence_nan(res, status);
4773         }
4774         return packFloatx80(aSign,
4775                             floatx80_infinity_high,
4776                             floatx80_infinity_low);
4777     }
4778     if ( aExp == 0 ) {
4779         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4780         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4781     }
4782     return
4783         packFloatx80(
4784             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
4785 
4786 }
4787 
4788 /*----------------------------------------------------------------------------
4789 | Returns the result of converting the double-precision floating-point value
4790 | `a' to the quadruple-precision floating-point format.  The conversion is
4791 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4792 | Arithmetic.
4793 *----------------------------------------------------------------------------*/
4794 
4795 float128 float64_to_float128(float64 a, float_status *status)
4796 {
4797     bool aSign;
4798     int aExp;
4799     uint64_t aSig, zSig0, zSig1;
4800 
4801     a = float64_squash_input_denormal(a, status);
4802     aSig = extractFloat64Frac( a );
4803     aExp = extractFloat64Exp( a );
4804     aSign = extractFloat64Sign( a );
4805     if ( aExp == 0x7FF ) {
4806         if (aSig) {
4807             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4808         }
4809         return packFloat128( aSign, 0x7FFF, 0, 0 );
4810     }
4811     if ( aExp == 0 ) {
4812         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4813         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4814         --aExp;
4815     }
4816     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4817     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4818 
4819 }
4820 
4821 
4822 /*----------------------------------------------------------------------------
4823 | Returns the remainder of the double-precision floating-point value `a'
4824 | with respect to the corresponding value `b'.  The operation is performed
4825 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4826 *----------------------------------------------------------------------------*/
4827 
4828 float64 float64_rem(float64 a, float64 b, float_status *status)
4829 {
4830     bool aSign, zSign;
4831     int aExp, bExp, expDiff;
4832     uint64_t aSig, bSig;
4833     uint64_t q, alternateASig;
4834     int64_t sigMean;
4835 
4836     a = float64_squash_input_denormal(a, status);
4837     b = float64_squash_input_denormal(b, status);
4838     aSig = extractFloat64Frac( a );
4839     aExp = extractFloat64Exp( a );
4840     aSign = extractFloat64Sign( a );
4841     bSig = extractFloat64Frac( b );
4842     bExp = extractFloat64Exp( b );
4843     if ( aExp == 0x7FF ) {
4844         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4845             return propagateFloat64NaN(a, b, status);
4846         }
4847         float_raise(float_flag_invalid, status);
4848         return float64_default_nan(status);
4849     }
4850     if ( bExp == 0x7FF ) {
4851         if (bSig) {
4852             return propagateFloat64NaN(a, b, status);
4853         }
4854         return a;
4855     }
4856     if ( bExp == 0 ) {
4857         if ( bSig == 0 ) {
4858             float_raise(float_flag_invalid, status);
4859             return float64_default_nan(status);
4860         }
4861         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4862     }
4863     if ( aExp == 0 ) {
4864         if ( aSig == 0 ) return a;
4865         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4866     }
4867     expDiff = aExp - bExp;
4868     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
4869     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
4870     if ( expDiff < 0 ) {
4871         if ( expDiff < -1 ) return a;
4872         aSig >>= 1;
4873     }
4874     q = ( bSig <= aSig );
4875     if ( q ) aSig -= bSig;
4876     expDiff -= 64;
4877     while ( 0 < expDiff ) {
4878         q = estimateDiv128To64( aSig, 0, bSig );
4879         q = ( 2 < q ) ? q - 2 : 0;
4880         aSig = - ( ( bSig>>2 ) * q );
4881         expDiff -= 62;
4882     }
4883     expDiff += 64;
4884     if ( 0 < expDiff ) {
4885         q = estimateDiv128To64( aSig, 0, bSig );
4886         q = ( 2 < q ) ? q - 2 : 0;
4887         q >>= 64 - expDiff;
4888         bSig >>= 2;
4889         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4890     }
4891     else {
4892         aSig >>= 2;
4893         bSig >>= 2;
4894     }
4895     do {
4896         alternateASig = aSig;
4897         ++q;
4898         aSig -= bSig;
4899     } while ( 0 <= (int64_t) aSig );
4900     sigMean = aSig + alternateASig;
4901     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4902         aSig = alternateASig;
4903     }
4904     zSign = ( (int64_t) aSig < 0 );
4905     if ( zSign ) aSig = - aSig;
4906     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4907 
4908 }
4909 
4910 /*----------------------------------------------------------------------------
4911 | Returns the binary log of the double-precision floating-point value `a'.
4912 | The operation is performed according to the IEC/IEEE Standard for Binary
4913 | Floating-Point Arithmetic.
4914 *----------------------------------------------------------------------------*/
4915 float64 float64_log2(float64 a, float_status *status)
4916 {
4917     bool aSign, zSign;
4918     int aExp;
4919     uint64_t aSig, aSig0, aSig1, zSig, i;
4920     a = float64_squash_input_denormal(a, status);
4921 
4922     aSig = extractFloat64Frac( a );
4923     aExp = extractFloat64Exp( a );
4924     aSign = extractFloat64Sign( a );
4925 
4926     if ( aExp == 0 ) {
4927         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4928         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4929     }
4930     if ( aSign ) {
4931         float_raise(float_flag_invalid, status);
4932         return float64_default_nan(status);
4933     }
4934     if ( aExp == 0x7FF ) {
4935         if (aSig) {
4936             return propagateFloat64NaN(a, float64_zero, status);
4937         }
4938         return a;
4939     }
4940 
4941     aExp -= 0x3FF;
4942     aSig |= UINT64_C(0x0010000000000000);
4943     zSign = aExp < 0;
4944     zSig = (uint64_t)aExp << 52;
4945     for (i = 1LL << 51; i > 0; i >>= 1) {
4946         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4947         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4948         if ( aSig & UINT64_C(0x0020000000000000) ) {
4949             aSig >>= 1;
4950             zSig |= i;
4951         }
4952     }
4953 
4954     if ( zSign )
4955         zSig = -zSig;
4956     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4957 }
4958 
4959 /*----------------------------------------------------------------------------
4960 | Returns the result of converting the extended double-precision floating-
4961 | point value `a' to the 32-bit two's complement integer format.  The
4962 | conversion is performed according to the IEC/IEEE Standard for Binary
4963 | Floating-Point Arithmetic---which means in particular that the conversion
4964 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4965 | largest positive integer is returned.  Otherwise, if the conversion
4966 | overflows, the largest integer with the same sign as `a' is returned.
4967 *----------------------------------------------------------------------------*/
4968 
4969 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4970 {
4971     bool aSign;
4972     int32_t aExp, shiftCount;
4973     uint64_t aSig;
4974 
4975     if (floatx80_invalid_encoding(a)) {
4976         float_raise(float_flag_invalid, status);
4977         return 1 << 31;
4978     }
4979     aSig = extractFloatx80Frac( a );
4980     aExp = extractFloatx80Exp( a );
4981     aSign = extractFloatx80Sign( a );
4982     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4983     shiftCount = 0x4037 - aExp;
4984     if ( shiftCount <= 0 ) shiftCount = 1;
4985     shift64RightJamming( aSig, shiftCount, &aSig );
4986     return roundAndPackInt32(aSign, aSig, status);
4987 
4988 }
4989 
4990 /*----------------------------------------------------------------------------
4991 | Returns the result of converting the extended double-precision floating-
4992 | point value `a' to the 32-bit two's complement integer format.  The
4993 | conversion is performed according to the IEC/IEEE Standard for Binary
4994 | Floating-Point Arithmetic, except that the conversion is always rounded
4995 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4996 | Otherwise, if the conversion overflows, the largest integer with the same
4997 | sign as `a' is returned.
4998 *----------------------------------------------------------------------------*/
4999 
5000 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5001 {
5002     bool aSign;
5003     int32_t aExp, shiftCount;
5004     uint64_t aSig, savedASig;
5005     int32_t z;
5006 
5007     if (floatx80_invalid_encoding(a)) {
5008         float_raise(float_flag_invalid, status);
5009         return 1 << 31;
5010     }
5011     aSig = extractFloatx80Frac( a );
5012     aExp = extractFloatx80Exp( a );
5013     aSign = extractFloatx80Sign( a );
5014     if ( 0x401E < aExp ) {
5015         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5016         goto invalid;
5017     }
5018     else if ( aExp < 0x3FFF ) {
5019         if (aExp || aSig) {
5020             status->float_exception_flags |= float_flag_inexact;
5021         }
5022         return 0;
5023     }
5024     shiftCount = 0x403E - aExp;
5025     savedASig = aSig;
5026     aSig >>= shiftCount;
5027     z = aSig;
5028     if ( aSign ) z = - z;
5029     if ( ( z < 0 ) ^ aSign ) {
5030  invalid:
5031         float_raise(float_flag_invalid, status);
5032         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5033     }
5034     if ( ( aSig<<shiftCount ) != savedASig ) {
5035         status->float_exception_flags |= float_flag_inexact;
5036     }
5037     return z;
5038 
5039 }
5040 
5041 /*----------------------------------------------------------------------------
5042 | Returns the result of converting the extended double-precision floating-
5043 | point value `a' to the 64-bit two's complement integer format.  The
5044 | conversion is performed according to the IEC/IEEE Standard for Binary
5045 | Floating-Point Arithmetic---which means in particular that the conversion
5046 | is rounded according to the current rounding mode.  If `a' is a NaN,
5047 | the largest positive integer is returned.  Otherwise, if the conversion
5048 | overflows, the largest integer with the same sign as `a' is returned.
5049 *----------------------------------------------------------------------------*/
5050 
5051 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5052 {
5053     bool aSign;
5054     int32_t aExp, shiftCount;
5055     uint64_t aSig, aSigExtra;
5056 
5057     if (floatx80_invalid_encoding(a)) {
5058         float_raise(float_flag_invalid, status);
5059         return 1ULL << 63;
5060     }
5061     aSig = extractFloatx80Frac( a );
5062     aExp = extractFloatx80Exp( a );
5063     aSign = extractFloatx80Sign( a );
5064     shiftCount = 0x403E - aExp;
5065     if ( shiftCount <= 0 ) {
5066         if ( shiftCount ) {
5067             float_raise(float_flag_invalid, status);
5068             if (!aSign || floatx80_is_any_nan(a)) {
5069                 return INT64_MAX;
5070             }
5071             return INT64_MIN;
5072         }
5073         aSigExtra = 0;
5074     }
5075     else {
5076         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5077     }
5078     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5079 
5080 }
5081 
5082 /*----------------------------------------------------------------------------
5083 | Returns the result of converting the extended double-precision floating-
5084 | point value `a' to the 64-bit two's complement integer format.  The
5085 | conversion is performed according to the IEC/IEEE Standard for Binary
5086 | Floating-Point Arithmetic, except that the conversion is always rounded
5087 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5088 | Otherwise, if the conversion overflows, the largest integer with the same
5089 | sign as `a' is returned.
5090 *----------------------------------------------------------------------------*/
5091 
5092 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5093 {
5094     bool aSign;
5095     int32_t aExp, shiftCount;
5096     uint64_t aSig;
5097     int64_t z;
5098 
5099     if (floatx80_invalid_encoding(a)) {
5100         float_raise(float_flag_invalid, status);
5101         return 1ULL << 63;
5102     }
5103     aSig = extractFloatx80Frac( a );
5104     aExp = extractFloatx80Exp( a );
5105     aSign = extractFloatx80Sign( a );
5106     shiftCount = aExp - 0x403E;
5107     if ( 0 <= shiftCount ) {
5108         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5109         if ( ( a.high != 0xC03E ) || aSig ) {
5110             float_raise(float_flag_invalid, status);
5111             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5112                 return INT64_MAX;
5113             }
5114         }
5115         return INT64_MIN;
5116     }
5117     else if ( aExp < 0x3FFF ) {
5118         if (aExp | aSig) {
5119             status->float_exception_flags |= float_flag_inexact;
5120         }
5121         return 0;
5122     }
5123     z = aSig>>( - shiftCount );
5124     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5125         status->float_exception_flags |= float_flag_inexact;
5126     }
5127     if ( aSign ) z = - z;
5128     return z;
5129 
5130 }
5131 
5132 /*----------------------------------------------------------------------------
5133 | Returns the result of converting the extended double-precision floating-
5134 | point value `a' to the single-precision floating-point format.  The
5135 | conversion is performed according to the IEC/IEEE Standard for Binary
5136 | Floating-Point Arithmetic.
5137 *----------------------------------------------------------------------------*/
5138 
5139 float32 floatx80_to_float32(floatx80 a, float_status *status)
5140 {
5141     bool aSign;
5142     int32_t aExp;
5143     uint64_t aSig;
5144 
5145     if (floatx80_invalid_encoding(a)) {
5146         float_raise(float_flag_invalid, status);
5147         return float32_default_nan(status);
5148     }
5149     aSig = extractFloatx80Frac( a );
5150     aExp = extractFloatx80Exp( a );
5151     aSign = extractFloatx80Sign( a );
5152     if ( aExp == 0x7FFF ) {
5153         if ( (uint64_t) ( aSig<<1 ) ) {
5154             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5155                                              status);
5156             return float32_silence_nan(res, status);
5157         }
5158         return packFloat32( aSign, 0xFF, 0 );
5159     }
5160     shift64RightJamming( aSig, 33, &aSig );
5161     if ( aExp || aSig ) aExp -= 0x3F81;
5162     return roundAndPackFloat32(aSign, aExp, aSig, status);
5163 
5164 }
5165 
5166 /*----------------------------------------------------------------------------
5167 | Returns the result of converting the extended double-precision floating-
5168 | point value `a' to the double-precision floating-point format.  The
5169 | conversion is performed according to the IEC/IEEE Standard for Binary
5170 | Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5172 
5173 float64 floatx80_to_float64(floatx80 a, float_status *status)
5174 {
5175     bool aSign;
5176     int32_t aExp;
5177     uint64_t aSig, zSig;
5178 
5179     if (floatx80_invalid_encoding(a)) {
5180         float_raise(float_flag_invalid, status);
5181         return float64_default_nan(status);
5182     }
5183     aSig = extractFloatx80Frac( a );
5184     aExp = extractFloatx80Exp( a );
5185     aSign = extractFloatx80Sign( a );
5186     if ( aExp == 0x7FFF ) {
5187         if ( (uint64_t) ( aSig<<1 ) ) {
5188             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5189                                              status);
5190             return float64_silence_nan(res, status);
5191         }
5192         return packFloat64( aSign, 0x7FF, 0 );
5193     }
5194     shift64RightJamming( aSig, 1, &zSig );
5195     if ( aExp || aSig ) aExp -= 0x3C01;
5196     return roundAndPackFloat64(aSign, aExp, zSig, status);
5197 
5198 }
5199 
5200 /*----------------------------------------------------------------------------
5201 | Returns the result of converting the extended double-precision floating-
5202 | point value `a' to the quadruple-precision floating-point format.  The
5203 | conversion is performed according to the IEC/IEEE Standard for Binary
5204 | Floating-Point Arithmetic.
5205 *----------------------------------------------------------------------------*/
5206 
5207 float128 floatx80_to_float128(floatx80 a, float_status *status)
5208 {
5209     bool aSign;
5210     int aExp;
5211     uint64_t aSig, zSig0, zSig1;
5212 
5213     if (floatx80_invalid_encoding(a)) {
5214         float_raise(float_flag_invalid, status);
5215         return float128_default_nan(status);
5216     }
5217     aSig = extractFloatx80Frac( a );
5218     aExp = extractFloatx80Exp( a );
5219     aSign = extractFloatx80Sign( a );
5220     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5221         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5222                                            status);
5223         return float128_silence_nan(res, status);
5224     }
5225     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5226     return packFloat128( aSign, aExp, zSig0, zSig1 );
5227 
5228 }
5229 
5230 /*----------------------------------------------------------------------------
5231 | Rounds the extended double-precision floating-point value `a'
5232 | to the precision provided by floatx80_rounding_precision and returns the
5233 | result as an extended double-precision floating-point value.
5234 | The operation is performed according to the IEC/IEEE Standard for Binary
5235 | Floating-Point Arithmetic.
5236 *----------------------------------------------------------------------------*/
5237 
5238 floatx80 floatx80_round(floatx80 a, float_status *status)
5239 {
5240     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5241                                 extractFloatx80Sign(a),
5242                                 extractFloatx80Exp(a),
5243                                 extractFloatx80Frac(a), 0, status);
5244 }
5245 
5246 /*----------------------------------------------------------------------------
5247 | Rounds the extended double-precision floating-point value `a' to an integer,
5248 | and returns the result as an extended double-precision floating-point
5249 | value.  The operation is performed according to the IEC/IEEE Standard for
5250 | Binary Floating-Point Arithmetic.
5251 *----------------------------------------------------------------------------*/
5252 
5253 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5254 {
5255     bool aSign;
5256     int32_t aExp;
5257     uint64_t lastBitMask, roundBitsMask;
5258     floatx80 z;
5259 
5260     if (floatx80_invalid_encoding(a)) {
5261         float_raise(float_flag_invalid, status);
5262         return floatx80_default_nan(status);
5263     }
5264     aExp = extractFloatx80Exp( a );
5265     if ( 0x403E <= aExp ) {
5266         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5267             return propagateFloatx80NaN(a, a, status);
5268         }
5269         return a;
5270     }
5271     if ( aExp < 0x3FFF ) {
5272         if (    ( aExp == 0 )
5273              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5274             return a;
5275         }
5276         status->float_exception_flags |= float_flag_inexact;
5277         aSign = extractFloatx80Sign( a );
5278         switch (status->float_rounding_mode) {
5279          case float_round_nearest_even:
5280             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5281                ) {
5282                 return
5283                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5284             }
5285             break;
5286         case float_round_ties_away:
5287             if (aExp == 0x3FFE) {
5288                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5289             }
5290             break;
5291          case float_round_down:
5292             return
5293                   aSign ?
5294                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5295                 : packFloatx80( 0, 0, 0 );
5296          case float_round_up:
5297             return
5298                   aSign ? packFloatx80( 1, 0, 0 )
5299                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5300 
5301         case float_round_to_zero:
5302             break;
5303         default:
5304             g_assert_not_reached();
5305         }
5306         return packFloatx80( aSign, 0, 0 );
5307     }
5308     lastBitMask = 1;
5309     lastBitMask <<= 0x403E - aExp;
5310     roundBitsMask = lastBitMask - 1;
5311     z = a;
5312     switch (status->float_rounding_mode) {
5313     case float_round_nearest_even:
5314         z.low += lastBitMask>>1;
5315         if ((z.low & roundBitsMask) == 0) {
5316             z.low &= ~lastBitMask;
5317         }
5318         break;
5319     case float_round_ties_away:
5320         z.low += lastBitMask >> 1;
5321         break;
5322     case float_round_to_zero:
5323         break;
5324     case float_round_up:
5325         if (!extractFloatx80Sign(z)) {
5326             z.low += roundBitsMask;
5327         }
5328         break;
5329     case float_round_down:
5330         if (extractFloatx80Sign(z)) {
5331             z.low += roundBitsMask;
5332         }
5333         break;
5334     default:
5335         abort();
5336     }
5337     z.low &= ~ roundBitsMask;
5338     if ( z.low == 0 ) {
5339         ++z.high;
5340         z.low = UINT64_C(0x8000000000000000);
5341     }
5342     if (z.low != a.low) {
5343         status->float_exception_flags |= float_flag_inexact;
5344     }
5345     return z;
5346 
5347 }
5348 
5349 /*----------------------------------------------------------------------------
5350 | Returns the result of adding the absolute values of the extended double-
5351 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5352 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5353 | The addition is performed according to the IEC/IEEE Standard for Binary
5354 | Floating-Point Arithmetic.
5355 *----------------------------------------------------------------------------*/
5356 
5357 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5358                                 float_status *status)
5359 {
5360     int32_t aExp, bExp, zExp;
5361     uint64_t aSig, bSig, zSig0, zSig1;
5362     int32_t expDiff;
5363 
5364     aSig = extractFloatx80Frac( a );
5365     aExp = extractFloatx80Exp( a );
5366     bSig = extractFloatx80Frac( b );
5367     bExp = extractFloatx80Exp( b );
5368     expDiff = aExp - bExp;
5369     if ( 0 < expDiff ) {
5370         if ( aExp == 0x7FFF ) {
5371             if ((uint64_t)(aSig << 1)) {
5372                 return propagateFloatx80NaN(a, b, status);
5373             }
5374             return a;
5375         }
5376         if ( bExp == 0 ) --expDiff;
5377         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5378         zExp = aExp;
5379     }
5380     else if ( expDiff < 0 ) {
5381         if ( bExp == 0x7FFF ) {
5382             if ((uint64_t)(bSig << 1)) {
5383                 return propagateFloatx80NaN(a, b, status);
5384             }
5385             return packFloatx80(zSign,
5386                                 floatx80_infinity_high,
5387                                 floatx80_infinity_low);
5388         }
5389         if ( aExp == 0 ) ++expDiff;
5390         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5391         zExp = bExp;
5392     }
5393     else {
5394         if ( aExp == 0x7FFF ) {
5395             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5396                 return propagateFloatx80NaN(a, b, status);
5397             }
5398             return a;
5399         }
5400         zSig1 = 0;
5401         zSig0 = aSig + bSig;
5402         if ( aExp == 0 ) {
5403             if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
5404                 /* At least one of the values is a pseudo-denormal,
5405                  * and there is a carry out of the result.  */
5406                 zExp = 1;
5407                 goto shiftRight1;
5408             }
5409             if (zSig0 == 0) {
5410                 return packFloatx80(zSign, 0, 0);
5411             }
5412             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5413             goto roundAndPack;
5414         }
5415         zExp = aExp;
5416         goto shiftRight1;
5417     }
5418     zSig0 = aSig + bSig;
5419     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5420  shiftRight1:
5421     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5422     zSig0 |= UINT64_C(0x8000000000000000);
5423     ++zExp;
5424  roundAndPack:
5425     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5426                                 zSign, zExp, zSig0, zSig1, status);
5427 }
5428 
5429 /*----------------------------------------------------------------------------
5430 | Returns the result of subtracting the absolute values of the extended
5431 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5432 | difference is negated before being returned.  `zSign' is ignored if the
5433 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5434 | Standard for Binary Floating-Point Arithmetic.
5435 *----------------------------------------------------------------------------*/
5436 
5437 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5438                                 float_status *status)
5439 {
5440     int32_t aExp, bExp, zExp;
5441     uint64_t aSig, bSig, zSig0, zSig1;
5442     int32_t expDiff;
5443 
5444     aSig = extractFloatx80Frac( a );
5445     aExp = extractFloatx80Exp( a );
5446     bSig = extractFloatx80Frac( b );
5447     bExp = extractFloatx80Exp( b );
5448     expDiff = aExp - bExp;
5449     if ( 0 < expDiff ) goto aExpBigger;
5450     if ( expDiff < 0 ) goto bExpBigger;
5451     if ( aExp == 0x7FFF ) {
5452         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5453             return propagateFloatx80NaN(a, b, status);
5454         }
5455         float_raise(float_flag_invalid, status);
5456         return floatx80_default_nan(status);
5457     }
5458     if ( aExp == 0 ) {
5459         aExp = 1;
5460         bExp = 1;
5461     }
5462     zSig1 = 0;
5463     if ( bSig < aSig ) goto aBigger;
5464     if ( aSig < bSig ) goto bBigger;
5465     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5466  bExpBigger:
5467     if ( bExp == 0x7FFF ) {
5468         if ((uint64_t)(bSig << 1)) {
5469             return propagateFloatx80NaN(a, b, status);
5470         }
5471         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5472                             floatx80_infinity_low);
5473     }
5474     if ( aExp == 0 ) ++expDiff;
5475     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5476  bBigger:
5477     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5478     zExp = bExp;
5479     zSign ^= 1;
5480     goto normalizeRoundAndPack;
5481  aExpBigger:
5482     if ( aExp == 0x7FFF ) {
5483         if ((uint64_t)(aSig << 1)) {
5484             return propagateFloatx80NaN(a, b, status);
5485         }
5486         return a;
5487     }
5488     if ( bExp == 0 ) --expDiff;
5489     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5490  aBigger:
5491     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5492     zExp = aExp;
5493  normalizeRoundAndPack:
5494     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5495                                          zSign, zExp, zSig0, zSig1, status);
5496 }
5497 
5498 /*----------------------------------------------------------------------------
5499 | Returns the result of adding the extended double-precision floating-point
5500 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5501 | Standard for Binary Floating-Point Arithmetic.
5502 *----------------------------------------------------------------------------*/
5503 
5504 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5505 {
5506     bool aSign, bSign;
5507 
5508     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5509         float_raise(float_flag_invalid, status);
5510         return floatx80_default_nan(status);
5511     }
5512     aSign = extractFloatx80Sign( a );
5513     bSign = extractFloatx80Sign( b );
5514     if ( aSign == bSign ) {
5515         return addFloatx80Sigs(a, b, aSign, status);
5516     }
5517     else {
5518         return subFloatx80Sigs(a, b, aSign, status);
5519     }
5520 
5521 }
5522 
5523 /*----------------------------------------------------------------------------
5524 | Returns the result of subtracting the extended double-precision floating-
5525 | point values `a' and `b'.  The operation is performed according to the
5526 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5527 *----------------------------------------------------------------------------*/
5528 
5529 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5530 {
5531     bool aSign, bSign;
5532 
5533     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5534         float_raise(float_flag_invalid, status);
5535         return floatx80_default_nan(status);
5536     }
5537     aSign = extractFloatx80Sign( a );
5538     bSign = extractFloatx80Sign( b );
5539     if ( aSign == bSign ) {
5540         return subFloatx80Sigs(a, b, aSign, status);
5541     }
5542     else {
5543         return addFloatx80Sigs(a, b, aSign, status);
5544     }
5545 
5546 }
5547 
5548 /*----------------------------------------------------------------------------
5549 | Returns the result of multiplying the extended double-precision floating-
5550 | point values `a' and `b'.  The operation is performed according to the
5551 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5552 *----------------------------------------------------------------------------*/
5553 
5554 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5555 {
5556     bool aSign, bSign, zSign;
5557     int32_t aExp, bExp, zExp;
5558     uint64_t aSig, bSig, zSig0, zSig1;
5559 
5560     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5561         float_raise(float_flag_invalid, status);
5562         return floatx80_default_nan(status);
5563     }
5564     aSig = extractFloatx80Frac( a );
5565     aExp = extractFloatx80Exp( a );
5566     aSign = extractFloatx80Sign( a );
5567     bSig = extractFloatx80Frac( b );
5568     bExp = extractFloatx80Exp( b );
5569     bSign = extractFloatx80Sign( b );
5570     zSign = aSign ^ bSign;
5571     if ( aExp == 0x7FFF ) {
5572         if (    (uint64_t) ( aSig<<1 )
5573              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5574             return propagateFloatx80NaN(a, b, status);
5575         }
5576         if ( ( bExp | bSig ) == 0 ) goto invalid;
5577         return packFloatx80(zSign, floatx80_infinity_high,
5578                                    floatx80_infinity_low);
5579     }
5580     if ( bExp == 0x7FFF ) {
5581         if ((uint64_t)(bSig << 1)) {
5582             return propagateFloatx80NaN(a, b, status);
5583         }
5584         if ( ( aExp | aSig ) == 0 ) {
5585  invalid:
5586             float_raise(float_flag_invalid, status);
5587             return floatx80_default_nan(status);
5588         }
5589         return packFloatx80(zSign, floatx80_infinity_high,
5590                                    floatx80_infinity_low);
5591     }
5592     if ( aExp == 0 ) {
5593         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5594         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5595     }
5596     if ( bExp == 0 ) {
5597         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5598         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5599     }
5600     zExp = aExp + bExp - 0x3FFE;
5601     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5602     if ( 0 < (int64_t) zSig0 ) {
5603         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5604         --zExp;
5605     }
5606     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5607                                 zSign, zExp, zSig0, zSig1, status);
5608 }
5609 
5610 /*----------------------------------------------------------------------------
5611 | Returns the result of dividing the extended double-precision floating-point
5612 | value `a' by the corresponding value `b'.  The operation is performed
5613 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5614 *----------------------------------------------------------------------------*/
5615 
5616 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5617 {
5618     bool aSign, bSign, zSign;
5619     int32_t aExp, bExp, zExp;
5620     uint64_t aSig, bSig, zSig0, zSig1;
5621     uint64_t rem0, rem1, rem2, term0, term1, term2;
5622 
5623     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5624         float_raise(float_flag_invalid, status);
5625         return floatx80_default_nan(status);
5626     }
5627     aSig = extractFloatx80Frac( a );
5628     aExp = extractFloatx80Exp( a );
5629     aSign = extractFloatx80Sign( a );
5630     bSig = extractFloatx80Frac( b );
5631     bExp = extractFloatx80Exp( b );
5632     bSign = extractFloatx80Sign( b );
5633     zSign = aSign ^ bSign;
5634     if ( aExp == 0x7FFF ) {
5635         if ((uint64_t)(aSig << 1)) {
5636             return propagateFloatx80NaN(a, b, status);
5637         }
5638         if ( bExp == 0x7FFF ) {
5639             if ((uint64_t)(bSig << 1)) {
5640                 return propagateFloatx80NaN(a, b, status);
5641             }
5642             goto invalid;
5643         }
5644         return packFloatx80(zSign, floatx80_infinity_high,
5645                                    floatx80_infinity_low);
5646     }
5647     if ( bExp == 0x7FFF ) {
5648         if ((uint64_t)(bSig << 1)) {
5649             return propagateFloatx80NaN(a, b, status);
5650         }
5651         return packFloatx80( zSign, 0, 0 );
5652     }
5653     if ( bExp == 0 ) {
5654         if ( bSig == 0 ) {
5655             if ( ( aExp | aSig ) == 0 ) {
5656  invalid:
5657                 float_raise(float_flag_invalid, status);
5658                 return floatx80_default_nan(status);
5659             }
5660             float_raise(float_flag_divbyzero, status);
5661             return packFloatx80(zSign, floatx80_infinity_high,
5662                                        floatx80_infinity_low);
5663         }
5664         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5665     }
5666     if ( aExp == 0 ) {
5667         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5668         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5669     }
5670     zExp = aExp - bExp + 0x3FFE;
5671     rem1 = 0;
5672     if ( bSig <= aSig ) {
5673         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5674         ++zExp;
5675     }
5676     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5677     mul64To128( bSig, zSig0, &term0, &term1 );
5678     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5679     while ( (int64_t) rem0 < 0 ) {
5680         --zSig0;
5681         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5682     }
5683     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5684     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5685         mul64To128( bSig, zSig1, &term1, &term2 );
5686         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5687         while ( (int64_t) rem1 < 0 ) {
5688             --zSig1;
5689             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5690         }
5691         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5692     }
5693     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5694                                 zSign, zExp, zSig0, zSig1, status);
5695 }
5696 
5697 /*----------------------------------------------------------------------------
5698 | Returns the remainder of the extended double-precision floating-point value
5699 | `a' with respect to the corresponding value `b'.  The operation is performed
5700 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
5701 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
5702 | the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
5703 | the absolute value of the integer quotient.
5704 *----------------------------------------------------------------------------*/
5705 
5706 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
5707                          float_status *status)
5708 {
5709     bool aSign, zSign;
5710     int32_t aExp, bExp, expDiff, aExpOrig;
5711     uint64_t aSig0, aSig1, bSig;
5712     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5713 
5714     *quotient = 0;
5715     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5716         float_raise(float_flag_invalid, status);
5717         return floatx80_default_nan(status);
5718     }
5719     aSig0 = extractFloatx80Frac( a );
5720     aExpOrig = aExp = extractFloatx80Exp( a );
5721     aSign = extractFloatx80Sign( a );
5722     bSig = extractFloatx80Frac( b );
5723     bExp = extractFloatx80Exp( b );
5724     if ( aExp == 0x7FFF ) {
5725         if (    (uint64_t) ( aSig0<<1 )
5726              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5727             return propagateFloatx80NaN(a, b, status);
5728         }
5729         goto invalid;
5730     }
5731     if ( bExp == 0x7FFF ) {
5732         if ((uint64_t)(bSig << 1)) {
5733             return propagateFloatx80NaN(a, b, status);
5734         }
5735         if (aExp == 0 && aSig0 >> 63) {
5736             /*
5737              * Pseudo-denormal argument must be returned in normalized
5738              * form.
5739              */
5740             return packFloatx80(aSign, 1, aSig0);
5741         }
5742         return a;
5743     }
5744     if ( bExp == 0 ) {
5745         if ( bSig == 0 ) {
5746  invalid:
5747             float_raise(float_flag_invalid, status);
5748             return floatx80_default_nan(status);
5749         }
5750         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5751     }
5752     if ( aExp == 0 ) {
5753         if ( aSig0 == 0 ) return a;
5754         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5755     }
5756     zSign = aSign;
5757     expDiff = aExp - bExp;
5758     aSig1 = 0;
5759     if ( expDiff < 0 ) {
5760         if ( mod || expDiff < -1 ) {
5761             if (aExp == 1 && aExpOrig == 0) {
5762                 /*
5763                  * Pseudo-denormal argument must be returned in
5764                  * normalized form.
5765                  */
5766                 return packFloatx80(aSign, aExp, aSig0);
5767             }
5768             return a;
5769         }
5770         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5771         expDiff = 0;
5772     }
5773     *quotient = q = ( bSig <= aSig0 );
5774     if ( q ) aSig0 -= bSig;
5775     expDiff -= 64;
5776     while ( 0 < expDiff ) {
5777         q = estimateDiv128To64( aSig0, aSig1, bSig );
5778         q = ( 2 < q ) ? q - 2 : 0;
5779         mul64To128( bSig, q, &term0, &term1 );
5780         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5781         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5782         expDiff -= 62;
5783         *quotient <<= 62;
5784         *quotient += q;
5785     }
5786     expDiff += 64;
5787     if ( 0 < expDiff ) {
5788         q = estimateDiv128To64( aSig0, aSig1, bSig );
5789         q = ( 2 < q ) ? q - 2 : 0;
5790         q >>= 64 - expDiff;
5791         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5792         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5793         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5794         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5795             ++q;
5796             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5797         }
5798         if (expDiff < 64) {
5799             *quotient <<= expDiff;
5800         } else {
5801             *quotient = 0;
5802         }
5803         *quotient += q;
5804     }
5805     else {
5806         term1 = 0;
5807         term0 = bSig;
5808     }
5809     if (!mod) {
5810         sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5811         if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5812                 || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5813                         && ( q & 1 ) )
5814             ) {
5815             aSig0 = alternateASig0;
5816             aSig1 = alternateASig1;
5817             zSign = ! zSign;
5818             ++*quotient;
5819         }
5820     }
5821     return
5822         normalizeRoundAndPackFloatx80(
5823             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5824 
5825 }
5826 
5827 /*----------------------------------------------------------------------------
5828 | Returns the remainder of the extended double-precision floating-point value
5829 | `a' with respect to the corresponding value `b'.  The operation is performed
5830 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5831 *----------------------------------------------------------------------------*/
5832 
5833 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5834 {
5835     uint64_t quotient;
5836     return floatx80_modrem(a, b, false, &quotient, status);
5837 }
5838 
5839 /*----------------------------------------------------------------------------
5840 | Returns the remainder of the extended double-precision floating-point value
5841 | `a' with respect to the corresponding value `b', with the quotient truncated
5842 | toward zero.
5843 *----------------------------------------------------------------------------*/
5844 
5845 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
5846 {
5847     uint64_t quotient;
5848     return floatx80_modrem(a, b, true, &quotient, status);
5849 }
5850 
/*----------------------------------------------------------------------------
| Returns the square root of the extended double-precision floating-point
| value `a'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
|
| Special cases handled before the main computation:
|   - an invalid floatx80 encoding raises the invalid exception and yields
|     the default NaN;
|   - a NaN operand is handed to propagateFloatx80NaN();
|   - positive infinity is returned unchanged, and a zero yields a zero of
|     the same sign;
|   - any other negative operand (including negative infinity) raises the
|     invalid exception and yields the default NaN.
| The final result is rounded to status->floatx80_rounding_precision.
*----------------------------------------------------------------------------*/

floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    /* Maximum exponent: the operand is a NaN or an infinity. */
    if ( aExp == 0x7FFF ) {
        /* Shifting out the top (explicit integer) bit leaves the fraction. */
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        /* sqrt(-inf) is invalid. */
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned as is; other negatives are invalid. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Result exponent: halve the unbiased exponent, then re-apply the bias. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /*
     * First approximation of the upper 64 bits of the root.
     * NOTE(review): estimateSqrt32() and estimateDiv128To64() are defined
     * elsewhere; presumably they may overshoot, which is what the correction
     * loop below compensates for -- confirm against their definitions.
     */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    /* The shift amount depends on the parity of the exponent. */
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = aSig - zSig0^2 as a 128-bit quantity. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    /*
     * While the remainder is negative the estimate is too large: step zSig0
     * down by one and add 2*zSig0 + 1 back (the difference between the two
     * consecutive squares), keeping doubleZSig0 == 2*zSig0 (mod 2^64).
     */
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Estimate the next 64 result bits from the remaining remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /*
     * Only when the low 62 bits of the estimate are at most 5 is the exact
     * remainder computed.  NOTE(review): presumably the estimate error is
     * small enough that rounding can only be affected in this case.
     */
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        /* rem -= 2*zSig0*zSig1, then rem -= zSig1^2 (192-bit remainder). */
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        /* Same downward correction as above, now applied to zSig1. */
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* A non-zero remainder sets the lowest bit as a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /*
     * Assemble the 128-bit significand: zSig1 is shifted left by one, the bit
     * shifted out lands in zSig0 and is merged with doubleZSig0.
     */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
5921 
5922 /*----------------------------------------------------------------------------
5923 | Returns the result of converting the quadruple-precision floating-point
5924 | value `a' to the 32-bit two's complement integer format.  The conversion
5925 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5926 | Arithmetic---which means in particular that the conversion is rounded
5927 | according to the current rounding mode.  If `a' is a NaN, the largest
5928 | positive integer is returned.  Otherwise, if the conversion overflows, the
5929 | largest integer with the same sign as `a' is returned.
5930 *----------------------------------------------------------------------------*/
5931 
5932 int32_t float128_to_int32(float128 a, float_status *status)
5933 {
5934     bool aSign;
5935     int32_t aExp, shiftCount;
5936     uint64_t aSig0, aSig1;
5937 
5938     aSig1 = extractFloat128Frac1( a );
5939     aSig0 = extractFloat128Frac0( a );
5940     aExp = extractFloat128Exp( a );
5941     aSign = extractFloat128Sign( a );
5942     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5943     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
5944     aSig0 |= ( aSig1 != 0 );
5945     shiftCount = 0x4028 - aExp;
5946     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5947     return roundAndPackInt32(aSign, aSig0, status);
5948 
5949 }
5950 
5951 /*----------------------------------------------------------------------------
5952 | Returns the result of converting the quadruple-precision floating-point
5953 | value `a' to the 32-bit two's complement integer format.  The conversion
5954 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5955 | Arithmetic, except that the conversion is always rounded toward zero.  If
5956 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5957 | conversion overflows, the largest integer with the same sign as `a' is
5958 | returned.
5959 *----------------------------------------------------------------------------*/
5960 
5961 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5962 {
5963     bool aSign;
5964     int32_t aExp, shiftCount;
5965     uint64_t aSig0, aSig1, savedASig;
5966     int32_t z;
5967 
5968     aSig1 = extractFloat128Frac1( a );
5969     aSig0 = extractFloat128Frac0( a );
5970     aExp = extractFloat128Exp( a );
5971     aSign = extractFloat128Sign( a );
5972     aSig0 |= ( aSig1 != 0 );
5973     if ( 0x401E < aExp ) {
5974         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5975         goto invalid;
5976     }
5977     else if ( aExp < 0x3FFF ) {
5978         if (aExp || aSig0) {
5979             status->float_exception_flags |= float_flag_inexact;
5980         }
5981         return 0;
5982     }
5983     aSig0 |= UINT64_C(0x0001000000000000);
5984     shiftCount = 0x402F - aExp;
5985     savedASig = aSig0;
5986     aSig0 >>= shiftCount;
5987     z = aSig0;
5988     if ( aSign ) z = - z;
5989     if ( ( z < 0 ) ^ aSign ) {
5990  invalid:
5991         float_raise(float_flag_invalid, status);
5992         return aSign ? INT32_MIN : INT32_MAX;
5993     }
5994     if ( ( aSig0<<shiftCount ) != savedASig ) {
5995         status->float_exception_flags |= float_flag_inexact;
5996     }
5997     return z;
5998 
5999 }
6000 
6001 /*----------------------------------------------------------------------------
6002 | Returns the result of converting the quadruple-precision floating-point
6003 | value `a' to the 64-bit two's complement integer format.  The conversion
6004 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6005 | Arithmetic---which means in particular that the conversion is rounded
6006 | according to the current rounding mode.  If `a' is a NaN, the largest
6007 | positive integer is returned.  Otherwise, if the conversion overflows, the
6008 | largest integer with the same sign as `a' is returned.
6009 *----------------------------------------------------------------------------*/
6010 
6011 int64_t float128_to_int64(float128 a, float_status *status)
6012 {
6013     bool aSign;
6014     int32_t aExp, shiftCount;
6015     uint64_t aSig0, aSig1;
6016 
6017     aSig1 = extractFloat128Frac1( a );
6018     aSig0 = extractFloat128Frac0( a );
6019     aExp = extractFloat128Exp( a );
6020     aSign = extractFloat128Sign( a );
6021     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6022     shiftCount = 0x402F - aExp;
6023     if ( shiftCount <= 0 ) {
6024         if ( 0x403E < aExp ) {
6025             float_raise(float_flag_invalid, status);
6026             if (    ! aSign
6027                  || (    ( aExp == 0x7FFF )
6028                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6029                     )
6030                ) {
6031                 return INT64_MAX;
6032             }
6033             return INT64_MIN;
6034         }
6035         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6036     }
6037     else {
6038         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6039     }
6040     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6041 
6042 }
6043 
6044 /*----------------------------------------------------------------------------
6045 | Returns the result of converting the quadruple-precision floating-point
6046 | value `a' to the 64-bit two's complement integer format.  The conversion
6047 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6048 | Arithmetic, except that the conversion is always rounded toward zero.
6049 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6050 | the conversion overflows, the largest integer with the same sign as `a' is
6051 | returned.
6052 *----------------------------------------------------------------------------*/
6053 
6054 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6055 {
6056     bool aSign;
6057     int32_t aExp, shiftCount;
6058     uint64_t aSig0, aSig1;
6059     int64_t z;
6060 
6061     aSig1 = extractFloat128Frac1( a );
6062     aSig0 = extractFloat128Frac0( a );
6063     aExp = extractFloat128Exp( a );
6064     aSign = extractFloat128Sign( a );
6065     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6066     shiftCount = aExp - 0x402F;
6067     if ( 0 < shiftCount ) {
6068         if ( 0x403E <= aExp ) {
6069             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6070             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6071                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6072                 if (aSig1) {
6073                     status->float_exception_flags |= float_flag_inexact;
6074                 }
6075             }
6076             else {
6077                 float_raise(float_flag_invalid, status);
6078                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6079                     return INT64_MAX;
6080                 }
6081             }
6082             return INT64_MIN;
6083         }
6084         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6085         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6086             status->float_exception_flags |= float_flag_inexact;
6087         }
6088     }
6089     else {
6090         if ( aExp < 0x3FFF ) {
6091             if ( aExp | aSig0 | aSig1 ) {
6092                 status->float_exception_flags |= float_flag_inexact;
6093             }
6094             return 0;
6095         }
6096         z = aSig0>>( - shiftCount );
6097         if (    aSig1
6098              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6099             status->float_exception_flags |= float_flag_inexact;
6100         }
6101     }
6102     if ( aSign ) z = - z;
6103     return z;
6104 
6105 }
6106 
6107 /*----------------------------------------------------------------------------
6108 | Returns the result of converting the quadruple-precision floating-point value
6109 | `a' to the 64-bit unsigned integer format.  The conversion is
6110 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6111 | Arithmetic---which means in particular that the conversion is rounded
6112 | according to the current rounding mode.  If `a' is a NaN, the largest
6113 | positive integer is returned.  If the conversion overflows, the
6114 | largest unsigned integer is returned.  If 'a' is negative, the value is
6115 | rounded and zero is returned; negative values that do not round to zero
6116 | will raise the inexact exception.
6117 *----------------------------------------------------------------------------*/
6118 
6119 uint64_t float128_to_uint64(float128 a, float_status *status)
6120 {
6121     bool aSign;
6122     int aExp;
6123     int shiftCount;
6124     uint64_t aSig0, aSig1;
6125 
6126     aSig0 = extractFloat128Frac0(a);
6127     aSig1 = extractFloat128Frac1(a);
6128     aExp = extractFloat128Exp(a);
6129     aSign = extractFloat128Sign(a);
6130     if (aSign && (aExp > 0x3FFE)) {
6131         float_raise(float_flag_invalid, status);
6132         if (float128_is_any_nan(a)) {
6133             return UINT64_MAX;
6134         } else {
6135             return 0;
6136         }
6137     }
6138     if (aExp) {
6139         aSig0 |= UINT64_C(0x0001000000000000);
6140     }
6141     shiftCount = 0x402F - aExp;
6142     if (shiftCount <= 0) {
6143         if (0x403E < aExp) {
6144             float_raise(float_flag_invalid, status);
6145             return UINT64_MAX;
6146         }
6147         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6148     } else {
6149         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6150     }
6151     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6152 }
6153 
6154 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6155 {
6156     uint64_t v;
6157     signed char current_rounding_mode = status->float_rounding_mode;
6158 
6159     set_float_rounding_mode(float_round_to_zero, status);
6160     v = float128_to_uint64(a, status);
6161     set_float_rounding_mode(current_rounding_mode, status);
6162 
6163     return v;
6164 }
6165 
6166 /*----------------------------------------------------------------------------
6167 | Returns the result of converting the quadruple-precision floating-point
6168 | value `a' to the 32-bit unsigned integer format.  The conversion
6169 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6170 | Arithmetic except that the conversion is always rounded toward zero.
6171 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6172 | if the conversion overflows, the largest unsigned integer is returned.
6173 | If 'a' is negative, the value is rounded and zero is returned; negative
6174 | values that do not round to zero will raise the inexact exception.
6175 *----------------------------------------------------------------------------*/
6176 
6177 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6178 {
6179     uint64_t v;
6180     uint32_t res;
6181     int old_exc_flags = get_float_exception_flags(status);
6182 
6183     v = float128_to_uint64_round_to_zero(a, status);
6184     if (v > 0xffffffff) {
6185         res = 0xffffffff;
6186     } else {
6187         return v;
6188     }
6189     set_float_exception_flags(old_exc_flags, status);
6190     float_raise(float_flag_invalid, status);
6191     return res;
6192 }
6193 
6194 /*----------------------------------------------------------------------------
6195 | Returns the result of converting the quadruple-precision floating-point value
6196 | `a' to the 32-bit unsigned integer format.  The conversion is
6197 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6198 | Arithmetic---which means in particular that the conversion is rounded
6199 | according to the current rounding mode.  If `a' is a NaN, the largest
6200 | positive integer is returned.  If the conversion overflows, the
6201 | largest unsigned integer is returned.  If 'a' is negative, the value is
6202 | rounded and zero is returned; negative values that do not round to zero
6203 | will raise the inexact exception.
6204 *----------------------------------------------------------------------------*/
6205 
6206 uint32_t float128_to_uint32(float128 a, float_status *status)
6207 {
6208     uint64_t v;
6209     uint32_t res;
6210     int old_exc_flags = get_float_exception_flags(status);
6211 
6212     v = float128_to_uint64(a, status);
6213     if (v > 0xffffffff) {
6214         res = 0xffffffff;
6215     } else {
6216         return v;
6217     }
6218     set_float_exception_flags(old_exc_flags, status);
6219     float_raise(float_flag_invalid, status);
6220     return res;
6221 }
6222 
6223 /*----------------------------------------------------------------------------
6224 | Returns the result of converting the quadruple-precision floating-point
6225 | value `a' to the single-precision floating-point format.  The conversion
6226 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6227 | Arithmetic.
6228 *----------------------------------------------------------------------------*/
6229 
6230 float32 float128_to_float32(float128 a, float_status *status)
6231 {
6232     bool aSign;
6233     int32_t aExp;
6234     uint64_t aSig0, aSig1;
6235     uint32_t zSig;
6236 
6237     aSig1 = extractFloat128Frac1( a );
6238     aSig0 = extractFloat128Frac0( a );
6239     aExp = extractFloat128Exp( a );
6240     aSign = extractFloat128Sign( a );
6241     if ( aExp == 0x7FFF ) {
6242         if ( aSig0 | aSig1 ) {
6243             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6244         }
6245         return packFloat32( aSign, 0xFF, 0 );
6246     }
6247     aSig0 |= ( aSig1 != 0 );
6248     shift64RightJamming( aSig0, 18, &aSig0 );
6249     zSig = aSig0;
6250     if ( aExp || zSig ) {
6251         zSig |= 0x40000000;
6252         aExp -= 0x3F81;
6253     }
6254     return roundAndPackFloat32(aSign, aExp, zSig, status);
6255 
6256 }
6257 
6258 /*----------------------------------------------------------------------------
6259 | Returns the result of converting the quadruple-precision floating-point
6260 | value `a' to the double-precision floating-point format.  The conversion
6261 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6262 | Arithmetic.
6263 *----------------------------------------------------------------------------*/
6264 
6265 float64 float128_to_float64(float128 a, float_status *status)
6266 {
6267     bool aSign;
6268     int32_t aExp;
6269     uint64_t aSig0, aSig1;
6270 
6271     aSig1 = extractFloat128Frac1( a );
6272     aSig0 = extractFloat128Frac0( a );
6273     aExp = extractFloat128Exp( a );
6274     aSign = extractFloat128Sign( a );
6275     if ( aExp == 0x7FFF ) {
6276         if ( aSig0 | aSig1 ) {
6277             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6278         }
6279         return packFloat64( aSign, 0x7FF, 0 );
6280     }
6281     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6282     aSig0 |= ( aSig1 != 0 );
6283     if ( aExp || aSig0 ) {
6284         aSig0 |= UINT64_C(0x4000000000000000);
6285         aExp -= 0x3C01;
6286     }
6287     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6288 
6289 }
6290 
6291 /*----------------------------------------------------------------------------
6292 | Returns the result of converting the quadruple-precision floating-point
6293 | value `a' to the extended double-precision floating-point format.  The
6294 | conversion is performed according to the IEC/IEEE Standard for Binary
6295 | Floating-Point Arithmetic.
6296 *----------------------------------------------------------------------------*/
6297 
6298 floatx80 float128_to_floatx80(float128 a, float_status *status)
6299 {
6300     bool aSign;
6301     int32_t aExp;
6302     uint64_t aSig0, aSig1;
6303 
6304     aSig1 = extractFloat128Frac1( a );
6305     aSig0 = extractFloat128Frac0( a );
6306     aExp = extractFloat128Exp( a );
6307     aSign = extractFloat128Sign( a );
6308     if ( aExp == 0x7FFF ) {
6309         if ( aSig0 | aSig1 ) {
6310             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6311                                                status);
6312             return floatx80_silence_nan(res, status);
6313         }
6314         return packFloatx80(aSign, floatx80_infinity_high,
6315                                    floatx80_infinity_low);
6316     }
6317     if ( aExp == 0 ) {
6318         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6319         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6320     }
6321     else {
6322         aSig0 |= UINT64_C(0x0001000000000000);
6323     }
6324     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6325     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6326 
6327 }
6328 
/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| The rounding direction is taken from status->float_rounding_mode.  The
| inexact flag is set whenever the returned value differs from `a'.  The
| function is split into three exponent ranges:
|   - aExp >= 0x402F: the last integer bit lies in the low word, or is the
|     least significant bit of the high word;
|   - aExp <  0x3FFF: the magnitude is below one, so the result is a zero or
|     a one of the appropriate sign;
|   - otherwise: the last integer bit lies in the high word and the entire
|     low word is fraction.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        if ( 0x406F <= aExp ) {
            /* No fraction bits remain: only a NaN needs special handling. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /*
         * lastBitMask selects the last integer bit within the low word.  The
         * shift is done in two steps so that a total shift of 64 (when
         * aExp == 0x402F) produces 0 instead of an out-of-range shift; in
         * that case roundBitsMask becomes all ones.
         */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half a unit; on an exact tie clear the last bit. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* The half bit is the top bit of the low word. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            /* Add half a unit; the fraction is cleared after the switch. */
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only positive values are pushed away from zero. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative values are pushed away from zero. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Discard the fraction bits. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* A zero of either sign is returned unchanged and exact. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            /*
             * NOTE(review): this switch has no default case; a rounding mode
             * not listed here falls through to the signed-zero return below.
             */
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* Strictly above one half rounds to one. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* One half or above rounds to one. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /*
         * The last integer bit lies in the high word; the low word is all
         * fraction and only contributes as a sticky indication (a.low != 0).
         */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie: force the last integer bit to be even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        /* Discard the fraction bits held in the high word. */
        z.high &= ~ roundBitsMask;
    }
    /* Any change relative to the input means the rounding was inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}
6487 
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| The operand with the smaller exponent is shifted right to align it with
| the other one; the bits shifted out are collected in zSig2.  When both
| operands have a zero exponent the sum is packed directly, or flushed to
| zero (raising float_flag_output_denormal for a non-zero sum) when
| status->flush_to_zero is set.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Infinity plus a finite value. */
            return a;
        }
        if ( bExp == 0 ) {
            /* b has no implicit bit; shift it by one position less. */
            --expDiff;
        }
        else {
            bSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent; mirror image of the case above. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponents are zero: no implicit bits were added. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        /* Account for the sum of the two implicit bits (one bit higher). */
        zSig2 = 0;
        zSig0 |= UINT64_C(0x0002000000000000);
        zExp = aExp;
        goto shiftRight1;
    }
    /*
     * Unequal exponents.  Setting this bit in aSig0 adds the implicit bit of
     * the larger-exponent operand: for expDiff > 0 it is a's own bit; for
     * expDiff < 0 the bit is clear in the aligned aSig0, so setting it has
     * the same effect on the sum as b's implicit bit would.
     */
    aSig0 |= UINT64_C(0x0001000000000000);
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    /*
     * If the sum stays below 0x0002000000000000 it is rounded as is with the
     * exponent reduced by one; otherwise the exponent is restored and the sum
     * is shifted right one bit.
     */
    --zExp;
    if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
6578 
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| Both significands are shifted left by 14 bits before the operands are
| aligned, which is why the implicit bit appears here as
| 0x4000000000000000 and why 14 is subtracted from the exponent passed to
| normalizeRoundAndPackFloat128().  Subtracting two infinities raises the
| invalid exception and returns the default NaN.  An exactly zero difference
| is negative only when the rounding mode is float_round_down.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Pre-shift both significands left by 14 bits. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Infinity minus infinity. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Both exponents are zero: use 1 as the exponent for the result. */
        aExp = 1;
        bExp = 1;
    }
    /* Decide which significand is larger, high word first. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: the zero is negative only when rounding down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Finite minus infinity: infinity with the opposite sign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* a has no implicit bit; shift it by one position less. */
        ++expDiff;
    }
    else {
        aSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= UINT64_C(0x4000000000000000);
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Infinity minus a finite value. */
        return a;
    }
    if ( bExp == 0 ) {
        /* b has no implicit bit; shift it by one position less. */
        --expDiff;
    }
    else {
        bSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= UINT64_C(0x4000000000000000);
 aBigger:
    /* |a| > |b|: compute a - b, the sign stays as requested. */
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* Subtracting 14 here accounts for the 14-bit pre-shift above. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}
6666 
6667 /*----------------------------------------------------------------------------
6668 | Returns the result of adding the quadruple-precision floating-point values
6669 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6670 | for Binary Floating-Point Arithmetic.
6671 *----------------------------------------------------------------------------*/
6672 
6673 float128 float128_add(float128 a, float128 b, float_status *status)
6674 {
6675     bool aSign, bSign;
6676 
6677     aSign = extractFloat128Sign( a );
6678     bSign = extractFloat128Sign( b );
6679     if ( aSign == bSign ) {
6680         return addFloat128Sigs(a, b, aSign, status);
6681     }
6682     else {
6683         return subFloat128Sigs(a, b, aSign, status);
6684     }
6685 
6686 }
6687 
6688 /*----------------------------------------------------------------------------
6689 | Returns the result of subtracting the quadruple-precision floating-point
6690 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6691 | Standard for Binary Floating-Point Arithmetic.
6692 *----------------------------------------------------------------------------*/
6693 
6694 float128 float128_sub(float128 a, float128 b, float_status *status)
6695 {
6696     bool aSign, bSign;
6697 
6698     aSign = extractFloat128Sign( a );
6699     bSign = extractFloat128Sign( b );
6700     if ( aSign == bSign ) {
6701         return subFloat128Sigs(a, b, aSign, status);
6702     }
6703     else {
6704         return addFloat128Sigs(a, b, aSign, status);
6705     }
6706 
6707 }
6708 
6709 /*----------------------------------------------------------------------------
6710 | Returns the result of multiplying the quadruple-precision floating-point
6711 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6712 | Standard for Binary Floating-Point Arithmetic.
6713 *----------------------------------------------------------------------------*/
6714 
6715 float128 float128_mul(float128 a, float128 b, float_status *status)
6716 {
6717     bool aSign, bSign, zSign;
6718     int32_t aExp, bExp, zExp;
6719     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6720 
6721     aSig1 = extractFloat128Frac1( a );
6722     aSig0 = extractFloat128Frac0( a );
6723     aExp = extractFloat128Exp( a );
6724     aSign = extractFloat128Sign( a );
6725     bSig1 = extractFloat128Frac1( b );
6726     bSig0 = extractFloat128Frac0( b );
6727     bExp = extractFloat128Exp( b );
6728     bSign = extractFloat128Sign( b );
6729     zSign = aSign ^ bSign;
6730     if ( aExp == 0x7FFF ) {
6731         if (    ( aSig0 | aSig1 )
6732              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6733             return propagateFloat128NaN(a, b, status);
6734         }
6735         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6736         return packFloat128( zSign, 0x7FFF, 0, 0 );
6737     }
6738     if ( bExp == 0x7FFF ) {
6739         if (bSig0 | bSig1) {
6740             return propagateFloat128NaN(a, b, status);
6741         }
6742         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6743  invalid:
6744             float_raise(float_flag_invalid, status);
6745             return float128_default_nan(status);
6746         }
6747         return packFloat128( zSign, 0x7FFF, 0, 0 );
6748     }
6749     if ( aExp == 0 ) {
6750         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6751         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6752     }
6753     if ( bExp == 0 ) {
6754         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6755         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6756     }
6757     zExp = aExp + bExp - 0x4000;
6758     aSig0 |= UINT64_C(0x0001000000000000);
6759     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6760     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6761     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6762     zSig2 |= ( zSig3 != 0 );
6763     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
6764         shift128ExtraRightJamming(
6765             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6766         ++zExp;
6767     }
6768     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6769 
6770 }
6771 
/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Cases resolved before the significand division:
|   - either operand is a NaN: the result of propagateFloat128NaN();
|   - infinity / infinity and zero / zero: the invalid exception is raised
|     and the default NaN is returned;
|   - infinity / finite: an infinity signed `aSign ^ bSign' (no flag is
|     raised on this path, even when `b' is zero);
|   - finite / infinity and zero / nonzero: a zero signed `aSign ^ bSign';
|   - nonzero finite / zero: the divide-by-zero exception is raised and an
|     infinity signed `aSign ^ bSign' is returned.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    /* `a' is a NaN or an infinity. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* infinity / infinity */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    /* `a' is finite; `b' is a NaN or an infinity. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign, 0, 0, 0 );
    }
    /* `b' is zero or subnormal. */
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* zero / zero; also the target of the inf / inf jump above */
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    /* `a' is zero or subnormal. */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Set the implicit integer bit of both significands and shift them left
       by 15 bits. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* When the le128() test on ( b, a ) holds, halve the dividend and
       compensate in the exponent. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* First 64-bit quotient word: estimate, form the remainder, then step
       the estimate down (adding the divisor back) while the remainder is
       negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Second 64-bit quotient word, estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    /* The exact correction is only carried out when the low 14 bits of the
       estimate are at most 4.
       NOTE(review): presumably the estimate's error bound means rounding
       cannot be affected otherwise -- confirm against estimateDiv128To64(). */
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Record a nonzero remainder in the lowest quotient bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the 128-bit quotient right by 15 bits; zSig2 receives the
       third output word that is handed to the rounding routine. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
6858 
/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Cases resolved before the reduction loop:
|   - either operand is a NaN: the result of propagateFloat128NaN();
|   - `a' is an infinity, or `b' is zero: the invalid exception is raised
|     and the default NaN is returned;
|   - `b' is an infinity, or `a' is zero: `a' is returned unchanged;
|   - the exponent of `a' is at least two below that of `b' (after
|     normalizing subnormals): `a' is returned unchanged.
| The result's sign is the sign of `a', inverted when the selected residue
| is negative.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    /* `a' is a NaN or an infinity. */
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* infinity rem anything-but-NaN */
        goto invalid;
    }
    /* `a' is finite; `b' is a NaN or an infinity. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* `b' is zero; also the target of the infinite-`a' jump above */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    if ( expDiff < -1 ) return a;
    /* Set the implicit integer bits.  `b' is shifted left by 15; `a' by 15,
       or by 14 when its exponent is one below that of `b'. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit: subtract `b' once when the le128() test holds. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Each pass estimates a quotient chunk, lowers it by 4 (clamped at
       zero), subtracts the corresponding multiple of `b' and consumes 61
       bits of the exponent difference.
       NOTE(review): the "- 4" presumably guarantees the chunk never
       overshoots -- confirm against estimateDiv128To64(). */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial chunk: here expDiff lies in -63..0, so the shift
           count - expDiff below is in 0..63. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        /* No further quotient bits to extract: only realign both values. */
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract `b' until the residue turns negative, counting in q.  The
       "alternate" pair keeps the last value from before the subtraction. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean is the sum of the two candidate residues.  The earlier
       (pre-subtraction) candidate is selected when that sum is negative,
       or when it is exactly zero and q is odd. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* Convert a negative residue to sign-and-magnitude form. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
6965 
/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Cases resolved before the root extraction:
|   - `a' is a NaN: the result of propagateFloat128NaN( a, a );
|   - `a' is +infinity or a negative zero: `a' is returned unchanged;
|   - `a' is -infinity or any other negative value: the invalid exception is
|     raised and the default NaN is returned;
|   - `a' is +0: a positive zero is returned.
| Every remaining result is packed with a positive sign.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* `a' is a NaN or an infinity. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        /* -infinity */
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned as is; other negatives are invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the exponent's offset from 0x3FFF and re-base it on 0x3FFE. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Initial estimate from estimateSqrt32(), then widened into the first
       64-bit root word with the help of one division estimate.  The left
       shift of the significand is 13, or 12 when the exponent is odd. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Remainder a - zSig0^2; step zSig0 down while it is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Second 64-bit root word, estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /* The exact correction is only carried out when the low 13 bits of the
       estimate are at most 5.
       NOTE(review): presumably the estimate's error bound means rounding
       cannot be affected otherwise -- confirm against estimateDiv128To64(). */
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Record a nonzero remainder in the lowest root bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the 128-bit root right by 14 bits; zSig2 receives the third
       output word that is handed to the rounding routine. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7033 
7034 static inline FloatRelation
7035 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
7036                           float_status *status)
7037 {
7038     bool aSign, bSign;
7039 
7040     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7041         float_raise(float_flag_invalid, status);
7042         return float_relation_unordered;
7043     }
7044     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7045           ( extractFloatx80Frac( a )<<1 ) ) ||
7046         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7047           ( extractFloatx80Frac( b )<<1 ) )) {
7048         if (!is_quiet ||
7049             floatx80_is_signaling_nan(a, status) ||
7050             floatx80_is_signaling_nan(b, status)) {
7051             float_raise(float_flag_invalid, status);
7052         }
7053         return float_relation_unordered;
7054     }
7055     aSign = extractFloatx80Sign( a );
7056     bSign = extractFloatx80Sign( b );
7057     if ( aSign != bSign ) {
7058 
7059         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7060              ( ( a.low | b.low ) == 0 ) ) {
7061             /* zero case */
7062             return float_relation_equal;
7063         } else {
7064             return 1 - (2 * aSign);
7065         }
7066     } else {
7067         /* Normalize pseudo-denormals before comparison.  */
7068         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7069             ++a.high;
7070         }
7071         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7072             ++b.high;
7073         }
7074         if (a.low == b.low && a.high == b.high) {
7075             return float_relation_equal;
7076         } else {
7077             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7078         }
7079     }
7080 }
7081 
7082 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7083 {
7084     return floatx80_compare_internal(a, b, 0, status);
7085 }
7086 
7087 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7088                                      float_status *status)
7089 {
7090     return floatx80_compare_internal(a, b, 1, status);
7091 }
7092 
7093 static inline FloatRelation
7094 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7095                           float_status *status)
7096 {
7097     bool aSign, bSign;
7098 
7099     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7100           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7101         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7102           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7103         if (!is_quiet ||
7104             float128_is_signaling_nan(a, status) ||
7105             float128_is_signaling_nan(b, status)) {
7106             float_raise(float_flag_invalid, status);
7107         }
7108         return float_relation_unordered;
7109     }
7110     aSign = extractFloat128Sign( a );
7111     bSign = extractFloat128Sign( b );
7112     if ( aSign != bSign ) {
7113         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7114             /* zero case */
7115             return float_relation_equal;
7116         } else {
7117             return 1 - (2 * aSign);
7118         }
7119     } else {
7120         if (a.low == b.low && a.high == b.high) {
7121             return float_relation_equal;
7122         } else {
7123             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7124         }
7125     }
7126 }
7127 
7128 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7129 {
7130     return float128_compare_internal(a, b, 0, status);
7131 }
7132 
7133 FloatRelation float128_compare_quiet(float128 a, float128 b,
7134                                      float_status *status)
7135 {
7136     return float128_compare_internal(a, b, 1, status);
7137 }
7138 
7139 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7140 {
7141     bool aSign;
7142     int32_t aExp;
7143     uint64_t aSig;
7144 
7145     if (floatx80_invalid_encoding(a)) {
7146         float_raise(float_flag_invalid, status);
7147         return floatx80_default_nan(status);
7148     }
7149     aSig = extractFloatx80Frac( a );
7150     aExp = extractFloatx80Exp( a );
7151     aSign = extractFloatx80Sign( a );
7152 
7153     if ( aExp == 0x7FFF ) {
7154         if ( aSig<<1 ) {
7155             return propagateFloatx80NaN(a, a, status);
7156         }
7157         return a;
7158     }
7159 
7160     if (aExp == 0) {
7161         if (aSig == 0) {
7162             return a;
7163         }
7164         aExp++;
7165     }
7166 
7167     if (n > 0x10000) {
7168         n = 0x10000;
7169     } else if (n < -0x10000) {
7170         n = -0x10000;
7171     }
7172 
7173     aExp += n;
7174     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7175                                          aSign, aExp, aSig, 0, status);
7176 }
7177 
7178 float128 float128_scalbn(float128 a, int n, float_status *status)
7179 {
7180     bool aSign;
7181     int32_t aExp;
7182     uint64_t aSig0, aSig1;
7183 
7184     aSig1 = extractFloat128Frac1( a );
7185     aSig0 = extractFloat128Frac0( a );
7186     aExp = extractFloat128Exp( a );
7187     aSign = extractFloat128Sign( a );
7188     if ( aExp == 0x7FFF ) {
7189         if ( aSig0 | aSig1 ) {
7190             return propagateFloat128NaN(a, a, status);
7191         }
7192         return a;
7193     }
7194     if (aExp != 0) {
7195         aSig0 |= UINT64_C(0x0001000000000000);
7196     } else if (aSig0 == 0 && aSig1 == 0) {
7197         return a;
7198     } else {
7199         aExp++;
7200     }
7201 
7202     if (n > 0x10000) {
7203         n = 0x10000;
7204     } else if (n < -0x10000) {
7205         n = -0x10000;
7206     }
7207 
7208     aExp += n - 1;
7209     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7210                                          , status);
7211 
7212 }
7213 
7214 static void __attribute__((constructor)) softfloat_init(void)
7215 {
7216     union_float64 ua, ub, uc, ur;
7217 
7218     if (QEMU_NO_HARDFLOAT) {
7219         return;
7220     }
7221     /*
7222      * Test that the host's FMA is not obviously broken. For example,
7223      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7224      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7225      */
7226     ua.s = 0x0020000000000001ULL;
7227     ub.s = 0x3ca0000000000000ULL;
7228     uc.s = 0x0020000000000000ULL;
7229     ur.h = fma(ua.h, ub.h, uc.h);
7230     if (ur.s != 0x0020000000000001ULL) {
7231         force_soft_fma = true;
7232     }
7233 }
7234