xref: /qemu/fpu/softfloat.c (revision bbc0586c)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is not fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
185 /*
186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
187  * hardfloat functions. Each combination of number of inputs and float size
188  * gets its own value.
189  */
/* The float32 variants use the softfloat primitives on every host. */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

/* The float64 variants use fpclassify on x86_64 hosts only. */
#ifdef __x86_64__
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205 
206 /*
207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208  * float{32,64}_is_infinity when !USE_FP.
209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211  */
/* isinf() is only selected on x86_64 and aarch64 hosts. */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   0
#else
# define QEMU_HARDFLOAT_USE_ISINF   1
#endif
217 
218 /*
219  * Some targets clear the FP flags before most FP operations. This prevents
220  * the use of hardfloat, since hardfloat relies on the inexact flag being
221  * already set.
222  */
/* Tell the builder why hardfloat is being switched off under -ffast-math. */
#if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
#endif

/*
 * QEMU_NO_HARDFLOAT is 1 for PPC targets and for -ffast-math builds, and 0
 * otherwise. Only when hardfloat stays enabled does QEMU_SOFTFLOAT_ATTR
 * additionally carry noinline.
 */
#if !defined(TARGET_PPC) && !defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#else
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
256 typedef union {
257     float32 s;
258     float h;
259 } union_float32;
260 
261 typedef union {
262     float64 s;
263     double h;
264 } union_float64;
265 
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
268 
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float   (*hard_f32_op2_fn)(float a, float b);
272 typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346              f32_check_fn pre, f32_check_fn post,
347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
348 {
349     union_float32 ua, ub, ur;
350 
351     ua.s = xa;
352     ub.s = xb;
353 
354     if (unlikely(!can_use_fpu(s))) {
355         goto soft;
356     }
357 
358     float32_input_flush2(&ua.s, &ub.s, s);
359     if (unlikely(!pre(ua, ub))) {
360         goto soft;
361     }
362     if (fast_test && fast_test(ua, ub)) {
363         return fast_op(ua.s, ub.s, s);
364     }
365 
366     ur.h = hard(ua.h, ub.h);
367     if (unlikely(f32_is_inf(ur))) {
368         s->float_exception_flags |= float_flag_overflow;
369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370         if (post == NULL || post(ua, ub)) {
371             goto soft;
372         }
373     }
374     return ur.s;
375 
376  soft:
377     return soft(ua.s, ub.s, s);
378 }
379 
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383              f64_check_fn pre, f64_check_fn post,
384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
385 {
386     union_float64 ua, ub, ur;
387 
388     ua.s = xa;
389     ub.s = xb;
390 
391     if (unlikely(!can_use_fpu(s))) {
392         goto soft;
393     }
394 
395     float64_input_flush2(&ua.s, &ub.s, s);
396     if (unlikely(!pre(ua, ub))) {
397         goto soft;
398     }
399     if (fast_test && fast_test(ua, ub)) {
400         return fast_op(ua.s, ub.s, s);
401     }
402 
403     ur.h = hard(ua.h, ub.h);
404     if (unlikely(f64_is_inf(ur))) {
405         s->float_exception_flags |= float_flag_overflow;
406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407         if (post == NULL || post(ua, ub)) {
408             goto soft;
409         }
410     }
411     return ur.s;
412 
413  soft:
414     return soft(ua.s, ub.s, s);
415 }
416 
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
420 
421 static inline uint32_t extractFloat16Frac(float16 a)
422 {
423     return float16_val(a) & 0x3ff;
424 }
425 
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
429 
430 static inline int extractFloat16Exp(float16 a)
431 {
432     return (float16_val(a) >> 10) & 0x1f;
433 }
434 
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
438 
439 static inline uint32_t extractFloat32Frac(float32 a)
440 {
441     return float32_val(a) & 0x007FFFFF;
442 }
443 
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
447 
448 static inline int extractFloat32Exp(float32 a)
449 {
450     return (float32_val(a) >> 23) & 0xFF;
451 }
452 
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
456 
457 static inline flag extractFloat32Sign(float32 a)
458 {
459     return float32_val(a) >> 31;
460 }
461 
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
465 
466 static inline uint64_t extractFloat64Frac(float64 a)
467 {
468     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
469 }
470 
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
474 
475 static inline int extractFloat64Exp(float64 a)
476 {
477     return (float64_val(a) >> 52) & 0x7FF;
478 }
479 
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
483 
484 static inline flag extractFloat64Sign(float64 a)
485 {
486     return float64_val(a) >> 63;
487 }
488 
489 /*
490  * Classify a floating point number. float_class_qnan and every class
491  * after it is a NaN, so cls >= float_class_qnan matches any NaN.
492  */
493 
/*
 * The enumerator order is significant: the NaN classes come last so that a
 * single >= comparison against float_class_qnan identifies any NaN.
 */
typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,  /* raw parts, not yet canonicalized */
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4,  /* all NaNs from here */
    float_class_snan         = 5,
} FloatClass;
502 
503 /* Simple helpers for checking if, or what kind of, NaN we have */
504 static inline __attribute__((unused)) bool is_nan(FloatClass c)
505 {
506     return unlikely(c >= float_class_qnan);
507 }
508 
509 static inline __attribute__((unused)) bool is_snan(FloatClass c)
510 {
511     return c == float_class_snan;
512 }
513 
514 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
515 {
516     return c == float_class_qnan;
517 }
518 
519 /*
520  * Structure holding all of the decomposed parts of a float. The
521  * exponent is unbiased and the fraction is normalized. All
522  * calculations are done with a 64 bit fraction and then rounded as
523  * appropriate for the final format.
524  *
525  * Thanks to the packed FloatClass a decent compiler should be able to
526  * fit the whole structure into registers and avoid using the stack
527  * for parameter passing.
528  */
529 
530 typedef struct {
531     uint64_t frac;
532     int32_t  exp;
533     FloatClass cls;
534     bool sign;
535 } FloatParts;
536 
/* Bit position of the implicit leading one within the 64-bit fraction. */
#define DECOMPOSED_BINARY_POINT    (64 - 2)
/* The implicit leading one itself. */
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
/* The bit directly above the implicit bit; set when rounding carries out. */
#define DECOMPOSED_OVERFLOW_BIT    (1ull << (DECOMPOSED_BINARY_POINT + 1))
540 
541 /* Structure holding all of the relevant parameters for a format.
542  *   exp_size: the size of the exponent field
543  *   exp_bias: the offset applied to the exponent field
544  *   exp_max: the maximum normalised exponent
545  *   frac_size: the size of the fraction field
546  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
547  * The following are computed based on the size of the fraction
548  *   frac_lsb: least significant bit of fraction
549  *   frac_lsbm1: the bit below the least significant bit (for rounding)
550  *   round_mask/roundeven_mask: masks used for rounding
551  * The following optional modifiers are available:
552  *   arm_althp: handle ARM Alternative Half Precision
553  */
typedef struct {
    /* Field geometry of the packed format */
    int exp_size;             /* width of the exponent field, in bits */
    int exp_bias;             /* offset applied to the exponent field */
    int exp_max;              /* largest exponent field value */
    int frac_size;            /* width of the fraction field, in bits */
    int frac_shift;           /* shift between packed and decomposed fraction */

    /* Rounding helpers derived from the fraction size */
    uint64_t frac_lsb;        /* least significant bit of the fraction */
    uint64_t frac_lsbm1;      /* the bit just below frac_lsb */
    uint64_t round_mask;      /* every bit below frac_lsb */
    uint64_t roundeven_mask;  /* round_mask plus frac_lsb */

    /* Optional modifiers */
    bool arm_althp;           /* ARM Alternative Half Precision */
} FloatFmt;
566 
/*
 * Expand to the designated initializers of a FloatFmt for a format with
 * an E-bit exponent field and an F-bit fraction field. The arm_althp
 * member is not set here; callers add it after the expansion if needed.
 *
 * Both arguments are parenthesized in the expansion so that expression
 * arguments (e.g. FLOAT_PARAMS(8, 20 + 3)) yield the same values as the
 * equivalent literals.
 */
#define FLOAT_PARAMS(E, F)                                             \
    .exp_size       = (E),                                             \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                           \
    .exp_max        = (1 << (E)) - 1,                                  \
    .frac_size      = (F),                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
578 
579 static const FloatFmt float16_params = {
580     FLOAT_PARAMS(5, 10)
581 };
582 
583 static const FloatFmt float16_params_ahp = {
584     FLOAT_PARAMS(5, 10),
585     .arm_althp = true
586 };
587 
588 static const FloatFmt float32_params = {
589     FLOAT_PARAMS(8, 23)
590 };
591 
592 static const FloatFmt float64_params = {
593     FLOAT_PARAMS(11, 52)
594 };
595 
596 /* Unpack a float to parts, but do not canonicalize.  */
597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
598 {
599     const int sign_pos = fmt.frac_size + fmt.exp_size;
600 
601     return (FloatParts) {
602         .cls = float_class_unclassified,
603         .sign = extract64(raw, sign_pos, 1),
604         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
605         .frac = extract64(raw, 0, fmt.frac_size),
606     };
607 }
608 
609 static inline FloatParts float16_unpack_raw(float16 f)
610 {
611     return unpack_raw(float16_params, f);
612 }
613 
614 static inline FloatParts float32_unpack_raw(float32 f)
615 {
616     return unpack_raw(float32_params, f);
617 }
618 
619 static inline FloatParts float64_unpack_raw(float64 f)
620 {
621     return unpack_raw(float64_params, f);
622 }
623 
624 /* Pack a float from parts, but do not canonicalize.  */
625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
626 {
627     const int sign_pos = fmt.frac_size + fmt.exp_size;
628     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
629     return deposit64(ret, sign_pos, 1, p.sign);
630 }
631 
632 static inline float16 float16_pack_raw(FloatParts p)
633 {
634     return make_float16(pack_raw(float16_params, p));
635 }
636 
637 static inline float32 float32_pack_raw(FloatParts p)
638 {
639     return make_float32(pack_raw(float32_params, p));
640 }
641 
642 static inline float64 float64_pack_raw(FloatParts p)
643 {
644     return make_float64(pack_raw(float64_params, p));
645 }
646 
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine:  (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output.  These details are target-
653 | specific.
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
656 
657 /* Canonicalize EXP and FRAC, setting CLS.  */
658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
659                                   float_status *status)
660 {
661     if (part.exp == parm->exp_max && !parm->arm_althp) {
662         if (part.frac == 0) {
663             part.cls = float_class_inf;
664         } else {
665             part.frac <<= parm->frac_shift;
666             part.cls = (parts_is_snan_frac(part.frac, status)
667                         ? float_class_snan : float_class_qnan);
668         }
669     } else if (part.exp == 0) {
670         if (likely(part.frac == 0)) {
671             part.cls = float_class_zero;
672         } else if (status->flush_inputs_to_zero) {
673             float_raise(float_flag_input_denormal, status);
674             part.cls = float_class_zero;
675             part.frac = 0;
676         } else {
677             int shift = clz64(part.frac) - 1;
678             part.cls = float_class_normal;
679             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
680             part.frac <<= shift;
681         }
682     } else {
683         part.cls = float_class_normal;
684         part.exp -= parm->exp_bias;
685         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
686     }
687     return part;
688 }
689 
690 /* Round and uncanonicalize a floating-point number by parts. There
691  * are FRAC_SHIFT bits that may require rounding at the bottom of the
692  * fraction; these bits will be removed. The exponent will be biased
693  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 *
 * The rounding mode, flush_to_zero and tininess-detection settings are
 * read from S. Exception flags are accumulated locally and raised once,
 * through float_raise(), just before returning. The returned parts hold
 * the biased exponent and the fraction shifted down to field position;
 * the class is updated when rounding turns a normal input into a zero or
 * an infinity.
694  */
695 
696 static FloatParts round_canonical(FloatParts p, float_status *s,
697                                   const FloatFmt *parm)
698 {
699     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
700     const uint64_t round_mask = parm->round_mask;
701     const uint64_t roundeven_mask = parm->roundeven_mask;
702     const int exp_max = parm->exp_max;
703     const int frac_shift = parm->frac_shift;
704     uint64_t frac, inc;
705     int exp, flags = 0;
706     bool overflow_norm;
707 
        /* Work on local copies; they are written back to p at the end. */
708     frac = p.frac;
709     exp = p.exp;
710 
711     switch (p.cls) {
712     case float_class_normal:
            /*
             * Pick the increment (inc) that is added to the fraction before
             * the low bits are dropped, and decide whether an overflowing
             * result becomes the largest finite value (overflow_norm) or
             * an infinity.
             */
713         switch (s->float_rounding_mode) {
714         case float_round_nearest_even:
715             overflow_norm = false;
                /*
                 * Add half an LSB, except on an exact tie with an even LSB,
                 * where nothing is added so the value stays even.
                 */
716             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
717             break;
718         case float_round_ties_away:
719             overflow_norm = false;
720             inc = frac_lsbm1;
721             break;
722         case float_round_to_zero:
723             overflow_norm = true;
724             inc = 0;
725             break;
726         case float_round_up:
                /* Only positive values are pushed up in magnitude. */
727             inc = p.sign ? 0 : round_mask;
728             overflow_norm = p.sign;
729             break;
730         case float_round_down:
                /* Only negative values are pushed up in magnitude. */
731             inc = p.sign ? round_mask : 0;
732             overflow_norm = !p.sign;
733             break;
734         default:
735             g_assert_not_reached();
736         }
737 
            /* Re-bias; a positive biased exponent is handled as a normal. */
738         exp += parm->exp_bias;
739         if (likely(exp > 0)) {
740             if (frac & round_mask) {
741                 flags |= float_flag_inexact;
742                 frac += inc;
                    /* Rounding carried out of the implicit bit: renormalize. */
743                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
744                     frac >>= 1;
745                     exp++;
746                 }
747             }
748             frac >>= frac_shift;
749 
750             if (parm->arm_althp) {
751                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
752                 if (unlikely(exp > exp_max)) {
753                     /* Overflow.  Return the maximum normal.  */
                        /* Note: flags are replaced here, not OR-ed. */
754                     flags = float_flag_invalid;
755                     exp = exp_max;
756                     frac = -1;
757                 }
758             } else if (unlikely(exp >= exp_max)) {
                    /* Overflow: largest finite value or infinity. */
759                 flags |= float_flag_overflow | float_flag_inexact;
760                 if (overflow_norm) {
761                     exp = exp_max - 1;
762                     frac = -1;
763                 } else {
764                     p.cls = float_class_inf;
765                     goto do_inf;
766                 }
767             }
768         } else if (s->flush_to_zero) {
                /* Result would be denormal and output flushing is enabled. */
769             flags |= float_flag_output_denormal;
770             p.cls = float_class_zero;
771             goto do_zero;
772         } else {
                /*
                 * Denormal result. It counts as tiny when tininess is
                 * detected before rounding, when the biased exponent is
                 * negative, or when adding the increment does not carry
                 * into the overflow bit.
                 */
773             bool is_tiny = (s->float_detect_tininess
774                             == float_tininess_before_rounding)
775                         || (exp < 0)
776                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
777 
                /*
                 * Shift the fraction right by 1 - exp bits.
                 * NOTE(review): presumably "jamming" keeps a sticky bit for
                 * the bits shifted out - confirm against softfloat-macros.h.
                 */
778             shift64RightJamming(frac, 1 - exp, &frac);
779             if (frac & round_mask) {
780                 /* Need to recompute round-to-even.  */
781                 if (s->float_rounding_mode == float_round_nearest_even) {
782                     inc = ((frac & roundeven_mask) != frac_lsbm1
783                            ? frac_lsbm1 : 0);
784                 }
785                 flags |= float_flag_inexact;
786                 frac += inc;
787             }
788 
                /*
                 * If rounding carried into the implicit bit the exponent
                 * field becomes 1, otherwise it stays 0.
                 */
789             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
790             frac >>= frac_shift;
791 
                /* Underflow is only signalled for tiny, inexact results. */
792             if (is_tiny && (flags & float_flag_inexact)) {
793                 flags |= float_flag_underflow;
794             }
795             if (exp == 0 && frac == 0) {
796                 p.cls = float_class_zero;
797             }
798         }
799         break;
800 
801     case float_class_zero:
802     do_zero:
803         exp = 0;
804         frac = 0;
805         break;
806 
807     case float_class_inf:
808     do_inf:
            /* The alternative half-precision format has no infinity. */
809         assert(!parm->arm_althp);
810         exp = exp_max;
811         frac = 0;
812         break;
813 
814     case float_class_qnan:
815     case float_class_snan:
            /* The alternative half-precision format has no NaN either. */
816         assert(!parm->arm_althp);
817         exp = exp_max;
            /* The NaN fraction is shifted back down without rounding. */
818         frac >>= parm->frac_shift;
819         break;
820 
821     default:
822         g_assert_not_reached();
823     }
824 
        /* Called unconditionally, even when flags is 0. */
825     float_raise(flags, s);
826     p.exp = exp;
827     p.frac = frac;
828     return p;
829 }
830 
831 /* Explicit FloatFmt version */
832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
833                                             const FloatFmt *params)
834 {
835     return sf_canonicalize(float16_unpack_raw(f), params, s);
836 }
837 
838 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
839 {
840     return float16a_unpack_canonical(f, s, &float16_params);
841 }
842 
843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
844                                              const FloatFmt *params)
845 {
846     return float16_pack_raw(round_canonical(p, s, params));
847 }
848 
849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
850 {
851     return float16a_round_pack_canonical(p, s, &float16_params);
852 }
853 
854 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
855 {
856     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
857 }
858 
859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
860 {
861     return float32_pack_raw(round_canonical(p, s, &float32_params));
862 }
863 
864 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
865 {
866     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
867 }
868 
869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
870 {
871     return float64_pack_raw(round_canonical(p, s, &float64_params));
872 }
873 
874 static FloatParts return_nan(FloatParts a, float_status *s)
875 {
876     switch (a.cls) {
877     case float_class_snan:
878         s->float_exception_flags |= float_flag_invalid;
879         a = parts_silence_nan(a, s);
880         /* fall through */
881     case float_class_qnan:
882         if (s->default_nan_mode) {
883             return parts_default_nan(s);
884         }
885         break;
886 
887     default:
888         g_assert_not_reached();
889     }
890     return a;
891 }
892 
893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
894 {
895     if (is_snan(a.cls) || is_snan(b.cls)) {
896         s->float_exception_flags |= float_flag_invalid;
897     }
898 
899     if (s->default_nan_mode) {
900         return parts_default_nan(s);
901     } else {
902         if (pickNaN(a.cls, b.cls,
903                     a.frac > b.frac ||
904                     (a.frac == b.frac && a.sign < b.sign))) {
905             a = b;
906         }
907         if (is_snan(a.cls)) {
908             return parts_silence_nan(a, s);
909         }
910     }
911     return a;
912 }
913 
914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
915                                   bool inf_zero, float_status *s)
916 {
917     int which;
918 
919     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
920         s->float_exception_flags |= float_flag_invalid;
921     }
922 
923     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
924 
925     if (s->default_nan_mode) {
926         /* Note that this check is after pickNaNMulAdd so that function
927          * has an opportunity to set the Invalid flag.
928          */
929         which = 3;
930     }
931 
932     switch (which) {
933     case 0:
934         break;
935     case 1:
936         a = b;
937         break;
938     case 2:
939         a = c;
940         break;
941     case 3:
942         return parts_default_nan(s);
943     default:
944         g_assert_not_reached();
945     }
946 
947     if (is_snan(a.cls)) {
948         return parts_silence_nan(a, s);
949     }
950     return a;
951 }
952 
953 /*
954  * Returns the result of adding or subtracting the values of the
955  * floating-point values `a' and `b'. The operation is performed
956  * according to the IEC/IEEE Standard for Binary Floating-Point
957  * Arithmetic.
958  */
959 
960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
961                                 float_status *s)
962 {
963     bool a_sign = a.sign;
964     bool b_sign = b.sign ^ subtract;
965 
966     if (a_sign != b_sign) {
967         /* Subtraction */
968 
969         if (a.cls == float_class_normal && b.cls == float_class_normal) {
970             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
971                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
972                 a.frac = a.frac - b.frac;
973             } else {
974                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
975                 a.frac = b.frac - a.frac;
976                 a.exp = b.exp;
977                 a_sign ^= 1;
978             }
979 
980             if (a.frac == 0) {
981                 a.cls = float_class_zero;
982                 a.sign = s->float_rounding_mode == float_round_down;
983             } else {
984                 int shift = clz64(a.frac) - 1;
985                 a.frac = a.frac << shift;
986                 a.exp = a.exp - shift;
987                 a.sign = a_sign;
988             }
989             return a;
990         }
991         if (is_nan(a.cls) || is_nan(b.cls)) {
992             return pick_nan(a, b, s);
993         }
994         if (a.cls == float_class_inf) {
995             if (b.cls == float_class_inf) {
996                 float_raise(float_flag_invalid, s);
997                 return parts_default_nan(s);
998             }
999             return a;
1000         }
1001         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1002             a.sign = s->float_rounding_mode == float_round_down;
1003             return a;
1004         }
1005         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1006             b.sign = a_sign ^ 1;
1007             return b;
1008         }
1009         if (b.cls == float_class_zero) {
1010             return a;
1011         }
1012     } else {
1013         /* Addition */
1014         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1015             if (a.exp > b.exp) {
1016                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1017             } else if (a.exp < b.exp) {
1018                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1019                 a.exp = b.exp;
1020             }
1021             a.frac += b.frac;
1022             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1023                 shift64RightJamming(a.frac, 1, &a.frac);
1024                 a.exp += 1;
1025             }
1026             return a;
1027         }
1028         if (is_nan(a.cls) || is_nan(b.cls)) {
1029             return pick_nan(a, b, s);
1030         }
1031         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1032             return a;
1033         }
1034         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1035             b.sign = b_sign;
1036             return b;
1037         }
1038     }
1039     g_assert_not_reached();
1040 }
1041 
1042 /*
1043  * Returns the result of adding or subtracting the floating-point
1044  * values `a' and `b'. The operation is performed according to the
1045  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1046  */
1047 
1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1049 {
1050     FloatParts pa = float16_unpack_canonical(a, status);
1051     FloatParts pb = float16_unpack_canonical(b, status);
1052     FloatParts pr = addsub_floats(pa, pb, false, status);
1053 
1054     return float16_round_pack_canonical(pr, status);
1055 }
1056 
1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1058 {
1059     FloatParts pa = float16_unpack_canonical(a, status);
1060     FloatParts pb = float16_unpack_canonical(b, status);
1061     FloatParts pr = addsub_floats(pa, pb, true, status);
1062 
1063     return float16_round_pack_canonical(pr, status);
1064 }
1065 
1066 static float32 QEMU_SOFTFLOAT_ATTR
1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1068 {
1069     FloatParts pa = float32_unpack_canonical(a, status);
1070     FloatParts pb = float32_unpack_canonical(b, status);
1071     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1072 
1073     return float32_round_pack_canonical(pr, status);
1074 }
1075 
1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1077 {
1078     return soft_f32_addsub(a, b, false, status);
1079 }
1080 
1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1082 {
1083     return soft_f32_addsub(a, b, true, status);
1084 }
1085 
1086 static float64 QEMU_SOFTFLOAT_ATTR
1087 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1088 {
1089     FloatParts pa = float64_unpack_canonical(a, status);
1090     FloatParts pb = float64_unpack_canonical(b, status);
1091     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1092 
1093     return float64_round_pack_canonical(pr, status);
1094 }
1095 
1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1097 {
1098     return soft_f64_addsub(a, b, false, status);
1099 }
1100 
1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1102 {
1103     return soft_f64_addsub(a, b, true, status);
1104 }
1105 
/*
 * Host floating-point single-precision addition.  Passed by float32_add()
 * as the "hard" callback of float32_addsub().
 */
1106 static float hard_f32_add(float a, float b)
1107 {
1108     return a + b;
1109 }
1110 
/*
 * Host floating-point single-precision subtraction.  Passed by
 * float32_sub() as the "hard" callback of float32_addsub().
 */
1111 static float hard_f32_sub(float a, float b)
1112 {
1113     return a - b;
1114 }
1115 
/*
 * Host floating-point double-precision addition.  Passed by float64_add()
 * as the "hard" callback of float64_addsub().
 */
1116 static double hard_f64_add(double a, double b)
1117 {
1118     return a + b;
1119 }
1120 
/*
 * Host floating-point double-precision subtraction.  Passed by
 * float64_sub() as the "hard" callback of float64_addsub().
 */
1121 static double hard_f64_sub(double a, double b)
1122 {
1123     return a - b;
1124 }
1125 
1126 static bool f32_addsub_post(union_float32 a, union_float32 b)
1127 {
1128     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1129         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1130     }
1131     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1132 }
1133 
1134 static bool f64_addsub_post(union_float64 a, union_float64 b)
1135 {
1136     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1137         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1138     } else {
1139         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1140     }
1141 }
1142 
1143 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1144                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1145 {
1146     return float32_gen2(a, b, s, hard, soft,
1147                         f32_is_zon2, f32_addsub_post, NULL, NULL);
1148 }
1149 
1150 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1151                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1152 {
1153     return float64_gen2(a, b, s, hard, soft,
1154                         f64_is_zon2, f64_addsub_post, NULL, NULL);
1155 }
1156 
1157 float32 QEMU_FLATTEN
1158 float32_add(float32 a, float32 b, float_status *s)
1159 {
1160     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1161 }
1162 
1163 float32 QEMU_FLATTEN
1164 float32_sub(float32 a, float32 b, float_status *s)
1165 {
1166     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1167 }
1168 
1169 float64 QEMU_FLATTEN
1170 float64_add(float64 a, float64 b, float_status *s)
1171 {
1172     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1173 }
1174 
1175 float64 QEMU_FLATTEN
1176 float64_sub(float64 a, float64 b, float_status *s)
1177 {
1178     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1179 }
1180 
1181 /*
1182  * Returns the result of multiplying the floating-point values `a' and
1183  * `b'. The operation is performed according to the IEC/IEEE Standard
1184  * for Binary Floating-Point Arithmetic.
1185  */
1186 
1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1188 {
1189     bool sign = a.sign ^ b.sign;
1190 
1191     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1192         uint64_t hi, lo;
1193         int exp = a.exp + b.exp;
1194 
1195         mul64To128(a.frac, b.frac, &hi, &lo);
1196         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1197         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1198             shift64RightJamming(lo, 1, &lo);
1199             exp += 1;
1200         }
1201 
1202         /* Re-use a */
1203         a.exp = exp;
1204         a.sign = sign;
1205         a.frac = lo;
1206         return a;
1207     }
1208     /* handle all the NaN cases */
1209     if (is_nan(a.cls) || is_nan(b.cls)) {
1210         return pick_nan(a, b, s);
1211     }
1212     /* Inf * Zero == NaN */
1213     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1214         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1215         s->float_exception_flags |= float_flag_invalid;
1216         return parts_default_nan(s);
1217     }
1218     /* Multiply by 0 or Inf */
1219     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1220         a.sign = sign;
1221         return a;
1222     }
1223     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1224         b.sign = sign;
1225         return b;
1226     }
1227     g_assert_not_reached();
1228 }
1229 
1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1231 {
1232     FloatParts pa = float16_unpack_canonical(a, status);
1233     FloatParts pb = float16_unpack_canonical(b, status);
1234     FloatParts pr = mul_floats(pa, pb, status);
1235 
1236     return float16_round_pack_canonical(pr, status);
1237 }
1238 
1239 static float32 QEMU_SOFTFLOAT_ATTR
1240 soft_f32_mul(float32 a, float32 b, float_status *status)
1241 {
1242     FloatParts pa = float32_unpack_canonical(a, status);
1243     FloatParts pb = float32_unpack_canonical(b, status);
1244     FloatParts pr = mul_floats(pa, pb, status);
1245 
1246     return float32_round_pack_canonical(pr, status);
1247 }
1248 
1249 static float64 QEMU_SOFTFLOAT_ATTR
1250 soft_f64_mul(float64 a, float64 b, float_status *status)
1251 {
1252     FloatParts pa = float64_unpack_canonical(a, status);
1253     FloatParts pb = float64_unpack_canonical(b, status);
1254     FloatParts pr = mul_floats(pa, pb, status);
1255 
1256     return float64_round_pack_canonical(pr, status);
1257 }
1258 
/*
 * Host floating-point single-precision multiplication.  Passed by
 * float32_mul() as the "hard" callback of float32_gen2().
 */
1259 static float hard_f32_mul(float a, float b)
1260 {
1261     return a * b;
1262 }
1263 
/*
 * Host floating-point double-precision multiplication.  Passed by
 * float64_mul() as the "hard" callback of float64_gen2().
 */
1264 static double hard_f64_mul(double a, double b)
1265 {
1266     return a * b;
1267 }
1268 
1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1270 {
1271     return float32_is_zero(a.s) || float32_is_zero(b.s);
1272 }
1273 
1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1275 {
1276     return float64_is_zero(a.s) || float64_is_zero(b.s);
1277 }
1278 
1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1280 {
1281     bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1282 
1283     return float32_set_sign(float32_zero, signbit);
1284 }
1285 
1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1287 {
1288     bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1289 
1290     return float64_set_sign(float64_zero, signbit);
1291 }
1292 
1293 float32 QEMU_FLATTEN
1294 float32_mul(float32 a, float32 b, float_status *s)
1295 {
1296     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1297                         f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1298 }
1299 
1300 float64 QEMU_FLATTEN
1301 float64_mul(float64 a, float64 b, float_status *s)
1302 {
1303     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1304                         f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1305 }
1306 
1307 /*
1308  * Returns the result of multiplying the floating-point values `a' and
1309  * `b' then adding 'c', with no intermediate rounding step after the
1310  * multiplication. The operation is performed according to the
1311  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1312  * The flags argument allows the caller to select negation of the
1313  * addend, the intermediate product, or the final result. (The
1314  * difference between this and having the caller do a separate
1315  * negation is that negating externally will flip the sign bit on
1316  * NaNs.)
1317  */
/*
 * Parameters:
 *   a, b   - decomposed multiplicands
 *   c      - decomposed addend
 *   flags  - bitmask tested against float_muladd_negate_c,
 *            float_muladd_negate_product, float_muladd_negate_result and
 *            float_muladd_halve_result
 *   s      - status: receives float_flag_invalid, and supplies the
 *            rounding mode used to pick the sign of an exact zero result
 *
 * Returns unrounded FloatParts.  NaN inputs are resolved by
 * pick_nan_muladd(); Inf * 0 and Inf - Inf yield the default NaN with
 * float_flag_invalid set.
 */
1318 
1319 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1320                                 int flags, float_status *s)
1321 {
    /*
     * inf_zero is true exactly when the class bits of a and b together are
     * {inf, zero}, i.e. one multiplicand is infinite and the other zero.
     */
1322     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1323                     ((1 << float_class_inf) | (1 << float_class_zero));
1324     bool p_sign;
1325     bool sign_flip = flags & float_muladd_negate_result;
1326     FloatClass p_class;
1327     uint64_t hi, lo;
1328     int p_exp;
1329 
1330     /* It is implementation-defined whether the cases of (0,inf,qnan)
1331      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1332      * they return if they do), so we have to hand this information
1333      * off to the target-specific pick-a-NaN routine.
1334      */
1335     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1336         return pick_nan_muladd(a, b, c, inf_zero, s);
1337     }
1338 
1339     if (inf_zero) {
1340         s->float_exception_flags |= float_flag_invalid;
1341         return parts_default_nan(s);
1342     }
1343 
1344     if (flags & float_muladd_negate_c) {
1345         c.sign ^= 1;
1346     }
1347 
1348     p_sign = a.sign ^ b.sign;
1349 
1350     if (flags & float_muladd_negate_product) {
1351         p_sign ^= 1;
1352     }
1353 
    /* Classify the product a * b; NaN and Inf * 0 were handled above. */
1354     if (a.cls == float_class_inf || b.cls == float_class_inf) {
1355         p_class = float_class_inf;
1356     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1357         p_class = float_class_zero;
1358     } else {
1359         p_class = float_class_normal;
1360     }
1361 
    /* Infinite addend: invalid only against an opposite-signed Inf product. */
1362     if (c.cls == float_class_inf) {
1363         if (p_class == float_class_inf && p_sign != c.sign) {
1364             s->float_exception_flags |= float_flag_invalid;
1365             return parts_default_nan(s);
1366         } else {
1367             a.cls = float_class_inf;
1368             a.sign = c.sign ^ sign_flip;
1369             return a;
1370         }
1371     }
1372 
1373     if (p_class == float_class_inf) {
1374         a.cls = float_class_inf;
1375         a.sign = p_sign ^ sign_flip;
1376         return a;
1377     }
1378 
    /*
     * Zero product: the result is c.  For 0 + 0 with differing signs the
     * sign comes from the rounding mode; for a non-zero c the optional
     * halving is applied directly to c's exponent.
     */
1379     if (p_class == float_class_zero) {
1380         if (c.cls == float_class_zero) {
1381             if (p_sign != c.sign) {
1382                 p_sign = s->float_rounding_mode == float_round_down;
1383             }
1384             c.sign = p_sign;
1385         } else if (flags & float_muladd_halve_result) {
1386             c.exp -= 1;
1387         }
1388         c.sign ^= sign_flip;
1389         return c;
1390     }
1391 
1392     /* a & b should be normals now... */
1393     assert(a.cls == float_class_normal &&
1394            b.cls == float_class_normal);
1395 
1396     p_exp = a.exp + b.exp;
1397 
1398     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1399      * result.
1400      */
1401     mul64To128(a.frac, b.frac, &hi, &lo);
1402     /* binary point now at bit 124 */
1403 
1404     /* check for overflow */
1405     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1406         shift128RightJamming(hi, lo, 1, &hi, &lo);
1407         p_exp += 1;
1408     }
1409 
1410     /* + add/sub */
1411     if (c.cls == float_class_zero) {
1412         /* move binary point back to 62 */
1413         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1414     } else {
1415         int exp_diff = p_exp - c.exp;
1416         if (p_sign == c.sign) {
1417             /* Addition */
            /*
             * exp_diff <= 0: c has the larger (or equal) exponent, so the
             * product is shifted down to c's scale and the sum is formed
             * in the low word only.
             */
1418             if (exp_diff <= 0) {
1419                 shift128RightJamming(hi, lo,
1420                                      DECOMPOSED_BINARY_POINT - exp_diff,
1421                                      &hi, &lo);
1422                 lo += c.frac;
1423                 p_exp = c.exp;
1424             } else {
1425                 uint64_t c_hi, c_lo;
1426                 /* shift c to the same binary point as the product (124) */
1427                 c_hi = c.frac >> 2;
1428                 c_lo = 0;
1429                 shift128RightJamming(c_hi, c_lo,
1430                                      exp_diff,
1431                                      &c_hi, &c_lo);
1432                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1433                 /* move binary point back to 62 */
1434                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1435             }
1436 
1437             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1438                 shift64RightJamming(lo, 1, &lo);
1439                 p_exp += 1;
1440             }
1441 
1442         } else {
1443             /* Subtraction */
1444             uint64_t c_hi, c_lo;
1445             /* make C binary point match product at bit 124 */
1446             c_hi = c.frac >> 2;
1447             c_lo = 0;
1448 
            /*
             * Subtract the smaller magnitude from the larger one; when c
             * is the larger, the result takes the opposite sign and c's
             * exponent.
             */
1449             if (exp_diff <= 0) {
1450                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1451                 if (exp_diff == 0
1452                     &&
1453                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1454                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1455                 } else {
1456                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1457                     p_sign ^= 1;
1458                     p_exp = c.exp;
1459                 }
1460             } else {
1461                 shift128RightJamming(c_hi, c_lo,
1462                                      exp_diff,
1463                                      &c_hi, &c_lo);
1464                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1465             }
1466 
            /* Exact cancellation: zero whose sign follows the rounding mode. */
1467             if (hi == 0 && lo == 0) {
1468                 a.cls = float_class_zero;
1469                 a.sign = s->float_rounding_mode == float_round_down;
1470                 a.sign ^= sign_flip;
1471                 return a;
1472             } else {
1473                 int shift;
1474                 if (hi != 0) {
1475                     shift = clz64(hi);
1476                 } else {
1477                     shift = clz64(lo) + 64;
1478                 }
1479                 /* Normalizing to a binary point of 124 is the
1480                    correct adjust for the exponent.  However since we're
1481                    shifting, we might as well put the binary point back
1482                    at 62 where we really want it.  Therefore shift as
1483                    if we're leaving 1 bit at the top of the word, but
1484                    adjust the exponent as if we're leaving 3 bits.  */
1485                 shift -= 1;
1486                 if (shift >= 64) {
1487                     lo = lo << (shift - 64);
1488                 } else {
1489                     hi = (hi << shift) | (lo >> (64 - shift));
                    /* Fold any bits shifted out of lo into a sticky bit. */
1490                     lo = hi | ((lo << shift) != 0);
1491                 }
1492                 p_exp -= shift - 2;
1493             }
1494         }
1495     }
1496 
1497     if (flags & float_muladd_halve_result) {
1498         p_exp -= 1;
1499     }
1500 
1501     /* finally prepare our result */
1502     a.cls = float_class_normal;
1503     a.sign = p_sign ^ sign_flip;
1504     a.exp = p_exp;
1505     a.frac = lo;
1506 
1507     return a;
1508 }
1509 
1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1511                                                 int flags, float_status *status)
1512 {
1513     FloatParts pa = float16_unpack_canonical(a, status);
1514     FloatParts pb = float16_unpack_canonical(b, status);
1515     FloatParts pc = float16_unpack_canonical(c, status);
1516     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1517 
1518     return float16_round_pack_canonical(pr, status);
1519 }
1520 
1521 static float32 QEMU_SOFTFLOAT_ATTR
1522 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1523                 float_status *status)
1524 {
1525     FloatParts pa = float32_unpack_canonical(a, status);
1526     FloatParts pb = float32_unpack_canonical(b, status);
1527     FloatParts pc = float32_unpack_canonical(c, status);
1528     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1529 
1530     return float32_round_pack_canonical(pr, status);
1531 }
1532 
1533 static float64 QEMU_SOFTFLOAT_ATTR
1534 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1535                 float_status *status)
1536 {
1537     FloatParts pa = float64_unpack_canonical(a, status);
1538     FloatParts pb = float64_unpack_canonical(b, status);
1539     FloatParts pc = float64_unpack_canonical(c, status);
1540     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1541 
1542     return float64_round_pack_canonical(pr, status);
1543 }
1544 
1545 float32 QEMU_FLATTEN
1546 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1547 {
1548     union_float32 ua, ub, uc, ur;
1549 
1550     ua.s = xa;
1551     ub.s = xb;
1552     uc.s = xc;
1553 
1554     if (unlikely(!can_use_fpu(s))) {
1555         goto soft;
1556     }
1557     if (unlikely(flags & float_muladd_halve_result)) {
1558         goto soft;
1559     }
1560 
1561     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1562     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1563         goto soft;
1564     }
1565     /*
1566      * When (a || b) == 0, there's no need to check for under/over flow,
1567      * since we know the addend is (normal || 0) and the product is 0.
1568      */
1569     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1570         union_float32 up;
1571         bool prod_sign;
1572 
1573         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1574         prod_sign ^= !!(flags & float_muladd_negate_product);
1575         up.s = float32_set_sign(float32_zero, prod_sign);
1576 
1577         if (flags & float_muladd_negate_c) {
1578             uc.h = -uc.h;
1579         }
1580         ur.h = up.h + uc.h;
1581     } else {
1582         if (flags & float_muladd_negate_product) {
1583             ua.h = -ua.h;
1584         }
1585         if (flags & float_muladd_negate_c) {
1586             uc.h = -uc.h;
1587         }
1588 
1589         ur.h = fmaf(ua.h, ub.h, uc.h);
1590 
1591         if (unlikely(f32_is_inf(ur))) {
1592             s->float_exception_flags |= float_flag_overflow;
1593         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1594             goto soft;
1595         }
1596     }
1597     if (flags & float_muladd_negate_result) {
1598         return float32_chs(ur.s);
1599     }
1600     return ur.s;
1601 
1602  soft:
1603     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1604 }
1605 
1606 float64 QEMU_FLATTEN
1607 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1608 {
1609     union_float64 ua, ub, uc, ur;
1610 
1611     ua.s = xa;
1612     ub.s = xb;
1613     uc.s = xc;
1614 
1615     if (unlikely(!can_use_fpu(s))) {
1616         goto soft;
1617     }
1618     if (unlikely(flags & float_muladd_halve_result)) {
1619         goto soft;
1620     }
1621 
1622     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1623     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1624         goto soft;
1625     }
1626     /*
1627      * When (a || b) == 0, there's no need to check for under/over flow,
1628      * since we know the addend is (normal || 0) and the product is 0.
1629      */
1630     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1631         union_float64 up;
1632         bool prod_sign;
1633 
1634         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1635         prod_sign ^= !!(flags & float_muladd_negate_product);
1636         up.s = float64_set_sign(float64_zero, prod_sign);
1637 
1638         if (flags & float_muladd_negate_c) {
1639             uc.h = -uc.h;
1640         }
1641         ur.h = up.h + uc.h;
1642     } else {
1643         if (flags & float_muladd_negate_product) {
1644             ua.h = -ua.h;
1645         }
1646         if (flags & float_muladd_negate_c) {
1647             uc.h = -uc.h;
1648         }
1649 
1650         ur.h = fma(ua.h, ub.h, uc.h);
1651 
1652         if (unlikely(f64_is_inf(ur))) {
1653             s->float_exception_flags |= float_flag_overflow;
1654         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1655             goto soft;
1656         }
1657     }
1658     if (flags & float_muladd_negate_result) {
1659         return float64_chs(ur.s);
1660     }
1661     return ur.s;
1662 
1663  soft:
1664     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1665 }
1666 
1667 /*
1668  * Returns the result of dividing the floating-point value `a' by the
1669  * corresponding value `b'. The operation is performed according to
1670  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1671  */
1672 
1673 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1674 {
1675     bool sign = a.sign ^ b.sign;
1676 
1677     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1678         uint64_t n0, n1, q, r;
1679         int exp = a.exp - b.exp;
1680 
1681         /*
1682          * We want a 2*N / N-bit division to produce exactly an N-bit
1683          * result, so that we do not lose any precision and so that we
1684          * do not have to renormalize afterward.  If A.frac < B.frac,
1685          * then division would produce an (N-1)-bit result; shift A left
1686          * by one to produce the an N-bit result, and decrement the
1687          * exponent to match.
1688          *
1689          * The udiv_qrnnd algorithm that we're using requires normalization,
1690          * i.e. the msb of the denominator must be set.  Since we know that
1691          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1692          * by one (more), and the remainder must be shifted right by one.
1693          */
1694         if (a.frac < b.frac) {
1695             exp -= 1;
1696             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1697         } else {
1698             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1699         }
1700         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1701 
1702         /*
1703          * Set lsb if there is a remainder, to set inexact.
1704          * As mentioned above, to find the actual value of the remainder we
1705          * would need to shift right, but (1) we are only concerned about
1706          * non-zero-ness, and (2) the remainder will always be even because
1707          * both inputs to the division primitive are even.
1708          */
1709         a.frac = q | (r != 0);
1710         a.sign = sign;
1711         a.exp = exp;
1712         return a;
1713     }
1714     /* handle all the NaN cases */
1715     if (is_nan(a.cls) || is_nan(b.cls)) {
1716         return pick_nan(a, b, s);
1717     }
1718     /* 0/0 or Inf/Inf */
1719     if (a.cls == b.cls
1720         &&
1721         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1722         s->float_exception_flags |= float_flag_invalid;
1723         return parts_default_nan(s);
1724     }
1725     /* Inf / x or 0 / x */
1726     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1727         a.sign = sign;
1728         return a;
1729     }
1730     /* Div 0 => Inf */
1731     if (b.cls == float_class_zero) {
1732         s->float_exception_flags |= float_flag_divbyzero;
1733         a.cls = float_class_inf;
1734         a.sign = sign;
1735         return a;
1736     }
1737     /* Div by Inf */
1738     if (b.cls == float_class_inf) {
1739         a.cls = float_class_zero;
1740         a.sign = sign;
1741         return a;
1742     }
1743     g_assert_not_reached();
1744 }
1745 
1746 float16 float16_div(float16 a, float16 b, float_status *status)
1747 {
1748     FloatParts pa = float16_unpack_canonical(a, status);
1749     FloatParts pb = float16_unpack_canonical(b, status);
1750     FloatParts pr = div_floats(pa, pb, status);
1751 
1752     return float16_round_pack_canonical(pr, status);
1753 }
1754 
1755 static float32 QEMU_SOFTFLOAT_ATTR
1756 soft_f32_div(float32 a, float32 b, float_status *status)
1757 {
1758     FloatParts pa = float32_unpack_canonical(a, status);
1759     FloatParts pb = float32_unpack_canonical(b, status);
1760     FloatParts pr = div_floats(pa, pb, status);
1761 
1762     return float32_round_pack_canonical(pr, status);
1763 }
1764 
1765 static float64 QEMU_SOFTFLOAT_ATTR
1766 soft_f64_div(float64 a, float64 b, float_status *status)
1767 {
1768     FloatParts pa = float64_unpack_canonical(a, status);
1769     FloatParts pb = float64_unpack_canonical(b, status);
1770     FloatParts pr = div_floats(pa, pb, status);
1771 
1772     return float64_round_pack_canonical(pr, status);
1773 }
1774 
/* Host single-precision division: returns a / b computed by the C `/'. */
static float hard_f32_div(float a, float b)
{
    const float quotient = a / b;

    return quotient;
}
1779 
/* Host double-precision division: returns a / b computed by the C `/'. */
static double hard_f64_div(double a, double b)
{
    const double quotient = a / b;

    return quotient;
}
1784 
1785 static bool f32_div_pre(union_float32 a, union_float32 b)
1786 {
1787     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1788         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1789                fpclassify(b.h) == FP_NORMAL;
1790     }
1791     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1792 }
1793 
1794 static bool f64_div_pre(union_float64 a, union_float64 b)
1795 {
1796     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1797         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1798                fpclassify(b.h) == FP_NORMAL;
1799     }
1800     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1801 }
1802 
1803 static bool f32_div_post(union_float32 a, union_float32 b)
1804 {
1805     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1806         return fpclassify(a.h) != FP_ZERO;
1807     }
1808     return !float32_is_zero(a.s);
1809 }
1810 
1811 static bool f64_div_post(union_float64 a, union_float64 b)
1812 {
1813     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1814         return fpclassify(a.h) != FP_ZERO;
1815     }
1816     return !float64_is_zero(a.s);
1817 }
1818 
1819 float32 QEMU_FLATTEN
1820 float32_div(float32 a, float32 b, float_status *s)
1821 {
1822     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1823                         f32_div_pre, f32_div_post, NULL, NULL);
1824 }
1825 
1826 float64 QEMU_FLATTEN
1827 float64_div(float64 a, float64 b, float_status *s)
1828 {
1829     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1830                         f64_div_pre, f64_div_post, NULL, NULL);
1831 }
1832 
1833 /*
1834  * Float to Float conversions
1835  *
1836  * Returns the result of converting one float format to another. The
1837  * conversion is performed according to the IEC/IEEE Standard for
1838  * Binary Floating-Point Arithmetic.
1839  *
1840  * The float_to_float helper only needs to take care of raising
1841  * invalid exceptions and handling the conversion on NaNs.
1842  */
1843 
1844 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1845                                  float_status *s)
1846 {
1847     if (dstf->arm_althp) {
1848         switch (a.cls) {
1849         case float_class_qnan:
1850         case float_class_snan:
1851             /* There is no NaN in the destination format.  Raise Invalid
1852              * and return a zero with the sign of the input NaN.
1853              */
1854             s->float_exception_flags |= float_flag_invalid;
1855             a.cls = float_class_zero;
1856             a.frac = 0;
1857             a.exp = 0;
1858             break;
1859 
1860         case float_class_inf:
1861             /* There is no Inf in the destination format.  Raise Invalid
1862              * and return the maximum normal with the correct sign.
1863              */
1864             s->float_exception_flags |= float_flag_invalid;
1865             a.cls = float_class_normal;
1866             a.exp = dstf->exp_max;
1867             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1868             break;
1869 
1870         default:
1871             break;
1872         }
1873     } else if (is_nan(a.cls)) {
1874         if (is_snan(a.cls)) {
1875             s->float_exception_flags |= float_flag_invalid;
1876             a = parts_silence_nan(a, s);
1877         }
1878         if (s->default_nan_mode) {
1879             return parts_default_nan(s);
1880         }
1881     }
1882     return a;
1883 }
1884 
1885 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1886 {
1887     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1888     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1889     FloatParts pr = float_to_float(p, &float32_params, s);
1890     return float32_round_pack_canonical(pr, s);
1891 }
1892 
1893 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1894 {
1895     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1896     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1897     FloatParts pr = float_to_float(p, &float64_params, s);
1898     return float64_round_pack_canonical(pr, s);
1899 }
1900 
1901 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1902 {
1903     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1904     FloatParts p = float32_unpack_canonical(a, s);
1905     FloatParts pr = float_to_float(p, fmt16, s);
1906     return float16a_round_pack_canonical(pr, s, fmt16);
1907 }
1908 
1909 float64 float32_to_float64(float32 a, float_status *s)
1910 {
1911     FloatParts p = float32_unpack_canonical(a, s);
1912     FloatParts pr = float_to_float(p, &float64_params, s);
1913     return float64_round_pack_canonical(pr, s);
1914 }
1915 
1916 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1917 {
1918     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1919     FloatParts p = float64_unpack_canonical(a, s);
1920     FloatParts pr = float_to_float(p, fmt16, s);
1921     return float16a_round_pack_canonical(pr, s, fmt16);
1922 }
1923 
1924 float32 float64_to_float32(float64 a, float_status *s)
1925 {
1926     FloatParts p = float64_unpack_canonical(a, s);
1927     FloatParts pr = float_to_float(p, &float32_params, s);
1928     return float32_round_pack_canonical(pr, s);
1929 }
1930 
1931 /*
1932  * Rounds the floating-point value `a' to an integer, and returns the
1933  * result as a floating-point value. The operation is performed
1934  * according to the IEC/IEEE Standard for Binary Floating-Point
1935  * Arithmetic.
1936  */
1937 
1938 static FloatParts round_to_int(FloatParts a, int rmode,
1939                                int scale, float_status *s)
1940 {
1941     switch (a.cls) {
1942     case float_class_qnan:
1943     case float_class_snan:
1944         return return_nan(a, s);
1945 
1946     case float_class_zero:
1947     case float_class_inf:
1948         /* already "integral" */
1949         break;
1950 
1951     case float_class_normal:
1952         scale = MIN(MAX(scale, -0x10000), 0x10000);
1953         a.exp += scale;
1954 
1955         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1956             /* already integral */
1957             break;
1958         }
1959         if (a.exp < 0) {
1960             bool one;
1961             /* all fractional */
1962             s->float_exception_flags |= float_flag_inexact;
1963             switch (rmode) {
1964             case float_round_nearest_even:
1965                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1966                 break;
1967             case float_round_ties_away:
1968                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1969                 break;
1970             case float_round_to_zero:
1971                 one = false;
1972                 break;
1973             case float_round_up:
1974                 one = !a.sign;
1975                 break;
1976             case float_round_down:
1977                 one = a.sign;
1978                 break;
1979             default:
1980                 g_assert_not_reached();
1981             }
1982 
1983             if (one) {
1984                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1985                 a.exp = 0;
1986             } else {
1987                 a.cls = float_class_zero;
1988             }
1989         } else {
1990             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1991             uint64_t frac_lsbm1 = frac_lsb >> 1;
1992             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1993             uint64_t rnd_mask = rnd_even_mask >> 1;
1994             uint64_t inc;
1995 
1996             switch (rmode) {
1997             case float_round_nearest_even:
1998                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1999                 break;
2000             case float_round_ties_away:
2001                 inc = frac_lsbm1;
2002                 break;
2003             case float_round_to_zero:
2004                 inc = 0;
2005                 break;
2006             case float_round_up:
2007                 inc = a.sign ? 0 : rnd_mask;
2008                 break;
2009             case float_round_down:
2010                 inc = a.sign ? rnd_mask : 0;
2011                 break;
2012             default:
2013                 g_assert_not_reached();
2014             }
2015 
2016             if (a.frac & rnd_mask) {
2017                 s->float_exception_flags |= float_flag_inexact;
2018                 a.frac += inc;
2019                 a.frac &= ~rnd_mask;
2020                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2021                     a.frac >>= 1;
2022                     a.exp++;
2023                 }
2024             }
2025         }
2026         break;
2027     default:
2028         g_assert_not_reached();
2029     }
2030     return a;
2031 }
2032 
2033 float16 float16_round_to_int(float16 a, float_status *s)
2034 {
2035     FloatParts pa = float16_unpack_canonical(a, s);
2036     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2037     return float16_round_pack_canonical(pr, s);
2038 }
2039 
2040 float32 float32_round_to_int(float32 a, float_status *s)
2041 {
2042     FloatParts pa = float32_unpack_canonical(a, s);
2043     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2044     return float32_round_pack_canonical(pr, s);
2045 }
2046 
2047 float64 float64_round_to_int(float64 a, float_status *s)
2048 {
2049     FloatParts pa = float64_unpack_canonical(a, s);
2050     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2051     return float64_round_pack_canonical(pr, s);
2052 }
2053 
2054 /*
2055  * Returns the result of converting the floating-point value `a' to
2056  * the two's complement integer format. The conversion is performed
2057  * according to the IEC/IEEE Standard for Binary Floating-Point
2058  * Arithmetic---which means in particular that the conversion is
2059  * rounded according to the current rounding mode. If `a' is a NaN,
2060  * the largest positive integer is returned. Otherwise, if the
2061  * conversion overflows, the largest integer with the same sign as `a'
2062  * is returned.
2063 */
2064 
2065 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2066                                      int64_t min, int64_t max,
2067                                      float_status *s)
2068 {
2069     uint64_t r;
2070     int orig_flags = get_float_exception_flags(s);
2071     FloatParts p = round_to_int(in, rmode, scale, s);
2072 
2073     switch (p.cls) {
2074     case float_class_snan:
2075     case float_class_qnan:
2076         s->float_exception_flags = orig_flags | float_flag_invalid;
2077         return max;
2078     case float_class_inf:
2079         s->float_exception_flags = orig_flags | float_flag_invalid;
2080         return p.sign ? min : max;
2081     case float_class_zero:
2082         return 0;
2083     case float_class_normal:
2084         if (p.exp < DECOMPOSED_BINARY_POINT) {
2085             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2086         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2087             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2088         } else {
2089             r = UINT64_MAX;
2090         }
2091         if (p.sign) {
2092             if (r <= -(uint64_t) min) {
2093                 return -r;
2094             } else {
2095                 s->float_exception_flags = orig_flags | float_flag_invalid;
2096                 return min;
2097             }
2098         } else {
2099             if (r <= max) {
2100                 return r;
2101             } else {
2102                 s->float_exception_flags = orig_flags | float_flag_invalid;
2103                 return max;
2104             }
2105         }
2106     default:
2107         g_assert_not_reached();
2108     }
2109 }
2110 
2111 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2112                                 float_status *s)
2113 {
2114     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2115                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2116 }
2117 
2118 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2119                                 float_status *s)
2120 {
2121     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2122                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2123 }
2124 
2125 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2126                                 float_status *s)
2127 {
2128     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2129                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2130 }
2131 
2132 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2133                                 float_status *s)
2134 {
2135     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2136                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2137 }
2138 
2139 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2140                                 float_status *s)
2141 {
2142     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2143                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2144 }
2145 
2146 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2147                                 float_status *s)
2148 {
2149     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2150                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2151 }
2152 
2153 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2154                                 float_status *s)
2155 {
2156     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2157                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2158 }
2159 
2160 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2161                                 float_status *s)
2162 {
2163     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2164                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2165 }
2166 
2167 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2168                                 float_status *s)
2169 {
2170     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2171                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2172 }
2173 
2174 int16_t float16_to_int16(float16 a, float_status *s)
2175 {
2176     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2177 }
2178 
2179 int32_t float16_to_int32(float16 a, float_status *s)
2180 {
2181     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2182 }
2183 
2184 int64_t float16_to_int64(float16 a, float_status *s)
2185 {
2186     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2187 }
2188 
2189 int16_t float32_to_int16(float32 a, float_status *s)
2190 {
2191     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2192 }
2193 
2194 int32_t float32_to_int32(float32 a, float_status *s)
2195 {
2196     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2197 }
2198 
2199 int64_t float32_to_int64(float32 a, float_status *s)
2200 {
2201     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2202 }
2203 
2204 int16_t float64_to_int16(float64 a, float_status *s)
2205 {
2206     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2207 }
2208 
2209 int32_t float64_to_int32(float64 a, float_status *s)
2210 {
2211     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2212 }
2213 
2214 int64_t float64_to_int64(float64 a, float_status *s)
2215 {
2216     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2217 }
2218 
2219 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2220 {
2221     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2222 }
2223 
2224 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2225 {
2226     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2227 }
2228 
2229 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2230 {
2231     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2232 }
2233 
2234 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2235 {
2236     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2237 }
2238 
2239 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2240 {
2241     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2242 }
2243 
2244 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2245 {
2246     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2247 }
2248 
2249 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2250 {
2251     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2252 }
2253 
2254 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2255 {
2256     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2257 }
2258 
2259 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2260 {
2261     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2262 }
2263 
/*
 *  Returns the result of converting the floating-point value `a' to
 *  the unsigned integer format. The conversion is performed according
 *  to the IEC/IEEE Standard for Binary Floating-Point
 *  Arithmetic---which means in particular that the conversion is
 *  rounded according to the current rounding mode. If `a' is a NaN,
 *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the result is rounded and zero is returned;
 *  values that do not round to zero will raise the invalid exception
 *  flag.
 */
2276 
2277 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2278                                        uint64_t max, float_status *s)
2279 {
2280     int orig_flags = get_float_exception_flags(s);
2281     FloatParts p = round_to_int(in, rmode, scale, s);
2282     uint64_t r;
2283 
2284     switch (p.cls) {
2285     case float_class_snan:
2286     case float_class_qnan:
2287         s->float_exception_flags = orig_flags | float_flag_invalid;
2288         return max;
2289     case float_class_inf:
2290         s->float_exception_flags = orig_flags | float_flag_invalid;
2291         return p.sign ? 0 : max;
2292     case float_class_zero:
2293         return 0;
2294     case float_class_normal:
2295         if (p.sign) {
2296             s->float_exception_flags = orig_flags | float_flag_invalid;
2297             return 0;
2298         }
2299 
2300         if (p.exp < DECOMPOSED_BINARY_POINT) {
2301             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2302         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2303             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2304         } else {
2305             s->float_exception_flags = orig_flags | float_flag_invalid;
2306             return max;
2307         }
2308 
2309         /* For uint64 this will never trip, but if p.exp is too large
2310          * to shift a decomposed fraction we shall have exited via the
2311          * 3rd leg above.
2312          */
2313         if (r > max) {
2314             s->float_exception_flags = orig_flags | float_flag_invalid;
2315             return max;
2316         }
2317         return r;
2318     default:
2319         g_assert_not_reached();
2320     }
2321 }
2322 
2323 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2324                                   float_status *s)
2325 {
2326     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2327                                   rmode, scale, UINT16_MAX, s);
2328 }
2329 
2330 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2331                                   float_status *s)
2332 {
2333     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2334                                   rmode, scale, UINT32_MAX, s);
2335 }
2336 
2337 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2338                                   float_status *s)
2339 {
2340     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2341                                   rmode, scale, UINT64_MAX, s);
2342 }
2343 
2344 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2345                                   float_status *s)
2346 {
2347     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2348                                   rmode, scale, UINT16_MAX, s);
2349 }
2350 
2351 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2352                                   float_status *s)
2353 {
2354     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2355                                   rmode, scale, UINT32_MAX, s);
2356 }
2357 
2358 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2359                                   float_status *s)
2360 {
2361     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2362                                   rmode, scale, UINT64_MAX, s);
2363 }
2364 
2365 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2366                                   float_status *s)
2367 {
2368     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2369                                   rmode, scale, UINT16_MAX, s);
2370 }
2371 
2372 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2373                                   float_status *s)
2374 {
2375     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2376                                   rmode, scale, UINT32_MAX, s);
2377 }
2378 
2379 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2380                                   float_status *s)
2381 {
2382     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2383                                   rmode, scale, UINT64_MAX, s);
2384 }
2385 
2386 uint16_t float16_to_uint16(float16 a, float_status *s)
2387 {
2388     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2389 }
2390 
2391 uint32_t float16_to_uint32(float16 a, float_status *s)
2392 {
2393     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2394 }
2395 
2396 uint64_t float16_to_uint64(float16 a, float_status *s)
2397 {
2398     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2399 }
2400 
2401 uint16_t float32_to_uint16(float32 a, float_status *s)
2402 {
2403     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2404 }
2405 
2406 uint32_t float32_to_uint32(float32 a, float_status *s)
2407 {
2408     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2409 }
2410 
2411 uint64_t float32_to_uint64(float32 a, float_status *s)
2412 {
2413     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2414 }
2415 
2416 uint16_t float64_to_uint16(float64 a, float_status *s)
2417 {
2418     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2419 }
2420 
2421 uint32_t float64_to_uint32(float64 a, float_status *s)
2422 {
2423     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2424 }
2425 
2426 uint64_t float64_to_uint64(float64 a, float_status *s)
2427 {
2428     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2429 }
2430 
2431 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2432 {
2433     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2434 }
2435 
2436 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2437 {
2438     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2439 }
2440 
2441 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2442 {
2443     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2444 }
2445 
2446 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2447 {
2448     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2449 }
2450 
2451 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2452 {
2453     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2454 }
2455 
2456 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2457 {
2458     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2459 }
2460 
2461 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2462 {
2463     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2464 }
2465 
2466 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2467 {
2468     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2469 }
2470 
2471 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2472 {
2473     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2474 }
2475 
2476 /*
2477  * Integer to float conversions
2478  *
2479  * Returns the result of converting the two's complement integer `a'
2480  * to the floating-point format. The conversion is performed according
2481  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2482  */
2483 
2484 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2485 {
2486     FloatParts r = { .sign = false };
2487 
2488     if (a == 0) {
2489         r.cls = float_class_zero;
2490     } else {
2491         uint64_t f = a;
2492         int shift;
2493 
2494         r.cls = float_class_normal;
2495         if (a < 0) {
2496             f = -f;
2497             r.sign = true;
2498         }
2499         shift = clz64(f) - 1;
2500         scale = MIN(MAX(scale, -0x10000), 0x10000);
2501 
2502         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2503         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2504     }
2505 
2506     return r;
2507 }
2508 
2509 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2510 {
2511     FloatParts pa = int_to_float(a, scale, status);
2512     return float16_round_pack_canonical(pa, status);
2513 }
2514 
2515 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2516 {
2517     return int64_to_float16_scalbn(a, scale, status);
2518 }
2519 
2520 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2521 {
2522     return int64_to_float16_scalbn(a, scale, status);
2523 }
2524 
2525 float16 int64_to_float16(int64_t a, float_status *status)
2526 {
2527     return int64_to_float16_scalbn(a, 0, status);
2528 }
2529 
2530 float16 int32_to_float16(int32_t a, float_status *status)
2531 {
2532     return int64_to_float16_scalbn(a, 0, status);
2533 }
2534 
2535 float16 int16_to_float16(int16_t a, float_status *status)
2536 {
2537     return int64_to_float16_scalbn(a, 0, status);
2538 }
2539 
2540 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2541 {
2542     FloatParts pa = int_to_float(a, scale, status);
2543     return float32_round_pack_canonical(pa, status);
2544 }
2545 
2546 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2547 {
2548     return int64_to_float32_scalbn(a, scale, status);
2549 }
2550 
2551 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2552 {
2553     return int64_to_float32_scalbn(a, scale, status);
2554 }
2555 
2556 float32 int64_to_float32(int64_t a, float_status *status)
2557 {
2558     return int64_to_float32_scalbn(a, 0, status);
2559 }
2560 
2561 float32 int32_to_float32(int32_t a, float_status *status)
2562 {
2563     return int64_to_float32_scalbn(a, 0, status);
2564 }
2565 
2566 float32 int16_to_float32(int16_t a, float_status *status)
2567 {
2568     return int64_to_float32_scalbn(a, 0, status);
2569 }
2570 
2571 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2572 {
2573     FloatParts pa = int_to_float(a, scale, status);
2574     return float64_round_pack_canonical(pa, status);
2575 }
2576 
2577 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2578 {
2579     return int64_to_float64_scalbn(a, scale, status);
2580 }
2581 
2582 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2583 {
2584     return int64_to_float64_scalbn(a, scale, status);
2585 }
2586 
2587 float64 int64_to_float64(int64_t a, float_status *status)
2588 {
2589     return int64_to_float64_scalbn(a, 0, status);
2590 }
2591 
2592 float64 int32_to_float64(int32_t a, float_status *status)
2593 {
2594     return int64_to_float64_scalbn(a, 0, status);
2595 }
2596 
2597 float64 int16_to_float64(int16_t a, float_status *status)
2598 {
2599     return int64_to_float64_scalbn(a, 0, status);
2600 }
2601 
2602 
2603 /*
2604  * Unsigned Integer to float conversions
2605  *
2606  * Returns the result of converting the unsigned integer `a' to the
2607  * floating-point format. The conversion is performed according to the
2608  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2609  */
2610 
2611 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2612 {
2613     FloatParts r = { .sign = false };
2614 
2615     if (a == 0) {
2616         r.cls = float_class_zero;
2617     } else {
2618         scale = MIN(MAX(scale, -0x10000), 0x10000);
2619         r.cls = float_class_normal;
2620         if ((int64_t)a < 0) {
2621             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2622             shift64RightJamming(a, 1, &a);
2623             r.frac = a;
2624         } else {
2625             int shift = clz64(a) - 1;
2626             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2627             r.frac = a << shift;
2628         }
2629     }
2630 
2631     return r;
2632 }
2633 
2634 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2635 {
2636     FloatParts pa = uint_to_float(a, scale, status);
2637     return float16_round_pack_canonical(pa, status);
2638 }
2639 
2640 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2641 {
2642     return uint64_to_float16_scalbn(a, scale, status);
2643 }
2644 
2645 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2646 {
2647     return uint64_to_float16_scalbn(a, scale, status);
2648 }
2649 
2650 float16 uint64_to_float16(uint64_t a, float_status *status)
2651 {
2652     return uint64_to_float16_scalbn(a, 0, status);
2653 }
2654 
2655 float16 uint32_to_float16(uint32_t a, float_status *status)
2656 {
2657     return uint64_to_float16_scalbn(a, 0, status);
2658 }
2659 
2660 float16 uint16_to_float16(uint16_t a, float_status *status)
2661 {
2662     return uint64_to_float16_scalbn(a, 0, status);
2663 }
2664 
2665 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2666 {
2667     FloatParts pa = uint_to_float(a, scale, status);
2668     return float32_round_pack_canonical(pa, status);
2669 }
2670 
2671 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2672 {
2673     return uint64_to_float32_scalbn(a, scale, status);
2674 }
2675 
2676 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2677 {
2678     return uint64_to_float32_scalbn(a, scale, status);
2679 }
2680 
2681 float32 uint64_to_float32(uint64_t a, float_status *status)
2682 {
2683     return uint64_to_float32_scalbn(a, 0, status);
2684 }
2685 
2686 float32 uint32_to_float32(uint32_t a, float_status *status)
2687 {
2688     return uint64_to_float32_scalbn(a, 0, status);
2689 }
2690 
2691 float32 uint16_to_float32(uint16_t a, float_status *status)
2692 {
2693     return uint64_to_float32_scalbn(a, 0, status);
2694 }
2695 
2696 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2697 {
2698     FloatParts pa = uint_to_float(a, scale, status);
2699     return float64_round_pack_canonical(pa, status);
2700 }
2701 
2702 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2703 {
2704     return uint64_to_float64_scalbn(a, scale, status);
2705 }
2706 
2707 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2708 {
2709     return uint64_to_float64_scalbn(a, scale, status);
2710 }
2711 
2712 float64 uint64_to_float64(uint64_t a, float_status *status)
2713 {
2714     return uint64_to_float64_scalbn(a, 0, status);
2715 }
2716 
2717 float64 uint32_to_float64(uint32_t a, float_status *status)
2718 {
2719     return uint64_to_float64_scalbn(a, 0, status);
2720 }
2721 
2722 float64 uint16_to_float64(uint16_t a, float_status *status)
2723 {
2724     return uint64_to_float64_scalbn(a, 0, status);
2725 }
2726 
2727 /* Float Min/Max */
2728 /* min() and max() functions. These can't be implemented as
2729  * 'compare and pick one input' because that would mishandle
2730  * NaNs and +0 vs -0.
2731  *
2732  * minnum() and maxnum() functions. These are similar to the min()
2733  * and max() functions but if one of the arguments is a QNaN and
2734  * the other is numerical then the numerical argument is returned.
2735  * SNaNs will get quietened before being returned.
2736  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2737  * and maxNum() operations. min() and max() are the typical min/max
2738  * semantics provided by many CPUs which predate that specification.
2739  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
2742  */
2743 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2744                                 bool ieee, bool ismag, float_status *s)
2745 {
2746     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2747         if (ieee) {
2748             /* Takes two floating-point values `a' and `b', one of
2749              * which is a NaN, and returns the appropriate NaN
2750              * result. If either `a' or `b' is a signaling NaN,
2751              * the invalid exception is raised.
2752              */
2753             if (is_snan(a.cls) || is_snan(b.cls)) {
2754                 return pick_nan(a, b, s);
2755             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2756                 return b;
2757             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2758                 return a;
2759             }
2760         }
2761         return pick_nan(a, b, s);
2762     } else {
2763         int a_exp, b_exp;
2764 
2765         switch (a.cls) {
2766         case float_class_normal:
2767             a_exp = a.exp;
2768             break;
2769         case float_class_inf:
2770             a_exp = INT_MAX;
2771             break;
2772         case float_class_zero:
2773             a_exp = INT_MIN;
2774             break;
2775         default:
2776             g_assert_not_reached();
2777             break;
2778         }
2779         switch (b.cls) {
2780         case float_class_normal:
2781             b_exp = b.exp;
2782             break;
2783         case float_class_inf:
2784             b_exp = INT_MAX;
2785             break;
2786         case float_class_zero:
2787             b_exp = INT_MIN;
2788             break;
2789         default:
2790             g_assert_not_reached();
2791             break;
2792         }
2793 
2794         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2795             bool a_less = a_exp < b_exp;
2796             if (a_exp == b_exp) {
2797                 a_less = a.frac < b.frac;
2798             }
2799             return a_less ^ ismin ? b : a;
2800         }
2801 
2802         if (a.sign == b.sign) {
2803             bool a_less = a_exp < b_exp;
2804             if (a_exp == b_exp) {
2805                 a_less = a.frac < b.frac;
2806             }
2807             return a.sign ^ a_less ^ ismin ? b : a;
2808         } else {
2809             return a.sign ^ ismin ? b : a;
2810         }
2811     }
2812 }
2813 
2814 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2815 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2816                                      float_status *s)                   \
2817 {                                                                       \
2818     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2819     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2820     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2821                                                                         \
2822     return float ## sz ## _round_pack_canonical(pr, s);                 \
2823 }
2824 
2825 MINMAX(16, min, true, false, false)
2826 MINMAX(16, minnum, true, true, false)
2827 MINMAX(16, minnummag, true, true, true)
2828 MINMAX(16, max, false, false, false)
2829 MINMAX(16, maxnum, false, true, false)
2830 MINMAX(16, maxnummag, false, true, true)
2831 
2832 MINMAX(32, min, true, false, false)
2833 MINMAX(32, minnum, true, true, false)
2834 MINMAX(32, minnummag, true, true, true)
2835 MINMAX(32, max, false, false, false)
2836 MINMAX(32, maxnum, false, true, false)
2837 MINMAX(32, maxnummag, false, true, true)
2838 
2839 MINMAX(64, min, true, false, false)
2840 MINMAX(64, minnum, true, true, false)
2841 MINMAX(64, minnummag, true, true, true)
2842 MINMAX(64, max, false, false, false)
2843 MINMAX(64, maxnum, false, true, false)
2844 MINMAX(64, maxnummag, false, true, true)
2845 
2846 #undef MINMAX
2847 
2848 /* Floating point compare */
2849 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2850                           float_status *s)
2851 {
2852     if (is_nan(a.cls) || is_nan(b.cls)) {
2853         if (!is_quiet ||
2854             a.cls == float_class_snan ||
2855             b.cls == float_class_snan) {
2856             s->float_exception_flags |= float_flag_invalid;
2857         }
2858         return float_relation_unordered;
2859     }
2860 
2861     if (a.cls == float_class_zero) {
2862         if (b.cls == float_class_zero) {
2863             return float_relation_equal;
2864         }
2865         return b.sign ? float_relation_greater : float_relation_less;
2866     } else if (b.cls == float_class_zero) {
2867         return a.sign ? float_relation_less : float_relation_greater;
2868     }
2869 
2870     /* The only really important thing about infinity is its sign. If
2871      * both are infinities the sign marks the smallest of the two.
2872      */
2873     if (a.cls == float_class_inf) {
2874         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2875             return float_relation_equal;
2876         }
2877         return a.sign ? float_relation_less : float_relation_greater;
2878     } else if (b.cls == float_class_inf) {
2879         return b.sign ? float_relation_greater : float_relation_less;
2880     }
2881 
2882     if (a.sign != b.sign) {
2883         return a.sign ? float_relation_less : float_relation_greater;
2884     }
2885 
2886     if (a.exp == b.exp) {
2887         if (a.frac == b.frac) {
2888             return float_relation_equal;
2889         }
2890         if (a.sign) {
2891             return a.frac > b.frac ?
2892                 float_relation_less : float_relation_greater;
2893         } else {
2894             return a.frac > b.frac ?
2895                 float_relation_greater : float_relation_less;
2896         }
2897     } else {
2898         if (a.sign) {
2899             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2900         } else {
2901             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2902         }
2903     }
2904 }
2905 
2906 #define COMPARE(name, attr, sz)                                         \
2907 static int attr                                                         \
2908 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
2909 {                                                                       \
2910     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2911     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2912     return compare_floats(pa, pb, is_quiet, s);                         \
2913 }
2914 
2915 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2916 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2917 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2918 
2919 #undef COMPARE
2920 
2921 int float16_compare(float16 a, float16 b, float_status *s)
2922 {
2923     return soft_f16_compare(a, b, false, s);
2924 }
2925 
2926 int float16_compare_quiet(float16 a, float16 b, float_status *s)
2927 {
2928     return soft_f16_compare(a, b, true, s);
2929 }
2930 
2931 static int QEMU_FLATTEN
2932 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2933 {
2934     union_float32 ua, ub;
2935 
2936     ua.s = xa;
2937     ub.s = xb;
2938 
2939     if (QEMU_NO_HARDFLOAT) {
2940         goto soft;
2941     }
2942 
2943     float32_input_flush2(&ua.s, &ub.s, s);
2944     if (isgreaterequal(ua.h, ub.h)) {
2945         if (isgreater(ua.h, ub.h)) {
2946             return float_relation_greater;
2947         }
2948         return float_relation_equal;
2949     }
2950     if (likely(isless(ua.h, ub.h))) {
2951         return float_relation_less;
2952     }
2953     /* The only condition remaining is unordered.
2954      * Fall through to set flags.
2955      */
2956  soft:
2957     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2958 }
2959 
2960 int float32_compare(float32 a, float32 b, float_status *s)
2961 {
2962     return f32_compare(a, b, false, s);
2963 }
2964 
2965 int float32_compare_quiet(float32 a, float32 b, float_status *s)
2966 {
2967     return f32_compare(a, b, true, s);
2968 }
2969 
2970 static int QEMU_FLATTEN
2971 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
2972 {
2973     union_float64 ua, ub;
2974 
2975     ua.s = xa;
2976     ub.s = xb;
2977 
2978     if (QEMU_NO_HARDFLOAT) {
2979         goto soft;
2980     }
2981 
2982     float64_input_flush2(&ua.s, &ub.s, s);
2983     if (isgreaterequal(ua.h, ub.h)) {
2984         if (isgreater(ua.h, ub.h)) {
2985             return float_relation_greater;
2986         }
2987         return float_relation_equal;
2988     }
2989     if (likely(isless(ua.h, ub.h))) {
2990         return float_relation_less;
2991     }
2992     /* The only condition remaining is unordered.
2993      * Fall through to set flags.
2994      */
2995  soft:
2996     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
2997 }
2998 
2999 int float64_compare(float64 a, float64 b, float_status *s)
3000 {
3001     return f64_compare(a, b, false, s);
3002 }
3003 
3004 int float64_compare_quiet(float64 a, float64 b, float_status *s)
3005 {
3006     return f64_compare(a, b, true, s);
3007 }
3008 
3009 /* Multiply A by 2 raised to the power N.  */
3010 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3011 {
3012     if (unlikely(is_nan(a.cls))) {
3013         return return_nan(a, s);
3014     }
3015     if (a.cls == float_class_normal) {
3016         /* The largest float type (even though not supported by FloatParts)
3017          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3018          * still allows rounding to infinity, without allowing overflow
3019          * within the int32_t that backs FloatParts.exp.
3020          */
3021         n = MIN(MAX(n, -0x10000), 0x10000);
3022         a.exp += n;
3023     }
3024     return a;
3025 }
3026 
3027 float16 float16_scalbn(float16 a, int n, float_status *status)
3028 {
3029     FloatParts pa = float16_unpack_canonical(a, status);
3030     FloatParts pr = scalbn_decomposed(pa, n, status);
3031     return float16_round_pack_canonical(pr, status);
3032 }
3033 
3034 float32 float32_scalbn(float32 a, int n, float_status *status)
3035 {
3036     FloatParts pa = float32_unpack_canonical(a, status);
3037     FloatParts pr = scalbn_decomposed(pa, n, status);
3038     return float32_round_pack_canonical(pr, status);
3039 }
3040 
3041 float64 float64_scalbn(float64 a, int n, float_status *status)
3042 {
3043     FloatParts pa = float64_unpack_canonical(a, status);
3044     FloatParts pr = scalbn_decomposed(pa, n, status);
3045     return float64_round_pack_canonical(pr, status);
3046 }
3047 
3048 /*
3049  * Square Root
3050  *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
3055  *
3056  * This does mean however the calculation is slower than before,
3057  * especially for 64 bit floats.
3058  */
3059 
3060 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3061 {
3062     uint64_t a_frac, r_frac, s_frac;
3063     int bit, last_bit;
3064 
3065     if (is_nan(a.cls)) {
3066         return return_nan(a, s);
3067     }
3068     if (a.cls == float_class_zero) {
3069         return a;  /* sqrt(+-0) = +-0 */
3070     }
3071     if (a.sign) {
3072         s->float_exception_flags |= float_flag_invalid;
3073         return parts_default_nan(s);
3074     }
3075     if (a.cls == float_class_inf) {
3076         return a;  /* sqrt(+inf) = +inf */
3077     }
3078 
3079     assert(a.cls == float_class_normal);
3080 
3081     /* We need two overflow bits at the top. Adding room for that is a
3082      * right shift. If the exponent is odd, we can discard the low bit
3083      * by multiplying the fraction by 2; that's a left shift. Combine
3084      * those and we shift right if the exponent is even.
3085      */
3086     a_frac = a.frac;
3087     if (!(a.exp & 1)) {
3088         a_frac >>= 1;
3089     }
3090     a.exp >>= 1;
3091 
3092     /* Bit-by-bit computation of sqrt.  */
3093     r_frac = 0;
3094     s_frac = 0;
3095 
3096     /* Iterate from implicit bit down to the 3 extra bits to compute a
3097      * properly rounded result. Remember we've inserted one more bit
3098      * at the top, so these positions are one less.
3099      */
3100     bit = DECOMPOSED_BINARY_POINT - 1;
3101     last_bit = MAX(p->frac_shift - 4, 0);
3102     do {
3103         uint64_t q = 1ULL << bit;
3104         uint64_t t_frac = s_frac + q;
3105         if (t_frac <= a_frac) {
3106             s_frac = t_frac + q;
3107             a_frac -= t_frac;
3108             r_frac += q;
3109         }
3110         a_frac <<= 1;
3111     } while (--bit >= last_bit);
3112 
3113     /* Undo the right shift done above. If there is any remaining
3114      * fraction, the result is inexact. Set the sticky bit.
3115      */
3116     a.frac = (r_frac << 1) + (a_frac != 0);
3117 
3118     return a;
3119 }
3120 
3121 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3122 {
3123     FloatParts pa = float16_unpack_canonical(a, status);
3124     FloatParts pr = sqrt_float(pa, status, &float16_params);
3125     return float16_round_pack_canonical(pr, status);
3126 }
3127 
3128 static float32 QEMU_SOFTFLOAT_ATTR
3129 soft_f32_sqrt(float32 a, float_status *status)
3130 {
3131     FloatParts pa = float32_unpack_canonical(a, status);
3132     FloatParts pr = sqrt_float(pa, status, &float32_params);
3133     return float32_round_pack_canonical(pr, status);
3134 }
3135 
3136 static float64 QEMU_SOFTFLOAT_ATTR
3137 soft_f64_sqrt(float64 a, float_status *status)
3138 {
3139     FloatParts pa = float64_unpack_canonical(a, status);
3140     FloatParts pr = sqrt_float(pa, status, &float64_params);
3141     return float64_round_pack_canonical(pr, status);
3142 }
3143 
3144 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3145 {
3146     union_float32 ua, ur;
3147 
3148     ua.s = xa;
3149     if (unlikely(!can_use_fpu(s))) {
3150         goto soft;
3151     }
3152 
3153     float32_input_flush1(&ua.s, s);
3154     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3155         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3156                        fpclassify(ua.h) == FP_ZERO) ||
3157                      signbit(ua.h))) {
3158             goto soft;
3159         }
3160     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3161                         float32_is_neg(ua.s))) {
3162         goto soft;
3163     }
3164     ur.h = sqrtf(ua.h);
3165     return ur.s;
3166 
3167  soft:
3168     return soft_f32_sqrt(ua.s, s);
3169 }
3170 
3171 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3172 {
3173     union_float64 ua, ur;
3174 
3175     ua.s = xa;
3176     if (unlikely(!can_use_fpu(s))) {
3177         goto soft;
3178     }
3179 
3180     float64_input_flush1(&ua.s, s);
3181     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3182         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3183                        fpclassify(ua.h) == FP_ZERO) ||
3184                      signbit(ua.h))) {
3185             goto soft;
3186         }
3187     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3188                         float64_is_neg(ua.s))) {
3189         goto soft;
3190     }
3191     ur.h = sqrt(ua.h);
3192     return ur.s;
3193 
3194  soft:
3195     return soft_f64_sqrt(ua.s, s);
3196 }
3197 
3198 /*----------------------------------------------------------------------------
3199 | The pattern for a default generated NaN.
3200 *----------------------------------------------------------------------------*/
3201 
3202 float16 float16_default_nan(float_status *status)
3203 {
3204     FloatParts p = parts_default_nan(status);
3205     p.frac >>= float16_params.frac_shift;
3206     return float16_pack_raw(p);
3207 }
3208 
3209 float32 float32_default_nan(float_status *status)
3210 {
3211     FloatParts p = parts_default_nan(status);
3212     p.frac >>= float32_params.frac_shift;
3213     return float32_pack_raw(p);
3214 }
3215 
3216 float64 float64_default_nan(float_status *status)
3217 {
3218     FloatParts p = parts_default_nan(status);
3219     p.frac >>= float64_params.frac_shift;
3220     return float64_pack_raw(p);
3221 }
3222 
3223 float128 float128_default_nan(float_status *status)
3224 {
3225     FloatParts p = parts_default_nan(status);
3226     float128 r;
3227 
3228     /* Extrapolate from the choices made by parts_default_nan to fill
3229      * in the quad-floating format.  If the low bit is set, assume we
3230      * want to set all non-snan bits.
3231      */
3232     r.low = -(p.frac & 1);
3233     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3234     r.high |= LIT64(0x7FFF000000000000);
3235     r.high |= (uint64_t)p.sign << 63;
3236 
3237     return r;
3238 }
3239 
3240 /*----------------------------------------------------------------------------
3241 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3242 *----------------------------------------------------------------------------*/
3243 
3244 float16 float16_silence_nan(float16 a, float_status *status)
3245 {
3246     FloatParts p = float16_unpack_raw(a);
3247     p.frac <<= float16_params.frac_shift;
3248     p = parts_silence_nan(p, status);
3249     p.frac >>= float16_params.frac_shift;
3250     return float16_pack_raw(p);
3251 }
3252 
3253 float32 float32_silence_nan(float32 a, float_status *status)
3254 {
3255     FloatParts p = float32_unpack_raw(a);
3256     p.frac <<= float32_params.frac_shift;
3257     p = parts_silence_nan(p, status);
3258     p.frac >>= float32_params.frac_shift;
3259     return float32_pack_raw(p);
3260 }
3261 
3262 float64 float64_silence_nan(float64 a, float_status *status)
3263 {
3264     FloatParts p = float64_unpack_raw(a);
3265     p.frac <<= float64_params.frac_shift;
3266     p = parts_silence_nan(p, status);
3267     p.frac >>= float64_params.frac_shift;
3268     return float64_pack_raw(p);
3269 }
3270 
3271 /*----------------------------------------------------------------------------
3272 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3273 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3274 | input.  If `zSign' is 1, the input is negated before being converted to an
3275 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3276 | is simply rounded to an integer, with the inexact exception raised if the
3277 | input cannot be represented exactly as an integer.  However, if the fixed-
3278 | point input is too large, the invalid exception is raised and the largest
3279 | positive or negative integer is returned.
3280 *----------------------------------------------------------------------------*/
3281 
3282 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3283 {
3284     int8_t roundingMode;
3285     flag roundNearestEven;
3286     int8_t roundIncrement, roundBits;
3287     int32_t z;
3288 
3289     roundingMode = status->float_rounding_mode;
3290     roundNearestEven = ( roundingMode == float_round_nearest_even );
3291     switch (roundingMode) {
3292     case float_round_nearest_even:
3293     case float_round_ties_away:
3294         roundIncrement = 0x40;
3295         break;
3296     case float_round_to_zero:
3297         roundIncrement = 0;
3298         break;
3299     case float_round_up:
3300         roundIncrement = zSign ? 0 : 0x7f;
3301         break;
3302     case float_round_down:
3303         roundIncrement = zSign ? 0x7f : 0;
3304         break;
3305     default:
3306         abort();
3307     }
3308     roundBits = absZ & 0x7F;
3309     absZ = ( absZ + roundIncrement )>>7;
3310     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3311     z = absZ;
3312     if ( zSign ) z = - z;
3313     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3314         float_raise(float_flag_invalid, status);
3315         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3316     }
3317     if (roundBits) {
3318         status->float_exception_flags |= float_flag_inexact;
3319     }
3320     return z;
3321 
3322 }
3323 
3324 /*----------------------------------------------------------------------------
3325 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3326 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3327 | and returns the properly rounded 64-bit integer corresponding to the input.
3328 | If `zSign' is 1, the input is negated before being converted to an integer.
3329 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3330 | the inexact exception raised if the input cannot be represented exactly as
3331 | an integer.  However, if the fixed-point input is too large, the invalid
3332 | exception is raised and the largest positive or negative integer is
3333 | returned.
3334 *----------------------------------------------------------------------------*/
3335 
3336 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3337                                float_status *status)
3338 {
3339     int8_t roundingMode;
3340     flag roundNearestEven, increment;
3341     int64_t z;
3342 
3343     roundingMode = status->float_rounding_mode;
3344     roundNearestEven = ( roundingMode == float_round_nearest_even );
3345     switch (roundingMode) {
3346     case float_round_nearest_even:
3347     case float_round_ties_away:
3348         increment = ((int64_t) absZ1 < 0);
3349         break;
3350     case float_round_to_zero:
3351         increment = 0;
3352         break;
3353     case float_round_up:
3354         increment = !zSign && absZ1;
3355         break;
3356     case float_round_down:
3357         increment = zSign && absZ1;
3358         break;
3359     default:
3360         abort();
3361     }
3362     if ( increment ) {
3363         ++absZ0;
3364         if ( absZ0 == 0 ) goto overflow;
3365         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3366     }
3367     z = absZ0;
3368     if ( zSign ) z = - z;
3369     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3370  overflow:
3371         float_raise(float_flag_invalid, status);
3372         return
3373               zSign ? (int64_t) LIT64( 0x8000000000000000 )
3374             : LIT64( 0x7FFFFFFFFFFFFFFF );
3375     }
3376     if (absZ1) {
3377         status->float_exception_flags |= float_flag_inexact;
3378     }
3379     return z;
3380 
3381 }
3382 
3383 /*----------------------------------------------------------------------------
3384 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3385 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3386 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3387 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3388 | with the inexact exception raised if the input cannot be represented exactly
3389 | as an integer.  However, if the fixed-point input is too large, the invalid
3390 | exception is raised and the largest unsigned integer is returned.
3391 *----------------------------------------------------------------------------*/
3392 
3393 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3394                                 uint64_t absZ1, float_status *status)
3395 {
3396     int8_t roundingMode;
3397     flag roundNearestEven, increment;
3398 
3399     roundingMode = status->float_rounding_mode;
3400     roundNearestEven = (roundingMode == float_round_nearest_even);
3401     switch (roundingMode) {
3402     case float_round_nearest_even:
3403     case float_round_ties_away:
3404         increment = ((int64_t)absZ1 < 0);
3405         break;
3406     case float_round_to_zero:
3407         increment = 0;
3408         break;
3409     case float_round_up:
3410         increment = !zSign && absZ1;
3411         break;
3412     case float_round_down:
3413         increment = zSign && absZ1;
3414         break;
3415     default:
3416         abort();
3417     }
3418     if (increment) {
3419         ++absZ0;
3420         if (absZ0 == 0) {
3421             float_raise(float_flag_invalid, status);
3422             return LIT64(0xFFFFFFFFFFFFFFFF);
3423         }
3424         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3425     }
3426 
3427     if (zSign && absZ0) {
3428         float_raise(float_flag_invalid, status);
3429         return 0;
3430     }
3431 
3432     if (absZ1) {
3433         status->float_exception_flags |= float_flag_inexact;
3434     }
3435     return absZ0;
3436 }
3437 
3438 /*----------------------------------------------------------------------------
3439 | If `a' is denormal and we are in flush-to-zero mode then set the
3440 | input-denormal exception and return zero. Otherwise just return the value.
3441 *----------------------------------------------------------------------------*/
3442 float32 float32_squash_input_denormal(float32 a, float_status *status)
3443 {
3444     if (status->flush_inputs_to_zero) {
3445         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3446             float_raise(float_flag_input_denormal, status);
3447             return make_float32(float32_val(a) & 0x80000000);
3448         }
3449     }
3450     return a;
3451 }
3452 
3453 /*----------------------------------------------------------------------------
3454 | Normalizes the subnormal single-precision floating-point value represented
3455 | by the denormalized significand `aSig'.  The normalized exponent and
3456 | significand are stored at the locations pointed to by `zExpPtr' and
3457 | `zSigPtr', respectively.
3458 *----------------------------------------------------------------------------*/
3459 
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that leaves exactly 8 leading zero bits in the result. */
    int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3470 
3471 /*----------------------------------------------------------------------------
3472 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3473 | and significand `zSig', and returns the proper single-precision floating-
3474 | point value corresponding to the abstract input.  Ordinarily, the abstract
3475 | value is simply rounded and packed into the single-precision format, with
3476 | the inexact exception raised if the abstract input cannot be represented
3477 | exactly.  However, if the abstract value is too large, the overflow and
3478 | inexact exceptions are raised and an infinity or maximal finite value is
3479 | returned.  If the abstract value is too small, the input value is rounded to
3480 | a subnormal number, and the underflow and inexact exceptions are raised if
3481 | the abstract input cannot be represented exactly as a subnormal single-
3482 | precision floating-point number.
3483 |     The input significand `zSig' has its binary point between bits 30
3484 | and 29, which is 7 bits to the left of the usual location.  This shifted
3485 | significand must be normalized or smaller.  If `zSig' is not normalized,
3486 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3487 | and it must not require rounding.  In the usual case that `zSig' is
3488 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3489 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3490 | Binary Floating-Point Arithmetic.
3491 *----------------------------------------------------------------------------*/
3492 
3493 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3494                                    float_status *status)
3495 {
3496     int8_t roundingMode;
3497     flag roundNearestEven;
3498     int8_t roundIncrement, roundBits;
3499     flag isTiny;
3500 
3501     roundingMode = status->float_rounding_mode;
3502     roundNearestEven = ( roundingMode == float_round_nearest_even );
3503     switch (roundingMode) {
3504     case float_round_nearest_even:
3505     case float_round_ties_away:
3506         roundIncrement = 0x40;
3507         break;
3508     case float_round_to_zero:
3509         roundIncrement = 0;
3510         break;
3511     case float_round_up:
3512         roundIncrement = zSign ? 0 : 0x7f;
3513         break;
3514     case float_round_down:
3515         roundIncrement = zSign ? 0x7f : 0;
3516         break;
3517     default:
3518         abort();
3519         break;
3520     }
3521     roundBits = zSig & 0x7F;
3522     if ( 0xFD <= (uint16_t) zExp ) {
3523         if (    ( 0xFD < zExp )
3524              || (    ( zExp == 0xFD )
3525                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3526            ) {
3527             float_raise(float_flag_overflow | float_flag_inexact, status);
3528             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
3529         }
3530         if ( zExp < 0 ) {
3531             if (status->flush_to_zero) {
3532                 float_raise(float_flag_output_denormal, status);
3533                 return packFloat32(zSign, 0, 0);
3534             }
3535             isTiny =
3536                 (status->float_detect_tininess
3537                  == float_tininess_before_rounding)
3538                 || ( zExp < -1 )
3539                 || ( zSig + roundIncrement < 0x80000000 );
3540             shift32RightJamming( zSig, - zExp, &zSig );
3541             zExp = 0;
3542             roundBits = zSig & 0x7F;
3543             if (isTiny && roundBits) {
3544                 float_raise(float_flag_underflow, status);
3545             }
3546         }
3547     }
3548     if (roundBits) {
3549         status->float_exception_flags |= float_flag_inexact;
3550     }
3551     zSig = ( zSig + roundIncrement )>>7;
3552     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3553     if ( zSig == 0 ) zExp = 0;
3554     return packFloat32( zSign, zExp, zSig );
3555 
3556 }
3557 
3558 /*----------------------------------------------------------------------------
3559 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3560 | and significand `zSig', and returns the proper single-precision floating-
3561 | point value corresponding to the abstract input.  This routine is just like
3562 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3563 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3564 | floating-point exponent.
3565 *----------------------------------------------------------------------------*/
3566 
3567 static float32
3568  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3569                               float_status *status)
3570 {
3571     int8_t shiftCount;
3572 
3573     shiftCount = clz32(zSig) - 1;
3574     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3575                                status);
3576 
3577 }
3578 
3579 /*----------------------------------------------------------------------------
3580 | If `a' is denormal and we are in flush-to-zero mode then set the
3581 | input-denormal exception and return zero. Otherwise just return the value.
3582 *----------------------------------------------------------------------------*/
3583 float64 float64_squash_input_denormal(float64 a, float_status *status)
3584 {
3585     if (status->flush_inputs_to_zero) {
3586         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3587             float_raise(float_flag_input_denormal, status);
3588             return make_float64(float64_val(a) & (1ULL << 63));
3589         }
3590     }
3591     return a;
3592 }
3593 
3594 /*----------------------------------------------------------------------------
3595 | Normalizes the subnormal double-precision floating-point value represented
3596 | by the denormalized significand `aSig'.  The normalized exponent and
3597 | significand are stored at the locations pointed to by `zExpPtr' and
3598 | `zSigPtr', respectively.
3599 *----------------------------------------------------------------------------*/
3600 
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Leading-zero count of aSig, less 11. */
    const int8_t lshift = clz64(aSig) - 11;
    const uint64_t shifted = aSig << lshift;

    *zSigPtr = shifted;
    *zExpPtr = 1 - lshift;
}
3611 
3612 /*----------------------------------------------------------------------------
3613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3614 | double-precision floating-point value, returning the result.  After being
3615 | shifted into the proper positions, the three fields are simply added
3616 | together to form the result.  This means that any integer portion of `zSig'
3617 | will be added into the exponent.  Since a properly normalized significand
3618 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3619 | than the desired result exponent whenever `zSig' is a complete, normalized
3620 | significand.
3621 *----------------------------------------------------------------------------*/
3622 
3623 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3624 {
3625 
3626     return make_float64(
3627         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3628 
3629 }
3630 
3631 /*----------------------------------------------------------------------------
3632 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3633 | and significand `zSig', and returns the proper double-precision floating-
3634 | point value corresponding to the abstract input.  Ordinarily, the abstract
3635 | value is simply rounded and packed into the double-precision format, with
3636 | the inexact exception raised if the abstract input cannot be represented
3637 | exactly.  However, if the abstract value is too large, the overflow and
3638 | inexact exceptions are raised and an infinity or maximal finite value is
3639 | returned.  If the abstract value is too small, the input value is rounded to
3640 | a subnormal number, and the underflow and inexact exceptions are raised if
3641 | the abstract input cannot be represented exactly as a subnormal double-
3642 | precision floating-point number.
3643 |     The input significand `zSig' has its binary point between bits 62
3644 | and 61, which is 10 bits to the left of the usual location.  This shifted
3645 | significand must be normalized or smaller.  If `zSig' is not normalized,
3646 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3647 | and it must not require rounding.  In the usual case that `zSig' is
3648 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3649 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3650 | Binary Floating-Point Arithmetic.
3651 *----------------------------------------------------------------------------*/
3652 
3653 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3654                                    float_status *status)
3655 {
3656     int8_t roundingMode;
3657     flag roundNearestEven;
3658     int roundIncrement, roundBits;
3659     flag isTiny;
3660 
3661     roundingMode = status->float_rounding_mode;
3662     roundNearestEven = ( roundingMode == float_round_nearest_even );
3663     switch (roundingMode) {
3664     case float_round_nearest_even:
3665     case float_round_ties_away:
3666         roundIncrement = 0x200;
3667         break;
3668     case float_round_to_zero:
3669         roundIncrement = 0;
3670         break;
3671     case float_round_up:
3672         roundIncrement = zSign ? 0 : 0x3ff;
3673         break;
3674     case float_round_down:
3675         roundIncrement = zSign ? 0x3ff : 0;
3676         break;
3677     case float_round_to_odd:
3678         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3679         break;
3680     default:
3681         abort();
3682     }
3683     roundBits = zSig & 0x3FF;
3684     if ( 0x7FD <= (uint16_t) zExp ) {
3685         if (    ( 0x7FD < zExp )
3686              || (    ( zExp == 0x7FD )
3687                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3688            ) {
3689             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3690                                    roundIncrement != 0;
3691             float_raise(float_flag_overflow | float_flag_inexact, status);
3692             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3693         }
3694         if ( zExp < 0 ) {
3695             if (status->flush_to_zero) {
3696                 float_raise(float_flag_output_denormal, status);
3697                 return packFloat64(zSign, 0, 0);
3698             }
3699             isTiny =
3700                    (status->float_detect_tininess
3701                     == float_tininess_before_rounding)
3702                 || ( zExp < -1 )
3703                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
3704             shift64RightJamming( zSig, - zExp, &zSig );
3705             zExp = 0;
3706             roundBits = zSig & 0x3FF;
3707             if (isTiny && roundBits) {
3708                 float_raise(float_flag_underflow, status);
3709             }
3710             if (roundingMode == float_round_to_odd) {
3711                 /*
3712                  * For round-to-odd case, the roundIncrement depends on
3713                  * zSig which just changed.
3714                  */
3715                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3716             }
3717         }
3718     }
3719     if (roundBits) {
3720         status->float_exception_flags |= float_flag_inexact;
3721     }
3722     zSig = ( zSig + roundIncrement )>>10;
3723     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3724     if ( zSig == 0 ) zExp = 0;
3725     return packFloat64( zSign, zExp, zSig );
3726 
3727 }
3728 
3729 /*----------------------------------------------------------------------------
3730 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3731 | and significand `zSig', and returns the proper double-precision floating-
3732 | point value corresponding to the abstract input.  This routine is just like
3733 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3734 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3735 | floating-point exponent.
3736 *----------------------------------------------------------------------------*/
3737 
3738 static float64
3739  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3740                               float_status *status)
3741 {
3742     int8_t shiftCount;
3743 
3744     shiftCount = clz64(zSig) - 1;
3745     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3746                                status);
3747 
3748 }
3749 
3750 /*----------------------------------------------------------------------------
3751 | Normalizes the subnormal extended double-precision floating-point value
3752 | represented by the denormalized significand `aSig'.  The normalized exponent
3753 | and significand are stored at the locations pointed to by `zExpPtr' and
3754 | `zSigPtr', respectively.
3755 *----------------------------------------------------------------------------*/
3756 
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* Shift left by the full leading-zero count of aSig. */
    const int8_t lshift = clz64(aSig);
    const uint64_t shifted = aSig << lshift;

    *zSigPtr = shifted;
    *zExpPtr = 1 - lshift;
}
3766 
3767 /*----------------------------------------------------------------------------
3768 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3769 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3770 | and returns the proper extended double-precision floating-point value
3771 | corresponding to the abstract input.  Ordinarily, the abstract value is
3772 | rounded and packed into the extended double-precision format, with the
3773 | inexact exception raised if the abstract input cannot be represented
3774 | exactly.  However, if the abstract value is too large, the overflow and
3775 | inexact exceptions are raised and an infinity or maximal finite value is
3776 | returned.  If the abstract value is too small, the input value is rounded to
3777 | a subnormal number, and the underflow and inexact exceptions are raised if
3778 | the abstract input cannot be represented exactly as a subnormal extended
3779 | double-precision floating-point number.
3780 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3781 | number of bits as single or double precision, respectively.  Otherwise, the
3782 | result is rounded to the full precision of the extended double-precision
3783 | format.
3784 |     The input significand must be normalized or smaller.  If the input
3785 | significand is not normalized, `zExp' must be 0; in that case, the result
3786 | returned is a subnormal number, and it must not require rounding.  The
3787 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3788 | Floating-Point Arithmetic.
3789 *----------------------------------------------------------------------------*/
3790 
3791 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3792                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3793                               float_status *status)
3794 {
3795     int8_t roundingMode;
3796     flag roundNearestEven, increment, isTiny;
3797     int64_t roundIncrement, roundMask, roundBits;
3798 
3799     roundingMode = status->float_rounding_mode;
3800     roundNearestEven = ( roundingMode == float_round_nearest_even );
3801     if ( roundingPrecision == 80 ) goto precision80;
3802     if ( roundingPrecision == 64 ) {
3803         roundIncrement = LIT64( 0x0000000000000400 );
3804         roundMask = LIT64( 0x00000000000007FF );
3805     }
3806     else if ( roundingPrecision == 32 ) {
3807         roundIncrement = LIT64( 0x0000008000000000 );
3808         roundMask = LIT64( 0x000000FFFFFFFFFF );
3809     }
3810     else {
3811         goto precision80;
3812     }
3813     zSig0 |= ( zSig1 != 0 );
3814     switch (roundingMode) {
3815     case float_round_nearest_even:
3816     case float_round_ties_away:
3817         break;
3818     case float_round_to_zero:
3819         roundIncrement = 0;
3820         break;
3821     case float_round_up:
3822         roundIncrement = zSign ? 0 : roundMask;
3823         break;
3824     case float_round_down:
3825         roundIncrement = zSign ? roundMask : 0;
3826         break;
3827     default:
3828         abort();
3829     }
3830     roundBits = zSig0 & roundMask;
3831     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3832         if (    ( 0x7FFE < zExp )
3833              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3834            ) {
3835             goto overflow;
3836         }
3837         if ( zExp <= 0 ) {
3838             if (status->flush_to_zero) {
3839                 float_raise(float_flag_output_denormal, status);
3840                 return packFloatx80(zSign, 0, 0);
3841             }
3842             isTiny =
3843                    (status->float_detect_tininess
3844                     == float_tininess_before_rounding)
3845                 || ( zExp < 0 )
3846                 || ( zSig0 <= zSig0 + roundIncrement );
3847             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3848             zExp = 0;
3849             roundBits = zSig0 & roundMask;
3850             if (isTiny && roundBits) {
3851                 float_raise(float_flag_underflow, status);
3852             }
3853             if (roundBits) {
3854                 status->float_exception_flags |= float_flag_inexact;
3855             }
3856             zSig0 += roundIncrement;
3857             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3858             roundIncrement = roundMask + 1;
3859             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3860                 roundMask |= roundIncrement;
3861             }
3862             zSig0 &= ~ roundMask;
3863             return packFloatx80( zSign, zExp, zSig0 );
3864         }
3865     }
3866     if (roundBits) {
3867         status->float_exception_flags |= float_flag_inexact;
3868     }
3869     zSig0 += roundIncrement;
3870     if ( zSig0 < roundIncrement ) {
3871         ++zExp;
3872         zSig0 = LIT64( 0x8000000000000000 );
3873     }
3874     roundIncrement = roundMask + 1;
3875     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3876         roundMask |= roundIncrement;
3877     }
3878     zSig0 &= ~ roundMask;
3879     if ( zSig0 == 0 ) zExp = 0;
3880     return packFloatx80( zSign, zExp, zSig0 );
3881  precision80:
3882     switch (roundingMode) {
3883     case float_round_nearest_even:
3884     case float_round_ties_away:
3885         increment = ((int64_t)zSig1 < 0);
3886         break;
3887     case float_round_to_zero:
3888         increment = 0;
3889         break;
3890     case float_round_up:
3891         increment = !zSign && zSig1;
3892         break;
3893     case float_round_down:
3894         increment = zSign && zSig1;
3895         break;
3896     default:
3897         abort();
3898     }
3899     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3900         if (    ( 0x7FFE < zExp )
3901              || (    ( zExp == 0x7FFE )
3902                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3903                   && increment
3904                 )
3905            ) {
3906             roundMask = 0;
3907  overflow:
3908             float_raise(float_flag_overflow | float_flag_inexact, status);
3909             if (    ( roundingMode == float_round_to_zero )
3910                  || ( zSign && ( roundingMode == float_round_up ) )
3911                  || ( ! zSign && ( roundingMode == float_round_down ) )
3912                ) {
3913                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3914             }
3915             return packFloatx80(zSign,
3916                                 floatx80_infinity_high,
3917                                 floatx80_infinity_low);
3918         }
3919         if ( zExp <= 0 ) {
3920             isTiny =
3921                    (status->float_detect_tininess
3922                     == float_tininess_before_rounding)
3923                 || ( zExp < 0 )
3924                 || ! increment
3925                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3926             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3927             zExp = 0;
3928             if (isTiny && zSig1) {
3929                 float_raise(float_flag_underflow, status);
3930             }
3931             if (zSig1) {
3932                 status->float_exception_flags |= float_flag_inexact;
3933             }
3934             switch (roundingMode) {
3935             case float_round_nearest_even:
3936             case float_round_ties_away:
3937                 increment = ((int64_t)zSig1 < 0);
3938                 break;
3939             case float_round_to_zero:
3940                 increment = 0;
3941                 break;
3942             case float_round_up:
3943                 increment = !zSign && zSig1;
3944                 break;
3945             case float_round_down:
3946                 increment = zSign && zSig1;
3947                 break;
3948             default:
3949                 abort();
3950             }
3951             if ( increment ) {
3952                 ++zSig0;
3953                 zSig0 &=
3954                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3955                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3956             }
3957             return packFloatx80( zSign, zExp, zSig0 );
3958         }
3959     }
3960     if (zSig1) {
3961         status->float_exception_flags |= float_flag_inexact;
3962     }
3963     if ( increment ) {
3964         ++zSig0;
3965         if ( zSig0 == 0 ) {
3966             ++zExp;
3967             zSig0 = LIT64( 0x8000000000000000 );
3968         }
3969         else {
3970             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3971         }
3972     }
3973     else {
3974         if ( zSig0 == 0 ) zExp = 0;
3975     }
3976     return packFloatx80( zSign, zExp, zSig0 );
3977 
3978 }
3979 
3980 /*----------------------------------------------------------------------------
3981 | Takes an abstract floating-point value having sign `zSign', exponent
3982 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3983 | and returns the proper extended double-precision floating-point value
3984 | corresponding to the abstract input.  This routine is just like
3985 | `roundAndPackFloatx80' except that the input significand does not have to be
3986 | normalized.
3987 *----------------------------------------------------------------------------*/
3988 
3989 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3990                                        flag zSign, int32_t zExp,
3991                                        uint64_t zSig0, uint64_t zSig1,
3992                                        float_status *status)
3993 {
3994     int8_t shiftCount;
3995 
3996     if ( zSig0 == 0 ) {
3997         zSig0 = zSig1;
3998         zSig1 = 0;
3999         zExp -= 64;
4000     }
4001     shiftCount = clz64(zSig0);
4002     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4003     zExp -= shiftCount;
4004     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4005                                 zSig0, zSig1, status);
4006 
4007 }
4008 
4009 /*----------------------------------------------------------------------------
4010 | Returns the least-significant 64 fraction bits of the quadruple-precision
4011 | floating-point value `a'.
4012 *----------------------------------------------------------------------------*/
4013 
4014 static inline uint64_t extractFloat128Frac1( float128 a )
4015 {
4016 
4017     return a.low;
4018 
4019 }
4020 
4021 /*----------------------------------------------------------------------------
4022 | Returns the most-significant 48 fraction bits of the quadruple-precision
4023 | floating-point value `a'.
4024 *----------------------------------------------------------------------------*/
4025 
4026 static inline uint64_t extractFloat128Frac0( float128 a )
4027 {
4028 
4029     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
4030 
4031 }
4032 
4033 /*----------------------------------------------------------------------------
4034 | Returns the exponent bits of the quadruple-precision floating-point value
4035 | `a'.
4036 *----------------------------------------------------------------------------*/
4037 
4038 static inline int32_t extractFloat128Exp( float128 a )
4039 {
4040 
4041     return ( a.high>>48 ) & 0x7FFF;
4042 
4043 }
4044 
4045 /*----------------------------------------------------------------------------
4046 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4047 *----------------------------------------------------------------------------*/
4048 
4049 static inline flag extractFloat128Sign( float128 a )
4050 {
4051 
4052     return a.high>>63;
4053 
4054 }
4055 
4056 /*----------------------------------------------------------------------------
4057 | Normalizes the subnormal quadruple-precision floating-point value
4058 | represented by the denormalized significand formed by the concatenation of
4059 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4060 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4061 | significand are stored at the location pointed to by `zSig0Ptr', and the
4062 | least significant 64 bits of the normalized significand are stored at the
4063 | location pointed to by `zSig1Ptr'.
4064 *----------------------------------------------------------------------------*/
4065 
static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t lshift;

    if (aSig0 != 0) {
        /* High word non-empty: one short left shift of the pair suffices. */
        lshift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, lshift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - lshift;
        return;
    }

    /* All significant bits are in the low word. */
    lshift = clz64(aSig1) - 15;
    if (lshift >= 0) {
        /* The whole low word fits in the high output word. */
        *zSig0Ptr = aSig1 << lshift;
        *zSig1Ptr = 0;
    } else {
        /* The low word is split across both output words. */
        *zSig0Ptr = aSig1 >> (-lshift);
        *zSig1Ptr = aSig1 << (lshift & 63);
    }
    *zExpPtr = -lshift - 63;
}
4096 
4097 /*----------------------------------------------------------------------------
4098 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4099 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4100 | floating-point value, returning the result.  After being shifted into the
4101 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4102 | added together to form the most significant 64 bits of the result.  This
4103 | means that any integer portion of `zSig0' will be added into the exponent.
4104 | Since a properly normalized significand will have an integer portion equal
4105 | to 1, the `zExp' input should be 1 less than the desired result exponent
4106 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4107 | significand.
4108 *----------------------------------------------------------------------------*/
4109 
4110 static inline float128
4111  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4112 {
4113     float128 z;
4114 
4115     z.low = zSig1;
4116     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4117     return z;
4118 
4119 }
4120 
4121 /*----------------------------------------------------------------------------
4122 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4123 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4124 | and `zSig2', and returns the proper quadruple-precision floating-point value
4125 | corresponding to the abstract input.  Ordinarily, the abstract value is
4126 | simply rounded and packed into the quadruple-precision format, with the
4127 | inexact exception raised if the abstract input cannot be represented
4128 | exactly.  However, if the abstract value is too large, the overflow and
4129 | inexact exceptions are raised and an infinity or maximal finite value is
4130 | returned.  If the abstract value is too small, the input value is rounded to
4131 | a subnormal number, and the underflow and inexact exceptions are raised if
4132 | the abstract input cannot be represented exactly as a subnormal quadruple-
4133 | precision floating-point number.
4134 |     The input significand must be normalized or smaller.  If the input
4135 | significand is not normalized, `zExp' must be 0; in that case, the result
4136 | returned is a subnormal number, and it must not require rounding.  In the
4137 | usual case that the input significand is normalized, `zExp' must be 1 less
4138 | than the ``true'' floating-point exponent.  The handling of underflow and
4139 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4140 *----------------------------------------------------------------------------*/
4141 
4142 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4143                                      uint64_t zSig0, uint64_t zSig1,
4144                                      uint64_t zSig2, float_status *status)
4145 {
4146     int8_t roundingMode;
4147     flag roundNearestEven, increment, isTiny;
4148 
4149     roundingMode = status->float_rounding_mode;
4150     roundNearestEven = ( roundingMode == float_round_nearest_even );
4151     switch (roundingMode) {
4152     case float_round_nearest_even:
4153     case float_round_ties_away:
4154         increment = ((int64_t)zSig2 < 0);
4155         break;
4156     case float_round_to_zero:
4157         increment = 0;
4158         break;
4159     case float_round_up:
4160         increment = !zSign && zSig2;
4161         break;
4162     case float_round_down:
4163         increment = zSign && zSig2;
4164         break;
4165     case float_round_to_odd:
4166         increment = !(zSig1 & 0x1) && zSig2;
4167         break;
4168     default:
4169         abort();
4170     }
4171     if ( 0x7FFD <= (uint32_t) zExp ) {
4172         if (    ( 0x7FFD < zExp )
4173              || (    ( zExp == 0x7FFD )
4174                   && eq128(
4175                          LIT64( 0x0001FFFFFFFFFFFF ),
4176                          LIT64( 0xFFFFFFFFFFFFFFFF ),
4177                          zSig0,
4178                          zSig1
4179                      )
4180                   && increment
4181                 )
4182            ) {
4183             float_raise(float_flag_overflow | float_flag_inexact, status);
4184             if (    ( roundingMode == float_round_to_zero )
4185                  || ( zSign && ( roundingMode == float_round_up ) )
4186                  || ( ! zSign && ( roundingMode == float_round_down ) )
4187                  || (roundingMode == float_round_to_odd)
4188                ) {
4189                 return
4190                     packFloat128(
4191                         zSign,
4192                         0x7FFE,
4193                         LIT64( 0x0000FFFFFFFFFFFF ),
4194                         LIT64( 0xFFFFFFFFFFFFFFFF )
4195                     );
4196             }
4197             return packFloat128( zSign, 0x7FFF, 0, 0 );
4198         }
4199         if ( zExp < 0 ) {
4200             if (status->flush_to_zero) {
4201                 float_raise(float_flag_output_denormal, status);
4202                 return packFloat128(zSign, 0, 0, 0);
4203             }
4204             isTiny =
4205                    (status->float_detect_tininess
4206                     == float_tininess_before_rounding)
4207                 || ( zExp < -1 )
4208                 || ! increment
4209                 || lt128(
4210                        zSig0,
4211                        zSig1,
4212                        LIT64( 0x0001FFFFFFFFFFFF ),
4213                        LIT64( 0xFFFFFFFFFFFFFFFF )
4214                    );
4215             shift128ExtraRightJamming(
4216                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4217             zExp = 0;
4218             if (isTiny && zSig2) {
4219                 float_raise(float_flag_underflow, status);
4220             }
4221             switch (roundingMode) {
4222             case float_round_nearest_even:
4223             case float_round_ties_away:
4224                 increment = ((int64_t)zSig2 < 0);
4225                 break;
4226             case float_round_to_zero:
4227                 increment = 0;
4228                 break;
4229             case float_round_up:
4230                 increment = !zSign && zSig2;
4231                 break;
4232             case float_round_down:
4233                 increment = zSign && zSig2;
4234                 break;
4235             case float_round_to_odd:
4236                 increment = !(zSig1 & 0x1) && zSig2;
4237                 break;
4238             default:
4239                 abort();
4240             }
4241         }
4242     }
4243     if (zSig2) {
4244         status->float_exception_flags |= float_flag_inexact;
4245     }
4246     if ( increment ) {
4247         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4248         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4249     }
4250     else {
4251         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4252     }
4253     return packFloat128( zSign, zExp, zSig0, zSig1 );
4254 
4255 }
4256 
4257 /*----------------------------------------------------------------------------
4258 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4259 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4260 | returns the proper quadruple-precision floating-point value corresponding
4261 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4262 | except that the input significand has fewer bits and does not have to be
4263 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4264 | point exponent.
4265 *----------------------------------------------------------------------------*/
4266 
4267 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4268                                               uint64_t zSig0, uint64_t zSig1,
4269                                               float_status *status)
4270 {
4271     int8_t shiftCount;
4272     uint64_t zSig2;
4273 
4274     if ( zSig0 == 0 ) {
4275         zSig0 = zSig1;
4276         zSig1 = 0;
4277         zExp -= 64;
4278     }
4279     shiftCount = clz64(zSig0) - 15;
4280     if ( 0 <= shiftCount ) {
4281         zSig2 = 0;
4282         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4283     }
4284     else {
4285         shift128ExtraRightJamming(
4286             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4287     }
4288     zExp -= shiftCount;
4289     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4290 
4291 }
4292 
4293 
4294 /*----------------------------------------------------------------------------
4295 | Returns the result of converting the 32-bit two's complement integer `a'
4296 | to the extended double-precision floating-point format.  The conversion
4297 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4298 | Arithmetic.
4299 *----------------------------------------------------------------------------*/
4300 
4301 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4302 {
4303     flag zSign;
4304     uint32_t absA;
4305     int8_t shiftCount;
4306     uint64_t zSig;
4307 
4308     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4309     zSign = ( a < 0 );
4310     absA = zSign ? - a : a;
4311     shiftCount = clz32(absA) + 32;
4312     zSig = absA;
4313     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4314 
4315 }
4316 
4317 /*----------------------------------------------------------------------------
4318 | Returns the result of converting the 32-bit two's complement integer `a' to
4319 | the quadruple-precision floating-point format.  The conversion is performed
4320 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4321 *----------------------------------------------------------------------------*/
4322 
4323 float128 int32_to_float128(int32_t a, float_status *status)
4324 {
4325     flag zSign;
4326     uint32_t absA;
4327     int8_t shiftCount;
4328     uint64_t zSig0;
4329 
4330     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4331     zSign = ( a < 0 );
4332     absA = zSign ? - a : a;
4333     shiftCount = clz32(absA) + 17;
4334     zSig0 = absA;
4335     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4336 
4337 }
4338 
4339 /*----------------------------------------------------------------------------
4340 | Returns the result of converting the 64-bit two's complement integer `a'
4341 | to the extended double-precision floating-point format.  The conversion
4342 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4343 | Arithmetic.
4344 *----------------------------------------------------------------------------*/
4345 
4346 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4347 {
4348     flag zSign;
4349     uint64_t absA;
4350     int8_t shiftCount;
4351 
4352     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4353     zSign = ( a < 0 );
4354     absA = zSign ? - a : a;
4355     shiftCount = clz64(absA);
4356     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4357 
4358 }
4359 
4360 /*----------------------------------------------------------------------------
4361 | Returns the result of converting the 64-bit two's complement integer `a' to
4362 | the quadruple-precision floating-point format.  The conversion is performed
4363 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4364 *----------------------------------------------------------------------------*/
4365 
4366 float128 int64_to_float128(int64_t a, float_status *status)
4367 {
4368     flag zSign;
4369     uint64_t absA;
4370     int8_t shiftCount;
4371     int32_t zExp;
4372     uint64_t zSig0, zSig1;
4373 
4374     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4375     zSign = ( a < 0 );
4376     absA = zSign ? - a : a;
4377     shiftCount = clz64(absA) + 49;
4378     zExp = 0x406E - shiftCount;
4379     if ( 64 <= shiftCount ) {
4380         zSig1 = 0;
4381         zSig0 = absA;
4382         shiftCount -= 64;
4383     }
4384     else {
4385         zSig1 = absA;
4386         zSig0 = 0;
4387     }
4388     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4389     return packFloat128( zSign, zExp, zSig0, zSig1 );
4390 
4391 }
4392 
4393 /*----------------------------------------------------------------------------
4394 | Returns the result of converting the 64-bit unsigned integer `a'
4395 | to the quadruple-precision floating-point format.  The conversion is performed
4396 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4397 *----------------------------------------------------------------------------*/
4398 
4399 float128 uint64_to_float128(uint64_t a, float_status *status)
4400 {
4401     if (a == 0) {
4402         return float128_zero;
4403     }
4404     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4405 }
4406 
4407 /*----------------------------------------------------------------------------
4408 | Returns the result of converting the single-precision floating-point value
4409 | `a' to the extended double-precision floating-point format.  The conversion
4410 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4411 | Arithmetic.
4412 *----------------------------------------------------------------------------*/
4413 
4414 floatx80 float32_to_floatx80(float32 a, float_status *status)
4415 {
4416     flag aSign;
4417     int aExp;
4418     uint32_t aSig;
4419 
4420     a = float32_squash_input_denormal(a, status);
4421     aSig = extractFloat32Frac( a );
4422     aExp = extractFloat32Exp( a );
4423     aSign = extractFloat32Sign( a );
4424     if ( aExp == 0xFF ) {
4425         if (aSig) {
4426             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4427         }
4428         return packFloatx80(aSign,
4429                             floatx80_infinity_high,
4430                             floatx80_infinity_low);
4431     }
4432     if ( aExp == 0 ) {
4433         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4434         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4435     }
4436     aSig |= 0x00800000;
4437     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4438 
4439 }
4440 
4441 /*----------------------------------------------------------------------------
4442 | Returns the result of converting the single-precision floating-point value
4443 | `a' to the double-precision floating-point format.  The conversion is
4444 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4445 | Arithmetic.
4446 *----------------------------------------------------------------------------*/
4447 
4448 float128 float32_to_float128(float32 a, float_status *status)
4449 {
4450     flag aSign;
4451     int aExp;
4452     uint32_t aSig;
4453 
4454     a = float32_squash_input_denormal(a, status);
4455     aSig = extractFloat32Frac( a );
4456     aExp = extractFloat32Exp( a );
4457     aSign = extractFloat32Sign( a );
4458     if ( aExp == 0xFF ) {
4459         if (aSig) {
4460             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4461         }
4462         return packFloat128( aSign, 0x7FFF, 0, 0 );
4463     }
4464     if ( aExp == 0 ) {
4465         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4466         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4467         --aExp;
4468     }
4469     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4470 
4471 }
4472 
4473 /*----------------------------------------------------------------------------
4474 | Returns the remainder of the single-precision floating-point value `a'
4475 | with respect to the corresponding value `b'.  The operation is performed
4476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4477 *----------------------------------------------------------------------------*/
4478 
4479 float32 float32_rem(float32 a, float32 b, float_status *status)
4480 {
4481     flag aSign, zSign;
4482     int aExp, bExp, expDiff;
4483     uint32_t aSig, bSig;
4484     uint32_t q;
4485     uint64_t aSig64, bSig64, q64;
4486     uint32_t alternateASig;
4487     int32_t sigMean;
4488     a = float32_squash_input_denormal(a, status);
4489     b = float32_squash_input_denormal(b, status);
4490 
4491     aSig = extractFloat32Frac( a );
4492     aExp = extractFloat32Exp( a );
4493     aSign = extractFloat32Sign( a );
4494     bSig = extractFloat32Frac( b );
4495     bExp = extractFloat32Exp( b );
4496     if ( aExp == 0xFF ) {
4497         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4498             return propagateFloat32NaN(a, b, status);
4499         }
4500         float_raise(float_flag_invalid, status);
4501         return float32_default_nan(status);
4502     }
4503     if ( bExp == 0xFF ) {
4504         if (bSig) {
4505             return propagateFloat32NaN(a, b, status);
4506         }
4507         return a;
4508     }
4509     if ( bExp == 0 ) {
4510         if ( bSig == 0 ) {
4511             float_raise(float_flag_invalid, status);
4512             return float32_default_nan(status);
4513         }
4514         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4515     }
4516     if ( aExp == 0 ) {
4517         if ( aSig == 0 ) return a;
4518         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4519     }
4520     expDiff = aExp - bExp;
4521     aSig |= 0x00800000;
4522     bSig |= 0x00800000;
4523     if ( expDiff < 32 ) {
4524         aSig <<= 8;
4525         bSig <<= 8;
4526         if ( expDiff < 0 ) {
4527             if ( expDiff < -1 ) return a;
4528             aSig >>= 1;
4529         }
4530         q = ( bSig <= aSig );
4531         if ( q ) aSig -= bSig;
4532         if ( 0 < expDiff ) {
4533             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4534             q >>= 32 - expDiff;
4535             bSig >>= 2;
4536             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4537         }
4538         else {
4539             aSig >>= 2;
4540             bSig >>= 2;
4541         }
4542     }
4543     else {
4544         if ( bSig <= aSig ) aSig -= bSig;
4545         aSig64 = ( (uint64_t) aSig )<<40;
4546         bSig64 = ( (uint64_t) bSig )<<40;
4547         expDiff -= 64;
4548         while ( 0 < expDiff ) {
4549             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4550             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4551             aSig64 = - ( ( bSig * q64 )<<38 );
4552             expDiff -= 62;
4553         }
4554         expDiff += 64;
4555         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4556         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4557         q = q64>>( 64 - expDiff );
4558         bSig <<= 6;
4559         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4560     }
4561     do {
4562         alternateASig = aSig;
4563         ++q;
4564         aSig -= bSig;
4565     } while ( 0 <= (int32_t) aSig );
4566     sigMean = aSig + alternateASig;
4567     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4568         aSig = alternateASig;
4569     }
4570     zSign = ( (int32_t) aSig < 0 );
4571     if ( zSign ) aSig = - aSig;
4572     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4573 }
4574 
4575 
4576 
4577 /*----------------------------------------------------------------------------
4578 | Returns the binary exponential of the single-precision floating-point value
4579 | `a'. The operation is performed according to the IEC/IEEE Standard for
4580 | Binary Floating-Point Arithmetic.
4581 |
4582 | Uses the following identities:
4583 |
4584 | 1. -------------------------------------------------------------------------
4585 |      x    x*ln(2)
4586 |     2  = e
4587 |
4588 | 2. -------------------------------------------------------------------------
4589 |                      2     3     4     5           n
4590 |      x        x     x     x     x     x           x
4591 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4592 |               1!    2!    3!    4!    5!          n!
4593 *----------------------------------------------------------------------------*/
4594 
4595 static const float64 float32_exp2_coefficients[15] =
4596 {
4597     const_float64( 0x3ff0000000000000ll ), /*  1 */
4598     const_float64( 0x3fe0000000000000ll ), /*  2 */
4599     const_float64( 0x3fc5555555555555ll ), /*  3 */
4600     const_float64( 0x3fa5555555555555ll ), /*  4 */
4601     const_float64( 0x3f81111111111111ll ), /*  5 */
4602     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4603     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4604     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4605     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4606     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4607     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4608     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4609     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4610     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4611     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4612 };
4613 
4614 float32 float32_exp2(float32 a, float_status *status)
4615 {
4616     flag aSign;
4617     int aExp;
4618     uint32_t aSig;
4619     float64 r, x, xn;
4620     int i;
4621     a = float32_squash_input_denormal(a, status);
4622 
4623     aSig = extractFloat32Frac( a );
4624     aExp = extractFloat32Exp( a );
4625     aSign = extractFloat32Sign( a );
4626 
4627     if ( aExp == 0xFF) {
4628         if (aSig) {
4629             return propagateFloat32NaN(a, float32_zero, status);
4630         }
4631         return (aSign) ? float32_zero : a;
4632     }
4633     if (aExp == 0) {
4634         if (aSig == 0) return float32_one;
4635     }
4636 
4637     float_raise(float_flag_inexact, status);
4638 
4639     /* ******************************* */
4640     /* using float64 for approximation */
4641     /* ******************************* */
4642     x = float32_to_float64(a, status);
4643     x = float64_mul(x, float64_ln2, status);
4644 
4645     xn = x;
4646     r = float64_one;
4647     for (i = 0 ; i < 15 ; i++) {
4648         float64 f;
4649 
4650         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4651         r = float64_add(r, f, status);
4652 
4653         xn = float64_mul(xn, x, status);
4654     }
4655 
4656     return float64_to_float32(r, status);
4657 }
4658 
4659 /*----------------------------------------------------------------------------
4660 | Returns the binary log of the single-precision floating-point value `a'.
4661 | The operation is performed according to the IEC/IEEE Standard for Binary
4662 | Floating-Point Arithmetic.
4663 *----------------------------------------------------------------------------*/
4664 float32 float32_log2(float32 a, float_status *status)
4665 {
4666     flag aSign, zSign;
4667     int aExp;
4668     uint32_t aSig, zSig, i;
4669 
4670     a = float32_squash_input_denormal(a, status);
4671     aSig = extractFloat32Frac( a );
4672     aExp = extractFloat32Exp( a );
4673     aSign = extractFloat32Sign( a );
4674 
4675     if ( aExp == 0 ) {
4676         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4677         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4678     }
4679     if ( aSign ) {
4680         float_raise(float_flag_invalid, status);
4681         return float32_default_nan(status);
4682     }
4683     if ( aExp == 0xFF ) {
4684         if (aSig) {
4685             return propagateFloat32NaN(a, float32_zero, status);
4686         }
4687         return a;
4688     }
4689 
4690     aExp -= 0x7F;
4691     aSig |= 0x00800000;
4692     zSign = aExp < 0;
4693     zSig = aExp << 23;
4694 
4695     for (i = 1 << 22; i > 0; i >>= 1) {
4696         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4697         if ( aSig & 0x01000000 ) {
4698             aSig >>= 1;
4699             zSig |= i;
4700         }
4701     }
4702 
4703     if ( zSign )
4704         zSig = -zSig;
4705 
4706     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4707 }
4708 
4709 /*----------------------------------------------------------------------------
4710 | Returns 1 if the single-precision floating-point value `a' is equal to
4711 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4712 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4713 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4714 *----------------------------------------------------------------------------*/
4715 
4716 int float32_eq(float32 a, float32 b, float_status *status)
4717 {
4718     uint32_t av, bv;
4719     a = float32_squash_input_denormal(a, status);
4720     b = float32_squash_input_denormal(b, status);
4721 
4722     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4723          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4724        ) {
4725         float_raise(float_flag_invalid, status);
4726         return 0;
4727     }
4728     av = float32_val(a);
4729     bv = float32_val(b);
4730     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4731 }
4732 
4733 /*----------------------------------------------------------------------------
4734 | Returns 1 if the single-precision floating-point value `a' is less than
4735 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4736 | exception is raised if either operand is a NaN.  The comparison is performed
4737 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4738 *----------------------------------------------------------------------------*/
4739 
4740 int float32_le(float32 a, float32 b, float_status *status)
4741 {
4742     flag aSign, bSign;
4743     uint32_t av, bv;
4744     a = float32_squash_input_denormal(a, status);
4745     b = float32_squash_input_denormal(b, status);
4746 
4747     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4748          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4749        ) {
4750         float_raise(float_flag_invalid, status);
4751         return 0;
4752     }
4753     aSign = extractFloat32Sign( a );
4754     bSign = extractFloat32Sign( b );
4755     av = float32_val(a);
4756     bv = float32_val(b);
4757     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4758     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4759 
4760 }
4761 
4762 /*----------------------------------------------------------------------------
4763 | Returns 1 if the single-precision floating-point value `a' is less than
4764 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4765 | raised if either operand is a NaN.  The comparison is performed according
4766 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4767 *----------------------------------------------------------------------------*/
4768 
4769 int float32_lt(float32 a, float32 b, float_status *status)
4770 {
4771     flag aSign, bSign;
4772     uint32_t av, bv;
4773     a = float32_squash_input_denormal(a, status);
4774     b = float32_squash_input_denormal(b, status);
4775 
4776     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4777          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4778        ) {
4779         float_raise(float_flag_invalid, status);
4780         return 0;
4781     }
4782     aSign = extractFloat32Sign( a );
4783     bSign = extractFloat32Sign( b );
4784     av = float32_val(a);
4785     bv = float32_val(b);
4786     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4787     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4788 
4789 }
4790 
4791 /*----------------------------------------------------------------------------
4792 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4793 | be compared, and 0 otherwise.  The invalid exception is raised if either
4794 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4795 | Standard for Binary Floating-Point Arithmetic.
4796 *----------------------------------------------------------------------------*/
4797 
4798 int float32_unordered(float32 a, float32 b, float_status *status)
4799 {
4800     a = float32_squash_input_denormal(a, status);
4801     b = float32_squash_input_denormal(b, status);
4802 
4803     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4804          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4805        ) {
4806         float_raise(float_flag_invalid, status);
4807         return 1;
4808     }
4809     return 0;
4810 }
4811 
4812 /*----------------------------------------------------------------------------
4813 | Returns 1 if the single-precision floating-point value `a' is equal to
4814 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4815 | exception.  The comparison is performed according to the IEC/IEEE Standard
4816 | for Binary Floating-Point Arithmetic.
4817 *----------------------------------------------------------------------------*/
4818 
4819 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4820 {
4821     a = float32_squash_input_denormal(a, status);
4822     b = float32_squash_input_denormal(b, status);
4823 
4824     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4825          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4826        ) {
4827         if (float32_is_signaling_nan(a, status)
4828          || float32_is_signaling_nan(b, status)) {
4829             float_raise(float_flag_invalid, status);
4830         }
4831         return 0;
4832     }
4833     return ( float32_val(a) == float32_val(b) ) ||
4834             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4835 }
4836 
4837 /*----------------------------------------------------------------------------
4838 | Returns 1 if the single-precision floating-point value `a' is less than or
4839 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4840 | cause an exception.  Otherwise, the comparison is performed according to the
4841 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4842 *----------------------------------------------------------------------------*/
4843 
4844 int float32_le_quiet(float32 a, float32 b, float_status *status)
4845 {
4846     flag aSign, bSign;
4847     uint32_t av, bv;
4848     a = float32_squash_input_denormal(a, status);
4849     b = float32_squash_input_denormal(b, status);
4850 
4851     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4852          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4853        ) {
4854         if (float32_is_signaling_nan(a, status)
4855          || float32_is_signaling_nan(b, status)) {
4856             float_raise(float_flag_invalid, status);
4857         }
4858         return 0;
4859     }
4860     aSign = extractFloat32Sign( a );
4861     bSign = extractFloat32Sign( b );
4862     av = float32_val(a);
4863     bv = float32_val(b);
4864     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4865     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4866 
4867 }
4868 
4869 /*----------------------------------------------------------------------------
4870 | Returns 1 if the single-precision floating-point value `a' is less than
4871 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4872 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4873 | Standard for Binary Floating-Point Arithmetic.
4874 *----------------------------------------------------------------------------*/
4875 
4876 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4877 {
4878     flag aSign, bSign;
4879     uint32_t av, bv;
4880     a = float32_squash_input_denormal(a, status);
4881     b = float32_squash_input_denormal(b, status);
4882 
4883     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4884          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4885        ) {
4886         if (float32_is_signaling_nan(a, status)
4887          || float32_is_signaling_nan(b, status)) {
4888             float_raise(float_flag_invalid, status);
4889         }
4890         return 0;
4891     }
4892     aSign = extractFloat32Sign( a );
4893     bSign = extractFloat32Sign( b );
4894     av = float32_val(a);
4895     bv = float32_val(b);
4896     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4897     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4898 
4899 }
4900 
4901 /*----------------------------------------------------------------------------
4902 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4903 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4904 | comparison is performed according to the IEC/IEEE Standard for Binary
4905 | Floating-Point Arithmetic.
4906 *----------------------------------------------------------------------------*/
4907 
4908 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4909 {
4910     a = float32_squash_input_denormal(a, status);
4911     b = float32_squash_input_denormal(b, status);
4912 
4913     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4914          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4915        ) {
4916         if (float32_is_signaling_nan(a, status)
4917          || float32_is_signaling_nan(b, status)) {
4918             float_raise(float_flag_invalid, status);
4919         }
4920         return 1;
4921     }
4922     return 0;
4923 }
4924 
4925 /*----------------------------------------------------------------------------
4926 | If `a' is denormal and we are in flush-to-zero mode then set the
4927 | input-denormal exception and return zero. Otherwise just return the value.
4928 *----------------------------------------------------------------------------*/
4929 float16 float16_squash_input_denormal(float16 a, float_status *status)
4930 {
4931     if (status->flush_inputs_to_zero) {
4932         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4933             float_raise(float_flag_input_denormal, status);
4934             return make_float16(float16_val(a) & 0x8000);
4935         }
4936     }
4937     return a;
4938 }
4939 
4940 /*----------------------------------------------------------------------------
4941 | Returns the result of converting the double-precision floating-point value
4942 | `a' to the extended double-precision floating-point format.  The conversion
4943 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4944 | Arithmetic.
4945 *----------------------------------------------------------------------------*/
4946 
4947 floatx80 float64_to_floatx80(float64 a, float_status *status)
4948 {
4949     flag aSign;
4950     int aExp;
4951     uint64_t aSig;
4952 
4953     a = float64_squash_input_denormal(a, status);
4954     aSig = extractFloat64Frac( a );
4955     aExp = extractFloat64Exp( a );
4956     aSign = extractFloat64Sign( a );
4957     if ( aExp == 0x7FF ) {
4958         if (aSig) {
4959             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4960         }
4961         return packFloatx80(aSign,
4962                             floatx80_infinity_high,
4963                             floatx80_infinity_low);
4964     }
4965     if ( aExp == 0 ) {
4966         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4967         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4968     }
4969     return
4970         packFloatx80(
4971             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4972 
4973 }
4974 
4975 /*----------------------------------------------------------------------------
4976 | Returns the result of converting the double-precision floating-point value
4977 | `a' to the quadruple-precision floating-point format.  The conversion is
4978 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4979 | Arithmetic.
4980 *----------------------------------------------------------------------------*/
4981 
4982 float128 float64_to_float128(float64 a, float_status *status)
4983 {
4984     flag aSign;
4985     int aExp;
4986     uint64_t aSig, zSig0, zSig1;
4987 
4988     a = float64_squash_input_denormal(a, status);
4989     aSig = extractFloat64Frac( a );
4990     aExp = extractFloat64Exp( a );
4991     aSign = extractFloat64Sign( a );
4992     if ( aExp == 0x7FF ) {
4993         if (aSig) {
4994             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4995         }
4996         return packFloat128( aSign, 0x7FFF, 0, 0 );
4997     }
4998     if ( aExp == 0 ) {
4999         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5000         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5001         --aExp;
5002     }
5003     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5004     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5005 
5006 }
5007 
5008 
5009 /*----------------------------------------------------------------------------
5010 | Returns the remainder of the double-precision floating-point value `a'
5011 | with respect to the corresponding value `b'.  The operation is performed
5012 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5013 *----------------------------------------------------------------------------*/
5014 
5015 float64 float64_rem(float64 a, float64 b, float_status *status)
5016 {
5017     flag aSign, zSign;
5018     int aExp, bExp, expDiff;
5019     uint64_t aSig, bSig;
5020     uint64_t q, alternateASig;
5021     int64_t sigMean;
5022 
5023     a = float64_squash_input_denormal(a, status);
5024     b = float64_squash_input_denormal(b, status);
5025     aSig = extractFloat64Frac( a );
5026     aExp = extractFloat64Exp( a );
5027     aSign = extractFloat64Sign( a );
5028     bSig = extractFloat64Frac( b );
5029     bExp = extractFloat64Exp( b );
5030     if ( aExp == 0x7FF ) {
5031         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5032             return propagateFloat64NaN(a, b, status);
5033         }
5034         float_raise(float_flag_invalid, status);
5035         return float64_default_nan(status);
5036     }
5037     if ( bExp == 0x7FF ) {
5038         if (bSig) {
5039             return propagateFloat64NaN(a, b, status);
5040         }
5041         return a;
5042     }
5043     if ( bExp == 0 ) {
5044         if ( bSig == 0 ) {
5045             float_raise(float_flag_invalid, status);
5046             return float64_default_nan(status);
5047         }
5048         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5049     }
5050     if ( aExp == 0 ) {
5051         if ( aSig == 0 ) return a;
5052         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5053     }
5054     expDiff = aExp - bExp;
5055     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
5056     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
5057     if ( expDiff < 0 ) {
5058         if ( expDiff < -1 ) return a;
5059         aSig >>= 1;
5060     }
5061     q = ( bSig <= aSig );
5062     if ( q ) aSig -= bSig;
5063     expDiff -= 64;
5064     while ( 0 < expDiff ) {
5065         q = estimateDiv128To64( aSig, 0, bSig );
5066         q = ( 2 < q ) ? q - 2 : 0;
5067         aSig = - ( ( bSig>>2 ) * q );
5068         expDiff -= 62;
5069     }
5070     expDiff += 64;
5071     if ( 0 < expDiff ) {
5072         q = estimateDiv128To64( aSig, 0, bSig );
5073         q = ( 2 < q ) ? q - 2 : 0;
5074         q >>= 64 - expDiff;
5075         bSig >>= 2;
5076         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5077     }
5078     else {
5079         aSig >>= 2;
5080         bSig >>= 2;
5081     }
5082     do {
5083         alternateASig = aSig;
5084         ++q;
5085         aSig -= bSig;
5086     } while ( 0 <= (int64_t) aSig );
5087     sigMean = aSig + alternateASig;
5088     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5089         aSig = alternateASig;
5090     }
5091     zSign = ( (int64_t) aSig < 0 );
5092     if ( zSign ) aSig = - aSig;
5093     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5094 
5095 }
5096 
5097 /*----------------------------------------------------------------------------
5098 | Returns the binary log of the double-precision floating-point value `a'.
5099 | The operation is performed according to the IEC/IEEE Standard for Binary
5100 | Floating-Point Arithmetic.
5101 *----------------------------------------------------------------------------*/
5102 float64 float64_log2(float64 a, float_status *status)
5103 {
5104     flag aSign, zSign;
5105     int aExp;
5106     uint64_t aSig, aSig0, aSig1, zSig, i;
5107     a = float64_squash_input_denormal(a, status);
5108 
5109     aSig = extractFloat64Frac( a );
5110     aExp = extractFloat64Exp( a );
5111     aSign = extractFloat64Sign( a );
5112 
5113     if ( aExp == 0 ) {
5114         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5115         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5116     }
5117     if ( aSign ) {
5118         float_raise(float_flag_invalid, status);
5119         return float64_default_nan(status);
5120     }
5121     if ( aExp == 0x7FF ) {
5122         if (aSig) {
5123             return propagateFloat64NaN(a, float64_zero, status);
5124         }
5125         return a;
5126     }
5127 
5128     aExp -= 0x3FF;
5129     aSig |= LIT64( 0x0010000000000000 );
5130     zSign = aExp < 0;
5131     zSig = (uint64_t)aExp << 52;
5132     for (i = 1LL << 51; i > 0; i >>= 1) {
5133         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5134         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5135         if ( aSig & LIT64( 0x0020000000000000 ) ) {
5136             aSig >>= 1;
5137             zSig |= i;
5138         }
5139     }
5140 
5141     if ( zSign )
5142         zSig = -zSig;
5143     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5144 }
5145 
5146 /*----------------------------------------------------------------------------
5147 | Returns 1 if the double-precision floating-point value `a' is equal to the
5148 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
5149 | if either operand is a NaN.  Otherwise, the comparison is performed
5150 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5151 *----------------------------------------------------------------------------*/
5152 
5153 int float64_eq(float64 a, float64 b, float_status *status)
5154 {
5155     uint64_t av, bv;
5156     a = float64_squash_input_denormal(a, status);
5157     b = float64_squash_input_denormal(b, status);
5158 
5159     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5160          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5161        ) {
5162         float_raise(float_flag_invalid, status);
5163         return 0;
5164     }
5165     av = float64_val(a);
5166     bv = float64_val(b);
5167     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5168 
5169 }
5170 
5171 /*----------------------------------------------------------------------------
5172 | Returns 1 if the double-precision floating-point value `a' is less than or
5173 | equal to the corresponding value `b', and 0 otherwise.  The invalid
5174 | exception is raised if either operand is a NaN.  The comparison is performed
5175 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5176 *----------------------------------------------------------------------------*/
5177 
5178 int float64_le(float64 a, float64 b, float_status *status)
5179 {
5180     flag aSign, bSign;
5181     uint64_t av, bv;
5182     a = float64_squash_input_denormal(a, status);
5183     b = float64_squash_input_denormal(b, status);
5184 
5185     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5186          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5187        ) {
5188         float_raise(float_flag_invalid, status);
5189         return 0;
5190     }
5191     aSign = extractFloat64Sign( a );
5192     bSign = extractFloat64Sign( b );
5193     av = float64_val(a);
5194     bv = float64_val(b);
5195     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5196     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5197 
5198 }
5199 
5200 /*----------------------------------------------------------------------------
5201 | Returns 1 if the double-precision floating-point value `a' is less than
5202 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5203 | raised if either operand is a NaN.  The comparison is performed according
5204 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5205 *----------------------------------------------------------------------------*/
5206 
5207 int float64_lt(float64 a, float64 b, float_status *status)
5208 {
5209     flag aSign, bSign;
5210     uint64_t av, bv;
5211 
5212     a = float64_squash_input_denormal(a, status);
5213     b = float64_squash_input_denormal(b, status);
5214     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5215          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5216        ) {
5217         float_raise(float_flag_invalid, status);
5218         return 0;
5219     }
5220     aSign = extractFloat64Sign( a );
5221     bSign = extractFloat64Sign( b );
5222     av = float64_val(a);
5223     bv = float64_val(b);
5224     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5225     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5226 
5227 }
5228 
5229 /*----------------------------------------------------------------------------
5230 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5231 | be compared, and 0 otherwise.  The invalid exception is raised if either
5232 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
5233 | Standard for Binary Floating-Point Arithmetic.
5234 *----------------------------------------------------------------------------*/
5235 
5236 int float64_unordered(float64 a, float64 b, float_status *status)
5237 {
5238     a = float64_squash_input_denormal(a, status);
5239     b = float64_squash_input_denormal(b, status);
5240 
5241     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5242          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5243        ) {
5244         float_raise(float_flag_invalid, status);
5245         return 1;
5246     }
5247     return 0;
5248 }
5249 
5250 /*----------------------------------------------------------------------------
5251 | Returns 1 if the double-precision floating-point value `a' is equal to the
5252 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5253 | exception.The comparison is performed according to the IEC/IEEE Standard
5254 | for Binary Floating-Point Arithmetic.
5255 *----------------------------------------------------------------------------*/
5256 
5257 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5258 {
5259     uint64_t av, bv;
5260     a = float64_squash_input_denormal(a, status);
5261     b = float64_squash_input_denormal(b, status);
5262 
5263     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5264          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5265        ) {
5266         if (float64_is_signaling_nan(a, status)
5267          || float64_is_signaling_nan(b, status)) {
5268             float_raise(float_flag_invalid, status);
5269         }
5270         return 0;
5271     }
5272     av = float64_val(a);
5273     bv = float64_val(b);
5274     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5275 
5276 }
5277 
5278 /*----------------------------------------------------------------------------
5279 | Returns 1 if the double-precision floating-point value `a' is less than or
5280 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5281 | cause an exception.  Otherwise, the comparison is performed according to the
5282 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5283 *----------------------------------------------------------------------------*/
5284 
5285 int float64_le_quiet(float64 a, float64 b, float_status *status)
5286 {
5287     flag aSign, bSign;
5288     uint64_t av, bv;
5289     a = float64_squash_input_denormal(a, status);
5290     b = float64_squash_input_denormal(b, status);
5291 
5292     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5293          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5294        ) {
5295         if (float64_is_signaling_nan(a, status)
5296          || float64_is_signaling_nan(b, status)) {
5297             float_raise(float_flag_invalid, status);
5298         }
5299         return 0;
5300     }
5301     aSign = extractFloat64Sign( a );
5302     bSign = extractFloat64Sign( b );
5303     av = float64_val(a);
5304     bv = float64_val(b);
5305     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5306     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5307 
5308 }
5309 
5310 /*----------------------------------------------------------------------------
5311 | Returns 1 if the double-precision floating-point value `a' is less than
5312 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5313 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5314 | Standard for Binary Floating-Point Arithmetic.
5315 *----------------------------------------------------------------------------*/
5316 
5317 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5318 {
5319     flag aSign, bSign;
5320     uint64_t av, bv;
5321     a = float64_squash_input_denormal(a, status);
5322     b = float64_squash_input_denormal(b, status);
5323 
5324     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5325          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5326        ) {
5327         if (float64_is_signaling_nan(a, status)
5328          || float64_is_signaling_nan(b, status)) {
5329             float_raise(float_flag_invalid, status);
5330         }
5331         return 0;
5332     }
5333     aSign = extractFloat64Sign( a );
5334     bSign = extractFloat64Sign( b );
5335     av = float64_val(a);
5336     bv = float64_val(b);
5337     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5338     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5339 
5340 }
5341 
5342 /*----------------------------------------------------------------------------
5343 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5344 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5345 | comparison is performed according to the IEC/IEEE Standard for Binary
5346 | Floating-Point Arithmetic.
5347 *----------------------------------------------------------------------------*/
5348 
5349 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5350 {
5351     a = float64_squash_input_denormal(a, status);
5352     b = float64_squash_input_denormal(b, status);
5353 
5354     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5355          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5356        ) {
5357         if (float64_is_signaling_nan(a, status)
5358          || float64_is_signaling_nan(b, status)) {
5359             float_raise(float_flag_invalid, status);
5360         }
5361         return 1;
5362     }
5363     return 0;
5364 }
5365 
5366 /*----------------------------------------------------------------------------
5367 | Returns the result of converting the extended double-precision floating-
5368 | point value `a' to the 32-bit two's complement integer format.  The
5369 | conversion is performed according to the IEC/IEEE Standard for Binary
5370 | Floating-Point Arithmetic---which means in particular that the conversion
5371 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5372 | largest positive integer is returned.  Otherwise, if the conversion
5373 | overflows, the largest integer with the same sign as `a' is returned.
5374 *----------------------------------------------------------------------------*/
5375 
5376 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5377 {
5378     flag aSign;
5379     int32_t aExp, shiftCount;
5380     uint64_t aSig;
5381 
5382     if (floatx80_invalid_encoding(a)) {
5383         float_raise(float_flag_invalid, status);
5384         return 1 << 31;
5385     }
5386     aSig = extractFloatx80Frac( a );
5387     aExp = extractFloatx80Exp( a );
5388     aSign = extractFloatx80Sign( a );
5389     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5390     shiftCount = 0x4037 - aExp;
5391     if ( shiftCount <= 0 ) shiftCount = 1;
5392     shift64RightJamming( aSig, shiftCount, &aSig );
5393     return roundAndPackInt32(aSign, aSig, status);
5394 
5395 }
5396 
5397 /*----------------------------------------------------------------------------
5398 | Returns the result of converting the extended double-precision floating-
5399 | point value `a' to the 32-bit two's complement integer format.  The
5400 | conversion is performed according to the IEC/IEEE Standard for Binary
5401 | Floating-Point Arithmetic, except that the conversion is always rounded
5402 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5403 | Otherwise, if the conversion overflows, the largest integer with the same
5404 | sign as `a' is returned.
5405 *----------------------------------------------------------------------------*/
5406 
5407 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5408 {
5409     flag aSign;
5410     int32_t aExp, shiftCount;
5411     uint64_t aSig, savedASig;
5412     int32_t z;
5413 
5414     if (floatx80_invalid_encoding(a)) {
5415         float_raise(float_flag_invalid, status);
5416         return 1 << 31;
5417     }
5418     aSig = extractFloatx80Frac( a );
5419     aExp = extractFloatx80Exp( a );
5420     aSign = extractFloatx80Sign( a );
5421     if ( 0x401E < aExp ) {
5422         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5423         goto invalid;
5424     }
5425     else if ( aExp < 0x3FFF ) {
5426         if (aExp || aSig) {
5427             status->float_exception_flags |= float_flag_inexact;
5428         }
5429         return 0;
5430     }
5431     shiftCount = 0x403E - aExp;
5432     savedASig = aSig;
5433     aSig >>= shiftCount;
5434     z = aSig;
5435     if ( aSign ) z = - z;
5436     if ( ( z < 0 ) ^ aSign ) {
5437  invalid:
5438         float_raise(float_flag_invalid, status);
5439         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5440     }
5441     if ( ( aSig<<shiftCount ) != savedASig ) {
5442         status->float_exception_flags |= float_flag_inexact;
5443     }
5444     return z;
5445 
5446 }
5447 
5448 /*----------------------------------------------------------------------------
5449 | Returns the result of converting the extended double-precision floating-
5450 | point value `a' to the 64-bit two's complement integer format.  The
5451 | conversion is performed according to the IEC/IEEE Standard for Binary
5452 | Floating-Point Arithmetic---which means in particular that the conversion
5453 | is rounded according to the current rounding mode.  If `a' is a NaN,
5454 | the largest positive integer is returned.  Otherwise, if the conversion
5455 | overflows, the largest integer with the same sign as `a' is returned.
5456 *----------------------------------------------------------------------------*/
5457 
5458 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5459 {
5460     flag aSign;
5461     int32_t aExp, shiftCount;
5462     uint64_t aSig, aSigExtra;
5463 
5464     if (floatx80_invalid_encoding(a)) {
5465         float_raise(float_flag_invalid, status);
5466         return 1ULL << 63;
5467     }
5468     aSig = extractFloatx80Frac( a );
5469     aExp = extractFloatx80Exp( a );
5470     aSign = extractFloatx80Sign( a );
5471     shiftCount = 0x403E - aExp;
5472     if ( shiftCount <= 0 ) {
5473         if ( shiftCount ) {
5474             float_raise(float_flag_invalid, status);
5475             if (!aSign || floatx80_is_any_nan(a)) {
5476                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5477             }
5478             return (int64_t) LIT64( 0x8000000000000000 );
5479         }
5480         aSigExtra = 0;
5481     }
5482     else {
5483         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5484     }
5485     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5486 
5487 }
5488 
5489 /*----------------------------------------------------------------------------
5490 | Returns the result of converting the extended double-precision floating-
5491 | point value `a' to the 64-bit two's complement integer format.  The
5492 | conversion is performed according to the IEC/IEEE Standard for Binary
5493 | Floating-Point Arithmetic, except that the conversion is always rounded
5494 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5495 | Otherwise, if the conversion overflows, the largest integer with the same
5496 | sign as `a' is returned.
5497 *----------------------------------------------------------------------------*/
5498 
5499 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5500 {
5501     flag aSign;
5502     int32_t aExp, shiftCount;
5503     uint64_t aSig;
5504     int64_t z;
5505 
5506     if (floatx80_invalid_encoding(a)) {
5507         float_raise(float_flag_invalid, status);
5508         return 1ULL << 63;
5509     }
5510     aSig = extractFloatx80Frac( a );
5511     aExp = extractFloatx80Exp( a );
5512     aSign = extractFloatx80Sign( a );
5513     shiftCount = aExp - 0x403E;
5514     if ( 0 <= shiftCount ) {
5515         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5516         if ( ( a.high != 0xC03E ) || aSig ) {
5517             float_raise(float_flag_invalid, status);
5518             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5519                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5520             }
5521         }
5522         return (int64_t) LIT64( 0x8000000000000000 );
5523     }
5524     else if ( aExp < 0x3FFF ) {
5525         if (aExp | aSig) {
5526             status->float_exception_flags |= float_flag_inexact;
5527         }
5528         return 0;
5529     }
5530     z = aSig>>( - shiftCount );
5531     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5532         status->float_exception_flags |= float_flag_inexact;
5533     }
5534     if ( aSign ) z = - z;
5535     return z;
5536 
5537 }
5538 
5539 /*----------------------------------------------------------------------------
5540 | Returns the result of converting the extended double-precision floating-
5541 | point value `a' to the single-precision floating-point format.  The
5542 | conversion is performed according to the IEC/IEEE Standard for Binary
5543 | Floating-Point Arithmetic.
5544 *----------------------------------------------------------------------------*/
5545 
5546 float32 floatx80_to_float32(floatx80 a, float_status *status)
5547 {
5548     flag aSign;
5549     int32_t aExp;
5550     uint64_t aSig;
5551 
5552     if (floatx80_invalid_encoding(a)) {
5553         float_raise(float_flag_invalid, status);
5554         return float32_default_nan(status);
5555     }
5556     aSig = extractFloatx80Frac( a );
5557     aExp = extractFloatx80Exp( a );
5558     aSign = extractFloatx80Sign( a );
5559     if ( aExp == 0x7FFF ) {
5560         if ( (uint64_t) ( aSig<<1 ) ) {
5561             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5562         }
5563         return packFloat32( aSign, 0xFF, 0 );
5564     }
5565     shift64RightJamming( aSig, 33, &aSig );
5566     if ( aExp || aSig ) aExp -= 0x3F81;
5567     return roundAndPackFloat32(aSign, aExp, aSig, status);
5568 
5569 }
5570 
5571 /*----------------------------------------------------------------------------
5572 | Returns the result of converting the extended double-precision floating-
5573 | point value `a' to the double-precision floating-point format.  The
5574 | conversion is performed according to the IEC/IEEE Standard for Binary
5575 | Floating-Point Arithmetic.
5576 *----------------------------------------------------------------------------*/
5577 
5578 float64 floatx80_to_float64(floatx80 a, float_status *status)
5579 {
5580     flag aSign;
5581     int32_t aExp;
5582     uint64_t aSig, zSig;
5583 
5584     if (floatx80_invalid_encoding(a)) {
5585         float_raise(float_flag_invalid, status);
5586         return float64_default_nan(status);
5587     }
5588     aSig = extractFloatx80Frac( a );
5589     aExp = extractFloatx80Exp( a );
5590     aSign = extractFloatx80Sign( a );
5591     if ( aExp == 0x7FFF ) {
5592         if ( (uint64_t) ( aSig<<1 ) ) {
5593             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5594         }
5595         return packFloat64( aSign, 0x7FF, 0 );
5596     }
5597     shift64RightJamming( aSig, 1, &zSig );
5598     if ( aExp || aSig ) aExp -= 0x3C01;
5599     return roundAndPackFloat64(aSign, aExp, zSig, status);
5600 
5601 }
5602 
5603 /*----------------------------------------------------------------------------
5604 | Returns the result of converting the extended double-precision floating-
5605 | point value `a' to the quadruple-precision floating-point format.  The
5606 | conversion is performed according to the IEC/IEEE Standard for Binary
5607 | Floating-Point Arithmetic.
5608 *----------------------------------------------------------------------------*/
5609 
5610 float128 floatx80_to_float128(floatx80 a, float_status *status)
5611 {
5612     flag aSign;
5613     int aExp;
5614     uint64_t aSig, zSig0, zSig1;
5615 
5616     if (floatx80_invalid_encoding(a)) {
5617         float_raise(float_flag_invalid, status);
5618         return float128_default_nan(status);
5619     }
5620     aSig = extractFloatx80Frac( a );
5621     aExp = extractFloatx80Exp( a );
5622     aSign = extractFloatx80Sign( a );
5623     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5624         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5625     }
5626     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5627     return packFloat128( aSign, aExp, zSig0, zSig1 );
5628 
5629 }
5630 
5631 /*----------------------------------------------------------------------------
5632 | Rounds the extended double-precision floating-point value `a'
5633 | to the precision provided by floatx80_rounding_precision and returns the
5634 | result as an extended double-precision floating-point value.
5635 | The operation is performed according to the IEC/IEEE Standard for Binary
5636 | Floating-Point Arithmetic.
5637 *----------------------------------------------------------------------------*/
5638 
5639 floatx80 floatx80_round(floatx80 a, float_status *status)
5640 {
5641     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5642                                 extractFloatx80Sign(a),
5643                                 extractFloatx80Exp(a),
5644                                 extractFloatx80Frac(a), 0, status);
5645 }
5646 
5647 /*----------------------------------------------------------------------------
5648 | Rounds the extended double-precision floating-point value `a' to an integer,
5649 | and returns the result as an extended quadruple-precision floating-point
5650 | value.  The operation is performed according to the IEC/IEEE Standard for
5651 | Binary Floating-Point Arithmetic.
5652 *----------------------------------------------------------------------------*/
5653 
5654 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5655 {
5656     flag aSign;
5657     int32_t aExp;
5658     uint64_t lastBitMask, roundBitsMask;
5659     floatx80 z;
5660 
5661     if (floatx80_invalid_encoding(a)) {
5662         float_raise(float_flag_invalid, status);
5663         return floatx80_default_nan(status);
5664     }
5665     aExp = extractFloatx80Exp( a );
5666     if ( 0x403E <= aExp ) {
5667         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5668             return propagateFloatx80NaN(a, a, status);
5669         }
5670         return a;
5671     }
5672     if ( aExp < 0x3FFF ) {
5673         if (    ( aExp == 0 )
5674              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5675             return a;
5676         }
5677         status->float_exception_flags |= float_flag_inexact;
5678         aSign = extractFloatx80Sign( a );
5679         switch (status->float_rounding_mode) {
5680          case float_round_nearest_even:
5681             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5682                ) {
5683                 return
5684                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5685             }
5686             break;
5687         case float_round_ties_away:
5688             if (aExp == 0x3FFE) {
5689                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5690             }
5691             break;
5692          case float_round_down:
5693             return
5694                   aSign ?
5695                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5696                 : packFloatx80( 0, 0, 0 );
5697          case float_round_up:
5698             return
5699                   aSign ? packFloatx80( 1, 0, 0 )
5700                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5701         }
5702         return packFloatx80( aSign, 0, 0 );
5703     }
5704     lastBitMask = 1;
5705     lastBitMask <<= 0x403E - aExp;
5706     roundBitsMask = lastBitMask - 1;
5707     z = a;
5708     switch (status->float_rounding_mode) {
5709     case float_round_nearest_even:
5710         z.low += lastBitMask>>1;
5711         if ((z.low & roundBitsMask) == 0) {
5712             z.low &= ~lastBitMask;
5713         }
5714         break;
5715     case float_round_ties_away:
5716         z.low += lastBitMask >> 1;
5717         break;
5718     case float_round_to_zero:
5719         break;
5720     case float_round_up:
5721         if (!extractFloatx80Sign(z)) {
5722             z.low += roundBitsMask;
5723         }
5724         break;
5725     case float_round_down:
5726         if (extractFloatx80Sign(z)) {
5727             z.low += roundBitsMask;
5728         }
5729         break;
5730     default:
5731         abort();
5732     }
5733     z.low &= ~ roundBitsMask;
5734     if ( z.low == 0 ) {
5735         ++z.high;
5736         z.low = LIT64( 0x8000000000000000 );
5737     }
5738     if (z.low != a.low) {
5739         status->float_exception_flags |= float_flag_inexact;
5740     }
5741     return z;
5742 
5743 }
5744 
5745 /*----------------------------------------------------------------------------
5746 | Returns the result of adding the absolute values of the extended double-
5747 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5748 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5749 | The addition is performed according to the IEC/IEEE Standard for Binary
5750 | Floating-Point Arithmetic.
5751 *----------------------------------------------------------------------------*/
5752 
5753 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5754                                 float_status *status)
5755 {
5756     int32_t aExp, bExp, zExp;
5757     uint64_t aSig, bSig, zSig0, zSig1;
5758     int32_t expDiff;
5759 
5760     aSig = extractFloatx80Frac( a );
5761     aExp = extractFloatx80Exp( a );
5762     bSig = extractFloatx80Frac( b );
5763     bExp = extractFloatx80Exp( b );
5764     expDiff = aExp - bExp;
5765     if ( 0 < expDiff ) {
5766         if ( aExp == 0x7FFF ) {
5767             if ((uint64_t)(aSig << 1)) {
5768                 return propagateFloatx80NaN(a, b, status);
5769             }
5770             return a;
5771         }
5772         if ( bExp == 0 ) --expDiff;
5773         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5774         zExp = aExp;
5775     }
5776     else if ( expDiff < 0 ) {
5777         if ( bExp == 0x7FFF ) {
5778             if ((uint64_t)(bSig << 1)) {
5779                 return propagateFloatx80NaN(a, b, status);
5780             }
5781             return packFloatx80(zSign,
5782                                 floatx80_infinity_high,
5783                                 floatx80_infinity_low);
5784         }
5785         if ( aExp == 0 ) ++expDiff;
5786         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5787         zExp = bExp;
5788     }
5789     else {
5790         if ( aExp == 0x7FFF ) {
5791             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5792                 return propagateFloatx80NaN(a, b, status);
5793             }
5794             return a;
5795         }
5796         zSig1 = 0;
5797         zSig0 = aSig + bSig;
5798         if ( aExp == 0 ) {
5799             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5800             goto roundAndPack;
5801         }
5802         zExp = aExp;
5803         goto shiftRight1;
5804     }
5805     zSig0 = aSig + bSig;
5806     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5807  shiftRight1:
5808     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5809     zSig0 |= LIT64( 0x8000000000000000 );
5810     ++zExp;
5811  roundAndPack:
5812     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5813                                 zSign, zExp, zSig0, zSig1, status);
5814 }
5815 
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| Control flow: the function first dispatches on the sign of the exponent
| difference.  The `aExpBigger' / `bExpBigger' paths handle an infinite or
| NaN larger operand and align the smaller significand; they then fall
| through into `aBigger' / `bBigger', which are also entered directly when
| the exponents are equal and only the significands differ.  All finite
| paths end in normalizeRoundAndPackFloatx80() using the rounding precision
| stored in `status'.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* From here on the two exponent fields are equal. */
    if ( aExp == 0x7FFF ) {
        /* Any fraction bit set (integer bit excluded) means a NaN operand. */
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Both magnitudes are infinite: the difference is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Both exponent fields are zero; use 1 as the working exponent. */
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Equal magnitudes: a zero whose sign bit is set only when rounding down. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* |b| is infinite and larger: infinity with the opposite of zSign. */
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    /* expDiff is negative here; a zero exponent field in `a' shortens the
       alignment shift by one position. */
    if ( aExp == 0 ) ++expDiff;
    /* Align a's significand to b's exponent; shifted-out bits go to zSig1. */
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the requested result sign. */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* |a| is infinite and |b| is finite: `a' is returned unchanged. */
        return a;
    }
    /* expDiff is positive here; a zero exponent field in `b' shortens the
       alignment shift by one position. */
    if ( bExp == 0 ) --expDiff;
    /* Align b's significand to a's exponent; shifted-out bits go to zSig1. */
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    /* |a| > |b|: compute a - b; the result keeps zSign. */
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}
5884 
5885 /*----------------------------------------------------------------------------
5886 | Returns the result of adding the extended double-precision floating-point
5887 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5888 | Standard for Binary Floating-Point Arithmetic.
5889 *----------------------------------------------------------------------------*/
5890 
5891 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5892 {
5893     flag aSign, bSign;
5894 
5895     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5896         float_raise(float_flag_invalid, status);
5897         return floatx80_default_nan(status);
5898     }
5899     aSign = extractFloatx80Sign( a );
5900     bSign = extractFloatx80Sign( b );
5901     if ( aSign == bSign ) {
5902         return addFloatx80Sigs(a, b, aSign, status);
5903     }
5904     else {
5905         return subFloatx80Sigs(a, b, aSign, status);
5906     }
5907 
5908 }
5909 
5910 /*----------------------------------------------------------------------------
5911 | Returns the result of subtracting the extended double-precision floating-
5912 | point values `a' and `b'.  The operation is performed according to the
5913 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5914 *----------------------------------------------------------------------------*/
5915 
5916 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5917 {
5918     flag aSign, bSign;
5919 
5920     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5921         float_raise(float_flag_invalid, status);
5922         return floatx80_default_nan(status);
5923     }
5924     aSign = extractFloatx80Sign( a );
5925     bSign = extractFloatx80Sign( b );
5926     if ( aSign == bSign ) {
5927         return subFloatx80Sigs(a, b, aSign, status);
5928     }
5929     else {
5930         return addFloatx80Sigs(a, b, aSign, status);
5931     }
5932 
5933 }
5934 
5935 /*----------------------------------------------------------------------------
5936 | Returns the result of multiplying the extended double-precision floating-
5937 | point values `a' and `b'.  The operation is performed according to the
5938 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5939 *----------------------------------------------------------------------------*/
5940 
5941 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5942 {
5943     flag aSign, bSign, zSign;
5944     int32_t aExp, bExp, zExp;
5945     uint64_t aSig, bSig, zSig0, zSig1;
5946 
5947     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5948         float_raise(float_flag_invalid, status);
5949         return floatx80_default_nan(status);
5950     }
5951     aSig = extractFloatx80Frac( a );
5952     aExp = extractFloatx80Exp( a );
5953     aSign = extractFloatx80Sign( a );
5954     bSig = extractFloatx80Frac( b );
5955     bExp = extractFloatx80Exp( b );
5956     bSign = extractFloatx80Sign( b );
5957     zSign = aSign ^ bSign;
5958     if ( aExp == 0x7FFF ) {
5959         if (    (uint64_t) ( aSig<<1 )
5960              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5961             return propagateFloatx80NaN(a, b, status);
5962         }
5963         if ( ( bExp | bSig ) == 0 ) goto invalid;
5964         return packFloatx80(zSign, floatx80_infinity_high,
5965                                    floatx80_infinity_low);
5966     }
5967     if ( bExp == 0x7FFF ) {
5968         if ((uint64_t)(bSig << 1)) {
5969             return propagateFloatx80NaN(a, b, status);
5970         }
5971         if ( ( aExp | aSig ) == 0 ) {
5972  invalid:
5973             float_raise(float_flag_invalid, status);
5974             return floatx80_default_nan(status);
5975         }
5976         return packFloatx80(zSign, floatx80_infinity_high,
5977                                    floatx80_infinity_low);
5978     }
5979     if ( aExp == 0 ) {
5980         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5981         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5982     }
5983     if ( bExp == 0 ) {
5984         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5985         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5986     }
5987     zExp = aExp + bExp - 0x3FFE;
5988     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5989     if ( 0 < (int64_t) zSig0 ) {
5990         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5991         --zExp;
5992     }
5993     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5994                                 zSign, zExp, zSig0, zSig1, status);
5995 }
5996 
/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases visible in the code: invalid encodings, infinity/infinity and
| zero/zero raise the invalid exception and return the default NaN; a nonzero
| finite value divided by zero raises divide-by-zero and returns a signed
| infinity; NaN operands go through propagateFloatx80NaN().  The result sign
| is the exclusive-or of the operand signs.
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is a NaN (fraction bits set) or an infinity. */
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* infinity / infinity */
            goto invalid;
        }
        /* infinity / finite */
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / infinity yields a signed zero */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Divisor is zero: 0/0 is invalid, anything else divides by zero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                       floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* When the dividend significand is not smaller than the divisor's, halve
       it (keeping the shifted-out bit in rem1) and compensate in zExp. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High quotient word: take an estimate, form the remainder, then step the
       estimate down while the remainder is negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low quotient word, estimated from the remaining remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    /* Only when the estimate's low bits are small (<= 8 after dropping the
       top bit) is it corrected exactly and given a sticky bit. */
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Any nonzero remainder sets the lowest bit of zSig1. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6083 
6084 /*----------------------------------------------------------------------------
6085 | Returns the remainder of the extended double-precision floating-point value
6086 | `a' with respect to the corresponding value `b'.  The operation is performed
6087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6088 *----------------------------------------------------------------------------*/
6089 
6090 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6091 {
6092     flag aSign, zSign;
6093     int32_t aExp, bExp, expDiff;
6094     uint64_t aSig0, aSig1, bSig;
6095     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6096 
6097     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6098         float_raise(float_flag_invalid, status);
6099         return floatx80_default_nan(status);
6100     }
6101     aSig0 = extractFloatx80Frac( a );
6102     aExp = extractFloatx80Exp( a );
6103     aSign = extractFloatx80Sign( a );
6104     bSig = extractFloatx80Frac( b );
6105     bExp = extractFloatx80Exp( b );
6106     if ( aExp == 0x7FFF ) {
6107         if (    (uint64_t) ( aSig0<<1 )
6108              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6109             return propagateFloatx80NaN(a, b, status);
6110         }
6111         goto invalid;
6112     }
6113     if ( bExp == 0x7FFF ) {
6114         if ((uint64_t)(bSig << 1)) {
6115             return propagateFloatx80NaN(a, b, status);
6116         }
6117         return a;
6118     }
6119     if ( bExp == 0 ) {
6120         if ( bSig == 0 ) {
6121  invalid:
6122             float_raise(float_flag_invalid, status);
6123             return floatx80_default_nan(status);
6124         }
6125         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6126     }
6127     if ( aExp == 0 ) {
6128         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6129         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6130     }
6131     bSig |= LIT64( 0x8000000000000000 );
6132     zSign = aSign;
6133     expDiff = aExp - bExp;
6134     aSig1 = 0;
6135     if ( expDiff < 0 ) {
6136         if ( expDiff < -1 ) return a;
6137         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6138         expDiff = 0;
6139     }
6140     q = ( bSig <= aSig0 );
6141     if ( q ) aSig0 -= bSig;
6142     expDiff -= 64;
6143     while ( 0 < expDiff ) {
6144         q = estimateDiv128To64( aSig0, aSig1, bSig );
6145         q = ( 2 < q ) ? q - 2 : 0;
6146         mul64To128( bSig, q, &term0, &term1 );
6147         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6148         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6149         expDiff -= 62;
6150     }
6151     expDiff += 64;
6152     if ( 0 < expDiff ) {
6153         q = estimateDiv128To64( aSig0, aSig1, bSig );
6154         q = ( 2 < q ) ? q - 2 : 0;
6155         q >>= 64 - expDiff;
6156         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6157         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6158         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6159         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6160             ++q;
6161             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6162         }
6163     }
6164     else {
6165         term1 = 0;
6166         term0 = bSig;
6167     }
6168     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6169     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6170          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6171               && ( q & 1 ) )
6172        ) {
6173         aSig0 = alternateASig0;
6174         aSig1 = alternateASig1;
6175         zSign = ! zSign;
6176     }
6177     return
6178         normalizeRoundAndPackFloatx80(
6179             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6180 
6181 }
6182 
/*----------------------------------------------------------------------------
| Returns the square root of the extended double-precision floating-point
| value `a'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
|
| Special cases visible in the code: an invalid encoding, negative infinity
| and any negative nonzero value raise the invalid exception and return the
| default NaN; NaNs go through propagateFloatx80NaN(); positive infinity and
| negative zero are returned unchanged; positive zero returns positive zero.
*----------------------------------------------------------------------------*/

floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* +infinity is its own square root; -infinity is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned as is; other negatives are invalid. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Halve the unbiased exponent (bias 0x3FFF) to get the result exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Start from a 32-bit root estimate built from the top significand bits. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    /* Shift right by 2, or by 3 when the exponent field is odd. */
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    /* Widen the estimate to 64 bits with one division step. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = a - zSig0^2; step zSig0 down while the remainder is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low root word, estimated as remainder / (2 * zSig0). */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /* Only when the low 62 bits of the estimate are <= 5 is it corrected
       exactly and given a sticky bit. */
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Any nonzero remainder sets the lowest bit of zSig1. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Reassemble the 128-bit significand: zSig1 shifted left by one, with
       doubleZSig0 merged into the high word. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
6253 
6254 /*----------------------------------------------------------------------------
6255 | Returns 1 if the extended double-precision floating-point value `a' is equal
6256 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
6257 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6259 *----------------------------------------------------------------------------*/
6260 
6261 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6262 {
6263 
6264     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6265         || (extractFloatx80Exp(a) == 0x7FFF
6266             && (uint64_t) (extractFloatx80Frac(a) << 1))
6267         || (extractFloatx80Exp(b) == 0x7FFF
6268             && (uint64_t) (extractFloatx80Frac(b) << 1))
6269        ) {
6270         float_raise(float_flag_invalid, status);
6271         return 0;
6272     }
6273     return
6274            ( a.low == b.low )
6275         && (    ( a.high == b.high )
6276              || (    ( a.low == 0 )
6277                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6278            );
6279 
6280 }
6281 
6282 /*----------------------------------------------------------------------------
6283 | Returns 1 if the extended double-precision floating-point value `a' is
6284 | less than or equal to the corresponding value `b', and 0 otherwise.  The
6285 | invalid exception is raised if either operand is a NaN.  The comparison is
6286 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6287 | Arithmetic.
6288 *----------------------------------------------------------------------------*/
6289 
6290 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6291 {
6292     flag aSign, bSign;
6293 
6294     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6295         || (extractFloatx80Exp(a) == 0x7FFF
6296             && (uint64_t) (extractFloatx80Frac(a) << 1))
6297         || (extractFloatx80Exp(b) == 0x7FFF
6298             && (uint64_t) (extractFloatx80Frac(b) << 1))
6299        ) {
6300         float_raise(float_flag_invalid, status);
6301         return 0;
6302     }
6303     aSign = extractFloatx80Sign( a );
6304     bSign = extractFloatx80Sign( b );
6305     if ( aSign != bSign ) {
6306         return
6307                aSign
6308             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6309                  == 0 );
6310     }
6311     return
6312           aSign ? le128( b.high, b.low, a.high, a.low )
6313         : le128( a.high, a.low, b.high, b.low );
6314 
6315 }
6316 
6317 /*----------------------------------------------------------------------------
6318 | Returns 1 if the extended double-precision floating-point value `a' is
6319 | less than the corresponding value `b', and 0 otherwise.  The invalid
6320 | exception is raised if either operand is a NaN.  The comparison is performed
6321 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6322 *----------------------------------------------------------------------------*/
6323 
6324 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6325 {
6326     flag aSign, bSign;
6327 
6328     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6329         || (extractFloatx80Exp(a) == 0x7FFF
6330             && (uint64_t) (extractFloatx80Frac(a) << 1))
6331         || (extractFloatx80Exp(b) == 0x7FFF
6332             && (uint64_t) (extractFloatx80Frac(b) << 1))
6333        ) {
6334         float_raise(float_flag_invalid, status);
6335         return 0;
6336     }
6337     aSign = extractFloatx80Sign( a );
6338     bSign = extractFloatx80Sign( b );
6339     if ( aSign != bSign ) {
6340         return
6341                aSign
6342             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6343                  != 0 );
6344     }
6345     return
6346           aSign ? lt128( b.high, b.low, a.high, a.low )
6347         : lt128( a.high, a.low, b.high, b.low );
6348 
6349 }
6350 
6351 /*----------------------------------------------------------------------------
6352 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6353 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
6354 | either operand is a NaN.   The comparison is performed according to the
6355 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6356 *----------------------------------------------------------------------------*/
6357 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6358 {
6359     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6360         || (extractFloatx80Exp(a) == 0x7FFF
6361             && (uint64_t) (extractFloatx80Frac(a) << 1))
6362         || (extractFloatx80Exp(b) == 0x7FFF
6363             && (uint64_t) (extractFloatx80Frac(b) << 1))
6364        ) {
6365         float_raise(float_flag_invalid, status);
6366         return 1;
6367     }
6368     return 0;
6369 }
6370 
6371 /*----------------------------------------------------------------------------
6372 | Returns 1 if the extended double-precision floating-point value `a' is
6373 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6374 | cause an exception.  The comparison is performed according to the IEC/IEEE
6375 | Standard for Binary Floating-Point Arithmetic.
6376 *----------------------------------------------------------------------------*/
6377 
6378 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6379 {
6380 
6381     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6382         float_raise(float_flag_invalid, status);
6383         return 0;
6384     }
6385     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6386               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6387          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6388               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6389        ) {
6390         if (floatx80_is_signaling_nan(a, status)
6391          || floatx80_is_signaling_nan(b, status)) {
6392             float_raise(float_flag_invalid, status);
6393         }
6394         return 0;
6395     }
6396     return
6397            ( a.low == b.low )
6398         && (    ( a.high == b.high )
6399              || (    ( a.low == 0 )
6400                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6401            );
6402 
6403 }
6404 
6405 /*----------------------------------------------------------------------------
6406 | Returns 1 if the extended double-precision floating-point value `a' is less
6407 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6408 | do not cause an exception.  Otherwise, the comparison is performed according
6409 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6410 *----------------------------------------------------------------------------*/
6411 
6412 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6413 {
6414     flag aSign, bSign;
6415 
6416     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6417         float_raise(float_flag_invalid, status);
6418         return 0;
6419     }
6420     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6421               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6422          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6423               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6424        ) {
6425         if (floatx80_is_signaling_nan(a, status)
6426          || floatx80_is_signaling_nan(b, status)) {
6427             float_raise(float_flag_invalid, status);
6428         }
6429         return 0;
6430     }
6431     aSign = extractFloatx80Sign( a );
6432     bSign = extractFloatx80Sign( b );
6433     if ( aSign != bSign ) {
6434         return
6435                aSign
6436             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6437                  == 0 );
6438     }
6439     return
6440           aSign ? le128( b.high, b.low, a.high, a.low )
6441         : le128( a.high, a.low, b.high, b.low );
6442 
6443 }
6444 
6445 /*----------------------------------------------------------------------------
6446 | Returns 1 if the extended double-precision floating-point value `a' is less
6447 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6448 | an exception.  Otherwise, the comparison is performed according to the
6449 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6450 *----------------------------------------------------------------------------*/
6451 
6452 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6453 {
6454     flag aSign, bSign;
6455 
6456     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6457         float_raise(float_flag_invalid, status);
6458         return 0;
6459     }
6460     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6461               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6462          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6463               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6464        ) {
6465         if (floatx80_is_signaling_nan(a, status)
6466          || floatx80_is_signaling_nan(b, status)) {
6467             float_raise(float_flag_invalid, status);
6468         }
6469         return 0;
6470     }
6471     aSign = extractFloatx80Sign( a );
6472     bSign = extractFloatx80Sign( b );
6473     if ( aSign != bSign ) {
6474         return
6475                aSign
6476             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6477                  != 0 );
6478     }
6479     return
6480           aSign ? lt128( b.high, b.low, a.high, a.low )
6481         : lt128( a.high, a.low, b.high, b.low );
6482 
6483 }
6484 
6485 /*----------------------------------------------------------------------------
6486 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6487 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6488 | The comparison is performed according to the IEC/IEEE Standard for Binary
6489 | Floating-Point Arithmetic.
6490 *----------------------------------------------------------------------------*/
6491 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6492 {
6493     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6494         float_raise(float_flag_invalid, status);
6495         return 1;
6496     }
6497     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6498               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6499          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6500               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6501        ) {
6502         if (floatx80_is_signaling_nan(a, status)
6503          || floatx80_is_signaling_nan(b, status)) {
6504             float_raise(float_flag_invalid, status);
6505         }
6506         return 1;
6507     }
6508     return 0;
6509 }
6510 
6511 /*----------------------------------------------------------------------------
6512 | Returns the result of converting the quadruple-precision floating-point
6513 | value `a' to the 32-bit two's complement integer format.  The conversion
6514 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6515 | Arithmetic---which means in particular that the conversion is rounded
6516 | according to the current rounding mode.  If `a' is a NaN, the largest
6517 | positive integer is returned.  Otherwise, if the conversion overflows, the
6518 | largest integer with the same sign as `a' is returned.
6519 *----------------------------------------------------------------------------*/
6520 
6521 int32_t float128_to_int32(float128 a, float_status *status)
6522 {
6523     flag aSign;
6524     int32_t aExp, shiftCount;
6525     uint64_t aSig0, aSig1;
6526 
6527     aSig1 = extractFloat128Frac1( a );
6528     aSig0 = extractFloat128Frac0( a );
6529     aExp = extractFloat128Exp( a );
6530     aSign = extractFloat128Sign( a );
6531     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6532     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6533     aSig0 |= ( aSig1 != 0 );
6534     shiftCount = 0x4028 - aExp;
6535     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6536     return roundAndPackInt32(aSign, aSig0, status);
6537 
6538 }
6539 
6540 /*----------------------------------------------------------------------------
6541 | Returns the result of converting the quadruple-precision floating-point
6542 | value `a' to the 32-bit two's complement integer format.  The conversion
6543 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6544 | Arithmetic, except that the conversion is always rounded toward zero.  If
6545 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6546 | conversion overflows, the largest integer with the same sign as `a' is
6547 | returned.
6548 *----------------------------------------------------------------------------*/
6549 
6550 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6551 {
6552     flag aSign;
6553     int32_t aExp, shiftCount;
6554     uint64_t aSig0, aSig1, savedASig;
6555     int32_t z;
6556 
6557     aSig1 = extractFloat128Frac1( a );
6558     aSig0 = extractFloat128Frac0( a );
6559     aExp = extractFloat128Exp( a );
6560     aSign = extractFloat128Sign( a );
6561     aSig0 |= ( aSig1 != 0 );
6562     if ( 0x401E < aExp ) {
6563         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6564         goto invalid;
6565     }
6566     else if ( aExp < 0x3FFF ) {
6567         if (aExp || aSig0) {
6568             status->float_exception_flags |= float_flag_inexact;
6569         }
6570         return 0;
6571     }
6572     aSig0 |= LIT64( 0x0001000000000000 );
6573     shiftCount = 0x402F - aExp;
6574     savedASig = aSig0;
6575     aSig0 >>= shiftCount;
6576     z = aSig0;
6577     if ( aSign ) z = - z;
6578     if ( ( z < 0 ) ^ aSign ) {
6579  invalid:
6580         float_raise(float_flag_invalid, status);
6581         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6582     }
6583     if ( ( aSig0<<shiftCount ) != savedASig ) {
6584         status->float_exception_flags |= float_flag_inexact;
6585     }
6586     return z;
6587 
6588 }
6589 
6590 /*----------------------------------------------------------------------------
6591 | Returns the result of converting the quadruple-precision floating-point
6592 | value `a' to the 64-bit two's complement integer format.  The conversion
6593 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6594 | Arithmetic---which means in particular that the conversion is rounded
6595 | according to the current rounding mode.  If `a' is a NaN, the largest
6596 | positive integer is returned.  Otherwise, if the conversion overflows, the
6597 | largest integer with the same sign as `a' is returned.
6598 *----------------------------------------------------------------------------*/
6599 
6600 int64_t float128_to_int64(float128 a, float_status *status)
6601 {
6602     flag aSign;
6603     int32_t aExp, shiftCount;
6604     uint64_t aSig0, aSig1;
6605 
6606     aSig1 = extractFloat128Frac1( a );
6607     aSig0 = extractFloat128Frac0( a );
6608     aExp = extractFloat128Exp( a );
6609     aSign = extractFloat128Sign( a );
6610     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6611     shiftCount = 0x402F - aExp;
6612     if ( shiftCount <= 0 ) {
6613         if ( 0x403E < aExp ) {
6614             float_raise(float_flag_invalid, status);
6615             if (    ! aSign
6616                  || (    ( aExp == 0x7FFF )
6617                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6618                     )
6619                ) {
6620                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6621             }
6622             return (int64_t) LIT64( 0x8000000000000000 );
6623         }
6624         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6625     }
6626     else {
6627         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6628     }
6629     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6630 
6631 }
6632 
6633 /*----------------------------------------------------------------------------
6634 | Returns the result of converting the quadruple-precision floating-point
6635 | value `a' to the 64-bit two's complement integer format.  The conversion
6636 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6637 | Arithmetic, except that the conversion is always rounded toward zero.
6638 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6639 | the conversion overflows, the largest integer with the same sign as `a' is
6640 | returned.
6641 *----------------------------------------------------------------------------*/
6642 
6643 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6644 {
6645     flag aSign;
6646     int32_t aExp, shiftCount;
6647     uint64_t aSig0, aSig1;
6648     int64_t z;
6649 
6650     aSig1 = extractFloat128Frac1( a );
6651     aSig0 = extractFloat128Frac0( a );
6652     aExp = extractFloat128Exp( a );
6653     aSign = extractFloat128Sign( a );
         /* Nonzero exponent: make the implicit integer bit (bit 48) explicit. */
6654     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
         /* shiftCount > 0: aSig0 has to move left (pulling bits in from aSig1)
          * to form the integer; otherwise it moves right by -shiftCount. */
6655     shiftCount = aExp - 0x402F;
6656     if ( 0 < shiftCount ) {
6657         if ( 0x403E <= aExp ) {
                 /* Too large for int64_t, with one exception tested below: a
                  * negative value whose integer part is exactly the most
                  * negative integer.  Strip the implicit bit again so aSig0
                  * holds only fraction bits for the NaN test. */
6658             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
                 /* a.high == 0xC03E000000000000: sign set, exponent field
                  * 0x403E, high fraction zero.  The bound on aSig1 keeps the
                  * low fraction below one integer unit. */
6659             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6660                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6661                 if (aSig1) {
6662                     status->float_exception_flags |= float_flag_inexact;
6663                 }
6664             }
6665             else {
6666                 float_raise(float_flag_invalid, status);
                     /* Positive overflow and NaNs saturate to the largest
                      * positive integer; everything else falls through. */
6667                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6668                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6669                 }
6670             }
6671             return (int64_t) LIT64( 0x8000000000000000 );
6672         }
             /* 1 <= shiftCount <= 14 here.  The top shiftCount bits of aSig1
              * join the integer; whatever remains of aSig1 is discarded
              * fraction. */
6673         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6674         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6675             status->float_exception_flags |= float_flag_inexact;
6676         }
6677     }
6678     else {
6679         if ( aExp < 0x3FFF ) {
                 /* Magnitude below one: truncates to 0, inexact unless the
                  * input is a zero. */
6680             if ( aExp | aSig0 | aSig1 ) {
6681                 status->float_exception_flags |= float_flag_inexact;
6682             }
6683             return 0;
6684         }
             /* The integer lies entirely inside aSig0. */
6685         z = aSig0>>( - shiftCount );
             /* Inexact if the low word is nonzero or if set bits were shifted
              * out of aSig0.  The shiftCount test avoids the shift when
              * nothing was dropped. */
6686         if (    aSig1
6687              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6688             status->float_exception_flags |= float_flag_inexact;
6689         }
6690     }
6691     if ( aSign ) z = - z;
6692     return z;
6693 
6694 }
6695 
6696 /*----------------------------------------------------------------------------
6697 | Returns the result of converting the quadruple-precision floating-point value
6698 | `a' to the 64-bit unsigned integer format.  The conversion is
6699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6700 | Arithmetic---which means in particular that the conversion is rounded
6701 | according to the current rounding mode.  If `a' is a NaN, the largest
6702 | positive integer is returned.  If the conversion overflows, the
6703 | largest unsigned integer is returned.  If 'a' is negative, the value is
6704 | rounded and zero is returned; negative values that do not round to zero
6705 | will raise the inexact exception.
6706 *----------------------------------------------------------------------------*/
6707 
6708 uint64_t float128_to_uint64(float128 a, float_status *status)
6709 {
6710     flag aSign;
6711     int aExp;
6712     int shiftCount;
6713     uint64_t aSig0, aSig1;
6714 
6715     aSig0 = extractFloat128Frac0(a);
6716     aSig1 = extractFloat128Frac1(a);
6717     aExp = extractFloat128Exp(a);
6718     aSign = extractFloat128Sign(a);
6719     if (aSign && (aExp > 0x3FFE)) {
6720         float_raise(float_flag_invalid, status);
6721         if (float128_is_any_nan(a)) {
6722             return LIT64(0xFFFFFFFFFFFFFFFF);
6723         } else {
6724             return 0;
6725         }
6726     }
6727     if (aExp) {
6728         aSig0 |= LIT64(0x0001000000000000);
6729     }
6730     shiftCount = 0x402F - aExp;
6731     if (shiftCount <= 0) {
6732         if (0x403E < aExp) {
6733             float_raise(float_flag_invalid, status);
6734             return LIT64(0xFFFFFFFFFFFFFFFF);
6735         }
6736         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6737     } else {
6738         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6739     }
6740     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6741 }
6742 
6743 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6744 {
6745     uint64_t v;
6746     signed char current_rounding_mode = status->float_rounding_mode;
6747 
6748     set_float_rounding_mode(float_round_to_zero, status);
6749     v = float128_to_uint64(a, status);
6750     set_float_rounding_mode(current_rounding_mode, status);
6751 
6752     return v;
6753 }
6754 
6755 /*----------------------------------------------------------------------------
6756 | Returns the result of converting the quadruple-precision floating-point
6757 | value `a' to the 32-bit unsigned integer format.  The conversion
6758 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6759 | Arithmetic except that the conversion is always rounded toward zero.
6760 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6761 | if the conversion overflows, the largest unsigned integer is returned.
6762 | If 'a' is negative, the value is rounded and zero is returned; negative
6763 | values that do not round to zero will raise the inexact exception.
6764 *----------------------------------------------------------------------------*/
6765 
6766 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6767 {
6768     uint64_t v;
6769     uint32_t res;
6770     int old_exc_flags = get_float_exception_flags(status);
6771 
6772     v = float128_to_uint64_round_to_zero(a, status);
6773     if (v > 0xffffffff) {
6774         res = 0xffffffff;
6775     } else {
6776         return v;
6777     }
6778     set_float_exception_flags(old_exc_flags, status);
6779     float_raise(float_flag_invalid, status);
6780     return res;
6781 }
6782 
6783 /*----------------------------------------------------------------------------
6784 | Returns the result of converting the quadruple-precision floating-point
6785 | value `a' to the single-precision floating-point format.  The conversion
6786 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6787 | Arithmetic.
6788 *----------------------------------------------------------------------------*/
6789 
6790 float32 float128_to_float32(float128 a, float_status *status)
6791 {
6792     flag aSign;
6793     int32_t aExp;
6794     uint64_t aSig0, aSig1;
6795     uint32_t zSig;
6796 
6797     aSig1 = extractFloat128Frac1( a );
6798     aSig0 = extractFloat128Frac0( a );
6799     aExp = extractFloat128Exp( a );
6800     aSign = extractFloat128Sign( a );
6801     if ( aExp == 0x7FFF ) {
6802         if ( aSig0 | aSig1 ) {
6803             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6804         }
6805         return packFloat32( aSign, 0xFF, 0 );
6806     }
6807     aSig0 |= ( aSig1 != 0 );
6808     shift64RightJamming( aSig0, 18, &aSig0 );
6809     zSig = aSig0;
6810     if ( aExp || zSig ) {
6811         zSig |= 0x40000000;
6812         aExp -= 0x3F81;
6813     }
6814     return roundAndPackFloat32(aSign, aExp, zSig, status);
6815 
6816 }
6817 
6818 /*----------------------------------------------------------------------------
6819 | Returns the result of converting the quadruple-precision floating-point
6820 | value `a' to the double-precision floating-point format.  The conversion
6821 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6822 | Arithmetic.
6823 *----------------------------------------------------------------------------*/
6824 
6825 float64 float128_to_float64(float128 a, float_status *status)
6826 {
6827     flag aSign;
6828     int32_t aExp;
6829     uint64_t aSig0, aSig1;
6830 
6831     aSig1 = extractFloat128Frac1( a );
6832     aSig0 = extractFloat128Frac0( a );
6833     aExp = extractFloat128Exp( a );
6834     aSign = extractFloat128Sign( a );
6835     if ( aExp == 0x7FFF ) {
6836         if ( aSig0 | aSig1 ) {
6837             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6838         }
6839         return packFloat64( aSign, 0x7FF, 0 );
6840     }
6841     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6842     aSig0 |= ( aSig1 != 0 );
6843     if ( aExp || aSig0 ) {
6844         aSig0 |= LIT64( 0x4000000000000000 );
6845         aExp -= 0x3C01;
6846     }
6847     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6848 
6849 }
6850 
6851 /*----------------------------------------------------------------------------
6852 | Returns the result of converting the quadruple-precision floating-point
6853 | value `a' to the extended double-precision floating-point format.  The
6854 | conversion is performed according to the IEC/IEEE Standard for Binary
6855 | Floating-Point Arithmetic.
6856 *----------------------------------------------------------------------------*/
6857 
6858 floatx80 float128_to_floatx80(float128 a, float_status *status)
6859 {
6860     flag aSign;
6861     int32_t aExp;
6862     uint64_t aSig0, aSig1;
6863 
6864     aSig1 = extractFloat128Frac1( a );
6865     aSig0 = extractFloat128Frac0( a );
6866     aExp = extractFloat128Exp( a );
6867     aSign = extractFloat128Sign( a );
6868     if ( aExp == 0x7FFF ) {
6869         if ( aSig0 | aSig1 ) {
6870             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6871         }
6872         return packFloatx80(aSign, floatx80_infinity_high,
6873                                    floatx80_infinity_low);
6874     }
6875     if ( aExp == 0 ) {
6876         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6877         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6878     }
6879     else {
6880         aSig0 |= LIT64( 0x0001000000000000 );
6881     }
6882     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6883     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6884 
6885 }
6886 
6887 /*----------------------------------------------------------------------------
6888 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6889 | returns the result as a quadruple-precision floating-point value.  The
6890 | operation is performed according to the IEC/IEEE Standard for Binary
6891 | Floating-Point Arithmetic.
6892 *----------------------------------------------------------------------------*/
6893 
6894 float128 float128_round_to_int(float128 a, float_status *status)
6895 {
6896     flag aSign;
6897     int32_t aExp;
6898     uint64_t lastBitMask, roundBitsMask;
6899     float128 z;
6900 
6901     aExp = extractFloat128Exp( a );
         /* Case 1: exponent >= 0x402F.  Only bits of the low word are rounded
          * away here; z.high changes only through carries. */
6902     if ( 0x402F <= aExp ) {
6903         if ( 0x406F <= aExp ) {
                 /* Returned unchanged, except that a NaN goes through
                  * propagateFloat128NaN. */
6904             if (    ( aExp == 0x7FFF )
6905                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6906                ) {
6907                 return propagateFloat128NaN(a, a, status);
6908             }
6909             return a;
6910         }
             /* lastBitMask selects the units bit inside z.low.  The shift is
              * done in two steps so that each count stays below 64; for
              * aExp == 0x402F the result is 0, meaning the units bit is bit 0
              * of z.high and all of z.low is fraction. */
6911         lastBitMask = 1;
6912         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
             /* All bits below the units bit (all ones when lastBitMask is 0). */
6913         roundBitsMask = lastBitMask - 1;
6914         z = a;
6915         switch (status->float_rounding_mode) {
6916         case float_round_nearest_even:
6917             if ( lastBitMask ) {
                     /* Add one half; on an exact tie (no round bits left)
                      * clear the units bit so the result is even. */
6918                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6919                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6920             }
6921             else {
                     /* The half bit is the top bit of z.low: round up when it
                      * is set, then force z.high even if the rest of z.low is
                      * zero (exact tie). */
6922                 if ( (int64_t) z.low < 0 ) {
6923                     ++z.high;
6924                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6925                 }
6926             }
6927             break;
6928         case float_round_ties_away:
                 /* Same as above but without the tie-to-even correction. */
6929             if (lastBitMask) {
6930                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6931             } else {
6932                 if ((int64_t) z.low < 0) {
6933                     ++z.high;
6934                 }
6935             }
6936             break;
6937         case float_round_to_zero:
6938             break;
6939         case float_round_up:
                 /* Only non-negative values move away from zero. */
6940             if (!extractFloat128Sign(z)) {
6941                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6942             }
6943             break;
6944         case float_round_down:
                 /* Only negative values move away from zero. */
6945             if (extractFloat128Sign(z)) {
6946                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6947             }
6948             break;
6949         default:
6950             abort();
6951         }
             /* Drop the fraction bits. */
6952         z.low &= ~ roundBitsMask;
6953     }
6954     else {
             /* Case 2: exponent < 0x402F.  The units bit is in a.high (or the
              * magnitude is below one) and all of a.low is fraction. */
6955         if ( aExp < 0x3FFF ) {
                 /* Exponent below 0x3FFF: the result is a zero or
                  * +/-packFloat128(.., 0x3FFF, 0, 0).  A zero input (all bits
                  * clear apart from the sign) is returned unchanged. */
6956             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6957             status->float_exception_flags |= float_flag_inexact;
6958             aSign = extractFloat128Sign( a );
                 /* There is no default label: float_round_to_zero and any
                  * other mode fall out of the switch to the signed zero. */
6959             switch (status->float_rounding_mode) {
6960              case float_round_nearest_even:
                     /* Exponent 0x3FFE with a nonzero fraction rounds to
                      * exponent 0x3FFF; a zero fraction (the tie) drops to
                      * zero. */
6961                 if (    ( aExp == 0x3FFE )
6962                      && (   extractFloat128Frac0( a )
6963                           | extractFloat128Frac1( a ) )
6964                    ) {
6965                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6966                 }
6967                 break;
6968             case float_round_ties_away:
                     /* Any input with exponent 0x3FFE, tie included. */
6969                 if (aExp == 0x3FFE) {
6970                     return packFloat128(aSign, 0x3FFF, 0, 0);
6971                 }
6972                 break;
6973              case float_round_down:
6974                 return
6975                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6976                     : packFloat128( 0, 0, 0, 0 );
6977              case float_round_up:
6978                 return
6979                       aSign ? packFloat128( 1, 0, 0, 0 )
6980                     : packFloat128( 0, 0x3FFF, 0, 0 );
6981             }
6982             return packFloat128( aSign, 0, 0, 0 );
6983         }
             /* Units bit within a.high; the shift count is between 1 and 48. */
6984         lastBitMask = 1;
6985         lastBitMask <<= 0x402F - aExp;
6986         roundBitsMask = lastBitMask - 1;
6987         z.low = 0;
6988         z.high = a.high;
6989         switch (status->float_rounding_mode) {
6990         case float_round_nearest_even:
                 /* Add one half; an exact tie needs both the remaining round
                  * bits in z.high and the whole of a.low to be zero. */
6991             z.high += lastBitMask>>1;
6992             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6993                 z.high &= ~ lastBitMask;
6994             }
6995             break;
6996         case float_round_ties_away:
6997             z.high += lastBitMask>>1;
6998             break;
6999         case float_round_to_zero:
7000             break;
7001         case float_round_up:
7002             if (!extractFloat128Sign(z)) {
                     /* Fold a.low into bit 0 as a sticky bit so that a
                      * nonzero low word alone is enough to round up. */
7003                 z.high |= ( a.low != 0 );
7004                 z.high += roundBitsMask;
7005             }
7006             break;
7007         case float_round_down:
7008             if (extractFloat128Sign(z)) {
7009                 z.high |= (a.low != 0);
7010                 z.high += roundBitsMask;
7011             }
7012             break;
7013         default:
7014             abort();
7015         }
7016         z.high &= ~ roundBitsMask;
7017     }
         /* Any change to the bit pattern means the input was not an integer. */
7018     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7019         status->float_exception_flags |= float_flag_inexact;
7020     }
7021     return z;
7022 
7023 }
7024 
7025 /*----------------------------------------------------------------------------
7026 | Returns the result of adding the absolute values of the quadruple-precision
7027 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7028 | before being returned.  `zSign' is ignored if the result is a NaN.
7029 | The addition is performed according to the IEC/IEEE Standard for Binary
7030 | Floating-Point Arithmetic.
7031 *----------------------------------------------------------------------------*/
7032 
7033 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
7034                                 float_status *status)
7035 {
7036     int32_t aExp, bExp, zExp;
7037     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7038     int32_t expDiff;
7039 
7040     aSig1 = extractFloat128Frac1( a );
7041     aSig0 = extractFloat128Frac0( a );
7042     aExp = extractFloat128Exp( a );
7043     bSig1 = extractFloat128Frac1( b );
7044     bSig0 = extractFloat128Frac0( b );
7045     bExp = extractFloat128Exp( b );
7046     expDiff = aExp - bExp;
7047     if ( 0 < expDiff ) {
             /* a has the larger exponent: align b to it. */
7048         if ( aExp == 0x7FFF ) {
                 /* a is a NaN (propagate) or an infinity (returned as is). */
7049             if (aSig0 | aSig1) {
7050                 return propagateFloat128NaN(a, b, status);
7051             }
7052             return a;
7053         }
             /* b with exponent field 0 gets no implicit bit and one less
              * alignment shift; otherwise its implicit bit (bit 48) is set. */
7054         if ( bExp == 0 ) {
7055             --expDiff;
7056         }
7057         else {
7058             bSig0 |= LIT64( 0x0001000000000000 );
7059         }
             /* zSig2 receives the shifted-out bits as a third word. */
7060         shift128ExtraRightJamming(
7061             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7062         zExp = aExp;
7063     }
7064     else if ( expDiff < 0 ) {
             /* Mirror image: b has the larger exponent, align a to it. */
7065         if ( bExp == 0x7FFF ) {
7066             if (bSig0 | bSig1) {
7067                 return propagateFloat128NaN(a, b, status);
7068             }
                 /* b is an infinity: the result is an infinity carrying the
                  * sign passed in zSign. */
7069             return packFloat128( zSign, 0x7FFF, 0, 0 );
7070         }
7071         if ( aExp == 0 ) {
7072             ++expDiff;
7073         }
7074         else {
7075             aSig0 |= LIT64( 0x0001000000000000 );
7076         }
7077         shift128ExtraRightJamming(
7078             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7079         zExp = bExp;
7080     }
7081     else {
             /* Equal exponents: no alignment needed. */
7082         if ( aExp == 0x7FFF ) {
7083             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7084                 return propagateFloat128NaN(a, b, status);
7085             }
7086             return a;
7087         }
7088         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7089         if ( aExp == 0 ) {
                 /* Both exponent fields are 0: the fraction sum is packed
                  * directly with exponent 0, or replaced by a signed zero
                  * when flush_to_zero is set (raising output_denormal if the
                  * sum was nonzero). */
7090             if (status->flush_to_zero) {
7091                 if (zSig0 | zSig1) {
7092                     float_raise(float_flag_output_denormal, status);
7093                 }
7094                 return packFloat128(zSign, 0, 0, 0);
7095             }
                 /* NOTE(review): a sum reaching bit 48 presumably relies on
                  * packFloat128 adding the significand into the exponent
                  * field -- confirm against its definition. */
7096             return packFloat128( zSign, 0, zSig0, zSig1 );
7097         }
             /* Neither implicit bit was added above; together they are worth
              * two units of bit 48, i.e. bit 49.  The sum therefore always
              * needs the one-bit right shift at shiftRight1. */
7098         zSig2 = 0;
7099         zSig0 |= LIT64( 0x0002000000000000 );
7100         zExp = aExp;
7101         goto shiftRight1;
7102     }
         /* Supplies the implicit bit of the larger-exponent operand.  When a
          * is the larger one this is a's own bit.  When b is the larger one,
          * bit 48 of the aligned aSig0 is clear, so setting it here stands in
          * for b's implicit bit in the sum below. */
7103     aSig0 |= LIT64( 0x0001000000000000 );
7104     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
         /* No carry into bit 49: round with the exponent lowered by one.
          * With a carry, restore the exponent and shift right one bit. */
7105     --zExp;
7106     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
7107     ++zExp;
7108  shiftRight1:
7109     shift128ExtraRightJamming(
7110         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7111  roundAndPack:
7112     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7113 
7114 }
7115 
7116 /*----------------------------------------------------------------------------
7117 | Returns the result of subtracting the absolute values of the quadruple-
7118 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7119 | difference is negated before being returned.  `zSign' is ignored if the
7120 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7121 | Standard for Binary Floating-Point Arithmetic.
7122 *----------------------------------------------------------------------------*/
7123 
7124 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7125                                 float_status *status)
7126 {
7127     int32_t aExp, bExp, zExp;
7128     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7129     int32_t expDiff;
7130 
7131     aSig1 = extractFloat128Frac1( a );
7132     aSig0 = extractFloat128Frac0( a );
7133     aExp = extractFloat128Exp( a );
7134     bSig1 = extractFloat128Frac1( b );
7135     bSig0 = extractFloat128Frac0( b );
7136     bExp = extractFloat128Exp( b );
7137     expDiff = aExp - bExp;
         /* Shift both fractions left by 14; the implicit bit position becomes
          * bit 62 (0x4000000000000000) of the high word. */
7138     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7139     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7140     if ( 0 < expDiff ) goto aExpBigger;
7141     if ( expDiff < 0 ) goto bExpBigger;
         /* Equal exponents from here down to bExpBigger. */
7142     if ( aExp == 0x7FFF ) {
7143         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7144             return propagateFloat128NaN(a, b, status);
7145         }
             /* Both operands are infinities: the difference is invalid. */
7146         float_raise(float_flag_invalid, status);
7147         return float128_default_nan(status);
7148     }
         /* Both exponent fields are 0: continue with exponent 1 for both. */
7149     if ( aExp == 0 ) {
7150         aExp = 1;
7151         bExp = 1;
7152     }
         /* Order the magnitudes, high word first.  With equal exponents the
          * implicit bits cancel, so they are never set on this path. */
7153     if ( bSig0 < aSig0 ) goto aBigger;
7154     if ( aSig0 < bSig0 ) goto bBigger;
7155     if ( bSig1 < aSig1 ) goto aBigger;
7156     if ( aSig1 < bSig1 ) goto bBigger;
         /* Exact cancellation: zero, negative only when rounding down. */
7157     return packFloat128(status->float_rounding_mode == float_round_down,
7158                         0, 0, 0);
7159  bExpBigger:
7160     if ( bExp == 0x7FFF ) {
7161         if (bSig0 | bSig1) {
7162             return propagateFloat128NaN(a, b, status);
7163         }
             /* b is an infinity being subtracted: the sign is flipped. */
7164         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7165     }
         /* a with exponent field 0 gets no implicit bit and one less
          * alignment shift. */
7166     if ( aExp == 0 ) {
7167         ++expDiff;
7168     }
7169     else {
7170         aSig0 |= LIT64( 0x4000000000000000 );
7171     }
7172     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
         /* bExp > aExp >= 0, so b always carries an implicit bit. */
7173     bSig0 |= LIT64( 0x4000000000000000 );
7174  bBigger:
         /* The magnitude of b dominates: compute b - a and flip the sign. */
7175     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7176     zExp = bExp;
7177     zSign ^= 1;
7178     goto normalizeRoundAndPack;
7179  aExpBigger:
7180     if ( aExp == 0x7FFF ) {
7181         if (aSig0 | aSig1) {
7182             return propagateFloat128NaN(a, b, status);
7183         }
7184         return a;
7185     }
7186     if ( bExp == 0 ) {
7187         --expDiff;
7188     }
7189     else {
7190         bSig0 |= LIT64( 0x4000000000000000 );
7191     }
7192     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
         /* aExp > bExp >= 0, so a always carries an implicit bit. */
7193     aSig0 |= LIT64( 0x4000000000000000 );
7194  aBigger:
7195     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7196     zExp = aExp;
7197  normalizeRoundAndPack:
         /* NOTE(review): the "- 14" presumably undoes the 14-bit pre-shift of
          * the significands; confirm against the exponent convention of
          * normalizeRoundAndPackFloat128. */
7198     --zExp;
7199     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7200                                          status);
7201 
7202 }
7203 
7204 /*----------------------------------------------------------------------------
7205 | Returns the result of adding the quadruple-precision floating-point values
7206 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7207 | for Binary Floating-Point Arithmetic.
7208 *----------------------------------------------------------------------------*/
7209 
7210 float128 float128_add(float128 a, float128 b, float_status *status)
7211 {
7212     flag aSign, bSign;
7213 
7214     aSign = extractFloat128Sign( a );
7215     bSign = extractFloat128Sign( b );
7216     if ( aSign == bSign ) {
7217         return addFloat128Sigs(a, b, aSign, status);
7218     }
7219     else {
7220         return subFloat128Sigs(a, b, aSign, status);
7221     }
7222 
7223 }
7224 
7225 /*----------------------------------------------------------------------------
7226 | Returns the result of subtracting the quadruple-precision floating-point
7227 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7228 | Standard for Binary Floating-Point Arithmetic.
7229 *----------------------------------------------------------------------------*/
7230 
7231 float128 float128_sub(float128 a, float128 b, float_status *status)
7232 {
7233     flag aSign, bSign;
7234 
7235     aSign = extractFloat128Sign( a );
7236     bSign = extractFloat128Sign( b );
7237     if ( aSign == bSign ) {
7238         return subFloat128Sigs(a, b, aSign, status);
7239     }
7240     else {
7241         return addFloat128Sigs(a, b, aSign, status);
7242     }
7243 
7244 }
7245 
7246 /*----------------------------------------------------------------------------
7247 | Returns the result of multiplying the quadruple-precision floating-point
7248 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7249 | Standard for Binary Floating-Point Arithmetic.
7250 *----------------------------------------------------------------------------*/
7251 
7252 float128 float128_mul(float128 a, float128 b, float_status *status)
7253 {
7254     flag aSign, bSign, zSign;
7255     int32_t aExp, bExp, zExp;
7256     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7257 
7258     aSig1 = extractFloat128Frac1( a );
7259     aSig0 = extractFloat128Frac0( a );
7260     aExp = extractFloat128Exp( a );
7261     aSign = extractFloat128Sign( a );
7262     bSig1 = extractFloat128Frac1( b );
7263     bSig0 = extractFloat128Frac0( b );
7264     bExp = extractFloat128Exp( b );
7265     bSign = extractFloat128Sign( b );
         /* The sign of a product is the XOR of the operand signs. */
7266     zSign = aSign ^ bSign;
7267     if ( aExp == 0x7FFF ) {
             /* a is a NaN or an infinity. */
7268         if (    ( aSig0 | aSig1 )
7269              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7270             return propagateFloat128NaN(a, b, status);
7271         }
             /* infinity times zero is invalid (handled at the label below). */
7272         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7273         return packFloat128( zSign, 0x7FFF, 0, 0 );
7274     }
7275     if ( bExp == 0x7FFF ) {
7276         if (bSig0 | bSig1) {
7277             return propagateFloat128NaN(a, b, status);
7278         }
7279         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                 /* Shared exit for both zero-times-infinity orderings. */
7280  invalid:
7281             float_raise(float_flag_invalid, status);
7282             return float128_default_nan(status);
7283         }
7284         return packFloat128( zSign, 0x7FFF, 0, 0 );
7285     }
         /* A zero operand gives a signed zero; an operand with exponent
          * field 0 and nonzero fraction is normalized first. */
7286     if ( aExp == 0 ) {
7287         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7288         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7289     }
7290     if ( bExp == 0 ) {
7291         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7292         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7293     }
7294     zExp = aExp + bExp - 0x4000;
         /* a gets its implicit bit; b is instead shifted left by 16 with no
          * implicit bit set, so its fraction sits at the top of the 128-bit
          * operand. */
7295     aSig0 |= LIT64( 0x0001000000000000 );
7296     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7297     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
         /* Adding a's significand to the upper 128 bits accounts for the
          * implicit bit that b's operand did not carry. */
7298     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
         /* The lowest product word survives only as a sticky bit. */
7299     zSig2 |= ( zSig3 != 0 );
         /* If the product reached bit 49 of zSig0, shift it right one bit
          * and bump the exponent. */
7300     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
7301         shift128ExtraRightJamming(
7302             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7303         ++zExp;
7304     }
7305     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7306 
7307 }
7308 
7309 /*----------------------------------------------------------------------------
7310 | Returns the result of dividing the quadruple-precision floating-point value
7311 | `a' by the corresponding value `b'.  The operation is performed according to
7312 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7313 *----------------------------------------------------------------------------*/
7314 
7315 float128 float128_div(float128 a, float128 b, float_status *status)
7316 {
7317     flag aSign, bSign, zSign;
7318     int32_t aExp, bExp, zExp;
7319     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7320     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7321 
7322     aSig1 = extractFloat128Frac1( a );
7323     aSig0 = extractFloat128Frac0( a );
7324     aExp = extractFloat128Exp( a );
7325     aSign = extractFloat128Sign( a );
7326     bSig1 = extractFloat128Frac1( b );
7327     bSig0 = extractFloat128Frac0( b );
7328     bExp = extractFloat128Exp( b );
7329     bSign = extractFloat128Sign( b );
         /* The sign of a quotient is the XOR of the operand signs. */
7330     zSign = aSign ^ bSign;
7331     if ( aExp == 0x7FFF ) {
             /* a is a NaN or an infinity. */
7332         if (aSig0 | aSig1) {
7333             return propagateFloat128NaN(a, b, status);
7334         }
7335         if ( bExp == 0x7FFF ) {
7336             if (bSig0 | bSig1) {
7337                 return propagateFloat128NaN(a, b, status);
7338             }
                 /* infinity / infinity is invalid. */
7339             goto invalid;
7340         }
7341         return packFloat128( zSign, 0x7FFF, 0, 0 );
7342     }
7343     if ( bExp == 0x7FFF ) {
7344         if (bSig0 | bSig1) {
7345             return propagateFloat128NaN(a, b, status);
7346         }
             /* finite / infinity is a signed zero. */
7347         return packFloat128( zSign, 0, 0, 0 );
7348     }
7349     if ( bExp == 0 ) {
7350         if ( ( bSig0 | bSig1 ) == 0 ) {
                 /* Divisor is zero: 0/0 is invalid, anything else raises
                  * divide-by-zero and yields a signed infinity. */
7351             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7352  invalid:
7353                 float_raise(float_flag_invalid, status);
7354                 return float128_default_nan(status);
7355             }
7356             float_raise(float_flag_divbyzero, status);
7357             return packFloat128( zSign, 0x7FFF, 0, 0 );
7358         }
7359         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7360     }
7361     if ( aExp == 0 ) {
7362         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7363         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7364     }
7365     zExp = aExp - bExp + 0x3FFD;
         /* Set the implicit bits and shift both significands left by 15,
          * which puts the implicit bit at bit 63 of the high word. */
7366     shortShift128Left(
7367         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
7368     shortShift128Left(
7369         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
         /* Make the dividend strictly smaller than the divisor, paying for
          * the halving with one exponent step. */
7370     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7371         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7372         ++zExp;
7373     }
         /* First 64 quotient bits: estimate, form the remainder, and step
          * the estimate down while the remainder is negative. */
7374     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7375     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7376     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7377     while ( (int64_t) rem0 < 0 ) {
7378         --zSig0;
7379         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7380     }
         /* Next 64 quotient bits, estimated from the remainder. */
7381     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
         /* The exact correction runs only when the low 14 bits of the
          * estimate are 4 or less.
          * NOTE(review): presumably the estimate's error bound makes the
          * correction unnecessary otherwise -- confirm against
          * estimateDiv128To64. */
7382     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7383         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7384         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7385         while ( (int64_t) rem1 < 0 ) {
7386             --zSig1;
7387             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7388         }
             /* Any nonzero final remainder becomes a sticky bit. */
7389         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7390     }
         /* Undo the 15-bit pre-shift; zSig2 collects the shifted-out bits. */
7391     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7392     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7393 
7394 }
7395 
7396 /*----------------------------------------------------------------------------
7397 | Returns the remainder of the quadruple-precision floating-point value `a'
7398 | with respect to the corresponding value `b'.  The operation is performed
7399 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7400 *----------------------------------------------------------------------------*/
7401 
7402 float128 float128_rem(float128 a, float128 b, float_status *status)
7403 {
7404     flag aSign, zSign;
7405     int32_t aExp, bExp, expDiff;
7406     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7407     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7408     int64_t sigMean0;
7409 
7410     aSig1 = extractFloat128Frac1( a );
7411     aSig0 = extractFloat128Frac0( a );
7412     aExp = extractFloat128Exp( a );
7413     aSign = extractFloat128Sign( a );
7414     bSig1 = extractFloat128Frac1( b );
7415     bSig0 = extractFloat128Frac0( b );
7416     bExp = extractFloat128Exp( b );
7417     if ( aExp == 0x7FFF ) {
7418         if (    ( aSig0 | aSig1 )
7419              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7420             return propagateFloat128NaN(a, b, status);
7421         }
7422         goto invalid;
7423     }
7424     if ( bExp == 0x7FFF ) {
7425         if (bSig0 | bSig1) {
7426             return propagateFloat128NaN(a, b, status);
7427         }
7428         return a;
7429     }
7430     if ( bExp == 0 ) {
7431         if ( ( bSig0 | bSig1 ) == 0 ) {
7432  invalid:
7433             float_raise(float_flag_invalid, status);
7434             return float128_default_nan(status);
7435         }
7436         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7437     }
7438     if ( aExp == 0 ) {
7439         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7440         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7441     }
7442     expDiff = aExp - bExp;
7443     if ( expDiff < -1 ) return a;
7444     shortShift128Left(
7445         aSig0 | LIT64( 0x0001000000000000 ),
7446         aSig1,
7447         15 - ( expDiff < 0 ),
7448         &aSig0,
7449         &aSig1
7450     );
7451     shortShift128Left(
7452         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7453     q = le128( bSig0, bSig1, aSig0, aSig1 );
7454     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7455     expDiff -= 64;
7456     while ( 0 < expDiff ) {
7457         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7458         q = ( 4 < q ) ? q - 4 : 0;
7459         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7460         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7461         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7462         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7463         expDiff -= 61;
7464     }
7465     if ( -64 < expDiff ) {
7466         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7467         q = ( 4 < q ) ? q - 4 : 0;
7468         q >>= - expDiff;
7469         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7470         expDiff += 52;
7471         if ( expDiff < 0 ) {
7472             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7473         }
7474         else {
7475             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7476         }
7477         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7478         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7479     }
7480     else {
7481         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7482         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7483     }
7484     do {
7485         alternateASig0 = aSig0;
7486         alternateASig1 = aSig1;
7487         ++q;
7488         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7489     } while ( 0 <= (int64_t) aSig0 );
7490     add128(
7491         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7492     if (    ( sigMean0 < 0 )
7493          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7494         aSig0 = alternateASig0;
7495         aSig1 = alternateASig1;
7496     }
7497     zSign = ( (int64_t) aSig0 < 0 );
7498     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7499     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7500                                          status);
7501 }
7502 
7503 /*----------------------------------------------------------------------------
7504 | Returns the square root of the quadruple-precision floating-point value `a'.
7505 | The operation is performed according to the IEC/IEEE Standard for Binary
7506 | Floating-Point Arithmetic.
7507 *----------------------------------------------------------------------------*/
7508 
7509 float128 float128_sqrt(float128 a, float_status *status)
7510 {
7511     flag aSign;
7512     int32_t aExp, zExp;
7513     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7514     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7515 
7516     aSig1 = extractFloat128Frac1( a );
7517     aSig0 = extractFloat128Frac0( a );
7518     aExp = extractFloat128Exp( a );
7519     aSign = extractFloat128Sign( a );
7520     if ( aExp == 0x7FFF ) {
7521         if (aSig0 | aSig1) {
7522             return propagateFloat128NaN(a, a, status);
7523         }
7524         if ( ! aSign ) return a;
7525         goto invalid;
7526     }
7527     if ( aSign ) {
7528         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7529  invalid:
7530         float_raise(float_flag_invalid, status);
7531         return float128_default_nan(status);
7532     }
7533     if ( aExp == 0 ) {
7534         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7535         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7536     }
7537     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7538     aSig0 |= LIT64( 0x0001000000000000 );
7539     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7540     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7541     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7542     doubleZSig0 = zSig0<<1;
7543     mul64To128( zSig0, zSig0, &term0, &term1 );
7544     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7545     while ( (int64_t) rem0 < 0 ) {
7546         --zSig0;
7547         doubleZSig0 -= 2;
7548         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7549     }
7550     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7551     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7552         if ( zSig1 == 0 ) zSig1 = 1;
7553         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7554         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7555         mul64To128( zSig1, zSig1, &term2, &term3 );
7556         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7557         while ( (int64_t) rem1 < 0 ) {
7558             --zSig1;
7559             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7560             term3 |= 1;
7561             term2 |= doubleZSig0;
7562             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7563         }
7564         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7565     }
7566     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7567     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7568 
7569 }
7570 
7571 /*----------------------------------------------------------------------------
7572 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7573 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7574 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7575 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7576 *----------------------------------------------------------------------------*/
7577 
7578 int float128_eq(float128 a, float128 b, float_status *status)
7579 {
7580 
7581     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7582               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7583          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7584               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7585        ) {
7586         float_raise(float_flag_invalid, status);
7587         return 0;
7588     }
7589     return
7590            ( a.low == b.low )
7591         && (    ( a.high == b.high )
7592              || (    ( a.low == 0 )
7593                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7594            );
7595 
7596 }
7597 
7598 /*----------------------------------------------------------------------------
7599 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7600 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7601 | exception is raised if either operand is a NaN.  The comparison is performed
7602 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7603 *----------------------------------------------------------------------------*/
7604 
7605 int float128_le(float128 a, float128 b, float_status *status)
7606 {
7607     flag aSign, bSign;
7608 
7609     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7610               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7611          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7612               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7613        ) {
7614         float_raise(float_flag_invalid, status);
7615         return 0;
7616     }
7617     aSign = extractFloat128Sign( a );
7618     bSign = extractFloat128Sign( b );
7619     if ( aSign != bSign ) {
7620         return
7621                aSign
7622             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7623                  == 0 );
7624     }
7625     return
7626           aSign ? le128( b.high, b.low, a.high, a.low )
7627         : le128( a.high, a.low, b.high, b.low );
7628 
7629 }
7630 
7631 /*----------------------------------------------------------------------------
7632 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7633 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7634 | raised if either operand is a NaN.  The comparison is performed according
7635 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7636 *----------------------------------------------------------------------------*/
7637 
7638 int float128_lt(float128 a, float128 b, float_status *status)
7639 {
7640     flag aSign, bSign;
7641 
7642     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7643               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7644          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7645               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7646        ) {
7647         float_raise(float_flag_invalid, status);
7648         return 0;
7649     }
7650     aSign = extractFloat128Sign( a );
7651     bSign = extractFloat128Sign( b );
7652     if ( aSign != bSign ) {
7653         return
7654                aSign
7655             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7656                  != 0 );
7657     }
7658     return
7659           aSign ? lt128( b.high, b.low, a.high, a.low )
7660         : lt128( a.high, a.low, b.high, b.low );
7661 
7662 }
7663 
7664 /*----------------------------------------------------------------------------
7665 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7666 | be compared, and 0 otherwise.  The invalid exception is raised if either
7667 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7668 | Standard for Binary Floating-Point Arithmetic.
7669 *----------------------------------------------------------------------------*/
7670 
7671 int float128_unordered(float128 a, float128 b, float_status *status)
7672 {
7673     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7674               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7675          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7676               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7677        ) {
7678         float_raise(float_flag_invalid, status);
7679         return 1;
7680     }
7681     return 0;
7682 }
7683 
7684 /*----------------------------------------------------------------------------
7685 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7686 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7687 | exception.  The comparison is performed according to the IEC/IEEE Standard
7688 | for Binary Floating-Point Arithmetic.
7689 *----------------------------------------------------------------------------*/
7690 
7691 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7692 {
7693 
7694     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7695               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7696          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7697               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7698        ) {
7699         if (float128_is_signaling_nan(a, status)
7700          || float128_is_signaling_nan(b, status)) {
7701             float_raise(float_flag_invalid, status);
7702         }
7703         return 0;
7704     }
7705     return
7706            ( a.low == b.low )
7707         && (    ( a.high == b.high )
7708              || (    ( a.low == 0 )
7709                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7710            );
7711 
7712 }
7713 
7714 /*----------------------------------------------------------------------------
7715 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7716 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7717 | cause an exception.  Otherwise, the comparison is performed according to the
7718 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7719 *----------------------------------------------------------------------------*/
7720 
7721 int float128_le_quiet(float128 a, float128 b, float_status *status)
7722 {
7723     flag aSign, bSign;
7724 
7725     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7726               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7727          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7728               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7729        ) {
7730         if (float128_is_signaling_nan(a, status)
7731          || float128_is_signaling_nan(b, status)) {
7732             float_raise(float_flag_invalid, status);
7733         }
7734         return 0;
7735     }
7736     aSign = extractFloat128Sign( a );
7737     bSign = extractFloat128Sign( b );
7738     if ( aSign != bSign ) {
7739         return
7740                aSign
7741             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7742                  == 0 );
7743     }
7744     return
7745           aSign ? le128( b.high, b.low, a.high, a.low )
7746         : le128( a.high, a.low, b.high, b.low );
7747 
7748 }
7749 
7750 /*----------------------------------------------------------------------------
7751 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7752 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7753 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7754 | Standard for Binary Floating-Point Arithmetic.
7755 *----------------------------------------------------------------------------*/
7756 
7757 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7758 {
7759     flag aSign, bSign;
7760 
7761     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7762               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7763          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7764               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7765        ) {
7766         if (float128_is_signaling_nan(a, status)
7767          || float128_is_signaling_nan(b, status)) {
7768             float_raise(float_flag_invalid, status);
7769         }
7770         return 0;
7771     }
7772     aSign = extractFloat128Sign( a );
7773     bSign = extractFloat128Sign( b );
7774     if ( aSign != bSign ) {
7775         return
7776                aSign
7777             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7778                  != 0 );
7779     }
7780     return
7781           aSign ? lt128( b.high, b.low, a.high, a.low )
7782         : lt128( a.high, a.low, b.high, b.low );
7783 
7784 }
7785 
7786 /*----------------------------------------------------------------------------
7787 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7788 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7789 | comparison is performed according to the IEC/IEEE Standard for Binary
7790 | Floating-Point Arithmetic.
7791 *----------------------------------------------------------------------------*/
7792 
7793 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7794 {
7795     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7796               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7797          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7798               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7799        ) {
7800         if (float128_is_signaling_nan(a, status)
7801          || float128_is_signaling_nan(b, status)) {
7802             float_raise(float_flag_invalid, status);
7803         }
7804         return 1;
7805     }
7806     return 0;
7807 }
7808 
7809 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7810                                             int is_quiet, float_status *status)
7811 {
7812     flag aSign, bSign;
7813 
7814     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7815         float_raise(float_flag_invalid, status);
7816         return float_relation_unordered;
7817     }
7818     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7819           ( extractFloatx80Frac( a )<<1 ) ) ||
7820         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7821           ( extractFloatx80Frac( b )<<1 ) )) {
7822         if (!is_quiet ||
7823             floatx80_is_signaling_nan(a, status) ||
7824             floatx80_is_signaling_nan(b, status)) {
7825             float_raise(float_flag_invalid, status);
7826         }
7827         return float_relation_unordered;
7828     }
7829     aSign = extractFloatx80Sign( a );
7830     bSign = extractFloatx80Sign( b );
7831     if ( aSign != bSign ) {
7832 
7833         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7834              ( ( a.low | b.low ) == 0 ) ) {
7835             /* zero case */
7836             return float_relation_equal;
7837         } else {
7838             return 1 - (2 * aSign);
7839         }
7840     } else {
7841         if (a.low == b.low && a.high == b.high) {
7842             return float_relation_equal;
7843         } else {
7844             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7845         }
7846     }
7847 }
7848 
7849 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7850 {
7851     return floatx80_compare_internal(a, b, 0, status);
7852 }
7853 
7854 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7855 {
7856     return floatx80_compare_internal(a, b, 1, status);
7857 }
7858 
7859 static inline int float128_compare_internal(float128 a, float128 b,
7860                                             int is_quiet, float_status *status)
7861 {
7862     flag aSign, bSign;
7863 
7864     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7865           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7866         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7867           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7868         if (!is_quiet ||
7869             float128_is_signaling_nan(a, status) ||
7870             float128_is_signaling_nan(b, status)) {
7871             float_raise(float_flag_invalid, status);
7872         }
7873         return float_relation_unordered;
7874     }
7875     aSign = extractFloat128Sign( a );
7876     bSign = extractFloat128Sign( b );
7877     if ( aSign != bSign ) {
7878         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7879             /* zero case */
7880             return float_relation_equal;
7881         } else {
7882             return 1 - (2 * aSign);
7883         }
7884     } else {
7885         if (a.low == b.low && a.high == b.high) {
7886             return float_relation_equal;
7887         } else {
7888             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7889         }
7890     }
7891 }
7892 
7893 int float128_compare(float128 a, float128 b, float_status *status)
7894 {
7895     return float128_compare_internal(a, b, 0, status);
7896 }
7897 
7898 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7899 {
7900     return float128_compare_internal(a, b, 1, status);
7901 }
7902 
7903 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7904 {
7905     flag aSign;
7906     int32_t aExp;
7907     uint64_t aSig;
7908 
7909     if (floatx80_invalid_encoding(a)) {
7910         float_raise(float_flag_invalid, status);
7911         return floatx80_default_nan(status);
7912     }
7913     aSig = extractFloatx80Frac( a );
7914     aExp = extractFloatx80Exp( a );
7915     aSign = extractFloatx80Sign( a );
7916 
7917     if ( aExp == 0x7FFF ) {
7918         if ( aSig<<1 ) {
7919             return propagateFloatx80NaN(a, a, status);
7920         }
7921         return a;
7922     }
7923 
7924     if (aExp == 0) {
7925         if (aSig == 0) {
7926             return a;
7927         }
7928         aExp++;
7929     }
7930 
7931     if (n > 0x10000) {
7932         n = 0x10000;
7933     } else if (n < -0x10000) {
7934         n = -0x10000;
7935     }
7936 
7937     aExp += n;
7938     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7939                                          aSign, aExp, aSig, 0, status);
7940 }
7941 
7942 float128 float128_scalbn(float128 a, int n, float_status *status)
7943 {
7944     flag aSign;
7945     int32_t aExp;
7946     uint64_t aSig0, aSig1;
7947 
7948     aSig1 = extractFloat128Frac1( a );
7949     aSig0 = extractFloat128Frac0( a );
7950     aExp = extractFloat128Exp( a );
7951     aSign = extractFloat128Sign( a );
7952     if ( aExp == 0x7FFF ) {
7953         if ( aSig0 | aSig1 ) {
7954             return propagateFloat128NaN(a, a, status);
7955         }
7956         return a;
7957     }
7958     if (aExp != 0) {
7959         aSig0 |= LIT64( 0x0001000000000000 );
7960     } else if (aSig0 == 0 && aSig1 == 0) {
7961         return a;
7962     } else {
7963         aExp++;
7964     }
7965 
7966     if (n > 0x10000) {
7967         n = 0x10000;
7968     } else if (n < -0x10000) {
7969         n = -0x10000;
7970     }
7971 
7972     aExp += n - 1;
7973     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7974                                          , status);
7975 
7976 }
7977