1 /*============================================================================
2 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
3 Package, Release 2b.
4 
5 Written by John R. Hauser.  This work was made possible in part by the
6 International Computer Science Institute, located at Suite 600, 1947 Center
7 Street, Berkeley, California 94704.  Funding was partially provided by the
8 National Science Foundation under grant MIP-9311980.  The original version
9 of this code was written as part of a project to build a fixed-point vector
10 processor in collaboration with the University of California at Berkeley,
11 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
12 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
13 arithmetic/SoftFloat.html'.
14 
15 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
16 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
17 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
18 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
19 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
20 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
21 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
22 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
23 
24 Derivative works are acceptable, even for commercial purposes, so long as
25 (1) the source code for the derivative work includes prominent notice that
26 the work is derivative, and (2) the source code includes prominent notice with
27 these four paragraphs for those parts of this code that are retained.
28 =============================================================================*/
29 
30 #define FLOAT128
31 
32 /*============================================================================
33  * Adapted for Bochs (x86 achitecture simulator) by
34  *            Stanislav Shwartsman [sshwarts at sourceforge net]
35  * ==========================================================================*/
36 
37 #include "softfloat.h"
38 #include "softfloat-round-pack.h"
39 
40 /*----------------------------------------------------------------------------
41 | Primitive arithmetic functions, including multi-word arithmetic, and
42 | division and square root approximations. (Can be specialized to target
43 | if desired).
44 *----------------------------------------------------------------------------*/
45 #define USE_estimateDiv128To64
46 #define USE_estimateSqrt32
47 #include "softfloat-macros.h"
48 
49 /*----------------------------------------------------------------------------
50 | Functions and definitions to determine:  (1) whether tininess for underflow
51 | is detected before or after rounding by default, (2) what (if anything)
52 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
53 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
54 | are propagated from function inputs to output.  These details are target-
55 | specific.
56 *----------------------------------------------------------------------------*/
57 #include "softfloat-specialize.h"
58 
59 /*----------------------------------------------------------------------------
60 | Returns the result of converting the 32-bit two's complement integer `a'
61 | to the single-precision floating-point format.  The conversion is performed
62 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
63 *----------------------------------------------------------------------------*/
64 
int32_to_float32(Bit32s a,float_status_t & status)65 float32 int32_to_float32(Bit32s a, float_status_t &status)
66 {
67     if (a == 0) return 0;
68     if (a == (Bit32s) 0x80000000) return packFloat32(1, 0x9E, 0);
69     int zSign = (a < 0);
70     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
71 }
72 
73 /*----------------------------------------------------------------------------
74 | Returns the result of converting the 32-bit two's complement integer `a'
75 | to the double-precision floating-point format.  The conversion is performed
76 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
77 *----------------------------------------------------------------------------*/
78 
int32_to_float64(Bit32s a)79 float64 int32_to_float64(Bit32s a)
80 {
81     if (a == 0) return 0;
82     int zSign = (a < 0);
83     Bit32u absA = zSign ? -a : a;
84     int shiftCount = countLeadingZeros32(absA) + 21;
85     Bit64u zSig = absA;
86     return packFloat64(zSign, 0x432 - shiftCount, zSig<<shiftCount);
87 }
88 
89 /*----------------------------------------------------------------------------
90 | Returns the result of converting the 64-bit two's complement integer `a'
91 | to the single-precision floating-point format.  The conversion is performed
92 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
93 *----------------------------------------------------------------------------*/
94 
int64_to_float32(Bit64s a,float_status_t & status)95 float32 int64_to_float32(Bit64s a, float_status_t &status)
96 {
97     if (a == 0) return 0;
98     int zSign = (a < 0);
99     Bit64u absA = zSign ? -a : a;
100     int shiftCount = countLeadingZeros64(absA) - 40;
101     if (0 <= shiftCount) {
102         return packFloat32(zSign, 0x95 - shiftCount, (Bit32u)(absA<<shiftCount));
103     }
104     else {
105         shiftCount += 7;
106         if (shiftCount < 0) {
107             absA = shift64RightJamming(absA, -shiftCount);
108         }
109         else {
110             absA <<= shiftCount;
111         }
112         return roundAndPackFloat32(zSign, 0x9C - shiftCount, (Bit32u) absA, status);
113     }
114 }
115 
116 /*----------------------------------------------------------------------------
117 | Returns the result of converting the 64-bit two's complement integer `a'
118 | to the double-precision floating-point format.  The conversion is performed
119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
120 *----------------------------------------------------------------------------*/
121 
int64_to_float64(Bit64s a,float_status_t & status)122 float64 int64_to_float64(Bit64s a, float_status_t &status)
123 {
124     if (a == 0) return 0;
125     if (a == (Bit64s) BX_CONST64(0x8000000000000000)) {
126         return packFloat64(1, 0x43E, 0);
127     }
128     int zSign = (a < 0);
129     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
130 }
131 
132 /*----------------------------------------------------------------------------
133 | Returns the result of converting the 32-bit unsigned integer `a' to the
134 | single-precision floating-point format.  The conversion is performed
135 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
136 *----------------------------------------------------------------------------*/
137 
uint32_to_float32(Bit32u a,float_status_t & status)138 float32 uint32_to_float32(Bit32u a, float_status_t &status)
139 {
140     if (a == 0) return 0;
141     if (a & 0x80000000) return normalizeRoundAndPackFloat32(0, 0x9D, a >> 1, status);
142     return normalizeRoundAndPackFloat32(0, 0x9C, a, status);
143 }
144 
145 /*----------------------------------------------------------------------------
146 | Returns the result of converting the 32-bit unsigned integer `a' to the
147 | double-precision floating-point format.  The conversion is performed
148 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
149 *----------------------------------------------------------------------------*/
150 
uint32_to_float64(Bit32u a)151 float64 uint32_to_float64(Bit32u a)
152 {
153    if (a == 0) return 0;
154    int shiftCount = countLeadingZeros32(a) + 21;
155    Bit64u zSig = a;
156    return packFloat64(0, 0x432 - shiftCount, zSig<<shiftCount);
157 }
158 
159 /*----------------------------------------------------------------------------
160 | Returns the result of converting the 64-bit unsigned integer integer `a'
161 | to the single-precision floating-point format.  The conversion is performed
162 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
163 *----------------------------------------------------------------------------*/
164 
uint64_to_float32(Bit64u a,float_status_t & status)165 float32 uint64_to_float32(Bit64u a, float_status_t &status)
166 {
167     if (a == 0) return 0;
168     int shiftCount = countLeadingZeros64(a) - 40;
169     if (0 <= shiftCount) {
170         return packFloat32(0, 0x95 - shiftCount, (Bit32u)(a<<shiftCount));
171     }
172     else {
173         shiftCount += 7;
174         if (shiftCount < 0) {
175             a = shift64RightJamming(a, -shiftCount);
176         }
177         else {
178             a <<= shiftCount;
179         }
180         return roundAndPackFloat32(0, 0x9C - shiftCount, (Bit32u) a, status);
181     }
182 }
183 
184 /*----------------------------------------------------------------------------
185 | Returns the result of converting the 64-bit unsigned integer integer `a'
186 | to the double-precision floating-point format.  The conversion is performed
187 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
188 *----------------------------------------------------------------------------*/
189 
uint64_to_float64(Bit64u a,float_status_t & status)190 float64 uint64_to_float64(Bit64u a, float_status_t &status)
191 {
192     if (a == 0) return 0;
193     return normalizeRoundAndPackFloat64(0, 0x43C, a, status);
194 }
195 
196 /*----------------------------------------------------------------------------
197 | Returns the result of converting the single-precision floating-point value
198 | `a' to the 32-bit two's complement integer format.  The conversion is
199 | performed according to the IEC/IEEE Standard for Binary Floating-Point
200 | Arithmetic - which means in particular that the conversion is rounded
201 | according to the current rounding mode.  If `a' is a NaN or the
202 | conversion overflows the integer indefinite value is returned.
203 *----------------------------------------------------------------------------*/
204 
float32_to_int32(float32 a,float_status_t & status)205 Bit32s float32_to_int32(float32 a, float_status_t &status)
206 {
207     Bit32u aSig = extractFloat32Frac(a);
208     Bit16s aExp = extractFloat32Exp(a);
209     int aSign = extractFloat32Sign(a);
210     if ((aExp == 0xFF) && aSig) aSign = 0;
211     if (aExp) aSig |= 0x00800000;
212     else {
213         if (get_denormals_are_zeros(status)) aSig = 0;
214     }
215     int shiftCount = 0xAF - aExp;
216     Bit64u aSig64 = Bit64u(aSig) << 32;
217     if (0 < shiftCount) aSig64 = shift64RightJamming(aSig64, shiftCount);
218     return roundAndPackInt32(aSign, aSig64, status);
219 }
220 
221 /*----------------------------------------------------------------------------
222 | Returns the result of converting the single-precision floating-point value
223 | `a' to the 32-bit two's complement integer format.  The conversion is
224 | performed according to the IEC/IEEE Standard for Binary Floating-Point
225 | Arithmetic, except that the conversion is always rounded toward zero.
226 | If `a' is a NaN or the conversion overflows, the integer indefinite
227 | value is returned.
228 *----------------------------------------------------------------------------*/
229 
float32_to_int32_round_to_zero(float32 a,float_status_t & status)230 Bit32s float32_to_int32_round_to_zero(float32 a, float_status_t &status)
231 {
232     int aSign;
233     Bit16s aExp;
234     Bit32u aSig;
235     Bit32s z;
236 
237     aSig = extractFloat32Frac(a);
238     aExp = extractFloat32Exp(a);
239     aSign = extractFloat32Sign(a);
240     int shiftCount = aExp - 0x9E;
241     if (0 <= shiftCount) {
242         if (a != 0xCF000000) {
243             float_raise(status, float_flag_invalid);
244         }
245         return (Bit32s)(int32_indefinite);
246     }
247     else if (aExp <= 0x7E) {
248         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
249         if (aExp | aSig) float_raise(status, float_flag_inexact);
250         return 0;
251     }
252     aSig = (aSig | 0x800000)<<8;
253     z = aSig>>(-shiftCount);
254     if ((Bit32u) (aSig<<(shiftCount & 31))) {
255         float_raise(status, float_flag_inexact);
256     }
257     if (aSign) z = -z;
258     return z;
259 }
260 
261 /*----------------------------------------------------------------------------
262 | Returns the result of converting the single-precision floating-point value
263 | `a' to the 32-bit unsigned integer format.  The conversion is performed
264 | according to the IEC/IEEE Standard for Binary Floating-point Arithmetic,
265 | except that the conversion is always rounded toward zero.  If `a' is a NaN
266 | or conversion overflows, the largest positive integer is returned.
267 *----------------------------------------------------------------------------*/
268 
float32_to_uint32_round_to_zero(float32 a,float_status_t & status)269 Bit32u float32_to_uint32_round_to_zero(float32 a, float_status_t &status)
270 {
271     int aSign;
272     Bit16s aExp;
273     Bit32u aSig;
274 
275     aSig = extractFloat32Frac(a);
276     aExp = extractFloat32Exp(a);
277     aSign = extractFloat32Sign(a);
278     int shiftCount = aExp - 0x9E;
279 
280     if (aExp <= 0x7E) {
281         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
282         if (aExp | aSig) float_raise(status, float_flag_inexact);
283         return 0;
284     }
285     else if (0 < shiftCount || aSign) {
286         float_raise(status, float_flag_invalid);
287         return uint32_indefinite;
288     }
289 
290     aSig = (aSig | 0x800000)<<8;
291     Bit32u z = aSig >> (-shiftCount);
292     if (aSig << (shiftCount & 31)) {
293         float_raise(status, float_flag_inexact);
294     }
295     return z;
296 }
297 
298 /*----------------------------------------------------------------------------
299 | Returns the result of converting the single-precision floating-point value
300 | `a' to the 64-bit two's complement integer format.  The conversion is
301 | performed according to the IEC/IEEE Standard for Binary Floating-Point
302 | Arithmetic - which means in particular that the conversion is rounded
303 | according to the current rounding mode. If `a' is a NaN or the
304 | conversion overflows, the integer indefinite value is returned.
305 *----------------------------------------------------------------------------*/
306 
float32_to_int64(float32 a,float_status_t & status)307 Bit64s float32_to_int64(float32 a, float_status_t &status)
308 {
309     Bit64u aSig64, aSigExtra;
310 
311     Bit32u aSig = extractFloat32Frac(a);
312     Bit16s aExp = extractFloat32Exp(a);
313     int aSign = extractFloat32Sign(a);
314 
315     int shiftCount = 0xBE - aExp;
316     if (shiftCount < 0) {
317         float_raise(status, float_flag_invalid);
318         return (Bit64s)(int64_indefinite);
319     }
320     if (aExp) aSig |= 0x00800000;
321     else {
322         if (get_denormals_are_zeros(status)) aSig = 0;
323     }
324     aSig64 = aSig;
325     aSig64 <<= 40;
326     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
327     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
328 }
329 
330 /*----------------------------------------------------------------------------
331 | Returns the result of converting the single-precision floating-point value
332 | `a' to the 64-bit two's complement integer format.  The conversion is
333 | performed according to the IEC/IEEE Standard for Binary Floating-Point
334 | Arithmetic, except that the conversion is always rounded toward zero.
335 | If `a' is a NaN or the conversion overflows, the integer indefinite
336 | value is returned.
337 *----------------------------------------------------------------------------*/
338 
float32_to_int64_round_to_zero(float32 a,float_status_t & status)339 Bit64s float32_to_int64_round_to_zero(float32 a, float_status_t &status)
340 {
341     int aSign;
342     Bit16s aExp;
343     Bit32u aSig;
344     Bit64u aSig64;
345     Bit64s z;
346 
347     aSig = extractFloat32Frac(a);
348     aExp = extractFloat32Exp(a);
349     aSign = extractFloat32Sign(a);
350     int shiftCount = aExp - 0xBE;
351     if (0 <= shiftCount) {
352         if (a != 0xDF000000) {
353             float_raise(status, float_flag_invalid);
354         }
355         return (Bit64s)(int64_indefinite);
356     }
357     else if (aExp <= 0x7E) {
358         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
359         if (aExp | aSig) float_raise(status, float_flag_inexact);
360         return 0;
361     }
362     aSig64 = aSig | 0x00800000;
363     aSig64 <<= 40;
364     z = aSig64>>(-shiftCount);
365     if ((Bit64u) (aSig64<<(shiftCount & 63))) {
366         float_raise(status, float_flag_inexact);
367     }
368     if (aSign) z = -z;
369     return z;
370 }
371 
372 /*----------------------------------------------------------------------------
373 | Returns the result of converting the single-precision floating-point value
374 | `a' to the 64-bit unsigned integer format.  The conversion is performed
375 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
376 | except that the conversion is always rounded toward zero. If `a' is a NaN
377 | or the conversion overflows, the largest unsigned integer is returned.
378 *----------------------------------------------------------------------------*/
379 
float32_to_uint64_round_to_zero(float32 a,float_status_t & status)380 Bit64u float32_to_uint64_round_to_zero(float32 a, float_status_t &status)
381 {
382     int aSign;
383     Bit16s aExp;
384     Bit32u aSig;
385     Bit64u aSig64;
386 
387     aSig = extractFloat32Frac(a);
388     aExp = extractFloat32Exp(a);
389     aSign = extractFloat32Sign(a);
390     int shiftCount = aExp - 0xBE;
391 
392     if (aExp <= 0x7E) {
393         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
394         if (aExp | aSig) float_raise(status, float_flag_inexact);
395         return 0;
396     }
397     else if (0 < shiftCount || aSign) {
398         float_raise(status, float_flag_invalid);
399         return uint64_indefinite;
400     }
401 
402     aSig64 = aSig | 0x00800000;
403     aSig64 <<= 40;
404     Bit64u z = aSig64>>(-shiftCount);
405     if ((Bit64u) (aSig64<<(shiftCount & 63))) {
406         float_raise(status, float_flag_inexact);
407     }
408     return z;
409 }
410 
411 /*----------------------------------------------------------------------------
412 | Returns the result of converting the single-precision floating-point value
413 | `a' to the 64-bit unsigned integer format.  The conversion is
414 | performed according to the IEC/IEEE Standard for Binary Floating-Point
415 | Arithmetic---which means in particular that the conversion is rounded
416 | according to the current rounding mode.  If `a' is a NaN or the conversion
417 | overflows, the largest unsigned integer is returned.
418 *----------------------------------------------------------------------------*/
419 
float32_to_uint64(float32 a,float_status_t & status)420 Bit64u float32_to_uint64(float32 a, float_status_t &status)
421 {
422     int aSign;
423     Bit16s aExp, shiftCount;
424     Bit32u aSig;
425     Bit64u aSig64, aSigExtra;
426 
427     aSig = extractFloat32Frac(a);
428     aExp = extractFloat32Exp(a);
429     aSign = extractFloat32Sign(a);
430 
431     if (get_denormals_are_zeros(status)) {
432         if (aExp == 0) aSig = 0;
433     }
434 
435     if ((aSign) && (aExp > 0x7E)) {
436         float_raise(status, float_flag_invalid);
437         return uint64_indefinite;
438     }
439 
440     shiftCount = 0xBE - aExp;
441     if (aExp) aSig |= 0x00800000;
442 
443     if (shiftCount < 0) {
444         float_raise(status, float_flag_invalid);
445         return uint64_indefinite;
446     }
447 
448     aSig64 = aSig;
449     aSig64 <<= 40;
450     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
451     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
452 }
453 
454 /*----------------------------------------------------------------------------
455 | Returns the result of converting the single-precision floating-point value
456 | `a' to the 32-bit unsigned integer format.  The conversion is
457 | performed according to the IEC/IEEE Standard for Binary Floating-Point
458 | Arithmetic---which means in particular that the conversion is rounded
459 | according to the current rounding mode.  If `a' is a NaN or the conversion
460 | overflows, the largest unsigned integer is returned.
461 *----------------------------------------------------------------------------*/
462 
float32_to_uint32(float32 a,float_status_t & status)463 Bit32u float32_to_uint32(float32 a, float_status_t &status)
464 {
465     Bit64u val_64 = float32_to_uint64(a, status);
466 
467     if (val_64 > 0xffffffff) {
468         status.float_exception_flags = float_flag_invalid; // throw away other flags
469         return uint32_indefinite;
470     }
471 
472     return (Bit32u) val_64;
473 }
474 
475 /*----------------------------------------------------------------------------
476 | Returns the result of converting the single-precision floating-point value
477 | `a' to the double-precision floating-point format.  The conversion is
478 | performed according to the IEC/IEEE Standard for Binary Floating-Point
479 | Arithmetic.
480 *----------------------------------------------------------------------------*/
481 
float32_to_float64(float32 a,float_status_t & status)482 float64 float32_to_float64(float32 a, float_status_t &status)
483 {
484     Bit32u aSig = extractFloat32Frac(a);
485     Bit16s aExp = extractFloat32Exp(a);
486     int  aSign = extractFloat32Sign(a);
487 
488     if (aExp == 0xFF) {
489         if (aSig) return commonNaNToFloat64(float32ToCommonNaN(a, status));
490         return packFloat64(aSign, 0x7FF, 0);
491     }
492     if (aExp == 0) {
493         if (aSig == 0 || get_denormals_are_zeros(status))
494             return packFloat64(aSign, 0, 0);
495 
496         float_raise(status, float_flag_denormal);
497         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
498         --aExp;
499     }
500     return packFloat64(aSign, aExp + 0x380, ((Bit64u) aSig)<<29);
501 }
502 
503 /*----------------------------------------------------------------------------
504 | Rounds the single-precision floating-point value `a' to an integer, and
505 | returns the result as a single-precision floating-point value.  The
506 | operation is performed according to the IEC/IEEE Standard for Binary
507 | Floating-Point Arithmetic.
508 *----------------------------------------------------------------------------*/
509 
float32_round_to_int(float32 a,Bit8u scale,float_status_t & status)510 float32 float32_round_to_int(float32 a, Bit8u scale, float_status_t &status)
511 {
512     Bit32u lastBitMask, roundBitsMask;
513     int roundingMode = get_float_rounding_mode(status);
514     Bit16s aExp = extractFloat32Exp(a);
515     scale &= 0xf;
516 
517     if ((aExp == 0xFF) && extractFloat32Frac(a)) {
518         return propagateFloat32NaN(a, status);
519     }
520 
521     aExp += scale; // scale the exponent
522 
523     if (0x96 <= aExp) {
524         return a;
525     }
526 
527     if (get_denormals_are_zeros(status)) {
528         a = float32_denormal_to_zero(a);
529     }
530 
531     if (aExp <= 0x7E) {
532         if ((Bit32u) (a<<1) == 0) return a;
533         float_raise(status, float_flag_inexact);
534         int aSign = extractFloat32Sign(a);
535         switch (roundingMode) {
536          case float_round_nearest_even:
537             if ((aExp == 0x7E) && extractFloat32Frac(a)) {
538                 return packFloat32(aSign, 0x7F - scale, 0);
539             }
540             break;
541          case float_round_down:
542             return aSign ? packFloat32(1, 0x7F - scale, 0) : float32_positive_zero;
543          case float_round_up:
544             return aSign ? float32_negative_zero : packFloat32(0, 0x7F - scale, 0);
545         }
546         return packFloat32(aSign, 0, 0);
547     }
548 
549     lastBitMask = 1;
550     lastBitMask <<= 0x96 - aExp;
551     roundBitsMask = lastBitMask - 1;
552     float32 z = a;
553     if (roundingMode == float_round_nearest_even) {
554         z += lastBitMask>>1;
555         if ((z & roundBitsMask) == 0) z &= ~lastBitMask;
556     }
557     else if (roundingMode != float_round_to_zero) {
558         if (extractFloat32Sign(z) ^ (roundingMode == float_round_up)) {
559             z += roundBitsMask;
560         }
561     }
562     z &= ~roundBitsMask;
563     if (z != a) float_raise(status, float_flag_inexact);
564     return z;
565 }
566 
567 /*----------------------------------------------------------------------------
568 | Extracts the fractional portion of single-precision floating-point value `a',
569 | and returns the result  as a  single-precision  floating-point value. The
570 | fractional results are precise. The operation is performed according to the
571 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
572 *----------------------------------------------------------------------------*/
573 
float32_frc(float32 a,float_status_t & status)574 float32 float32_frc(float32 a, float_status_t &status)
575 {
576     int roundingMode = get_float_rounding_mode(status);
577 
578     Bit16s aExp = extractFloat32Exp(a);
579     Bit32u aSig = extractFloat32Frac(a);
580     int aSign = extractFloat32Sign(a);
581 
582     if (aExp == 0xFF) {
583         if (aSig) return propagateFloat32NaN(a, status);
584         float_raise(status, float_flag_invalid);
585         return float32_default_nan;
586     }
587 
588     if (aExp >= 0x96) {
589         return packFloat32(roundingMode == float_round_down, 0, 0);
590     }
591 
592     if (aExp < 0x7F) {
593         if (aExp == 0) {
594             if (aSig == 0 || get_denormals_are_zeros(status))
595                 return packFloat32(roundingMode == float_round_down, 0, 0);
596 
597             float_raise(status, float_flag_denormal);
598             if (! float_exception_masked(status, float_flag_underflow))
599                 float_raise(status, float_flag_underflow);
600 
601             if(get_flush_underflow_to_zero(status)) {
602                 float_raise(status, float_flag_underflow | float_flag_inexact);
603                 return packFloat32(aSign, 0, 0);
604             }
605         }
606         return a;
607     }
608 
609     Bit32u lastBitMask = 1 << (0x96 - aExp);
610     Bit32u roundBitsMask = lastBitMask - 1;
611 
612     aSig &= roundBitsMask;
613     aSig <<= 7;
614     aExp--;
615 
616     if (aSig == 0)
617        return packFloat32(roundingMode == float_round_down, 0, 0);
618 
619     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
620 }
621 
622 /*----------------------------------------------------------------------------
623 | Extracts the exponent portion of single-precision floating-point value 'a',
624 | and returns the result as a single-precision floating-point value
625 | representing unbiased integer exponent. The operation is performed according
626 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
627 *----------------------------------------------------------------------------*/
628 
float32_getexp(float32 a,float_status_t & status)629 float32 float32_getexp(float32 a, float_status_t &status)
630 {
631     Bit16s aExp = extractFloat32Exp(a);
632     Bit32u aSig = extractFloat32Frac(a);
633 
634     if (aExp == 0xFF) {
635         if (aSig) return propagateFloat32NaN(a, status);
636         return float32_positive_inf;
637     }
638 
639     if (aExp == 0) {
640         if (aSig == 0 || get_denormals_are_zeros(status))
641             return float32_negative_inf;
642 
643         float_raise(status, float_flag_denormal);
644         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
645     }
646 
647     return int32_to_float32(aExp - 0x7F, status);
648 }
649 
650 /*----------------------------------------------------------------------------
651 | Extracts the mantissa of single-precision floating-point value 'a' and
652 | returns the result as a single-precision floating-point after applying
653 | the mantissa interval normalization and sign control. The operation is
654 | performed according to the IEC/IEEE Standard for Binary Floating-Point
655 | Arithmetic.
656 *----------------------------------------------------------------------------*/
657 
float32_getmant(float32 a,float_status_t & status,int sign_ctrl,int interv)658 float32 float32_getmant(float32 a, float_status_t &status, int sign_ctrl, int interv)
659 {
660     Bit16s aExp = extractFloat32Exp(a);
661     Bit32u aSig = extractFloat32Frac(a);
662     int aSign = extractFloat32Sign(a);
663 
664     if (aExp == 0xFF) {
665         if (aSig) return propagateFloat32NaN(a, status);
666         if (aSign) {
667             if (sign_ctrl & 0x2) {
668                 float_raise(status, float_flag_invalid);
669                 return float32_default_nan;
670             }
671         }
672         return packFloat32(~sign_ctrl & aSign, 0x7F, 0);
673     }
674 
675     if (aExp == 0 && (aSig == 0 || get_denormals_are_zeros(status))) {
676         return packFloat32(~sign_ctrl & aSign, 0x7F, 0);
677     }
678 
679     if (aSign) {
680         if (sign_ctrl & 0x2) {
681             float_raise(status, float_flag_invalid);
682             return float32_default_nan;
683         }
684     }
685 
686     if (aExp == 0) {
687         float_raise(status, float_flag_denormal);
688         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
689     }
690 
691     switch(interv) {
692     case 0x0: // interval [1,2)
693         aExp = 0x7F;
694         break;
695     case 0x1: // interval [1/2,2)
696         aExp -= 0x7F;
697         aExp  = 0x7F - (aExp & 0x1);
698         break;
699     case 0x2: // interval [1/2,1)
700         aExp = 0x7E;
701         break;
702     case 0x3: // interval [3/4,3/2)
703         aExp = 0x7F - ((aSig >> 22) & 0x1);
704         break;
705     }
706 
707     return packFloat32(~sign_ctrl & aSign, aExp, aSig);
708 }
709 
710 /*----------------------------------------------------------------------------
711 | Return the result of a floating point scale of the single-precision floating
712 | point value `a' by multiplying it by 2 power of the single-precision
713 | floating point value 'b' converted to integral value. If the result cannot
714 | be represented in single precision, then the proper overflow response (for
715 | positive scaling operand), or the proper underflow response (for negative
716 | scaling operand) is issued. The operation is performed according to the
717 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
718 *----------------------------------------------------------------------------*/
719 
float32_scalef(float32 a,float32 b,float_status_t & status)720 float32 float32_scalef(float32 a, float32 b, float_status_t &status)
721 {
722     Bit32u aSig = extractFloat32Frac(a);
723     Bit16s aExp = extractFloat32Exp(a);
724     int aSign = extractFloat32Sign(a);
725     Bit32u bSig = extractFloat32Frac(b);
726     Bit16s bExp = extractFloat32Exp(b);
727     int bSign = extractFloat32Sign(b);
728 
729     if (get_denormals_are_zeros(status)) {
730         if (aExp == 0) aSig = 0;
731         if (bExp == 0) bSig = 0;
732     }
733 
734     if (bExp == 0xFF) {
735         if (bSig) return propagateFloat32NaN(a, b, status);
736     }
737 
738     if (aExp == 0xFF) {
739         if (aSig) {
740             int aIsSignalingNaN = (aSig & 0x00400000) == 0;
741             if (aIsSignalingNaN || bExp != 0xFF || bSig)
742                 return propagateFloat32NaN(a, b, status);
743 
744             return bSign ? 0 : float32_positive_inf;
745         }
746 
747         if (bExp == 0xFF && bSign) {
748             float_raise(status, float_flag_invalid);
749             return float32_default_nan;
750         }
751         return a;
752     }
753 
754     if ((aExp | aSig) == 0) {
755         if (bExp == 0xFF && ! bSign) {
756             float_raise(status, float_flag_invalid);
757             return float32_default_nan;
758         }
759         return a;
760     }
761 
762     if ((bExp | bSig) == 0) return a;
763 
764     if (bExp == 0xFF) {
765         if (bSign) return packFloat32(aSign, 0, 0);
766         return packFloat32(aSign, 0xFF, 0);
767     }
768 
769     if (bExp >= 0x8E) {
770         // handle obvious overflow/underflow result
771         return roundAndPackFloat32(aSign, bSign ? -0x7F : 0xFF, aSig, status);
772     }
773 
774     int scale = 0;
775 
776     if (bExp <= 0x7E) {
777         scale = -bSign;
778     }
779     else {
780         int shiftCount = bExp - 0x9E;
781         bSig = (bSig | 0x800000)<<8;
782         scale = bSig>>(-shiftCount);
783 
784         if (bSign) {
785             if ((Bit32u) (bSig<<(shiftCount & 31))) scale++;
786             scale = -scale;
787         }
788 
789         if (scale >  0x200) scale =  0x200;
790         if (scale < -0x200) scale = -0x200;
791     }
792 
793     if (aExp != 0) {
794         aSig |= 0x00800000;
795     } else {
796         aExp++;
797     }
798 
799     aExp += scale - 1;
800     aSig <<= 7;
801     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
802 }
803 
804 /*----------------------------------------------------------------------------
805 | Returns the result of adding the absolute values of the single-precision
806 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
807 | before being returned.  `zSign' is ignored if the result is a NaN.
808 | The addition is performed according to the IEC/IEEE Standard for Binary
809 | Floating-Point Arithmetic.
810 *----------------------------------------------------------------------------*/
811 
addFloat32Sigs(float32 a,float32 b,int zSign,float_status_t & status)812 static float32 addFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status)
813 {
814     Bit16s aExp, bExp, zExp;
815     Bit32u aSig, bSig, zSig;
816     Bit16s expDiff;
817 
818     aSig = extractFloat32Frac(a);
819     aExp = extractFloat32Exp(a);
820     bSig = extractFloat32Frac(b);
821     bExp = extractFloat32Exp(b);
822 
823     if (get_denormals_are_zeros(status)) {
824         if (aExp == 0) aSig = 0;
825         if (bExp == 0) bSig = 0;
826     }
827 
828     expDiff = aExp - bExp;
829     aSig <<= 6;
830     bSig <<= 6;
831 
832     if (0 < expDiff) {
833         if (aExp == 0xFF) {
834             if (aSig) return propagateFloat32NaN(a, b, status);
835             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
836             return a;
837         }
838         if ((aExp == 0) && aSig)
839             float_raise(status, float_flag_denormal);
840 
841         if (bExp == 0) {
842             if (bSig) float_raise(status, float_flag_denormal);
843             --expDiff;
844         }
845         else bSig |= 0x20000000;
846 
847         bSig = shift32RightJamming(bSig, expDiff);
848         zExp = aExp;
849     }
850     else if (expDiff < 0) {
851         if (bExp == 0xFF) {
852             if (bSig) return propagateFloat32NaN(a, b, status);
853             if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
854             return packFloat32(zSign, 0xFF, 0);
855         }
856         if ((bExp == 0) && bSig)
857             float_raise(status, float_flag_denormal);
858 
859         if (aExp == 0) {
860             if (aSig) float_raise(status, float_flag_denormal);
861             ++expDiff;
862         }
863         else aSig |= 0x20000000;
864 
865         aSig = shift32RightJamming(aSig, -expDiff);
866         zExp = bExp;
867     }
868     else {
869         if (aExp == 0xFF) {
870             if (aSig | bSig) return propagateFloat32NaN(a, b, status);
871             return a;
872         }
873         if (aExp == 0) {
874             zSig = (aSig + bSig) >> 6;
875             if (aSig | bSig) {
876                 float_raise(status, float_flag_denormal);
877                 if (get_flush_underflow_to_zero(status)) {
878                     float_raise(status, float_flag_underflow | float_flag_inexact);
879                     return packFloat32(zSign, 0, 0);
880                 }
881                 if (! float_exception_masked(status, float_flag_underflow)) {
882                     if (extractFloat32Frac(zSig) == zSig)
883                         float_raise(status, float_flag_underflow);
884                 }
885             }
886             return packFloat32(zSign, 0, zSig);
887         }
888         zSig = 0x40000000 + aSig + bSig;
889         return roundAndPackFloat32(zSign, aExp, zSig, status);
890     }
891     aSig |= 0x20000000;
892     zSig = (aSig + bSig)<<1;
893     --zExp;
894     if ((Bit32s) zSig < 0) {
895         zSig = aSig + bSig;
896         ++zExp;
897     }
898     return roundAndPackFloat32(zSign, zExp, zSig, status);
899 }
900 
901 /*----------------------------------------------------------------------------
902 | Returns the result of subtracting the absolute values of the single-
903 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
904 | difference is negated before being returned.  `zSign' is ignored if the
905 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
906 | Standard for Binary Floating-Point Arithmetic.
907 *----------------------------------------------------------------------------*/
908 
subFloat32Sigs(float32 a,float32 b,int zSign,float_status_t & status)909 static float32 subFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status)
910 {
911     Bit16s aExp, bExp, zExp;
912     Bit32u aSig, bSig, zSig;
913     Bit16s expDiff;
914 
915     aSig = extractFloat32Frac(a);
916     aExp = extractFloat32Exp(a);
917     bSig = extractFloat32Frac(b);
918     bExp = extractFloat32Exp(b);
919 
920     if (get_denormals_are_zeros(status)) {
921         if (aExp == 0) aSig = 0;
922         if (bExp == 0) bSig = 0;
923     }
924 
925     expDiff = aExp - bExp;
926     aSig <<= 7;
927     bSig <<= 7;
928     if (0 < expDiff) goto aExpBigger;
929     if (expDiff < 0) goto bExpBigger;
930     if (aExp == 0xFF) {
931         if (aSig | bSig) return propagateFloat32NaN(a, b, status);
932         float_raise(status, float_flag_invalid);
933         return float32_default_nan;
934     }
935     if (aExp == 0) {
936         if (aSig | bSig) float_raise(status, float_flag_denormal);
937         aExp = 1;
938         bExp = 1;
939     }
940     if (bSig < aSig) goto aBigger;
941     if (aSig < bSig) goto bBigger;
942     return packFloat32(get_float_rounding_mode(status) == float_round_down, 0, 0);
943  bExpBigger:
944     if (bExp == 0xFF) {
945         if (bSig) return propagateFloat32NaN(a, b, status);
946         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
947         return packFloat32(zSign ^ 1, 0xFF, 0);
948     }
949     if ((bExp == 0) && bSig)
950         float_raise(status, float_flag_denormal);
951 
952     if (aExp == 0) {
953         if (aSig) float_raise(status, float_flag_denormal);
954         ++expDiff;
955     }
956     else aSig |= 0x40000000;
957 
958     aSig = shift32RightJamming(aSig, -expDiff);
959     bSig |= 0x40000000;
960  bBigger:
961     zSig = bSig - aSig;
962     zExp = bExp;
963     zSign ^= 1;
964     goto normalizeRoundAndPack;
965  aExpBigger:
966     if (aExp == 0xFF) {
967         if (aSig) return propagateFloat32NaN(a, b, status);
968         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
969         return a;
970     }
971     if ((aExp == 0) && aSig)
972         float_raise(status, float_flag_denormal);
973 
974     if (bExp == 0) {
975         if (bSig) float_raise(status, float_flag_denormal);
976         --expDiff;
977     }
978     else bSig |= 0x40000000;
979 
980     bSig = shift32RightJamming(bSig, expDiff);
981     aSig |= 0x40000000;
982  aBigger:
983     zSig = aSig - bSig;
984     zExp = aExp;
985  normalizeRoundAndPack:
986     --zExp;
987     return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
988 }
989 
990 /*----------------------------------------------------------------------------
991 | Returns the result of adding the single-precision floating-point values `a'
992 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
993 | Binary Floating-Point Arithmetic.
994 *----------------------------------------------------------------------------*/
995 
float32_add(float32 a,float32 b,float_status_t & status)996 float32 float32_add(float32 a, float32 b, float_status_t &status)
997 {
998     int aSign = extractFloat32Sign(a);
999     int bSign = extractFloat32Sign(b);
1000 
1001     if (aSign == bSign) {
1002         return addFloat32Sigs(a, b, aSign, status);
1003     }
1004     else {
1005         return subFloat32Sigs(a, b, aSign, status);
1006     }
1007 }
1008 
1009 /*----------------------------------------------------------------------------
1010 | Returns the result of subtracting the single-precision floating-point values
1011 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1012 | for Binary Floating-Point Arithmetic.
1013 *----------------------------------------------------------------------------*/
1014 
float32_sub(float32 a,float32 b,float_status_t & status)1015 float32 float32_sub(float32 a, float32 b, float_status_t &status)
1016 {
1017     int aSign = extractFloat32Sign(a);
1018     int bSign = extractFloat32Sign(b);
1019 
1020     if (aSign == bSign) {
1021         return subFloat32Sigs(a, b, aSign, status);
1022     }
1023     else {
1024         return addFloat32Sigs(a, b, aSign, status);
1025     }
1026 }
1027 
1028 /*----------------------------------------------------------------------------
1029 | Returns the result of multiplying the single-precision floating-point values
1030 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1031 | for Binary Floating-Point Arithmetic.
1032 *----------------------------------------------------------------------------*/
1033 
float32_mul(float32 a,float32 b,float_status_t & status)1034 float32 float32_mul(float32 a, float32 b, float_status_t &status)
1035 {
1036     int aSign, bSign, zSign;
1037     Bit16s aExp, bExp, zExp;
1038     Bit32u aSig, bSig;
1039     Bit64u zSig64;
1040     Bit32u zSig;
1041 
1042     aSig = extractFloat32Frac(a);
1043     aExp = extractFloat32Exp(a);
1044     aSign = extractFloat32Sign(a);
1045     bSig = extractFloat32Frac(b);
1046     bExp = extractFloat32Exp(b);
1047     bSign = extractFloat32Sign(b);
1048     zSign = aSign ^ bSign;
1049 
1050     if (get_denormals_are_zeros(status)) {
1051         if (aExp == 0) aSig = 0;
1052         if (bExp == 0) bSig = 0;
1053     }
1054 
1055     if (aExp == 0xFF) {
1056         if (aSig || ((bExp == 0xFF) && bSig))
1057             return propagateFloat32NaN(a, b, status);
1058 
1059         if ((bExp | bSig) == 0) {
1060             float_raise(status, float_flag_invalid);
1061             return float32_default_nan;
1062         }
1063         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1064         return packFloat32(zSign, 0xFF, 0);
1065     }
1066     if (bExp == 0xFF) {
1067         if (bSig) return propagateFloat32NaN(a, b, status);
1068         if ((aExp | aSig) == 0) {
1069             float_raise(status, float_flag_invalid);
1070             return float32_default_nan;
1071         }
1072         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1073         return packFloat32(zSign, 0xFF, 0);
1074     }
1075     if (aExp == 0) {
1076         if (aSig == 0) {
1077             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1078             return packFloat32(zSign, 0, 0);
1079         }
1080         float_raise(status, float_flag_denormal);
1081         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
1082     }
1083     if (bExp == 0) {
1084         if (bSig == 0) return packFloat32(zSign, 0, 0);
1085         float_raise(status, float_flag_denormal);
1086         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
1087     }
1088     zExp = aExp + bExp - 0x7F;
1089     aSig = (aSig | 0x00800000)<<7;
1090     bSig = (bSig | 0x00800000)<<8;
1091     zSig64 = shift64RightJamming(((Bit64u) aSig) * bSig, 32);
1092     zSig = (Bit32u) zSig64;
1093     if (0 <= (Bit32s) (zSig<<1)) {
1094         zSig <<= 1;
1095         --zExp;
1096     }
1097     return roundAndPackFloat32(zSign, zExp, zSig, status);
1098 }
1099 
1100 /*----------------------------------------------------------------------------
1101 | Returns the result of dividing the single-precision floating-point value `a'
1102 | by the corresponding value `b'.  The operation is performed according to the
1103 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1104 *----------------------------------------------------------------------------*/
1105 
float32_div(float32 a,float32 b,float_status_t & status)1106 float32 float32_div(float32 a, float32 b, float_status_t &status)
1107 {
1108     int aSign, bSign, zSign;
1109     Bit16s aExp, bExp, zExp;
1110     Bit32u aSig, bSig, zSig;
1111 
1112     aSig = extractFloat32Frac(a);
1113     aExp = extractFloat32Exp(a);
1114     aSign = extractFloat32Sign(a);
1115     bSig = extractFloat32Frac(b);
1116     bExp = extractFloat32Exp(b);
1117     bSign = extractFloat32Sign(b);
1118     zSign = aSign ^ bSign;
1119 
1120     if (get_denormals_are_zeros(status)) {
1121         if (aExp == 0) aSig = 0;
1122         if (bExp == 0) bSig = 0;
1123     }
1124 
1125     if (aExp == 0xFF) {
1126         if (aSig) return propagateFloat32NaN(a, b, status);
1127         if (bExp == 0xFF) {
1128             if (bSig) return propagateFloat32NaN(a, b, status);
1129             float_raise(status, float_flag_invalid);
1130             return float32_default_nan;
1131         }
1132         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1133         return packFloat32(zSign, 0xFF, 0);
1134     }
1135     if (bExp == 0xFF) {
1136         if (bSig) return propagateFloat32NaN(a, b, status);
1137         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1138         return packFloat32(zSign, 0, 0);
1139     }
1140     if (bExp == 0) {
1141         if (bSig == 0) {
1142             if ((aExp | aSig) == 0) {
1143                 float_raise(status, float_flag_invalid);
1144                 return float32_default_nan;
1145             }
1146             float_raise(status, float_flag_divbyzero);
1147             return packFloat32(zSign, 0xFF, 0);
1148         }
1149         float_raise(status, float_flag_denormal);
1150         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
1151     }
1152     if (aExp == 0) {
1153         if (aSig == 0) return packFloat32(zSign, 0, 0);
1154         float_raise(status, float_flag_denormal);
1155         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
1156     }
1157     zExp = aExp - bExp + 0x7D;
1158     aSig = (aSig | 0x00800000)<<7;
1159     bSig = (bSig | 0x00800000)<<8;
1160     if (bSig <= (aSig + aSig)) {
1161         aSig >>= 1;
1162         ++zExp;
1163     }
1164     zSig = (((Bit64u) aSig)<<32) / bSig;
1165     if ((zSig & 0x3F) == 0) {
1166         zSig |= ((Bit64u) bSig * zSig != ((Bit64u) aSig)<<32);
1167     }
1168     return roundAndPackFloat32(zSign, zExp, zSig, status);
1169 }
1170 
1171 /*----------------------------------------------------------------------------
1172 | Returns the square root of the single-precision floating-point value `a'.
1173 | The operation is performed according to the IEC/IEEE Standard for Binary
1174 | Floating-Point Arithmetic.
1175 *----------------------------------------------------------------------------*/
1176 
float32_sqrt(float32 a,float_status_t & status)1177 float32 float32_sqrt(float32 a, float_status_t &status)
1178 {
1179     int aSign;
1180     Bit16s aExp, zExp;
1181     Bit32u aSig, zSig;
1182     Bit64u rem, term;
1183 
1184     aSig = extractFloat32Frac(a);
1185     aExp = extractFloat32Exp(a);
1186     aSign = extractFloat32Sign(a);
1187 
1188     if (aExp == 0xFF) {
1189         if (aSig) return propagateFloat32NaN(a, status);
1190         if (! aSign) return a;
1191         float_raise(status, float_flag_invalid);
1192         return float32_default_nan;
1193     }
1194 
1195     if (get_denormals_are_zeros(status)) {
1196         if (aExp == 0) aSig = 0;
1197     }
1198 
1199     if (aSign) {
1200         if ((aExp | aSig) == 0) return packFloat32(aSign, 0, 0);
1201         float_raise(status, float_flag_invalid);
1202         return float32_default_nan;
1203     }
1204     if (aExp == 0) {
1205         if (aSig == 0) return 0;
1206         float_raise(status, float_flag_denormal);
1207         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
1208     }
1209     zExp = ((aExp - 0x7F)>>1) + 0x7E;
1210     aSig = (aSig | 0x00800000)<<8;
1211     zSig = estimateSqrt32(aExp, aSig) + 2;
1212     if ((zSig & 0x7F) <= 5) {
1213         if (zSig < 2) {
1214             zSig = 0x7FFFFFFF;
1215             goto roundAndPack;
1216         }
1217         aSig >>= aExp & 1;
1218         term = ((Bit64u) zSig) * zSig;
1219         rem = (((Bit64u) aSig)<<32) - term;
1220         while ((Bit64s) rem < 0) {
1221             --zSig;
1222             rem += (((Bit64u) zSig)<<1) | 1;
1223         }
1224         zSig |= (rem != 0);
1225     }
1226     zSig = shift32RightJamming(zSig, 1);
1227  roundAndPack:
1228     return roundAndPackFloat32(0, zExp, zSig, status);
1229 }
1230 
1231 /*----------------------------------------------------------------------------
1232 | Determine single-precision floating-point number class.
1233 *----------------------------------------------------------------------------*/
1234 
float32_class(float32 a)1235 float_class_t float32_class(float32 a)
1236 {
1237    Bit16s aExp = extractFloat32Exp(a);
1238    Bit32u aSig = extractFloat32Frac(a);
1239    int  aSign = extractFloat32Sign(a);
1240 
1241    if(aExp == 0xFF) {
1242        if (aSig == 0)
1243            return (aSign) ? float_negative_inf : float_positive_inf;
1244 
1245        return (aSig & 0x00400000) ? float_QNaN : float_SNaN;
1246    }
1247 
1248    if(aExp == 0) {
1249        if (aSig == 0) return float_zero;
1250        return float_denormal;
1251    }
1252 
1253    return float_normalized;
1254 }
1255 
1256 /*----------------------------------------------------------------------------
1257 | Compare  between  two  single  precision  floating  point  numbers. Returns
1258 | 'float_relation_equal'  if the operands are equal, 'float_relation_less' if
1259 | the    value    'a'   is   less   than   the   corresponding   value   `b',
1260 | 'float_relation_greater' if the value 'a' is greater than the corresponding
1261 | value `b', or 'float_relation_unordered' otherwise.
1262 *----------------------------------------------------------------------------*/
1263 
float32_compare(float32 a,float32 b,float_status_t & status)1264 int float32_compare(float32 a, float32 b, float_status_t &status)
1265 {
1266     if (get_denormals_are_zeros(status)) {
1267         a = float32_denormal_to_zero(a);
1268         b = float32_denormal_to_zero(b);
1269     }
1270 
1271     float_class_t aClass = float32_class(a);
1272     float_class_t bClass = float32_class(b);
1273 
1274     if (aClass == float_SNaN || aClass == float_QNaN || bClass == float_SNaN || bClass == float_QNaN)
1275     {
1276         float_raise(status, float_flag_invalid);
1277         return float_relation_unordered;
1278     }
1279 
1280     if (aClass == float_denormal || bClass == float_denormal) {
1281         float_raise(status, float_flag_denormal);
1282     }
1283 
1284     if ((a == b) || ((Bit32u) ((a | b)<<1) == 0)) return float_relation_equal;
1285 
1286     int aSign = extractFloat32Sign(a);
1287     int bSign = extractFloat32Sign(b);
1288     if (aSign != bSign)
1289         return (aSign) ? float_relation_less : float_relation_greater;
1290 
1291     if (aSign ^ (a < b)) return float_relation_less;
1292     return float_relation_greater;
1293 }
1294 
1295 /*----------------------------------------------------------------------------
1296 | Compare  between  two  double  precision  floating  point  numbers. Returns
1297 | 'float_relation_equal'  if the operands are equal, 'float_relation_less' if
1298 | the    value    'a'   is   less   than   the   corresponding   value   `b',
1299 | 'float_relation_greater' if the value 'a' is greater than the corresponding
1300 | value `b', or 'float_relation_unordered' otherwise. Quiet NaNs do not cause
1301 | an exception.
1302 *----------------------------------------------------------------------------*/
1303 
float32_compare_quiet(float32 a,float32 b,float_status_t & status)1304 int float32_compare_quiet(float32 a, float32 b, float_status_t &status)
1305 {
1306     if (get_denormals_are_zeros(status)) {
1307         a = float32_denormal_to_zero(a);
1308         b = float32_denormal_to_zero(b);
1309     }
1310 
1311     float_class_t aClass = float32_class(a);
1312     float_class_t bClass = float32_class(b);
1313 
1314     if (aClass == float_SNaN || bClass == float_SNaN) {
1315         float_raise(status, float_flag_invalid);
1316         return float_relation_unordered;
1317     }
1318 
1319     if (aClass == float_QNaN || bClass == float_QNaN) {
1320         return float_relation_unordered;
1321     }
1322 
1323     if (aClass == float_denormal || bClass == float_denormal) {
1324         float_raise(status, float_flag_denormal);
1325     }
1326 
1327     if ((a == b) || ((Bit32u) ((a | b)<<1) == 0)) return float_relation_equal;
1328 
1329     int aSign = extractFloat32Sign(a);
1330     int bSign = extractFloat32Sign(b);
1331     if (aSign != bSign)
1332         return (aSign) ? float_relation_less : float_relation_greater;
1333 
1334     if (aSign ^ (a < b)) return float_relation_less;
1335     return float_relation_greater;
1336 }
1337 
1338 /*----------------------------------------------------------------------------
1339 | Compare bewteen two single precision floating point numbers and return the
1340 | smaller of  them.  The operation  is performed  according to  the IEC/IEEE
1341 | Standard for Binary Floating-Point Arithmetic.
1342 *----------------------------------------------------------------------------*/
1343 
float32_min(float32 a,float32 b,float_status_t & status)1344 float32 float32_min(float32 a, float32 b, float_status_t &status)
1345 {
1346   if (get_denormals_are_zeros(status)) {
1347     a = float32_denormal_to_zero(a);
1348     b = float32_denormal_to_zero(b);
1349   }
1350 
1351   return (float32_compare(a, b, status) == float_relation_less) ? a : b;
1352 }
1353 
1354 /*----------------------------------------------------------------------------
1355 | Compare bewteen two single precision floating point numbers and return the
1356 | larger  of  them.  The operation  is performed  according to  the IEC/IEEE
1357 | Standard for Binary Floating-Point Arithmetic.
1358 *----------------------------------------------------------------------------*/
1359 
float32_max(float32 a,float32 b,float_status_t & status)1360 float32 float32_max(float32 a, float32 b, float_status_t &status)
1361 {
1362   if (get_denormals_are_zeros(status)) {
1363     a = float32_denormal_to_zero(a);
1364     b = float32_denormal_to_zero(b);
1365   }
1366 
1367   return (float32_compare(a, b, status) == float_relation_greater) ? a : b;
1368 }
1369 
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the double-precision floating-point value
1372 | `a' to the 32-bit two's complement integer format.  The conversion is
1373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1374 | Arithmetic - which means in particular that the conversion is rounded
1375 | according to the current rounding mode. If `a' is a NaN or the
1376 | conversion overflows, the integer indefinite value is returned.
1377 *----------------------------------------------------------------------------*/
1378 
float64_to_int32(float64 a,float_status_t & status)1379 Bit32s float64_to_int32(float64 a, float_status_t &status)
1380 {
1381     Bit64u aSig = extractFloat64Frac(a);
1382     Bit16s aExp = extractFloat64Exp(a);
1383     int aSign = extractFloat64Sign(a);
1384     if ((aExp == 0x7FF) && aSig) aSign = 0;
1385     if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1386     else {
1387         if (get_denormals_are_zeros(status)) aSig = 0;
1388     }
1389     int shiftCount = 0x42C - aExp;
1390     if (0 < shiftCount) aSig = shift64RightJamming(aSig, shiftCount);
1391     return roundAndPackInt32(aSign, aSig, status);
1392 }
1393 
1394 /*----------------------------------------------------------------------------
1395 | Returns the result of converting the double-precision floating-point value
1396 | `a' to the 32-bit two's complement integer format.  The conversion is
1397 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1398 | Arithmetic, except that the conversion is always rounded toward zero.
1399 | If `a' is a NaN or the conversion overflows, the integer indefinite
1400 | value  is returned.
1401 *----------------------------------------------------------------------------*/
1402 
float64_to_int32_round_to_zero(float64 a,float_status_t & status)1403 Bit32s float64_to_int32_round_to_zero(float64 a, float_status_t &status)
1404 {
1405     int aSign;
1406     Bit16s aExp;
1407     Bit64u aSig, savedASig;
1408     Bit32s z;
1409     int shiftCount;
1410 
1411     aSig = extractFloat64Frac(a);
1412     aExp = extractFloat64Exp(a);
1413     aSign = extractFloat64Sign(a);
1414     if (0x41E < aExp) {
1415         float_raise(status, float_flag_invalid);
1416         return (Bit32s)(int32_indefinite);
1417     }
1418     else if (aExp < 0x3FF) {
1419         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1420         if (aExp || aSig) float_raise(status, float_flag_inexact);
1421         return 0;
1422     }
1423     aSig |= BX_CONST64(0x0010000000000000);
1424     shiftCount = 0x433 - aExp;
1425     savedASig = aSig;
1426     aSig >>= shiftCount;
1427     z = (Bit32s) aSig;
1428     if (aSign) z = -z;
1429     if ((z < 0) ^ aSign) {
1430         float_raise(status, float_flag_invalid);
1431         return (Bit32s)(int32_indefinite);
1432     }
1433     if ((aSig<<shiftCount) != savedASig) {
1434         float_raise(status, float_flag_inexact);
1435     }
1436     return z;
1437 }
1438 
1439 /*----------------------------------------------------------------------------
1440 | Returns the result of converting the double-precision floating-point value
1441 | `a' to the 32-bit unsigned integer format.  The conversion is performed
1442 | according to the IEC/IEEE Standard for Binary Floating-point Arithmetic,
1443 | except that the conversion is always rounded toward zero.  If `a' is a NaN
1444 | or conversion overflows, the largest positive integer is returned.
1445 *----------------------------------------------------------------------------*/
1446 
float64_to_uint32_round_to_zero(float64 a,float_status_t & status)1447 Bit32u float64_to_uint32_round_to_zero(float64 a, float_status_t &status)
1448 {
1449     Bit64u aSig = extractFloat64Frac(a);
1450     Bit16s aExp = extractFloat64Exp(a);
1451     int aSign = extractFloat64Sign(a);
1452 
1453     if (aExp < 0x3FF) {
1454         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1455         if (aExp || aSig) float_raise(status, float_flag_inexact);
1456         return 0;
1457     }
1458 
1459     if (0x41E < aExp || aSign) {
1460         float_raise(status, float_flag_invalid);
1461         return uint32_indefinite;
1462     }
1463 
1464     aSig |= BX_CONST64(0x0010000000000000);
1465     int shiftCount = 0x433 - aExp;
1466     Bit64u savedASig = aSig;
1467     aSig >>= shiftCount;
1468     if ((aSig<<shiftCount) != savedASig) {
1469         float_raise(status, float_flag_inexact);
1470     }
1471     return (Bit32u) aSig;
1472 }
1473 
1474 /*----------------------------------------------------------------------------
1475 | Returns the result of converting the double-precision floating-point value
1476 | `a' to the 64-bit two's complement integer format.  The conversion is
1477 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1478 | Arithmetic - which means in particular that the conversion is rounded
1479 | according to the current rounding mode.  If `a' is a NaN, the largest
1480 | positive integer is returned.  Otherwise, if the conversion overflows, the
1481 | largest integer with the same sign as `a' is returned.
1482 *----------------------------------------------------------------------------*/
1483 
float64_to_int64(float64 a,float_status_t & status)1484 Bit64s float64_to_int64(float64 a, float_status_t &status)
1485 {
1486     int aSign;
1487     Bit16s aExp;
1488     Bit64u aSig, aSigExtra;
1489 
1490     aSig = extractFloat64Frac(a);
1491     aExp = extractFloat64Exp(a);
1492     aSign = extractFloat64Sign(a);
1493     if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1494     else {
1495         if (get_denormals_are_zeros(status)) aSig = 0;
1496     }
1497     int shiftCount = 0x433 - aExp;
1498     if (shiftCount <= 0) {
1499         if (0x43E < aExp) {
1500             float_raise(status, float_flag_invalid);
1501             return (Bit64s)(int64_indefinite);
1502         }
1503         aSigExtra = 0;
1504         aSig <<= -shiftCount;
1505     }
1506     else {
1507         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
1508     }
1509     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
1510 }
1511 
1512 /*----------------------------------------------------------------------------
1513 | Returns the result of converting the double-precision floating-point value
1514 | `a' to the 64-bit two's complement integer format.  The conversion is
1515 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1516 | Arithmetic, except that the conversion is always rounded toward zero.
1517 | If `a' is a NaN or the conversion overflows, the integer indefinite
1518 | value  is returned.
1519 *----------------------------------------------------------------------------*/
1520 
float64_to_int64_round_to_zero(float64 a,float_status_t & status)1521 Bit64s float64_to_int64_round_to_zero(float64 a, float_status_t &status)
1522 {
1523     int aSign;
1524     Bit16s aExp;
1525     Bit64u aSig;
1526     Bit64s z;
1527 
1528     aSig = extractFloat64Frac(a);
1529     aExp = extractFloat64Exp(a);
1530     aSign = extractFloat64Sign(a);
1531     if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1532     int shiftCount = aExp - 0x433;
1533     if (0 <= shiftCount) {
1534         if (0x43E <= aExp) {
1535             if (a != BX_CONST64(0xC3E0000000000000)) {
1536                 float_raise(status, float_flag_invalid);
1537             }
1538             return (Bit64s)(int64_indefinite);
1539         }
1540         z = aSig<<shiftCount;
1541     }
1542     else {
1543         if (aExp < 0x3FE) {
1544             if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1545             if (aExp | aSig) float_raise(status, float_flag_inexact);
1546             return 0;
1547         }
1548         z = aSig>>(-shiftCount);
1549         if ((Bit64u) (aSig<<(shiftCount & 63))) {
1550              float_raise(status, float_flag_inexact);
1551         }
1552     }
1553     if (aSign) z = -z;
1554     return z;
1555 }
1556 
1557 /*----------------------------------------------------------------------------
1558 | Returns the result of converting the double-precision floating-point value
1559 | `a' to the 64-bit unsigned integer format.  The conversion is performed
1560 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
1561 | except that the conversion is always rounded toward zero. If `a' is a NaN
1562 | or the conversion overflows, the largest unsigned integer is returned.
1563 *----------------------------------------------------------------------------*/
1564 
float64_to_uint64_round_to_zero(float64 a,float_status_t & status)1565 Bit64u float64_to_uint64_round_to_zero(float64 a, float_status_t &status)
1566 {
1567     int aSign;
1568     Bit16s aExp;
1569     Bit64u aSig, z;
1570 
1571     aSig = extractFloat64Frac(a);
1572     aExp = extractFloat64Exp(a);
1573     aSign = extractFloat64Sign(a);
1574 
1575     if (aExp < 0x3FE) {
1576         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1577         if (aExp | aSig) float_raise(status, float_flag_inexact);
1578         return 0;
1579     }
1580 
1581     if (0x43E <= aExp || aSign) {
1582         float_raise(status, float_flag_invalid);
1583         return uint64_indefinite;
1584     }
1585 
1586     if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1587     int shiftCount = aExp - 0x433;
1588 
1589     if (0 <= shiftCount) {
1590         z = aSig<<shiftCount;
1591     }
1592     else {
1593         z = aSig>>(-shiftCount);
1594         if ((Bit64u) (aSig<<(shiftCount & 63))) {
1595              float_raise(status, float_flag_inexact);
1596         }
1597     }
1598     return z;
1599 }
1600 
1601 /*----------------------------------------------------------------------------
1602 | Returns the result of converting the double-precision floating-point value
1603 | `a' to the 32-bit unsigned integer format.  The conversion is
1604 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1605 | Arithmetic---which means in particular that the conversion is rounded
1606 | according to the current rounding mode.  If `a' is a NaN or the conversion
1607 | overflows, the largest unsigned integer is returned.
1608 *----------------------------------------------------------------------------*/
1609 
float64_to_uint32(float64 a,float_status_t & status)1610 Bit32u float64_to_uint32(float64 a, float_status_t &status)
1611 {
1612     Bit64u val_64 = float64_to_uint64(a, status);
1613 
1614     if (val_64 > 0xffffffff) {
1615         status.float_exception_flags = float_flag_invalid; // throw away other flags
1616         return uint32_indefinite;
1617     }
1618 
1619     return (Bit32u) val_64;
1620 }
1621 
1622 /*----------------------------------------------------------------------------
1623 | Returns the result of converting the double-precision floating-point value
1624 | `a' to the 64-bit unsigned integer format.  The conversion is
1625 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1626 | Arithmetic---which means in particular that the conversion is rounded
1627 | according to the current rounding mode.  If `a' is a NaN or the conversion
1628 | overflows, the largest unsigned integer is returned.
1629 *----------------------------------------------------------------------------*/
1630 
float64_to_uint64(float64 a,float_status_t & status)1631 Bit64u float64_to_uint64(float64 a, float_status_t &status)
1632 {
1633     int aSign;
1634     Bit16s aExp, shiftCount;
1635     Bit64u aSig, aSigExtra;
1636 
1637     aSig = extractFloat64Frac(a);
1638     aExp = extractFloat64Exp(a);
1639     aSign = extractFloat64Sign(a);
1640 
1641     if (get_denormals_are_zeros(status)) {
1642         if (aExp == 0) aSig = 0;
1643     }
1644 
1645     if (aSign && (aExp > 0x3FE)) {
1646         float_raise(status, float_flag_invalid);
1647         return uint64_indefinite;
1648     }
1649 
1650     if (aExp) {
1651         aSig |= BX_CONST64(0x0010000000000000);
1652     }
1653     shiftCount = 0x433 - aExp;
1654     if (shiftCount <= 0) {
1655         if (0x43E < aExp) {
1656             float_raise(status, float_flag_invalid);
1657             return uint64_indefinite;
1658         }
1659         aSigExtra = 0;
1660         aSig <<= -shiftCount;
1661     } else {
1662         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
1663     }
1664 
1665     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
1666 }
1667 
1668 /*----------------------------------------------------------------------------
1669 | Returns the result of converting the double-precision floating-point value
1670 | `a' to the single-precision floating-point format.  The conversion is
1671 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1672 | Arithmetic.
1673 *----------------------------------------------------------------------------*/
1674 
float64_to_float32(float64 a,float_status_t & status)1675 float32 float64_to_float32(float64 a, float_status_t &status)
1676 {
1677     int aSign;
1678     Bit16s aExp;
1679     Bit64u aSig;
1680     Bit32u zSig;
1681 
1682     aSig = extractFloat64Frac(a);
1683     aExp = extractFloat64Exp(a);
1684     aSign = extractFloat64Sign(a);
1685     if (aExp == 0x7FF) {
1686         if (aSig) return commonNaNToFloat32(float64ToCommonNaN(a, status));
1687         return packFloat32(aSign, 0xFF, 0);
1688     }
1689     if (aExp == 0) {
1690         if (aSig == 0 || get_denormals_are_zeros(status))
1691             return packFloat32(aSign, 0, 0);
1692         float_raise(status, float_flag_denormal);
1693     }
1694     aSig = shift64RightJamming(aSig, 22);
1695     zSig = (Bit32u) aSig;
1696     if (aExp || zSig) {
1697         zSig |= 0x40000000;
1698         aExp -= 0x381;
1699     }
1700     return roundAndPackFloat32(aSign, aExp, zSig, status);
1701 }
1702 
1703 /*----------------------------------------------------------------------------
1704 | Rounds the double-precision floating-point value `a' to an integer, and
1705 | returns the result as a double-precision floating-point value.  The
1706 | operation is performed according to the IEC/IEEE Standard for Binary
1707 | Floating-Point Arithmetic.
1708 *----------------------------------------------------------------------------*/
1709 
float64_round_to_int(float64 a,Bit8u scale,float_status_t & status)1710 float64 float64_round_to_int(float64 a, Bit8u scale, float_status_t &status)
1711 {
1712     Bit64u lastBitMask, roundBitsMask;
1713     int roundingMode = get_float_rounding_mode(status);
1714     Bit16s aExp = extractFloat64Exp(a);
1715     scale &= 0xf;
1716 
1717     if ((aExp == 0x7FF) && extractFloat64Frac(a)) {
1718         return propagateFloat64NaN(a, status);
1719     }
1720 
1721     aExp += scale; // scale the exponent
1722 
1723     if (0x433 <= aExp) {
1724         return a;
1725     }
1726 
1727     if (get_denormals_are_zeros(status)) {
1728         a = float64_denormal_to_zero(a);
1729     }
1730 
1731     if (aExp < 0x3FF) {
1732         if ((Bit64u) (a<<1) == 0) return a;
1733         float_raise(status, float_flag_inexact);
1734         int aSign = extractFloat64Sign(a);
1735         switch (roundingMode) {
1736          case float_round_nearest_even:
1737             if ((aExp == 0x3FE) && extractFloat64Frac(a)) {
1738               return packFloat64(aSign, 0x3FF - scale, 0);
1739             }
1740             break;
1741          case float_round_down:
1742             return aSign ? packFloat64(1, 0x3FF - scale, 0) : float64_positive_zero;
1743          case float_round_up:
1744             return aSign ? float64_negative_zero : packFloat64(0, 0x3FF - scale, 0);
1745         }
1746         return packFloat64(aSign, 0, 0);
1747     }
1748 
1749     lastBitMask = 1;
1750     lastBitMask <<= 0x433 - aExp;
1751     roundBitsMask = lastBitMask - 1;
1752     float64 z = a;
1753     if (roundingMode == float_round_nearest_even) {
1754         z += lastBitMask>>1;
1755         if ((z & roundBitsMask) == 0) z &= ~lastBitMask;
1756     }
1757     else if (roundingMode != float_round_to_zero) {
1758         if (extractFloat64Sign(z) ^ (roundingMode == float_round_up)) {
1759             z += roundBitsMask;
1760         }
1761     }
1762     z &= ~roundBitsMask;
1763     if (z != a) float_raise(status, float_flag_inexact);
1764     return z;
1765 }
1766 
1767 /*----------------------------------------------------------------------------
1768 | Extracts the fractional portion of double-precision floating-point value `a',
1769 | and returns the result  as a  double-precision  floating-point value. The
1770 | fractional results are precise. The operation is performed according to the
1771 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1772 *----------------------------------------------------------------------------*/
1773 
float64_frc(float64 a,float_status_t & status)1774 float64 float64_frc(float64 a, float_status_t &status)
1775 {
1776     int roundingMode = get_float_rounding_mode(status);
1777 
1778     Bit64u aSig = extractFloat64Frac(a);
1779     Bit16s aExp = extractFloat64Exp(a);
1780     int aSign = extractFloat64Sign(a);
1781 
1782     if (aExp == 0x7FF) {
1783         if (aSig) return propagateFloat64NaN(a, status);
1784         float_raise(status, float_flag_invalid);
1785         return float64_default_nan;
1786     }
1787 
1788     if (aExp >= 0x433) {
1789         return packFloat64(roundingMode == float_round_down, 0, 0);
1790     }
1791 
1792     if (aExp < 0x3FF) {
1793         if (aExp == 0) {
1794             if (aSig == 0 || get_denormals_are_zeros(status))
1795                 return packFloat64(roundingMode == float_round_down, 0, 0);
1796 
1797             float_raise(status, float_flag_denormal);
1798             if (! float_exception_masked(status, float_flag_underflow))
1799                 float_raise(status, float_flag_underflow);
1800 
1801             if(get_flush_underflow_to_zero(status)) {
1802                 float_raise(status, float_flag_underflow | float_flag_inexact);
1803                 return packFloat64(aSign, 0, 0);
1804             }
1805         }
1806         return a;
1807     }
1808 
1809     Bit64u lastBitMask = BX_CONST64(1) << (0x433 - aExp);
1810     Bit64u roundBitsMask = lastBitMask - 1;
1811 
1812     aSig &= roundBitsMask;
1813     aSig <<= 10;
1814     aExp--;
1815 
1816     if (aSig == 0)
1817        return packFloat64(roundingMode == float_round_down, 0, 0);
1818 
1819     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
1820 }
1821 
1822 /*----------------------------------------------------------------------------
1823 | Extracts the exponent portion of double-precision floating-point value 'a',
1824 | and returns the result as a double-precision floating-point value
1825 | representing unbiased integer exponent. The operation is performed according
1826 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1827 *----------------------------------------------------------------------------*/
1828 
float64_getexp(float64 a,float_status_t & status)1829 float64 float64_getexp(float64 a, float_status_t &status)
1830 {
1831     Bit16s aExp = extractFloat64Exp(a);
1832     Bit64u aSig = extractFloat64Frac(a);
1833 
1834     if (aExp == 0x7FF) {
1835         if (aSig) return propagateFloat64NaN(a, status);
1836         return float64_positive_inf;
1837     }
1838 
1839     if (aExp == 0) {
1840         if (aSig == 0 || get_denormals_are_zeros(status))
1841             return float64_negative_inf;
1842 
1843         float_raise(status, float_flag_denormal);
1844         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1845     }
1846 
1847     return int32_to_float64(aExp - 0x3FF);
1848 }
1849 
1850 /*----------------------------------------------------------------------------
1851 | Extracts the mantissa of double-precision floating-point value 'a' and
1852 | returns the result as a double-precision floating-point after applying
1853 | the mantissa interval normalization and sign control. The operation is
1854 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1855 | Arithmetic.
1856 *----------------------------------------------------------------------------*/
1857 
float64_getmant(float64 a,float_status_t & status,int sign_ctrl,int interv)1858 float64 float64_getmant(float64 a, float_status_t &status, int sign_ctrl, int interv)
1859 {
1860     Bit16s aExp = extractFloat64Exp(a);
1861     Bit64u aSig = extractFloat64Frac(a);
1862     int aSign = extractFloat64Sign(a);
1863 
1864     if (aExp == 0x7FF) {
1865         if (aSig) return propagateFloat64NaN(a, status);
1866         if (aSign) {
1867             if (sign_ctrl & 0x2) {
1868                 float_raise(status, float_flag_invalid);
1869                 return float64_default_nan;
1870             }
1871         }
1872         return packFloat64(~sign_ctrl & aSign, 0x3FF, 0);
1873     }
1874 
1875     if (aExp == 0 && (aSig == 0 || get_denormals_are_zeros(status))) {
1876         return packFloat64(~sign_ctrl & aSign, 0x3FF, 0);
1877     }
1878 
1879     if (aSign) {
1880         if (sign_ctrl & 0x2) {
1881             float_raise(status, float_flag_invalid);
1882             return float64_default_nan;
1883         }
1884     }
1885 
1886     if (aExp == 0) {
1887         float_raise(status, float_flag_denormal);
1888         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1889     }
1890 
1891     switch(interv) {
1892     case 0x0: // interval [1,2)
1893         aExp = 0x3FF;
1894         break;
1895     case 0x1: // interval [1/2,2)
1896         aExp -= 0x3FF;
1897         aExp  = 0x3FF - (aExp & 0x1);
1898         break;
1899     case 0x2: // interval [1/2,1)
1900         aExp = 0x3FE;
1901         break;
1902     case 0x3: // interval [3/4,3/2)
1903         aExp = 0x3FF - ((aSig >> 51) & 0x1);
1904         break;
1905     }
1906 
1907     return packFloat64(~sign_ctrl & aSign, aExp, aSig);
1908 }
1909 
1910 /*----------------------------------------------------------------------------
1911 | Return the result of a floating point scale of the double-precision floating
1912 | point value `a' by multiplying it by 2 power of the double-precision
1913 | floating point value 'b' converted to integral value. If the result cannot
1914 | be represented in double precision, then the proper overflow response (for
1915 | positive scaling operand), or the proper underflow response (for negative
1916 | scaling operand) is issued. The operation is performed according to the
1917 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1918 *----------------------------------------------------------------------------*/
1919 
float64_scalef(float64 a,float64 b,float_status_t & status)1920 float64 float64_scalef(float64 a, float64 b, float_status_t &status)
1921 {
1922     Bit64u aSig = extractFloat64Frac(a);
1923     Bit16s aExp = extractFloat64Exp(a);
1924     int aSign = extractFloat64Sign(a);
1925     Bit64u bSig = extractFloat64Frac(b);
1926     Bit16s bExp = extractFloat64Exp(b);
1927     int bSign = extractFloat64Sign(b);
1928 
1929     if (get_denormals_are_zeros(status)) {
1930         if (aExp == 0) aSig = 0;
1931         if (bExp == 0) bSig = 0;
1932     }
1933 
1934     if (bExp == 0x7FF) {
1935         if (bSig) return propagateFloat64NaN(a, b, status);
1936     }
1937 
1938     if (aExp == 0x7FF) {
1939         if (aSig) {
1940             int aIsSignalingNaN = (aSig & BX_CONST64(0x0008000000000000)) == 0;
1941             if (aIsSignalingNaN || bExp != 0x7FF || bSig)
1942                 return propagateFloat64NaN(a, b, status);
1943 
1944             return bSign ? 0 : float64_positive_inf;
1945         }
1946 
1947         if (bExp == 0x7FF && bSign) {
1948             float_raise(status, float_flag_invalid);
1949             return float64_default_nan;
1950         }
1951         return a;
1952     }
1953 
1954     if ((aExp | aSig) == 0) {
1955         if (bExp == 0x7FF && ! bSign) {
1956             float_raise(status, float_flag_invalid);
1957             return float64_default_nan;
1958         }
1959         return a;
1960     }
1961 
1962     if ((bExp | bSig) == 0) return a;
1963 
1964     if (bExp == 0x7FF) {
1965         if (bSign) return packFloat64(aSign, 0, 0);
1966         return packFloat64(aSign, 0x7FF, 0);
1967     }
1968 
1969     if (0x40F <= bExp) {
1970         // handle obvious overflow/underflow result
1971         return roundAndPackFloat64(aSign, bSign ? -0x3FF : 0x7FF, aSig, status);
1972     }
1973 
1974     int scale = 0;
1975 
1976     if (bExp < 0x3FF) {
1977         scale = -bSign;
1978     }
1979     else {
1980         bSig |= BX_CONST64(0x0010000000000000);
1981         int shiftCount = 0x433 - bExp;
1982         Bit64u savedBSig = bSig;
1983         bSig >>= shiftCount;
1984         scale = (Bit32s) bSig;
1985         if (bSign) {
1986             if ((bSig<<shiftCount) != savedBSig) scale++;
1987             scale = -scale;
1988         }
1989 
1990         if (scale >  0x1000) scale =  0x1000;
1991         if (scale < -0x1000) scale = -0x1000;
1992     }
1993 
1994     if (aExp != 0) {
1995         aSig |= BX_CONST64(0x0010000000000000);
1996     } else {
1997         aExp++;
1998     }
1999 
2000     aExp += scale - 1;
2001     aSig <<= 10;
2002     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
2003 }
2004 
2005 /*----------------------------------------------------------------------------
2006 | Returns the result of adding the absolute values of the double-precision
2007 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2008 | before being returned.  `zSign' is ignored if the result is a NaN.
2009 | The addition is performed according to the IEC/IEEE Standard for Binary
2010 | Floating-Point Arithmetic.
2011 *----------------------------------------------------------------------------*/
2012 
addFloat64Sigs(float64 a,float64 b,int zSign,float_status_t & status)2013 static float64 addFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status)
2014 {
2015     Bit16s aExp, bExp, zExp;
2016     Bit64u aSig, bSig, zSig;
2017     Bit16s expDiff;
2018 
2019     aSig = extractFloat64Frac(a);
2020     aExp = extractFloat64Exp(a);
2021     bSig = extractFloat64Frac(b);
2022     bExp = extractFloat64Exp(b);
2023 
2024     if (get_denormals_are_zeros(status)) {
2025         if (aExp == 0) aSig = 0;
2026         if (bExp == 0) bSig = 0;
2027     }
2028 
2029     expDiff = aExp - bExp;
2030     aSig <<= 9;
2031     bSig <<= 9;
2032     if (0 < expDiff) {
2033         if (aExp == 0x7FF) {
2034             if (aSig) return propagateFloat64NaN(a, b, status);
2035             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2036             return a;
2037         }
2038         if ((aExp == 0) && aSig)
2039             float_raise(status, float_flag_denormal);
2040 
2041         if (bExp == 0) {
2042             if (bSig) float_raise(status, float_flag_denormal);
2043             --expDiff;
2044         }
2045         else bSig |= BX_CONST64(0x2000000000000000);
2046 
2047         bSig = shift64RightJamming(bSig, expDiff);
2048         zExp = aExp;
2049     }
2050     else if (expDiff < 0) {
2051         if (bExp == 0x7FF) {
2052             if (bSig) return propagateFloat64NaN(a, b, status);
2053             if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2054             return packFloat64(zSign, 0x7FF, 0);
2055         }
2056         if ((bExp == 0) && bSig)
2057             float_raise(status, float_flag_denormal);
2058 
2059         if (aExp == 0) {
2060             if (aSig) float_raise(status, float_flag_denormal);
2061             ++expDiff;
2062         }
2063         else aSig |= BX_CONST64(0x2000000000000000);
2064 
2065         aSig = shift64RightJamming(aSig, -expDiff);
2066         zExp = bExp;
2067     }
2068     else {
2069         if (aExp == 0x7FF) {
2070             if (aSig | bSig) return propagateFloat64NaN(a, b, status);
2071             return a;
2072         }
2073         if (aExp == 0) {
2074             zSig = (aSig + bSig) >> 9;
2075             if (aSig | bSig) {
2076                 float_raise(status, float_flag_denormal);
2077                 if (get_flush_underflow_to_zero(status)) {
2078                     float_raise(status, float_flag_underflow | float_flag_inexact);
2079                     return packFloat64(zSign, 0, 0);
2080                 }
2081                 if (! float_exception_masked(status, float_flag_underflow)) {
2082                     if (extractFloat64Frac(zSig) == zSig)
2083                         float_raise(status, float_flag_underflow);
2084                 }
2085             }
2086             return packFloat64(zSign, 0, zSig);
2087         }
2088         zSig = BX_CONST64(0x4000000000000000) + aSig + bSig;
2089         return roundAndPackFloat64(zSign, aExp, zSig, status);
2090     }
2091     aSig |= BX_CONST64(0x2000000000000000);
2092     zSig = (aSig + bSig)<<1;
2093     --zExp;
2094     if ((Bit64s) zSig < 0) {
2095         zSig = aSig + bSig;
2096         ++zExp;
2097     }
2098     return roundAndPackFloat64(zSign, zExp, zSig, status);
2099 }
2100 
2101 /*----------------------------------------------------------------------------
2102 | Returns the result of subtracting the absolute values of the double-
2103 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2104 | difference is negated before being returned.  `zSign' is ignored if the
2105 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2106 | Standard for Binary Floating-Point Arithmetic.
2107 *----------------------------------------------------------------------------*/
2108 
subFloat64Sigs(float64 a,float64 b,int zSign,float_status_t & status)2109 static float64 subFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status)
2110 {
2111     Bit16s aExp, bExp, zExp;
2112     Bit64u aSig, bSig, zSig;
2113     Bit16s expDiff;
2114 
2115     aSig = extractFloat64Frac(a);
2116     aExp = extractFloat64Exp(a);
2117     bSig = extractFloat64Frac(b);
2118     bExp = extractFloat64Exp(b);
2119 
2120     if (get_denormals_are_zeros(status)) {
2121         if (aExp == 0) aSig = 0;
2122         if (bExp == 0) bSig = 0;
2123     }
2124 
2125     expDiff = aExp - bExp;
2126     aSig <<= 10;
2127     bSig <<= 10;
2128     if (0 < expDiff) goto aExpBigger;
2129     if (expDiff < 0) goto bExpBigger;
2130     if (aExp == 0x7FF) {
2131         if (aSig | bSig) return propagateFloat64NaN(a, b, status);
2132         float_raise(status, float_flag_invalid);
2133         return float64_default_nan;
2134     }
2135     if (aExp == 0) {
2136         if (aSig | bSig) float_raise(status, float_flag_denormal);
2137         aExp = 1;
2138         bExp = 1;
2139     }
2140     if (bSig < aSig) goto aBigger;
2141     if (aSig < bSig) goto bBigger;
2142     return packFloat64(get_float_rounding_mode(status) == float_round_down, 0, 0);
2143  bExpBigger:
2144     if (bExp == 0x7FF) {
2145         if (bSig) return propagateFloat64NaN(a, b, status);
2146         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2147         return packFloat64(zSign ^ 1, 0x7FF, 0);
2148     }
2149     if ((bExp == 0) && bSig)
2150         float_raise(status, float_flag_denormal);
2151 
2152     if (aExp == 0) {
2153         if (aSig) float_raise(status, float_flag_denormal);
2154         ++expDiff;
2155     }
2156     else aSig |= BX_CONST64(0x4000000000000000);
2157 
2158     aSig = shift64RightJamming(aSig, -expDiff);
2159     bSig |= BX_CONST64(0x4000000000000000);
2160  bBigger:
2161     zSig = bSig - aSig;
2162     zExp = bExp;
2163     zSign ^= 1;
2164     goto normalizeRoundAndPack;
2165  aExpBigger:
2166     if (aExp == 0x7FF) {
2167         if (aSig) return propagateFloat64NaN(a, b, status);
2168         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2169         return a;
2170     }
2171     if ((aExp == 0) && aSig)
2172         float_raise(status, float_flag_denormal);
2173 
2174     if (bExp == 0) {
2175         if (bSig) float_raise(status, float_flag_denormal);
2176         --expDiff;
2177     }
2178     else bSig |= BX_CONST64(0x4000000000000000);
2179 
2180     bSig = shift64RightJamming(bSig, expDiff);
2181     aSig |= BX_CONST64(0x4000000000000000);
2182  aBigger:
2183     zSig = aSig - bSig;
2184     zExp = aExp;
2185  normalizeRoundAndPack:
2186     --zExp;
2187     return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
2188 }
2189 
2190 /*----------------------------------------------------------------------------
2191 | Returns the result of adding the double-precision floating-point values `a'
2192 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2193 | Binary Floating-Point Arithmetic.
2194 *----------------------------------------------------------------------------*/
2195 
float64_add(float64 a,float64 b,float_status_t & status)2196 float64 float64_add(float64 a, float64 b, float_status_t &status)
2197 {
2198     int aSign = extractFloat64Sign(a);
2199     int bSign = extractFloat64Sign(b);
2200 
2201     if (aSign == bSign) {
2202         return addFloat64Sigs(a, b, aSign, status);
2203     }
2204     else {
2205         return subFloat64Sigs(a, b, aSign, status);
2206     }
2207 }
2208 
2209 /*----------------------------------------------------------------------------
2210 | Returns the result of subtracting the double-precision floating-point values
2211 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2212 | for Binary Floating-Point Arithmetic.
2213 *----------------------------------------------------------------------------*/
2214 
float64_sub(float64 a,float64 b,float_status_t & status)2215 float64 float64_sub(float64 a, float64 b, float_status_t &status)
2216 {
2217     int aSign = extractFloat64Sign(a);
2218     int bSign = extractFloat64Sign(b);
2219 
2220     if (aSign == bSign) {
2221         return subFloat64Sigs(a, b, aSign, status);
2222     }
2223     else {
2224         return addFloat64Sigs(a, b, aSign, status);
2225     }
2226 }
2227 
2228 /*----------------------------------------------------------------------------
2229 | Returns the result of multiplying the double-precision floating-point values
2230 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2231 | for Binary Floating-Point Arithmetic.
2232 *----------------------------------------------------------------------------*/
2233 
float64_mul(float64 a,float64 b,float_status_t & status)2234 float64 float64_mul(float64 a, float64 b, float_status_t &status)
2235 {
2236     int aSign, bSign, zSign;
2237     Bit16s aExp, bExp, zExp;
2238     Bit64u aSig, bSig, zSig0, zSig1;
2239 
2240     aSig = extractFloat64Frac(a);
2241     aExp = extractFloat64Exp(a);
2242     aSign = extractFloat64Sign(a);
2243     bSig = extractFloat64Frac(b);
2244     bExp = extractFloat64Exp(b);
2245     bSign = extractFloat64Sign(b);
2246     zSign = aSign ^ bSign;
2247 
2248     if (get_denormals_are_zeros(status)) {
2249         if (aExp == 0) aSig = 0;
2250         if (bExp == 0) bSig = 0;
2251     }
2252 
2253     if (aExp == 0x7FF) {
2254         if (aSig || ((bExp == 0x7FF) && bSig)) {
2255             return propagateFloat64NaN(a, b, status);
2256         }
2257         if ((bExp | bSig) == 0) {
2258             float_raise(status, float_flag_invalid);
2259             return float64_default_nan;
2260         }
2261         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2262         return packFloat64(zSign, 0x7FF, 0);
2263     }
2264     if (bExp == 0x7FF) {
2265         if (bSig) return propagateFloat64NaN(a, b, status);
2266         if ((aExp | aSig) == 0) {
2267             float_raise(status, float_flag_invalid);
2268             return float64_default_nan;
2269         }
2270         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2271         return packFloat64(zSign, 0x7FF, 0);
2272     }
2273     if (aExp == 0) {
2274         if (aSig == 0) {
2275             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2276             return packFloat64(zSign, 0, 0);
2277         }
2278         float_raise(status, float_flag_denormal);
2279         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2280     }
2281     if (bExp == 0) {
2282         if (bSig == 0) return packFloat64(zSign, 0, 0);
2283         float_raise(status, float_flag_denormal);
2284         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
2285     }
2286     zExp = aExp + bExp - 0x3FF;
2287     aSig = (aSig | BX_CONST64(0x0010000000000000))<<10;
2288     bSig = (bSig | BX_CONST64(0x0010000000000000))<<11;
2289     mul64To128(aSig, bSig, &zSig0, &zSig1);
2290     zSig0 |= (zSig1 != 0);
2291     if (0 <= (Bit64s) (zSig0<<1)) {
2292         zSig0 <<= 1;
2293         --zExp;
2294     }
2295     return roundAndPackFloat64(zSign, zExp, zSig0, status);
2296 }
2297 
2298 /*----------------------------------------------------------------------------
2299 | Returns the result of dividing the double-precision floating-point value `a'
2300 | by the corresponding value `b'.  The operation is performed according to
2301 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2302 *----------------------------------------------------------------------------*/
2303 
float64_div(float64 a,float64 b,float_status_t & status)2304 float64 float64_div(float64 a, float64 b, float_status_t &status)
2305 {
2306     int aSign, bSign, zSign;
2307     Bit16s aExp, bExp, zExp;
2308     Bit64u aSig, bSig, zSig;
2309     Bit64u rem0, rem1;
2310     Bit64u term0, term1;
2311 
2312     aSig = extractFloat64Frac(a);
2313     aExp = extractFloat64Exp(a);
2314     aSign = extractFloat64Sign(a);
2315     bSig = extractFloat64Frac(b);
2316     bExp = extractFloat64Exp(b);
2317     bSign = extractFloat64Sign(b);
2318     zSign = aSign ^ bSign;
2319 
2320     if (get_denormals_are_zeros(status)) {
2321         if (aExp == 0) aSig = 0;
2322         if (bExp == 0) bSig = 0;
2323     }
2324 
2325     if (aExp == 0x7FF) {
2326         if (aSig) return propagateFloat64NaN(a, b, status);
2327         if (bExp == 0x7FF) {
2328             if (bSig) return propagateFloat64NaN(a, b, status);
2329             float_raise(status, float_flag_invalid);
2330             return float64_default_nan;
2331         }
2332         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2333         return packFloat64(zSign, 0x7FF, 0);
2334     }
2335     if (bExp == 0x7FF) {
2336         if (bSig) return propagateFloat64NaN(a, b, status);
2337         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2338         return packFloat64(zSign, 0, 0);
2339     }
2340     if (bExp == 0) {
2341         if (bSig == 0) {
2342             if ((aExp | aSig) == 0) {
2343                 float_raise(status, float_flag_invalid);
2344                 return float64_default_nan;
2345             }
2346             float_raise(status, float_flag_divbyzero);
2347             return packFloat64(zSign, 0x7FF, 0);
2348         }
2349         float_raise(status, float_flag_denormal);
2350         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
2351     }
2352     if (aExp == 0) {
2353         if (aSig == 0) return packFloat64(zSign, 0, 0);
2354         float_raise(status, float_flag_denormal);
2355         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2356     }
2357     zExp = aExp - bExp + 0x3FD;
2358     aSig = (aSig | BX_CONST64(0x0010000000000000))<<10;
2359     bSig = (bSig | BX_CONST64(0x0010000000000000))<<11;
2360     if (bSig <= (aSig + aSig)) {
2361         aSig >>= 1;
2362         ++zExp;
2363     }
2364     zSig = estimateDiv128To64(aSig, 0, bSig);
2365     if ((zSig & 0x1FF) <= 2) {
2366         mul64To128(bSig, zSig, &term0, &term1);
2367         sub128(aSig, 0, term0, term1, &rem0, &rem1);
2368         while ((Bit64s) rem0 < 0) {
2369             --zSig;
2370             add128(rem0, rem1, 0, bSig, &rem0, &rem1);
2371         }
2372         zSig |= (rem1 != 0);
2373     }
2374     return roundAndPackFloat64(zSign, zExp, zSig, status);
2375 }
2376 
2377 /*----------------------------------------------------------------------------
2378 | Returns the square root of the double-precision floating-point value `a'.
2379 | The operation is performed according to the IEC/IEEE Standard for Binary
2380 | Floating-Point Arithmetic.
2381 *----------------------------------------------------------------------------*/
2382 
float64_sqrt(float64 a,float_status_t & status)2383 float64 float64_sqrt(float64 a, float_status_t &status)
2384 {
2385     int aSign;
2386     Bit16s aExp, zExp;
2387     Bit64u aSig, zSig, doubleZSig;
2388     Bit64u rem0, rem1, term0, term1;
2389 
2390     aSig = extractFloat64Frac(a);
2391     aExp = extractFloat64Exp(a);
2392     aSign = extractFloat64Sign(a);
2393 
2394     if (aExp == 0x7FF) {
2395         if (aSig) return propagateFloat64NaN(a, status);
2396         if (! aSign) return a;
2397         float_raise(status, float_flag_invalid);
2398         return float64_default_nan;
2399     }
2400 
2401     if (get_denormals_are_zeros(status)) {
2402         if (aExp == 0) aSig = 0;
2403     }
2404 
2405     if (aSign) {
2406         if ((aExp | aSig) == 0) return packFloat64(aSign, 0, 0);
2407         float_raise(status, float_flag_invalid);
2408         return float64_default_nan;
2409     }
2410     if (aExp == 0) {
2411         if (aSig == 0) return 0;
2412         float_raise(status, float_flag_denormal);
2413         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2414     }
2415     zExp = ((aExp - 0x3FF)>>1) + 0x3FE;
2416     aSig |= BX_CONST64(0x0010000000000000);
2417     zSig = estimateSqrt32(aExp, (Bit32u)(aSig>>21));
2418     aSig <<= 9 - (aExp & 1);
2419     zSig = estimateDiv128To64(aSig, 0, zSig<<32) + (zSig<<30);
2420     if ((zSig & 0x1FF) <= 5) {
2421         doubleZSig = zSig<<1;
2422         mul64To128(zSig, zSig, &term0, &term1);
2423         sub128(aSig, 0, term0, term1, &rem0, &rem1);
2424         while ((Bit64s) rem0 < 0) {
2425             --zSig;
2426             doubleZSig -= 2;
2427             add128(rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1);
2428         }
2429         zSig |= ((rem0 | rem1) != 0);
2430     }
2431     return roundAndPackFloat64(0, zExp, zSig, status);
2432 }
2433 
2434 /*----------------------------------------------------------------------------
2435 | Determine double-precision floating-point number class
2436 *----------------------------------------------------------------------------*/
2437 
float64_class(float64 a)2438 float_class_t float64_class(float64 a)
2439 {
2440    Bit16s aExp = extractFloat64Exp(a);
2441    Bit64u aSig = extractFloat64Frac(a);
2442    int  aSign = extractFloat64Sign(a);
2443 
2444    if(aExp == 0x7FF) {
2445        if (aSig == 0)
2446            return (aSign) ? float_negative_inf : float_positive_inf;
2447 
2448        return (aSig & BX_CONST64(0x0008000000000000)) ? float_QNaN : float_SNaN;
2449    }
2450 
2451    if(aExp == 0) {
2452        if (aSig == 0)
2453            return float_zero;
2454        return float_denormal;
2455    }
2456 
2457    return float_normalized;
2458 }
2459 
2460 /*----------------------------------------------------------------------------
2461 | Compare  between  two  double  precision  floating  point  numbers. Returns
2462 | 'float_relation_equal'  if the operands are equal, 'float_relation_less' if
2463 | the    value    'a'   is   less   than   the   corresponding   value   `b',
2464 | 'float_relation_greater' if the value 'a' is greater than the corresponding
2465 | value `b', or 'float_relation_unordered' otherwise.
2466 *----------------------------------------------------------------------------*/
2467 
float64_compare(float64 a,float64 b,float_status_t & status)2468 int float64_compare(float64 a, float64 b, float_status_t &status)
2469 {
2470     if (get_denormals_are_zeros(status)) {
2471         a = float64_denormal_to_zero(a);
2472         b = float64_denormal_to_zero(b);
2473     }
2474 
2475     float_class_t aClass = float64_class(a);
2476     float_class_t bClass = float64_class(b);
2477 
2478     if (aClass == float_SNaN || aClass == float_QNaN || bClass == float_SNaN || bClass == float_QNaN)
2479     {
2480         float_raise(status, float_flag_invalid);
2481         return float_relation_unordered;
2482     }
2483 
2484     if (aClass == float_denormal || bClass == float_denormal) {
2485         float_raise(status, float_flag_denormal);
2486     }
2487 
2488     if ((a == b) || ((Bit64u) ((a | b)<<1) == 0)) return float_relation_equal;
2489 
2490     int aSign = extractFloat64Sign(a);
2491     int bSign = extractFloat64Sign(b);
2492     if (aSign != bSign)
2493         return (aSign) ? float_relation_less : float_relation_greater;
2494 
2495     if (aSign ^ (a < b)) return float_relation_less;
2496     return float_relation_greater;
2497 }
2498 
2499 /*----------------------------------------------------------------------------
2500 | Compare  between  two  double  precision  floating  point  numbers. Returns
2501 | 'float_relation_equal'  if the operands are equal, 'float_relation_less' if
2502 | the    value    'a'   is   less   than   the   corresponding   value   `b',
2503 | 'float_relation_greater' if the value 'a' is greater than the corresponding
2504 | value `b', or 'float_relation_unordered' otherwise. Quiet NaNs do not cause
2505 | an exception.
2506 *----------------------------------------------------------------------------*/
2507 
float64_compare_quiet(float64 a,float64 b,float_status_t & status)2508 int float64_compare_quiet(float64 a, float64 b, float_status_t &status)
2509 {
2510     if (get_denormals_are_zeros(status)) {
2511         a = float64_denormal_to_zero(a);
2512         b = float64_denormal_to_zero(b);
2513     }
2514 
2515     float_class_t aClass = float64_class(a);
2516     float_class_t bClass = float64_class(b);
2517 
2518     if (aClass == float_SNaN || bClass == float_SNaN) {
2519         float_raise(status, float_flag_invalid);
2520         return float_relation_unordered;
2521     }
2522 
2523     if (aClass == float_QNaN || bClass == float_QNaN) {
2524         return float_relation_unordered;
2525     }
2526 
2527     if (aClass == float_denormal || bClass == float_denormal) {
2528         float_raise(status, float_flag_denormal);
2529     }
2530 
2531     if ((a == b) || ((Bit64u) ((a | b)<<1) == 0)) return float_relation_equal;
2532 
2533     int aSign = extractFloat64Sign(a);
2534     int bSign = extractFloat64Sign(b);
2535     if (aSign != bSign)
2536         return (aSign) ? float_relation_less : float_relation_greater;
2537 
2538     if (aSign ^ (a < b)) return float_relation_less;
2539     return float_relation_greater;
2540 }
2541 
2542 /*----------------------------------------------------------------------------
2543 | Compare bewteen two double precision floating point numbers and return the
2544 | smaller  of  them.  The operation  is performed  according to the IEC/IEEE
2545 | Standard for Binary Floating-Point Arithmetic.
2546 *----------------------------------------------------------------------------*/
2547 
float64_min(float64 a,float64 b,float_status_t & status)2548 float64 float64_min(float64 a, float64 b, float_status_t &status)
2549 {
2550   if (get_denormals_are_zeros(status)) {
2551     a = float64_denormal_to_zero(a);
2552     b = float64_denormal_to_zero(b);
2553   }
2554 
2555   return (float64_compare(a, b, status) == float_relation_less) ? a : b;
2556 }
2557 
2558 /*----------------------------------------------------------------------------
2559 | Compare bewteen two double precision floating point numbers and return the
2560 | larger  of  them.  The operation  is performed  according to  the IEC/IEEE
2561 | Standard for Binary Floating-Point Arithmetic.
2562 *----------------------------------------------------------------------------*/
2563 
float64_max(float64 a,float64 b,float_status_t & status)2564 float64 float64_max(float64 a, float64 b, float_status_t &status)
2565 {
2566   if (get_denormals_are_zeros(status)) {
2567     a = float64_denormal_to_zero(a);
2568     b = float64_denormal_to_zero(b);
2569   }
2570 
2571   return (float64_compare(a, b, status) == float_relation_greater) ? a : b;
2572 }
2573 
2574 #ifdef FLOATX80
2575 
2576 /*----------------------------------------------------------------------------
2577 | Returns the result of converting the 32-bit two's complement integer `a'
2578 | to the extended double-precision floating-point format.  The conversion
2579 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2580 | Arithmetic.
2581 *----------------------------------------------------------------------------*/
2582 
int32_to_floatx80(Bit32s a)2583 floatx80 int32_to_floatx80(Bit32s a)
2584 {
2585     if (a == 0) return packFloatx80(0, 0, 0);
2586     int   zSign = (a < 0);
2587     Bit32u absA = zSign ? -a : a;
2588     int    shiftCount = countLeadingZeros32(absA) + 32;
2589     Bit64u zSig = absA;
2590     return packFloatx80(zSign, 0x403E - shiftCount, zSig<<shiftCount);
2591 }
2592 
2593 /*----------------------------------------------------------------------------
2594 | Returns the result of converting the 64-bit two's complement integer `a'
2595 | to the extended double-precision floating-point format.  The conversion
2596 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2597 | Arithmetic.
2598 *----------------------------------------------------------------------------*/
2599 
int64_to_floatx80(Bit64s a)2600 floatx80 int64_to_floatx80(Bit64s a)
2601 {
2602     if (a == 0) return packFloatx80(0, 0, 0);
2603     int   zSign = (a < 0);
2604     Bit64u absA = zSign ? -a : a;
2605     int    shiftCount = countLeadingZeros64(absA);
2606     return packFloatx80(zSign, 0x403E - shiftCount, absA<<shiftCount);
2607 }
2608 
2609 /*----------------------------------------------------------------------------
2610 | Returns the result of converting the single-precision floating-point value
2611 | `a' to the extended double-precision floating-point format.  The conversion
2612 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2613 | Arithmetic.
2614 *----------------------------------------------------------------------------*/
2615 
float32_to_floatx80(float32 a,float_status_t & status)2616 floatx80 float32_to_floatx80(float32 a, float_status_t &status)
2617 {
2618     Bit32u aSig = extractFloat32Frac(a);
2619     Bit16s aExp = extractFloat32Exp(a);
2620     int aSign = extractFloat32Sign(a);
2621     if (aExp == 0xFF) {
2622         if (aSig) return commonNaNToFloatx80(float32ToCommonNaN(a, status));
2623         return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2624     }
2625     if (aExp == 0) {
2626         if (aSig == 0) return packFloatx80(aSign, 0, 0);
2627         float_raise(status, float_flag_denormal);
2628         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2629     }
2630     aSig |= 0x00800000;
2631     return packFloatx80(aSign, aExp + 0x3F80, ((Bit64u) aSig)<<40);
2632 }
2633 
2634 /*----------------------------------------------------------------------------
2635 | Returns the result of converting the double-precision floating-point value
2636 | `a' to the extended double-precision floating-point format.  The conversion
2637 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2638 | Arithmetic.
2639 *----------------------------------------------------------------------------*/
2640 
float64_to_floatx80(float64 a,float_status_t & status)2641 floatx80 float64_to_floatx80(float64 a, float_status_t &status)
2642 {
2643     Bit64u aSig = extractFloat64Frac(a);
2644     Bit16s aExp = extractFloat64Exp(a);
2645     int aSign = extractFloat64Sign(a);
2646 
2647     if (aExp == 0x7FF) {
2648         if (aSig) return commonNaNToFloatx80(float64ToCommonNaN(a, status));
2649         return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2650     }
2651     if (aExp == 0) {
2652         if (aSig == 0) return packFloatx80(aSign, 0, 0);
2653         float_raise(status, float_flag_denormal);
2654         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2655     }
2656     return
2657         packFloatx80(
2658             aSign, aExp + 0x3C00, (aSig | BX_CONST64(0x0010000000000000))<<11);
2659 }
2660 
2661 /*----------------------------------------------------------------------------
2662 | Returns the result of converting the extended double-precision floating-
2663 | point value `a' to the 32-bit two's complement integer format.  The
2664 | conversion is performed according to the IEC/IEEE Standard for Binary
2665 | Floating-Point Arithmetic - which means in particular that the conversion
2666 | is rounded according to the current rounding mode. If `a' is a NaN or the
2667 | conversion overflows, the integer indefinite value is returned.
2668 *----------------------------------------------------------------------------*/
2669 
floatx80_to_int32(floatx80 a,float_status_t & status)2670 Bit32s floatx80_to_int32(floatx80 a, float_status_t &status)
2671 {
2672     Bit64u aSig = extractFloatx80Frac(a);
2673     Bit32s aExp = extractFloatx80Exp(a);
2674     int aSign = extractFloatx80Sign(a);
2675 
2676     // handle unsupported extended double-precision floating encodings
2677     if (floatx80_is_unsupported(a))
2678     {
2679         float_raise(status, float_flag_invalid);
2680         return int32_indefinite;
2681     }
2682 
2683     if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1)) aSign = 0;
2684     int shiftCount = 0x4037 - aExp;
2685     if (shiftCount <= 0) shiftCount = 1;
2686     aSig = shift64RightJamming(aSig, shiftCount);
2687     return roundAndPackInt32(aSign, aSig, status);
2688 }
2689 
2690 /*----------------------------------------------------------------------------
2691 | Returns the result of converting the extended double-precision floating-
2692 | point value `a' to the 32-bit two's complement integer format.  The
2693 | conversion is performed according to the IEC/IEEE Standard for Binary
2694 | Floating-Point Arithmetic, except that the conversion is always rounded
2695 | toward zero.  If `a' is a NaN or the conversion overflows, the integer
2696 | indefinite value is returned.
2697 *----------------------------------------------------------------------------*/
2698 
floatx80_to_int32_round_to_zero(floatx80 a,float_status_t & status)2699 Bit32s floatx80_to_int32_round_to_zero(floatx80 a, float_status_t &status)
2700 {
2701     Bit32s aExp;
2702     Bit64u aSig, savedASig;
2703     Bit32s z;
2704     int shiftCount;
2705 
2706     // handle unsupported extended double-precision floating encodings
2707     if (floatx80_is_unsupported(a))
2708     {
2709         float_raise(status, float_flag_invalid);
2710         return int32_indefinite;
2711     }
2712 
2713     aSig = extractFloatx80Frac(a);
2714     aExp = extractFloatx80Exp(a);
2715     int aSign = extractFloatx80Sign(a);
2716 
2717     if (aExp > 0x401E) {
2718         float_raise(status, float_flag_invalid);
2719         return (Bit32s)(int32_indefinite);
2720     }
2721     if (aExp < 0x3FFF) {
2722         if (aExp || aSig) float_raise(status, float_flag_inexact);
2723         return 0;
2724     }
2725     shiftCount = 0x403E - aExp;
2726     savedASig = aSig;
2727     aSig >>= shiftCount;
2728     z = (Bit32s) aSig;
2729     if (aSign) z = -z;
2730     if ((z < 0) ^ aSign) {
2731         float_raise(status, float_flag_invalid);
2732         return (Bit32s)(int32_indefinite);
2733     }
2734     if ((aSig<<shiftCount) != savedASig)
2735     {
2736         float_raise(status, float_flag_inexact);
2737     }
2738     return z;
2739 }
2740 
2741 /*----------------------------------------------------------------------------
2742 | Returns the result of converting the extended double-precision floating-
2743 | point value `a' to the 64-bit two's complement integer format.  The
2744 | conversion is performed according to the IEC/IEEE Standard for Binary
2745 | Floating-Point Arithmetic - which means in particular that the conversion
2746 | is rounded according to the current rounding mode. If `a' is a NaN or the
2747 | conversion overflows, the integer indefinite value is returned.
2748 *----------------------------------------------------------------------------*/
2749 
floatx80_to_int64(floatx80 a,float_status_t & status)2750 Bit64s floatx80_to_int64(floatx80 a, float_status_t &status)
2751 {
2752     Bit32s aExp;
2753     Bit64u aSig, aSigExtra;
2754 
2755     // handle unsupported extended double-precision floating encodings
2756     if (floatx80_is_unsupported(a))
2757     {
2758         float_raise(status, float_flag_invalid);
2759         return int64_indefinite;
2760     }
2761 
2762     aSig = extractFloatx80Frac(a);
2763     aExp = extractFloatx80Exp(a);
2764     int aSign = extractFloatx80Sign(a);
2765 
2766     int shiftCount = 0x403E - aExp;
2767     if (shiftCount <= 0)
2768     {
2769         if (shiftCount)
2770         {
2771             float_raise(status, float_flag_invalid);
2772             return (Bit64s)(int64_indefinite);
2773         }
2774         aSigExtra = 0;
2775     }
2776     else {
2777         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
2778     }
2779 
2780     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
2781 }
2782 
2783 /*----------------------------------------------------------------------------
2784 | Returns the result of converting the extended double-precision floating-
2785 | point value `a' to the 64-bit two's complement integer format.  The
2786 | conversion is performed according to the IEC/IEEE Standard for Binary
2787 | Floating-Point Arithmetic, except that the conversion is always rounded
2788 | toward zero.  If `a' is a NaN or the conversion overflows, the integer
2789 | indefinite value is returned.
2790 *----------------------------------------------------------------------------*/
2791 
floatx80_to_int64_round_to_zero(floatx80 a,float_status_t & status)2792 Bit64s floatx80_to_int64_round_to_zero(floatx80 a, float_status_t &status)
2793 {
2794     int aSign;
2795     Bit32s aExp;
2796     Bit64u aSig;
2797     Bit64s z;
2798 
2799     // handle unsupported extended double-precision floating encodings
2800     if (floatx80_is_unsupported(a))
2801     {
2802         float_raise(status, float_flag_invalid);
2803         return int64_indefinite;
2804     }
2805 
2806     aSig = extractFloatx80Frac(a);
2807     aExp = extractFloatx80Exp(a);
2808     aSign = extractFloatx80Sign(a);
2809     int shiftCount = aExp - 0x403E;
2810     if (0 <= shiftCount) {
2811         aSig &= BX_CONST64(0x7FFFFFFFFFFFFFFF);
2812         if ((a.exp != 0xC03E) || aSig) {
2813             float_raise(status, float_flag_invalid);
2814         }
2815         return (Bit64s)(int64_indefinite);
2816     }
2817     else if (aExp < 0x3FFF) {
2818         if (aExp | aSig) float_raise(status, float_flag_inexact);
2819         return 0;
2820     }
2821     z = aSig>>(-shiftCount);
2822     if ((Bit64u) (aSig<<(shiftCount & 63))) {
2823         float_raise(status, float_flag_inexact);
2824     }
2825     if (aSign) z = -z;
2826     return z;
2827 }
2828 
2829 /*----------------------------------------------------------------------------
2830 | Returns the result of converting the extended double-precision floating-
2831 | point value `a' to the single-precision floating-point format.  The
2832 | conversion is performed according to the IEC/IEEE Standard for Binary
2833 | Floating-Point Arithmetic.
2834 *----------------------------------------------------------------------------*/
2835 
floatx80_to_float32(floatx80 a,float_status_t & status)2836 float32 floatx80_to_float32(floatx80 a, float_status_t &status)
2837 {
2838     Bit64u aSig = extractFloatx80Frac(a);
2839     Bit32s aExp = extractFloatx80Exp(a);
2840     int aSign = extractFloatx80Sign(a);
2841 
2842     // handle unsupported extended double-precision floating encodings
2843     if (floatx80_is_unsupported(a))
2844     {
2845         float_raise(status, float_flag_invalid);
2846         return float32_default_nan;
2847     }
2848 
2849     if (aExp == 0x7FFF) {
2850         if ((Bit64u) (aSig<<1))
2851             return commonNaNToFloat32(floatx80ToCommonNaN(a, status));
2852 
2853         return packFloat32(aSign, 0xFF, 0);
2854     }
2855     aSig = shift64RightJamming(aSig, 33);
2856     if (aExp || aSig) aExp -= 0x3F81;
2857     return roundAndPackFloat32(aSign, aExp, (Bit32u) aSig, status);
2858 }
2859 
2860 /*----------------------------------------------------------------------------
2861 | Returns the result of converting the extended double-precision floating-
2862 | point value `a' to the double-precision floating-point format.  The
2863 | conversion is performed according to the IEC/IEEE Standard for Binary
2864 | Floating-Point Arithmetic.
2865 *----------------------------------------------------------------------------*/
2866 
floatx80_to_float64(floatx80 a,float_status_t & status)2867 float64 floatx80_to_float64(floatx80 a, float_status_t &status)
2868 {
2869     Bit32s aExp;
2870     Bit64u aSig, zSig;
2871 
2872     // handle unsupported extended double-precision floating encodings
2873     if (floatx80_is_unsupported(a))
2874     {
2875         float_raise(status, float_flag_invalid);
2876         return float64_default_nan;
2877     }
2878 
2879     aSig = extractFloatx80Frac(a);
2880     aExp = extractFloatx80Exp(a);
2881     int aSign = extractFloatx80Sign(a);
2882 
2883     if (aExp == 0x7FFF) {
2884         if ((Bit64u) (aSig<<1)) {
2885             return commonNaNToFloat64(floatx80ToCommonNaN(a, status));
2886         }
2887         return packFloat64(aSign, 0x7FF, 0);
2888     }
2889     zSig = shift64RightJamming(aSig, 1);
2890     if (aExp || aSig) aExp -= 0x3C01;
2891     return roundAndPackFloat64(aSign, aExp, zSig, status);
2892 }
2893 
2894 /*----------------------------------------------------------------------------
2895 | Rounds the extended double-precision floating-point value `a' to an integer,
2896 | and returns the result as an extended double-precision floating-point
2897 | value.  The operation is performed according to the IEC/IEEE Standard for
2898 | Binary Floating-Point Arithmetic.
2899 *----------------------------------------------------------------------------*/
2900 
floatx80_round_to_int(floatx80 a,float_status_t & status)2901 floatx80 floatx80_round_to_int(floatx80 a, float_status_t &status)
2902 {
2903     int aSign;
2904     Bit64u lastBitMask, roundBitsMask;
2905     int roundingMode = get_float_rounding_mode(status);
2906     floatx80 z;
2907 
2908     // handle unsupported extended double-precision floating encodings
2909     if (floatx80_is_unsupported(a))
2910     {
2911         float_raise(status, float_flag_invalid);
2912         return floatx80_default_nan;
2913     }
2914 
2915     Bit32s aExp = extractFloatx80Exp(a);
2916     Bit64u aSig = extractFloatx80Frac(a);
2917     if (0x403E <= aExp) {
2918         if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1)) {
2919             return propagateFloatx80NaN(a, status);
2920         }
2921         return a;
2922     }
2923     if (aExp < 0x3FFF) {
2924         if (aExp == 0) {
2925             if ((aSig<<1) == 0) return a;
2926             float_raise(status, float_flag_denormal);
2927         }
2928         float_raise(status, float_flag_inexact);
2929         aSign = extractFloatx80Sign(a);
2930         switch (roundingMode) {
2931          case float_round_nearest_even:
2932             if ((aExp == 0x3FFE) && (Bit64u) (aSig<<1)) {
2933                 set_float_rounding_up(status);
2934                 return packFloatx80(aSign, 0x3FFF, BX_CONST64(0x8000000000000000));
2935             }
2936             break;
2937          case float_round_down:
2938             if (aSign) {
2939                 set_float_rounding_up(status);
2940                 return packFloatx80(1, 0x3FFF, BX_CONST64(0x8000000000000000));
2941             }
2942             else {
2943                 return packFloatx80(0, 0, 0);
2944             }
2945          case float_round_up:
2946             if (aSign) {
2947                 return packFloatx80(1, 0, 0);
2948             }
2949             else {
2950                 set_float_rounding_up(status);
2951                 return packFloatx80(0, 0x3FFF, BX_CONST64(0x8000000000000000));
2952             }
2953         }
2954         return packFloatx80(aSign, 0, 0);
2955     }
2956     lastBitMask = 1;
2957     lastBitMask <<= 0x403E - aExp;
2958     roundBitsMask = lastBitMask - 1;
2959     z = a;
2960     if (roundingMode == float_round_nearest_even) {
2961         z.fraction += lastBitMask>>1;
2962         if ((z.fraction & roundBitsMask) == 0) z.fraction &= ~lastBitMask;
2963     }
2964     else if (roundingMode != float_round_to_zero) {
2965         if (extractFloatx80Sign(z) ^ (roundingMode == float_round_up))
2966             z.fraction += roundBitsMask;
2967     }
2968     z.fraction &= ~roundBitsMask;
2969     if (z.fraction == 0) {
2970         z.exp++;
2971         z.fraction = BX_CONST64(0x8000000000000000);
2972     }
2973     if (z.fraction != a.fraction) {
2974         float_raise(status, float_flag_inexact);
2975         if (z.fraction > a.fraction || z.exp > a.exp)
2976             set_float_rounding_up(status);
2977     }
2978     return z;
2979 }
2980 
2981 /*----------------------------------------------------------------------------
2982 | Returns the result of adding the absolute values of the extended double-
2983 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
2984 | negated before being returned.  `zSign' is ignored if the result is a NaN.
2985 | The addition is performed according to the IEC/IEEE Standard for Binary
2986 | Floating-Point Arithmetic.
2987 *----------------------------------------------------------------------------*/
2988 
addFloatx80Sigs(floatx80 a,floatx80 b,int zSign,float_status_t & status)2989 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status)
2990 {
2991     Bit32s aExp, bExp, zExp;
2992     Bit64u aSig, bSig, zSig0, zSig1;
2993 
2994     // handle unsupported extended double-precision floating encodings
2995     if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
2996     {
2997         float_raise(status, float_flag_invalid);
2998         return floatx80_default_nan;
2999     }
3000 
3001     aSig = extractFloatx80Frac(a);
3002     aExp = extractFloatx80Exp(a);
3003     bSig = extractFloatx80Frac(b);
3004     bExp = extractFloatx80Exp(b);
3005 
3006     if (aExp == 0x7FFF) {
3007         if ((Bit64u) (aSig<<1) || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
3008             return propagateFloatx80NaN(a, b, status);
3009         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3010         return a;
3011     }
3012     if (bExp == 0x7FFF) {
3013         if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3014         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
3015         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3016     }
3017     if (aExp == 0) {
3018         if (aSig == 0) {
3019             if ((bExp == 0) && bSig) {
3020                 float_raise(status, float_flag_denormal);
3021                 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3022             }
3023             return roundAndPackFloatx80(get_float_rounding_precision(status),
3024                     zSign, bExp, bSig, 0, status);
3025         }
3026         float_raise(status, float_flag_denormal);
3027         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3028     }
3029     if (bExp == 0) {
3030         if (bSig == 0)
3031             return roundAndPackFloatx80(get_float_rounding_precision(status),
3032                     zSign, aExp, aSig, 0, status);
3033 
3034         float_raise(status, float_flag_denormal);
3035         normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3036     }
3037     Bit32s expDiff = aExp - bExp;
3038     zExp = aExp;
3039     if (0 < expDiff) {
3040         shift64ExtraRightJamming(bSig, 0,  expDiff, &bSig, &zSig1);
3041     }
3042     else if (expDiff < 0) {
3043         shift64ExtraRightJamming(aSig, 0, -expDiff, &aSig, &zSig1);
3044         zExp = bExp;
3045     }
3046     else {
3047         zSig0 = aSig + bSig;
3048         zSig1 = 0;
3049         goto shiftRight1;
3050     }
3051     zSig0 = aSig + bSig;
3052     if ((Bit64s) zSig0 < 0) goto roundAndPack;
3053  shiftRight1:
3054     shift64ExtraRightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3055     zSig0 |= BX_CONST64(0x8000000000000000);
3056     zExp++;
3057  roundAndPack:
3058     return
3059         roundAndPackFloatx80(get_float_rounding_precision(status),
3060             zSign, zExp, zSig0, zSig1, status);
3061 }
3062 
3063 /*----------------------------------------------------------------------------
3064 | Returns the result of subtracting the absolute values of the extended
3065 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3066 | difference is negated before being returned.  `zSign' is ignored if the
3067 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3068 | Standard for Binary Floating-Point Arithmetic.
3069 *----------------------------------------------------------------------------*/
3070 
subFloatx80Sigs(floatx80 a,floatx80 b,int zSign,float_status_t & status)3071 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status)
3072 {
3073     Bit32s aExp, bExp, zExp;
3074     Bit64u aSig, bSig, zSig0, zSig1;
3075 
3076     // handle unsupported extended double-precision floating encodings
3077     if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3078     {
3079         float_raise(status, float_flag_invalid);
3080         return floatx80_default_nan;
3081     }
3082 
3083     aSig = extractFloatx80Frac(a);
3084     aExp = extractFloatx80Exp(a);
3085     bSig = extractFloatx80Frac(b);
3086     bExp = extractFloatx80Exp(b);
3087 
3088     if (aExp == 0x7FFF) {
3089         if ((Bit64u) (aSig<<1)) return propagateFloatx80NaN(a, b, status);
3090         if (bExp == 0x7FFF) {
3091             if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3092             float_raise(status, float_flag_invalid);
3093             return floatx80_default_nan;
3094         }
3095         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3096         return a;
3097     }
3098     if (bExp == 0x7FFF) {
3099         if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3100         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
3101         return packFloatx80(zSign ^ 1, 0x7FFF, BX_CONST64(0x8000000000000000));
3102     }
3103     if (aExp == 0) {
3104         if (aSig == 0) {
3105             if (bExp == 0) {
3106                 if (bSig) {
3107                     float_raise(status, float_flag_denormal);
3108                     normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3109                     return roundAndPackFloatx80(get_float_rounding_precision(status),
3110                         zSign ^ 1, bExp, bSig, 0, status);
3111                 }
3112                 return packFloatx80(get_float_rounding_mode(status) == float_round_down, 0, 0);
3113             }
3114             return roundAndPackFloatx80(get_float_rounding_precision(status),
3115                     zSign ^ 1, bExp, bSig, 0, status);
3116         }
3117         float_raise(status, float_flag_denormal);
3118         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3119     }
3120     if (bExp == 0) {
3121         if (bSig == 0)
3122             return roundAndPackFloatx80(get_float_rounding_precision(status),
3123                     zSign, aExp, aSig, 0, status);
3124 
3125         float_raise(status, float_flag_denormal);
3126         normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3127     }
3128     Bit32s expDiff = aExp - bExp;
3129     if (0 < expDiff) {
3130         shift128RightJamming(bSig, 0, expDiff, &bSig, &zSig1);
3131         goto aBigger;
3132     }
3133     if (expDiff < 0) {
3134         shift128RightJamming(aSig, 0, -expDiff, &aSig, &zSig1);
3135         goto bBigger;
3136     }
3137     zSig1 = 0;
3138     if (bSig < aSig) goto aBigger;
3139     if (aSig < bSig) goto bBigger;
3140     return packFloatx80(get_float_rounding_mode(status) == float_round_down, 0, 0);
3141  bBigger:
3142     sub128(bSig, 0, aSig, zSig1, &zSig0, &zSig1);
3143     zExp = bExp;
3144     zSign ^= 1;
3145     goto normalizeRoundAndPack;
3146  aBigger:
3147     sub128(aSig, 0, bSig, zSig1, &zSig0, &zSig1);
3148     zExp = aExp;
3149  normalizeRoundAndPack:
3150     return
3151         normalizeRoundAndPackFloatx80(get_float_rounding_precision(status),
3152             zSign, zExp, zSig0, zSig1, status);
3153 }
3154 
3155 /*----------------------------------------------------------------------------
3156 | Returns the result of adding the extended double-precision floating-point
3157 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3158 | Standard for Binary Floating-Point Arithmetic.
3159 *----------------------------------------------------------------------------*/
3160 
floatx80_add(floatx80 a,floatx80 b,float_status_t & status)3161 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status_t &status)
3162 {
3163     int aSign = extractFloatx80Sign(a);
3164     int bSign = extractFloatx80Sign(b);
3165 
3166     if (aSign == bSign)
3167         return addFloatx80Sigs(a, b, aSign, status);
3168     else
3169         return subFloatx80Sigs(a, b, aSign, status);
3170 }
3171 
3172 /*----------------------------------------------------------------------------
3173 | Returns the result of subtracting the extended double-precision floating-
3174 | point values `a' and `b'.  The operation is performed according to the
3175 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3176 *----------------------------------------------------------------------------*/
3177 
floatx80_sub(floatx80 a,floatx80 b,float_status_t & status)3178 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status_t &status)
3179 {
3180     int aSign = extractFloatx80Sign(a);
3181     int bSign = extractFloatx80Sign(b);
3182 
3183     if (aSign == bSign)
3184         return subFloatx80Sigs(a, b, aSign, status);
3185     else
3186         return addFloatx80Sigs(a, b, aSign, status);
3187 }
3188 
3189 /*----------------------------------------------------------------------------
3190 | Returns the result of multiplying the extended double-precision floating-
3191 | point values `a' and `b'.  The operation is performed according to the
3192 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3193 *----------------------------------------------------------------------------*/
3194 
floatx80_mul(floatx80 a,floatx80 b,float_status_t & status)3195 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status_t &status)
3196 {
3197     int aSign, bSign, zSign;
3198     Bit32s aExp, bExp, zExp;
3199     Bit64u aSig, bSig, zSig0, zSig1;
3200 
3201     // handle unsupported extended double-precision floating encodings
3202     if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3203     {
3204  invalid:
3205         float_raise(status, float_flag_invalid);
3206         return floatx80_default_nan;
3207     }
3208 
3209     aSig = extractFloatx80Frac(a);
3210     aExp = extractFloatx80Exp(a);
3211     aSign = extractFloatx80Sign(a);
3212     bSig = extractFloatx80Frac(b);
3213     bExp = extractFloatx80Exp(b);
3214     bSign = extractFloatx80Sign(b);
3215     zSign = aSign ^ bSign;
3216 
3217     if (aExp == 0x7FFF) {
3218         if ((Bit64u) (aSig<<1) || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1))) {
3219             return propagateFloatx80NaN(a, b, status);
3220         }
3221         if (bExp == 0) {
3222             if (bSig == 0) goto invalid;
3223             float_raise(status, float_flag_denormal);
3224         }
3225         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3226     }
3227     if (bExp == 0x7FFF) {
3228         if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3229         if (aExp == 0) {
3230             if (aSig == 0) goto invalid;
3231             float_raise(status, float_flag_denormal);
3232         }
3233         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3234     }
3235     if (aExp == 0) {
3236         if (aSig == 0) {
3237             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3238             return packFloatx80(zSign, 0, 0);
3239         }
3240         float_raise(status, float_flag_denormal);
3241         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3242     }
3243     if (bExp == 0) {
3244         if (bSig == 0) return packFloatx80(zSign, 0, 0);
3245         float_raise(status, float_flag_denormal);
3246         normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3247     }
3248     zExp = aExp + bExp - 0x3FFE;
3249     mul64To128(aSig, bSig, &zSig0, &zSig1);
3250     if (0 < (Bit64s) zSig0) {
3251         shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
3252         --zExp;
3253     }
3254     return
3255         roundAndPackFloatx80(get_float_rounding_precision(status),
3256              zSign, zExp, zSig0, zSig1, status);
3257 }
3258 
3259 /*----------------------------------------------------------------------------
3260 | Returns the result of dividing the extended double-precision floating-point
3261 | value `a' by the corresponding value `b'.  The operation is performed
3262 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3263 *----------------------------------------------------------------------------*/
3264 
floatx80_div(floatx80 a,floatx80 b,float_status_t & status)3265 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status_t &status)
3266 {
3267     int aSign, bSign, zSign;
3268     Bit32s aExp, bExp, zExp;
3269     Bit64u aSig, bSig, zSig0, zSig1;
3270     Bit64u rem0, rem1, rem2, term0, term1, term2;
3271 
3272     // handle unsupported extended double-precision floating encodings
3273     if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3274     {
3275         float_raise(status, float_flag_invalid);
3276         return floatx80_default_nan;
3277     }
3278 
3279     aSig = extractFloatx80Frac(a);
3280     aExp = extractFloatx80Exp(a);
3281     aSign = extractFloatx80Sign(a);
3282     bSig = extractFloatx80Frac(b);
3283     bExp = extractFloatx80Exp(b);
3284     bSign = extractFloatx80Sign(b);
3285 
3286     zSign = aSign ^ bSign;
3287     if (aExp == 0x7FFF) {
3288         if ((Bit64u) (aSig<<1)) return propagateFloatx80NaN(a, b, status);
3289         if (bExp == 0x7FFF) {
3290             if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3291             float_raise(status, float_flag_invalid);
3292             return floatx80_default_nan;
3293         }
3294         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3295         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3296     }
3297     if (bExp == 0x7FFF) {
3298         if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3299         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
3300         return packFloatx80(zSign, 0, 0);
3301     }
3302     if (bExp == 0) {
3303         if (bSig == 0) {
3304             if ((aExp | aSig) == 0) {
3305                 float_raise(status, float_flag_invalid);
3306                 return floatx80_default_nan;
3307             }
3308             float_raise(status, float_flag_divbyzero);
3309             return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3310         }
3311         float_raise(status, float_flag_denormal);
3312         normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3313     }
3314     if (aExp == 0) {
3315         if (aSig == 0) return packFloatx80(zSign, 0, 0);
3316         float_raise(status, float_flag_denormal);
3317         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3318     }
3319     zExp = aExp - bExp + 0x3FFE;
3320     rem1 = 0;
3321     if (bSig <= aSig) {
3322         shift128Right(aSig, 0, 1, &aSig, &rem1);
3323         ++zExp;
3324     }
3325     zSig0 = estimateDiv128To64(aSig, rem1, bSig);
3326     mul64To128(bSig, zSig0, &term0, &term1);
3327     sub128(aSig, rem1, term0, term1, &rem0, &rem1);
3328     while ((Bit64s) rem0 < 0) {
3329         --zSig0;
3330         add128(rem0, rem1, 0, bSig, &rem0, &rem1);
3331     }
3332     zSig1 = estimateDiv128To64(rem1, 0, bSig);
3333     if ((Bit64u) (zSig1<<1) <= 8) {
3334         mul64To128(bSig, zSig1, &term1, &term2);
3335         sub128(rem1, 0, term1, term2, &rem1, &rem2);
3336         while ((Bit64s) rem1 < 0) {
3337             --zSig1;
3338             add128(rem1, rem2, 0, bSig, &rem1, &rem2);
3339         }
3340         zSig1 |= ((rem1 | rem2) != 0);
3341     }
3342     return
3343         roundAndPackFloatx80(get_float_rounding_precision(status),
3344             zSign, zExp, zSig0, zSig1, status);
3345 }
3346 
3347 /*----------------------------------------------------------------------------
3348 | Returns the square root of the extended double-precision floating-point
3349 | value `a'.  The operation is performed according to the IEC/IEEE Standard
3350 | for Binary Floating-Point Arithmetic.
3351 *----------------------------------------------------------------------------*/
3352 
floatx80_sqrt(floatx80 a,float_status_t & status)3353 floatx80 floatx80_sqrt(floatx80 a, float_status_t &status)
3354 {
3355     int aSign;
3356     Bit32s aExp, zExp;
3357     Bit64u aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3358     Bit64u rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3359 
3360     // handle unsupported extended double-precision floating encodings
3361     if (floatx80_is_unsupported(a))
3362     {
3363         float_raise(status, float_flag_invalid);
3364         return floatx80_default_nan;
3365     }
3366 
3367     aSig0 = extractFloatx80Frac(a);
3368     aExp = extractFloatx80Exp(a);
3369     aSign = extractFloatx80Sign(a);
3370     if (aExp == 0x7FFF) {
3371         if ((Bit64u) (aSig0<<1)) return propagateFloatx80NaN(a, status);
3372         if (! aSign) return a;
3373         float_raise(status, float_flag_invalid);
3374         return floatx80_default_nan;
3375     }
3376     if (aSign) {
3377         if ((aExp | aSig0) == 0) return a;
3378         float_raise(status, float_flag_invalid);
3379         return floatx80_default_nan;
3380     }
3381     if (aExp == 0) {
3382         if (aSig0 == 0) return packFloatx80(0, 0, 0);
3383         float_raise(status, float_flag_denormal);
3384         normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
3385     }
3386     zExp = ((aExp - 0x3FFF)>>1) + 0x3FFF;
3387     zSig0 = estimateSqrt32(aExp, aSig0>>32);
3388     shift128Right(aSig0, 0, 2 + (aExp & 1), &aSig0, &aSig1);
3389     zSig0 = estimateDiv128To64(aSig0, aSig1, zSig0<<32) + (zSig0<<30);
3390     doubleZSig0 = zSig0<<1;
3391     mul64To128(zSig0, zSig0, &term0, &term1);
3392     sub128(aSig0, aSig1, term0, term1, &rem0, &rem1);
3393     while ((Bit64s) rem0 < 0) {
3394         --zSig0;
3395         doubleZSig0 -= 2;
3396         add128(rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1);
3397     }
3398     zSig1 = estimateDiv128To64(rem1, 0, doubleZSig0);
3399     if ((zSig1 & BX_CONST64(0x3FFFFFFFFFFFFFFF)) <= 5) {
3400         if (zSig1 == 0) zSig1 = 1;
3401         mul64To128(doubleZSig0, zSig1, &term1, &term2);
3402         sub128(rem1, 0, term1, term2, &rem1, &rem2);
3403         mul64To128(zSig1, zSig1, &term2, &term3);
3404         sub192(rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3);
3405         while ((Bit64s) rem1 < 0) {
3406             --zSig1;
3407             shortShift128Left(0, zSig1, 1, &term2, &term3);
3408             term3 |= 1;
3409             term2 |= doubleZSig0;
3410             add192(rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3);
3411         }
3412         zSig1 |= ((rem1 | rem2 | rem3) != 0);
3413     }
3414     shortShift128Left(0, zSig1, 1, &zSig0, &zSig1);
3415     zSig0 |= doubleZSig0;
3416     return
3417         roundAndPackFloatx80(get_float_rounding_precision(status),
3418             0, zExp, zSig0, zSig1, status);
3419 }
3420 
3421 #endif
3422 
3423 #ifdef FLOAT128
3424 
3425 /*----------------------------------------------------------------------------
3426 | Returns the result of converting the extended double-precision floating-
3427 | point value `a' to the quadruple-precision floating-point format. The
3428 | conversion is performed according to the IEC/IEEE Standard for Binary
3429 | Floating-Point Arithmetic.
3430 *----------------------------------------------------------------------------*/
3431 
floatx80_to_float128(floatx80 a,float_status_t & status)3432 float128 floatx80_to_float128(floatx80 a, float_status_t &status)
3433 {
3434     Bit64u zSig0, zSig1;
3435 
3436     Bit64u aSig = extractFloatx80Frac(a);
3437     Bit32s aExp = extractFloatx80Exp(a);
3438     int   aSign = extractFloatx80Sign(a);
3439 
3440     if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1))
3441         return commonNaNToFloat128(floatx80ToCommonNaN(a, status));
3442 
3443     shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1);
3444     return packFloat128(aSign, aExp, zSig0, zSig1);
3445 }
3446 
3447 /*----------------------------------------------------------------------------
3448 | Returns the result of converting the quadruple-precision floating-point
3449 | value `a' to the extended double-precision floating-point format.  The
3450 | conversion is performed according to the IEC/IEEE Standard for Binary
3451 | Floating-Point Arithmetic.
3452 *----------------------------------------------------------------------------*/
3453 
float128_to_floatx80(float128 a,float_status_t & status)3454 floatx80 float128_to_floatx80(float128 a, float_status_t &status)
3455 {
3456     Bit32s aExp;
3457     Bit64u aSig0, aSig1;
3458 
3459     aSig1 = extractFloat128Frac1(a);
3460     aSig0 = extractFloat128Frac0(a);
3461     aExp = extractFloat128Exp(a);
3462     int aSign = extractFloat128Sign(a);
3463 
3464     if (aExp == 0x7FFF) {
3465         if (aSig0 | aSig1)
3466             return commonNaNToFloatx80(float128ToCommonNaN(a, status));
3467 
3468         return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3469     }
3470 
3471     if (aExp == 0) {
3472         if ((aSig0 | aSig1) == 0) return packFloatx80(aSign, 0, 0);
3473         float_raise(status, float_flag_denormal);
3474         normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3475     }
3476     else aSig0 |= BX_CONST64(0x0001000000000000);
3477 
3478     shortShift128Left(aSig0, aSig1, 15, &aSig0, &aSig1);
3479     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
3480 }
3481 
3482 /*----------------------------------------------------------------------------
3483 | Returns the result of multiplying the extended double-precision floating-
3484 | point value `a' and quadruple-precision floating point value `b'. The
3485 | operation is performed according to the IEC/IEEE Standard for Binary
3486 | Floating-Point Arithmetic.
3487 *----------------------------------------------------------------------------*/
3488 
floatx80_mul(floatx80 a,float128 b,float_status_t & status)3489 floatx80 floatx80_mul(floatx80 a, float128 b, float_status_t &status)
3490 {
3491     Bit32s aExp, bExp, zExp;
3492     Bit64u aSig, bSig0, bSig1, zSig0, zSig1, zSig2;
3493     int aSign, bSign, zSign;
3494 
3495     // handle unsupported extended double-precision floating encodings
3496     if (floatx80_is_unsupported(a))
3497     {
3498  invalid:
3499         float_raise(status, float_flag_invalid);
3500         return floatx80_default_nan;
3501     }
3502 
3503     aSig = extractFloatx80Frac(a);
3504     aExp = extractFloatx80Exp(a);
3505     aSign = extractFloatx80Sign(a);
3506     bSig0 = extractFloat128Frac0(b);
3507     bSig1 = extractFloat128Frac1(b);
3508     bExp = extractFloat128Exp(b);
3509     bSign = extractFloat128Sign(b);
3510 
3511     zSign = aSign ^ bSign;
3512 
3513     if (aExp == 0x7FFF) {
3514         if ((Bit64u) (aSig<<1)
3515              || ((bExp == 0x7FFF) && (bSig0 | bSig1)))
3516         {
3517             floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status));
3518             return propagateFloatx80NaN(a, r, status);
3519         }
3520         if (bExp == 0) {
3521             if ((bSig0 | bSig1) == 0) goto invalid;
3522             float_raise(status, float_flag_denormal);
3523         }
3524         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3525     }
3526     if (bExp == 0x7FFF) {
3527         if (bSig0 | bSig1) {
3528             floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status));
3529             return propagateFloatx80NaN(a, r, status);
3530         }
3531         if (aExp == 0) {
3532             if (aSig == 0) goto invalid;
3533             float_raise(status, float_flag_denormal);
3534         }
3535         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3536     }
3537     if (aExp == 0) {
3538         if (aSig == 0) {
3539             if ((bExp == 0) && (bSig0 | bSig1)) float_raise(status, float_flag_denormal);
3540             return packFloatx80(zSign, 0, 0);
3541         }
3542         float_raise(status, float_flag_denormal);
3543         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3544     }
3545     if (bExp == 0) {
3546         if ((bSig0 | bSig1) == 0) return packFloatx80(zSign, 0, 0);
3547         float_raise(status, float_flag_denormal);
3548         normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3549     }
3550     else bSig0 |= BX_CONST64(0x0001000000000000);
3551 
3552     zExp = aExp + bExp - 0x3FFE;
3553     shortShift128Left(bSig0, bSig1, 15, &bSig0, &bSig1);
3554     mul128By64To192(bSig0, bSig1, aSig, &zSig0, &zSig1, &zSig2);
3555     if (0 < (Bit64s) zSig0) {
3556         shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
3557         --zExp;
3558     }
3559     return
3560         roundAndPackFloatx80(get_float_rounding_precision(status),
3561              zSign, zExp, zSig0, zSig1, status);
3562 }
3563 
3564 /*----------------------------------------------------------------------------
3565 | Returns the result of adding the absolute values of the quadruple-precision
3566 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3567 | before being returned. `zSign' is ignored if the result is a NaN.
3568 | The addition is performed according to the IEC/IEEE Standard for Binary
3569 | Floating-Point Arithmetic.
3570 *----------------------------------------------------------------------------*/
3571 
addFloat128Sigs(float128 a,float128 b,int zSign,float_status_t & status)3572 static float128 addFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status)
3573 {
3574     Bit32s aExp, bExp, zExp;
3575     Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
3576     Bit32s expDiff;
3577 
3578     aSig1 = extractFloat128Frac1(a);
3579     aSig0 = extractFloat128Frac0(a);
3580     aExp = extractFloat128Exp(a);
3581     bSig1 = extractFloat128Frac1(b);
3582     bSig0 = extractFloat128Frac0(b);
3583     bExp = extractFloat128Exp(b);
3584     expDiff = aExp - bExp;
3585 
3586     if (0 < expDiff) {
3587         if (aExp == 0x7FFF) {
3588             if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3589             return a;
3590         }
3591         if (bExp == 0) --expDiff;
3592         else bSig0 |= BX_CONST64(0x0001000000000000);
3593         shift128ExtraRightJamming(bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2);
3594         zExp = aExp;
3595     }
3596     else if (expDiff < 0) {
3597         if (bExp == 0x7FFF) {
3598             if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3599             return packFloat128(zSign, 0x7FFF, 0, 0);
3600         }
3601         if (aExp == 0) ++expDiff;
3602         else aSig0 |= BX_CONST64(0x0001000000000000);
3603         shift128ExtraRightJamming(aSig0, aSig1, 0, -expDiff, &aSig0, &aSig1, &zSig2);
3604         zExp = bExp;
3605     }
3606     else {
3607         if (aExp == 0x7FFF) {
3608             if (aSig0 | aSig1 | bSig0 | bSig1)
3609                 return propagateFloat128NaN(a, b, status);
3610 
3611             return a;
3612         }
3613         add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3614         if (aExp == 0) return packFloat128(zSign, 0, zSig0, zSig1);
3615         zSig2 = 0;
3616         zSig0 |= BX_CONST64(0x0002000000000000);
3617         zExp = aExp;
3618         goto shiftRight1;
3619     }
3620     aSig0 |= BX_CONST64(0x0001000000000000);
3621     add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3622     --zExp;
3623     if (zSig0 < BX_CONST64(0x0002000000000000)) goto roundAndPack;
3624     ++zExp;
3625  shiftRight1:
3626     shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2);
3627  roundAndPack:
3628     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3629 }
3630 
3631 /*----------------------------------------------------------------------------
3632 | Returns the result of subtracting the absolute values of the quadruple-
3633 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3634 | difference is negated before being returned.  `zSign' is ignored if the
3635 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3636 | Standard for Binary Floating-Point Arithmetic.
3637 *----------------------------------------------------------------------------*/
3638 
subFloat128Sigs(float128 a,float128 b,int zSign,float_status_t & status)3639 static float128 subFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status)
3640 {
3641     Bit32s aExp, bExp, zExp;
3642     Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
3643     Bit32s expDiff;
3644 
3645     aSig1 = extractFloat128Frac1(a);
3646     aSig0 = extractFloat128Frac0(a);
3647     aExp = extractFloat128Exp(a);
3648     bSig1 = extractFloat128Frac1(b);
3649     bSig0 = extractFloat128Frac0(b);
3650     bExp = extractFloat128Exp(b);
3651 
3652     expDiff = aExp - bExp;
3653     shortShift128Left(aSig0, aSig1, 14, &aSig0, &aSig1);
3654     shortShift128Left(bSig0, bSig1, 14, &bSig0, &bSig1);
3655     if (0 < expDiff) goto aExpBigger;
3656     if (expDiff < 0) goto bExpBigger;
3657     if (aExp == 0x7FFF) {
3658         if (aSig0 | aSig1 | bSig0 | bSig1)
3659             return propagateFloat128NaN(a, b, status);
3660 
3661         float_raise(status, float_flag_invalid);
3662         return float128_default_nan;
3663     }
3664     if (aExp == 0) {
3665         aExp = 1;
3666         bExp = 1;
3667     }
3668     if (bSig0 < aSig0) goto aBigger;
3669     if (aSig0 < bSig0) goto bBigger;
3670     if (bSig1 < aSig1) goto aBigger;
3671     if (aSig1 < bSig1) goto bBigger;
3672     return packFloat128(0, 0);
3673 
3674  bExpBigger:
3675     if (bExp == 0x7FFF) {
3676         if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3677         return packFloat128(zSign ^ 1, 0x7FFF, 0, 0);
3678     }
3679     if (aExp == 0) ++expDiff;
3680     else {
3681         aSig0 |= BX_CONST64(0x4000000000000000);
3682     }
3683     shift128RightJamming(aSig0, aSig1, - expDiff, &aSig0, &aSig1);
3684     bSig0 |= BX_CONST64(0x4000000000000000);
3685  bBigger:
3686     sub128(bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1);
3687     zExp = bExp;
3688     zSign ^= 1;
3689     goto normalizeRoundAndPack;
3690  aExpBigger:
3691     if (aExp == 0x7FFF) {
3692         if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3693         return a;
3694     }
3695     if (bExp == 0) --expDiff;
3696     else {
3697         bSig0 |= BX_CONST64(0x4000000000000000);
3698     }
3699     shift128RightJamming(bSig0, bSig1, expDiff, &bSig0, &bSig1);
3700     aSig0 |= BX_CONST64(0x4000000000000000);
3701  aBigger:
3702     sub128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3703     zExp = aExp;
3704  normalizeRoundAndPack:
3705     --zExp;
3706     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, status);
3707 }
3708 
3709 /*----------------------------------------------------------------------------
3710 | Returns the result of adding the quadruple-precision floating-point values
3711 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3712 | for Binary Floating-Point Arithmetic.
3713 *----------------------------------------------------------------------------*/
3714 
float128_add(float128 a,float128 b,float_status_t & status)3715 float128 float128_add(float128 a, float128 b, float_status_t &status)
3716 {
3717     int aSign = extractFloat128Sign(a);
3718     int bSign = extractFloat128Sign(b);
3719 
3720     if (aSign == bSign) {
3721         return addFloat128Sigs(a, b, aSign, status);
3722     }
3723     else {
3724         return subFloat128Sigs(a, b, aSign, status);
3725     }
3726 }
3727 
3728 /*----------------------------------------------------------------------------
3729 | Returns the result of subtracting the quadruple-precision floating-point
3730 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3731 | Standard for Binary Floating-Point Arithmetic.
3732 *----------------------------------------------------------------------------*/
3733 
float128_sub(float128 a,float128 b,float_status_t & status)3734 float128 float128_sub(float128 a, float128 b, float_status_t &status)
3735 {
3736     int aSign = extractFloat128Sign(a);
3737     int bSign = extractFloat128Sign(b);
3738 
3739     if (aSign == bSign) {
3740         return subFloat128Sigs(a, b, aSign, status);
3741     }
3742     else {
3743         return addFloat128Sigs(a, b, aSign, status);
3744     }
3745 }
3746 
3747 /*----------------------------------------------------------------------------
3748 | Returns the result of multiplying the quadruple-precision floating-point
3749 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3750 | Standard for Binary Floating-Point Arithmetic.
3751 *----------------------------------------------------------------------------*/
3752 
float128_mul(float128 a,float128 b,float_status_t & status)3753 float128 float128_mul(float128 a, float128 b, float_status_t &status)
3754 {
3755     int aSign, bSign, zSign;
3756     Bit32s aExp, bExp, zExp;
3757     Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
3758 
3759     aSig1 = extractFloat128Frac1(a);
3760     aSig0 = extractFloat128Frac0(a);
3761     aExp = extractFloat128Exp(a);
3762     aSign = extractFloat128Sign(a);
3763     bSig1 = extractFloat128Frac1(b);
3764     bSig0 = extractFloat128Frac0(b);
3765     bExp = extractFloat128Exp(b);
3766     bSign = extractFloat128Sign(b);
3767 
3768     zSign = aSign ^ bSign;
3769     if (aExp == 0x7FFF) {
3770         if ((aSig0 | aSig1) || ((bExp == 0x7FFF) && (bSig0 | bSig1))) {
3771             return propagateFloat128NaN(a, b, status);
3772         }
3773         if ((bExp | bSig0 | bSig1) == 0) {
3774             float_raise(status, float_flag_invalid);
3775             return float128_default_nan;
3776         }
3777         return packFloat128(zSign, 0x7FFF, 0, 0);
3778     }
3779     if (bExp == 0x7FFF) {
3780         if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3781         if ((aExp | aSig0 | aSig1) == 0) {
3782             float_raise(status, float_flag_invalid);
3783             return float128_default_nan;
3784         }
3785         return packFloat128(zSign, 0x7FFF, 0, 0);
3786     }
3787     if (aExp == 0) {
3788         if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3789         float_raise(status, float_flag_denormal);
3790         normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3791     }
3792     if (bExp == 0) {
3793         if ((bSig0 | bSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3794         float_raise(status, float_flag_denormal);
3795         normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3796     }
3797     zExp = aExp + bExp - 0x4000;
3798     aSig0 |= BX_CONST64(0x0001000000000000);
3799     shortShift128Left(bSig0, bSig1, 16, &bSig0, &bSig1);
3800     mul128To256(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3);
3801     add128(zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1);
3802     zSig2 |= (zSig3 != 0);
3803     if (BX_CONST64(0x0002000000000000) <= zSig0) {
3804         shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2);
3805         ++zExp;
3806     }
3807     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3808 }
3809 
3810 /*----------------------------------------------------------------------------
3811 | Returns the result of dividing the quadruple-precision floating-point value
3812 | `a' by the corresponding value `b'.  The operation is performed according to
3813 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3814 *----------------------------------------------------------------------------*/
3815 
float128_div(float128 a,float128 b,float_status_t & status)3816 float128 float128_div(float128 a, float128 b, float_status_t &status)
3817 {
3818     int aSign, bSign, zSign;
3819     Bit32s aExp, bExp, zExp;
3820     Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
3821     Bit64u rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3822 
3823     aSig1 = extractFloat128Frac1(a);
3824     aSig0 = extractFloat128Frac0(a);
3825     aExp = extractFloat128Exp(a);
3826     aSign = extractFloat128Sign(a);
3827     bSig1 = extractFloat128Frac1(b);
3828     bSig0 = extractFloat128Frac0(b);
3829     bExp = extractFloat128Exp(b);
3830     bSign = extractFloat128Sign(b);
3831 
3832     zSign = aSign ^ bSign;
3833     if (aExp == 0x7FFF) {
3834         if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3835         if (bExp == 0x7FFF) {
3836             if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3837             float_raise(status, float_flag_invalid);
3838             return float128_default_nan;
3839         }
3840         return packFloat128(zSign, 0x7FFF, 0, 0);
3841     }
3842     if (bExp == 0x7FFF) {
3843         if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3844         return packFloat128(zSign, 0, 0, 0);
3845     }
3846     if (bExp == 0) {
3847         if ((bSig0 | bSig1) == 0) {
3848             if ((aExp | aSig0 | aSig1) == 0) {
3849                 float_raise(status, float_flag_invalid);
3850                 return float128_default_nan;
3851             }
3852             float_raise(status, float_flag_divbyzero);
3853             return packFloat128(zSign, 0x7FFF, 0, 0);
3854         }
3855         float_raise(status, float_flag_denormal);
3856         normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3857     }
3858     if (aExp == 0) {
3859         if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3860         float_raise(status, float_flag_denormal);
3861         normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3862     }
3863     zExp = aExp - bExp + 0x3FFD;
3864     shortShift128Left(
3865         aSig0 | BX_CONST64(0x0001000000000000), aSig1, 15, &aSig0, &aSig1);
3866     shortShift128Left(
3867         bSig0 | BX_CONST64(0x0001000000000000), bSig1, 15, &bSig0, &bSig1);
3868     if (le128(bSig0, bSig1, aSig0, aSig1)) {
3869         shift128Right(aSig0, aSig1, 1, &aSig0, &aSig1);
3870         ++zExp;
3871     }
3872     zSig0 = estimateDiv128To64(aSig0, aSig1, bSig0);
3873     mul128By64To192(bSig0, bSig1, zSig0, &term0, &term1, &term2);
3874     sub192(aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2);
3875     while ((Bit64s) rem0 < 0) {
3876         --zSig0;
3877         add192(rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2);
3878     }
3879     zSig1 = estimateDiv128To64(rem1, rem2, bSig0);
3880     if ((zSig1 & 0x3FFF) <= 4) {
3881         mul128By64To192(bSig0, bSig1, zSig1, &term1, &term2, &term3);
3882         sub192(rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3);
3883         while ((Bit64s) rem1 < 0) {
3884             --zSig1;
3885             add192(rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3);
3886         }
3887         zSig1 |= ((rem1 | rem2 | rem3) != 0);
3888     }
3889     shift128ExtraRightJamming(zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2);
3890     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3891 }
3892 
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of converting the 64-bit two's complement integer `a' to
3895 | the quadruple-precision floating-point format.  The conversion is performed
3896 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3897 *----------------------------------------------------------------------------*/
3898 
int64_to_float128(Bit64s a)3899 float128 int64_to_float128(Bit64s a)
3900 {
3901     Bit64u zSig0, zSig1;
3902 
3903     if (a == 0) return packFloat128(0, 0, 0, 0);
3904     int zSign = (a < 0);
3905     Bit64u absA = zSign ? - a : a;
3906     Bit8u shiftCount = countLeadingZeros64(absA) + 49;
3907     Bit32s zExp = 0x406E - shiftCount;
3908     if (64 <= shiftCount) {
3909         zSig1 = 0;
3910         zSig0 = absA;
3911         shiftCount -= 64;
3912     }
3913     else {
3914         zSig1 = absA;
3915         zSig0 = 0;
3916     }
3917     shortShift128Left(zSig0, zSig1, shiftCount, &zSig0, &zSig1);
3918     return packFloat128(zSign, zExp, zSig0, zSig1);
3919 }
3920 
3921 #endif
3922