1 /*============================================================================
2 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
3 Package, Release 2b.
4 
5 Written by John R. Hauser.  This work was made possible in part by the
6 International Computer Science Institute, located at Suite 600, 1947 Center
7 Street, Berkeley, California 94704.  Funding was partially provided by the
8 National Science Foundation under grant MIP-9311980.  The original version
9 of this code was written as part of a project to build a fixed-point vector
10 processor in collaboration with the University of California at Berkeley,
11 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
12 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
13 arithmetic/SoftFloat.html'.
14 
15 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
16 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
17 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
18 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
19 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
20 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
21 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
22 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
23 
24 Derivative works are acceptable, even for commercial purposes, so long as
25 (1) the source code for the derivative work includes prominent notice that
26 the work is derivative, and (2) the source code includes prominent notice with
27 these four paragraphs for those parts of this code that are retained.
28 =============================================================================*/
29 
30 #define FLOAT128
31 
32 /*============================================================================
33  * Adapted for Bochs (x86 achitecture simulator) by
34  *            Stanislav Shwartsman [sshwarts at sourceforge net]
35  * ==========================================================================*/
36 
37 #include "softfloat.h"
38 #include "softfloat-round-pack.h"
39 
40 /*----------------------------------------------------------------------------
41 | Primitive arithmetic functions, including multi-word arithmetic, and
42 | division and square root approximations. (Can be specialized to target
43 | if desired).
44 *----------------------------------------------------------------------------*/
45 #define USE_estimateDiv128To64
46 #define USE_estimateSqrt32
47 #include "softfloat-macros.h"
48 
49 /*----------------------------------------------------------------------------
50 | Functions and definitions to determine:  (1) whether tininess for underflow
51 | is detected before or after rounding by default, (2) what (if anything)
52 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
53 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
54 | are propagated from function inputs to output.  These details are target-
55 | specific.
56 *----------------------------------------------------------------------------*/
57 #include "softfloat-specialize.h"
58 
59 /*----------------------------------------------------------------------------
60 | Returns the result of converting the 32-bit two's complement integer `a'
61 | to the single-precision floating-point format.  The conversion is performed
62 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
63 *----------------------------------------------------------------------------*/
64 
int32_to_float32(Bit32s a,float_status_t & status)65 float32 int32_to_float32(Bit32s a, float_status_t &status)
66 {
67     if (a == 0) return 0;
68     if (a == (Bit32s) 0x80000000) return packFloat32(1, 0x9E, 0);
69     int zSign = (a < 0);
70     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
71 }
72 
73 /*----------------------------------------------------------------------------
74 | Returns the result of converting the 32-bit two's complement integer `a'
75 | to the double-precision floating-point format.  The conversion is performed
76 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
77 *----------------------------------------------------------------------------*/
78 
int32_to_float64(Bit32s a)79 float64 int32_to_float64(Bit32s a)
80 {
81     if (a == 0) return 0;
82     int zSign = (a < 0);
83     Bit32u absA = zSign ? -a : a;
84     int shiftCount = countLeadingZeros32(absA) + 21;
85     Bit64u zSig = absA;
86     return packFloat64(zSign, 0x432 - shiftCount, zSig<<shiftCount);
87 }
88 
89 /*----------------------------------------------------------------------------
90 | Returns the result of converting the 64-bit two's complement integer `a'
91 | to the single-precision floating-point format.  The conversion is performed
92 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
93 *----------------------------------------------------------------------------*/
94 
int64_to_float32(Bit64s a,float_status_t & status)95 float32 int64_to_float32(Bit64s a, float_status_t &status)
96 {
97     if (a == 0) return 0;
98     int zSign = (a < 0);
99     Bit64u absA = zSign ? -a : a;
100     int shiftCount = countLeadingZeros64(absA) - 40;
101     if (0 <= shiftCount) {
102         return packFloat32(zSign, 0x95 - shiftCount, (Bit32u)(absA<<shiftCount));
103     }
104     else {
105         shiftCount += 7;
106         if (shiftCount < 0) {
107             absA = shift64RightJamming(absA, -shiftCount);
108         }
109         else {
110             absA <<= shiftCount;
111         }
112         return roundAndPackFloat32(zSign, 0x9C - shiftCount, (Bit32u) absA, status);
113     }
114 }
115 
116 /*----------------------------------------------------------------------------
117 | Returns the result of converting the 64-bit two's complement integer `a'
118 | to the double-precision floating-point format.  The conversion is performed
119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
120 *----------------------------------------------------------------------------*/
121 
int64_to_float64(Bit64s a,float_status_t & status)122 float64 int64_to_float64(Bit64s a, float_status_t &status)
123 {
124     if (a == 0) return 0;
125     if (a == (Bit64s) BX_CONST64(0x8000000000000000)) {
126         return packFloat64(1, 0x43E, 0);
127     }
128     int zSign = (a < 0);
129     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
130 }
131 
132 /*----------------------------------------------------------------------------
133 | Returns the result of converting the 32-bit unsigned integer `a' to the
134 | single-precision floating-point format.  The conversion is performed
135 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
136 *----------------------------------------------------------------------------*/
137 
uint32_to_float32(Bit32u a,float_status_t & status)138 float32 uint32_to_float32(Bit32u a, float_status_t &status)
139 {
140     if (a == 0) return 0;
141     if (a & 0x80000000) return normalizeRoundAndPackFloat32(0, 0x9D, a >> 1, status);
142     return normalizeRoundAndPackFloat32(0, 0x9C, a, status);
143 }
144 
145 /*----------------------------------------------------------------------------
146 | Returns the result of converting the 32-bit unsigned integer `a' to the
147 | double-precision floating-point format.  The conversion is performed
148 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
149 *----------------------------------------------------------------------------*/
150 
uint32_to_float64(Bit32u a)151 float64 uint32_to_float64(Bit32u a)
152 {
153    if (a == 0) return 0;
154    int shiftCount = countLeadingZeros32(a) + 21;
155    Bit64u zSig = a;
156    return packFloat64(0, 0x432 - shiftCount, zSig<<shiftCount);
157 }
158 
159 /*----------------------------------------------------------------------------
160 | Returns the result of converting the 64-bit unsigned integer integer `a'
161 | to the single-precision floating-point format.  The conversion is performed
162 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
163 *----------------------------------------------------------------------------*/
164 
uint64_to_float32(Bit64u a,float_status_t & status)165 float32 uint64_to_float32(Bit64u a, float_status_t &status)
166 {
167     if (a == 0) return 0;
168     int shiftCount = countLeadingZeros64(a) - 40;
169     if (0 <= shiftCount) {
170         return packFloat32(0, 0x95 - shiftCount, (Bit32u)(a<<shiftCount));
171     }
172     else {
173         shiftCount += 7;
174         if (shiftCount < 0) {
175             a = shift64RightJamming(a, -shiftCount);
176         }
177         else {
178             a <<= shiftCount;
179         }
180         return roundAndPackFloat32(0, 0x9C - shiftCount, (Bit32u) a, status);
181     }
182 }
183 
184 /*----------------------------------------------------------------------------
185 | Returns the result of converting the 64-bit unsigned integer integer `a'
186 | to the double-precision floating-point format.  The conversion is performed
187 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
188 *----------------------------------------------------------------------------*/
189 
uint64_to_float64(Bit64u a,float_status_t & status)190 float64 uint64_to_float64(Bit64u a, float_status_t &status)
191 {
192     if (a == 0) return 0;
193     if (a & BX_CONST64(0x8000000000000000))
194         return normalizeRoundAndPackFloat64(0, 0x43D, a >> 1, status);
195     return normalizeRoundAndPackFloat64(0, 0x43C, a, status);
196 }
197 
198 /*----------------------------------------------------------------------------
199 | Returns the result of converting the single-precision floating-point value
200 | `a' to the 32-bit two's complement integer format.  The conversion is
201 | performed according to the IEC/IEEE Standard for Binary Floating-Point
202 | Arithmetic - which means in particular that the conversion is rounded
203 | according to the current rounding mode.  If `a' is a NaN or the
204 | conversion overflows the integer indefinite value is returned.
205 *----------------------------------------------------------------------------*/
206 
float32_to_int32(float32 a,float_status_t & status)207 Bit32s float32_to_int32(float32 a, float_status_t &status)
208 {
209     Bit32u aSig = extractFloat32Frac(a);
210     Bit16s aExp = extractFloat32Exp(a);
211     int aSign = extractFloat32Sign(a);
212     if ((aExp == 0xFF) && aSig) aSign = 0;
213     if (aExp) aSig |= 0x00800000;
214     else {
215         if (get_denormals_are_zeros(status)) aSig = 0;
216     }
217     int shiftCount = 0xAF - aExp;
218     Bit64u aSig64 = Bit64u(aSig) << 32;
219     if (0 < shiftCount) aSig64 = shift64RightJamming(aSig64, shiftCount);
220     return roundAndPackInt32(aSign, aSig64, status);
221 }
222 
223 /*----------------------------------------------------------------------------
224 | Returns the result of converting the single-precision floating-point value
225 | `a' to the 32-bit two's complement integer format.  The conversion is
226 | performed according to the IEC/IEEE Standard for Binary Floating-Point
227 | Arithmetic, except that the conversion is always rounded toward zero.
228 | If `a' is a NaN or the conversion overflows, the integer indefinite
229 | value is returned.
230 *----------------------------------------------------------------------------*/
231 
float32_to_int32_round_to_zero(float32 a,float_status_t & status)232 Bit32s float32_to_int32_round_to_zero(float32 a, float_status_t &status)
233 {
234     int aSign;
235     Bit16s aExp;
236     Bit32u aSig;
237     Bit32s z;
238 
239     aSig = extractFloat32Frac(a);
240     aExp = extractFloat32Exp(a);
241     aSign = extractFloat32Sign(a);
242     int shiftCount = aExp - 0x9E;
243     if (0 <= shiftCount) {
244         if (a != 0xCF000000) {
245             float_raise(status, float_flag_invalid);
246         }
247         return (Bit32s)(int32_indefinite);
248     }
249     else if (aExp <= 0x7E) {
250         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
251         if (aExp | aSig) float_raise(status, float_flag_inexact);
252         return 0;
253     }
254     aSig = (aSig | 0x800000)<<8;
255     z = aSig>>(-shiftCount);
256     if ((Bit32u) (aSig<<(shiftCount & 31))) {
257         float_raise(status, float_flag_inexact);
258     }
259     if (aSign) z = -z;
260     return z;
261 }
262 
263 /*----------------------------------------------------------------------------
264 | Returns the result of converting the single-precision floating-point value
265 | `a' to the 32-bit unsigned integer format.  The conversion is performed
266 | according to the IEC/IEEE Standard for Binary Floating-point Arithmetic,
267 | except that the conversion is always rounded toward zero.  If `a' is a NaN
268 | or conversion overflows, the largest positive integer is returned.
269 *----------------------------------------------------------------------------*/
270 
float32_to_uint32_round_to_zero(float32 a,float_status_t & status)271 Bit32u float32_to_uint32_round_to_zero(float32 a, float_status_t &status)
272 {
273     int aSign;
274     Bit16s aExp;
275     Bit32u aSig;
276 
277     aSig = extractFloat32Frac(a);
278     aExp = extractFloat32Exp(a);
279     aSign = extractFloat32Sign(a);
280     int shiftCount = aExp - 0x9E;
281 
282     if (aExp <= 0x7E) {
283         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
284         if (aExp | aSig) float_raise(status, float_flag_inexact);
285         return 0;
286     }
287     else if (0 < shiftCount || aSign) {
288         float_raise(status, float_flag_invalid);
289         return uint32_indefinite;
290     }
291 
292     aSig = (aSig | 0x800000)<<8;
293     Bit32u z = aSig >> (-shiftCount);
294     if (aSig << (shiftCount & 31)) {
295         float_raise(status, float_flag_inexact);
296     }
297     return z;
298 }
299 
300 /*----------------------------------------------------------------------------
301 | Returns the result of converting the single-precision floating-point value
302 | `a' to the 64-bit two's complement integer format.  The conversion is
303 | performed according to the IEC/IEEE Standard for Binary Floating-Point
304 | Arithmetic - which means in particular that the conversion is rounded
305 | according to the current rounding mode. If `a' is a NaN or the
306 | conversion overflows, the integer indefinite value is returned.
307 *----------------------------------------------------------------------------*/
308 
float32_to_int64(float32 a,float_status_t & status)309 Bit64s float32_to_int64(float32 a, float_status_t &status)
310 {
311     Bit64u aSig64, aSigExtra;
312 
313     Bit32u aSig = extractFloat32Frac(a);
314     Bit16s aExp = extractFloat32Exp(a);
315     int aSign = extractFloat32Sign(a);
316 
317     int shiftCount = 0xBE - aExp;
318     if (shiftCount < 0) {
319         float_raise(status, float_flag_invalid);
320         return (Bit64s)(int64_indefinite);
321     }
322     if (aExp) aSig |= 0x00800000;
323     else {
324         if (get_denormals_are_zeros(status)) aSig = 0;
325     }
326     aSig64 = aSig;
327     aSig64 <<= 40;
328     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
329     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
330 }
331 
332 /*----------------------------------------------------------------------------
333 | Returns the result of converting the single-precision floating-point value
334 | `a' to the 64-bit two's complement integer format.  The conversion is
335 | performed according to the IEC/IEEE Standard for Binary Floating-Point
336 | Arithmetic, except that the conversion is always rounded toward zero.
337 | If `a' is a NaN or the conversion overflows, the integer indefinite
338 | value is returned.
339 *----------------------------------------------------------------------------*/
340 
float32_to_int64_round_to_zero(float32 a,float_status_t & status)341 Bit64s float32_to_int64_round_to_zero(float32 a, float_status_t &status)
342 {
343     int aSign;
344     Bit16s aExp;
345     Bit32u aSig;
346     Bit64u aSig64;
347     Bit64s z;
348 
349     aSig = extractFloat32Frac(a);
350     aExp = extractFloat32Exp(a);
351     aSign = extractFloat32Sign(a);
352     int shiftCount = aExp - 0xBE;
353     if (0 <= shiftCount) {
354         if (a != 0xDF000000) {
355             float_raise(status, float_flag_invalid);
356         }
357         return (Bit64s)(int64_indefinite);
358     }
359     else if (aExp <= 0x7E) {
360         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
361         if (aExp | aSig) float_raise(status, float_flag_inexact);
362         return 0;
363     }
364     aSig64 = aSig | 0x00800000;
365     aSig64 <<= 40;
366     z = aSig64>>(-shiftCount);
367     if ((Bit64u) (aSig64<<(shiftCount & 63))) {
368         float_raise(status, float_flag_inexact);
369     }
370     if (aSign) z = -z;
371     return z;
372 }
373 
374 /*----------------------------------------------------------------------------
375 | Returns the result of converting the single-precision floating-point value
376 | `a' to the 64-bit unsigned integer format.  The conversion is performed
377 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
378 | except that the conversion is always rounded toward zero. If `a' is a NaN
379 | or the conversion overflows, the largest unsigned integer is returned.
380 *----------------------------------------------------------------------------*/
381 
float32_to_uint64_round_to_zero(float32 a,float_status_t & status)382 Bit64u float32_to_uint64_round_to_zero(float32 a, float_status_t &status)
383 {
384     int aSign;
385     Bit16s aExp;
386     Bit32u aSig;
387     Bit64u aSig64;
388 
389     aSig = extractFloat32Frac(a);
390     aExp = extractFloat32Exp(a);
391     aSign = extractFloat32Sign(a);
392     int shiftCount = aExp - 0xBE;
393 
394     if (aExp <= 0x7E) {
395         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
396         if (aExp | aSig) float_raise(status, float_flag_inexact);
397         return 0;
398     }
399     else if (0 < shiftCount || aSign) {
400         float_raise(status, float_flag_invalid);
401         return uint64_indefinite;
402     }
403 
404     aSig64 = aSig | 0x00800000;
405     aSig64 <<= 40;
406     Bit64u z = aSig64>>(-shiftCount);
407     if ((Bit64u) (aSig64<<(shiftCount & 63))) {
408         float_raise(status, float_flag_inexact);
409     }
410     return z;
411 }
412 
413 /*----------------------------------------------------------------------------
414 | Returns the result of converting the single-precision floating-point value
415 | `a' to the 64-bit unsigned integer format.  The conversion is
416 | performed according to the IEC/IEEE Standard for Binary Floating-Point
417 | Arithmetic---which means in particular that the conversion is rounded
418 | according to the current rounding mode.  If `a' is a NaN or the conversion
419 | overflows, the largest unsigned integer is returned.
420 *----------------------------------------------------------------------------*/
421 
float32_to_uint64(float32 a,float_status_t & status)422 Bit64u float32_to_uint64(float32 a, float_status_t &status)
423 {
424     int aSign;
425     Bit16s aExp, shiftCount;
426     Bit32u aSig;
427     Bit64u aSig64, aSigExtra;
428 
429     aSig = extractFloat32Frac(a);
430     aExp = extractFloat32Exp(a);
431     aSign = extractFloat32Sign(a);
432 
433     if (get_denormals_are_zeros(status)) {
434         if (aExp == 0) aSig = 0;
435     }
436 
437     if ((aSign) && (aExp > 0x7E)) {
438         float_raise(status, float_flag_invalid);
439         return uint64_indefinite;
440     }
441 
442     shiftCount = 0xBE - aExp;
443     if (aExp) aSig |= 0x00800000;
444 
445     if (shiftCount < 0) {
446         float_raise(status, float_flag_invalid);
447         return uint64_indefinite;
448     }
449 
450     aSig64 = aSig;
451     aSig64 <<= 40;
452     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
453     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
454 }
455 
456 /*----------------------------------------------------------------------------
457 | Returns the result of converting the single-precision floating-point value
458 | `a' to the 32-bit unsigned integer format.  The conversion is
459 | performed according to the IEC/IEEE Standard for Binary Floating-Point
460 | Arithmetic---which means in particular that the conversion is rounded
461 | according to the current rounding mode.  If `a' is a NaN or the conversion
462 | overflows, the largest unsigned integer is returned.
463 *----------------------------------------------------------------------------*/
464 
float32_to_uint32(float32 a,float_status_t & status)465 Bit32u float32_to_uint32(float32 a, float_status_t &status)
466 {
467     Bit64u val_64 = float32_to_uint64(a, status);
468 
469     if (val_64 > 0xffffffff) {
470         status.float_exception_flags = float_flag_invalid; // throw away other flags
471         return uint32_indefinite;
472     }
473 
474     return (Bit32u) val_64;
475 }
476 
477 /*----------------------------------------------------------------------------
478 | Returns the result of converting the single-precision floating-point value
479 | `a' to the double-precision floating-point format.  The conversion is
480 | performed according to the IEC/IEEE Standard for Binary Floating-Point
481 | Arithmetic.
482 *----------------------------------------------------------------------------*/
483 
float32_to_float64(float32 a,float_status_t & status)484 float64 float32_to_float64(float32 a, float_status_t &status)
485 {
486     Bit32u aSig = extractFloat32Frac(a);
487     Bit16s aExp = extractFloat32Exp(a);
488     int  aSign = extractFloat32Sign(a);
489 
490     if (aExp == 0xFF) {
491         if (aSig) return commonNaNToFloat64(float32ToCommonNaN(a, status));
492         return packFloat64(aSign, 0x7FF, 0);
493     }
494     if (aExp == 0) {
495         if (aSig == 0 || get_denormals_are_zeros(status))
496             return packFloat64(aSign, 0, 0);
497 
498         float_raise(status, float_flag_denormal);
499         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
500         --aExp;
501     }
502     return packFloat64(aSign, aExp + 0x380, ((Bit64u) aSig)<<29);
503 }
504 
505 /*----------------------------------------------------------------------------
506 | Rounds the single-precision floating-point value `a' to an integer, and
507 | returns the result as a single-precision floating-point value.  The
508 | operation is performed according to the IEC/IEEE Standard for Binary
509 | Floating-Point Arithmetic.
510 *----------------------------------------------------------------------------*/
511 
float32_round_to_int(float32 a,Bit8u scale,float_status_t & status)512 float32 float32_round_to_int(float32 a, Bit8u scale, float_status_t &status)
513 {
514     Bit32u lastBitMask, roundBitsMask;
515     int roundingMode = get_float_rounding_mode(status);
516     Bit16s aExp = extractFloat32Exp(a);
517     scale &= 0xf;
518 
519     if ((aExp == 0xFF) && extractFloat32Frac(a)) {
520         return propagateFloat32NaN(a, status);
521     }
522 
523     aExp += scale; // scale the exponent
524 
525     if (0x96 <= aExp) {
526         return a;
527     }
528 
529     if (get_denormals_are_zeros(status)) {
530         a = float32_denormal_to_zero(a);
531     }
532 
533     if (aExp <= 0x7E) {
534         if ((Bit32u) (a<<1) == 0) return a;
535         float_raise(status, float_flag_inexact);
536         int aSign = extractFloat32Sign(a);
537         switch (roundingMode) {
538          case float_round_nearest_even:
539             if ((aExp == 0x7E) && extractFloat32Frac(a)) {
540                 return packFloat32(aSign, 0x7F - scale, 0);
541             }
542             break;
543          case float_round_down:
544             return aSign ? packFloat32(1, 0x7F - scale, 0) : float32_positive_zero;
545          case float_round_up:
546             return aSign ? float32_negative_zero : packFloat32(0, 0x7F - scale, 0);
547         }
548         return packFloat32(aSign, 0, 0);
549     }
550 
551     lastBitMask = 1;
552     lastBitMask <<= 0x96 - aExp;
553     roundBitsMask = lastBitMask - 1;
554     float32 z = a;
555     if (roundingMode == float_round_nearest_even) {
556         z += lastBitMask>>1;
557         if ((z & roundBitsMask) == 0) z &= ~lastBitMask;
558     }
559     else if (roundingMode != float_round_to_zero) {
560         if (extractFloat32Sign(z) ^ (roundingMode == float_round_up)) {
561             z += roundBitsMask;
562         }
563     }
564     z &= ~roundBitsMask;
565     if (z != a) float_raise(status, float_flag_inexact);
566     return z;
567 }
568 
569 /*----------------------------------------------------------------------------
570 | Extracts the fractional portion of single-precision floating-point value `a',
571 | and returns the result  as a  single-precision  floating-point value. The
572 | fractional results are precise. The operation is performed according to the
573 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
574 *----------------------------------------------------------------------------*/
575 
float32_frc(float32 a,float_status_t & status)576 float32 float32_frc(float32 a, float_status_t &status)
577 {
578     int roundingMode = get_float_rounding_mode(status);
579 
580     Bit16s aExp = extractFloat32Exp(a);
581     Bit32u aSig = extractFloat32Frac(a);
582     int aSign = extractFloat32Sign(a);
583 
584     if (aExp == 0xFF) {
585         if (aSig) return propagateFloat32NaN(a, status);
586         float_raise(status, float_flag_invalid);
587         return float32_default_nan;
588     }
589 
590     if (aExp >= 0x96) {
591         return packFloat32(roundingMode == float_round_down, 0, 0);
592     }
593 
594     if (aExp < 0x7F) {
595         if (aExp == 0) {
596             if (aSig == 0 || get_denormals_are_zeros(status))
597                 return packFloat32(roundingMode == float_round_down, 0, 0);
598 
599             float_raise(status, float_flag_denormal);
600             if (! float_exception_masked(status, float_flag_underflow))
601                 float_raise(status, float_flag_underflow);
602 
603             if(get_flush_underflow_to_zero(status)) {
604                 float_raise(status, float_flag_underflow | float_flag_inexact);
605                 return packFloat32(aSign, 0, 0);
606             }
607         }
608         return a;
609     }
610 
611     Bit32u lastBitMask = 1 << (0x96 - aExp);
612     Bit32u roundBitsMask = lastBitMask - 1;
613 
614     aSig &= roundBitsMask;
615     aSig <<= 7;
616     aExp--;
617 
618     if (aSig == 0)
619        return packFloat32(roundingMode == float_round_down, 0, 0);
620 
621     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
622 }
623 
624 /*----------------------------------------------------------------------------
625 | Extracts the exponent portion of single-precision floating-point value 'a',
626 | and returns the result as a single-precision floating-point value
627 | representing unbiased integer exponent. The operation is performed according
628 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
629 *----------------------------------------------------------------------------*/
630 
float32_getexp(float32 a,float_status_t & status)631 float32 float32_getexp(float32 a, float_status_t &status)
632 {
633     Bit16s aExp = extractFloat32Exp(a);
634     Bit32u aSig = extractFloat32Frac(a);
635 
636     if (aExp == 0xFF) {
637         if (aSig) return propagateFloat32NaN(a, status);
638         return float32_positive_inf;
639     }
640 
641     if (aExp == 0) {
642         if (aSig == 0 || get_denormals_are_zeros(status))
643             return float32_negative_inf;
644 
645         float_raise(status, float_flag_denormal);
646         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
647     }
648 
649     return int32_to_float32(aExp - 0x7F, status);
650 }
651 
652 /*----------------------------------------------------------------------------
653 | Extracts the mantissa of single-precision floating-point value 'a' and
654 | returns the result as a single-precision floating-point after applying
655 | the mantissa interval normalization and sign control. The operation is
656 | performed according to the IEC/IEEE Standard for Binary Floating-Point
657 | Arithmetic.
658 *----------------------------------------------------------------------------*/
659 
float32_getmant(float32 a,float_status_t & status,int sign_ctrl,int interv)660 float32 float32_getmant(float32 a, float_status_t &status, int sign_ctrl, int interv)
661 {
662     Bit16s aExp = extractFloat32Exp(a);
663     Bit32u aSig = extractFloat32Frac(a);
664     int aSign = extractFloat32Sign(a);
665 
666     if (aExp == 0xFF) {
667         if (aSig) return propagateFloat32NaN(a, status);
668         if (aSign) {
669             if (sign_ctrl & 0x2) {
670                 float_raise(status, float_flag_invalid);
671                 return float32_default_nan;
672             }
673         }
674         return packFloat32(~sign_ctrl & aSign, 0x7F, 0);
675     }
676 
677     if (aExp == 0 && (aSig == 0 || get_denormals_are_zeros(status))) {
678         return packFloat32(~sign_ctrl & aSign, 0x7F, 0);
679     }
680 
681     if (aSign) {
682         if (sign_ctrl & 0x2) {
683             float_raise(status, float_flag_invalid);
684             return float32_default_nan;
685         }
686     }
687 
688     if (aExp == 0) {
689         float_raise(status, float_flag_denormal);
690         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
691 //      aExp += 0x7E;
692         aSig &= 0x7FFFFF;
693     }
694 
695     switch(interv) {
696     case 0x0: // interval [1,2)
697         aExp = 0x7F;
698         break;
699     case 0x1: // interval [1/2,2)
700         aExp -= 0x7F;
701         aExp  = 0x7F - (aExp & 0x1);
702         break;
703     case 0x2: // interval [1/2,1)
704         aExp = 0x7E;
705         break;
706     case 0x3: // interval [3/4,3/2)
707         aExp = 0x7F - ((aSig >> 22) & 0x1);
708         break;
709     }
710 
711     return packFloat32(~sign_ctrl & aSign, aExp, aSig);
712 }
713 
714 /*----------------------------------------------------------------------------
715 | Return the result of a floating point scale of the single-precision floating
716 | point value `a' by multiplying it by 2 power of the single-precision
717 | floating point value 'b' converted to integral value. If the result cannot
718 | be represented in single precision, then the proper overflow response (for
719 | positive scaling operand), or the proper underflow response (for negative
720 | scaling operand) is issued. The operation is performed according to the
721 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
722 *----------------------------------------------------------------------------*/
723 
float32_scalef(float32 a,float32 b,float_status_t & status)724 float32 float32_scalef(float32 a, float32 b, float_status_t &status)
725 {
726     Bit32u aSig = extractFloat32Frac(a);
727     Bit16s aExp = extractFloat32Exp(a);
728     int aSign = extractFloat32Sign(a);
729     Bit32u bSig = extractFloat32Frac(b);
730     Bit16s bExp = extractFloat32Exp(b);
731     int bSign = extractFloat32Sign(b);
732 
733     if (get_denormals_are_zeros(status)) {
734         if (aExp == 0) aSig = 0;
735         if (bExp == 0) bSig = 0;
736     }
737 
738     if (bExp == 0xFF) {
739         if (bSig) return propagateFloat32NaN(a, b, status);
740     }
741 
742     if (aExp == 0xFF) {
743         if (aSig) {
744             int aIsSignalingNaN = (aSig & 0x00400000) == 0;
745             if (aIsSignalingNaN || bExp != 0xFF || bSig)
746                 return propagateFloat32NaN(a, b, status);
747 
748             return bSign ? 0 : float32_positive_inf;
749         }
750 
751         if (bExp == 0xFF && bSign) {
752             float_raise(status, float_flag_invalid);
753             return float32_default_nan;
754         }
755         return a;
756     }
757 
758     if (aExp == 0) {
759         if (aSig == 0) {
760             if (bExp == 0xFF && ! bSign) {
761                 float_raise(status, float_flag_invalid);
762                 return float32_default_nan;
763             }
764             return a;
765         }
766         float_raise(status, float_flag_denormal);
767     }
768 
769     if ((bExp | bSig) == 0) return a;
770 
771     if (bExp == 0xFF) {
772         if (bSign) return packFloat32(aSign, 0, 0);
773         return packFloat32(aSign, 0xFF, 0);
774     }
775 
776     if (bExp >= 0x8E) {
777         // handle obvious overflow/underflow result
778         return roundAndPackFloat32(aSign, bSign ? -0x7F : 0xFF, aSig, status);
779     }
780 
781     int scale = 0;
782 
783     if (bExp <= 0x7E) {
784         if (bExp == 0)
785             float_raise(status, float_flag_denormal);
786         scale = -bSign;
787     }
788     else {
789         int shiftCount = bExp - 0x9E;
790         bSig = (bSig | 0x800000)<<8;
791         scale = bSig>>(-shiftCount);
792 
793         if (bSign) {
794             if ((Bit32u) (bSig<<(shiftCount & 31))) scale++;
795             scale = -scale;
796         }
797 
798         if (scale >  0x200) scale =  0x200;
799         if (scale < -0x200) scale = -0x200;
800     }
801 
802     if (aExp != 0) {
803         aSig |= 0x00800000;
804     } else {
805         aExp++;
806     }
807 
808     aExp += scale - 1;
809     aSig <<= 7;
810     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
811 }
812 
813 /*----------------------------------------------------------------------------
814 | Returns the result of adding the absolute values of the single-precision
815 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
816 | before being returned.  `zSign' is ignored if the result is a NaN.
817 | The addition is performed according to the IEC/IEEE Standard for Binary
818 | Floating-Point Arithmetic.
819 *----------------------------------------------------------------------------*/
820 
addFloat32Sigs(float32 a,float32 b,int zSign,float_status_t & status)821 static float32 addFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status)
822 {
823     Bit16s aExp, bExp, zExp;
824     Bit32u aSig, bSig, zSig;
825     Bit16s expDiff;
826 
827     aSig = extractFloat32Frac(a);
828     aExp = extractFloat32Exp(a);
829     bSig = extractFloat32Frac(b);
830     bExp = extractFloat32Exp(b);
831 
832     if (get_denormals_are_zeros(status)) {
833         if (aExp == 0) aSig = 0;
834         if (bExp == 0) bSig = 0;
835     }
836 
837     expDiff = aExp - bExp;
838     aSig <<= 6;
839     bSig <<= 6;
840 
841     if (0 < expDiff) {
842         if (aExp == 0xFF) {
843             if (aSig) return propagateFloat32NaN(a, b, status);
844             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
845             return a;
846         }
847         if ((aExp == 0) && aSig)
848             float_raise(status, float_flag_denormal);
849 
850         if (bExp == 0) {
851             if (bSig) float_raise(status, float_flag_denormal);
852             --expDiff;
853         }
854         else bSig |= 0x20000000;
855 
856         bSig = shift32RightJamming(bSig, expDiff);
857         zExp = aExp;
858     }
859     else if (expDiff < 0) {
860         if (bExp == 0xFF) {
861             if (bSig) return propagateFloat32NaN(a, b, status);
862             if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
863             return packFloat32(zSign, 0xFF, 0);
864         }
865         if ((bExp == 0) && bSig)
866             float_raise(status, float_flag_denormal);
867 
868         if (aExp == 0) {
869             if (aSig) float_raise(status, float_flag_denormal);
870             ++expDiff;
871         }
872         else aSig |= 0x20000000;
873 
874         aSig = shift32RightJamming(aSig, -expDiff);
875         zExp = bExp;
876     }
877     else {
878         if (aExp == 0xFF) {
879             if (aSig | bSig) return propagateFloat32NaN(a, b, status);
880             return a;
881         }
882         if (aExp == 0) {
883             zSig = (aSig + bSig) >> 6;
884             if (aSig | bSig) {
885                 float_raise(status, float_flag_denormal);
886                 if (get_flush_underflow_to_zero(status) && (extractFloat32Frac(zSig) == zSig)) {
887                     float_raise(status, float_flag_underflow | float_flag_inexact);
888                     return packFloat32(zSign, 0, 0);
889                 }
890                 if (! float_exception_masked(status, float_flag_underflow)) {
891                     if (extractFloat32Frac(zSig) == zSig)
892                         float_raise(status, float_flag_underflow);
893                 }
894             }
895             return packFloat32(zSign, 0, zSig);
896         }
897         zSig = 0x40000000 + aSig + bSig;
898         return roundAndPackFloat32(zSign, aExp, zSig, status);
899     }
900     aSig |= 0x20000000;
901     zSig = (aSig + bSig)<<1;
902     --zExp;
903     if ((Bit32s) zSig < 0) {
904         zSig = aSig + bSig;
905         ++zExp;
906     }
907     return roundAndPackFloat32(zSign, zExp, zSig, status);
908 }
909 
910 /*----------------------------------------------------------------------------
911 | Returns the result of subtracting the absolute values of the single-
912 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
913 | difference is negated before being returned.  `zSign' is ignored if the
914 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
915 | Standard for Binary Floating-Point Arithmetic.
916 *----------------------------------------------------------------------------*/
917 
subFloat32Sigs(float32 a,float32 b,int zSign,float_status_t & status)918 static float32 subFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status)
919 {
920     Bit16s aExp, bExp, zExp;
921     Bit32u aSig, bSig, zSig;
922     Bit16s expDiff;
923 
924     aSig = extractFloat32Frac(a);
925     aExp = extractFloat32Exp(a);
926     bSig = extractFloat32Frac(b);
927     bExp = extractFloat32Exp(b);
928 
929     if (get_denormals_are_zeros(status)) {
930         if (aExp == 0) aSig = 0;
931         if (bExp == 0) bSig = 0;
932     }
933 
934     expDiff = aExp - bExp;
935     aSig <<= 7;
936     bSig <<= 7;
937     if (0 < expDiff) goto aExpBigger;
938     if (expDiff < 0) goto bExpBigger;
939     if (aExp == 0xFF) {
940         if (aSig | bSig) return propagateFloat32NaN(a, b, status);
941         float_raise(status, float_flag_invalid);
942         return float32_default_nan;
943     }
944     if (aExp == 0) {
945         if (aSig | bSig) float_raise(status, float_flag_denormal);
946         aExp = 1;
947         bExp = 1;
948     }
949     if (bSig < aSig) goto aBigger;
950     if (aSig < bSig) goto bBigger;
951     return packFloat32(get_float_rounding_mode(status) == float_round_down, 0, 0);
952  bExpBigger:
953     if (bExp == 0xFF) {
954         if (bSig) return propagateFloat32NaN(a, b, status);
955         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
956         return packFloat32(zSign ^ 1, 0xFF, 0);
957     }
958     if ((bExp == 0) && bSig)
959         float_raise(status, float_flag_denormal);
960 
961     if (aExp == 0) {
962         if (aSig) float_raise(status, float_flag_denormal);
963         ++expDiff;
964     }
965     else aSig |= 0x40000000;
966 
967     aSig = shift32RightJamming(aSig, -expDiff);
968     bSig |= 0x40000000;
969  bBigger:
970     zSig = bSig - aSig;
971     zExp = bExp;
972     zSign ^= 1;
973     goto normalizeRoundAndPack;
974  aExpBigger:
975     if (aExp == 0xFF) {
976         if (aSig) return propagateFloat32NaN(a, b, status);
977         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
978         return a;
979     }
980     if ((aExp == 0) && aSig)
981         float_raise(status, float_flag_denormal);
982 
983     if (bExp == 0) {
984         if (bSig) float_raise(status, float_flag_denormal);
985         --expDiff;
986     }
987     else bSig |= 0x40000000;
988 
989     bSig = shift32RightJamming(bSig, expDiff);
990     aSig |= 0x40000000;
991  aBigger:
992     zSig = aSig - bSig;
993     zExp = aExp;
994  normalizeRoundAndPack:
995     --zExp;
996     return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
997 }
998 
999 /*----------------------------------------------------------------------------
1000 | Returns the result of adding the single-precision floating-point values `a'
1001 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1002 | Binary Floating-Point Arithmetic.
1003 *----------------------------------------------------------------------------*/
1004 
float32_add(float32 a,float32 b,float_status_t & status)1005 float32 float32_add(float32 a, float32 b, float_status_t &status)
1006 {
1007     int aSign = extractFloat32Sign(a);
1008     int bSign = extractFloat32Sign(b);
1009 
1010     if (aSign == bSign) {
1011         return addFloat32Sigs(a, b, aSign, status);
1012     }
1013     else {
1014         return subFloat32Sigs(a, b, aSign, status);
1015     }
1016 }
1017 
1018 /*----------------------------------------------------------------------------
1019 | Returns the result of subtracting the single-precision floating-point values
1020 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1021 | for Binary Floating-Point Arithmetic.
1022 *----------------------------------------------------------------------------*/
1023 
float32_sub(float32 a,float32 b,float_status_t & status)1024 float32 float32_sub(float32 a, float32 b, float_status_t &status)
1025 {
1026     int aSign = extractFloat32Sign(a);
1027     int bSign = extractFloat32Sign(b);
1028 
1029     if (aSign == bSign) {
1030         return subFloat32Sigs(a, b, aSign, status);
1031     }
1032     else {
1033         return addFloat32Sigs(a, b, aSign, status);
1034     }
1035 }
1036 
1037 /*----------------------------------------------------------------------------
1038 | Returns the result of multiplying the single-precision floating-point values
1039 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1040 | for Binary Floating-Point Arithmetic.
1041 *----------------------------------------------------------------------------*/
1042 
float32_mul(float32 a,float32 b,float_status_t & status)1043 float32 float32_mul(float32 a, float32 b, float_status_t &status)
1044 {
1045     int aSign, bSign, zSign;
1046     Bit16s aExp, bExp, zExp;
1047     Bit32u aSig, bSig;
1048     Bit64u zSig64;
1049     Bit32u zSig;
1050 
1051     aSig = extractFloat32Frac(a);
1052     aExp = extractFloat32Exp(a);
1053     aSign = extractFloat32Sign(a);
1054     bSig = extractFloat32Frac(b);
1055     bExp = extractFloat32Exp(b);
1056     bSign = extractFloat32Sign(b);
1057     zSign = aSign ^ bSign;
1058 
1059     if (get_denormals_are_zeros(status)) {
1060         if (aExp == 0) aSig = 0;
1061         if (bExp == 0) bSig = 0;
1062     }
1063 
1064     if (aExp == 0xFF) {
1065         if (aSig || ((bExp == 0xFF) && bSig))
1066             return propagateFloat32NaN(a, b, status);
1067 
1068         if ((bExp | bSig) == 0) {
1069             float_raise(status, float_flag_invalid);
1070             return float32_default_nan;
1071         }
1072         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1073         return packFloat32(zSign, 0xFF, 0);
1074     }
1075     if (bExp == 0xFF) {
1076         if (bSig) return propagateFloat32NaN(a, b, status);
1077         if ((aExp | aSig) == 0) {
1078             float_raise(status, float_flag_invalid);
1079             return float32_default_nan;
1080         }
1081         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1082         return packFloat32(zSign, 0xFF, 0);
1083     }
1084     if (aExp == 0) {
1085         if (aSig == 0) {
1086             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1087             return packFloat32(zSign, 0, 0);
1088         }
1089         float_raise(status, float_flag_denormal);
1090         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
1091     }
1092     if (bExp == 0) {
1093         if (bSig == 0) return packFloat32(zSign, 0, 0);
1094         float_raise(status, float_flag_denormal);
1095         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
1096     }
1097     zExp = aExp + bExp - 0x7F;
1098     aSig = (aSig | 0x00800000)<<7;
1099     bSig = (bSig | 0x00800000)<<8;
1100     zSig64 = shift64RightJamming(((Bit64u) aSig) * bSig, 32);
1101     zSig = (Bit32u) zSig64;
1102     if (0 <= (Bit32s) (zSig<<1)) {
1103         zSig <<= 1;
1104         --zExp;
1105     }
1106     return roundAndPackFloat32(zSign, zExp, zSig, status);
1107 }
1108 
1109 /*----------------------------------------------------------------------------
1110 | Returns the result of dividing the single-precision floating-point value `a'
1111 | by the corresponding value `b'.  The operation is performed according to the
1112 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 *----------------------------------------------------------------------------*/
1114 
float32_div(float32 a,float32 b,float_status_t & status)1115 float32 float32_div(float32 a, float32 b, float_status_t &status)
1116 {
1117     int aSign, bSign, zSign;
1118     Bit16s aExp, bExp, zExp;
1119     Bit32u aSig, bSig, zSig;
1120 
1121     aSig = extractFloat32Frac(a);
1122     aExp = extractFloat32Exp(a);
1123     aSign = extractFloat32Sign(a);
1124     bSig = extractFloat32Frac(b);
1125     bExp = extractFloat32Exp(b);
1126     bSign = extractFloat32Sign(b);
1127     zSign = aSign ^ bSign;
1128 
1129     if (get_denormals_are_zeros(status)) {
1130         if (aExp == 0) aSig = 0;
1131         if (bExp == 0) bSig = 0;
1132     }
1133 
1134     if (aExp == 0xFF) {
1135         if (aSig) return propagateFloat32NaN(a, b, status);
1136         if (bExp == 0xFF) {
1137             if (bSig) return propagateFloat32NaN(a, b, status);
1138             float_raise(status, float_flag_invalid);
1139             return float32_default_nan;
1140         }
1141         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1142         return packFloat32(zSign, 0xFF, 0);
1143     }
1144     if (bExp == 0xFF) {
1145         if (bSig) return propagateFloat32NaN(a, b, status);
1146         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1147         return packFloat32(zSign, 0, 0);
1148     }
1149     if (bExp == 0) {
1150         if (bSig == 0) {
1151             if ((aExp | aSig) == 0) {
1152                 float_raise(status, float_flag_invalid);
1153                 return float32_default_nan;
1154             }
1155             float_raise(status, float_flag_divbyzero);
1156             return packFloat32(zSign, 0xFF, 0);
1157         }
1158         float_raise(status, float_flag_denormal);
1159         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
1160     }
1161     if (aExp == 0) {
1162         if (aSig == 0) return packFloat32(zSign, 0, 0);
1163         float_raise(status, float_flag_denormal);
1164         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
1165     }
1166     zExp = aExp - bExp + 0x7D;
1167     aSig = (aSig | 0x00800000)<<7;
1168     bSig = (bSig | 0x00800000)<<8;
1169     if (bSig <= (aSig + aSig)) {
1170         aSig >>= 1;
1171         ++zExp;
1172     }
1173     zSig = (((Bit64u) aSig)<<32) / bSig;
1174     if ((zSig & 0x3F) == 0) {
1175         zSig |= ((Bit64u) bSig * zSig != ((Bit64u) aSig)<<32);
1176     }
1177     return roundAndPackFloat32(zSign, zExp, zSig, status);
1178 }
1179 
1180 /*----------------------------------------------------------------------------
1181 | Returns the square root of the single-precision floating-point value `a'.
1182 | The operation is performed according to the IEC/IEEE Standard for Binary
1183 | Floating-Point Arithmetic.
1184 *----------------------------------------------------------------------------*/
1185 
float32_sqrt(float32 a,float_status_t & status)1186 float32 float32_sqrt(float32 a, float_status_t &status)
1187 {
1188     int aSign;
1189     Bit16s aExp, zExp;
1190     Bit32u aSig, zSig;
1191     Bit64u rem, term;
1192 
1193     aSig = extractFloat32Frac(a);
1194     aExp = extractFloat32Exp(a);
1195     aSign = extractFloat32Sign(a);
1196 
1197     if (aExp == 0xFF) {
1198         if (aSig) return propagateFloat32NaN(a, status);
1199         if (! aSign) return a;
1200         float_raise(status, float_flag_invalid);
1201         return float32_default_nan;
1202     }
1203 
1204     if (get_denormals_are_zeros(status)) {
1205         if (aExp == 0) aSig = 0;
1206     }
1207 
1208     if (aSign) {
1209         if ((aExp | aSig) == 0) return packFloat32(aSign, 0, 0);
1210         float_raise(status, float_flag_invalid);
1211         return float32_default_nan;
1212     }
1213     if (aExp == 0) {
1214         if (aSig == 0) return 0;
1215         float_raise(status, float_flag_denormal);
1216         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
1217     }
1218     zExp = ((aExp - 0x7F)>>1) + 0x7E;
1219     aSig = (aSig | 0x00800000)<<8;
1220     zSig = estimateSqrt32(aExp, aSig) + 2;
1221     if ((zSig & 0x7F) <= 5) {
1222         if (zSig < 2) {
1223             zSig = 0x7FFFFFFF;
1224             goto roundAndPack;
1225         }
1226         aSig >>= aExp & 1;
1227         term = ((Bit64u) zSig) * zSig;
1228         rem = (((Bit64u) aSig)<<32) - term;
1229         while ((Bit64s) rem < 0) {
1230             --zSig;
1231             rem += (((Bit64u) zSig)<<1) | 1;
1232         }
1233         zSig |= (rem != 0);
1234     }
1235     zSig = shift32RightJamming(zSig, 1);
1236  roundAndPack:
1237     return roundAndPackFloat32(0, zExp, zSig, status);
1238 }
1239 
1240 /*----------------------------------------------------------------------------
1241 | Determine single-precision floating-point number class.
1242 *----------------------------------------------------------------------------*/
1243 
float32_class(float32 a)1244 float_class_t float32_class(float32 a)
1245 {
1246    Bit16s aExp = extractFloat32Exp(a);
1247    Bit32u aSig = extractFloat32Frac(a);
1248    int  aSign = extractFloat32Sign(a);
1249 
1250    if(aExp == 0xFF) {
1251        if (aSig == 0)
1252            return (aSign) ? float_negative_inf : float_positive_inf;
1253 
1254        return (aSig & 0x00400000) ? float_QNaN : float_SNaN;
1255    }
1256 
1257    if(aExp == 0) {
1258        if (aSig == 0) return float_zero;
1259        return float_denormal;
1260    }
1261 
1262    return float_normalized;
1263 }
1264 
1265 /*----------------------------------------------------------------------------
1266 | Compare  between  two  single  precision  floating  point  numbers. Returns
1267 | 'float_relation_equal'  if the operands are equal, 'float_relation_less' if
1268 | the    value    'a'   is   less   than   the   corresponding   value   `b',
1269 | 'float_relation_greater' if the value 'a' is greater than the corresponding
1270 | value `b', or 'float_relation_unordered' otherwise.
1271 *----------------------------------------------------------------------------*/
1272 
float32_compare(float32 a,float32 b,int quiet,float_status_t & status)1273 int float32_compare(float32 a, float32 b, int quiet, float_status_t &status)
1274 {
1275     if (get_denormals_are_zeros(status)) {
1276         a = float32_denormal_to_zero(a);
1277         b = float32_denormal_to_zero(b);
1278     }
1279 
1280     float_class_t aClass = float32_class(a);
1281     float_class_t bClass = float32_class(b);
1282 
1283     if (aClass == float_SNaN || bClass == float_SNaN) {
1284         float_raise(status, float_flag_invalid);
1285         return float_relation_unordered;
1286     }
1287 
1288     if (aClass == float_QNaN || bClass == float_QNaN) {
1289         if (! quiet) float_raise(status, float_flag_invalid);
1290         return float_relation_unordered;
1291     }
1292 
1293     if (aClass == float_denormal || bClass == float_denormal) {
1294         float_raise(status, float_flag_denormal);
1295     }
1296 
1297     if ((a == b) || ((Bit32u) ((a | b)<<1) == 0)) return float_relation_equal;
1298 
1299     int aSign = extractFloat32Sign(a);
1300     int bSign = extractFloat32Sign(b);
1301     if (aSign != bSign)
1302         return (aSign) ? float_relation_less : float_relation_greater;
1303 
1304     if (aSign ^ (a < b)) return float_relation_less;
1305     return float_relation_greater;
1306 }
1307 
1308 /*----------------------------------------------------------------------------
1309 | Compare between two single precision floating point numbers and return the
1310 | smaller of them.
1311 *----------------------------------------------------------------------------*/
1312 
float32_min(float32 a,float32 b,float_status_t & status)1313 float32 float32_min(float32 a, float32 b, float_status_t &status)
1314 {
1315   if (get_denormals_are_zeros(status)) {
1316     a = float32_denormal_to_zero(a);
1317     b = float32_denormal_to_zero(b);
1318   }
1319 
1320   return (float32_compare(a, b, status) == float_relation_less) ? a : b;
1321 }
1322 
1323 /*----------------------------------------------------------------------------
1324 | Compare between two single precision floating point numbers and return the
1325 | larger of them.
1326 *----------------------------------------------------------------------------*/
1327 
float32_max(float32 a,float32 b,float_status_t & status)1328 float32 float32_max(float32 a, float32 b, float_status_t &status)
1329 {
1330   if (get_denormals_are_zeros(status)) {
1331     a = float32_denormal_to_zero(a);
1332     b = float32_denormal_to_zero(b);
1333   }
1334 
1335   return (float32_compare(a, b, status) == float_relation_greater) ? a : b;
1336 }
1337 
1338 /*----------------------------------------------------------------------------
1339 | Compare between two  single precision  floating point numbers and  return the
1340 | smaller/larger of them. The operation  is performed according to the IEC/IEEE
1341 | Standard for Binary Floating-Point Arithmetic.
1342 *----------------------------------------------------------------------------*/
1343 
float32_minmax(float32 a,float32 b,int is_max,int is_abs,float_status_t & status)1344 float32 float32_minmax(float32 a, float32 b, int is_max, int is_abs, float_status_t &status)
1345 {
1346     if (get_denormals_are_zeros(status)) {
1347         a = float32_denormal_to_zero(a);
1348         b = float32_denormal_to_zero(b);
1349     }
1350 
1351     if (float32_is_nan(a) || float32_is_nan(b)) {
1352         if (float32_is_signaling_nan(a)) {
1353             return propagateFloat32NaN(a, status);
1354         }
1355         if (float32_is_signaling_nan(b) ) {
1356             return propagateFloat32NaN(b, status);
1357         }
1358         if (! float32_is_nan(b)) {
1359             if (float32_is_denormal(b))
1360                 float_raise(status, float_flag_denormal);
1361             return b;
1362         }
1363         if (! float32_is_nan(a)) {
1364             if (float32_is_denormal(a))
1365                 float_raise(status, float_flag_denormal);
1366             return a;
1367         }
1368         return propagateFloat32NaN(a, b, status);
1369     }
1370 
1371     float32 tmp_a = a, tmp_b = b;
1372     if (is_abs) {
1373         tmp_a &= ~0x80000000; // clear the sign bit
1374         tmp_b &= ~0x80000000;
1375     }
1376 
1377     int aSign = extractFloat32Sign(tmp_a);
1378     int bSign = extractFloat32Sign(tmp_b);
1379 
1380     if (float32_is_denormal(a) || float32_is_denormal(b))
1381         float_raise(status, float_flag_denormal);
1382 
1383     if (aSign != bSign) {
1384         if (! is_max) {
1385             return aSign ? a : b;
1386         } else {
1387             return aSign ? b : a;
1388         }
1389     } else {
1390         if (! is_max) {
1391             return (aSign ^ (tmp_a < tmp_b)) ? a : b;
1392         } else {
1393             return (aSign ^ (tmp_a < tmp_b)) ? b : a;
1394         }
1395     }
1396 }
1397 
1398 /*----------------------------------------------------------------------------
1399 | Returns the result of converting the double-precision floating-point value
1400 | `a' to the 32-bit two's complement integer format.  The conversion is
1401 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1402 | Arithmetic - which means in particular that the conversion is rounded
1403 | according to the current rounding mode. If `a' is a NaN or the
1404 | conversion overflows, the integer indefinite value is returned.
1405 *----------------------------------------------------------------------------*/
1406 
float64_to_int32(float64 a,float_status_t & status)1407 Bit32s float64_to_int32(float64 a, float_status_t &status)
1408 {
1409     Bit64u aSig = extractFloat64Frac(a);
1410     Bit16s aExp = extractFloat64Exp(a);
1411     int aSign = extractFloat64Sign(a);
1412     if ((aExp == 0x7FF) && aSig) aSign = 0;
1413     if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1414     else {
1415         if (get_denormals_are_zeros(status)) aSig = 0;
1416     }
1417     int shiftCount = 0x42C - aExp;
1418     if (0 < shiftCount) aSig = shift64RightJamming(aSig, shiftCount);
1419     return roundAndPackInt32(aSign, aSig, status);
1420 }
1421 
1422 /*----------------------------------------------------------------------------
1423 | Returns the result of converting the double-precision floating-point value
1424 | `a' to the 32-bit two's complement integer format.  The conversion is
1425 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1426 | Arithmetic, except that the conversion is always rounded toward zero.
1427 | If `a' is a NaN or the conversion overflows, the integer indefinite
1428 | value  is returned.
1429 *----------------------------------------------------------------------------*/
1430 
float64_to_int32_round_to_zero(float64 a,float_status_t & status)1431 Bit32s float64_to_int32_round_to_zero(float64 a, float_status_t &status)
1432 {
1433     int aSign;
1434     Bit16s aExp;
1435     Bit64u aSig, savedASig;
1436     Bit32s z;
1437     int shiftCount;
1438 
1439     aSig = extractFloat64Frac(a);
1440     aExp = extractFloat64Exp(a);
1441     aSign = extractFloat64Sign(a);
1442     if (0x41E < aExp) {
1443         float_raise(status, float_flag_invalid);
1444         return (Bit32s)(int32_indefinite);
1445     }
1446     else if (aExp < 0x3FF) {
1447         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1448         if (aExp || aSig) float_raise(status, float_flag_inexact);
1449         return 0;
1450     }
1451     aSig |= BX_CONST64(0x0010000000000000);
1452     shiftCount = 0x433 - aExp;
1453     savedASig = aSig;
1454     aSig >>= shiftCount;
1455     z = (Bit32s) aSig;
1456     if (aSign) z = -z;
1457     if ((z < 0) ^ aSign) {
1458         float_raise(status, float_flag_invalid);
1459         return (Bit32s)(int32_indefinite);
1460     }
1461     if ((aSig<<shiftCount) != savedASig) {
1462         float_raise(status, float_flag_inexact);
1463     }
1464     return z;
1465 }
1466 
1467 /*----------------------------------------------------------------------------
1468 | Returns the result of converting the double-precision floating-point value
1469 | `a' to the 32-bit unsigned integer format.  The conversion is performed
1470 | according to the IEC/IEEE Standard for Binary Floating-point Arithmetic,
1471 | except that the conversion is always rounded toward zero.  If `a' is a NaN
1472 | or conversion overflows, the largest positive integer is returned.
1473 *----------------------------------------------------------------------------*/
1474 
float64_to_uint32_round_to_zero(float64 a,float_status_t & status)1475 Bit32u float64_to_uint32_round_to_zero(float64 a, float_status_t &status)
1476 {
1477     Bit64u aSig = extractFloat64Frac(a);
1478     Bit16s aExp = extractFloat64Exp(a);
1479     int aSign = extractFloat64Sign(a);
1480 
1481     if (aExp < 0x3FF) {
1482         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1483         if (aExp || aSig) float_raise(status, float_flag_inexact);
1484         return 0;
1485     }
1486 
1487     if (0x41E < aExp || aSign) {
1488         float_raise(status, float_flag_invalid);
1489         return uint32_indefinite;
1490     }
1491 
1492     aSig |= BX_CONST64(0x0010000000000000);
1493     int shiftCount = 0x433 - aExp;
1494     Bit64u savedASig = aSig;
1495     aSig >>= shiftCount;
1496     if ((aSig<<shiftCount) != savedASig) {
1497         float_raise(status, float_flag_inexact);
1498     }
1499     return (Bit32u) aSig;
1500 }
1501 
1502 /*----------------------------------------------------------------------------
1503 | Returns the result of converting the double-precision floating-point value
1504 | `a' to the 64-bit two's complement integer format.  The conversion is
1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1506 | Arithmetic - which means in particular that the conversion is rounded
1507 | according to the current rounding mode.  If `a' is a NaN, the largest
1508 | positive integer is returned.  Otherwise, if the conversion overflows, the
1509 | largest integer with the same sign as `a' is returned.
1510 *----------------------------------------------------------------------------*/
1511 
float64_to_int64(float64 a,float_status_t & status)1512 Bit64s float64_to_int64(float64 a, float_status_t &status)
1513 {
1514     int aSign;
1515     Bit16s aExp;
1516     Bit64u aSig, aSigExtra;
1517 
1518     aSig = extractFloat64Frac(a);
1519     aExp = extractFloat64Exp(a);
1520     aSign = extractFloat64Sign(a);
1521     if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1522     else {
1523         if (get_denormals_are_zeros(status)) aSig = 0;
1524     }
1525     int shiftCount = 0x433 - aExp;
1526     if (shiftCount <= 0) {
1527         if (0x43E < aExp) {
1528             float_raise(status, float_flag_invalid);
1529             return (Bit64s)(int64_indefinite);
1530         }
1531         aSigExtra = 0;
1532         aSig <<= -shiftCount;
1533     }
1534     else {
1535         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
1536     }
1537     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
1538 }
1539 
1540 /*----------------------------------------------------------------------------
1541 | Returns the result of converting the double-precision floating-point value
1542 | `a' to the 64-bit two's complement integer format.  The conversion is
1543 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1544 | Arithmetic, except that the conversion is always rounded toward zero.
1545 | If `a' is a NaN or the conversion overflows, the integer indefinite
1546 | value  is returned.
1547 *----------------------------------------------------------------------------*/
1548 
float64_to_int64_round_to_zero(float64 a,float_status_t & status)1549 Bit64s float64_to_int64_round_to_zero(float64 a, float_status_t &status)
1550 {
1551     int aSign;
1552     Bit16s aExp;
1553     Bit64u aSig;
1554     Bit64s z;
1555 
1556     aSig = extractFloat64Frac(a);
1557     aExp = extractFloat64Exp(a);
1558     aSign = extractFloat64Sign(a);
1559     if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1560     int shiftCount = aExp - 0x433;
1561     if (0 <= shiftCount) {
1562         if (0x43E <= aExp) {
1563             if (a != BX_CONST64(0xC3E0000000000000)) {
1564                 float_raise(status, float_flag_invalid);
1565             }
1566             return (Bit64s)(int64_indefinite);
1567         }
1568         z = aSig<<shiftCount;
1569     }
1570     else {
1571         if (aExp < 0x3FE) {
1572             if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1573             if (aExp | aSig) float_raise(status, float_flag_inexact);
1574             return 0;
1575         }
1576         z = aSig>>(-shiftCount);
1577         if ((Bit64u) (aSig<<(shiftCount & 63))) {
1578              float_raise(status, float_flag_inexact);
1579         }
1580     }
1581     if (aSign) z = -z;
1582     return z;
1583 }
1584 
1585 /*----------------------------------------------------------------------------
1586 | Returns the result of converting the double-precision floating-point value
1587 | `a' to the 64-bit unsigned integer format.  The conversion is performed
1588 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
1589 | except that the conversion is always rounded toward zero. If `a' is a NaN
1590 | or the conversion overflows, the largest unsigned integer is returned.
1591 *----------------------------------------------------------------------------*/
1592 
float64_to_uint64_round_to_zero(float64 a,float_status_t & status)1593 Bit64u float64_to_uint64_round_to_zero(float64 a, float_status_t &status)
1594 {
1595     int aSign;
1596     Bit16s aExp;
1597     Bit64u aSig, z;
1598 
1599     aSig = extractFloat64Frac(a);
1600     aExp = extractFloat64Exp(a);
1601     aSign = extractFloat64Sign(a);
1602 
1603     if (aExp < 0x3FE) {
1604         if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1605         if (aExp | aSig) float_raise(status, float_flag_inexact);
1606         return 0;
1607     }
1608 
1609     if (0x43E <= aExp || aSign) {
1610         float_raise(status, float_flag_invalid);
1611         return uint64_indefinite;
1612     }
1613 
1614     if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1615     int shiftCount = aExp - 0x433;
1616 
1617     if (0 <= shiftCount) {
1618         z = aSig<<shiftCount;
1619     }
1620     else {
1621         z = aSig>>(-shiftCount);
1622         if ((Bit64u) (aSig<<(shiftCount & 63))) {
1623              float_raise(status, float_flag_inexact);
1624         }
1625     }
1626     return z;
1627 }
1628 
1629 /*----------------------------------------------------------------------------
1630 | Returns the result of converting the double-precision floating-point value
1631 | `a' to the 32-bit unsigned integer format.  The conversion is
1632 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1633 | Arithmetic---which means in particular that the conversion is rounded
1634 | according to the current rounding mode.  If `a' is a NaN or the conversion
1635 | overflows, the largest unsigned integer is returned.
1636 *----------------------------------------------------------------------------*/
1637 
float64_to_uint32(float64 a,float_status_t & status)1638 Bit32u float64_to_uint32(float64 a, float_status_t &status)
1639 {
1640     Bit64u val_64 = float64_to_uint64(a, status);
1641 
1642     if (val_64 > 0xffffffff) {
1643         status.float_exception_flags = float_flag_invalid; // throw away other flags
1644         return uint32_indefinite;
1645     }
1646 
1647     return (Bit32u) val_64;
1648 }
1649 
1650 /*----------------------------------------------------------------------------
1651 | Returns the result of converting the double-precision floating-point value
1652 | `a' to the 64-bit unsigned integer format.  The conversion is
1653 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1654 | Arithmetic---which means in particular that the conversion is rounded
1655 | according to the current rounding mode.  If `a' is a NaN or the conversion
1656 | overflows, the largest unsigned integer is returned.
1657 *----------------------------------------------------------------------------*/
1658 
float64_to_uint64(float64 a,float_status_t & status)1659 Bit64u float64_to_uint64(float64 a, float_status_t &status)
1660 {
1661     int aSign;
1662     Bit16s aExp, shiftCount;
1663     Bit64u aSig, aSigExtra;
1664 
1665     aSig = extractFloat64Frac(a);
1666     aExp = extractFloat64Exp(a);
1667     aSign = extractFloat64Sign(a);
1668 
1669     if (get_denormals_are_zeros(status)) {
1670         if (aExp == 0) aSig = 0;
1671     }
1672 
1673     if (aSign && (aExp > 0x3FE)) {
1674         float_raise(status, float_flag_invalid);
1675         return uint64_indefinite;
1676     }
1677 
1678     if (aExp) {
1679         aSig |= BX_CONST64(0x0010000000000000);
1680     }
1681     shiftCount = 0x433 - aExp;
1682     if (shiftCount <= 0) {
1683         if (0x43E < aExp) {
1684             float_raise(status, float_flag_invalid);
1685             return uint64_indefinite;
1686         }
1687         aSigExtra = 0;
1688         aSig <<= -shiftCount;
1689     } else {
1690         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
1691     }
1692 
1693     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
1694 }
1695 
1696 /*----------------------------------------------------------------------------
1697 | Returns the result of converting the double-precision floating-point value
1698 | `a' to the single-precision floating-point format.  The conversion is
1699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1700 | Arithmetic.
1701 *----------------------------------------------------------------------------*/
1702 
float64_to_float32(float64 a,float_status_t & status)1703 float32 float64_to_float32(float64 a, float_status_t &status)
1704 {
1705     int aSign;
1706     Bit16s aExp;
1707     Bit64u aSig;
1708     Bit32u zSig;
1709 
1710     aSig = extractFloat64Frac(a);
1711     aExp = extractFloat64Exp(a);
1712     aSign = extractFloat64Sign(a);
1713     if (aExp == 0x7FF) {
1714         if (aSig) return commonNaNToFloat32(float64ToCommonNaN(a, status));
1715         return packFloat32(aSign, 0xFF, 0);
1716     }
1717     if (aExp == 0) {
1718         if (aSig == 0 || get_denormals_are_zeros(status))
1719             return packFloat32(aSign, 0, 0);
1720         float_raise(status, float_flag_denormal);
1721     }
1722     aSig = shift64RightJamming(aSig, 22);
1723     zSig = (Bit32u) aSig;
1724     if (aExp || zSig) {
1725         zSig |= 0x40000000;
1726         aExp -= 0x381;
1727     }
1728     return roundAndPackFloat32(aSign, aExp, zSig, status);
1729 }
1730 
1731 /*----------------------------------------------------------------------------
1732 | Rounds the double-precision floating-point value `a' to an integer, and
1733 | returns the result as a double-precision floating-point value.  The
1734 | operation is performed according to the IEC/IEEE Standard for Binary
1735 | Floating-Point Arithmetic.
1736 *----------------------------------------------------------------------------*/
1737 
float64_round_to_int(float64 a,Bit8u scale,float_status_t & status)1738 float64 float64_round_to_int(float64 a, Bit8u scale, float_status_t &status)
1739 {
1740     Bit64u lastBitMask, roundBitsMask;
1741     int roundingMode = get_float_rounding_mode(status);
1742     Bit16s aExp = extractFloat64Exp(a);
1743     scale &= 0xf;
1744 
1745     if ((aExp == 0x7FF) && extractFloat64Frac(a)) {
1746         return propagateFloat64NaN(a, status);
1747     }
1748 
1749     aExp += scale; // scale the exponent
1750 
1751     if (0x433 <= aExp) {
1752         return a;
1753     }
1754 
1755     if (get_denormals_are_zeros(status)) {
1756         a = float64_denormal_to_zero(a);
1757     }
1758 
1759     if (aExp < 0x3FF) {
1760         if ((Bit64u) (a<<1) == 0) return a;
1761         float_raise(status, float_flag_inexact);
1762         int aSign = extractFloat64Sign(a);
1763         switch (roundingMode) {
1764          case float_round_nearest_even:
1765             if ((aExp == 0x3FE) && extractFloat64Frac(a)) {
1766               return packFloat64(aSign, 0x3FF - scale, 0);
1767             }
1768             break;
1769          case float_round_down:
1770             return aSign ? packFloat64(1, 0x3FF - scale, 0) : float64_positive_zero;
1771          case float_round_up:
1772             return aSign ? float64_negative_zero : packFloat64(0, 0x3FF - scale, 0);
1773         }
1774         return packFloat64(aSign, 0, 0);
1775     }
1776 
1777     lastBitMask = 1;
1778     lastBitMask <<= 0x433 - aExp;
1779     roundBitsMask = lastBitMask - 1;
1780     float64 z = a;
1781     if (roundingMode == float_round_nearest_even) {
1782         z += lastBitMask>>1;
1783         if ((z & roundBitsMask) == 0) z &= ~lastBitMask;
1784     }
1785     else if (roundingMode != float_round_to_zero) {
1786         if (extractFloat64Sign(z) ^ (roundingMode == float_round_up)) {
1787             z += roundBitsMask;
1788         }
1789     }
1790     z &= ~roundBitsMask;
1791     if (z != a) float_raise(status, float_flag_inexact);
1792     return z;
1793 }
1794 
1795 /*----------------------------------------------------------------------------
1796 | Extracts the fractional portion of double-precision floating-point value `a',
1797 | and returns the result  as a  double-precision  floating-point value. The
1798 | fractional results are precise. The operation is performed according to the
1799 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1800 *----------------------------------------------------------------------------*/
1801 
float64_frc(float64 a,float_status_t & status)1802 float64 float64_frc(float64 a, float_status_t &status)
1803 {
1804     int roundingMode = get_float_rounding_mode(status);
1805 
1806     Bit64u aSig = extractFloat64Frac(a);
1807     Bit16s aExp = extractFloat64Exp(a);
1808     int aSign = extractFloat64Sign(a);
1809 
1810     if (aExp == 0x7FF) {
1811         if (aSig) return propagateFloat64NaN(a, status);
1812         float_raise(status, float_flag_invalid);
1813         return float64_default_nan;
1814     }
1815 
1816     if (aExp >= 0x433) {
1817         return packFloat64(roundingMode == float_round_down, 0, 0);
1818     }
1819 
1820     if (aExp < 0x3FF) {
1821         if (aExp == 0) {
1822             if (aSig == 0 || get_denormals_are_zeros(status))
1823                 return packFloat64(roundingMode == float_round_down, 0, 0);
1824 
1825             float_raise(status, float_flag_denormal);
1826             if (! float_exception_masked(status, float_flag_underflow))
1827                 float_raise(status, float_flag_underflow);
1828 
1829             if(get_flush_underflow_to_zero(status)) {
1830                 float_raise(status, float_flag_underflow | float_flag_inexact);
1831                 return packFloat64(aSign, 0, 0);
1832             }
1833         }
1834         return a;
1835     }
1836 
1837     Bit64u lastBitMask = BX_CONST64(1) << (0x433 - aExp);
1838     Bit64u roundBitsMask = lastBitMask - 1;
1839 
1840     aSig &= roundBitsMask;
1841     aSig <<= 10;
1842     aExp--;
1843 
1844     if (aSig == 0)
1845        return packFloat64(roundingMode == float_round_down, 0, 0);
1846 
1847     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
1848 }
1849 
1850 /*----------------------------------------------------------------------------
1851 | Extracts the exponent portion of double-precision floating-point value 'a',
1852 | and returns the result as a double-precision floating-point value
1853 | representing unbiased integer exponent. The operation is performed according
1854 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1855 *----------------------------------------------------------------------------*/
1856 
float64_getexp(float64 a,float_status_t & status)1857 float64 float64_getexp(float64 a, float_status_t &status)
1858 {
1859     Bit16s aExp = extractFloat64Exp(a);
1860     Bit64u aSig = extractFloat64Frac(a);
1861 
1862     if (aExp == 0x7FF) {
1863         if (aSig) return propagateFloat64NaN(a, status);
1864         return float64_positive_inf;
1865     }
1866 
1867     if (aExp == 0) {
1868         if (aSig == 0 || get_denormals_are_zeros(status))
1869             return float64_negative_inf;
1870 
1871         float_raise(status, float_flag_denormal);
1872         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1873     }
1874 
1875     return int32_to_float64(aExp - 0x3FF);
1876 }
1877 
1878 /*----------------------------------------------------------------------------
1879 | Extracts the mantissa of double-precision floating-point value 'a' and
1880 | returns the result as a double-precision floating-point after applying
1881 | the mantissa interval normalization and sign control. The operation is
1882 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1883 | Arithmetic.
1884 *----------------------------------------------------------------------------*/
1885 
float64_getmant(float64 a,float_status_t & status,int sign_ctrl,int interv)1886 float64 float64_getmant(float64 a, float_status_t &status, int sign_ctrl, int interv)
1887 {
1888     Bit16s aExp = extractFloat64Exp(a);
1889     Bit64u aSig = extractFloat64Frac(a);
1890     int aSign = extractFloat64Sign(a);
1891 
1892     if (aExp == 0x7FF) {
1893         if (aSig) return propagateFloat64NaN(a, status);
1894         if (aSign) {
1895             if (sign_ctrl & 0x2) {
1896                 float_raise(status, float_flag_invalid);
1897                 return float64_default_nan;
1898             }
1899         }
1900         return packFloat64(~sign_ctrl & aSign, 0x3FF, 0);
1901     }
1902 
1903     if (aExp == 0 && (aSig == 0 || get_denormals_are_zeros(status))) {
1904         return packFloat64(~sign_ctrl & aSign, 0x3FF, 0);
1905     }
1906 
1907     if (aSign) {
1908         if (sign_ctrl & 0x2) {
1909             float_raise(status, float_flag_invalid);
1910             return float64_default_nan;
1911         }
1912     }
1913 
1914     if (aExp == 0) {
1915         float_raise(status, float_flag_denormal);
1916         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1917 //      aExp += 0x3FE;
1918         aSig &= BX_CONST64(0xFFFFFFFFFFFFFFFF);
1919     }
1920 
1921     switch(interv) {
1922     case 0x0: // interval [1,2)
1923         aExp = 0x3FF;
1924         break;
1925     case 0x1: // interval [1/2,2)
1926         aExp -= 0x3FF;
1927         aExp  = 0x3FF - (aExp & 0x1);
1928         break;
1929     case 0x2: // interval [1/2,1)
1930         aExp = 0x3FE;
1931         break;
1932     case 0x3: // interval [3/4,3/2)
1933         aExp = 0x3FF - ((aSig >> 51) & 0x1);
1934         break;
1935     }
1936 
1937     return packFloat64(~sign_ctrl & aSign, aExp, aSig);
1938 }
1939 
1940 /*----------------------------------------------------------------------------
1941 | Return the result of a floating point scale of the double-precision floating
1942 | point value `a' by multiplying it by 2 power of the double-precision
1943 | floating point value 'b' converted to integral value. If the result cannot
1944 | be represented in double precision, then the proper overflow response (for
1945 | positive scaling operand), or the proper underflow response (for negative
1946 | scaling operand) is issued. The operation is performed according to the
1947 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1948 *----------------------------------------------------------------------------*/
1949 
float64_scalef(float64 a,float64 b,float_status_t & status)1950 float64 float64_scalef(float64 a, float64 b, float_status_t &status)
1951 {
1952     Bit64u aSig = extractFloat64Frac(a);
1953     Bit16s aExp = extractFloat64Exp(a);
1954     int aSign = extractFloat64Sign(a);
1955     Bit64u bSig = extractFloat64Frac(b);
1956     Bit16s bExp = extractFloat64Exp(b);
1957     int bSign = extractFloat64Sign(b);
1958 
1959     if (get_denormals_are_zeros(status)) {
1960         if (aExp == 0) aSig = 0;
1961         if (bExp == 0) bSig = 0;
1962     }
1963 
1964     if (bExp == 0x7FF) {
1965         if (bSig) return propagateFloat64NaN(a, b, status);
1966     }
1967 
1968     if (aExp == 0x7FF) {
1969         if (aSig) {
1970             int aIsSignalingNaN = (aSig & BX_CONST64(0x0008000000000000)) == 0;
1971             if (aIsSignalingNaN || bExp != 0x7FF || bSig)
1972                 return propagateFloat64NaN(a, b, status);
1973 
1974             return bSign ? 0 : float64_positive_inf;
1975         }
1976 
1977         if (bExp == 0x7FF && bSign) {
1978             float_raise(status, float_flag_invalid);
1979             return float64_default_nan;
1980         }
1981         return a;
1982     }
1983 
1984     if (aExp == 0) {
1985         if (aSig == 0) {
1986             if (bExp == 0x7FF && ! bSign) {
1987                 float_raise(status, float_flag_invalid);
1988                 return float64_default_nan;
1989             }
1990             return a;
1991         }
1992         float_raise(status, float_flag_denormal);
1993     }
1994 
1995     if ((bExp | bSig) == 0) return a;
1996 
1997     if (bExp == 0x7FF) {
1998         if (bSign) return packFloat64(aSign, 0, 0);
1999         return packFloat64(aSign, 0x7FF, 0);
2000     }
2001 
2002     if (0x40F <= bExp) {
2003         // handle obvious overflow/underflow result
2004         return roundAndPackFloat64(aSign, bSign ? -0x3FF : 0x7FF, aSig, status);
2005     }
2006 
2007     int scale = 0;
2008 
2009     if (bExp < 0x3FF) {
2010         if (bExp == 0)
2011             float_raise(status, float_flag_denormal);
2012         scale = -bSign;
2013     }
2014     else {
2015         bSig |= BX_CONST64(0x0010000000000000);
2016         int shiftCount = 0x433 - bExp;
2017         Bit64u savedBSig = bSig;
2018         bSig >>= shiftCount;
2019         scale = (Bit32s) bSig;
2020         if (bSign) {
2021             if ((bSig<<shiftCount) != savedBSig) scale++;
2022             scale = -scale;
2023         }
2024 
2025         if (scale >  0x1000) scale =  0x1000;
2026         if (scale < -0x1000) scale = -0x1000;
2027     }
2028 
2029     if (aExp != 0) {
2030         aSig |= BX_CONST64(0x0010000000000000);
2031     } else {
2032         aExp++;
2033     }
2034 
2035     aExp += scale - 1;
2036     aSig <<= 10;
2037     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
2038 }
2039 
2040 /*----------------------------------------------------------------------------
2041 | Returns the result of adding the absolute values of the double-precision
2042 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2043 | before being returned.  `zSign' is ignored if the result is a NaN.
2044 | The addition is performed according to the IEC/IEEE Standard for Binary
2045 | Floating-Point Arithmetic.
2046 *----------------------------------------------------------------------------*/
2047 
addFloat64Sigs(float64 a,float64 b,int zSign,float_status_t & status)2048 static float64 addFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status)
2049 {
2050     Bit16s aExp, bExp, zExp;
2051     Bit64u aSig, bSig, zSig;
2052     Bit16s expDiff;
2053 
2054     aSig = extractFloat64Frac(a);
2055     aExp = extractFloat64Exp(a);
2056     bSig = extractFloat64Frac(b);
2057     bExp = extractFloat64Exp(b);
2058 
2059     if (get_denormals_are_zeros(status)) {
2060         if (aExp == 0) aSig = 0;
2061         if (bExp == 0) bSig = 0;
2062     }
2063 
2064     expDiff = aExp - bExp;
2065     aSig <<= 9;
2066     bSig <<= 9;
2067     if (0 < expDiff) {
2068         if (aExp == 0x7FF) {
2069             if (aSig) return propagateFloat64NaN(a, b, status);
2070             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2071             return a;
2072         }
2073         if ((aExp == 0) && aSig)
2074             float_raise(status, float_flag_denormal);
2075 
2076         if (bExp == 0) {
2077             if (bSig) float_raise(status, float_flag_denormal);
2078             --expDiff;
2079         }
2080         else bSig |= BX_CONST64(0x2000000000000000);
2081 
2082         bSig = shift64RightJamming(bSig, expDiff);
2083         zExp = aExp;
2084     }
2085     else if (expDiff < 0) {
2086         if (bExp == 0x7FF) {
2087             if (bSig) return propagateFloat64NaN(a, b, status);
2088             if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2089             return packFloat64(zSign, 0x7FF, 0);
2090         }
2091         if ((bExp == 0) && bSig)
2092             float_raise(status, float_flag_denormal);
2093 
2094         if (aExp == 0) {
2095             if (aSig) float_raise(status, float_flag_denormal);
2096             ++expDiff;
2097         }
2098         else aSig |= BX_CONST64(0x2000000000000000);
2099 
2100         aSig = shift64RightJamming(aSig, -expDiff);
2101         zExp = bExp;
2102     }
2103     else {
2104         if (aExp == 0x7FF) {
2105             if (aSig | bSig) return propagateFloat64NaN(a, b, status);
2106             return a;
2107         }
2108         if (aExp == 0) {
2109             zSig = (aSig + bSig) >> 9;
2110             if (aSig | bSig) {
2111                 float_raise(status, float_flag_denormal);
2112                 if (get_flush_underflow_to_zero(status) && (extractFloat64Frac(zSig) == zSig)) {
2113                     float_raise(status, float_flag_underflow | float_flag_inexact);
2114                     return packFloat64(zSign, 0, 0);
2115                 }
2116                 if (! float_exception_masked(status, float_flag_underflow)) {
2117                     if (extractFloat64Frac(zSig) == zSig)
2118                         float_raise(status, float_flag_underflow);
2119                 }
2120             }
2121             return packFloat64(zSign, 0, zSig);
2122         }
2123         zSig = BX_CONST64(0x4000000000000000) + aSig + bSig;
2124         return roundAndPackFloat64(zSign, aExp, zSig, status);
2125     }
2126     aSig |= BX_CONST64(0x2000000000000000);
2127     zSig = (aSig + bSig)<<1;
2128     --zExp;
2129     if ((Bit64s) zSig < 0) {
2130         zSig = aSig + bSig;
2131         ++zExp;
2132     }
2133     return roundAndPackFloat64(zSign, zExp, zSig, status);
2134 }
2135 
2136 /*----------------------------------------------------------------------------
2137 | Returns the result of subtracting the absolute values of the double-
2138 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2139 | difference is negated before being returned.  `zSign' is ignored if the
2140 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2141 | Standard for Binary Floating-Point Arithmetic.
2142 *----------------------------------------------------------------------------*/
2143 
subFloat64Sigs(float64 a,float64 b,int zSign,float_status_t & status)2144 static float64 subFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status)
2145 {
2146     Bit16s aExp, bExp, zExp;
2147     Bit64u aSig, bSig, zSig;
2148     Bit16s expDiff;
2149 
2150     aSig = extractFloat64Frac(a);
2151     aExp = extractFloat64Exp(a);
2152     bSig = extractFloat64Frac(b);
2153     bExp = extractFloat64Exp(b);
2154 
2155     if (get_denormals_are_zeros(status)) {
2156         if (aExp == 0) aSig = 0;
2157         if (bExp == 0) bSig = 0;
2158     }
2159 
2160     expDiff = aExp - bExp;
2161     aSig <<= 10;
2162     bSig <<= 10;
2163     if (0 < expDiff) goto aExpBigger;
2164     if (expDiff < 0) goto bExpBigger;
2165     if (aExp == 0x7FF) {
2166         if (aSig | bSig) return propagateFloat64NaN(a, b, status);
2167         float_raise(status, float_flag_invalid);
2168         return float64_default_nan;
2169     }
2170     if (aExp == 0) {
2171         if (aSig | bSig) float_raise(status, float_flag_denormal);
2172         aExp = 1;
2173         bExp = 1;
2174     }
2175     if (bSig < aSig) goto aBigger;
2176     if (aSig < bSig) goto bBigger;
2177     return packFloat64(get_float_rounding_mode(status) == float_round_down, 0, 0);
2178  bExpBigger:
2179     if (bExp == 0x7FF) {
2180         if (bSig) return propagateFloat64NaN(a, b, status);
2181         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2182         return packFloat64(zSign ^ 1, 0x7FF, 0);
2183     }
2184     if ((bExp == 0) && bSig)
2185         float_raise(status, float_flag_denormal);
2186 
2187     if (aExp == 0) {
2188         if (aSig) float_raise(status, float_flag_denormal);
2189         ++expDiff;
2190     }
2191     else aSig |= BX_CONST64(0x4000000000000000);
2192 
2193     aSig = shift64RightJamming(aSig, -expDiff);
2194     bSig |= BX_CONST64(0x4000000000000000);
2195  bBigger:
2196     zSig = bSig - aSig;
2197     zExp = bExp;
2198     zSign ^= 1;
2199     goto normalizeRoundAndPack;
2200  aExpBigger:
2201     if (aExp == 0x7FF) {
2202         if (aSig) return propagateFloat64NaN(a, b, status);
2203         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2204         return a;
2205     }
2206     if ((aExp == 0) && aSig)
2207         float_raise(status, float_flag_denormal);
2208 
2209     if (bExp == 0) {
2210         if (bSig) float_raise(status, float_flag_denormal);
2211         --expDiff;
2212     }
2213     else bSig |= BX_CONST64(0x4000000000000000);
2214 
2215     bSig = shift64RightJamming(bSig, expDiff);
2216     aSig |= BX_CONST64(0x4000000000000000);
2217  aBigger:
2218     zSig = aSig - bSig;
2219     zExp = aExp;
2220  normalizeRoundAndPack:
2221     --zExp;
2222     return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
2223 }
2224 
2225 /*----------------------------------------------------------------------------
2226 | Returns the result of adding the double-precision floating-point values `a'
2227 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2228 | Binary Floating-Point Arithmetic.
2229 *----------------------------------------------------------------------------*/
2230 
float64_add(float64 a,float64 b,float_status_t & status)2231 float64 float64_add(float64 a, float64 b, float_status_t &status)
2232 {
2233     int aSign = extractFloat64Sign(a);
2234     int bSign = extractFloat64Sign(b);
2235 
2236     if (aSign == bSign) {
2237         return addFloat64Sigs(a, b, aSign, status);
2238     }
2239     else {
2240         return subFloat64Sigs(a, b, aSign, status);
2241     }
2242 }
2243 
2244 /*----------------------------------------------------------------------------
2245 | Returns the result of subtracting the double-precision floating-point values
2246 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2247 | for Binary Floating-Point Arithmetic.
2248 *----------------------------------------------------------------------------*/
2249 
float64_sub(float64 a,float64 b,float_status_t & status)2250 float64 float64_sub(float64 a, float64 b, float_status_t &status)
2251 {
2252     int aSign = extractFloat64Sign(a);
2253     int bSign = extractFloat64Sign(b);
2254 
2255     if (aSign == bSign) {
2256         return subFloat64Sigs(a, b, aSign, status);
2257     }
2258     else {
2259         return addFloat64Sigs(a, b, aSign, status);
2260     }
2261 }
2262 
2263 /*----------------------------------------------------------------------------
2264 | Returns the result of multiplying the double-precision floating-point values
2265 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2266 | for Binary Floating-Point Arithmetic.
2267 *----------------------------------------------------------------------------*/
2268 
float64_mul(float64 a,float64 b,float_status_t & status)2269 float64 float64_mul(float64 a, float64 b, float_status_t &status)
2270 {
2271     int aSign, bSign, zSign;
2272     Bit16s aExp, bExp, zExp;
2273     Bit64u aSig, bSig, zSig0, zSig1;
2274 
2275     aSig = extractFloat64Frac(a);
2276     aExp = extractFloat64Exp(a);
2277     aSign = extractFloat64Sign(a);
2278     bSig = extractFloat64Frac(b);
2279     bExp = extractFloat64Exp(b);
2280     bSign = extractFloat64Sign(b);
2281     zSign = aSign ^ bSign;
2282 
2283     if (get_denormals_are_zeros(status)) {
2284         if (aExp == 0) aSig = 0;
2285         if (bExp == 0) bSig = 0;
2286     }
2287 
2288     if (aExp == 0x7FF) {
2289         if (aSig || ((bExp == 0x7FF) && bSig)) {
2290             return propagateFloat64NaN(a, b, status);
2291         }
2292         if ((bExp | bSig) == 0) {
2293             float_raise(status, float_flag_invalid);
2294             return float64_default_nan;
2295         }
2296         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2297         return packFloat64(zSign, 0x7FF, 0);
2298     }
2299     if (bExp == 0x7FF) {
2300         if (bSig) return propagateFloat64NaN(a, b, status);
2301         if ((aExp | aSig) == 0) {
2302             float_raise(status, float_flag_invalid);
2303             return float64_default_nan;
2304         }
2305         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2306         return packFloat64(zSign, 0x7FF, 0);
2307     }
2308     if (aExp == 0) {
2309         if (aSig == 0) {
2310             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2311             return packFloat64(zSign, 0, 0);
2312         }
2313         float_raise(status, float_flag_denormal);
2314         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2315     }
2316     if (bExp == 0) {
2317         if (bSig == 0) return packFloat64(zSign, 0, 0);
2318         float_raise(status, float_flag_denormal);
2319         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
2320     }
2321     zExp = aExp + bExp - 0x3FF;
2322     aSig = (aSig | BX_CONST64(0x0010000000000000))<<10;
2323     bSig = (bSig | BX_CONST64(0x0010000000000000))<<11;
2324     mul64To128(aSig, bSig, &zSig0, &zSig1);
2325     zSig0 |= (zSig1 != 0);
2326     if (0 <= (Bit64s) (zSig0<<1)) {
2327         zSig0 <<= 1;
2328         --zExp;
2329     }
2330     return roundAndPackFloat64(zSign, zExp, zSig0, status);
2331 }
2332 
2333 /*----------------------------------------------------------------------------
2334 | Returns the result of dividing the double-precision floating-point value `a'
2335 | by the corresponding value `b'.  The operation is performed according to
2336 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2337 *----------------------------------------------------------------------------*/
2338 
float64_div(float64 a,float64 b,float_status_t & status)2339 float64 float64_div(float64 a, float64 b, float_status_t &status)
2340 {
2341     int aSign, bSign, zSign;
2342     Bit16s aExp, bExp, zExp;
2343     Bit64u aSig, bSig, zSig;
2344     Bit64u rem0, rem1;
2345     Bit64u term0, term1;
2346 
2347     aSig = extractFloat64Frac(a);
2348     aExp = extractFloat64Exp(a);
2349     aSign = extractFloat64Sign(a);
2350     bSig = extractFloat64Frac(b);
2351     bExp = extractFloat64Exp(b);
2352     bSign = extractFloat64Sign(b);
2353     zSign = aSign ^ bSign;
2354 
2355     if (get_denormals_are_zeros(status)) {
2356         if (aExp == 0) aSig = 0;
2357         if (bExp == 0) bSig = 0;
2358     }
2359 
2360     if (aExp == 0x7FF) {
2361         if (aSig) return propagateFloat64NaN(a, b, status);
2362         if (bExp == 0x7FF) {
2363             if (bSig) return propagateFloat64NaN(a, b, status);
2364             float_raise(status, float_flag_invalid);
2365             return float64_default_nan;
2366         }
2367         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2368         return packFloat64(zSign, 0x7FF, 0);
2369     }
2370     if (bExp == 0x7FF) {
2371         if (bSig) return propagateFloat64NaN(a, b, status);
2372         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2373         return packFloat64(zSign, 0, 0);
2374     }
2375     if (bExp == 0) {
2376         if (bSig == 0) {
2377             if ((aExp | aSig) == 0) {
2378                 float_raise(status, float_flag_invalid);
2379                 return float64_default_nan;
2380             }
2381             float_raise(status, float_flag_divbyzero);
2382             return packFloat64(zSign, 0x7FF, 0);
2383         }
2384         float_raise(status, float_flag_denormal);
2385         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
2386     }
2387     if (aExp == 0) {
2388         if (aSig == 0) return packFloat64(zSign, 0, 0);
2389         float_raise(status, float_flag_denormal);
2390         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2391     }
2392     zExp = aExp - bExp + 0x3FD;
2393     aSig = (aSig | BX_CONST64(0x0010000000000000))<<10;
2394     bSig = (bSig | BX_CONST64(0x0010000000000000))<<11;
2395     if (bSig <= (aSig + aSig)) {
2396         aSig >>= 1;
2397         ++zExp;
2398     }
2399     zSig = estimateDiv128To64(aSig, 0, bSig);
2400     if ((zSig & 0x1FF) <= 2) {
2401         mul64To128(bSig, zSig, &term0, &term1);
2402         sub128(aSig, 0, term0, term1, &rem0, &rem1);
2403         while ((Bit64s) rem0 < 0) {
2404             --zSig;
2405             add128(rem0, rem1, 0, bSig, &rem0, &rem1);
2406         }
2407         zSig |= (rem1 != 0);
2408     }
2409     return roundAndPackFloat64(zSign, zExp, zSig, status);
2410 }
2411 
2412 /*----------------------------------------------------------------------------
2413 | Returns the square root of the double-precision floating-point value `a'.
2414 | The operation is performed according to the IEC/IEEE Standard for Binary
2415 | Floating-Point Arithmetic.
2416 *----------------------------------------------------------------------------*/
2417 
float64_sqrt(float64 a,float_status_t & status)2418 float64 float64_sqrt(float64 a, float_status_t &status)
2419 {
2420     int aSign;
2421     Bit16s aExp, zExp;
2422     Bit64u aSig, zSig, doubleZSig;
2423     Bit64u rem0, rem1, term0, term1;
2424 
2425     aSig = extractFloat64Frac(a);
2426     aExp = extractFloat64Exp(a);
2427     aSign = extractFloat64Sign(a);
2428 
2429     if (aExp == 0x7FF) {
2430         if (aSig) return propagateFloat64NaN(a, status);
2431         if (! aSign) return a;
2432         float_raise(status, float_flag_invalid);
2433         return float64_default_nan;
2434     }
2435 
2436     if (get_denormals_are_zeros(status)) {
2437         if (aExp == 0) aSig = 0;
2438     }
2439 
2440     if (aSign) {
2441         if ((aExp | aSig) == 0) return packFloat64(aSign, 0, 0);
2442         float_raise(status, float_flag_invalid);
2443         return float64_default_nan;
2444     }
2445     if (aExp == 0) {
2446         if (aSig == 0) return 0;
2447         float_raise(status, float_flag_denormal);
2448         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2449     }
2450     zExp = ((aExp - 0x3FF)>>1) + 0x3FE;
2451     aSig |= BX_CONST64(0x0010000000000000);
2452     zSig = estimateSqrt32(aExp, (Bit32u)(aSig>>21));
2453     aSig <<= 9 - (aExp & 1);
2454     zSig = estimateDiv128To64(aSig, 0, zSig<<32) + (zSig<<30);
2455     if ((zSig & 0x1FF) <= 5) {
2456         doubleZSig = zSig<<1;
2457         mul64To128(zSig, zSig, &term0, &term1);
2458         sub128(aSig, 0, term0, term1, &rem0, &rem1);
2459         while ((Bit64s) rem0 < 0) {
2460             --zSig;
2461             doubleZSig -= 2;
2462             add128(rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1);
2463         }
2464         zSig |= ((rem0 | rem1) != 0);
2465     }
2466     return roundAndPackFloat64(0, zExp, zSig, status);
2467 }
2468 
2469 /*----------------------------------------------------------------------------
2470 | Determine double-precision floating-point number class
2471 *----------------------------------------------------------------------------*/
2472 
float64_class(float64 a)2473 float_class_t float64_class(float64 a)
2474 {
2475    Bit16s aExp = extractFloat64Exp(a);
2476    Bit64u aSig = extractFloat64Frac(a);
2477    int  aSign = extractFloat64Sign(a);
2478 
2479    if(aExp == 0x7FF) {
2480        if (aSig == 0)
2481            return (aSign) ? float_negative_inf : float_positive_inf;
2482 
2483        return (aSig & BX_CONST64(0x0008000000000000)) ? float_QNaN : float_SNaN;
2484    }
2485 
2486    if(aExp == 0) {
2487        if (aSig == 0)
2488            return float_zero;
2489        return float_denormal;
2490    }
2491 
2492    return float_normalized;
2493 }
2494 
2495 /*----------------------------------------------------------------------------
2496 | Compare  between  two  double  precision  floating  point  numbers. Returns
2497 | 'float_relation_equal'  if the operands are equal, 'float_relation_less' if
2498 | the    value    'a'   is   less   than   the   corresponding   value   `b',
2499 | 'float_relation_greater' if the value 'a' is greater than the corresponding
2500 | value `b', or 'float_relation_unordered' otherwise.
2501 *----------------------------------------------------------------------------*/
2502 
float64_compare(float64 a,float64 b,int quiet,float_status_t & status)2503 int float64_compare(float64 a, float64 b, int quiet, float_status_t &status)
2504 {
2505     if (get_denormals_are_zeros(status)) {
2506         a = float64_denormal_to_zero(a);
2507         b = float64_denormal_to_zero(b);
2508     }
2509 
2510     float_class_t aClass = float64_class(a);
2511     float_class_t bClass = float64_class(b);
2512 
2513     if (aClass == float_SNaN || bClass == float_SNaN) {
2514         float_raise(status, float_flag_invalid);
2515         return float_relation_unordered;
2516     }
2517 
2518     if (aClass == float_QNaN || bClass == float_QNaN) {
2519         if (! quiet) float_raise(status, float_flag_invalid);
2520         return float_relation_unordered;
2521     }
2522 
2523     if (aClass == float_denormal || bClass == float_denormal) {
2524         float_raise(status, float_flag_denormal);
2525     }
2526 
2527     if ((a == b) || ((Bit64u) ((a | b)<<1) == 0)) return float_relation_equal;
2528 
2529     int aSign = extractFloat64Sign(a);
2530     int bSign = extractFloat64Sign(b);
2531     if (aSign != bSign)
2532         return (aSign) ? float_relation_less : float_relation_greater;
2533 
2534     if (aSign ^ (a < b)) return float_relation_less;
2535     return float_relation_greater;
2536 }
2537 
2538 /*----------------------------------------------------------------------------
2539 | Compare between two double precision floating point numbers and return the
2540 | smaller of them.
2541 *----------------------------------------------------------------------------*/
2542 
float64_min(float64 a,float64 b,float_status_t & status)2543 float64 float64_min(float64 a, float64 b, float_status_t &status)
2544 {
2545   if (get_denormals_are_zeros(status)) {
2546     a = float64_denormal_to_zero(a);
2547     b = float64_denormal_to_zero(b);
2548   }
2549 
2550   return (float64_compare(a, b, status) == float_relation_less) ? a : b;
2551 }
2552 
2553 /*----------------------------------------------------------------------------
2554 | Compare between two double precision floating point numbers and return the
2555 | larger of them.
2556 *----------------------------------------------------------------------------*/
2557 
float64_max(float64 a,float64 b,float_status_t & status)2558 float64 float64_max(float64 a, float64 b, float_status_t &status)
2559 {
2560   if (get_denormals_are_zeros(status)) {
2561     a = float64_denormal_to_zero(a);
2562     b = float64_denormal_to_zero(b);
2563   }
2564 
2565   return (float64_compare(a, b, status) == float_relation_greater) ? a : b;
2566 }
2567 
2568 /*----------------------------------------------------------------------------
2569 | Compare between two  double precision  floating point numbers and  return the
2570 | smaller/larger of them. The operation  is performed according to the IEC/IEEE
2571 | Standard for Binary Floating-Point Arithmetic.
2572 *----------------------------------------------------------------------------*/
2573 
float64_minmax(float64 a,float64 b,int is_max,int is_abs,float_status_t & status)2574 float64 float64_minmax(float64 a, float64 b, int is_max, int is_abs, float_status_t &status)
2575 {
2576     if (get_denormals_are_zeros(status)) {
2577         a = float64_denormal_to_zero(a);
2578         b = float64_denormal_to_zero(b);
2579     }
2580 
2581     if (float64_is_nan(a) || float64_is_nan(b)) {
2582         if (float64_is_signaling_nan(a)) {
2583             return propagateFloat64NaN(a, status);
2584         }
2585         if (float64_is_signaling_nan(b)) {
2586             return propagateFloat64NaN(b, status);
2587         }
2588         if (! float64_is_nan(b)) {
2589             if (float64_is_denormal(b))
2590                 float_raise(status, float_flag_denormal);
2591             return b;
2592         }
2593         if (! float64_is_nan(a)) {
2594             if (float64_is_denormal(a))
2595                 float_raise(status, float_flag_denormal);
2596             return a;
2597         }
2598         return propagateFloat64NaN(a, b, status);
2599     }
2600 
2601     float64 tmp_a = a, tmp_b = b;
2602     if (is_abs) {
2603         tmp_a &= ~BX_CONST64(0x8000000000000000); // clear the sign bit
2604         tmp_b &= ~BX_CONST64(0x8000000000000000);
2605     }
2606 
2607     int aSign = extractFloat64Sign(tmp_a);
2608     int bSign = extractFloat64Sign(tmp_b);
2609 
2610     if (float64_is_denormal(a) || float64_is_denormal(b))
2611         float_raise(status, float_flag_denormal);
2612 
2613     if (aSign != bSign) {
2614         if (! is_max) {
2615             return aSign ? a : b;
2616         } else {
2617             return aSign ? b : a;
2618         }
2619     } else {
2620         if (! is_max) {
2621             return (aSign ^ (tmp_a < tmp_b)) ? a : b;
2622         } else {
2623             return (aSign ^ (tmp_a < tmp_b)) ? b : a;
2624         }
2625     }
2626 }
2627 
2628 #ifdef FLOATX80
2629 
2630 /*----------------------------------------------------------------------------
2631 | Returns the result of converting the 32-bit two's complement integer `a'
2632 | to the extended double-precision floating-point format.  The conversion
2633 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2634 | Arithmetic.
2635 *----------------------------------------------------------------------------*/
2636 
int32_to_floatx80(Bit32s a)2637 floatx80 int32_to_floatx80(Bit32s a)
2638 {
2639     if (a == 0) return packFloatx80(0, 0, 0);
2640     int   zSign = (a < 0);
2641     Bit32u absA = zSign ? -a : a;
2642     int    shiftCount = countLeadingZeros32(absA) + 32;
2643     Bit64u zSig = absA;
2644     return packFloatx80(zSign, 0x403E - shiftCount, zSig<<shiftCount);
2645 }
2646 
2647 /*----------------------------------------------------------------------------
2648 | Returns the result of converting the 64-bit two's complement integer `a'
2649 | to the extended double-precision floating-point format.  The conversion
2650 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2651 | Arithmetic.
2652 *----------------------------------------------------------------------------*/
2653 
int64_to_floatx80(Bit64s a)2654 floatx80 int64_to_floatx80(Bit64s a)
2655 {
2656     if (a == 0) return packFloatx80(0, 0, 0);
2657     int   zSign = (a < 0);
2658     Bit64u absA = zSign ? -a : a;
2659     int    shiftCount = countLeadingZeros64(absA);
2660     return packFloatx80(zSign, 0x403E - shiftCount, absA<<shiftCount);
2661 }
2662 
2663 /*----------------------------------------------------------------------------
2664 | Returns the result of converting the single-precision floating-point value
2665 | `a' to the extended double-precision floating-point format.  The conversion
2666 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2667 | Arithmetic.
2668 *----------------------------------------------------------------------------*/
2669 
float32_to_floatx80(float32 a,float_status_t & status)2670 floatx80 float32_to_floatx80(float32 a, float_status_t &status)
2671 {
2672     Bit32u aSig = extractFloat32Frac(a);
2673     Bit16s aExp = extractFloat32Exp(a);
2674     int aSign = extractFloat32Sign(a);
2675     if (aExp == 0xFF) {
2676         if (aSig) return commonNaNToFloatx80(float32ToCommonNaN(a, status));
2677         return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2678     }
2679     if (aExp == 0) {
2680         if (aSig == 0) return packFloatx80(aSign, 0, 0);
2681         float_raise(status, float_flag_denormal);
2682         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2683     }
2684     aSig |= 0x00800000;
2685     return packFloatx80(aSign, aExp + 0x3F80, ((Bit64u) aSig)<<40);
2686 }
2687 
2688 /*----------------------------------------------------------------------------
2689 | Returns the result of converting the double-precision floating-point value
2690 | `a' to the extended double-precision floating-point format.  The conversion
2691 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2692 | Arithmetic.
2693 *----------------------------------------------------------------------------*/
2694 
float64_to_floatx80(float64 a,float_status_t & status)2695 floatx80 float64_to_floatx80(float64 a, float_status_t &status)
2696 {
2697     Bit64u aSig = extractFloat64Frac(a);
2698     Bit16s aExp = extractFloat64Exp(a);
2699     int aSign = extractFloat64Sign(a);
2700 
2701     if (aExp == 0x7FF) {
2702         if (aSig) return commonNaNToFloatx80(float64ToCommonNaN(a, status));
2703         return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2704     }
2705     if (aExp == 0) {
2706         if (aSig == 0) return packFloatx80(aSign, 0, 0);
2707         float_raise(status, float_flag_denormal);
2708         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2709     }
2710     return
2711         packFloatx80(
2712             aSign, aExp + 0x3C00, (aSig | BX_CONST64(0x0010000000000000))<<11);
2713 }
2714 
2715 /*----------------------------------------------------------------------------
2716 | Returns the result of converting the extended double-precision floating-
2717 | point value `a' to the 32-bit two's complement integer format.  The
2718 | conversion is performed according to the IEC/IEEE Standard for Binary
2719 | Floating-Point Arithmetic - which means in particular that the conversion
2720 | is rounded according to the current rounding mode. If `a' is a NaN or the
2721 | conversion overflows, the integer indefinite value is returned.
2722 *----------------------------------------------------------------------------*/
2723 
floatx80_to_int32(floatx80 a,float_status_t & status)2724 Bit32s floatx80_to_int32(floatx80 a, float_status_t &status)
2725 {
2726     Bit64u aSig = extractFloatx80Frac(a);
2727     Bit32s aExp = extractFloatx80Exp(a);
2728     int aSign = extractFloatx80Sign(a);
2729 
2730     // handle unsupported extended double-precision floating encodings
2731     if (floatx80_is_unsupported(a))
2732     {
2733         float_raise(status, float_flag_invalid);
2734         return int32_indefinite;
2735     }
2736 
2737     if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1)) aSign = 0;
2738     int shiftCount = 0x4037 - aExp;
2739     if (shiftCount <= 0) shiftCount = 1;
2740     aSig = shift64RightJamming(aSig, shiftCount);
2741     return roundAndPackInt32(aSign, aSig, status);
2742 }
2743 
2744 /*----------------------------------------------------------------------------
2745 | Returns the result of converting the extended double-precision floating-
2746 | point value `a' to the 32-bit two's complement integer format.  The
2747 | conversion is performed according to the IEC/IEEE Standard for Binary
2748 | Floating-Point Arithmetic, except that the conversion is always rounded
2749 | toward zero.  If `a' is a NaN or the conversion overflows, the integer
2750 | indefinite value is returned.
2751 *----------------------------------------------------------------------------*/
2752 
floatx80_to_int32_round_to_zero(floatx80 a,float_status_t & status)2753 Bit32s floatx80_to_int32_round_to_zero(floatx80 a, float_status_t &status)
2754 {
2755     Bit32s aExp;
2756     Bit64u aSig, savedASig;
2757     Bit32s z;
2758     int shiftCount;
2759 
2760     // handle unsupported extended double-precision floating encodings
2761     if (floatx80_is_unsupported(a))
2762     {
2763         float_raise(status, float_flag_invalid);
2764         return int32_indefinite;
2765     }
2766 
2767     aSig = extractFloatx80Frac(a);
2768     aExp = extractFloatx80Exp(a);
2769     int aSign = extractFloatx80Sign(a);
2770 
2771     if (aExp > 0x401E) {
2772         float_raise(status, float_flag_invalid);
2773         return (Bit32s)(int32_indefinite);
2774     }
2775     if (aExp < 0x3FFF) {
2776         if (aExp || aSig) float_raise(status, float_flag_inexact);
2777         return 0;
2778     }
2779     shiftCount = 0x403E - aExp;
2780     savedASig = aSig;
2781     aSig >>= shiftCount;
2782     z = (Bit32s) aSig;
2783     if (aSign) z = -z;
2784     if ((z < 0) ^ aSign) {
2785         float_raise(status, float_flag_invalid);
2786         return (Bit32s)(int32_indefinite);
2787     }
2788     if ((aSig<<shiftCount) != savedASig)
2789     {
2790         float_raise(status, float_flag_inexact);
2791     }
2792     return z;
2793 }
2794 
2795 /*----------------------------------------------------------------------------
2796 | Returns the result of converting the extended double-precision floating-
2797 | point value `a' to the 64-bit two's complement integer format.  The
2798 | conversion is performed according to the IEC/IEEE Standard for Binary
2799 | Floating-Point Arithmetic - which means in particular that the conversion
2800 | is rounded according to the current rounding mode. If `a' is a NaN or the
2801 | conversion overflows, the integer indefinite value is returned.
2802 *----------------------------------------------------------------------------*/
2803 
floatx80_to_int64(floatx80 a,float_status_t & status)2804 Bit64s floatx80_to_int64(floatx80 a, float_status_t &status)
2805 {
2806     Bit32s aExp;
2807     Bit64u aSig, aSigExtra;
2808 
2809     // handle unsupported extended double-precision floating encodings
2810     if (floatx80_is_unsupported(a))
2811     {
2812         float_raise(status, float_flag_invalid);
2813         return int64_indefinite;
2814     }
2815 
2816     aSig = extractFloatx80Frac(a);
2817     aExp = extractFloatx80Exp(a);
2818     int aSign = extractFloatx80Sign(a);
2819 
2820     int shiftCount = 0x403E - aExp;
2821     if (shiftCount <= 0)
2822     {
2823         if (shiftCount)
2824         {
2825             float_raise(status, float_flag_invalid);
2826             return (Bit64s)(int64_indefinite);
2827         }
2828         aSigExtra = 0;
2829     }
2830     else {
2831         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
2832     }
2833 
2834     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
2835 }
2836 
2837 /*----------------------------------------------------------------------------
2838 | Returns the result of converting the extended double-precision floating-
2839 | point value `a' to the 64-bit two's complement integer format.  The
2840 | conversion is performed according to the IEC/IEEE Standard for Binary
2841 | Floating-Point Arithmetic, except that the conversion is always rounded
2842 | toward zero.  If `a' is a NaN or the conversion overflows, the integer
2843 | indefinite value is returned.
2844 *----------------------------------------------------------------------------*/
2845 
floatx80_to_int64_round_to_zero(floatx80 a,float_status_t & status)2846 Bit64s floatx80_to_int64_round_to_zero(floatx80 a, float_status_t &status)
2847 {
2848     int aSign;
2849     Bit32s aExp;
2850     Bit64u aSig;
2851     Bit64s z;
2852 
2853     // handle unsupported extended double-precision floating encodings
2854     if (floatx80_is_unsupported(a))
2855     {
2856         float_raise(status, float_flag_invalid);
2857         return int64_indefinite;
2858     }
2859 
2860     aSig = extractFloatx80Frac(a);
2861     aExp = extractFloatx80Exp(a);
2862     aSign = extractFloatx80Sign(a);
2863     int shiftCount = aExp - 0x403E;
2864     if (0 <= shiftCount) {
2865         aSig &= BX_CONST64(0x7FFFFFFFFFFFFFFF);
2866         if ((a.exp != 0xC03E) || aSig) {
2867             float_raise(status, float_flag_invalid);
2868         }
2869         return (Bit64s)(int64_indefinite);
2870     }
2871     else if (aExp < 0x3FFF) {
2872         if (aExp | aSig) float_raise(status, float_flag_inexact);
2873         return 0;
2874     }
2875     z = aSig>>(-shiftCount);
2876     if ((Bit64u) (aSig<<(shiftCount & 63))) {
2877         float_raise(status, float_flag_inexact);
2878     }
2879     if (aSign) z = -z;
2880     return z;
2881 }
2882 
2883 /*----------------------------------------------------------------------------
2884 | Returns the result of converting the extended double-precision floating-
2885 | point value `a' to the single-precision floating-point format.  The
2886 | conversion is performed according to the IEC/IEEE Standard for Binary
2887 | Floating-Point Arithmetic.
2888 *----------------------------------------------------------------------------*/
2889 
floatx80_to_float32(floatx80 a,float_status_t & status)2890 float32 floatx80_to_float32(floatx80 a, float_status_t &status)
2891 {
2892     Bit64u aSig = extractFloatx80Frac(a);
2893     Bit32s aExp = extractFloatx80Exp(a);
2894     int aSign = extractFloatx80Sign(a);
2895 
2896     // handle unsupported extended double-precision floating encodings
2897     if (floatx80_is_unsupported(a))
2898     {
2899         float_raise(status, float_flag_invalid);
2900         return float32_default_nan;
2901     }
2902 
2903     if (aExp == 0x7FFF) {
2904         if ((Bit64u) (aSig<<1))
2905             return commonNaNToFloat32(floatx80ToCommonNaN(a, status));
2906 
2907         return packFloat32(aSign, 0xFF, 0);
2908     }
2909     aSig = shift64RightJamming(aSig, 33);
2910     if (aExp || aSig) aExp -= 0x3F81;
2911     return roundAndPackFloat32(aSign, aExp, (Bit32u) aSig, status);
2912 }
2913 
2914 /*----------------------------------------------------------------------------
2915 | Returns the result of converting the extended double-precision floating-
2916 | point value `a' to the double-precision floating-point format.  The
2917 | conversion is performed according to the IEC/IEEE Standard for Binary
2918 | Floating-Point Arithmetic.
2919 *----------------------------------------------------------------------------*/
2920 
floatx80_to_float64(floatx80 a,float_status_t & status)2921 float64 floatx80_to_float64(floatx80 a, float_status_t &status)
2922 {
2923     Bit32s aExp;
2924     Bit64u aSig, zSig;
2925 
2926     // handle unsupported extended double-precision floating encodings
2927     if (floatx80_is_unsupported(a))
2928     {
2929         float_raise(status, float_flag_invalid);
2930         return float64_default_nan;
2931     }
2932 
2933     aSig = extractFloatx80Frac(a);
2934     aExp = extractFloatx80Exp(a);
2935     int aSign = extractFloatx80Sign(a);
2936 
2937     if (aExp == 0x7FFF) {
2938         if ((Bit64u) (aSig<<1)) {
2939             return commonNaNToFloat64(floatx80ToCommonNaN(a, status));
2940         }
2941         return packFloat64(aSign, 0x7FF, 0);
2942     }
2943     zSig = shift64RightJamming(aSig, 1);
2944     if (aExp || aSig) aExp -= 0x3C01;
2945     return roundAndPackFloat64(aSign, aExp, zSig, status);
2946 }
2947 
2948 /*----------------------------------------------------------------------------
2949 | Rounds the extended double-precision floating-point value `a' to an integer,
2950 | and returns the result as an extended double-precision floating-point
2951 | value.  The operation is performed according to the IEC/IEEE Standard for
2952 | Binary Floating-Point Arithmetic.
2953 *----------------------------------------------------------------------------*/
2954 
floatx80_round_to_int(floatx80 a,float_status_t & status)2955 floatx80 floatx80_round_to_int(floatx80 a, float_status_t &status)
2956 {
2957     int aSign;
2958     Bit64u lastBitMask, roundBitsMask;
2959     int roundingMode = get_float_rounding_mode(status);
2960     floatx80 z;
2961 
2962     // handle unsupported extended double-precision floating encodings
2963     if (floatx80_is_unsupported(a))
2964     {
2965         float_raise(status, float_flag_invalid);
2966         return floatx80_default_nan;
2967     }
2968 
2969     Bit32s aExp = extractFloatx80Exp(a);
2970     Bit64u aSig = extractFloatx80Frac(a);
2971     if (0x403E <= aExp) {
2972         if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1)) {
2973             return propagateFloatx80NaN(a, status);
2974         }
2975         return a;
2976     }
2977     if (aExp < 0x3FFF) {
2978         if (aExp == 0) {
2979             if ((aSig<<1) == 0) return a;
2980             float_raise(status, float_flag_denormal);
2981         }
2982         float_raise(status, float_flag_inexact);
2983         aSign = extractFloatx80Sign(a);
2984         switch (roundingMode) {
2985          case float_round_nearest_even:
2986             if ((aExp == 0x3FFE) && (Bit64u) (aSig<<1)) {
2987                 set_float_rounding_up(status);
2988                 return packFloatx80(aSign, 0x3FFF, BX_CONST64(0x8000000000000000));
2989             }
2990             break;
2991          case float_round_down:
2992             if (aSign) {
2993                 set_float_rounding_up(status);
2994                 return packFloatx80(1, 0x3FFF, BX_CONST64(0x8000000000000000));
2995             }
2996             else {
2997                 return packFloatx80(0, 0, 0);
2998             }
2999          case float_round_up:
3000             if (aSign) {
3001                 return packFloatx80(1, 0, 0);
3002             }
3003             else {
3004                 set_float_rounding_up(status);
3005                 return packFloatx80(0, 0x3FFF, BX_CONST64(0x8000000000000000));
3006             }
3007         }
3008         return packFloatx80(aSign, 0, 0);
3009     }
3010     lastBitMask = 1;
3011     lastBitMask <<= 0x403E - aExp;
3012     roundBitsMask = lastBitMask - 1;
3013     z = a;
3014     if (roundingMode == float_round_nearest_even) {
3015         z.fraction += lastBitMask>>1;
3016         if ((z.fraction & roundBitsMask) == 0) z.fraction &= ~lastBitMask;
3017     }
3018     else if (roundingMode != float_round_to_zero) {
3019         if (extractFloatx80Sign(z) ^ (roundingMode == float_round_up))
3020             z.fraction += roundBitsMask;
3021     }
3022     z.fraction &= ~roundBitsMask;
3023     if (z.fraction == 0) {
3024         z.exp++;
3025         z.fraction = BX_CONST64(0x8000000000000000);
3026     }
3027     if (z.fraction != a.fraction) {
3028         float_raise(status, float_flag_inexact);
3029         if (z.fraction > a.fraction || z.exp > a.exp)
3030             set_float_rounding_up(status);
3031     }
3032     return z;
3033 }
3034 
3035 /*----------------------------------------------------------------------------
3036 | Returns the result of adding the absolute values of the extended double-
3037 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3038 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3039 | The addition is performed according to the IEC/IEEE Standard for Binary
3040 | Floating-Point Arithmetic.
3041 *----------------------------------------------------------------------------*/
3042 
addFloatx80Sigs(floatx80 a,floatx80 b,int zSign,float_status_t & status)3043 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status)
3044 {
3045     Bit32s aExp, bExp, zExp;
3046     Bit64u aSig, bSig, zSig0, zSig1;
3047 
3048     // handle unsupported extended double-precision floating encodings
3049     if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3050     {
3051         float_raise(status, float_flag_invalid);
3052         return floatx80_default_nan;
3053     }
3054 
3055     aSig = extractFloatx80Frac(a);
3056     aExp = extractFloatx80Exp(a);
3057     bSig = extractFloatx80Frac(b);
3058     bExp = extractFloatx80Exp(b);
3059 
3060     if (aExp == 0x7FFF) {
3061         if ((Bit64u) (aSig<<1) || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
3062             return propagateFloatx80NaN(a, b, status);
3063         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3064         return a;
3065     }
3066     if (bExp == 0x7FFF) {
3067         if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3068         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
3069         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3070     }
3071     if (aExp == 0) {
3072         if (aSig == 0) {
3073             if ((bExp == 0) && bSig) {
3074                 float_raise(status, float_flag_denormal);
3075                 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3076             }
3077             return roundAndPackFloatx80(get_float_rounding_precision(status),
3078                     zSign, bExp, bSig, 0, status);
3079         }
3080         float_raise(status, float_flag_denormal);
3081         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3082     }
3083     if (bExp == 0) {
3084         if (bSig == 0)
3085             return roundAndPackFloatx80(get_float_rounding_precision(status),
3086                     zSign, aExp, aSig, 0, status);
3087 
3088         float_raise(status, float_flag_denormal);
3089         normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3090     }
3091     Bit32s expDiff = aExp - bExp;
3092     zExp = aExp;
3093     if (0 < expDiff) {
3094         shift64ExtraRightJamming(bSig, 0,  expDiff, &bSig, &zSig1);
3095     }
3096     else if (expDiff < 0) {
3097         shift64ExtraRightJamming(aSig, 0, -expDiff, &aSig, &zSig1);
3098         zExp = bExp;
3099     }
3100     else {
3101         zSig0 = aSig + bSig;
3102         zSig1 = 0;
3103         goto shiftRight1;
3104     }
3105     zSig0 = aSig + bSig;
3106     if ((Bit64s) zSig0 < 0) goto roundAndPack;
3107  shiftRight1:
3108     shift64ExtraRightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3109     zSig0 |= BX_CONST64(0x8000000000000000);
3110     zExp++;
3111  roundAndPack:
3112     return
3113         roundAndPackFloatx80(get_float_rounding_precision(status),
3114             zSign, zExp, zSig0, zSig1, status);
3115 }
3116 
3117 /*----------------------------------------------------------------------------
3118 | Returns the result of subtracting the absolute values of the extended
3119 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3120 | difference is negated before being returned.  `zSign' is ignored if the
3121 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3122 | Standard for Binary Floating-Point Arithmetic.
3123 *----------------------------------------------------------------------------*/
3124 
subFloatx80Sigs(floatx80 a,floatx80 b,int zSign,float_status_t & status)3125 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status)
3126 {
3127     Bit32s aExp, bExp, zExp;
3128     Bit64u aSig, bSig, zSig0, zSig1;
3129 
3130     // handle unsupported extended double-precision floating encodings
3131     if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3132     {
3133         float_raise(status, float_flag_invalid);
3134         return floatx80_default_nan;
3135     }
3136 
3137     aSig = extractFloatx80Frac(a);
3138     aExp = extractFloatx80Exp(a);
3139     bSig = extractFloatx80Frac(b);
3140     bExp = extractFloatx80Exp(b);
3141 
3142     if (aExp == 0x7FFF) {
3143         if ((Bit64u) (aSig<<1)) return propagateFloatx80NaN(a, b, status);
3144         if (bExp == 0x7FFF) {
3145             if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3146             float_raise(status, float_flag_invalid);
3147             return floatx80_default_nan;
3148         }
3149         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3150         return a;
3151     }
3152     if (bExp == 0x7FFF) {
3153         if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3154         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
3155         return packFloatx80(zSign ^ 1, 0x7FFF, BX_CONST64(0x8000000000000000));
3156     }
3157     if (aExp == 0) {
3158         if (aSig == 0) {
3159             if (bExp == 0) {
3160                 if (bSig) {
3161                     float_raise(status, float_flag_denormal);
3162                     normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3163                     return roundAndPackFloatx80(get_float_rounding_precision(status),
3164                         zSign ^ 1, bExp, bSig, 0, status);
3165                 }
3166                 return packFloatx80(get_float_rounding_mode(status) == float_round_down, 0, 0);
3167             }
3168             return roundAndPackFloatx80(get_float_rounding_precision(status),
3169                     zSign ^ 1, bExp, bSig, 0, status);
3170         }
3171         float_raise(status, float_flag_denormal);
3172         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3173     }
3174     if (bExp == 0) {
3175         if (bSig == 0)
3176             return roundAndPackFloatx80(get_float_rounding_precision(status),
3177                     zSign, aExp, aSig, 0, status);
3178 
3179         float_raise(status, float_flag_denormal);
3180         normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3181     }
3182     Bit32s expDiff = aExp - bExp;
3183     if (0 < expDiff) {
3184         shift128RightJamming(bSig, 0, expDiff, &bSig, &zSig1);
3185         goto aBigger;
3186     }
3187     if (expDiff < 0) {
3188         shift128RightJamming(aSig, 0, -expDiff, &aSig, &zSig1);
3189         goto bBigger;
3190     }
3191     zSig1 = 0;
3192     if (bSig < aSig) goto aBigger;
3193     if (aSig < bSig) goto bBigger;
3194     return packFloatx80(get_float_rounding_mode(status) == float_round_down, 0, 0);
3195  bBigger:
3196     sub128(bSig, 0, aSig, zSig1, &zSig0, &zSig1);
3197     zExp = bExp;
3198     zSign ^= 1;
3199     goto normalizeRoundAndPack;
3200  aBigger:
3201     sub128(aSig, 0, bSig, zSig1, &zSig0, &zSig1);
3202     zExp = aExp;
3203  normalizeRoundAndPack:
3204     return
3205         normalizeRoundAndPackFloatx80(get_float_rounding_precision(status),
3206             zSign, zExp, zSig0, zSig1, status);
3207 }
3208 
3209 /*----------------------------------------------------------------------------
3210 | Returns the result of adding the extended double-precision floating-point
3211 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3212 | Standard for Binary Floating-Point Arithmetic.
3213 *----------------------------------------------------------------------------*/
3214 
floatx80_add(floatx80 a,floatx80 b,float_status_t & status)3215 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status_t &status)
3216 {
3217     int aSign = extractFloatx80Sign(a);
3218     int bSign = extractFloatx80Sign(b);
3219 
3220     if (aSign == bSign)
3221         return addFloatx80Sigs(a, b, aSign, status);
3222     else
3223         return subFloatx80Sigs(a, b, aSign, status);
3224 }
3225 
3226 /*----------------------------------------------------------------------------
3227 | Returns the result of subtracting the extended double-precision floating-
3228 | point values `a' and `b'.  The operation is performed according to the
3229 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3230 *----------------------------------------------------------------------------*/
3231 
floatx80_sub(floatx80 a,floatx80 b,float_status_t & status)3232 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status_t &status)
3233 {
3234     int aSign = extractFloatx80Sign(a);
3235     int bSign = extractFloatx80Sign(b);
3236 
3237     if (aSign == bSign)
3238         return subFloatx80Sigs(a, b, aSign, status);
3239     else
3240         return addFloatx80Sigs(a, b, aSign, status);
3241 }
3242 
3243 /*----------------------------------------------------------------------------
3244 | Returns the result of multiplying the extended double-precision floating-
3245 | point values `a' and `b'.  The operation is performed according to the
3246 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3247 *----------------------------------------------------------------------------*/
3248 
floatx80_mul(floatx80 a,floatx80 b,float_status_t & status)3249 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status_t &status)
3250 {
3251     int aSign, bSign, zSign;
3252     Bit32s aExp, bExp, zExp;
3253     Bit64u aSig, bSig, zSig0, zSig1;
3254 
3255     // handle unsupported extended double-precision floating encodings
3256     if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3257     {
3258  invalid:
3259         float_raise(status, float_flag_invalid);
3260         return floatx80_default_nan;
3261     }
3262 
3263     aSig = extractFloatx80Frac(a);
3264     aExp = extractFloatx80Exp(a);
3265     aSign = extractFloatx80Sign(a);
3266     bSig = extractFloatx80Frac(b);
3267     bExp = extractFloatx80Exp(b);
3268     bSign = extractFloatx80Sign(b);
3269     zSign = aSign ^ bSign;
3270 
3271     if (aExp == 0x7FFF) {
3272         if ((Bit64u) (aSig<<1) || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1))) {
3273             return propagateFloatx80NaN(a, b, status);
3274         }
3275         if (bExp == 0) {
3276             if (bSig == 0) goto invalid;
3277             float_raise(status, float_flag_denormal);
3278         }
3279         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3280     }
3281     if (bExp == 0x7FFF) {
3282         if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3283         if (aExp == 0) {
3284             if (aSig == 0) goto invalid;
3285             float_raise(status, float_flag_denormal);
3286         }
3287         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3288     }
3289     if (aExp == 0) {
3290         if (aSig == 0) {
3291             if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3292             return packFloatx80(zSign, 0, 0);
3293         }
3294         float_raise(status, float_flag_denormal);
3295         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3296     }
3297     if (bExp == 0) {
3298         if (bSig == 0) return packFloatx80(zSign, 0, 0);
3299         float_raise(status, float_flag_denormal);
3300         normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3301     }
3302     zExp = aExp + bExp - 0x3FFE;
3303     mul64To128(aSig, bSig, &zSig0, &zSig1);
3304     if (0 < (Bit64s) zSig0) {
3305         shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
3306         --zExp;
3307     }
3308     return
3309         roundAndPackFloatx80(get_float_rounding_precision(status),
3310              zSign, zExp, zSig0, zSig1, status);
3311 }
3312 
3313 /*----------------------------------------------------------------------------
3314 | Returns the result of dividing the extended double-precision floating-point
3315 | value `a' by the corresponding value `b'.  The operation is performed
3316 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3317 *----------------------------------------------------------------------------*/
3318 
floatx80_div(floatx80 a,floatx80 b,float_status_t & status)3319 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status_t &status)
3320 {
3321     int aSign, bSign, zSign;
3322     Bit32s aExp, bExp, zExp;
3323     Bit64u aSig, bSig, zSig0, zSig1;
3324     Bit64u rem0, rem1, rem2, term0, term1, term2;
3325 
3326     // handle unsupported extended double-precision floating encodings
3327     if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3328     {
3329         float_raise(status, float_flag_invalid);
3330         return floatx80_default_nan;
3331     }
3332 
3333     aSig = extractFloatx80Frac(a);
3334     aExp = extractFloatx80Exp(a);
3335     aSign = extractFloatx80Sign(a);
3336     bSig = extractFloatx80Frac(b);
3337     bExp = extractFloatx80Exp(b);
3338     bSign = extractFloatx80Sign(b);
3339 
3340     zSign = aSign ^ bSign;
3341     if (aExp == 0x7FFF) {
3342         if ((Bit64u) (aSig<<1)) return propagateFloatx80NaN(a, b, status);
3343         if (bExp == 0x7FFF) {
3344             if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3345             float_raise(status, float_flag_invalid);
3346             return floatx80_default_nan;
3347         }
3348         if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3349         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3350     }
3351     if (bExp == 0x7FFF) {
3352         if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3353         if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
3354         return packFloatx80(zSign, 0, 0);
3355     }
3356     if (bExp == 0) {
3357         if (bSig == 0) {
3358             if ((aExp | aSig) == 0) {
3359                 float_raise(status, float_flag_invalid);
3360                 return floatx80_default_nan;
3361             }
3362             float_raise(status, float_flag_divbyzero);
3363             return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3364         }
3365         float_raise(status, float_flag_denormal);
3366         normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3367     }
3368     if (aExp == 0) {
3369         if (aSig == 0) return packFloatx80(zSign, 0, 0);
3370         float_raise(status, float_flag_denormal);
3371         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3372     }
3373     zExp = aExp - bExp + 0x3FFE;
3374     rem1 = 0;
3375     if (bSig <= aSig) {
3376         shift128Right(aSig, 0, 1, &aSig, &rem1);
3377         ++zExp;
3378     }
3379     zSig0 = estimateDiv128To64(aSig, rem1, bSig);
3380     mul64To128(bSig, zSig0, &term0, &term1);
3381     sub128(aSig, rem1, term0, term1, &rem0, &rem1);
3382     while ((Bit64s) rem0 < 0) {
3383         --zSig0;
3384         add128(rem0, rem1, 0, bSig, &rem0, &rem1);
3385     }
3386     zSig1 = estimateDiv128To64(rem1, 0, bSig);
3387     if ((Bit64u) (zSig1<<1) <= 8) {
3388         mul64To128(bSig, zSig1, &term1, &term2);
3389         sub128(rem1, 0, term1, term2, &rem1, &rem2);
3390         while ((Bit64s) rem1 < 0) {
3391             --zSig1;
3392             add128(rem1, rem2, 0, bSig, &rem1, &rem2);
3393         }
3394         zSig1 |= ((rem1 | rem2) != 0);
3395     }
3396     return
3397         roundAndPackFloatx80(get_float_rounding_precision(status),
3398             zSign, zExp, zSig0, zSig1, status);
3399 }
3400 
3401 /*----------------------------------------------------------------------------
3402 | Returns the square root of the extended double-precision floating-point
3403 | value `a'.  The operation is performed according to the IEC/IEEE Standard
3404 | for Binary Floating-Point Arithmetic.
3405 *----------------------------------------------------------------------------*/
3406 
floatx80_sqrt(floatx80 a,float_status_t & status)3407 floatx80 floatx80_sqrt(floatx80 a, float_status_t &status)
3408 {
3409     int aSign;
3410     Bit32s aExp, zExp;
3411     Bit64u aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3412     Bit64u rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3413 
3414     // handle unsupported extended double-precision floating encodings
3415     if (floatx80_is_unsupported(a))
3416     {
3417         float_raise(status, float_flag_invalid);
3418         return floatx80_default_nan;
3419     }
3420 
3421     aSig0 = extractFloatx80Frac(a);
3422     aExp = extractFloatx80Exp(a);
3423     aSign = extractFloatx80Sign(a);
3424     if (aExp == 0x7FFF) {
3425         if ((Bit64u) (aSig0<<1)) return propagateFloatx80NaN(a, status);
3426         if (! aSign) return a;
3427         float_raise(status, float_flag_invalid);
3428         return floatx80_default_nan;
3429     }
3430     if (aSign) {
3431         if ((aExp | aSig0) == 0) return a;
3432         float_raise(status, float_flag_invalid);
3433         return floatx80_default_nan;
3434     }
3435     if (aExp == 0) {
3436         if (aSig0 == 0) return packFloatx80(0, 0, 0);
3437         float_raise(status, float_flag_denormal);
3438         normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
3439     }
3440     zExp = ((aExp - 0x3FFF)>>1) + 0x3FFF;
3441     zSig0 = estimateSqrt32(aExp, aSig0>>32);
3442     shift128Right(aSig0, 0, 2 + (aExp & 1), &aSig0, &aSig1);
3443     zSig0 = estimateDiv128To64(aSig0, aSig1, zSig0<<32) + (zSig0<<30);
3444     doubleZSig0 = zSig0<<1;
3445     mul64To128(zSig0, zSig0, &term0, &term1);
3446     sub128(aSig0, aSig1, term0, term1, &rem0, &rem1);
3447     while ((Bit64s) rem0 < 0) {
3448         --zSig0;
3449         doubleZSig0 -= 2;
3450         add128(rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1);
3451     }
3452     zSig1 = estimateDiv128To64(rem1, 0, doubleZSig0);
3453     if ((zSig1 & BX_CONST64(0x3FFFFFFFFFFFFFFF)) <= 5) {
3454         if (zSig1 == 0) zSig1 = 1;
3455         mul64To128(doubleZSig0, zSig1, &term1, &term2);
3456         sub128(rem1, 0, term1, term2, &rem1, &rem2);
3457         mul64To128(zSig1, zSig1, &term2, &term3);
3458         sub192(rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3);
3459         while ((Bit64s) rem1 < 0) {
3460             --zSig1;
3461             shortShift128Left(0, zSig1, 1, &term2, &term3);
3462             term3 |= 1;
3463             term2 |= doubleZSig0;
3464             add192(rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3);
3465         }
3466         zSig1 |= ((rem1 | rem2 | rem3) != 0);
3467     }
3468     shortShift128Left(0, zSig1, 1, &zSig0, &zSig1);
3469     zSig0 |= doubleZSig0;
3470     return
3471         roundAndPackFloatx80(get_float_rounding_precision(status),
3472             0, zExp, zSig0, zSig1, status);
3473 }
3474 
3475 #endif
3476 
3477 #ifdef FLOAT128
3478 
3479 /*----------------------------------------------------------------------------
3480 | Returns the result of converting the extended double-precision floating-
3481 | point value `a' to the quadruple-precision floating-point format. The
3482 | conversion is performed according to the IEC/IEEE Standard for Binary
3483 | Floating-Point Arithmetic.
3484 *----------------------------------------------------------------------------*/
3485 
floatx80_to_float128(floatx80 a,float_status_t & status)3486 float128 floatx80_to_float128(floatx80 a, float_status_t &status)
3487 {
3488     Bit64u zSig0, zSig1;
3489 
3490     Bit64u aSig = extractFloatx80Frac(a);
3491     Bit32s aExp = extractFloatx80Exp(a);
3492     int   aSign = extractFloatx80Sign(a);
3493 
3494     if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1))
3495         return commonNaNToFloat128(floatx80ToCommonNaN(a, status));
3496 
3497     shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1);
3498     return packFloat128(aSign, aExp, zSig0, zSig1);
3499 }
3500 
3501 /*----------------------------------------------------------------------------
3502 | Returns the result of converting the quadruple-precision floating-point
3503 | value `a' to the extended double-precision floating-point format.  The
3504 | conversion is performed according to the IEC/IEEE Standard for Binary
3505 | Floating-Point Arithmetic.
3506 *----------------------------------------------------------------------------*/
3507 
float128_to_floatx80(float128 a,float_status_t & status)3508 floatx80 float128_to_floatx80(float128 a, float_status_t &status)
3509 {
3510     Bit32s aExp;
3511     Bit64u aSig0, aSig1;
3512 
3513     aSig1 = extractFloat128Frac1(a);
3514     aSig0 = extractFloat128Frac0(a);
3515     aExp = extractFloat128Exp(a);
3516     int aSign = extractFloat128Sign(a);
3517 
3518     if (aExp == 0x7FFF) {
3519         if (aSig0 | aSig1)
3520             return commonNaNToFloatx80(float128ToCommonNaN(a, status));
3521 
3522         return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3523     }
3524 
3525     if (aExp == 0) {
3526         if ((aSig0 | aSig1) == 0) return packFloatx80(aSign, 0, 0);
3527         float_raise(status, float_flag_denormal);
3528         normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3529     }
3530     else aSig0 |= BX_CONST64(0x0001000000000000);
3531 
3532     shortShift128Left(aSig0, aSig1, 15, &aSig0, &aSig1);
3533     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
3534 }
3535 
3536 /*----------------------------------------------------------------------------
3537 | Returns the result of multiplying the extended double-precision floating-
3538 | point value `a' and quadruple-precision floating point value `b'. The
3539 | operation is performed according to the IEC/IEEE Standard for Binary
3540 | Floating-Point Arithmetic.
3541 *----------------------------------------------------------------------------*/
3542 
floatx80_mul(floatx80 a,float128 b,float_status_t & status)3543 floatx80 floatx80_mul(floatx80 a, float128 b, float_status_t &status)
3544 {
3545     Bit32s aExp, bExp, zExp;
3546     Bit64u aSig, bSig0, bSig1, zSig0, zSig1, zSig2;
3547     int aSign, bSign, zSign;
3548 
3549     // handle unsupported extended double-precision floating encodings
3550     if (floatx80_is_unsupported(a))
3551     {
3552  invalid:
3553         float_raise(status, float_flag_invalid);
3554         return floatx80_default_nan;
3555     }
3556 
3557     aSig = extractFloatx80Frac(a);
3558     aExp = extractFloatx80Exp(a);
3559     aSign = extractFloatx80Sign(a);
3560     bSig0 = extractFloat128Frac0(b);
3561     bSig1 = extractFloat128Frac1(b);
3562     bExp = extractFloat128Exp(b);
3563     bSign = extractFloat128Sign(b);
3564 
3565     zSign = aSign ^ bSign;
3566 
3567     if (aExp == 0x7FFF) {
3568         if ((Bit64u) (aSig<<1)
3569              || ((bExp == 0x7FFF) && (bSig0 | bSig1)))
3570         {
3571             floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status));
3572             return propagateFloatx80NaN(a, r, status);
3573         }
3574         if (bExp == 0) {
3575             if ((bSig0 | bSig1) == 0) goto invalid;
3576             float_raise(status, float_flag_denormal);
3577         }
3578         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3579     }
3580     if (bExp == 0x7FFF) {
3581         if (bSig0 | bSig1) {
3582             floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status));
3583             return propagateFloatx80NaN(a, r, status);
3584         }
3585         if (aExp == 0) {
3586             if (aSig == 0) goto invalid;
3587             float_raise(status, float_flag_denormal);
3588         }
3589         return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3590     }
3591     if (aExp == 0) {
3592         if (aSig == 0) {
3593             if ((bExp == 0) && (bSig0 | bSig1)) float_raise(status, float_flag_denormal);
3594             return packFloatx80(zSign, 0, 0);
3595         }
3596         float_raise(status, float_flag_denormal);
3597         normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3598     }
3599     if (bExp == 0) {
3600         if ((bSig0 | bSig1) == 0) return packFloatx80(zSign, 0, 0);
3601         float_raise(status, float_flag_denormal);
3602         normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3603     }
3604     else bSig0 |= BX_CONST64(0x0001000000000000);
3605 
3606     zExp = aExp + bExp - 0x3FFE;
3607     shortShift128Left(bSig0, bSig1, 15, &bSig0, &bSig1);
3608     mul128By64To192(bSig0, bSig1, aSig, &zSig0, &zSig1, &zSig2);
3609     if (0 < (Bit64s) zSig0) {
3610         shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
3611         --zExp;
3612     }
3613     return
3614         roundAndPackFloatx80(get_float_rounding_precision(status),
3615              zSign, zExp, zSig0, zSig1, status);
3616 }
3617 
3618 /*----------------------------------------------------------------------------
3619 | Returns the result of adding the absolute values of the quadruple-precision
3620 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3621 | before being returned. `zSign' is ignored if the result is a NaN.
3622 | The addition is performed according to the IEC/IEEE Standard for Binary
3623 | Floating-Point Arithmetic.
3624 *----------------------------------------------------------------------------*/
3625 
addFloat128Sigs(float128 a,float128 b,int zSign,float_status_t & status)3626 static float128 addFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status)
3627 {
3628     Bit32s aExp, bExp, zExp;
3629     Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
3630     Bit32s expDiff;
3631 
3632     aSig1 = extractFloat128Frac1(a);
3633     aSig0 = extractFloat128Frac0(a);
3634     aExp = extractFloat128Exp(a);
3635     bSig1 = extractFloat128Frac1(b);
3636     bSig0 = extractFloat128Frac0(b);
3637     bExp = extractFloat128Exp(b);
3638     expDiff = aExp - bExp;
3639 
3640     if (0 < expDiff) {
3641         if (aExp == 0x7FFF) {
3642             if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3643             return a;
3644         }
3645         if (bExp == 0) --expDiff;
3646         else bSig0 |= BX_CONST64(0x0001000000000000);
3647         shift128ExtraRightJamming(bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2);
3648         zExp = aExp;
3649     }
3650     else if (expDiff < 0) {
3651         if (bExp == 0x7FFF) {
3652             if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3653             return packFloat128(zSign, 0x7FFF, 0, 0);
3654         }
3655         if (aExp == 0) ++expDiff;
3656         else aSig0 |= BX_CONST64(0x0001000000000000);
3657         shift128ExtraRightJamming(aSig0, aSig1, 0, -expDiff, &aSig0, &aSig1, &zSig2);
3658         zExp = bExp;
3659     }
3660     else {
3661         if (aExp == 0x7FFF) {
3662             if (aSig0 | aSig1 | bSig0 | bSig1)
3663                 return propagateFloat128NaN(a, b, status);
3664 
3665             return a;
3666         }
3667         add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3668         if (aExp == 0) return packFloat128(zSign, 0, zSig0, zSig1);
3669         zSig2 = 0;
3670         zSig0 |= BX_CONST64(0x0002000000000000);
3671         zExp = aExp;
3672         goto shiftRight1;
3673     }
3674     aSig0 |= BX_CONST64(0x0001000000000000);
3675     add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3676     --zExp;
3677     if (zSig0 < BX_CONST64(0x0002000000000000)) goto roundAndPack;
3678     ++zExp;
3679  shiftRight1:
3680     shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2);
3681  roundAndPack:
3682     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3683 }
3684 
3685 /*----------------------------------------------------------------------------
3686 | Returns the result of subtracting the absolute values of the quadruple-
3687 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3688 | difference is negated before being returned.  `zSign' is ignored if the
3689 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3690 | Standard for Binary Floating-Point Arithmetic.
3691 *----------------------------------------------------------------------------*/
3692 
subFloat128Sigs(float128 a,float128 b,int zSign,float_status_t & status)3693 static float128 subFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status)
3694 {
3695     Bit32s aExp, bExp, zExp;
3696     Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
3697     Bit32s expDiff;
3698 
3699     aSig1 = extractFloat128Frac1(a);
3700     aSig0 = extractFloat128Frac0(a);
3701     aExp = extractFloat128Exp(a);
3702     bSig1 = extractFloat128Frac1(b);
3703     bSig0 = extractFloat128Frac0(b);
3704     bExp = extractFloat128Exp(b);
3705 
3706     expDiff = aExp - bExp;
3707     shortShift128Left(aSig0, aSig1, 14, &aSig0, &aSig1);
3708     shortShift128Left(bSig0, bSig1, 14, &bSig0, &bSig1);
3709     if (0 < expDiff) goto aExpBigger;
3710     if (expDiff < 0) goto bExpBigger;
3711     if (aExp == 0x7FFF) {
3712         if (aSig0 | aSig1 | bSig0 | bSig1)
3713             return propagateFloat128NaN(a, b, status);
3714 
3715         float_raise(status, float_flag_invalid);
3716         return float128_default_nan;
3717     }
3718     if (aExp == 0) {
3719         aExp = 1;
3720         bExp = 1;
3721     }
3722     if (bSig0 < aSig0) goto aBigger;
3723     if (aSig0 < bSig0) goto bBigger;
3724     if (bSig1 < aSig1) goto aBigger;
3725     if (aSig1 < bSig1) goto bBigger;
3726     return packFloat128(0, 0);
3727 
3728  bExpBigger:
3729     if (bExp == 0x7FFF) {
3730         if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3731         return packFloat128(zSign ^ 1, 0x7FFF, 0, 0);
3732     }
3733     if (aExp == 0) ++expDiff;
3734     else {
3735         aSig0 |= BX_CONST64(0x4000000000000000);
3736     }
3737     shift128RightJamming(aSig0, aSig1, - expDiff, &aSig0, &aSig1);
3738     bSig0 |= BX_CONST64(0x4000000000000000);
3739  bBigger:
3740     sub128(bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1);
3741     zExp = bExp;
3742     zSign ^= 1;
3743     goto normalizeRoundAndPack;
3744  aExpBigger:
3745     if (aExp == 0x7FFF) {
3746         if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3747         return a;
3748     }
3749     if (bExp == 0) --expDiff;
3750     else {
3751         bSig0 |= BX_CONST64(0x4000000000000000);
3752     }
3753     shift128RightJamming(bSig0, bSig1, expDiff, &bSig0, &bSig1);
3754     aSig0 |= BX_CONST64(0x4000000000000000);
3755  aBigger:
3756     sub128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3757     zExp = aExp;
3758  normalizeRoundAndPack:
3759     --zExp;
3760     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, status);
3761 }
3762 
3763 /*----------------------------------------------------------------------------
3764 | Returns the result of adding the quadruple-precision floating-point values
3765 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3766 | for Binary Floating-Point Arithmetic.
3767 *----------------------------------------------------------------------------*/
3768 
float128_add(float128 a,float128 b,float_status_t & status)3769 float128 float128_add(float128 a, float128 b, float_status_t &status)
3770 {
3771     int aSign = extractFloat128Sign(a);
3772     int bSign = extractFloat128Sign(b);
3773 
3774     if (aSign == bSign) {
3775         return addFloat128Sigs(a, b, aSign, status);
3776     }
3777     else {
3778         return subFloat128Sigs(a, b, aSign, status);
3779     }
3780 }
3781 
3782 /*----------------------------------------------------------------------------
3783 | Returns the result of subtracting the quadruple-precision floating-point
3784 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3785 | Standard for Binary Floating-Point Arithmetic.
3786 *----------------------------------------------------------------------------*/
3787 
float128_sub(float128 a,float128 b,float_status_t & status)3788 float128 float128_sub(float128 a, float128 b, float_status_t &status)
3789 {
3790     int aSign = extractFloat128Sign(a);
3791     int bSign = extractFloat128Sign(b);
3792 
3793     if (aSign == bSign) {
3794         return subFloat128Sigs(a, b, aSign, status);
3795     }
3796     else {
3797         return addFloat128Sigs(a, b, aSign, status);
3798     }
3799 }
3800 
3801 /*----------------------------------------------------------------------------
3802 | Returns the result of multiplying the quadruple-precision floating-point
3803 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3804 | Standard for Binary Floating-Point Arithmetic.
3805 *----------------------------------------------------------------------------*/
3806 
float128_mul(float128 a,float128 b,float_status_t & status)3807 float128 float128_mul(float128 a, float128 b, float_status_t &status)
3808 {
3809     int aSign, bSign, zSign;
3810     Bit32s aExp, bExp, zExp;
3811     Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
3812 
3813     aSig1 = extractFloat128Frac1(a);
3814     aSig0 = extractFloat128Frac0(a);
3815     aExp = extractFloat128Exp(a);
3816     aSign = extractFloat128Sign(a);
3817     bSig1 = extractFloat128Frac1(b);
3818     bSig0 = extractFloat128Frac0(b);
3819     bExp = extractFloat128Exp(b);
3820     bSign = extractFloat128Sign(b);
3821 
3822     zSign = aSign ^ bSign;
3823     if (aExp == 0x7FFF) {
3824         if ((aSig0 | aSig1) || ((bExp == 0x7FFF) && (bSig0 | bSig1))) {
3825             return propagateFloat128NaN(a, b, status);
3826         }
3827         if ((bExp | bSig0 | bSig1) == 0) {
3828             float_raise(status, float_flag_invalid);
3829             return float128_default_nan;
3830         }
3831         return packFloat128(zSign, 0x7FFF, 0, 0);
3832     }
3833     if (bExp == 0x7FFF) {
3834         if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3835         if ((aExp | aSig0 | aSig1) == 0) {
3836             float_raise(status, float_flag_invalid);
3837             return float128_default_nan;
3838         }
3839         return packFloat128(zSign, 0x7FFF, 0, 0);
3840     }
3841     if (aExp == 0) {
3842         if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3843         float_raise(status, float_flag_denormal);
3844         normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3845     }
3846     if (bExp == 0) {
3847         if ((bSig0 | bSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3848         float_raise(status, float_flag_denormal);
3849         normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3850     }
3851     zExp = aExp + bExp - 0x4000;
3852     aSig0 |= BX_CONST64(0x0001000000000000);
3853     shortShift128Left(bSig0, bSig1, 16, &bSig0, &bSig1);
3854     mul128To256(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3);
3855     add128(zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1);
3856     zSig2 |= (zSig3 != 0);
3857     if (BX_CONST64(0x0002000000000000) <= zSig0) {
3858         shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2);
3859         ++zExp;
3860     }
3861     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3862 }
3863 
3864 /*----------------------------------------------------------------------------
3865 | Returns the result of dividing the quadruple-precision floating-point value
3866 | `a' by the corresponding value `b'.  The operation is performed according to
3867 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3868 *----------------------------------------------------------------------------*/
3869 
float128_div(float128 a,float128 b,float_status_t & status)3870 float128 float128_div(float128 a, float128 b, float_status_t &status)
3871 {
3872     int aSign, bSign, zSign;
3873     Bit32s aExp, bExp, zExp;
3874     Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
3875     Bit64u rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3876 
3877     aSig1 = extractFloat128Frac1(a);
3878     aSig0 = extractFloat128Frac0(a);
3879     aExp = extractFloat128Exp(a);
3880     aSign = extractFloat128Sign(a);
3881     bSig1 = extractFloat128Frac1(b);
3882     bSig0 = extractFloat128Frac0(b);
3883     bExp = extractFloat128Exp(b);
3884     bSign = extractFloat128Sign(b);
3885 
3886     zSign = aSign ^ bSign;
3887     if (aExp == 0x7FFF) {
3888         if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3889         if (bExp == 0x7FFF) {
3890             if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3891             float_raise(status, float_flag_invalid);
3892             return float128_default_nan;
3893         }
3894         return packFloat128(zSign, 0x7FFF, 0, 0);
3895     }
3896     if (bExp == 0x7FFF) {
3897         if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3898         return packFloat128(zSign, 0, 0, 0);
3899     }
3900     if (bExp == 0) {
3901         if ((bSig0 | bSig1) == 0) {
3902             if ((aExp | aSig0 | aSig1) == 0) {
3903                 float_raise(status, float_flag_invalid);
3904                 return float128_default_nan;
3905             }
3906             float_raise(status, float_flag_divbyzero);
3907             return packFloat128(zSign, 0x7FFF, 0, 0);
3908         }
3909         float_raise(status, float_flag_denormal);
3910         normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3911     }
3912     if (aExp == 0) {
3913         if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3914         float_raise(status, float_flag_denormal);
3915         normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3916     }
3917     zExp = aExp - bExp + 0x3FFD;
3918     shortShift128Left(
3919         aSig0 | BX_CONST64(0x0001000000000000), aSig1, 15, &aSig0, &aSig1);
3920     shortShift128Left(
3921         bSig0 | BX_CONST64(0x0001000000000000), bSig1, 15, &bSig0, &bSig1);
3922     if (le128(bSig0, bSig1, aSig0, aSig1)) {
3923         shift128Right(aSig0, aSig1, 1, &aSig0, &aSig1);
3924         ++zExp;
3925     }
3926     zSig0 = estimateDiv128To64(aSig0, aSig1, bSig0);
3927     mul128By64To192(bSig0, bSig1, zSig0, &term0, &term1, &term2);
3928     sub192(aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2);
3929     while ((Bit64s) rem0 < 0) {
3930         --zSig0;
3931         add192(rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2);
3932     }
3933     zSig1 = estimateDiv128To64(rem1, rem2, bSig0);
3934     if ((zSig1 & 0x3FFF) <= 4) {
3935         mul128By64To192(bSig0, bSig1, zSig1, &term1, &term2, &term3);
3936         sub192(rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3);
3937         while ((Bit64s) rem1 < 0) {
3938             --zSig1;
3939             add192(rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3);
3940         }
3941         zSig1 |= ((rem1 | rem2 | rem3) != 0);
3942     }
3943     shift128ExtraRightJamming(zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2);
3944     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3945 }
3946 
3947 /*----------------------------------------------------------------------------
3948 | Returns the result of converting the 64-bit two's complement integer `a' to
3949 | the quadruple-precision floating-point format.  The conversion is performed
3950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3951 *----------------------------------------------------------------------------*/
3952 
int64_to_float128(Bit64s a)3953 float128 int64_to_float128(Bit64s a)
3954 {
3955     Bit64u zSig0, zSig1;
3956 
3957     if (a == 0) return packFloat128(0, 0, 0, 0);
3958     int zSign = (a < 0);
3959     Bit64u absA = zSign ? - a : a;
3960     Bit8u shiftCount = countLeadingZeros64(absA) + 49;
3961     Bit32s zExp = 0x406E - shiftCount;
3962     if (64 <= shiftCount) {
3963         zSig1 = 0;
3964         zSig0 = absA;
3965         shiftCount -= 64;
3966     }
3967     else {
3968         zSig1 = absA;
3969         zSig0 = 0;
3970     }
3971     shortShift128Left(zSig0, zSig1, shiftCount, &zSig0, &zSig1);
3972     return packFloat128(zSign, zExp, zSig0, zSig1);
3973 }
3974 
3975 #endif
3976