1 /*============================================================================
2 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
3 Package, Release 2b.
4
5 Written by John R. Hauser. This work was made possible in part by the
6 International Computer Science Institute, located at Suite 600, 1947 Center
7 Street, Berkeley, California 94704. Funding was partially provided by the
8 National Science Foundation under grant MIP-9311980. The original version
9 of this code was written as part of a project to build a fixed-point vector
10 processor in collaboration with the University of California at Berkeley,
11 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
12 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
13 arithmetic/SoftFloat.html'.
14
15 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
16 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
17 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
18 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
19 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
20 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
21 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
22 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
23
24 Derivative works are acceptable, even for commercial purposes, so long as
25 (1) the source code for the derivative work includes prominent notice that
26 the work is derivative, and (2) the source code includes prominent notice with
27 these four paragraphs for those parts of this code that are retained.
28 =============================================================================*/
29
30 #define FLOAT128
31
32 /*============================================================================
33 * Adapted for Bochs (x86 achitecture simulator) by
34 * Stanislav Shwartsman [sshwarts at sourceforge net]
35 * ==========================================================================*/
36
37 #include "softfloat.h"
38 #include "softfloat-round-pack.h"
39
40 /*----------------------------------------------------------------------------
41 | Primitive arithmetic functions, including multi-word arithmetic, and
42 | division and square root approximations. (Can be specialized to target
43 | if desired).
44 *----------------------------------------------------------------------------*/
45 #define USE_estimateDiv128To64
46 #define USE_estimateSqrt32
47 #include "softfloat-macros.h"
48
49 /*----------------------------------------------------------------------------
50 | Functions and definitions to determine: (1) whether tininess for underflow
51 | is detected before or after rounding by default, (2) what (if anything)
52 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
53 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
54 | are propagated from function inputs to output. These details are target-
55 | specific.
56 *----------------------------------------------------------------------------*/
57 #include "softfloat-specialize.h"
58
59 /*----------------------------------------------------------------------------
60 | Returns the result of converting the 32-bit two's complement integer `a'
61 | to the single-precision floating-point format. The conversion is performed
62 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
63 *----------------------------------------------------------------------------*/
64
int32_to_float32(Bit32s a,float_status_t & status)65 float32 int32_to_float32(Bit32s a, float_status_t &status)
66 {
67 if (a == 0) return 0;
68 if (a == (Bit32s) 0x80000000) return packFloat32(1, 0x9E, 0);
69 int zSign = (a < 0);
70 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
71 }
72
73 /*----------------------------------------------------------------------------
74 | Returns the result of converting the 32-bit two's complement integer `a'
75 | to the double-precision floating-point format. The conversion is performed
76 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
77 *----------------------------------------------------------------------------*/
78
int32_to_float64(Bit32s a)79 float64 int32_to_float64(Bit32s a)
80 {
81 if (a == 0) return 0;
82 int zSign = (a < 0);
83 Bit32u absA = zSign ? -a : a;
84 int shiftCount = countLeadingZeros32(absA) + 21;
85 Bit64u zSig = absA;
86 return packFloat64(zSign, 0x432 - shiftCount, zSig<<shiftCount);
87 }
88
89 /*----------------------------------------------------------------------------
90 | Returns the result of converting the 64-bit two's complement integer `a'
91 | to the single-precision floating-point format. The conversion is performed
92 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
93 *----------------------------------------------------------------------------*/
94
int64_to_float32(Bit64s a,float_status_t & status)95 float32 int64_to_float32(Bit64s a, float_status_t &status)
96 {
97 if (a == 0) return 0;
98 int zSign = (a < 0);
99 Bit64u absA = zSign ? -a : a;
100 int shiftCount = countLeadingZeros64(absA) - 40;
101 if (0 <= shiftCount) {
102 return packFloat32(zSign, 0x95 - shiftCount, (Bit32u)(absA<<shiftCount));
103 }
104 else {
105 shiftCount += 7;
106 if (shiftCount < 0) {
107 absA = shift64RightJamming(absA, -shiftCount);
108 }
109 else {
110 absA <<= shiftCount;
111 }
112 return roundAndPackFloat32(zSign, 0x9C - shiftCount, (Bit32u) absA, status);
113 }
114 }
115
116 /*----------------------------------------------------------------------------
117 | Returns the result of converting the 64-bit two's complement integer `a'
118 | to the double-precision floating-point format. The conversion is performed
119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
120 *----------------------------------------------------------------------------*/
121
int64_to_float64(Bit64s a,float_status_t & status)122 float64 int64_to_float64(Bit64s a, float_status_t &status)
123 {
124 if (a == 0) return 0;
125 if (a == (Bit64s) BX_CONST64(0x8000000000000000)) {
126 return packFloat64(1, 0x43E, 0);
127 }
128 int zSign = (a < 0);
129 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
130 }
131
132 /*----------------------------------------------------------------------------
133 | Returns the result of converting the 32-bit unsigned integer `a' to the
134 | single-precision floating-point format. The conversion is performed
135 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
136 *----------------------------------------------------------------------------*/
137
uint32_to_float32(Bit32u a,float_status_t & status)138 float32 uint32_to_float32(Bit32u a, float_status_t &status)
139 {
140 if (a == 0) return 0;
141 if (a & 0x80000000) return normalizeRoundAndPackFloat32(0, 0x9D, a >> 1, status);
142 return normalizeRoundAndPackFloat32(0, 0x9C, a, status);
143 }
144
145 /*----------------------------------------------------------------------------
146 | Returns the result of converting the 32-bit unsigned integer `a' to the
147 | double-precision floating-point format. The conversion is performed
148 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
149 *----------------------------------------------------------------------------*/
150
uint32_to_float64(Bit32u a)151 float64 uint32_to_float64(Bit32u a)
152 {
153 if (a == 0) return 0;
154 int shiftCount = countLeadingZeros32(a) + 21;
155 Bit64u zSig = a;
156 return packFloat64(0, 0x432 - shiftCount, zSig<<shiftCount);
157 }
158
159 /*----------------------------------------------------------------------------
160 | Returns the result of converting the 64-bit unsigned integer integer `a'
161 | to the single-precision floating-point format. The conversion is performed
162 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
163 *----------------------------------------------------------------------------*/
164
uint64_to_float32(Bit64u a,float_status_t & status)165 float32 uint64_to_float32(Bit64u a, float_status_t &status)
166 {
167 if (a == 0) return 0;
168 int shiftCount = countLeadingZeros64(a) - 40;
169 if (0 <= shiftCount) {
170 return packFloat32(0, 0x95 - shiftCount, (Bit32u)(a<<shiftCount));
171 }
172 else {
173 shiftCount += 7;
174 if (shiftCount < 0) {
175 a = shift64RightJamming(a, -shiftCount);
176 }
177 else {
178 a <<= shiftCount;
179 }
180 return roundAndPackFloat32(0, 0x9C - shiftCount, (Bit32u) a, status);
181 }
182 }
183
184 /*----------------------------------------------------------------------------
185 | Returns the result of converting the 64-bit unsigned integer integer `a'
186 | to the double-precision floating-point format. The conversion is performed
187 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
188 *----------------------------------------------------------------------------*/
189
uint64_to_float64(Bit64u a,float_status_t & status)190 float64 uint64_to_float64(Bit64u a, float_status_t &status)
191 {
192 if (a == 0) return 0;
193 return normalizeRoundAndPackFloat64(0, 0x43C, a, status);
194 }
195
196 /*----------------------------------------------------------------------------
197 | Returns the result of converting the single-precision floating-point value
198 | `a' to the 32-bit two's complement integer format. The conversion is
199 | performed according to the IEC/IEEE Standard for Binary Floating-Point
200 | Arithmetic - which means in particular that the conversion is rounded
201 | according to the current rounding mode. If `a' is a NaN or the
202 | conversion overflows the integer indefinite value is returned.
203 *----------------------------------------------------------------------------*/
204
float32_to_int32(float32 a,float_status_t & status)205 Bit32s float32_to_int32(float32 a, float_status_t &status)
206 {
207 Bit32u aSig = extractFloat32Frac(a);
208 Bit16s aExp = extractFloat32Exp(a);
209 int aSign = extractFloat32Sign(a);
210 if ((aExp == 0xFF) && aSig) aSign = 0;
211 if (aExp) aSig |= 0x00800000;
212 else {
213 if (get_denormals_are_zeros(status)) aSig = 0;
214 }
215 int shiftCount = 0xAF - aExp;
216 Bit64u aSig64 = Bit64u(aSig) << 32;
217 if (0 < shiftCount) aSig64 = shift64RightJamming(aSig64, shiftCount);
218 return roundAndPackInt32(aSign, aSig64, status);
219 }
220
221 /*----------------------------------------------------------------------------
222 | Returns the result of converting the single-precision floating-point value
223 | `a' to the 32-bit two's complement integer format. The conversion is
224 | performed according to the IEC/IEEE Standard for Binary Floating-Point
225 | Arithmetic, except that the conversion is always rounded toward zero.
226 | If `a' is a NaN or the conversion overflows, the integer indefinite
227 | value is returned.
228 *----------------------------------------------------------------------------*/
229
float32_to_int32_round_to_zero(float32 a,float_status_t & status)230 Bit32s float32_to_int32_round_to_zero(float32 a, float_status_t &status)
231 {
232 int aSign;
233 Bit16s aExp;
234 Bit32u aSig;
235 Bit32s z;
236
237 aSig = extractFloat32Frac(a);
238 aExp = extractFloat32Exp(a);
239 aSign = extractFloat32Sign(a);
240 int shiftCount = aExp - 0x9E;
241 if (0 <= shiftCount) {
242 if (a != 0xCF000000) {
243 float_raise(status, float_flag_invalid);
244 }
245 return (Bit32s)(int32_indefinite);
246 }
247 else if (aExp <= 0x7E) {
248 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
249 if (aExp | aSig) float_raise(status, float_flag_inexact);
250 return 0;
251 }
252 aSig = (aSig | 0x800000)<<8;
253 z = aSig>>(-shiftCount);
254 if ((Bit32u) (aSig<<(shiftCount & 31))) {
255 float_raise(status, float_flag_inexact);
256 }
257 if (aSign) z = -z;
258 return z;
259 }
260
261 /*----------------------------------------------------------------------------
262 | Returns the result of converting the single-precision floating-point value
263 | `a' to the 32-bit unsigned integer format. The conversion is performed
264 | according to the IEC/IEEE Standard for Binary Floating-point Arithmetic,
265 | except that the conversion is always rounded toward zero. If `a' is a NaN
266 | or conversion overflows, the largest positive integer is returned.
267 *----------------------------------------------------------------------------*/
268
float32_to_uint32_round_to_zero(float32 a,float_status_t & status)269 Bit32u float32_to_uint32_round_to_zero(float32 a, float_status_t &status)
270 {
271 int aSign;
272 Bit16s aExp;
273 Bit32u aSig;
274
275 aSig = extractFloat32Frac(a);
276 aExp = extractFloat32Exp(a);
277 aSign = extractFloat32Sign(a);
278 int shiftCount = aExp - 0x9E;
279
280 if (aExp <= 0x7E) {
281 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
282 if (aExp | aSig) float_raise(status, float_flag_inexact);
283 return 0;
284 }
285 else if (0 < shiftCount || aSign) {
286 float_raise(status, float_flag_invalid);
287 return uint32_indefinite;
288 }
289
290 aSig = (aSig | 0x800000)<<8;
291 Bit32u z = aSig >> (-shiftCount);
292 if (aSig << (shiftCount & 31)) {
293 float_raise(status, float_flag_inexact);
294 }
295 return z;
296 }
297
298 /*----------------------------------------------------------------------------
299 | Returns the result of converting the single-precision floating-point value
300 | `a' to the 64-bit two's complement integer format. The conversion is
301 | performed according to the IEC/IEEE Standard for Binary Floating-Point
302 | Arithmetic - which means in particular that the conversion is rounded
303 | according to the current rounding mode. If `a' is a NaN or the
304 | conversion overflows, the integer indefinite value is returned.
305 *----------------------------------------------------------------------------*/
306
float32_to_int64(float32 a,float_status_t & status)307 Bit64s float32_to_int64(float32 a, float_status_t &status)
308 {
309 Bit64u aSig64, aSigExtra;
310
311 Bit32u aSig = extractFloat32Frac(a);
312 Bit16s aExp = extractFloat32Exp(a);
313 int aSign = extractFloat32Sign(a);
314
315 int shiftCount = 0xBE - aExp;
316 if (shiftCount < 0) {
317 float_raise(status, float_flag_invalid);
318 return (Bit64s)(int64_indefinite);
319 }
320 if (aExp) aSig |= 0x00800000;
321 else {
322 if (get_denormals_are_zeros(status)) aSig = 0;
323 }
324 aSig64 = aSig;
325 aSig64 <<= 40;
326 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
327 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
328 }
329
330 /*----------------------------------------------------------------------------
331 | Returns the result of converting the single-precision floating-point value
332 | `a' to the 64-bit two's complement integer format. The conversion is
333 | performed according to the IEC/IEEE Standard for Binary Floating-Point
334 | Arithmetic, except that the conversion is always rounded toward zero.
335 | If `a' is a NaN or the conversion overflows, the integer indefinite
336 | value is returned.
337 *----------------------------------------------------------------------------*/
338
float32_to_int64_round_to_zero(float32 a,float_status_t & status)339 Bit64s float32_to_int64_round_to_zero(float32 a, float_status_t &status)
340 {
341 int aSign;
342 Bit16s aExp;
343 Bit32u aSig;
344 Bit64u aSig64;
345 Bit64s z;
346
347 aSig = extractFloat32Frac(a);
348 aExp = extractFloat32Exp(a);
349 aSign = extractFloat32Sign(a);
350 int shiftCount = aExp - 0xBE;
351 if (0 <= shiftCount) {
352 if (a != 0xDF000000) {
353 float_raise(status, float_flag_invalid);
354 }
355 return (Bit64s)(int64_indefinite);
356 }
357 else if (aExp <= 0x7E) {
358 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
359 if (aExp | aSig) float_raise(status, float_flag_inexact);
360 return 0;
361 }
362 aSig64 = aSig | 0x00800000;
363 aSig64 <<= 40;
364 z = aSig64>>(-shiftCount);
365 if ((Bit64u) (aSig64<<(shiftCount & 63))) {
366 float_raise(status, float_flag_inexact);
367 }
368 if (aSign) z = -z;
369 return z;
370 }
371
372 /*----------------------------------------------------------------------------
373 | Returns the result of converting the single-precision floating-point value
374 | `a' to the 64-bit unsigned integer format. The conversion is performed
375 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
376 | except that the conversion is always rounded toward zero. If `a' is a NaN
377 | or the conversion overflows, the largest unsigned integer is returned.
378 *----------------------------------------------------------------------------*/
379
float32_to_uint64_round_to_zero(float32 a,float_status_t & status)380 Bit64u float32_to_uint64_round_to_zero(float32 a, float_status_t &status)
381 {
382 int aSign;
383 Bit16s aExp;
384 Bit32u aSig;
385 Bit64u aSig64;
386
387 aSig = extractFloat32Frac(a);
388 aExp = extractFloat32Exp(a);
389 aSign = extractFloat32Sign(a);
390 int shiftCount = aExp - 0xBE;
391
392 if (aExp <= 0x7E) {
393 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
394 if (aExp | aSig) float_raise(status, float_flag_inexact);
395 return 0;
396 }
397 else if (0 < shiftCount || aSign) {
398 float_raise(status, float_flag_invalid);
399 return uint64_indefinite;
400 }
401
402 aSig64 = aSig | 0x00800000;
403 aSig64 <<= 40;
404 Bit64u z = aSig64>>(-shiftCount);
405 if ((Bit64u) (aSig64<<(shiftCount & 63))) {
406 float_raise(status, float_flag_inexact);
407 }
408 return z;
409 }
410
411 /*----------------------------------------------------------------------------
412 | Returns the result of converting the single-precision floating-point value
413 | `a' to the 64-bit unsigned integer format. The conversion is
414 | performed according to the IEC/IEEE Standard for Binary Floating-Point
415 | Arithmetic---which means in particular that the conversion is rounded
416 | according to the current rounding mode. If `a' is a NaN or the conversion
417 | overflows, the largest unsigned integer is returned.
418 *----------------------------------------------------------------------------*/
419
float32_to_uint64(float32 a,float_status_t & status)420 Bit64u float32_to_uint64(float32 a, float_status_t &status)
421 {
422 int aSign;
423 Bit16s aExp, shiftCount;
424 Bit32u aSig;
425 Bit64u aSig64, aSigExtra;
426
427 aSig = extractFloat32Frac(a);
428 aExp = extractFloat32Exp(a);
429 aSign = extractFloat32Sign(a);
430
431 if (get_denormals_are_zeros(status)) {
432 if (aExp == 0) aSig = 0;
433 }
434
435 if ((aSign) && (aExp > 0x7E)) {
436 float_raise(status, float_flag_invalid);
437 return uint64_indefinite;
438 }
439
440 shiftCount = 0xBE - aExp;
441 if (aExp) aSig |= 0x00800000;
442
443 if (shiftCount < 0) {
444 float_raise(status, float_flag_invalid);
445 return uint64_indefinite;
446 }
447
448 aSig64 = aSig;
449 aSig64 <<= 40;
450 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
451 return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
452 }
453
454 /*----------------------------------------------------------------------------
455 | Returns the result of converting the single-precision floating-point value
456 | `a' to the 32-bit unsigned integer format. The conversion is
457 | performed according to the IEC/IEEE Standard for Binary Floating-Point
458 | Arithmetic---which means in particular that the conversion is rounded
459 | according to the current rounding mode. If `a' is a NaN or the conversion
460 | overflows, the largest unsigned integer is returned.
461 *----------------------------------------------------------------------------*/
462
float32_to_uint32(float32 a,float_status_t & status)463 Bit32u float32_to_uint32(float32 a, float_status_t &status)
464 {
465 Bit64u val_64 = float32_to_uint64(a, status);
466
467 if (val_64 > 0xffffffff) {
468 status.float_exception_flags = float_flag_invalid; // throw away other flags
469 return uint32_indefinite;
470 }
471
472 return (Bit32u) val_64;
473 }
474
475 /*----------------------------------------------------------------------------
476 | Returns the result of converting the single-precision floating-point value
477 | `a' to the double-precision floating-point format. The conversion is
478 | performed according to the IEC/IEEE Standard for Binary Floating-Point
479 | Arithmetic.
480 *----------------------------------------------------------------------------*/
481
float32_to_float64(float32 a,float_status_t & status)482 float64 float32_to_float64(float32 a, float_status_t &status)
483 {
484 Bit32u aSig = extractFloat32Frac(a);
485 Bit16s aExp = extractFloat32Exp(a);
486 int aSign = extractFloat32Sign(a);
487
488 if (aExp == 0xFF) {
489 if (aSig) return commonNaNToFloat64(float32ToCommonNaN(a, status));
490 return packFloat64(aSign, 0x7FF, 0);
491 }
492 if (aExp == 0) {
493 if (aSig == 0 || get_denormals_are_zeros(status))
494 return packFloat64(aSign, 0, 0);
495
496 float_raise(status, float_flag_denormal);
497 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
498 --aExp;
499 }
500 return packFloat64(aSign, aExp + 0x380, ((Bit64u) aSig)<<29);
501 }
502
503 /*----------------------------------------------------------------------------
504 | Rounds the single-precision floating-point value `a' to an integer, and
505 | returns the result as a single-precision floating-point value. The
506 | operation is performed according to the IEC/IEEE Standard for Binary
507 | Floating-Point Arithmetic.
508 *----------------------------------------------------------------------------*/
509
float32_round_to_int(float32 a,Bit8u scale,float_status_t & status)510 float32 float32_round_to_int(float32 a, Bit8u scale, float_status_t &status)
511 {
512 Bit32u lastBitMask, roundBitsMask;
513 int roundingMode = get_float_rounding_mode(status);
514 Bit16s aExp = extractFloat32Exp(a);
515 scale &= 0xf;
516
517 if ((aExp == 0xFF) && extractFloat32Frac(a)) {
518 return propagateFloat32NaN(a, status);
519 }
520
521 aExp += scale; // scale the exponent
522
523 if (0x96 <= aExp) {
524 return a;
525 }
526
527 if (get_denormals_are_zeros(status)) {
528 a = float32_denormal_to_zero(a);
529 }
530
531 if (aExp <= 0x7E) {
532 if ((Bit32u) (a<<1) == 0) return a;
533 float_raise(status, float_flag_inexact);
534 int aSign = extractFloat32Sign(a);
535 switch (roundingMode) {
536 case float_round_nearest_even:
537 if ((aExp == 0x7E) && extractFloat32Frac(a)) {
538 return packFloat32(aSign, 0x7F - scale, 0);
539 }
540 break;
541 case float_round_down:
542 return aSign ? packFloat32(1, 0x7F - scale, 0) : float32_positive_zero;
543 case float_round_up:
544 return aSign ? float32_negative_zero : packFloat32(0, 0x7F - scale, 0);
545 }
546 return packFloat32(aSign, 0, 0);
547 }
548
549 lastBitMask = 1;
550 lastBitMask <<= 0x96 - aExp;
551 roundBitsMask = lastBitMask - 1;
552 float32 z = a;
553 if (roundingMode == float_round_nearest_even) {
554 z += lastBitMask>>1;
555 if ((z & roundBitsMask) == 0) z &= ~lastBitMask;
556 }
557 else if (roundingMode != float_round_to_zero) {
558 if (extractFloat32Sign(z) ^ (roundingMode == float_round_up)) {
559 z += roundBitsMask;
560 }
561 }
562 z &= ~roundBitsMask;
563 if (z != a) float_raise(status, float_flag_inexact);
564 return z;
565 }
566
567 /*----------------------------------------------------------------------------
568 | Extracts the fractional portion of single-precision floating-point value `a',
569 | and returns the result as a single-precision floating-point value. The
570 | fractional results are precise. The operation is performed according to the
571 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
572 *----------------------------------------------------------------------------*/
573
float32_frc(float32 a,float_status_t & status)574 float32 float32_frc(float32 a, float_status_t &status)
575 {
576 int roundingMode = get_float_rounding_mode(status);
577
578 Bit16s aExp = extractFloat32Exp(a);
579 Bit32u aSig = extractFloat32Frac(a);
580 int aSign = extractFloat32Sign(a);
581
582 if (aExp == 0xFF) {
583 if (aSig) return propagateFloat32NaN(a, status);
584 float_raise(status, float_flag_invalid);
585 return float32_default_nan;
586 }
587
588 if (aExp >= 0x96) {
589 return packFloat32(roundingMode == float_round_down, 0, 0);
590 }
591
592 if (aExp < 0x7F) {
593 if (aExp == 0) {
594 if (aSig == 0 || get_denormals_are_zeros(status))
595 return packFloat32(roundingMode == float_round_down, 0, 0);
596
597 float_raise(status, float_flag_denormal);
598 if (! float_exception_masked(status, float_flag_underflow))
599 float_raise(status, float_flag_underflow);
600
601 if(get_flush_underflow_to_zero(status)) {
602 float_raise(status, float_flag_underflow | float_flag_inexact);
603 return packFloat32(aSign, 0, 0);
604 }
605 }
606 return a;
607 }
608
609 Bit32u lastBitMask = 1 << (0x96 - aExp);
610 Bit32u roundBitsMask = lastBitMask - 1;
611
612 aSig &= roundBitsMask;
613 aSig <<= 7;
614 aExp--;
615
616 if (aSig == 0)
617 return packFloat32(roundingMode == float_round_down, 0, 0);
618
619 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
620 }
621
622 /*----------------------------------------------------------------------------
623 | Extracts the exponent portion of single-precision floating-point value 'a',
624 | and returns the result as a single-precision floating-point value
625 | representing unbiased integer exponent. The operation is performed according
626 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
627 *----------------------------------------------------------------------------*/
628
float32_getexp(float32 a,float_status_t & status)629 float32 float32_getexp(float32 a, float_status_t &status)
630 {
631 Bit16s aExp = extractFloat32Exp(a);
632 Bit32u aSig = extractFloat32Frac(a);
633
634 if (aExp == 0xFF) {
635 if (aSig) return propagateFloat32NaN(a, status);
636 return float32_positive_inf;
637 }
638
639 if (aExp == 0) {
640 if (aSig == 0 || get_denormals_are_zeros(status))
641 return float32_negative_inf;
642
643 float_raise(status, float_flag_denormal);
644 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
645 }
646
647 return int32_to_float32(aExp - 0x7F, status);
648 }
649
650 /*----------------------------------------------------------------------------
651 | Extracts the mantissa of single-precision floating-point value 'a' and
652 | returns the result as a single-precision floating-point after applying
653 | the mantissa interval normalization and sign control. The operation is
654 | performed according to the IEC/IEEE Standard for Binary Floating-Point
655 | Arithmetic.
656 *----------------------------------------------------------------------------*/
657
float32_getmant(float32 a,float_status_t & status,int sign_ctrl,int interv)658 float32 float32_getmant(float32 a, float_status_t &status, int sign_ctrl, int interv)
659 {
660 Bit16s aExp = extractFloat32Exp(a);
661 Bit32u aSig = extractFloat32Frac(a);
662 int aSign = extractFloat32Sign(a);
663
664 if (aExp == 0xFF) {
665 if (aSig) return propagateFloat32NaN(a, status);
666 if (aSign) {
667 if (sign_ctrl & 0x2) {
668 float_raise(status, float_flag_invalid);
669 return float32_default_nan;
670 }
671 }
672 return packFloat32(~sign_ctrl & aSign, 0x7F, 0);
673 }
674
675 if (aExp == 0 && (aSig == 0 || get_denormals_are_zeros(status))) {
676 return packFloat32(~sign_ctrl & aSign, 0x7F, 0);
677 }
678
679 if (aSign) {
680 if (sign_ctrl & 0x2) {
681 float_raise(status, float_flag_invalid);
682 return float32_default_nan;
683 }
684 }
685
686 if (aExp == 0) {
687 float_raise(status, float_flag_denormal);
688 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
689 }
690
691 switch(interv) {
692 case 0x0: // interval [1,2)
693 aExp = 0x7F;
694 break;
695 case 0x1: // interval [1/2,2)
696 aExp -= 0x7F;
697 aExp = 0x7F - (aExp & 0x1);
698 break;
699 case 0x2: // interval [1/2,1)
700 aExp = 0x7E;
701 break;
702 case 0x3: // interval [3/4,3/2)
703 aExp = 0x7F - ((aSig >> 22) & 0x1);
704 break;
705 }
706
707 return packFloat32(~sign_ctrl & aSign, aExp, aSig);
708 }
709
710 /*----------------------------------------------------------------------------
711 | Return the result of a floating point scale of the single-precision floating
712 | point value `a' by multiplying it by 2 power of the single-precision
713 | floating point value 'b' converted to integral value. If the result cannot
714 | be represented in single precision, then the proper overflow response (for
715 | positive scaling operand), or the proper underflow response (for negative
716 | scaling operand) is issued. The operation is performed according to the
717 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
718 *----------------------------------------------------------------------------*/
719
float32_scalef(float32 a,float32 b,float_status_t & status)720 float32 float32_scalef(float32 a, float32 b, float_status_t &status)
721 {
722 Bit32u aSig = extractFloat32Frac(a);
723 Bit16s aExp = extractFloat32Exp(a);
724 int aSign = extractFloat32Sign(a);
725 Bit32u bSig = extractFloat32Frac(b);
726 Bit16s bExp = extractFloat32Exp(b);
727 int bSign = extractFloat32Sign(b);
728
729 if (get_denormals_are_zeros(status)) {
730 if (aExp == 0) aSig = 0;
731 if (bExp == 0) bSig = 0;
732 }
733
734 if (bExp == 0xFF) {
735 if (bSig) return propagateFloat32NaN(a, b, status);
736 }
737
738 if (aExp == 0xFF) {
739 if (aSig) {
740 int aIsSignalingNaN = (aSig & 0x00400000) == 0;
741 if (aIsSignalingNaN || bExp != 0xFF || bSig)
742 return propagateFloat32NaN(a, b, status);
743
744 return bSign ? 0 : float32_positive_inf;
745 }
746
747 if (bExp == 0xFF && bSign) {
748 float_raise(status, float_flag_invalid);
749 return float32_default_nan;
750 }
751 return a;
752 }
753
754 if ((aExp | aSig) == 0) {
755 if (bExp == 0xFF && ! bSign) {
756 float_raise(status, float_flag_invalid);
757 return float32_default_nan;
758 }
759 return a;
760 }
761
762 if ((bExp | bSig) == 0) return a;
763
764 if (bExp == 0xFF) {
765 if (bSign) return packFloat32(aSign, 0, 0);
766 return packFloat32(aSign, 0xFF, 0);
767 }
768
769 if (bExp >= 0x8E) {
770 // handle obvious overflow/underflow result
771 return roundAndPackFloat32(aSign, bSign ? -0x7F : 0xFF, aSig, status);
772 }
773
774 int scale = 0;
775
776 if (bExp <= 0x7E) {
777 scale = -bSign;
778 }
779 else {
780 int shiftCount = bExp - 0x9E;
781 bSig = (bSig | 0x800000)<<8;
782 scale = bSig>>(-shiftCount);
783
784 if (bSign) {
785 if ((Bit32u) (bSig<<(shiftCount & 31))) scale++;
786 scale = -scale;
787 }
788
789 if (scale > 0x200) scale = 0x200;
790 if (scale < -0x200) scale = -0x200;
791 }
792
793 if (aExp != 0) {
794 aSig |= 0x00800000;
795 } else {
796 aExp++;
797 }
798
799 aExp += scale - 1;
800 aSig <<= 7;
801 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
802 }
803
804 /*----------------------------------------------------------------------------
805 | Returns the result of adding the absolute values of the single-precision
806 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
807 | before being returned. `zSign' is ignored if the result is a NaN.
808 | The addition is performed according to the IEC/IEEE Standard for Binary
809 | Floating-Point Arithmetic.
810 *----------------------------------------------------------------------------*/
811
addFloat32Sigs(float32 a,float32 b,int zSign,float_status_t & status)812 static float32 addFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status)
813 {
814 Bit16s aExp, bExp, zExp;
815 Bit32u aSig, bSig, zSig;
816 Bit16s expDiff;
817
818 aSig = extractFloat32Frac(a);
819 aExp = extractFloat32Exp(a);
820 bSig = extractFloat32Frac(b);
821 bExp = extractFloat32Exp(b);
822
823 if (get_denormals_are_zeros(status)) {
824 if (aExp == 0) aSig = 0;
825 if (bExp == 0) bSig = 0;
826 }
827
828 expDiff = aExp - bExp;
829 aSig <<= 6;
830 bSig <<= 6;
831
832 if (0 < expDiff) {
833 if (aExp == 0xFF) {
834 if (aSig) return propagateFloat32NaN(a, b, status);
835 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
836 return a;
837 }
838 if ((aExp == 0) && aSig)
839 float_raise(status, float_flag_denormal);
840
841 if (bExp == 0) {
842 if (bSig) float_raise(status, float_flag_denormal);
843 --expDiff;
844 }
845 else bSig |= 0x20000000;
846
847 bSig = shift32RightJamming(bSig, expDiff);
848 zExp = aExp;
849 }
850 else if (expDiff < 0) {
851 if (bExp == 0xFF) {
852 if (bSig) return propagateFloat32NaN(a, b, status);
853 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
854 return packFloat32(zSign, 0xFF, 0);
855 }
856 if ((bExp == 0) && bSig)
857 float_raise(status, float_flag_denormal);
858
859 if (aExp == 0) {
860 if (aSig) float_raise(status, float_flag_denormal);
861 ++expDiff;
862 }
863 else aSig |= 0x20000000;
864
865 aSig = shift32RightJamming(aSig, -expDiff);
866 zExp = bExp;
867 }
868 else {
869 if (aExp == 0xFF) {
870 if (aSig | bSig) return propagateFloat32NaN(a, b, status);
871 return a;
872 }
873 if (aExp == 0) {
874 zSig = (aSig + bSig) >> 6;
875 if (aSig | bSig) {
876 float_raise(status, float_flag_denormal);
877 if (get_flush_underflow_to_zero(status)) {
878 float_raise(status, float_flag_underflow | float_flag_inexact);
879 return packFloat32(zSign, 0, 0);
880 }
881 if (! float_exception_masked(status, float_flag_underflow)) {
882 if (extractFloat32Frac(zSig) == zSig)
883 float_raise(status, float_flag_underflow);
884 }
885 }
886 return packFloat32(zSign, 0, zSig);
887 }
888 zSig = 0x40000000 + aSig + bSig;
889 return roundAndPackFloat32(zSign, aExp, zSig, status);
890 }
891 aSig |= 0x20000000;
892 zSig = (aSig + bSig)<<1;
893 --zExp;
894 if ((Bit32s) zSig < 0) {
895 zSig = aSig + bSig;
896 ++zExp;
897 }
898 return roundAndPackFloat32(zSign, zExp, zSig, status);
899 }
900
901 /*----------------------------------------------------------------------------
902 | Returns the result of subtracting the absolute values of the single-
903 | precision floating-point values `a' and `b'. If `zSign' is 1, the
904 | difference is negated before being returned. `zSign' is ignored if the
905 | result is a NaN. The subtraction is performed according to the IEC/IEEE
906 | Standard for Binary Floating-Point Arithmetic.
907 *----------------------------------------------------------------------------*/
908
subFloat32Sigs(float32 a,float32 b,int zSign,float_status_t & status)909 static float32 subFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status)
910 {
911 Bit16s aExp, bExp, zExp;
912 Bit32u aSig, bSig, zSig;
913 Bit16s expDiff;
914
915 aSig = extractFloat32Frac(a);
916 aExp = extractFloat32Exp(a);
917 bSig = extractFloat32Frac(b);
918 bExp = extractFloat32Exp(b);
919
920 if (get_denormals_are_zeros(status)) {
921 if (aExp == 0) aSig = 0;
922 if (bExp == 0) bSig = 0;
923 }
924
925 expDiff = aExp - bExp;
926 aSig <<= 7;
927 bSig <<= 7;
928 if (0 < expDiff) goto aExpBigger;
929 if (expDiff < 0) goto bExpBigger;
930 if (aExp == 0xFF) {
931 if (aSig | bSig) return propagateFloat32NaN(a, b, status);
932 float_raise(status, float_flag_invalid);
933 return float32_default_nan;
934 }
935 if (aExp == 0) {
936 if (aSig | bSig) float_raise(status, float_flag_denormal);
937 aExp = 1;
938 bExp = 1;
939 }
940 if (bSig < aSig) goto aBigger;
941 if (aSig < bSig) goto bBigger;
942 return packFloat32(get_float_rounding_mode(status) == float_round_down, 0, 0);
943 bExpBigger:
944 if (bExp == 0xFF) {
945 if (bSig) return propagateFloat32NaN(a, b, status);
946 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
947 return packFloat32(zSign ^ 1, 0xFF, 0);
948 }
949 if ((bExp == 0) && bSig)
950 float_raise(status, float_flag_denormal);
951
952 if (aExp == 0) {
953 if (aSig) float_raise(status, float_flag_denormal);
954 ++expDiff;
955 }
956 else aSig |= 0x40000000;
957
958 aSig = shift32RightJamming(aSig, -expDiff);
959 bSig |= 0x40000000;
960 bBigger:
961 zSig = bSig - aSig;
962 zExp = bExp;
963 zSign ^= 1;
964 goto normalizeRoundAndPack;
965 aExpBigger:
966 if (aExp == 0xFF) {
967 if (aSig) return propagateFloat32NaN(a, b, status);
968 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
969 return a;
970 }
971 if ((aExp == 0) && aSig)
972 float_raise(status, float_flag_denormal);
973
974 if (bExp == 0) {
975 if (bSig) float_raise(status, float_flag_denormal);
976 --expDiff;
977 }
978 else bSig |= 0x40000000;
979
980 bSig = shift32RightJamming(bSig, expDiff);
981 aSig |= 0x40000000;
982 aBigger:
983 zSig = aSig - bSig;
984 zExp = aExp;
985 normalizeRoundAndPack:
986 --zExp;
987 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
988 }
989
990 /*----------------------------------------------------------------------------
991 | Returns the result of adding the single-precision floating-point values `a'
992 | and `b'. The operation is performed according to the IEC/IEEE Standard for
993 | Binary Floating-Point Arithmetic.
994 *----------------------------------------------------------------------------*/
995
float32_add(float32 a,float32 b,float_status_t & status)996 float32 float32_add(float32 a, float32 b, float_status_t &status)
997 {
998 int aSign = extractFloat32Sign(a);
999 int bSign = extractFloat32Sign(b);
1000
1001 if (aSign == bSign) {
1002 return addFloat32Sigs(a, b, aSign, status);
1003 }
1004 else {
1005 return subFloat32Sigs(a, b, aSign, status);
1006 }
1007 }
1008
1009 /*----------------------------------------------------------------------------
1010 | Returns the result of subtracting the single-precision floating-point values
1011 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1012 | for Binary Floating-Point Arithmetic.
1013 *----------------------------------------------------------------------------*/
1014
float32_sub(float32 a,float32 b,float_status_t & status)1015 float32 float32_sub(float32 a, float32 b, float_status_t &status)
1016 {
1017 int aSign = extractFloat32Sign(a);
1018 int bSign = extractFloat32Sign(b);
1019
1020 if (aSign == bSign) {
1021 return subFloat32Sigs(a, b, aSign, status);
1022 }
1023 else {
1024 return addFloat32Sigs(a, b, aSign, status);
1025 }
1026 }
1027
1028 /*----------------------------------------------------------------------------
1029 | Returns the result of multiplying the single-precision floating-point values
1030 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1031 | for Binary Floating-Point Arithmetic.
1032 *----------------------------------------------------------------------------*/
1033
float32_mul(float32 a,float32 b,float_status_t & status)1034 float32 float32_mul(float32 a, float32 b, float_status_t &status)
1035 {
1036 int aSign, bSign, zSign;
1037 Bit16s aExp, bExp, zExp;
1038 Bit32u aSig, bSig;
1039 Bit64u zSig64;
1040 Bit32u zSig;
1041
1042 aSig = extractFloat32Frac(a);
1043 aExp = extractFloat32Exp(a);
1044 aSign = extractFloat32Sign(a);
1045 bSig = extractFloat32Frac(b);
1046 bExp = extractFloat32Exp(b);
1047 bSign = extractFloat32Sign(b);
1048 zSign = aSign ^ bSign;
1049
1050 if (get_denormals_are_zeros(status)) {
1051 if (aExp == 0) aSig = 0;
1052 if (bExp == 0) bSig = 0;
1053 }
1054
1055 if (aExp == 0xFF) {
1056 if (aSig || ((bExp == 0xFF) && bSig))
1057 return propagateFloat32NaN(a, b, status);
1058
1059 if ((bExp | bSig) == 0) {
1060 float_raise(status, float_flag_invalid);
1061 return float32_default_nan;
1062 }
1063 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1064 return packFloat32(zSign, 0xFF, 0);
1065 }
1066 if (bExp == 0xFF) {
1067 if (bSig) return propagateFloat32NaN(a, b, status);
1068 if ((aExp | aSig) == 0) {
1069 float_raise(status, float_flag_invalid);
1070 return float32_default_nan;
1071 }
1072 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1073 return packFloat32(zSign, 0xFF, 0);
1074 }
1075 if (aExp == 0) {
1076 if (aSig == 0) {
1077 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1078 return packFloat32(zSign, 0, 0);
1079 }
1080 float_raise(status, float_flag_denormal);
1081 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
1082 }
1083 if (bExp == 0) {
1084 if (bSig == 0) return packFloat32(zSign, 0, 0);
1085 float_raise(status, float_flag_denormal);
1086 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
1087 }
1088 zExp = aExp + bExp - 0x7F;
1089 aSig = (aSig | 0x00800000)<<7;
1090 bSig = (bSig | 0x00800000)<<8;
1091 zSig64 = shift64RightJamming(((Bit64u) aSig) * bSig, 32);
1092 zSig = (Bit32u) zSig64;
1093 if (0 <= (Bit32s) (zSig<<1)) {
1094 zSig <<= 1;
1095 --zExp;
1096 }
1097 return roundAndPackFloat32(zSign, zExp, zSig, status);
1098 }
1099
1100 /*----------------------------------------------------------------------------
1101 | Returns the result of dividing the single-precision floating-point value `a'
1102 | by the corresponding value `b'. The operation is performed according to the
1103 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1104 *----------------------------------------------------------------------------*/
1105
float32_div(float32 a,float32 b,float_status_t & status)1106 float32 float32_div(float32 a, float32 b, float_status_t &status)
1107 {
1108 int aSign, bSign, zSign;
1109 Bit16s aExp, bExp, zExp;
1110 Bit32u aSig, bSig, zSig;
1111
1112 aSig = extractFloat32Frac(a);
1113 aExp = extractFloat32Exp(a);
1114 aSign = extractFloat32Sign(a);
1115 bSig = extractFloat32Frac(b);
1116 bExp = extractFloat32Exp(b);
1117 bSign = extractFloat32Sign(b);
1118 zSign = aSign ^ bSign;
1119
1120 if (get_denormals_are_zeros(status)) {
1121 if (aExp == 0) aSig = 0;
1122 if (bExp == 0) bSig = 0;
1123 }
1124
1125 if (aExp == 0xFF) {
1126 if (aSig) return propagateFloat32NaN(a, b, status);
1127 if (bExp == 0xFF) {
1128 if (bSig) return propagateFloat32NaN(a, b, status);
1129 float_raise(status, float_flag_invalid);
1130 return float32_default_nan;
1131 }
1132 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1133 return packFloat32(zSign, 0xFF, 0);
1134 }
1135 if (bExp == 0xFF) {
1136 if (bSig) return propagateFloat32NaN(a, b, status);
1137 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1138 return packFloat32(zSign, 0, 0);
1139 }
1140 if (bExp == 0) {
1141 if (bSig == 0) {
1142 if ((aExp | aSig) == 0) {
1143 float_raise(status, float_flag_invalid);
1144 return float32_default_nan;
1145 }
1146 float_raise(status, float_flag_divbyzero);
1147 return packFloat32(zSign, 0xFF, 0);
1148 }
1149 float_raise(status, float_flag_denormal);
1150 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
1151 }
1152 if (aExp == 0) {
1153 if (aSig == 0) return packFloat32(zSign, 0, 0);
1154 float_raise(status, float_flag_denormal);
1155 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
1156 }
1157 zExp = aExp - bExp + 0x7D;
1158 aSig = (aSig | 0x00800000)<<7;
1159 bSig = (bSig | 0x00800000)<<8;
1160 if (bSig <= (aSig + aSig)) {
1161 aSig >>= 1;
1162 ++zExp;
1163 }
1164 zSig = (((Bit64u) aSig)<<32) / bSig;
1165 if ((zSig & 0x3F) == 0) {
1166 zSig |= ((Bit64u) bSig * zSig != ((Bit64u) aSig)<<32);
1167 }
1168 return roundAndPackFloat32(zSign, zExp, zSig, status);
1169 }
1170
1171 /*----------------------------------------------------------------------------
1172 | Returns the square root of the single-precision floating-point value `a'.
1173 | The operation is performed according to the IEC/IEEE Standard for Binary
1174 | Floating-Point Arithmetic.
1175 *----------------------------------------------------------------------------*/
1176
float32_sqrt(float32 a,float_status_t & status)1177 float32 float32_sqrt(float32 a, float_status_t &status)
1178 {
1179 int aSign;
1180 Bit16s aExp, zExp;
1181 Bit32u aSig, zSig;
1182 Bit64u rem, term;
1183
1184 aSig = extractFloat32Frac(a);
1185 aExp = extractFloat32Exp(a);
1186 aSign = extractFloat32Sign(a);
1187
1188 if (aExp == 0xFF) {
1189 if (aSig) return propagateFloat32NaN(a, status);
1190 if (! aSign) return a;
1191 float_raise(status, float_flag_invalid);
1192 return float32_default_nan;
1193 }
1194
1195 if (get_denormals_are_zeros(status)) {
1196 if (aExp == 0) aSig = 0;
1197 }
1198
1199 if (aSign) {
1200 if ((aExp | aSig) == 0) return packFloat32(aSign, 0, 0);
1201 float_raise(status, float_flag_invalid);
1202 return float32_default_nan;
1203 }
1204 if (aExp == 0) {
1205 if (aSig == 0) return 0;
1206 float_raise(status, float_flag_denormal);
1207 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
1208 }
1209 zExp = ((aExp - 0x7F)>>1) + 0x7E;
1210 aSig = (aSig | 0x00800000)<<8;
1211 zSig = estimateSqrt32(aExp, aSig) + 2;
1212 if ((zSig & 0x7F) <= 5) {
1213 if (zSig < 2) {
1214 zSig = 0x7FFFFFFF;
1215 goto roundAndPack;
1216 }
1217 aSig >>= aExp & 1;
1218 term = ((Bit64u) zSig) * zSig;
1219 rem = (((Bit64u) aSig)<<32) - term;
1220 while ((Bit64s) rem < 0) {
1221 --zSig;
1222 rem += (((Bit64u) zSig)<<1) | 1;
1223 }
1224 zSig |= (rem != 0);
1225 }
1226 zSig = shift32RightJamming(zSig, 1);
1227 roundAndPack:
1228 return roundAndPackFloat32(0, zExp, zSig, status);
1229 }
1230
1231 /*----------------------------------------------------------------------------
1232 | Determine single-precision floating-point number class.
1233 *----------------------------------------------------------------------------*/
1234
float32_class(float32 a)1235 float_class_t float32_class(float32 a)
1236 {
1237 Bit16s aExp = extractFloat32Exp(a);
1238 Bit32u aSig = extractFloat32Frac(a);
1239 int aSign = extractFloat32Sign(a);
1240
1241 if(aExp == 0xFF) {
1242 if (aSig == 0)
1243 return (aSign) ? float_negative_inf : float_positive_inf;
1244
1245 return (aSig & 0x00400000) ? float_QNaN : float_SNaN;
1246 }
1247
1248 if(aExp == 0) {
1249 if (aSig == 0) return float_zero;
1250 return float_denormal;
1251 }
1252
1253 return float_normalized;
1254 }
1255
1256 /*----------------------------------------------------------------------------
1257 | Compare between two single precision floating point numbers. Returns
1258 | 'float_relation_equal' if the operands are equal, 'float_relation_less' if
1259 | the value 'a' is less than the corresponding value `b',
1260 | 'float_relation_greater' if the value 'a' is greater than the corresponding
1261 | value `b', or 'float_relation_unordered' otherwise.
1262 *----------------------------------------------------------------------------*/
1263
float32_compare(float32 a,float32 b,float_status_t & status)1264 int float32_compare(float32 a, float32 b, float_status_t &status)
1265 {
1266 if (get_denormals_are_zeros(status)) {
1267 a = float32_denormal_to_zero(a);
1268 b = float32_denormal_to_zero(b);
1269 }
1270
1271 float_class_t aClass = float32_class(a);
1272 float_class_t bClass = float32_class(b);
1273
1274 if (aClass == float_SNaN || aClass == float_QNaN || bClass == float_SNaN || bClass == float_QNaN)
1275 {
1276 float_raise(status, float_flag_invalid);
1277 return float_relation_unordered;
1278 }
1279
1280 if (aClass == float_denormal || bClass == float_denormal) {
1281 float_raise(status, float_flag_denormal);
1282 }
1283
1284 if ((a == b) || ((Bit32u) ((a | b)<<1) == 0)) return float_relation_equal;
1285
1286 int aSign = extractFloat32Sign(a);
1287 int bSign = extractFloat32Sign(b);
1288 if (aSign != bSign)
1289 return (aSign) ? float_relation_less : float_relation_greater;
1290
1291 if (aSign ^ (a < b)) return float_relation_less;
1292 return float_relation_greater;
1293 }
1294
1295 /*----------------------------------------------------------------------------
1296 | Compare between two double precision floating point numbers. Returns
1297 | 'float_relation_equal' if the operands are equal, 'float_relation_less' if
1298 | the value 'a' is less than the corresponding value `b',
1299 | 'float_relation_greater' if the value 'a' is greater than the corresponding
1300 | value `b', or 'float_relation_unordered' otherwise. Quiet NaNs do not cause
1301 | an exception.
1302 *----------------------------------------------------------------------------*/
1303
float32_compare_quiet(float32 a,float32 b,float_status_t & status)1304 int float32_compare_quiet(float32 a, float32 b, float_status_t &status)
1305 {
1306 if (get_denormals_are_zeros(status)) {
1307 a = float32_denormal_to_zero(a);
1308 b = float32_denormal_to_zero(b);
1309 }
1310
1311 float_class_t aClass = float32_class(a);
1312 float_class_t bClass = float32_class(b);
1313
1314 if (aClass == float_SNaN || bClass == float_SNaN) {
1315 float_raise(status, float_flag_invalid);
1316 return float_relation_unordered;
1317 }
1318
1319 if (aClass == float_QNaN || bClass == float_QNaN) {
1320 return float_relation_unordered;
1321 }
1322
1323 if (aClass == float_denormal || bClass == float_denormal) {
1324 float_raise(status, float_flag_denormal);
1325 }
1326
1327 if ((a == b) || ((Bit32u) ((a | b)<<1) == 0)) return float_relation_equal;
1328
1329 int aSign = extractFloat32Sign(a);
1330 int bSign = extractFloat32Sign(b);
1331 if (aSign != bSign)
1332 return (aSign) ? float_relation_less : float_relation_greater;
1333
1334 if (aSign ^ (a < b)) return float_relation_less;
1335 return float_relation_greater;
1336 }
1337
1338 /*----------------------------------------------------------------------------
1339 | Compare bewteen two single precision floating point numbers and return the
1340 | smaller of them. The operation is performed according to the IEC/IEEE
1341 | Standard for Binary Floating-Point Arithmetic.
1342 *----------------------------------------------------------------------------*/
1343
float32_min(float32 a,float32 b,float_status_t & status)1344 float32 float32_min(float32 a, float32 b, float_status_t &status)
1345 {
1346 if (get_denormals_are_zeros(status)) {
1347 a = float32_denormal_to_zero(a);
1348 b = float32_denormal_to_zero(b);
1349 }
1350
1351 return (float32_compare(a, b, status) == float_relation_less) ? a : b;
1352 }
1353
1354 /*----------------------------------------------------------------------------
1355 | Compare bewteen two single precision floating point numbers and return the
1356 | larger of them. The operation is performed according to the IEC/IEEE
1357 | Standard for Binary Floating-Point Arithmetic.
1358 *----------------------------------------------------------------------------*/
1359
float32_max(float32 a,float32 b,float_status_t & status)1360 float32 float32_max(float32 a, float32 b, float_status_t &status)
1361 {
1362 if (get_denormals_are_zeros(status)) {
1363 a = float32_denormal_to_zero(a);
1364 b = float32_denormal_to_zero(b);
1365 }
1366
1367 return (float32_compare(a, b, status) == float_relation_greater) ? a : b;
1368 }
1369
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the double-precision floating-point value
1372 | `a' to the 32-bit two's complement integer format. The conversion is
1373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1374 | Arithmetic - which means in particular that the conversion is rounded
1375 | according to the current rounding mode. If `a' is a NaN or the
1376 | conversion overflows, the integer indefinite value is returned.
1377 *----------------------------------------------------------------------------*/
1378
float64_to_int32(float64 a,float_status_t & status)1379 Bit32s float64_to_int32(float64 a, float_status_t &status)
1380 {
1381 Bit64u aSig = extractFloat64Frac(a);
1382 Bit16s aExp = extractFloat64Exp(a);
1383 int aSign = extractFloat64Sign(a);
1384 if ((aExp == 0x7FF) && aSig) aSign = 0;
1385 if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1386 else {
1387 if (get_denormals_are_zeros(status)) aSig = 0;
1388 }
1389 int shiftCount = 0x42C - aExp;
1390 if (0 < shiftCount) aSig = shift64RightJamming(aSig, shiftCount);
1391 return roundAndPackInt32(aSign, aSig, status);
1392 }
1393
1394 /*----------------------------------------------------------------------------
1395 | Returns the result of converting the double-precision floating-point value
1396 | `a' to the 32-bit two's complement integer format. The conversion is
1397 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1398 | Arithmetic, except that the conversion is always rounded toward zero.
1399 | If `a' is a NaN or the conversion overflows, the integer indefinite
1400 | value is returned.
1401 *----------------------------------------------------------------------------*/
1402
float64_to_int32_round_to_zero(float64 a,float_status_t & status)1403 Bit32s float64_to_int32_round_to_zero(float64 a, float_status_t &status)
1404 {
1405 int aSign;
1406 Bit16s aExp;
1407 Bit64u aSig, savedASig;
1408 Bit32s z;
1409 int shiftCount;
1410
1411 aSig = extractFloat64Frac(a);
1412 aExp = extractFloat64Exp(a);
1413 aSign = extractFloat64Sign(a);
1414 if (0x41E < aExp) {
1415 float_raise(status, float_flag_invalid);
1416 return (Bit32s)(int32_indefinite);
1417 }
1418 else if (aExp < 0x3FF) {
1419 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1420 if (aExp || aSig) float_raise(status, float_flag_inexact);
1421 return 0;
1422 }
1423 aSig |= BX_CONST64(0x0010000000000000);
1424 shiftCount = 0x433 - aExp;
1425 savedASig = aSig;
1426 aSig >>= shiftCount;
1427 z = (Bit32s) aSig;
1428 if (aSign) z = -z;
1429 if ((z < 0) ^ aSign) {
1430 float_raise(status, float_flag_invalid);
1431 return (Bit32s)(int32_indefinite);
1432 }
1433 if ((aSig<<shiftCount) != savedASig) {
1434 float_raise(status, float_flag_inexact);
1435 }
1436 return z;
1437 }
1438
1439 /*----------------------------------------------------------------------------
1440 | Returns the result of converting the double-precision floating-point value
1441 | `a' to the 32-bit unsigned integer format. The conversion is performed
1442 | according to the IEC/IEEE Standard for Binary Floating-point Arithmetic,
1443 | except that the conversion is always rounded toward zero. If `a' is a NaN
1444 | or conversion overflows, the largest positive integer is returned.
1445 *----------------------------------------------------------------------------*/
1446
float64_to_uint32_round_to_zero(float64 a,float_status_t & status)1447 Bit32u float64_to_uint32_round_to_zero(float64 a, float_status_t &status)
1448 {
1449 Bit64u aSig = extractFloat64Frac(a);
1450 Bit16s aExp = extractFloat64Exp(a);
1451 int aSign = extractFloat64Sign(a);
1452
1453 if (aExp < 0x3FF) {
1454 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1455 if (aExp || aSig) float_raise(status, float_flag_inexact);
1456 return 0;
1457 }
1458
1459 if (0x41E < aExp || aSign) {
1460 float_raise(status, float_flag_invalid);
1461 return uint32_indefinite;
1462 }
1463
1464 aSig |= BX_CONST64(0x0010000000000000);
1465 int shiftCount = 0x433 - aExp;
1466 Bit64u savedASig = aSig;
1467 aSig >>= shiftCount;
1468 if ((aSig<<shiftCount) != savedASig) {
1469 float_raise(status, float_flag_inexact);
1470 }
1471 return (Bit32u) aSig;
1472 }
1473
1474 /*----------------------------------------------------------------------------
1475 | Returns the result of converting the double-precision floating-point value
1476 | `a' to the 64-bit two's complement integer format. The conversion is
1477 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1478 | Arithmetic - which means in particular that the conversion is rounded
1479 | according to the current rounding mode. If `a' is a NaN, the largest
1480 | positive integer is returned. Otherwise, if the conversion overflows, the
1481 | largest integer with the same sign as `a' is returned.
1482 *----------------------------------------------------------------------------*/
1483
float64_to_int64(float64 a,float_status_t & status)1484 Bit64s float64_to_int64(float64 a, float_status_t &status)
1485 {
1486 int aSign;
1487 Bit16s aExp;
1488 Bit64u aSig, aSigExtra;
1489
1490 aSig = extractFloat64Frac(a);
1491 aExp = extractFloat64Exp(a);
1492 aSign = extractFloat64Sign(a);
1493 if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1494 else {
1495 if (get_denormals_are_zeros(status)) aSig = 0;
1496 }
1497 int shiftCount = 0x433 - aExp;
1498 if (shiftCount <= 0) {
1499 if (0x43E < aExp) {
1500 float_raise(status, float_flag_invalid);
1501 return (Bit64s)(int64_indefinite);
1502 }
1503 aSigExtra = 0;
1504 aSig <<= -shiftCount;
1505 }
1506 else {
1507 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
1508 }
1509 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
1510 }
1511
1512 /*----------------------------------------------------------------------------
1513 | Returns the result of converting the double-precision floating-point value
1514 | `a' to the 64-bit two's complement integer format. The conversion is
1515 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1516 | Arithmetic, except that the conversion is always rounded toward zero.
1517 | If `a' is a NaN or the conversion overflows, the integer indefinite
1518 | value is returned.
1519 *----------------------------------------------------------------------------*/
1520
float64_to_int64_round_to_zero(float64 a,float_status_t & status)1521 Bit64s float64_to_int64_round_to_zero(float64 a, float_status_t &status)
1522 {
1523 int aSign;
1524 Bit16s aExp;
1525 Bit64u aSig;
1526 Bit64s z;
1527
1528 aSig = extractFloat64Frac(a);
1529 aExp = extractFloat64Exp(a);
1530 aSign = extractFloat64Sign(a);
1531 if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1532 int shiftCount = aExp - 0x433;
1533 if (0 <= shiftCount) {
1534 if (0x43E <= aExp) {
1535 if (a != BX_CONST64(0xC3E0000000000000)) {
1536 float_raise(status, float_flag_invalid);
1537 }
1538 return (Bit64s)(int64_indefinite);
1539 }
1540 z = aSig<<shiftCount;
1541 }
1542 else {
1543 if (aExp < 0x3FE) {
1544 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1545 if (aExp | aSig) float_raise(status, float_flag_inexact);
1546 return 0;
1547 }
1548 z = aSig>>(-shiftCount);
1549 if ((Bit64u) (aSig<<(shiftCount & 63))) {
1550 float_raise(status, float_flag_inexact);
1551 }
1552 }
1553 if (aSign) z = -z;
1554 return z;
1555 }
1556
1557 /*----------------------------------------------------------------------------
1558 | Returns the result of converting the double-precision floating-point value
1559 | `a' to the 64-bit unsigned integer format. The conversion is performed
1560 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
1561 | except that the conversion is always rounded toward zero. If `a' is a NaN
1562 | or the conversion overflows, the largest unsigned integer is returned.
1563 *----------------------------------------------------------------------------*/
1564
float64_to_uint64_round_to_zero(float64 a,float_status_t & status)1565 Bit64u float64_to_uint64_round_to_zero(float64 a, float_status_t &status)
1566 {
1567 int aSign;
1568 Bit16s aExp;
1569 Bit64u aSig, z;
1570
1571 aSig = extractFloat64Frac(a);
1572 aExp = extractFloat64Exp(a);
1573 aSign = extractFloat64Sign(a);
1574
1575 if (aExp < 0x3FE) {
1576 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1577 if (aExp | aSig) float_raise(status, float_flag_inexact);
1578 return 0;
1579 }
1580
1581 if (0x43E <= aExp || aSign) {
1582 float_raise(status, float_flag_invalid);
1583 return uint64_indefinite;
1584 }
1585
1586 if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1587 int shiftCount = aExp - 0x433;
1588
1589 if (0 <= shiftCount) {
1590 z = aSig<<shiftCount;
1591 }
1592 else {
1593 z = aSig>>(-shiftCount);
1594 if ((Bit64u) (aSig<<(shiftCount & 63))) {
1595 float_raise(status, float_flag_inexact);
1596 }
1597 }
1598 return z;
1599 }
1600
1601 /*----------------------------------------------------------------------------
1602 | Returns the result of converting the double-precision floating-point value
1603 | `a' to the 32-bit unsigned integer format. The conversion is
1604 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1605 | Arithmetic---which means in particular that the conversion is rounded
1606 | according to the current rounding mode. If `a' is a NaN or the conversion
1607 | overflows, the largest unsigned integer is returned.
1608 *----------------------------------------------------------------------------*/
1609
float64_to_uint32(float64 a,float_status_t & status)1610 Bit32u float64_to_uint32(float64 a, float_status_t &status)
1611 {
1612 Bit64u val_64 = float64_to_uint64(a, status);
1613
1614 if (val_64 > 0xffffffff) {
1615 status.float_exception_flags = float_flag_invalid; // throw away other flags
1616 return uint32_indefinite;
1617 }
1618
1619 return (Bit32u) val_64;
1620 }
1621
1622 /*----------------------------------------------------------------------------
1623 | Returns the result of converting the double-precision floating-point value
1624 | `a' to the 64-bit unsigned integer format. The conversion is
1625 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1626 | Arithmetic---which means in particular that the conversion is rounded
1627 | according to the current rounding mode. If `a' is a NaN or the conversion
1628 | overflows, the largest unsigned integer is returned.
1629 *----------------------------------------------------------------------------*/
1630
float64_to_uint64(float64 a,float_status_t & status)1631 Bit64u float64_to_uint64(float64 a, float_status_t &status)
1632 {
1633 int aSign;
1634 Bit16s aExp, shiftCount;
1635 Bit64u aSig, aSigExtra;
1636
1637 aSig = extractFloat64Frac(a);
1638 aExp = extractFloat64Exp(a);
1639 aSign = extractFloat64Sign(a);
1640
1641 if (get_denormals_are_zeros(status)) {
1642 if (aExp == 0) aSig = 0;
1643 }
1644
1645 if (aSign && (aExp > 0x3FE)) {
1646 float_raise(status, float_flag_invalid);
1647 return uint64_indefinite;
1648 }
1649
1650 if (aExp) {
1651 aSig |= BX_CONST64(0x0010000000000000);
1652 }
1653 shiftCount = 0x433 - aExp;
1654 if (shiftCount <= 0) {
1655 if (0x43E < aExp) {
1656 float_raise(status, float_flag_invalid);
1657 return uint64_indefinite;
1658 }
1659 aSigExtra = 0;
1660 aSig <<= -shiftCount;
1661 } else {
1662 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
1663 }
1664
1665 return roundAndPackUint64(aSign, aSig, aSigExtra, status);
1666 }
1667
1668 /*----------------------------------------------------------------------------
1669 | Returns the result of converting the double-precision floating-point value
1670 | `a' to the single-precision floating-point format. The conversion is
1671 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1672 | Arithmetic.
1673 *----------------------------------------------------------------------------*/
1674
float64_to_float32(float64 a,float_status_t & status)1675 float32 float64_to_float32(float64 a, float_status_t &status)
1676 {
1677 int aSign;
1678 Bit16s aExp;
1679 Bit64u aSig;
1680 Bit32u zSig;
1681
1682 aSig = extractFloat64Frac(a);
1683 aExp = extractFloat64Exp(a);
1684 aSign = extractFloat64Sign(a);
1685 if (aExp == 0x7FF) {
1686 if (aSig) return commonNaNToFloat32(float64ToCommonNaN(a, status));
1687 return packFloat32(aSign, 0xFF, 0);
1688 }
1689 if (aExp == 0) {
1690 if (aSig == 0 || get_denormals_are_zeros(status))
1691 return packFloat32(aSign, 0, 0);
1692 float_raise(status, float_flag_denormal);
1693 }
1694 aSig = shift64RightJamming(aSig, 22);
1695 zSig = (Bit32u) aSig;
1696 if (aExp || zSig) {
1697 zSig |= 0x40000000;
1698 aExp -= 0x381;
1699 }
1700 return roundAndPackFloat32(aSign, aExp, zSig, status);
1701 }
1702
1703 /*----------------------------------------------------------------------------
1704 | Rounds the double-precision floating-point value `a' to an integer, and
1705 | returns the result as a double-precision floating-point value. The
1706 | operation is performed according to the IEC/IEEE Standard for Binary
1707 | Floating-Point Arithmetic.
1708 *----------------------------------------------------------------------------*/
1709
float64_round_to_int(float64 a,Bit8u scale,float_status_t & status)1710 float64 float64_round_to_int(float64 a, Bit8u scale, float_status_t &status)
1711 {
1712 Bit64u lastBitMask, roundBitsMask;
1713 int roundingMode = get_float_rounding_mode(status);
1714 Bit16s aExp = extractFloat64Exp(a);
1715 scale &= 0xf;
1716
1717 if ((aExp == 0x7FF) && extractFloat64Frac(a)) {
1718 return propagateFloat64NaN(a, status);
1719 }
1720
1721 aExp += scale; // scale the exponent
1722
1723 if (0x433 <= aExp) {
1724 return a;
1725 }
1726
1727 if (get_denormals_are_zeros(status)) {
1728 a = float64_denormal_to_zero(a);
1729 }
1730
1731 if (aExp < 0x3FF) {
1732 if ((Bit64u) (a<<1) == 0) return a;
1733 float_raise(status, float_flag_inexact);
1734 int aSign = extractFloat64Sign(a);
1735 switch (roundingMode) {
1736 case float_round_nearest_even:
1737 if ((aExp == 0x3FE) && extractFloat64Frac(a)) {
1738 return packFloat64(aSign, 0x3FF - scale, 0);
1739 }
1740 break;
1741 case float_round_down:
1742 return aSign ? packFloat64(1, 0x3FF - scale, 0) : float64_positive_zero;
1743 case float_round_up:
1744 return aSign ? float64_negative_zero : packFloat64(0, 0x3FF - scale, 0);
1745 }
1746 return packFloat64(aSign, 0, 0);
1747 }
1748
1749 lastBitMask = 1;
1750 lastBitMask <<= 0x433 - aExp;
1751 roundBitsMask = lastBitMask - 1;
1752 float64 z = a;
1753 if (roundingMode == float_round_nearest_even) {
1754 z += lastBitMask>>1;
1755 if ((z & roundBitsMask) == 0) z &= ~lastBitMask;
1756 }
1757 else if (roundingMode != float_round_to_zero) {
1758 if (extractFloat64Sign(z) ^ (roundingMode == float_round_up)) {
1759 z += roundBitsMask;
1760 }
1761 }
1762 z &= ~roundBitsMask;
1763 if (z != a) float_raise(status, float_flag_inexact);
1764 return z;
1765 }
1766
1767 /*----------------------------------------------------------------------------
1768 | Extracts the fractional portion of double-precision floating-point value `a',
1769 | and returns the result as a double-precision floating-point value. The
1770 | fractional results are precise. The operation is performed according to the
1771 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1772 *----------------------------------------------------------------------------*/
1773
float64_frc(float64 a,float_status_t & status)1774 float64 float64_frc(float64 a, float_status_t &status)
1775 {
1776 int roundingMode = get_float_rounding_mode(status);
1777
1778 Bit64u aSig = extractFloat64Frac(a);
1779 Bit16s aExp = extractFloat64Exp(a);
1780 int aSign = extractFloat64Sign(a);
1781
1782 if (aExp == 0x7FF) {
1783 if (aSig) return propagateFloat64NaN(a, status);
1784 float_raise(status, float_flag_invalid);
1785 return float64_default_nan;
1786 }
1787
1788 if (aExp >= 0x433) {
1789 return packFloat64(roundingMode == float_round_down, 0, 0);
1790 }
1791
1792 if (aExp < 0x3FF) {
1793 if (aExp == 0) {
1794 if (aSig == 0 || get_denormals_are_zeros(status))
1795 return packFloat64(roundingMode == float_round_down, 0, 0);
1796
1797 float_raise(status, float_flag_denormal);
1798 if (! float_exception_masked(status, float_flag_underflow))
1799 float_raise(status, float_flag_underflow);
1800
1801 if(get_flush_underflow_to_zero(status)) {
1802 float_raise(status, float_flag_underflow | float_flag_inexact);
1803 return packFloat64(aSign, 0, 0);
1804 }
1805 }
1806 return a;
1807 }
1808
1809 Bit64u lastBitMask = BX_CONST64(1) << (0x433 - aExp);
1810 Bit64u roundBitsMask = lastBitMask - 1;
1811
1812 aSig &= roundBitsMask;
1813 aSig <<= 10;
1814 aExp--;
1815
1816 if (aSig == 0)
1817 return packFloat64(roundingMode == float_round_down, 0, 0);
1818
1819 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
1820 }
1821
1822 /*----------------------------------------------------------------------------
1823 | Extracts the exponent portion of double-precision floating-point value 'a',
1824 | and returns the result as a double-precision floating-point value
1825 | representing unbiased integer exponent. The operation is performed according
1826 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1827 *----------------------------------------------------------------------------*/
1828
float64_getexp(float64 a,float_status_t & status)1829 float64 float64_getexp(float64 a, float_status_t &status)
1830 {
1831 Bit16s aExp = extractFloat64Exp(a);
1832 Bit64u aSig = extractFloat64Frac(a);
1833
1834 if (aExp == 0x7FF) {
1835 if (aSig) return propagateFloat64NaN(a, status);
1836 return float64_positive_inf;
1837 }
1838
1839 if (aExp == 0) {
1840 if (aSig == 0 || get_denormals_are_zeros(status))
1841 return float64_negative_inf;
1842
1843 float_raise(status, float_flag_denormal);
1844 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1845 }
1846
1847 return int32_to_float64(aExp - 0x3FF);
1848 }
1849
1850 /*----------------------------------------------------------------------------
1851 | Extracts the mantissa of double-precision floating-point value 'a' and
1852 | returns the result as a double-precision floating-point after applying
1853 | the mantissa interval normalization and sign control. The operation is
1854 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1855 | Arithmetic.
1856 *----------------------------------------------------------------------------*/
1857
float64_getmant(float64 a,float_status_t & status,int sign_ctrl,int interv)1858 float64 float64_getmant(float64 a, float_status_t &status, int sign_ctrl, int interv)
1859 {
1860 Bit16s aExp = extractFloat64Exp(a);
1861 Bit64u aSig = extractFloat64Frac(a);
1862 int aSign = extractFloat64Sign(a);
1863
1864 if (aExp == 0x7FF) {
1865 if (aSig) return propagateFloat64NaN(a, status);
1866 if (aSign) {
1867 if (sign_ctrl & 0x2) {
1868 float_raise(status, float_flag_invalid);
1869 return float64_default_nan;
1870 }
1871 }
1872 return packFloat64(~sign_ctrl & aSign, 0x3FF, 0);
1873 }
1874
1875 if (aExp == 0 && (aSig == 0 || get_denormals_are_zeros(status))) {
1876 return packFloat64(~sign_ctrl & aSign, 0x3FF, 0);
1877 }
1878
1879 if (aSign) {
1880 if (sign_ctrl & 0x2) {
1881 float_raise(status, float_flag_invalid);
1882 return float64_default_nan;
1883 }
1884 }
1885
1886 if (aExp == 0) {
1887 float_raise(status, float_flag_denormal);
1888 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1889 }
1890
1891 switch(interv) {
1892 case 0x0: // interval [1,2)
1893 aExp = 0x3FF;
1894 break;
1895 case 0x1: // interval [1/2,2)
1896 aExp -= 0x3FF;
1897 aExp = 0x3FF - (aExp & 0x1);
1898 break;
1899 case 0x2: // interval [1/2,1)
1900 aExp = 0x3FE;
1901 break;
1902 case 0x3: // interval [3/4,3/2)
1903 aExp = 0x3FF - ((aSig >> 51) & 0x1);
1904 break;
1905 }
1906
1907 return packFloat64(~sign_ctrl & aSign, aExp, aSig);
1908 }
1909
1910 /*----------------------------------------------------------------------------
1911 | Return the result of a floating point scale of the double-precision floating
1912 | point value `a' by multiplying it by 2 power of the double-precision
1913 | floating point value 'b' converted to integral value. If the result cannot
1914 | be represented in double precision, then the proper overflow response (for
1915 | positive scaling operand), or the proper underflow response (for negative
1916 | scaling operand) is issued. The operation is performed according to the
1917 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1918 *----------------------------------------------------------------------------*/
1919
float64_scalef(float64 a,float64 b,float_status_t & status)1920 float64 float64_scalef(float64 a, float64 b, float_status_t &status)
1921 {
1922 Bit64u aSig = extractFloat64Frac(a);
1923 Bit16s aExp = extractFloat64Exp(a);
1924 int aSign = extractFloat64Sign(a);
1925 Bit64u bSig = extractFloat64Frac(b);
1926 Bit16s bExp = extractFloat64Exp(b);
1927 int bSign = extractFloat64Sign(b);
1928
1929 if (get_denormals_are_zeros(status)) {
1930 if (aExp == 0) aSig = 0;
1931 if (bExp == 0) bSig = 0;
1932 }
1933
1934 if (bExp == 0x7FF) {
1935 if (bSig) return propagateFloat64NaN(a, b, status);
1936 }
1937
1938 if (aExp == 0x7FF) {
1939 if (aSig) {
1940 int aIsSignalingNaN = (aSig & BX_CONST64(0x0008000000000000)) == 0;
1941 if (aIsSignalingNaN || bExp != 0x7FF || bSig)
1942 return propagateFloat64NaN(a, b, status);
1943
1944 return bSign ? 0 : float64_positive_inf;
1945 }
1946
1947 if (bExp == 0x7FF && bSign) {
1948 float_raise(status, float_flag_invalid);
1949 return float64_default_nan;
1950 }
1951 return a;
1952 }
1953
1954 if ((aExp | aSig) == 0) {
1955 if (bExp == 0x7FF && ! bSign) {
1956 float_raise(status, float_flag_invalid);
1957 return float64_default_nan;
1958 }
1959 return a;
1960 }
1961
1962 if ((bExp | bSig) == 0) return a;
1963
1964 if (bExp == 0x7FF) {
1965 if (bSign) return packFloat64(aSign, 0, 0);
1966 return packFloat64(aSign, 0x7FF, 0);
1967 }
1968
1969 if (0x40F <= bExp) {
1970 // handle obvious overflow/underflow result
1971 return roundAndPackFloat64(aSign, bSign ? -0x3FF : 0x7FF, aSig, status);
1972 }
1973
1974 int scale = 0;
1975
1976 if (bExp < 0x3FF) {
1977 scale = -bSign;
1978 }
1979 else {
1980 bSig |= BX_CONST64(0x0010000000000000);
1981 int shiftCount = 0x433 - bExp;
1982 Bit64u savedBSig = bSig;
1983 bSig >>= shiftCount;
1984 scale = (Bit32s) bSig;
1985 if (bSign) {
1986 if ((bSig<<shiftCount) != savedBSig) scale++;
1987 scale = -scale;
1988 }
1989
1990 if (scale > 0x1000) scale = 0x1000;
1991 if (scale < -0x1000) scale = -0x1000;
1992 }
1993
1994 if (aExp != 0) {
1995 aSig |= BX_CONST64(0x0010000000000000);
1996 } else {
1997 aExp++;
1998 }
1999
2000 aExp += scale - 1;
2001 aSig <<= 10;
2002 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
2003 }
2004
2005 /*----------------------------------------------------------------------------
2006 | Returns the result of adding the absolute values of the double-precision
2007 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2008 | before being returned. `zSign' is ignored if the result is a NaN.
2009 | The addition is performed according to the IEC/IEEE Standard for Binary
2010 | Floating-Point Arithmetic.
2011 *----------------------------------------------------------------------------*/
2012
addFloat64Sigs(float64 a,float64 b,int zSign,float_status_t & status)2013 static float64 addFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status)
2014 {
2015 Bit16s aExp, bExp, zExp;
2016 Bit64u aSig, bSig, zSig;
2017 Bit16s expDiff;
2018
2019 aSig = extractFloat64Frac(a);
2020 aExp = extractFloat64Exp(a);
2021 bSig = extractFloat64Frac(b);
2022 bExp = extractFloat64Exp(b);
2023
2024 if (get_denormals_are_zeros(status)) {
2025 if (aExp == 0) aSig = 0;
2026 if (bExp == 0) bSig = 0;
2027 }
2028
2029 expDiff = aExp - bExp;
2030 aSig <<= 9;
2031 bSig <<= 9;
2032 if (0 < expDiff) {
2033 if (aExp == 0x7FF) {
2034 if (aSig) return propagateFloat64NaN(a, b, status);
2035 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2036 return a;
2037 }
2038 if ((aExp == 0) && aSig)
2039 float_raise(status, float_flag_denormal);
2040
2041 if (bExp == 0) {
2042 if (bSig) float_raise(status, float_flag_denormal);
2043 --expDiff;
2044 }
2045 else bSig |= BX_CONST64(0x2000000000000000);
2046
2047 bSig = shift64RightJamming(bSig, expDiff);
2048 zExp = aExp;
2049 }
2050 else if (expDiff < 0) {
2051 if (bExp == 0x7FF) {
2052 if (bSig) return propagateFloat64NaN(a, b, status);
2053 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2054 return packFloat64(zSign, 0x7FF, 0);
2055 }
2056 if ((bExp == 0) && bSig)
2057 float_raise(status, float_flag_denormal);
2058
2059 if (aExp == 0) {
2060 if (aSig) float_raise(status, float_flag_denormal);
2061 ++expDiff;
2062 }
2063 else aSig |= BX_CONST64(0x2000000000000000);
2064
2065 aSig = shift64RightJamming(aSig, -expDiff);
2066 zExp = bExp;
2067 }
2068 else {
2069 if (aExp == 0x7FF) {
2070 if (aSig | bSig) return propagateFloat64NaN(a, b, status);
2071 return a;
2072 }
2073 if (aExp == 0) {
2074 zSig = (aSig + bSig) >> 9;
2075 if (aSig | bSig) {
2076 float_raise(status, float_flag_denormal);
2077 if (get_flush_underflow_to_zero(status)) {
2078 float_raise(status, float_flag_underflow | float_flag_inexact);
2079 return packFloat64(zSign, 0, 0);
2080 }
2081 if (! float_exception_masked(status, float_flag_underflow)) {
2082 if (extractFloat64Frac(zSig) == zSig)
2083 float_raise(status, float_flag_underflow);
2084 }
2085 }
2086 return packFloat64(zSign, 0, zSig);
2087 }
2088 zSig = BX_CONST64(0x4000000000000000) + aSig + bSig;
2089 return roundAndPackFloat64(zSign, aExp, zSig, status);
2090 }
2091 aSig |= BX_CONST64(0x2000000000000000);
2092 zSig = (aSig + bSig)<<1;
2093 --zExp;
2094 if ((Bit64s) zSig < 0) {
2095 zSig = aSig + bSig;
2096 ++zExp;
2097 }
2098 return roundAndPackFloat64(zSign, zExp, zSig, status);
2099 }
2100
2101 /*----------------------------------------------------------------------------
2102 | Returns the result of subtracting the absolute values of the double-
2103 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2104 | difference is negated before being returned. `zSign' is ignored if the
2105 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2106 | Standard for Binary Floating-Point Arithmetic.
2107 *----------------------------------------------------------------------------*/
2108
subFloat64Sigs(float64 a,float64 b,int zSign,float_status_t & status)2109 static float64 subFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status)
2110 {
2111 Bit16s aExp, bExp, zExp;
2112 Bit64u aSig, bSig, zSig;
2113 Bit16s expDiff;
2114
2115 aSig = extractFloat64Frac(a);
2116 aExp = extractFloat64Exp(a);
2117 bSig = extractFloat64Frac(b);
2118 bExp = extractFloat64Exp(b);
2119
2120 if (get_denormals_are_zeros(status)) {
2121 if (aExp == 0) aSig = 0;
2122 if (bExp == 0) bSig = 0;
2123 }
2124
2125 expDiff = aExp - bExp;
2126 aSig <<= 10;
2127 bSig <<= 10;
2128 if (0 < expDiff) goto aExpBigger;
2129 if (expDiff < 0) goto bExpBigger;
2130 if (aExp == 0x7FF) {
2131 if (aSig | bSig) return propagateFloat64NaN(a, b, status);
2132 float_raise(status, float_flag_invalid);
2133 return float64_default_nan;
2134 }
2135 if (aExp == 0) {
2136 if (aSig | bSig) float_raise(status, float_flag_denormal);
2137 aExp = 1;
2138 bExp = 1;
2139 }
2140 if (bSig < aSig) goto aBigger;
2141 if (aSig < bSig) goto bBigger;
2142 return packFloat64(get_float_rounding_mode(status) == float_round_down, 0, 0);
2143 bExpBigger:
2144 if (bExp == 0x7FF) {
2145 if (bSig) return propagateFloat64NaN(a, b, status);
2146 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2147 return packFloat64(zSign ^ 1, 0x7FF, 0);
2148 }
2149 if ((bExp == 0) && bSig)
2150 float_raise(status, float_flag_denormal);
2151
2152 if (aExp == 0) {
2153 if (aSig) float_raise(status, float_flag_denormal);
2154 ++expDiff;
2155 }
2156 else aSig |= BX_CONST64(0x4000000000000000);
2157
2158 aSig = shift64RightJamming(aSig, -expDiff);
2159 bSig |= BX_CONST64(0x4000000000000000);
2160 bBigger:
2161 zSig = bSig - aSig;
2162 zExp = bExp;
2163 zSign ^= 1;
2164 goto normalizeRoundAndPack;
2165 aExpBigger:
2166 if (aExp == 0x7FF) {
2167 if (aSig) return propagateFloat64NaN(a, b, status);
2168 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2169 return a;
2170 }
2171 if ((aExp == 0) && aSig)
2172 float_raise(status, float_flag_denormal);
2173
2174 if (bExp == 0) {
2175 if (bSig) float_raise(status, float_flag_denormal);
2176 --expDiff;
2177 }
2178 else bSig |= BX_CONST64(0x4000000000000000);
2179
2180 bSig = shift64RightJamming(bSig, expDiff);
2181 aSig |= BX_CONST64(0x4000000000000000);
2182 aBigger:
2183 zSig = aSig - bSig;
2184 zExp = aExp;
2185 normalizeRoundAndPack:
2186 --zExp;
2187 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
2188 }
2189
2190 /*----------------------------------------------------------------------------
2191 | Returns the result of adding the double-precision floating-point values `a'
2192 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2193 | Binary Floating-Point Arithmetic.
2194 *----------------------------------------------------------------------------*/
2195
float64_add(float64 a,float64 b,float_status_t & status)2196 float64 float64_add(float64 a, float64 b, float_status_t &status)
2197 {
2198 int aSign = extractFloat64Sign(a);
2199 int bSign = extractFloat64Sign(b);
2200
2201 if (aSign == bSign) {
2202 return addFloat64Sigs(a, b, aSign, status);
2203 }
2204 else {
2205 return subFloat64Sigs(a, b, aSign, status);
2206 }
2207 }
2208
2209 /*----------------------------------------------------------------------------
2210 | Returns the result of subtracting the double-precision floating-point values
2211 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2212 | for Binary Floating-Point Arithmetic.
2213 *----------------------------------------------------------------------------*/
2214
float64_sub(float64 a,float64 b,float_status_t & status)2215 float64 float64_sub(float64 a, float64 b, float_status_t &status)
2216 {
2217 int aSign = extractFloat64Sign(a);
2218 int bSign = extractFloat64Sign(b);
2219
2220 if (aSign == bSign) {
2221 return subFloat64Sigs(a, b, aSign, status);
2222 }
2223 else {
2224 return addFloat64Sigs(a, b, aSign, status);
2225 }
2226 }
2227
2228 /*----------------------------------------------------------------------------
2229 | Returns the result of multiplying the double-precision floating-point values
2230 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2231 | for Binary Floating-Point Arithmetic.
2232 *----------------------------------------------------------------------------*/
2233
float64_mul(float64 a,float64 b,float_status_t & status)2234 float64 float64_mul(float64 a, float64 b, float_status_t &status)
2235 {
2236 int aSign, bSign, zSign;
2237 Bit16s aExp, bExp, zExp;
2238 Bit64u aSig, bSig, zSig0, zSig1;
2239
2240 aSig = extractFloat64Frac(a);
2241 aExp = extractFloat64Exp(a);
2242 aSign = extractFloat64Sign(a);
2243 bSig = extractFloat64Frac(b);
2244 bExp = extractFloat64Exp(b);
2245 bSign = extractFloat64Sign(b);
2246 zSign = aSign ^ bSign;
2247
2248 if (get_denormals_are_zeros(status)) {
2249 if (aExp == 0) aSig = 0;
2250 if (bExp == 0) bSig = 0;
2251 }
2252
2253 if (aExp == 0x7FF) {
2254 if (aSig || ((bExp == 0x7FF) && bSig)) {
2255 return propagateFloat64NaN(a, b, status);
2256 }
2257 if ((bExp | bSig) == 0) {
2258 float_raise(status, float_flag_invalid);
2259 return float64_default_nan;
2260 }
2261 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2262 return packFloat64(zSign, 0x7FF, 0);
2263 }
2264 if (bExp == 0x7FF) {
2265 if (bSig) return propagateFloat64NaN(a, b, status);
2266 if ((aExp | aSig) == 0) {
2267 float_raise(status, float_flag_invalid);
2268 return float64_default_nan;
2269 }
2270 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2271 return packFloat64(zSign, 0x7FF, 0);
2272 }
2273 if (aExp == 0) {
2274 if (aSig == 0) {
2275 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2276 return packFloat64(zSign, 0, 0);
2277 }
2278 float_raise(status, float_flag_denormal);
2279 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2280 }
2281 if (bExp == 0) {
2282 if (bSig == 0) return packFloat64(zSign, 0, 0);
2283 float_raise(status, float_flag_denormal);
2284 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
2285 }
2286 zExp = aExp + bExp - 0x3FF;
2287 aSig = (aSig | BX_CONST64(0x0010000000000000))<<10;
2288 bSig = (bSig | BX_CONST64(0x0010000000000000))<<11;
2289 mul64To128(aSig, bSig, &zSig0, &zSig1);
2290 zSig0 |= (zSig1 != 0);
2291 if (0 <= (Bit64s) (zSig0<<1)) {
2292 zSig0 <<= 1;
2293 --zExp;
2294 }
2295 return roundAndPackFloat64(zSign, zExp, zSig0, status);
2296 }
2297
2298 /*----------------------------------------------------------------------------
2299 | Returns the result of dividing the double-precision floating-point value `a'
2300 | by the corresponding value `b'. The operation is performed according to
2301 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2302 *----------------------------------------------------------------------------*/
2303
float64_div(float64 a,float64 b,float_status_t & status)2304 float64 float64_div(float64 a, float64 b, float_status_t &status)
2305 {
2306 int aSign, bSign, zSign;
2307 Bit16s aExp, bExp, zExp;
2308 Bit64u aSig, bSig, zSig;
2309 Bit64u rem0, rem1;
2310 Bit64u term0, term1;
2311
2312 aSig = extractFloat64Frac(a);
2313 aExp = extractFloat64Exp(a);
2314 aSign = extractFloat64Sign(a);
2315 bSig = extractFloat64Frac(b);
2316 bExp = extractFloat64Exp(b);
2317 bSign = extractFloat64Sign(b);
2318 zSign = aSign ^ bSign;
2319
2320 if (get_denormals_are_zeros(status)) {
2321 if (aExp == 0) aSig = 0;
2322 if (bExp == 0) bSig = 0;
2323 }
2324
2325 if (aExp == 0x7FF) {
2326 if (aSig) return propagateFloat64NaN(a, b, status);
2327 if (bExp == 0x7FF) {
2328 if (bSig) return propagateFloat64NaN(a, b, status);
2329 float_raise(status, float_flag_invalid);
2330 return float64_default_nan;
2331 }
2332 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2333 return packFloat64(zSign, 0x7FF, 0);
2334 }
2335 if (bExp == 0x7FF) {
2336 if (bSig) return propagateFloat64NaN(a, b, status);
2337 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2338 return packFloat64(zSign, 0, 0);
2339 }
2340 if (bExp == 0) {
2341 if (bSig == 0) {
2342 if ((aExp | aSig) == 0) {
2343 float_raise(status, float_flag_invalid);
2344 return float64_default_nan;
2345 }
2346 float_raise(status, float_flag_divbyzero);
2347 return packFloat64(zSign, 0x7FF, 0);
2348 }
2349 float_raise(status, float_flag_denormal);
2350 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
2351 }
2352 if (aExp == 0) {
2353 if (aSig == 0) return packFloat64(zSign, 0, 0);
2354 float_raise(status, float_flag_denormal);
2355 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2356 }
2357 zExp = aExp - bExp + 0x3FD;
2358 aSig = (aSig | BX_CONST64(0x0010000000000000))<<10;
2359 bSig = (bSig | BX_CONST64(0x0010000000000000))<<11;
2360 if (bSig <= (aSig + aSig)) {
2361 aSig >>= 1;
2362 ++zExp;
2363 }
2364 zSig = estimateDiv128To64(aSig, 0, bSig);
2365 if ((zSig & 0x1FF) <= 2) {
2366 mul64To128(bSig, zSig, &term0, &term1);
2367 sub128(aSig, 0, term0, term1, &rem0, &rem1);
2368 while ((Bit64s) rem0 < 0) {
2369 --zSig;
2370 add128(rem0, rem1, 0, bSig, &rem0, &rem1);
2371 }
2372 zSig |= (rem1 != 0);
2373 }
2374 return roundAndPackFloat64(zSign, zExp, zSig, status);
2375 }
2376
2377 /*----------------------------------------------------------------------------
2378 | Returns the square root of the double-precision floating-point value `a'.
2379 | The operation is performed according to the IEC/IEEE Standard for Binary
2380 | Floating-Point Arithmetic.
2381 *----------------------------------------------------------------------------*/
2382
float64_sqrt(float64 a,float_status_t & status)2383 float64 float64_sqrt(float64 a, float_status_t &status)
2384 {
2385 int aSign;
2386 Bit16s aExp, zExp;
2387 Bit64u aSig, zSig, doubleZSig;
2388 Bit64u rem0, rem1, term0, term1;
2389
2390 aSig = extractFloat64Frac(a);
2391 aExp = extractFloat64Exp(a);
2392 aSign = extractFloat64Sign(a);
2393
2394 if (aExp == 0x7FF) {
2395 if (aSig) return propagateFloat64NaN(a, status);
2396 if (! aSign) return a;
2397 float_raise(status, float_flag_invalid);
2398 return float64_default_nan;
2399 }
2400
2401 if (get_denormals_are_zeros(status)) {
2402 if (aExp == 0) aSig = 0;
2403 }
2404
2405 if (aSign) {
2406 if ((aExp | aSig) == 0) return packFloat64(aSign, 0, 0);
2407 float_raise(status, float_flag_invalid);
2408 return float64_default_nan;
2409 }
2410 if (aExp == 0) {
2411 if (aSig == 0) return 0;
2412 float_raise(status, float_flag_denormal);
2413 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2414 }
2415 zExp = ((aExp - 0x3FF)>>1) + 0x3FE;
2416 aSig |= BX_CONST64(0x0010000000000000);
2417 zSig = estimateSqrt32(aExp, (Bit32u)(aSig>>21));
2418 aSig <<= 9 - (aExp & 1);
2419 zSig = estimateDiv128To64(aSig, 0, zSig<<32) + (zSig<<30);
2420 if ((zSig & 0x1FF) <= 5) {
2421 doubleZSig = zSig<<1;
2422 mul64To128(zSig, zSig, &term0, &term1);
2423 sub128(aSig, 0, term0, term1, &rem0, &rem1);
2424 while ((Bit64s) rem0 < 0) {
2425 --zSig;
2426 doubleZSig -= 2;
2427 add128(rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1);
2428 }
2429 zSig |= ((rem0 | rem1) != 0);
2430 }
2431 return roundAndPackFloat64(0, zExp, zSig, status);
2432 }
2433
2434 /*----------------------------------------------------------------------------
2435 | Determine double-precision floating-point number class
2436 *----------------------------------------------------------------------------*/
2437
float64_class(float64 a)2438 float_class_t float64_class(float64 a)
2439 {
2440 Bit16s aExp = extractFloat64Exp(a);
2441 Bit64u aSig = extractFloat64Frac(a);
2442 int aSign = extractFloat64Sign(a);
2443
2444 if(aExp == 0x7FF) {
2445 if (aSig == 0)
2446 return (aSign) ? float_negative_inf : float_positive_inf;
2447
2448 return (aSig & BX_CONST64(0x0008000000000000)) ? float_QNaN : float_SNaN;
2449 }
2450
2451 if(aExp == 0) {
2452 if (aSig == 0)
2453 return float_zero;
2454 return float_denormal;
2455 }
2456
2457 return float_normalized;
2458 }
2459
2460 /*----------------------------------------------------------------------------
2461 | Compare between two double precision floating point numbers. Returns
2462 | 'float_relation_equal' if the operands are equal, 'float_relation_less' if
2463 | the value 'a' is less than the corresponding value `b',
2464 | 'float_relation_greater' if the value 'a' is greater than the corresponding
2465 | value `b', or 'float_relation_unordered' otherwise.
2466 *----------------------------------------------------------------------------*/
2467
float64_compare(float64 a,float64 b,float_status_t & status)2468 int float64_compare(float64 a, float64 b, float_status_t &status)
2469 {
2470 if (get_denormals_are_zeros(status)) {
2471 a = float64_denormal_to_zero(a);
2472 b = float64_denormal_to_zero(b);
2473 }
2474
2475 float_class_t aClass = float64_class(a);
2476 float_class_t bClass = float64_class(b);
2477
2478 if (aClass == float_SNaN || aClass == float_QNaN || bClass == float_SNaN || bClass == float_QNaN)
2479 {
2480 float_raise(status, float_flag_invalid);
2481 return float_relation_unordered;
2482 }
2483
2484 if (aClass == float_denormal || bClass == float_denormal) {
2485 float_raise(status, float_flag_denormal);
2486 }
2487
2488 if ((a == b) || ((Bit64u) ((a | b)<<1) == 0)) return float_relation_equal;
2489
2490 int aSign = extractFloat64Sign(a);
2491 int bSign = extractFloat64Sign(b);
2492 if (aSign != bSign)
2493 return (aSign) ? float_relation_less : float_relation_greater;
2494
2495 if (aSign ^ (a < b)) return float_relation_less;
2496 return float_relation_greater;
2497 }
2498
2499 /*----------------------------------------------------------------------------
2500 | Compare between two double precision floating point numbers. Returns
2501 | 'float_relation_equal' if the operands are equal, 'float_relation_less' if
2502 | the value 'a' is less than the corresponding value `b',
2503 | 'float_relation_greater' if the value 'a' is greater than the corresponding
2504 | value `b', or 'float_relation_unordered' otherwise. Quiet NaNs do not cause
2505 | an exception.
2506 *----------------------------------------------------------------------------*/
2507
float64_compare_quiet(float64 a,float64 b,float_status_t & status)2508 int float64_compare_quiet(float64 a, float64 b, float_status_t &status)
2509 {
2510 if (get_denormals_are_zeros(status)) {
2511 a = float64_denormal_to_zero(a);
2512 b = float64_denormal_to_zero(b);
2513 }
2514
2515 float_class_t aClass = float64_class(a);
2516 float_class_t bClass = float64_class(b);
2517
2518 if (aClass == float_SNaN || bClass == float_SNaN) {
2519 float_raise(status, float_flag_invalid);
2520 return float_relation_unordered;
2521 }
2522
2523 if (aClass == float_QNaN || bClass == float_QNaN) {
2524 return float_relation_unordered;
2525 }
2526
2527 if (aClass == float_denormal || bClass == float_denormal) {
2528 float_raise(status, float_flag_denormal);
2529 }
2530
2531 if ((a == b) || ((Bit64u) ((a | b)<<1) == 0)) return float_relation_equal;
2532
2533 int aSign = extractFloat64Sign(a);
2534 int bSign = extractFloat64Sign(b);
2535 if (aSign != bSign)
2536 return (aSign) ? float_relation_less : float_relation_greater;
2537
2538 if (aSign ^ (a < b)) return float_relation_less;
2539 return float_relation_greater;
2540 }
2541
2542 /*----------------------------------------------------------------------------
2543 | Compare bewteen two double precision floating point numbers and return the
2544 | smaller of them. The operation is performed according to the IEC/IEEE
2545 | Standard for Binary Floating-Point Arithmetic.
2546 *----------------------------------------------------------------------------*/
2547
float64_min(float64 a,float64 b,float_status_t & status)2548 float64 float64_min(float64 a, float64 b, float_status_t &status)
2549 {
2550 if (get_denormals_are_zeros(status)) {
2551 a = float64_denormal_to_zero(a);
2552 b = float64_denormal_to_zero(b);
2553 }
2554
2555 return (float64_compare(a, b, status) == float_relation_less) ? a : b;
2556 }
2557
2558 /*----------------------------------------------------------------------------
2559 | Compare bewteen two double precision floating point numbers and return the
2560 | larger of them. The operation is performed according to the IEC/IEEE
2561 | Standard for Binary Floating-Point Arithmetic.
2562 *----------------------------------------------------------------------------*/
2563
float64_max(float64 a,float64 b,float_status_t & status)2564 float64 float64_max(float64 a, float64 b, float_status_t &status)
2565 {
2566 if (get_denormals_are_zeros(status)) {
2567 a = float64_denormal_to_zero(a);
2568 b = float64_denormal_to_zero(b);
2569 }
2570
2571 return (float64_compare(a, b, status) == float_relation_greater) ? a : b;
2572 }
2573
2574 #ifdef FLOATX80
2575
2576 /*----------------------------------------------------------------------------
2577 | Returns the result of converting the 32-bit two's complement integer `a'
2578 | to the extended double-precision floating-point format. The conversion
2579 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2580 | Arithmetic.
2581 *----------------------------------------------------------------------------*/
2582
int32_to_floatx80(Bit32s a)2583 floatx80 int32_to_floatx80(Bit32s a)
2584 {
2585 if (a == 0) return packFloatx80(0, 0, 0);
2586 int zSign = (a < 0);
2587 Bit32u absA = zSign ? -a : a;
2588 int shiftCount = countLeadingZeros32(absA) + 32;
2589 Bit64u zSig = absA;
2590 return packFloatx80(zSign, 0x403E - shiftCount, zSig<<shiftCount);
2591 }
2592
2593 /*----------------------------------------------------------------------------
2594 | Returns the result of converting the 64-bit two's complement integer `a'
2595 | to the extended double-precision floating-point format. The conversion
2596 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2597 | Arithmetic.
2598 *----------------------------------------------------------------------------*/
2599
int64_to_floatx80(Bit64s a)2600 floatx80 int64_to_floatx80(Bit64s a)
2601 {
2602 if (a == 0) return packFloatx80(0, 0, 0);
2603 int zSign = (a < 0);
2604 Bit64u absA = zSign ? -a : a;
2605 int shiftCount = countLeadingZeros64(absA);
2606 return packFloatx80(zSign, 0x403E - shiftCount, absA<<shiftCount);
2607 }
2608
2609 /*----------------------------------------------------------------------------
2610 | Returns the result of converting the single-precision floating-point value
2611 | `a' to the extended double-precision floating-point format. The conversion
2612 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2613 | Arithmetic.
2614 *----------------------------------------------------------------------------*/
2615
float32_to_floatx80(float32 a,float_status_t & status)2616 floatx80 float32_to_floatx80(float32 a, float_status_t &status)
2617 {
2618 Bit32u aSig = extractFloat32Frac(a);
2619 Bit16s aExp = extractFloat32Exp(a);
2620 int aSign = extractFloat32Sign(a);
2621 if (aExp == 0xFF) {
2622 if (aSig) return commonNaNToFloatx80(float32ToCommonNaN(a, status));
2623 return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2624 }
2625 if (aExp == 0) {
2626 if (aSig == 0) return packFloatx80(aSign, 0, 0);
2627 float_raise(status, float_flag_denormal);
2628 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2629 }
2630 aSig |= 0x00800000;
2631 return packFloatx80(aSign, aExp + 0x3F80, ((Bit64u) aSig)<<40);
2632 }
2633
2634 /*----------------------------------------------------------------------------
2635 | Returns the result of converting the double-precision floating-point value
2636 | `a' to the extended double-precision floating-point format. The conversion
2637 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2638 | Arithmetic.
2639 *----------------------------------------------------------------------------*/
2640
float64_to_floatx80(float64 a,float_status_t & status)2641 floatx80 float64_to_floatx80(float64 a, float_status_t &status)
2642 {
2643 Bit64u aSig = extractFloat64Frac(a);
2644 Bit16s aExp = extractFloat64Exp(a);
2645 int aSign = extractFloat64Sign(a);
2646
2647 if (aExp == 0x7FF) {
2648 if (aSig) return commonNaNToFloatx80(float64ToCommonNaN(a, status));
2649 return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2650 }
2651 if (aExp == 0) {
2652 if (aSig == 0) return packFloatx80(aSign, 0, 0);
2653 float_raise(status, float_flag_denormal);
2654 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2655 }
2656 return
2657 packFloatx80(
2658 aSign, aExp + 0x3C00, (aSig | BX_CONST64(0x0010000000000000))<<11);
2659 }
2660
2661 /*----------------------------------------------------------------------------
2662 | Returns the result of converting the extended double-precision floating-
2663 | point value `a' to the 32-bit two's complement integer format. The
2664 | conversion is performed according to the IEC/IEEE Standard for Binary
2665 | Floating-Point Arithmetic - which means in particular that the conversion
2666 | is rounded according to the current rounding mode. If `a' is a NaN or the
2667 | conversion overflows, the integer indefinite value is returned.
2668 *----------------------------------------------------------------------------*/
2669
floatx80_to_int32(floatx80 a,float_status_t & status)2670 Bit32s floatx80_to_int32(floatx80 a, float_status_t &status)
2671 {
2672 Bit64u aSig = extractFloatx80Frac(a);
2673 Bit32s aExp = extractFloatx80Exp(a);
2674 int aSign = extractFloatx80Sign(a);
2675
2676 // handle unsupported extended double-precision floating encodings
2677 if (floatx80_is_unsupported(a))
2678 {
2679 float_raise(status, float_flag_invalid);
2680 return int32_indefinite;
2681 }
2682
2683 if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1)) aSign = 0;
2684 int shiftCount = 0x4037 - aExp;
2685 if (shiftCount <= 0) shiftCount = 1;
2686 aSig = shift64RightJamming(aSig, shiftCount);
2687 return roundAndPackInt32(aSign, aSig, status);
2688 }
2689
2690 /*----------------------------------------------------------------------------
2691 | Returns the result of converting the extended double-precision floating-
2692 | point value `a' to the 32-bit two's complement integer format. The
2693 | conversion is performed according to the IEC/IEEE Standard for Binary
2694 | Floating-Point Arithmetic, except that the conversion is always rounded
2695 | toward zero. If `a' is a NaN or the conversion overflows, the integer
2696 | indefinite value is returned.
2697 *----------------------------------------------------------------------------*/
2698
floatx80_to_int32_round_to_zero(floatx80 a,float_status_t & status)2699 Bit32s floatx80_to_int32_round_to_zero(floatx80 a, float_status_t &status)
2700 {
2701 Bit32s aExp;
2702 Bit64u aSig, savedASig;
2703 Bit32s z;
2704 int shiftCount;
2705
2706 // handle unsupported extended double-precision floating encodings
2707 if (floatx80_is_unsupported(a))
2708 {
2709 float_raise(status, float_flag_invalid);
2710 return int32_indefinite;
2711 }
2712
2713 aSig = extractFloatx80Frac(a);
2714 aExp = extractFloatx80Exp(a);
2715 int aSign = extractFloatx80Sign(a);
2716
2717 if (aExp > 0x401E) {
2718 float_raise(status, float_flag_invalid);
2719 return (Bit32s)(int32_indefinite);
2720 }
2721 if (aExp < 0x3FFF) {
2722 if (aExp || aSig) float_raise(status, float_flag_inexact);
2723 return 0;
2724 }
2725 shiftCount = 0x403E - aExp;
2726 savedASig = aSig;
2727 aSig >>= shiftCount;
2728 z = (Bit32s) aSig;
2729 if (aSign) z = -z;
2730 if ((z < 0) ^ aSign) {
2731 float_raise(status, float_flag_invalid);
2732 return (Bit32s)(int32_indefinite);
2733 }
2734 if ((aSig<<shiftCount) != savedASig)
2735 {
2736 float_raise(status, float_flag_inexact);
2737 }
2738 return z;
2739 }
2740
2741 /*----------------------------------------------------------------------------
2742 | Returns the result of converting the extended double-precision floating-
2743 | point value `a' to the 64-bit two's complement integer format. The
2744 | conversion is performed according to the IEC/IEEE Standard for Binary
2745 | Floating-Point Arithmetic - which means in particular that the conversion
2746 | is rounded according to the current rounding mode. If `a' is a NaN or the
2747 | conversion overflows, the integer indefinite value is returned.
2748 *----------------------------------------------------------------------------*/
2749
floatx80_to_int64(floatx80 a,float_status_t & status)2750 Bit64s floatx80_to_int64(floatx80 a, float_status_t &status)
2751 {
2752 Bit32s aExp;
2753 Bit64u aSig, aSigExtra;
2754
2755 // handle unsupported extended double-precision floating encodings
2756 if (floatx80_is_unsupported(a))
2757 {
2758 float_raise(status, float_flag_invalid);
2759 return int64_indefinite;
2760 }
2761
2762 aSig = extractFloatx80Frac(a);
2763 aExp = extractFloatx80Exp(a);
2764 int aSign = extractFloatx80Sign(a);
2765
2766 int shiftCount = 0x403E - aExp;
2767 if (shiftCount <= 0)
2768 {
2769 if (shiftCount)
2770 {
2771 float_raise(status, float_flag_invalid);
2772 return (Bit64s)(int64_indefinite);
2773 }
2774 aSigExtra = 0;
2775 }
2776 else {
2777 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
2778 }
2779
2780 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
2781 }
2782
2783 /*----------------------------------------------------------------------------
2784 | Returns the result of converting the extended double-precision floating-
2785 | point value `a' to the 64-bit two's complement integer format. The
2786 | conversion is performed according to the IEC/IEEE Standard for Binary
2787 | Floating-Point Arithmetic, except that the conversion is always rounded
2788 | toward zero. If `a' is a NaN or the conversion overflows, the integer
2789 | indefinite value is returned.
2790 *----------------------------------------------------------------------------*/
2791
floatx80_to_int64_round_to_zero(floatx80 a,float_status_t & status)2792 Bit64s floatx80_to_int64_round_to_zero(floatx80 a, float_status_t &status)
2793 {
2794 int aSign;
2795 Bit32s aExp;
2796 Bit64u aSig;
2797 Bit64s z;
2798
2799 // handle unsupported extended double-precision floating encodings
2800 if (floatx80_is_unsupported(a))
2801 {
2802 float_raise(status, float_flag_invalid);
2803 return int64_indefinite;
2804 }
2805
2806 aSig = extractFloatx80Frac(a);
2807 aExp = extractFloatx80Exp(a);
2808 aSign = extractFloatx80Sign(a);
2809 int shiftCount = aExp - 0x403E;
2810 if (0 <= shiftCount) {
2811 aSig &= BX_CONST64(0x7FFFFFFFFFFFFFFF);
2812 if ((a.exp != 0xC03E) || aSig) {
2813 float_raise(status, float_flag_invalid);
2814 }
2815 return (Bit64s)(int64_indefinite);
2816 }
2817 else if (aExp < 0x3FFF) {
2818 if (aExp | aSig) float_raise(status, float_flag_inexact);
2819 return 0;
2820 }
2821 z = aSig>>(-shiftCount);
2822 if ((Bit64u) (aSig<<(shiftCount & 63))) {
2823 float_raise(status, float_flag_inexact);
2824 }
2825 if (aSign) z = -z;
2826 return z;
2827 }
2828
2829 /*----------------------------------------------------------------------------
2830 | Returns the result of converting the extended double-precision floating-
2831 | point value `a' to the single-precision floating-point format. The
2832 | conversion is performed according to the IEC/IEEE Standard for Binary
2833 | Floating-Point Arithmetic.
2834 *----------------------------------------------------------------------------*/
2835
floatx80_to_float32(floatx80 a,float_status_t & status)2836 float32 floatx80_to_float32(floatx80 a, float_status_t &status)
2837 {
2838 Bit64u aSig = extractFloatx80Frac(a);
2839 Bit32s aExp = extractFloatx80Exp(a);
2840 int aSign = extractFloatx80Sign(a);
2841
2842 // handle unsupported extended double-precision floating encodings
2843 if (floatx80_is_unsupported(a))
2844 {
2845 float_raise(status, float_flag_invalid);
2846 return float32_default_nan;
2847 }
2848
2849 if (aExp == 0x7FFF) {
2850 if ((Bit64u) (aSig<<1))
2851 return commonNaNToFloat32(floatx80ToCommonNaN(a, status));
2852
2853 return packFloat32(aSign, 0xFF, 0);
2854 }
2855 aSig = shift64RightJamming(aSig, 33);
2856 if (aExp || aSig) aExp -= 0x3F81;
2857 return roundAndPackFloat32(aSign, aExp, (Bit32u) aSig, status);
2858 }
2859
2860 /*----------------------------------------------------------------------------
2861 | Returns the result of converting the extended double-precision floating-
2862 | point value `a' to the double-precision floating-point format. The
2863 | conversion is performed according to the IEC/IEEE Standard for Binary
2864 | Floating-Point Arithmetic.
2865 *----------------------------------------------------------------------------*/
2866
floatx80_to_float64(floatx80 a,float_status_t & status)2867 float64 floatx80_to_float64(floatx80 a, float_status_t &status)
2868 {
2869 Bit32s aExp;
2870 Bit64u aSig, zSig;
2871
2872 // handle unsupported extended double-precision floating encodings
2873 if (floatx80_is_unsupported(a))
2874 {
2875 float_raise(status, float_flag_invalid);
2876 return float64_default_nan;
2877 }
2878
2879 aSig = extractFloatx80Frac(a);
2880 aExp = extractFloatx80Exp(a);
2881 int aSign = extractFloatx80Sign(a);
2882
2883 if (aExp == 0x7FFF) {
2884 if ((Bit64u) (aSig<<1)) {
2885 return commonNaNToFloat64(floatx80ToCommonNaN(a, status));
2886 }
2887 return packFloat64(aSign, 0x7FF, 0);
2888 }
2889 zSig = shift64RightJamming(aSig, 1);
2890 if (aExp || aSig) aExp -= 0x3C01;
2891 return roundAndPackFloat64(aSign, aExp, zSig, status);
2892 }
2893
2894 /*----------------------------------------------------------------------------
2895 | Rounds the extended double-precision floating-point value `a' to an integer,
2896 | and returns the result as an extended double-precision floating-point
2897 | value. The operation is performed according to the IEC/IEEE Standard for
2898 | Binary Floating-Point Arithmetic.
2899 *----------------------------------------------------------------------------*/
2900
floatx80_round_to_int(floatx80 a,float_status_t & status)2901 floatx80 floatx80_round_to_int(floatx80 a, float_status_t &status)
2902 {
2903 int aSign;
2904 Bit64u lastBitMask, roundBitsMask;
2905 int roundingMode = get_float_rounding_mode(status);
2906 floatx80 z;
2907
2908 // handle unsupported extended double-precision floating encodings
2909 if (floatx80_is_unsupported(a))
2910 {
2911 float_raise(status, float_flag_invalid);
2912 return floatx80_default_nan;
2913 }
2914
2915 Bit32s aExp = extractFloatx80Exp(a);
2916 Bit64u aSig = extractFloatx80Frac(a);
2917 if (0x403E <= aExp) {
2918 if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1)) {
2919 return propagateFloatx80NaN(a, status);
2920 }
2921 return a;
2922 }
2923 if (aExp < 0x3FFF) {
2924 if (aExp == 0) {
2925 if ((aSig<<1) == 0) return a;
2926 float_raise(status, float_flag_denormal);
2927 }
2928 float_raise(status, float_flag_inexact);
2929 aSign = extractFloatx80Sign(a);
2930 switch (roundingMode) {
2931 case float_round_nearest_even:
2932 if ((aExp == 0x3FFE) && (Bit64u) (aSig<<1)) {
2933 set_float_rounding_up(status);
2934 return packFloatx80(aSign, 0x3FFF, BX_CONST64(0x8000000000000000));
2935 }
2936 break;
2937 case float_round_down:
2938 if (aSign) {
2939 set_float_rounding_up(status);
2940 return packFloatx80(1, 0x3FFF, BX_CONST64(0x8000000000000000));
2941 }
2942 else {
2943 return packFloatx80(0, 0, 0);
2944 }
2945 case float_round_up:
2946 if (aSign) {
2947 return packFloatx80(1, 0, 0);
2948 }
2949 else {
2950 set_float_rounding_up(status);
2951 return packFloatx80(0, 0x3FFF, BX_CONST64(0x8000000000000000));
2952 }
2953 }
2954 return packFloatx80(aSign, 0, 0);
2955 }
2956 lastBitMask = 1;
2957 lastBitMask <<= 0x403E - aExp;
2958 roundBitsMask = lastBitMask - 1;
2959 z = a;
2960 if (roundingMode == float_round_nearest_even) {
2961 z.fraction += lastBitMask>>1;
2962 if ((z.fraction & roundBitsMask) == 0) z.fraction &= ~lastBitMask;
2963 }
2964 else if (roundingMode != float_round_to_zero) {
2965 if (extractFloatx80Sign(z) ^ (roundingMode == float_round_up))
2966 z.fraction += roundBitsMask;
2967 }
2968 z.fraction &= ~roundBitsMask;
2969 if (z.fraction == 0) {
2970 z.exp++;
2971 z.fraction = BX_CONST64(0x8000000000000000);
2972 }
2973 if (z.fraction != a.fraction) {
2974 float_raise(status, float_flag_inexact);
2975 if (z.fraction > a.fraction || z.exp > a.exp)
2976 set_float_rounding_up(status);
2977 }
2978 return z;
2979 }
2980
2981 /*----------------------------------------------------------------------------
2982 | Returns the result of adding the absolute values of the extended double-
2983 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
2984 | negated before being returned. `zSign' is ignored if the result is a NaN.
2985 | The addition is performed according to the IEC/IEEE Standard for Binary
2986 | Floating-Point Arithmetic.
2987 *----------------------------------------------------------------------------*/
2988
addFloatx80Sigs(floatx80 a,floatx80 b,int zSign,float_status_t & status)2989 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status)
2990 {
2991 Bit32s aExp, bExp, zExp;
2992 Bit64u aSig, bSig, zSig0, zSig1;
2993
2994 // handle unsupported extended double-precision floating encodings
2995 if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
2996 {
2997 float_raise(status, float_flag_invalid);
2998 return floatx80_default_nan;
2999 }
3000
3001 aSig = extractFloatx80Frac(a);
3002 aExp = extractFloatx80Exp(a);
3003 bSig = extractFloatx80Frac(b);
3004 bExp = extractFloatx80Exp(b);
3005
3006 if (aExp == 0x7FFF) {
3007 if ((Bit64u) (aSig<<1) || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
3008 return propagateFloatx80NaN(a, b, status);
3009 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3010 return a;
3011 }
3012 if (bExp == 0x7FFF) {
3013 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3014 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
3015 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3016 }
3017 if (aExp == 0) {
3018 if (aSig == 0) {
3019 if ((bExp == 0) && bSig) {
3020 float_raise(status, float_flag_denormal);
3021 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3022 }
3023 return roundAndPackFloatx80(get_float_rounding_precision(status),
3024 zSign, bExp, bSig, 0, status);
3025 }
3026 float_raise(status, float_flag_denormal);
3027 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3028 }
3029 if (bExp == 0) {
3030 if (bSig == 0)
3031 return roundAndPackFloatx80(get_float_rounding_precision(status),
3032 zSign, aExp, aSig, 0, status);
3033
3034 float_raise(status, float_flag_denormal);
3035 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3036 }
3037 Bit32s expDiff = aExp - bExp;
3038 zExp = aExp;
3039 if (0 < expDiff) {
3040 shift64ExtraRightJamming(bSig, 0, expDiff, &bSig, &zSig1);
3041 }
3042 else if (expDiff < 0) {
3043 shift64ExtraRightJamming(aSig, 0, -expDiff, &aSig, &zSig1);
3044 zExp = bExp;
3045 }
3046 else {
3047 zSig0 = aSig + bSig;
3048 zSig1 = 0;
3049 goto shiftRight1;
3050 }
3051 zSig0 = aSig + bSig;
3052 if ((Bit64s) zSig0 < 0) goto roundAndPack;
3053 shiftRight1:
3054 shift64ExtraRightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3055 zSig0 |= BX_CONST64(0x8000000000000000);
3056 zExp++;
3057 roundAndPack:
3058 return
3059 roundAndPackFloatx80(get_float_rounding_precision(status),
3060 zSign, zExp, zSig0, zSig1, status);
3061 }
3062
3063 /*----------------------------------------------------------------------------
3064 | Returns the result of subtracting the absolute values of the extended
3065 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3066 | difference is negated before being returned. `zSign' is ignored if the
3067 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3068 | Standard for Binary Floating-Point Arithmetic.
3069 *----------------------------------------------------------------------------*/
3070
subFloatx80Sigs(floatx80 a,floatx80 b,int zSign,float_status_t & status)3071 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status)
3072 {
3073 Bit32s aExp, bExp, zExp;
3074 Bit64u aSig, bSig, zSig0, zSig1;
3075
3076 // handle unsupported extended double-precision floating encodings
3077 if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3078 {
3079 float_raise(status, float_flag_invalid);
3080 return floatx80_default_nan;
3081 }
3082
3083 aSig = extractFloatx80Frac(a);
3084 aExp = extractFloatx80Exp(a);
3085 bSig = extractFloatx80Frac(b);
3086 bExp = extractFloatx80Exp(b);
3087
3088 if (aExp == 0x7FFF) {
3089 if ((Bit64u) (aSig<<1)) return propagateFloatx80NaN(a, b, status);
3090 if (bExp == 0x7FFF) {
3091 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3092 float_raise(status, float_flag_invalid);
3093 return floatx80_default_nan;
3094 }
3095 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3096 return a;
3097 }
3098 if (bExp == 0x7FFF) {
3099 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3100 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
3101 return packFloatx80(zSign ^ 1, 0x7FFF, BX_CONST64(0x8000000000000000));
3102 }
3103 if (aExp == 0) {
3104 if (aSig == 0) {
3105 if (bExp == 0) {
3106 if (bSig) {
3107 float_raise(status, float_flag_denormal);
3108 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3109 return roundAndPackFloatx80(get_float_rounding_precision(status),
3110 zSign ^ 1, bExp, bSig, 0, status);
3111 }
3112 return packFloatx80(get_float_rounding_mode(status) == float_round_down, 0, 0);
3113 }
3114 return roundAndPackFloatx80(get_float_rounding_precision(status),
3115 zSign ^ 1, bExp, bSig, 0, status);
3116 }
3117 float_raise(status, float_flag_denormal);
3118 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3119 }
3120 if (bExp == 0) {
3121 if (bSig == 0)
3122 return roundAndPackFloatx80(get_float_rounding_precision(status),
3123 zSign, aExp, aSig, 0, status);
3124
3125 float_raise(status, float_flag_denormal);
3126 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3127 }
3128 Bit32s expDiff = aExp - bExp;
3129 if (0 < expDiff) {
3130 shift128RightJamming(bSig, 0, expDiff, &bSig, &zSig1);
3131 goto aBigger;
3132 }
3133 if (expDiff < 0) {
3134 shift128RightJamming(aSig, 0, -expDiff, &aSig, &zSig1);
3135 goto bBigger;
3136 }
3137 zSig1 = 0;
3138 if (bSig < aSig) goto aBigger;
3139 if (aSig < bSig) goto bBigger;
3140 return packFloatx80(get_float_rounding_mode(status) == float_round_down, 0, 0);
3141 bBigger:
3142 sub128(bSig, 0, aSig, zSig1, &zSig0, &zSig1);
3143 zExp = bExp;
3144 zSign ^= 1;
3145 goto normalizeRoundAndPack;
3146 aBigger:
3147 sub128(aSig, 0, bSig, zSig1, &zSig0, &zSig1);
3148 zExp = aExp;
3149 normalizeRoundAndPack:
3150 return
3151 normalizeRoundAndPackFloatx80(get_float_rounding_precision(status),
3152 zSign, zExp, zSig0, zSig1, status);
3153 }
3154
3155 /*----------------------------------------------------------------------------
3156 | Returns the result of adding the extended double-precision floating-point
3157 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3158 | Standard for Binary Floating-Point Arithmetic.
3159 *----------------------------------------------------------------------------*/
3160
floatx80_add(floatx80 a,floatx80 b,float_status_t & status)3161 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status_t &status)
3162 {
3163 int aSign = extractFloatx80Sign(a);
3164 int bSign = extractFloatx80Sign(b);
3165
3166 if (aSign == bSign)
3167 return addFloatx80Sigs(a, b, aSign, status);
3168 else
3169 return subFloatx80Sigs(a, b, aSign, status);
3170 }
3171
3172 /*----------------------------------------------------------------------------
3173 | Returns the result of subtracting the extended double-precision floating-
3174 | point values `a' and `b'. The operation is performed according to the
3175 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3176 *----------------------------------------------------------------------------*/
3177
floatx80_sub(floatx80 a,floatx80 b,float_status_t & status)3178 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status_t &status)
3179 {
3180 int aSign = extractFloatx80Sign(a);
3181 int bSign = extractFloatx80Sign(b);
3182
3183 if (aSign == bSign)
3184 return subFloatx80Sigs(a, b, aSign, status);
3185 else
3186 return addFloatx80Sigs(a, b, aSign, status);
3187 }
3188
3189 /*----------------------------------------------------------------------------
3190 | Returns the result of multiplying the extended double-precision floating-
3191 | point values `a' and `b'. The operation is performed according to the
3192 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3193 *----------------------------------------------------------------------------*/
3194
floatx80_mul(floatx80 a,floatx80 b,float_status_t & status)3195 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status_t &status)
3196 {
3197 int aSign, bSign, zSign;
3198 Bit32s aExp, bExp, zExp;
3199 Bit64u aSig, bSig, zSig0, zSig1;
3200
3201 // handle unsupported extended double-precision floating encodings
3202 if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3203 {
3204 invalid:
3205 float_raise(status, float_flag_invalid);
3206 return floatx80_default_nan;
3207 }
3208
3209 aSig = extractFloatx80Frac(a);
3210 aExp = extractFloatx80Exp(a);
3211 aSign = extractFloatx80Sign(a);
3212 bSig = extractFloatx80Frac(b);
3213 bExp = extractFloatx80Exp(b);
3214 bSign = extractFloatx80Sign(b);
3215 zSign = aSign ^ bSign;
3216
3217 if (aExp == 0x7FFF) {
3218 if ((Bit64u) (aSig<<1) || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1))) {
3219 return propagateFloatx80NaN(a, b, status);
3220 }
3221 if (bExp == 0) {
3222 if (bSig == 0) goto invalid;
3223 float_raise(status, float_flag_denormal);
3224 }
3225 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3226 }
3227 if (bExp == 0x7FFF) {
3228 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3229 if (aExp == 0) {
3230 if (aSig == 0) goto invalid;
3231 float_raise(status, float_flag_denormal);
3232 }
3233 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3234 }
3235 if (aExp == 0) {
3236 if (aSig == 0) {
3237 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3238 return packFloatx80(zSign, 0, 0);
3239 }
3240 float_raise(status, float_flag_denormal);
3241 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3242 }
3243 if (bExp == 0) {
3244 if (bSig == 0) return packFloatx80(zSign, 0, 0);
3245 float_raise(status, float_flag_denormal);
3246 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3247 }
3248 zExp = aExp + bExp - 0x3FFE;
3249 mul64To128(aSig, bSig, &zSig0, &zSig1);
3250 if (0 < (Bit64s) zSig0) {
3251 shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
3252 --zExp;
3253 }
3254 return
3255 roundAndPackFloatx80(get_float_rounding_precision(status),
3256 zSign, zExp, zSig0, zSig1, status);
3257 }
3258
3259 /*----------------------------------------------------------------------------
3260 | Returns the result of dividing the extended double-precision floating-point
3261 | value `a' by the corresponding value `b'. The operation is performed
3262 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3263 *----------------------------------------------------------------------------*/
3264
floatx80_div(floatx80 a,floatx80 b,float_status_t & status)3265 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status_t &status)
3266 {
3267 int aSign, bSign, zSign;
3268 Bit32s aExp, bExp, zExp;
3269 Bit64u aSig, bSig, zSig0, zSig1;
3270 Bit64u rem0, rem1, rem2, term0, term1, term2;
3271
3272 // handle unsupported extended double-precision floating encodings
3273 if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3274 {
3275 float_raise(status, float_flag_invalid);
3276 return floatx80_default_nan;
3277 }
3278
3279 aSig = extractFloatx80Frac(a);
3280 aExp = extractFloatx80Exp(a);
3281 aSign = extractFloatx80Sign(a);
3282 bSig = extractFloatx80Frac(b);
3283 bExp = extractFloatx80Exp(b);
3284 bSign = extractFloatx80Sign(b);
3285
3286 zSign = aSign ^ bSign;
3287 if (aExp == 0x7FFF) {
3288 if ((Bit64u) (aSig<<1)) return propagateFloatx80NaN(a, b, status);
3289 if (bExp == 0x7FFF) {
3290 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3291 float_raise(status, float_flag_invalid);
3292 return floatx80_default_nan;
3293 }
3294 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3295 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3296 }
3297 if (bExp == 0x7FFF) {
3298 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3299 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
3300 return packFloatx80(zSign, 0, 0);
3301 }
3302 if (bExp == 0) {
3303 if (bSig == 0) {
3304 if ((aExp | aSig) == 0) {
3305 float_raise(status, float_flag_invalid);
3306 return floatx80_default_nan;
3307 }
3308 float_raise(status, float_flag_divbyzero);
3309 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3310 }
3311 float_raise(status, float_flag_denormal);
3312 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3313 }
3314 if (aExp == 0) {
3315 if (aSig == 0) return packFloatx80(zSign, 0, 0);
3316 float_raise(status, float_flag_denormal);
3317 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3318 }
3319 zExp = aExp - bExp + 0x3FFE;
3320 rem1 = 0;
3321 if (bSig <= aSig) {
3322 shift128Right(aSig, 0, 1, &aSig, &rem1);
3323 ++zExp;
3324 }
3325 zSig0 = estimateDiv128To64(aSig, rem1, bSig);
3326 mul64To128(bSig, zSig0, &term0, &term1);
3327 sub128(aSig, rem1, term0, term1, &rem0, &rem1);
3328 while ((Bit64s) rem0 < 0) {
3329 --zSig0;
3330 add128(rem0, rem1, 0, bSig, &rem0, &rem1);
3331 }
3332 zSig1 = estimateDiv128To64(rem1, 0, bSig);
3333 if ((Bit64u) (zSig1<<1) <= 8) {
3334 mul64To128(bSig, zSig1, &term1, &term2);
3335 sub128(rem1, 0, term1, term2, &rem1, &rem2);
3336 while ((Bit64s) rem1 < 0) {
3337 --zSig1;
3338 add128(rem1, rem2, 0, bSig, &rem1, &rem2);
3339 }
3340 zSig1 |= ((rem1 | rem2) != 0);
3341 }
3342 return
3343 roundAndPackFloatx80(get_float_rounding_precision(status),
3344 zSign, zExp, zSig0, zSig1, status);
3345 }
3346
3347 /*----------------------------------------------------------------------------
3348 | Returns the square root of the extended double-precision floating-point
3349 | value `a'. The operation is performed according to the IEC/IEEE Standard
3350 | for Binary Floating-Point Arithmetic.
3351 *----------------------------------------------------------------------------*/
3352
floatx80_sqrt(floatx80 a,float_status_t & status)3353 floatx80 floatx80_sqrt(floatx80 a, float_status_t &status)
3354 {
3355 int aSign;
3356 Bit32s aExp, zExp;
3357 Bit64u aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3358 Bit64u rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3359
3360 // handle unsupported extended double-precision floating encodings
3361 if (floatx80_is_unsupported(a))
3362 {
3363 float_raise(status, float_flag_invalid);
3364 return floatx80_default_nan;
3365 }
3366
3367 aSig0 = extractFloatx80Frac(a);
3368 aExp = extractFloatx80Exp(a);
3369 aSign = extractFloatx80Sign(a);
3370 if (aExp == 0x7FFF) {
3371 if ((Bit64u) (aSig0<<1)) return propagateFloatx80NaN(a, status);
3372 if (! aSign) return a;
3373 float_raise(status, float_flag_invalid);
3374 return floatx80_default_nan;
3375 }
3376 if (aSign) {
3377 if ((aExp | aSig0) == 0) return a;
3378 float_raise(status, float_flag_invalid);
3379 return floatx80_default_nan;
3380 }
3381 if (aExp == 0) {
3382 if (aSig0 == 0) return packFloatx80(0, 0, 0);
3383 float_raise(status, float_flag_denormal);
3384 normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
3385 }
3386 zExp = ((aExp - 0x3FFF)>>1) + 0x3FFF;
3387 zSig0 = estimateSqrt32(aExp, aSig0>>32);
3388 shift128Right(aSig0, 0, 2 + (aExp & 1), &aSig0, &aSig1);
3389 zSig0 = estimateDiv128To64(aSig0, aSig1, zSig0<<32) + (zSig0<<30);
3390 doubleZSig0 = zSig0<<1;
3391 mul64To128(zSig0, zSig0, &term0, &term1);
3392 sub128(aSig0, aSig1, term0, term1, &rem0, &rem1);
3393 while ((Bit64s) rem0 < 0) {
3394 --zSig0;
3395 doubleZSig0 -= 2;
3396 add128(rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1);
3397 }
3398 zSig1 = estimateDiv128To64(rem1, 0, doubleZSig0);
3399 if ((zSig1 & BX_CONST64(0x3FFFFFFFFFFFFFFF)) <= 5) {
3400 if (zSig1 == 0) zSig1 = 1;
3401 mul64To128(doubleZSig0, zSig1, &term1, &term2);
3402 sub128(rem1, 0, term1, term2, &rem1, &rem2);
3403 mul64To128(zSig1, zSig1, &term2, &term3);
3404 sub192(rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3);
3405 while ((Bit64s) rem1 < 0) {
3406 --zSig1;
3407 shortShift128Left(0, zSig1, 1, &term2, &term3);
3408 term3 |= 1;
3409 term2 |= doubleZSig0;
3410 add192(rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3);
3411 }
3412 zSig1 |= ((rem1 | rem2 | rem3) != 0);
3413 }
3414 shortShift128Left(0, zSig1, 1, &zSig0, &zSig1);
3415 zSig0 |= doubleZSig0;
3416 return
3417 roundAndPackFloatx80(get_float_rounding_precision(status),
3418 0, zExp, zSig0, zSig1, status);
3419 }
3420
3421 #endif
3422
3423 #ifdef FLOAT128
3424
3425 /*----------------------------------------------------------------------------
3426 | Returns the result of converting the extended double-precision floating-
3427 | point value `a' to the quadruple-precision floating-point format. The
3428 | conversion is performed according to the IEC/IEEE Standard for Binary
3429 | Floating-Point Arithmetic.
3430 *----------------------------------------------------------------------------*/
3431
floatx80_to_float128(floatx80 a,float_status_t & status)3432 float128 floatx80_to_float128(floatx80 a, float_status_t &status)
3433 {
3434 Bit64u zSig0, zSig1;
3435
3436 Bit64u aSig = extractFloatx80Frac(a);
3437 Bit32s aExp = extractFloatx80Exp(a);
3438 int aSign = extractFloatx80Sign(a);
3439
3440 if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1))
3441 return commonNaNToFloat128(floatx80ToCommonNaN(a, status));
3442
3443 shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1);
3444 return packFloat128(aSign, aExp, zSig0, zSig1);
3445 }
3446
3447 /*----------------------------------------------------------------------------
3448 | Returns the result of converting the quadruple-precision floating-point
3449 | value `a' to the extended double-precision floating-point format. The
3450 | conversion is performed according to the IEC/IEEE Standard for Binary
3451 | Floating-Point Arithmetic.
3452 *----------------------------------------------------------------------------*/
3453
float128_to_floatx80(float128 a,float_status_t & status)3454 floatx80 float128_to_floatx80(float128 a, float_status_t &status)
3455 {
3456 Bit32s aExp;
3457 Bit64u aSig0, aSig1;
3458
3459 aSig1 = extractFloat128Frac1(a);
3460 aSig0 = extractFloat128Frac0(a);
3461 aExp = extractFloat128Exp(a);
3462 int aSign = extractFloat128Sign(a);
3463
3464 if (aExp == 0x7FFF) {
3465 if (aSig0 | aSig1)
3466 return commonNaNToFloatx80(float128ToCommonNaN(a, status));
3467
3468 return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3469 }
3470
3471 if (aExp == 0) {
3472 if ((aSig0 | aSig1) == 0) return packFloatx80(aSign, 0, 0);
3473 float_raise(status, float_flag_denormal);
3474 normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3475 }
3476 else aSig0 |= BX_CONST64(0x0001000000000000);
3477
3478 shortShift128Left(aSig0, aSig1, 15, &aSig0, &aSig1);
3479 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
3480 }
3481
3482 /*----------------------------------------------------------------------------
3483 | Returns the result of multiplying the extended double-precision floating-
3484 | point value `a' and quadruple-precision floating point value `b'. The
3485 | operation is performed according to the IEC/IEEE Standard for Binary
3486 | Floating-Point Arithmetic.
3487 *----------------------------------------------------------------------------*/
3488
floatx80_mul(floatx80 a,float128 b,float_status_t & status)3489 floatx80 floatx80_mul(floatx80 a, float128 b, float_status_t &status)
3490 {
3491 Bit32s aExp, bExp, zExp;
3492 Bit64u aSig, bSig0, bSig1, zSig0, zSig1, zSig2;
3493 int aSign, bSign, zSign;
3494
3495 // handle unsupported extended double-precision floating encodings
3496 if (floatx80_is_unsupported(a))
3497 {
3498 invalid:
3499 float_raise(status, float_flag_invalid);
3500 return floatx80_default_nan;
3501 }
3502
3503 aSig = extractFloatx80Frac(a);
3504 aExp = extractFloatx80Exp(a);
3505 aSign = extractFloatx80Sign(a);
3506 bSig0 = extractFloat128Frac0(b);
3507 bSig1 = extractFloat128Frac1(b);
3508 bExp = extractFloat128Exp(b);
3509 bSign = extractFloat128Sign(b);
3510
3511 zSign = aSign ^ bSign;
3512
3513 if (aExp == 0x7FFF) {
3514 if ((Bit64u) (aSig<<1)
3515 || ((bExp == 0x7FFF) && (bSig0 | bSig1)))
3516 {
3517 floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status));
3518 return propagateFloatx80NaN(a, r, status);
3519 }
3520 if (bExp == 0) {
3521 if ((bSig0 | bSig1) == 0) goto invalid;
3522 float_raise(status, float_flag_denormal);
3523 }
3524 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3525 }
3526 if (bExp == 0x7FFF) {
3527 if (bSig0 | bSig1) {
3528 floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status));
3529 return propagateFloatx80NaN(a, r, status);
3530 }
3531 if (aExp == 0) {
3532 if (aSig == 0) goto invalid;
3533 float_raise(status, float_flag_denormal);
3534 }
3535 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3536 }
3537 if (aExp == 0) {
3538 if (aSig == 0) {
3539 if ((bExp == 0) && (bSig0 | bSig1)) float_raise(status, float_flag_denormal);
3540 return packFloatx80(zSign, 0, 0);
3541 }
3542 float_raise(status, float_flag_denormal);
3543 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3544 }
3545 if (bExp == 0) {
3546 if ((bSig0 | bSig1) == 0) return packFloatx80(zSign, 0, 0);
3547 float_raise(status, float_flag_denormal);
3548 normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3549 }
3550 else bSig0 |= BX_CONST64(0x0001000000000000);
3551
3552 zExp = aExp + bExp - 0x3FFE;
3553 shortShift128Left(bSig0, bSig1, 15, &bSig0, &bSig1);
3554 mul128By64To192(bSig0, bSig1, aSig, &zSig0, &zSig1, &zSig2);
3555 if (0 < (Bit64s) zSig0) {
3556 shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
3557 --zExp;
3558 }
3559 return
3560 roundAndPackFloatx80(get_float_rounding_precision(status),
3561 zSign, zExp, zSig0, zSig1, status);
3562 }
3563
3564 /*----------------------------------------------------------------------------
3565 | Returns the result of adding the absolute values of the quadruple-precision
3566 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3567 | before being returned. `zSign' is ignored if the result is a NaN.
3568 | The addition is performed according to the IEC/IEEE Standard for Binary
3569 | Floating-Point Arithmetic.
3570 *----------------------------------------------------------------------------*/
3571
addFloat128Sigs(float128 a,float128 b,int zSign,float_status_t & status)3572 static float128 addFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status)
3573 {
3574 Bit32s aExp, bExp, zExp;
3575 Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
3576 Bit32s expDiff;
3577
3578 aSig1 = extractFloat128Frac1(a);
3579 aSig0 = extractFloat128Frac0(a);
3580 aExp = extractFloat128Exp(a);
3581 bSig1 = extractFloat128Frac1(b);
3582 bSig0 = extractFloat128Frac0(b);
3583 bExp = extractFloat128Exp(b);
3584 expDiff = aExp - bExp;
3585
3586 if (0 < expDiff) {
3587 if (aExp == 0x7FFF) {
3588 if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3589 return a;
3590 }
3591 if (bExp == 0) --expDiff;
3592 else bSig0 |= BX_CONST64(0x0001000000000000);
3593 shift128ExtraRightJamming(bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2);
3594 zExp = aExp;
3595 }
3596 else if (expDiff < 0) {
3597 if (bExp == 0x7FFF) {
3598 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3599 return packFloat128(zSign, 0x7FFF, 0, 0);
3600 }
3601 if (aExp == 0) ++expDiff;
3602 else aSig0 |= BX_CONST64(0x0001000000000000);
3603 shift128ExtraRightJamming(aSig0, aSig1, 0, -expDiff, &aSig0, &aSig1, &zSig2);
3604 zExp = bExp;
3605 }
3606 else {
3607 if (aExp == 0x7FFF) {
3608 if (aSig0 | aSig1 | bSig0 | bSig1)
3609 return propagateFloat128NaN(a, b, status);
3610
3611 return a;
3612 }
3613 add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3614 if (aExp == 0) return packFloat128(zSign, 0, zSig0, zSig1);
3615 zSig2 = 0;
3616 zSig0 |= BX_CONST64(0x0002000000000000);
3617 zExp = aExp;
3618 goto shiftRight1;
3619 }
3620 aSig0 |= BX_CONST64(0x0001000000000000);
3621 add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3622 --zExp;
3623 if (zSig0 < BX_CONST64(0x0002000000000000)) goto roundAndPack;
3624 ++zExp;
3625 shiftRight1:
3626 shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2);
3627 roundAndPack:
3628 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3629 }
3630
3631 /*----------------------------------------------------------------------------
3632 | Returns the result of subtracting the absolute values of the quadruple-
3633 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3634 | difference is negated before being returned. `zSign' is ignored if the
3635 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3636 | Standard for Binary Floating-Point Arithmetic.
3637 *----------------------------------------------------------------------------*/
3638
subFloat128Sigs(float128 a,float128 b,int zSign,float_status_t & status)3639 static float128 subFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status)
3640 {
3641 Bit32s aExp, bExp, zExp;
3642 Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
3643 Bit32s expDiff;
3644
3645 aSig1 = extractFloat128Frac1(a);
3646 aSig0 = extractFloat128Frac0(a);
3647 aExp = extractFloat128Exp(a);
3648 bSig1 = extractFloat128Frac1(b);
3649 bSig0 = extractFloat128Frac0(b);
3650 bExp = extractFloat128Exp(b);
3651
3652 expDiff = aExp - bExp;
3653 shortShift128Left(aSig0, aSig1, 14, &aSig0, &aSig1);
3654 shortShift128Left(bSig0, bSig1, 14, &bSig0, &bSig1);
3655 if (0 < expDiff) goto aExpBigger;
3656 if (expDiff < 0) goto bExpBigger;
3657 if (aExp == 0x7FFF) {
3658 if (aSig0 | aSig1 | bSig0 | bSig1)
3659 return propagateFloat128NaN(a, b, status);
3660
3661 float_raise(status, float_flag_invalid);
3662 return float128_default_nan;
3663 }
3664 if (aExp == 0) {
3665 aExp = 1;
3666 bExp = 1;
3667 }
3668 if (bSig0 < aSig0) goto aBigger;
3669 if (aSig0 < bSig0) goto bBigger;
3670 if (bSig1 < aSig1) goto aBigger;
3671 if (aSig1 < bSig1) goto bBigger;
3672 return packFloat128(0, 0);
3673
3674 bExpBigger:
3675 if (bExp == 0x7FFF) {
3676 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3677 return packFloat128(zSign ^ 1, 0x7FFF, 0, 0);
3678 }
3679 if (aExp == 0) ++expDiff;
3680 else {
3681 aSig0 |= BX_CONST64(0x4000000000000000);
3682 }
3683 shift128RightJamming(aSig0, aSig1, - expDiff, &aSig0, &aSig1);
3684 bSig0 |= BX_CONST64(0x4000000000000000);
3685 bBigger:
3686 sub128(bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1);
3687 zExp = bExp;
3688 zSign ^= 1;
3689 goto normalizeRoundAndPack;
3690 aExpBigger:
3691 if (aExp == 0x7FFF) {
3692 if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3693 return a;
3694 }
3695 if (bExp == 0) --expDiff;
3696 else {
3697 bSig0 |= BX_CONST64(0x4000000000000000);
3698 }
3699 shift128RightJamming(bSig0, bSig1, expDiff, &bSig0, &bSig1);
3700 aSig0 |= BX_CONST64(0x4000000000000000);
3701 aBigger:
3702 sub128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3703 zExp = aExp;
3704 normalizeRoundAndPack:
3705 --zExp;
3706 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, status);
3707 }
3708
3709 /*----------------------------------------------------------------------------
3710 | Returns the result of adding the quadruple-precision floating-point values
3711 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3712 | for Binary Floating-Point Arithmetic.
3713 *----------------------------------------------------------------------------*/
3714
float128_add(float128 a,float128 b,float_status_t & status)3715 float128 float128_add(float128 a, float128 b, float_status_t &status)
3716 {
3717 int aSign = extractFloat128Sign(a);
3718 int bSign = extractFloat128Sign(b);
3719
3720 if (aSign == bSign) {
3721 return addFloat128Sigs(a, b, aSign, status);
3722 }
3723 else {
3724 return subFloat128Sigs(a, b, aSign, status);
3725 }
3726 }
3727
3728 /*----------------------------------------------------------------------------
3729 | Returns the result of subtracting the quadruple-precision floating-point
3730 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3731 | Standard for Binary Floating-Point Arithmetic.
3732 *----------------------------------------------------------------------------*/
3733
float128_sub(float128 a,float128 b,float_status_t & status)3734 float128 float128_sub(float128 a, float128 b, float_status_t &status)
3735 {
3736 int aSign = extractFloat128Sign(a);
3737 int bSign = extractFloat128Sign(b);
3738
3739 if (aSign == bSign) {
3740 return subFloat128Sigs(a, b, aSign, status);
3741 }
3742 else {
3743 return addFloat128Sigs(a, b, aSign, status);
3744 }
3745 }
3746
3747 /*----------------------------------------------------------------------------
3748 | Returns the result of multiplying the quadruple-precision floating-point
3749 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3750 | Standard for Binary Floating-Point Arithmetic.
3751 *----------------------------------------------------------------------------*/
3752
float128_mul(float128 a,float128 b,float_status_t & status)3753 float128 float128_mul(float128 a, float128 b, float_status_t &status)
3754 {
3755 int aSign, bSign, zSign;
3756 Bit32s aExp, bExp, zExp;
3757 Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
3758
3759 aSig1 = extractFloat128Frac1(a);
3760 aSig0 = extractFloat128Frac0(a);
3761 aExp = extractFloat128Exp(a);
3762 aSign = extractFloat128Sign(a);
3763 bSig1 = extractFloat128Frac1(b);
3764 bSig0 = extractFloat128Frac0(b);
3765 bExp = extractFloat128Exp(b);
3766 bSign = extractFloat128Sign(b);
3767
3768 zSign = aSign ^ bSign;
3769 if (aExp == 0x7FFF) {
3770 if ((aSig0 | aSig1) || ((bExp == 0x7FFF) && (bSig0 | bSig1))) {
3771 return propagateFloat128NaN(a, b, status);
3772 }
3773 if ((bExp | bSig0 | bSig1) == 0) {
3774 float_raise(status, float_flag_invalid);
3775 return float128_default_nan;
3776 }
3777 return packFloat128(zSign, 0x7FFF, 0, 0);
3778 }
3779 if (bExp == 0x7FFF) {
3780 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3781 if ((aExp | aSig0 | aSig1) == 0) {
3782 float_raise(status, float_flag_invalid);
3783 return float128_default_nan;
3784 }
3785 return packFloat128(zSign, 0x7FFF, 0, 0);
3786 }
3787 if (aExp == 0) {
3788 if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3789 float_raise(status, float_flag_denormal);
3790 normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3791 }
3792 if (bExp == 0) {
3793 if ((bSig0 | bSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3794 float_raise(status, float_flag_denormal);
3795 normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3796 }
3797 zExp = aExp + bExp - 0x4000;
3798 aSig0 |= BX_CONST64(0x0001000000000000);
3799 shortShift128Left(bSig0, bSig1, 16, &bSig0, &bSig1);
3800 mul128To256(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3);
3801 add128(zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1);
3802 zSig2 |= (zSig3 != 0);
3803 if (BX_CONST64(0x0002000000000000) <= zSig0) {
3804 shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2);
3805 ++zExp;
3806 }
3807 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3808 }
3809
3810 /*----------------------------------------------------------------------------
3811 | Returns the result of dividing the quadruple-precision floating-point value
3812 | `a' by the corresponding value `b'. The operation is performed according to
3813 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3814 *----------------------------------------------------------------------------*/
3815
float128_div(float128 a,float128 b,float_status_t & status)3816 float128 float128_div(float128 a, float128 b, float_status_t &status)
3817 {
3818 int aSign, bSign, zSign;
3819 Bit32s aExp, bExp, zExp;
3820 Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
3821 Bit64u rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3822
3823 aSig1 = extractFloat128Frac1(a);
3824 aSig0 = extractFloat128Frac0(a);
3825 aExp = extractFloat128Exp(a);
3826 aSign = extractFloat128Sign(a);
3827 bSig1 = extractFloat128Frac1(b);
3828 bSig0 = extractFloat128Frac0(b);
3829 bExp = extractFloat128Exp(b);
3830 bSign = extractFloat128Sign(b);
3831
3832 zSign = aSign ^ bSign;
3833 if (aExp == 0x7FFF) {
3834 if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3835 if (bExp == 0x7FFF) {
3836 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3837 float_raise(status, float_flag_invalid);
3838 return float128_default_nan;
3839 }
3840 return packFloat128(zSign, 0x7FFF, 0, 0);
3841 }
3842 if (bExp == 0x7FFF) {
3843 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3844 return packFloat128(zSign, 0, 0, 0);
3845 }
3846 if (bExp == 0) {
3847 if ((bSig0 | bSig1) == 0) {
3848 if ((aExp | aSig0 | aSig1) == 0) {
3849 float_raise(status, float_flag_invalid);
3850 return float128_default_nan;
3851 }
3852 float_raise(status, float_flag_divbyzero);
3853 return packFloat128(zSign, 0x7FFF, 0, 0);
3854 }
3855 float_raise(status, float_flag_denormal);
3856 normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3857 }
3858 if (aExp == 0) {
3859 if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3860 float_raise(status, float_flag_denormal);
3861 normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3862 }
3863 zExp = aExp - bExp + 0x3FFD;
3864 shortShift128Left(
3865 aSig0 | BX_CONST64(0x0001000000000000), aSig1, 15, &aSig0, &aSig1);
3866 shortShift128Left(
3867 bSig0 | BX_CONST64(0x0001000000000000), bSig1, 15, &bSig0, &bSig1);
3868 if (le128(bSig0, bSig1, aSig0, aSig1)) {
3869 shift128Right(aSig0, aSig1, 1, &aSig0, &aSig1);
3870 ++zExp;
3871 }
3872 zSig0 = estimateDiv128To64(aSig0, aSig1, bSig0);
3873 mul128By64To192(bSig0, bSig1, zSig0, &term0, &term1, &term2);
3874 sub192(aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2);
3875 while ((Bit64s) rem0 < 0) {
3876 --zSig0;
3877 add192(rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2);
3878 }
3879 zSig1 = estimateDiv128To64(rem1, rem2, bSig0);
3880 if ((zSig1 & 0x3FFF) <= 4) {
3881 mul128By64To192(bSig0, bSig1, zSig1, &term1, &term2, &term3);
3882 sub192(rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3);
3883 while ((Bit64s) rem1 < 0) {
3884 --zSig1;
3885 add192(rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3);
3886 }
3887 zSig1 |= ((rem1 | rem2 | rem3) != 0);
3888 }
3889 shift128ExtraRightJamming(zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2);
3890 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3891 }
3892
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of converting the 64-bit two's complement integer `a' to
3895 | the quadruple-precision floating-point format. The conversion is performed
3896 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3897 *----------------------------------------------------------------------------*/
3898
int64_to_float128(Bit64s a)3899 float128 int64_to_float128(Bit64s a)
3900 {
3901 Bit64u zSig0, zSig1;
3902
3903 if (a == 0) return packFloat128(0, 0, 0, 0);
3904 int zSign = (a < 0);
3905 Bit64u absA = zSign ? - a : a;
3906 Bit8u shiftCount = countLeadingZeros64(absA) + 49;
3907 Bit32s zExp = 0x406E - shiftCount;
3908 if (64 <= shiftCount) {
3909 zSig1 = 0;
3910 zSig0 = absA;
3911 shiftCount -= 64;
3912 }
3913 else {
3914 zSig1 = absA;
3915 zSig0 = 0;
3916 }
3917 shortShift128Left(zSig0, zSig1, shiftCount, &zSig0, &zSig1);
3918 return packFloat128(zSign, zExp, zSig0, zSig1);
3919 }
3920
3921 #endif
3922