1 /*============================================================================
2 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
3 Package, Release 2b.
4
5 Written by John R. Hauser. This work was made possible in part by the
6 International Computer Science Institute, located at Suite 600, 1947 Center
7 Street, Berkeley, California 94704. Funding was partially provided by the
8 National Science Foundation under grant MIP-9311980. The original version
9 of this code was written as part of a project to build a fixed-point vector
10 processor in collaboration with the University of California at Berkeley,
11 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
12 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
13 arithmetic/SoftFloat.html'.
14
15 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
16 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
17 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
18 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
19 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
20 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
21 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
22 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
23
24 Derivative works are acceptable, even for commercial purposes, so long as
25 (1) the source code for the derivative work includes prominent notice that
26 the work is derivative, and (2) the source code includes prominent notice with
27 these four paragraphs for those parts of this code that are retained.
28 =============================================================================*/
29
30 #define FLOAT128
31
32 /*============================================================================
33 * Adapted for Bochs (x86 achitecture simulator) by
34 * Stanislav Shwartsman [sshwarts at sourceforge net]
35 * ==========================================================================*/
36
37 #include "softfloat.h"
38 #include "softfloat-round-pack.h"
39
40 /*----------------------------------------------------------------------------
41 | Primitive arithmetic functions, including multi-word arithmetic, and
42 | division and square root approximations. (Can be specialized to target
43 | if desired).
44 *----------------------------------------------------------------------------*/
45 #define USE_estimateDiv128To64
46 #define USE_estimateSqrt32
47 #include "softfloat-macros.h"
48
49 /*----------------------------------------------------------------------------
50 | Functions and definitions to determine: (1) whether tininess for underflow
51 | is detected before or after rounding by default, (2) what (if anything)
52 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
53 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
54 | are propagated from function inputs to output. These details are target-
55 | specific.
56 *----------------------------------------------------------------------------*/
57 #include "softfloat-specialize.h"
58
59 /*----------------------------------------------------------------------------
60 | Returns the result of converting the 32-bit two's complement integer `a'
61 | to the single-precision floating-point format. The conversion is performed
62 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
63 *----------------------------------------------------------------------------*/
64
int32_to_float32(Bit32s a,float_status_t & status)65 float32 int32_to_float32(Bit32s a, float_status_t &status)
66 {
67 if (a == 0) return 0;
68 if (a == (Bit32s) 0x80000000) return packFloat32(1, 0x9E, 0);
69 int zSign = (a < 0);
70 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
71 }
72
73 /*----------------------------------------------------------------------------
74 | Returns the result of converting the 32-bit two's complement integer `a'
75 | to the double-precision floating-point format. The conversion is performed
76 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
77 *----------------------------------------------------------------------------*/
78
int32_to_float64(Bit32s a)79 float64 int32_to_float64(Bit32s a)
80 {
81 if (a == 0) return 0;
82 int zSign = (a < 0);
83 Bit32u absA = zSign ? -a : a;
84 int shiftCount = countLeadingZeros32(absA) + 21;
85 Bit64u zSig = absA;
86 return packFloat64(zSign, 0x432 - shiftCount, zSig<<shiftCount);
87 }
88
89 /*----------------------------------------------------------------------------
90 | Returns the result of converting the 64-bit two's complement integer `a'
91 | to the single-precision floating-point format. The conversion is performed
92 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
93 *----------------------------------------------------------------------------*/
94
int64_to_float32(Bit64s a,float_status_t & status)95 float32 int64_to_float32(Bit64s a, float_status_t &status)
96 {
97 if (a == 0) return 0;
98 int zSign = (a < 0);
99 Bit64u absA = zSign ? -a : a;
100 int shiftCount = countLeadingZeros64(absA) - 40;
101 if (0 <= shiftCount) {
102 return packFloat32(zSign, 0x95 - shiftCount, (Bit32u)(absA<<shiftCount));
103 }
104 else {
105 shiftCount += 7;
106 if (shiftCount < 0) {
107 absA = shift64RightJamming(absA, -shiftCount);
108 }
109 else {
110 absA <<= shiftCount;
111 }
112 return roundAndPackFloat32(zSign, 0x9C - shiftCount, (Bit32u) absA, status);
113 }
114 }
115
116 /*----------------------------------------------------------------------------
117 | Returns the result of converting the 64-bit two's complement integer `a'
118 | to the double-precision floating-point format. The conversion is performed
119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
120 *----------------------------------------------------------------------------*/
121
int64_to_float64(Bit64s a,float_status_t & status)122 float64 int64_to_float64(Bit64s a, float_status_t &status)
123 {
124 if (a == 0) return 0;
125 if (a == (Bit64s) BX_CONST64(0x8000000000000000)) {
126 return packFloat64(1, 0x43E, 0);
127 }
128 int zSign = (a < 0);
129 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
130 }
131
132 /*----------------------------------------------------------------------------
133 | Returns the result of converting the 32-bit unsigned integer `a' to the
134 | single-precision floating-point format. The conversion is performed
135 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
136 *----------------------------------------------------------------------------*/
137
uint32_to_float32(Bit32u a,float_status_t & status)138 float32 uint32_to_float32(Bit32u a, float_status_t &status)
139 {
140 if (a == 0) return 0;
141 if (a & 0x80000000) return normalizeRoundAndPackFloat32(0, 0x9D, a >> 1, status);
142 return normalizeRoundAndPackFloat32(0, 0x9C, a, status);
143 }
144
145 /*----------------------------------------------------------------------------
146 | Returns the result of converting the 32-bit unsigned integer `a' to the
147 | double-precision floating-point format. The conversion is performed
148 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
149 *----------------------------------------------------------------------------*/
150
uint32_to_float64(Bit32u a)151 float64 uint32_to_float64(Bit32u a)
152 {
153 if (a == 0) return 0;
154 int shiftCount = countLeadingZeros32(a) + 21;
155 Bit64u zSig = a;
156 return packFloat64(0, 0x432 - shiftCount, zSig<<shiftCount);
157 }
158
159 /*----------------------------------------------------------------------------
160 | Returns the result of converting the 64-bit unsigned integer integer `a'
161 | to the single-precision floating-point format. The conversion is performed
162 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
163 *----------------------------------------------------------------------------*/
164
uint64_to_float32(Bit64u a,float_status_t & status)165 float32 uint64_to_float32(Bit64u a, float_status_t &status)
166 {
167 if (a == 0) return 0;
168 int shiftCount = countLeadingZeros64(a) - 40;
169 if (0 <= shiftCount) {
170 return packFloat32(0, 0x95 - shiftCount, (Bit32u)(a<<shiftCount));
171 }
172 else {
173 shiftCount += 7;
174 if (shiftCount < 0) {
175 a = shift64RightJamming(a, -shiftCount);
176 }
177 else {
178 a <<= shiftCount;
179 }
180 return roundAndPackFloat32(0, 0x9C - shiftCount, (Bit32u) a, status);
181 }
182 }
183
184 /*----------------------------------------------------------------------------
185 | Returns the result of converting the 64-bit unsigned integer integer `a'
186 | to the double-precision floating-point format. The conversion is performed
187 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
188 *----------------------------------------------------------------------------*/
189
uint64_to_float64(Bit64u a,float_status_t & status)190 float64 uint64_to_float64(Bit64u a, float_status_t &status)
191 {
192 if (a == 0) return 0;
193 if (a & BX_CONST64(0x8000000000000000))
194 return normalizeRoundAndPackFloat64(0, 0x43D, a >> 1, status);
195 return normalizeRoundAndPackFloat64(0, 0x43C, a, status);
196 }
197
198 /*----------------------------------------------------------------------------
199 | Returns the result of converting the single-precision floating-point value
200 | `a' to the 32-bit two's complement integer format. The conversion is
201 | performed according to the IEC/IEEE Standard for Binary Floating-Point
202 | Arithmetic - which means in particular that the conversion is rounded
203 | according to the current rounding mode. If `a' is a NaN or the
204 | conversion overflows the integer indefinite value is returned.
205 *----------------------------------------------------------------------------*/
206
float32_to_int32(float32 a,float_status_t & status)207 Bit32s float32_to_int32(float32 a, float_status_t &status)
208 {
209 Bit32u aSig = extractFloat32Frac(a);
210 Bit16s aExp = extractFloat32Exp(a);
211 int aSign = extractFloat32Sign(a);
212 if ((aExp == 0xFF) && aSig) aSign = 0;
213 if (aExp) aSig |= 0x00800000;
214 else {
215 if (get_denormals_are_zeros(status)) aSig = 0;
216 }
217 int shiftCount = 0xAF - aExp;
218 Bit64u aSig64 = Bit64u(aSig) << 32;
219 if (0 < shiftCount) aSig64 = shift64RightJamming(aSig64, shiftCount);
220 return roundAndPackInt32(aSign, aSig64, status);
221 }
222
223 /*----------------------------------------------------------------------------
224 | Returns the result of converting the single-precision floating-point value
225 | `a' to the 32-bit two's complement integer format. The conversion is
226 | performed according to the IEC/IEEE Standard for Binary Floating-Point
227 | Arithmetic, except that the conversion is always rounded toward zero.
228 | If `a' is a NaN or the conversion overflows, the integer indefinite
229 | value is returned.
230 *----------------------------------------------------------------------------*/
231
float32_to_int32_round_to_zero(float32 a,float_status_t & status)232 Bit32s float32_to_int32_round_to_zero(float32 a, float_status_t &status)
233 {
234 int aSign;
235 Bit16s aExp;
236 Bit32u aSig;
237 Bit32s z;
238
239 aSig = extractFloat32Frac(a);
240 aExp = extractFloat32Exp(a);
241 aSign = extractFloat32Sign(a);
242 int shiftCount = aExp - 0x9E;
243 if (0 <= shiftCount) {
244 if (a != 0xCF000000) {
245 float_raise(status, float_flag_invalid);
246 }
247 return (Bit32s)(int32_indefinite);
248 }
249 else if (aExp <= 0x7E) {
250 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
251 if (aExp | aSig) float_raise(status, float_flag_inexact);
252 return 0;
253 }
254 aSig = (aSig | 0x800000)<<8;
255 z = aSig>>(-shiftCount);
256 if ((Bit32u) (aSig<<(shiftCount & 31))) {
257 float_raise(status, float_flag_inexact);
258 }
259 if (aSign) z = -z;
260 return z;
261 }
262
263 /*----------------------------------------------------------------------------
264 | Returns the result of converting the single-precision floating-point value
265 | `a' to the 32-bit unsigned integer format. The conversion is performed
266 | according to the IEC/IEEE Standard for Binary Floating-point Arithmetic,
267 | except that the conversion is always rounded toward zero. If `a' is a NaN
268 | or conversion overflows, the largest positive integer is returned.
269 *----------------------------------------------------------------------------*/
270
float32_to_uint32_round_to_zero(float32 a,float_status_t & status)271 Bit32u float32_to_uint32_round_to_zero(float32 a, float_status_t &status)
272 {
273 int aSign;
274 Bit16s aExp;
275 Bit32u aSig;
276
277 aSig = extractFloat32Frac(a);
278 aExp = extractFloat32Exp(a);
279 aSign = extractFloat32Sign(a);
280 int shiftCount = aExp - 0x9E;
281
282 if (aExp <= 0x7E) {
283 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
284 if (aExp | aSig) float_raise(status, float_flag_inexact);
285 return 0;
286 }
287 else if (0 < shiftCount || aSign) {
288 float_raise(status, float_flag_invalid);
289 return uint32_indefinite;
290 }
291
292 aSig = (aSig | 0x800000)<<8;
293 Bit32u z = aSig >> (-shiftCount);
294 if (aSig << (shiftCount & 31)) {
295 float_raise(status, float_flag_inexact);
296 }
297 return z;
298 }
299
300 /*----------------------------------------------------------------------------
301 | Returns the result of converting the single-precision floating-point value
302 | `a' to the 64-bit two's complement integer format. The conversion is
303 | performed according to the IEC/IEEE Standard for Binary Floating-Point
304 | Arithmetic - which means in particular that the conversion is rounded
305 | according to the current rounding mode. If `a' is a NaN or the
306 | conversion overflows, the integer indefinite value is returned.
307 *----------------------------------------------------------------------------*/
308
float32_to_int64(float32 a,float_status_t & status)309 Bit64s float32_to_int64(float32 a, float_status_t &status)
310 {
311 Bit64u aSig64, aSigExtra;
312
313 Bit32u aSig = extractFloat32Frac(a);
314 Bit16s aExp = extractFloat32Exp(a);
315 int aSign = extractFloat32Sign(a);
316
317 int shiftCount = 0xBE - aExp;
318 if (shiftCount < 0) {
319 float_raise(status, float_flag_invalid);
320 return (Bit64s)(int64_indefinite);
321 }
322 if (aExp) aSig |= 0x00800000;
323 else {
324 if (get_denormals_are_zeros(status)) aSig = 0;
325 }
326 aSig64 = aSig;
327 aSig64 <<= 40;
328 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
329 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
330 }
331
332 /*----------------------------------------------------------------------------
333 | Returns the result of converting the single-precision floating-point value
334 | `a' to the 64-bit two's complement integer format. The conversion is
335 | performed according to the IEC/IEEE Standard for Binary Floating-Point
336 | Arithmetic, except that the conversion is always rounded toward zero.
337 | If `a' is a NaN or the conversion overflows, the integer indefinite
338 | value is returned.
339 *----------------------------------------------------------------------------*/
340
float32_to_int64_round_to_zero(float32 a,float_status_t & status)341 Bit64s float32_to_int64_round_to_zero(float32 a, float_status_t &status)
342 {
343 int aSign;
344 Bit16s aExp;
345 Bit32u aSig;
346 Bit64u aSig64;
347 Bit64s z;
348
349 aSig = extractFloat32Frac(a);
350 aExp = extractFloat32Exp(a);
351 aSign = extractFloat32Sign(a);
352 int shiftCount = aExp - 0xBE;
353 if (0 <= shiftCount) {
354 if (a != 0xDF000000) {
355 float_raise(status, float_flag_invalid);
356 }
357 return (Bit64s)(int64_indefinite);
358 }
359 else if (aExp <= 0x7E) {
360 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
361 if (aExp | aSig) float_raise(status, float_flag_inexact);
362 return 0;
363 }
364 aSig64 = aSig | 0x00800000;
365 aSig64 <<= 40;
366 z = aSig64>>(-shiftCount);
367 if ((Bit64u) (aSig64<<(shiftCount & 63))) {
368 float_raise(status, float_flag_inexact);
369 }
370 if (aSign) z = -z;
371 return z;
372 }
373
374 /*----------------------------------------------------------------------------
375 | Returns the result of converting the single-precision floating-point value
376 | `a' to the 64-bit unsigned integer format. The conversion is performed
377 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
378 | except that the conversion is always rounded toward zero. If `a' is a NaN
379 | or the conversion overflows, the largest unsigned integer is returned.
380 *----------------------------------------------------------------------------*/
381
float32_to_uint64_round_to_zero(float32 a,float_status_t & status)382 Bit64u float32_to_uint64_round_to_zero(float32 a, float_status_t &status)
383 {
384 int aSign;
385 Bit16s aExp;
386 Bit32u aSig;
387 Bit64u aSig64;
388
389 aSig = extractFloat32Frac(a);
390 aExp = extractFloat32Exp(a);
391 aSign = extractFloat32Sign(a);
392 int shiftCount = aExp - 0xBE;
393
394 if (aExp <= 0x7E) {
395 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
396 if (aExp | aSig) float_raise(status, float_flag_inexact);
397 return 0;
398 }
399 else if (0 < shiftCount || aSign) {
400 float_raise(status, float_flag_invalid);
401 return uint64_indefinite;
402 }
403
404 aSig64 = aSig | 0x00800000;
405 aSig64 <<= 40;
406 Bit64u z = aSig64>>(-shiftCount);
407 if ((Bit64u) (aSig64<<(shiftCount & 63))) {
408 float_raise(status, float_flag_inexact);
409 }
410 return z;
411 }
412
413 /*----------------------------------------------------------------------------
414 | Returns the result of converting the single-precision floating-point value
415 | `a' to the 64-bit unsigned integer format. The conversion is
416 | performed according to the IEC/IEEE Standard for Binary Floating-Point
417 | Arithmetic---which means in particular that the conversion is rounded
418 | according to the current rounding mode. If `a' is a NaN or the conversion
419 | overflows, the largest unsigned integer is returned.
420 *----------------------------------------------------------------------------*/
421
float32_to_uint64(float32 a,float_status_t & status)422 Bit64u float32_to_uint64(float32 a, float_status_t &status)
423 {
424 int aSign;
425 Bit16s aExp, shiftCount;
426 Bit32u aSig;
427 Bit64u aSig64, aSigExtra;
428
429 aSig = extractFloat32Frac(a);
430 aExp = extractFloat32Exp(a);
431 aSign = extractFloat32Sign(a);
432
433 if (get_denormals_are_zeros(status)) {
434 if (aExp == 0) aSig = 0;
435 }
436
437 if ((aSign) && (aExp > 0x7E)) {
438 float_raise(status, float_flag_invalid);
439 return uint64_indefinite;
440 }
441
442 shiftCount = 0xBE - aExp;
443 if (aExp) aSig |= 0x00800000;
444
445 if (shiftCount < 0) {
446 float_raise(status, float_flag_invalid);
447 return uint64_indefinite;
448 }
449
450 aSig64 = aSig;
451 aSig64 <<= 40;
452 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
453 return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
454 }
455
456 /*----------------------------------------------------------------------------
457 | Returns the result of converting the single-precision floating-point value
458 | `a' to the 32-bit unsigned integer format. The conversion is
459 | performed according to the IEC/IEEE Standard for Binary Floating-Point
460 | Arithmetic---which means in particular that the conversion is rounded
461 | according to the current rounding mode. If `a' is a NaN or the conversion
462 | overflows, the largest unsigned integer is returned.
463 *----------------------------------------------------------------------------*/
464
float32_to_uint32(float32 a,float_status_t & status)465 Bit32u float32_to_uint32(float32 a, float_status_t &status)
466 {
467 Bit64u val_64 = float32_to_uint64(a, status);
468
469 if (val_64 > 0xffffffff) {
470 status.float_exception_flags = float_flag_invalid; // throw away other flags
471 return uint32_indefinite;
472 }
473
474 return (Bit32u) val_64;
475 }
476
477 /*----------------------------------------------------------------------------
478 | Returns the result of converting the single-precision floating-point value
479 | `a' to the double-precision floating-point format. The conversion is
480 | performed according to the IEC/IEEE Standard for Binary Floating-Point
481 | Arithmetic.
482 *----------------------------------------------------------------------------*/
483
float32_to_float64(float32 a,float_status_t & status)484 float64 float32_to_float64(float32 a, float_status_t &status)
485 {
486 Bit32u aSig = extractFloat32Frac(a);
487 Bit16s aExp = extractFloat32Exp(a);
488 int aSign = extractFloat32Sign(a);
489
490 if (aExp == 0xFF) {
491 if (aSig) return commonNaNToFloat64(float32ToCommonNaN(a, status));
492 return packFloat64(aSign, 0x7FF, 0);
493 }
494 if (aExp == 0) {
495 if (aSig == 0 || get_denormals_are_zeros(status))
496 return packFloat64(aSign, 0, 0);
497
498 float_raise(status, float_flag_denormal);
499 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
500 --aExp;
501 }
502 return packFloat64(aSign, aExp + 0x380, ((Bit64u) aSig)<<29);
503 }
504
505 /*----------------------------------------------------------------------------
506 | Rounds the single-precision floating-point value `a' to an integer, and
507 | returns the result as a single-precision floating-point value. The
508 | operation is performed according to the IEC/IEEE Standard for Binary
509 | Floating-Point Arithmetic.
510 *----------------------------------------------------------------------------*/
511
float32_round_to_int(float32 a,Bit8u scale,float_status_t & status)512 float32 float32_round_to_int(float32 a, Bit8u scale, float_status_t &status)
513 {
514 Bit32u lastBitMask, roundBitsMask;
515 int roundingMode = get_float_rounding_mode(status);
516 Bit16s aExp = extractFloat32Exp(a);
517 scale &= 0xf;
518
519 if ((aExp == 0xFF) && extractFloat32Frac(a)) {
520 return propagateFloat32NaN(a, status);
521 }
522
523 aExp += scale; // scale the exponent
524
525 if (0x96 <= aExp) {
526 return a;
527 }
528
529 if (get_denormals_are_zeros(status)) {
530 a = float32_denormal_to_zero(a);
531 }
532
533 if (aExp <= 0x7E) {
534 if ((Bit32u) (a<<1) == 0) return a;
535 float_raise(status, float_flag_inexact);
536 int aSign = extractFloat32Sign(a);
537 switch (roundingMode) {
538 case float_round_nearest_even:
539 if ((aExp == 0x7E) && extractFloat32Frac(a)) {
540 return packFloat32(aSign, 0x7F - scale, 0);
541 }
542 break;
543 case float_round_down:
544 return aSign ? packFloat32(1, 0x7F - scale, 0) : float32_positive_zero;
545 case float_round_up:
546 return aSign ? float32_negative_zero : packFloat32(0, 0x7F - scale, 0);
547 }
548 return packFloat32(aSign, 0, 0);
549 }
550
551 lastBitMask = 1;
552 lastBitMask <<= 0x96 - aExp;
553 roundBitsMask = lastBitMask - 1;
554 float32 z = a;
555 if (roundingMode == float_round_nearest_even) {
556 z += lastBitMask>>1;
557 if ((z & roundBitsMask) == 0) z &= ~lastBitMask;
558 }
559 else if (roundingMode != float_round_to_zero) {
560 if (extractFloat32Sign(z) ^ (roundingMode == float_round_up)) {
561 z += roundBitsMask;
562 }
563 }
564 z &= ~roundBitsMask;
565 if (z != a) float_raise(status, float_flag_inexact);
566 return z;
567 }
568
569 /*----------------------------------------------------------------------------
570 | Extracts the fractional portion of single-precision floating-point value `a',
571 | and returns the result as a single-precision floating-point value. The
572 | fractional results are precise. The operation is performed according to the
573 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
574 *----------------------------------------------------------------------------*/
575
float32_frc(float32 a,float_status_t & status)576 float32 float32_frc(float32 a, float_status_t &status)
577 {
578 int roundingMode = get_float_rounding_mode(status);
579
580 Bit16s aExp = extractFloat32Exp(a);
581 Bit32u aSig = extractFloat32Frac(a);
582 int aSign = extractFloat32Sign(a);
583
584 if (aExp == 0xFF) {
585 if (aSig) return propagateFloat32NaN(a, status);
586 float_raise(status, float_flag_invalid);
587 return float32_default_nan;
588 }
589
590 if (aExp >= 0x96) {
591 return packFloat32(roundingMode == float_round_down, 0, 0);
592 }
593
594 if (aExp < 0x7F) {
595 if (aExp == 0) {
596 if (aSig == 0 || get_denormals_are_zeros(status))
597 return packFloat32(roundingMode == float_round_down, 0, 0);
598
599 float_raise(status, float_flag_denormal);
600 if (! float_exception_masked(status, float_flag_underflow))
601 float_raise(status, float_flag_underflow);
602
603 if(get_flush_underflow_to_zero(status)) {
604 float_raise(status, float_flag_underflow | float_flag_inexact);
605 return packFloat32(aSign, 0, 0);
606 }
607 }
608 return a;
609 }
610
611 Bit32u lastBitMask = 1 << (0x96 - aExp);
612 Bit32u roundBitsMask = lastBitMask - 1;
613
614 aSig &= roundBitsMask;
615 aSig <<= 7;
616 aExp--;
617
618 if (aSig == 0)
619 return packFloat32(roundingMode == float_round_down, 0, 0);
620
621 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
622 }
623
624 /*----------------------------------------------------------------------------
625 | Extracts the exponent portion of single-precision floating-point value 'a',
626 | and returns the result as a single-precision floating-point value
627 | representing unbiased integer exponent. The operation is performed according
628 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
629 *----------------------------------------------------------------------------*/
630
float32_getexp(float32 a,float_status_t & status)631 float32 float32_getexp(float32 a, float_status_t &status)
632 {
633 Bit16s aExp = extractFloat32Exp(a);
634 Bit32u aSig = extractFloat32Frac(a);
635
636 if (aExp == 0xFF) {
637 if (aSig) return propagateFloat32NaN(a, status);
638 return float32_positive_inf;
639 }
640
641 if (aExp == 0) {
642 if (aSig == 0 || get_denormals_are_zeros(status))
643 return float32_negative_inf;
644
645 float_raise(status, float_flag_denormal);
646 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
647 }
648
649 return int32_to_float32(aExp - 0x7F, status);
650 }
651
652 /*----------------------------------------------------------------------------
653 | Extracts the mantissa of single-precision floating-point value 'a' and
654 | returns the result as a single-precision floating-point after applying
655 | the mantissa interval normalization and sign control. The operation is
656 | performed according to the IEC/IEEE Standard for Binary Floating-Point
657 | Arithmetic.
658 *----------------------------------------------------------------------------*/
659
float32_getmant(float32 a,float_status_t & status,int sign_ctrl,int interv)660 float32 float32_getmant(float32 a, float_status_t &status, int sign_ctrl, int interv)
661 {
662 Bit16s aExp = extractFloat32Exp(a);
663 Bit32u aSig = extractFloat32Frac(a);
664 int aSign = extractFloat32Sign(a);
665
666 if (aExp == 0xFF) {
667 if (aSig) return propagateFloat32NaN(a, status);
668 if (aSign) {
669 if (sign_ctrl & 0x2) {
670 float_raise(status, float_flag_invalid);
671 return float32_default_nan;
672 }
673 }
674 return packFloat32(~sign_ctrl & aSign, 0x7F, 0);
675 }
676
677 if (aExp == 0 && (aSig == 0 || get_denormals_are_zeros(status))) {
678 return packFloat32(~sign_ctrl & aSign, 0x7F, 0);
679 }
680
681 if (aSign) {
682 if (sign_ctrl & 0x2) {
683 float_raise(status, float_flag_invalid);
684 return float32_default_nan;
685 }
686 }
687
688 if (aExp == 0) {
689 float_raise(status, float_flag_denormal);
690 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
691 // aExp += 0x7E;
692 aSig &= 0x7FFFFF;
693 }
694
695 switch(interv) {
696 case 0x0: // interval [1,2)
697 aExp = 0x7F;
698 break;
699 case 0x1: // interval [1/2,2)
700 aExp -= 0x7F;
701 aExp = 0x7F - (aExp & 0x1);
702 break;
703 case 0x2: // interval [1/2,1)
704 aExp = 0x7E;
705 break;
706 case 0x3: // interval [3/4,3/2)
707 aExp = 0x7F - ((aSig >> 22) & 0x1);
708 break;
709 }
710
711 return packFloat32(~sign_ctrl & aSign, aExp, aSig);
712 }
713
714 /*----------------------------------------------------------------------------
715 | Return the result of a floating point scale of the single-precision floating
716 | point value `a' by multiplying it by 2 power of the single-precision
717 | floating point value 'b' converted to integral value. If the result cannot
718 | be represented in single precision, then the proper overflow response (for
719 | positive scaling operand), or the proper underflow response (for negative
720 | scaling operand) is issued. The operation is performed according to the
721 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
722 *----------------------------------------------------------------------------*/
723
float32_scalef(float32 a,float32 b,float_status_t & status)724 float32 float32_scalef(float32 a, float32 b, float_status_t &status)
725 {
726 Bit32u aSig = extractFloat32Frac(a);
727 Bit16s aExp = extractFloat32Exp(a);
728 int aSign = extractFloat32Sign(a);
729 Bit32u bSig = extractFloat32Frac(b);
730 Bit16s bExp = extractFloat32Exp(b);
731 int bSign = extractFloat32Sign(b);
732
733 if (get_denormals_are_zeros(status)) {
734 if (aExp == 0) aSig = 0;
735 if (bExp == 0) bSig = 0;
736 }
737
738 if (bExp == 0xFF) {
739 if (bSig) return propagateFloat32NaN(a, b, status);
740 }
741
742 if (aExp == 0xFF) {
743 if (aSig) {
744 int aIsSignalingNaN = (aSig & 0x00400000) == 0;
745 if (aIsSignalingNaN || bExp != 0xFF || bSig)
746 return propagateFloat32NaN(a, b, status);
747
748 return bSign ? 0 : float32_positive_inf;
749 }
750
751 if (bExp == 0xFF && bSign) {
752 float_raise(status, float_flag_invalid);
753 return float32_default_nan;
754 }
755 return a;
756 }
757
758 if (aExp == 0) {
759 if (aSig == 0) {
760 if (bExp == 0xFF && ! bSign) {
761 float_raise(status, float_flag_invalid);
762 return float32_default_nan;
763 }
764 return a;
765 }
766 float_raise(status, float_flag_denormal);
767 }
768
769 if ((bExp | bSig) == 0) return a;
770
771 if (bExp == 0xFF) {
772 if (bSign) return packFloat32(aSign, 0, 0);
773 return packFloat32(aSign, 0xFF, 0);
774 }
775
776 if (bExp >= 0x8E) {
777 // handle obvious overflow/underflow result
778 return roundAndPackFloat32(aSign, bSign ? -0x7F : 0xFF, aSig, status);
779 }
780
781 int scale = 0;
782
783 if (bExp <= 0x7E) {
784 if (bExp == 0)
785 float_raise(status, float_flag_denormal);
786 scale = -bSign;
787 }
788 else {
789 int shiftCount = bExp - 0x9E;
790 bSig = (bSig | 0x800000)<<8;
791 scale = bSig>>(-shiftCount);
792
793 if (bSign) {
794 if ((Bit32u) (bSig<<(shiftCount & 31))) scale++;
795 scale = -scale;
796 }
797
798 if (scale > 0x200) scale = 0x200;
799 if (scale < -0x200) scale = -0x200;
800 }
801
802 if (aExp != 0) {
803 aSig |= 0x00800000;
804 } else {
805 aExp++;
806 }
807
808 aExp += scale - 1;
809 aSig <<= 7;
810 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
811 }
812
813 /*----------------------------------------------------------------------------
814 | Returns the result of adding the absolute values of the single-precision
815 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
816 | before being returned. `zSign' is ignored if the result is a NaN.
817 | The addition is performed according to the IEC/IEEE Standard for Binary
818 | Floating-Point Arithmetic.
819 *----------------------------------------------------------------------------*/
820
addFloat32Sigs(float32 a,float32 b,int zSign,float_status_t & status)821 static float32 addFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status)
822 {
823 Bit16s aExp, bExp, zExp;
824 Bit32u aSig, bSig, zSig;
825 Bit16s expDiff;
826
827 aSig = extractFloat32Frac(a);
828 aExp = extractFloat32Exp(a);
829 bSig = extractFloat32Frac(b);
830 bExp = extractFloat32Exp(b);
831
832 if (get_denormals_are_zeros(status)) {
833 if (aExp == 0) aSig = 0;
834 if (bExp == 0) bSig = 0;
835 }
836
837 expDiff = aExp - bExp;
838 aSig <<= 6;
839 bSig <<= 6;
840
841 if (0 < expDiff) {
842 if (aExp == 0xFF) {
843 if (aSig) return propagateFloat32NaN(a, b, status);
844 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
845 return a;
846 }
847 if ((aExp == 0) && aSig)
848 float_raise(status, float_flag_denormal);
849
850 if (bExp == 0) {
851 if (bSig) float_raise(status, float_flag_denormal);
852 --expDiff;
853 }
854 else bSig |= 0x20000000;
855
856 bSig = shift32RightJamming(bSig, expDiff);
857 zExp = aExp;
858 }
859 else if (expDiff < 0) {
860 if (bExp == 0xFF) {
861 if (bSig) return propagateFloat32NaN(a, b, status);
862 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
863 return packFloat32(zSign, 0xFF, 0);
864 }
865 if ((bExp == 0) && bSig)
866 float_raise(status, float_flag_denormal);
867
868 if (aExp == 0) {
869 if (aSig) float_raise(status, float_flag_denormal);
870 ++expDiff;
871 }
872 else aSig |= 0x20000000;
873
874 aSig = shift32RightJamming(aSig, -expDiff);
875 zExp = bExp;
876 }
877 else {
878 if (aExp == 0xFF) {
879 if (aSig | bSig) return propagateFloat32NaN(a, b, status);
880 return a;
881 }
882 if (aExp == 0) {
883 zSig = (aSig + bSig) >> 6;
884 if (aSig | bSig) {
885 float_raise(status, float_flag_denormal);
886 if (get_flush_underflow_to_zero(status) && (extractFloat32Frac(zSig) == zSig)) {
887 float_raise(status, float_flag_underflow | float_flag_inexact);
888 return packFloat32(zSign, 0, 0);
889 }
890 if (! float_exception_masked(status, float_flag_underflow)) {
891 if (extractFloat32Frac(zSig) == zSig)
892 float_raise(status, float_flag_underflow);
893 }
894 }
895 return packFloat32(zSign, 0, zSig);
896 }
897 zSig = 0x40000000 + aSig + bSig;
898 return roundAndPackFloat32(zSign, aExp, zSig, status);
899 }
900 aSig |= 0x20000000;
901 zSig = (aSig + bSig)<<1;
902 --zExp;
903 if ((Bit32s) zSig < 0) {
904 zSig = aSig + bSig;
905 ++zExp;
906 }
907 return roundAndPackFloat32(zSign, zExp, zSig, status);
908 }
909
910 /*----------------------------------------------------------------------------
911 | Returns the result of subtracting the absolute values of the single-
912 | precision floating-point values `a' and `b'. If `zSign' is 1, the
913 | difference is negated before being returned. `zSign' is ignored if the
914 | result is a NaN. The subtraction is performed according to the IEC/IEEE
915 | Standard for Binary Floating-Point Arithmetic.
916 *----------------------------------------------------------------------------*/
917
subFloat32Sigs(float32 a,float32 b,int zSign,float_status_t & status)918 static float32 subFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status)
919 {
920 Bit16s aExp, bExp, zExp;
921 Bit32u aSig, bSig, zSig;
922 Bit16s expDiff;
923
924 aSig = extractFloat32Frac(a);
925 aExp = extractFloat32Exp(a);
926 bSig = extractFloat32Frac(b);
927 bExp = extractFloat32Exp(b);
928
929 if (get_denormals_are_zeros(status)) {
930 if (aExp == 0) aSig = 0;
931 if (bExp == 0) bSig = 0;
932 }
933
934 expDiff = aExp - bExp;
935 aSig <<= 7;
936 bSig <<= 7;
937 if (0 < expDiff) goto aExpBigger;
938 if (expDiff < 0) goto bExpBigger;
939 if (aExp == 0xFF) {
940 if (aSig | bSig) return propagateFloat32NaN(a, b, status);
941 float_raise(status, float_flag_invalid);
942 return float32_default_nan;
943 }
944 if (aExp == 0) {
945 if (aSig | bSig) float_raise(status, float_flag_denormal);
946 aExp = 1;
947 bExp = 1;
948 }
949 if (bSig < aSig) goto aBigger;
950 if (aSig < bSig) goto bBigger;
951 return packFloat32(get_float_rounding_mode(status) == float_round_down, 0, 0);
952 bExpBigger:
953 if (bExp == 0xFF) {
954 if (bSig) return propagateFloat32NaN(a, b, status);
955 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
956 return packFloat32(zSign ^ 1, 0xFF, 0);
957 }
958 if ((bExp == 0) && bSig)
959 float_raise(status, float_flag_denormal);
960
961 if (aExp == 0) {
962 if (aSig) float_raise(status, float_flag_denormal);
963 ++expDiff;
964 }
965 else aSig |= 0x40000000;
966
967 aSig = shift32RightJamming(aSig, -expDiff);
968 bSig |= 0x40000000;
969 bBigger:
970 zSig = bSig - aSig;
971 zExp = bExp;
972 zSign ^= 1;
973 goto normalizeRoundAndPack;
974 aExpBigger:
975 if (aExp == 0xFF) {
976 if (aSig) return propagateFloat32NaN(a, b, status);
977 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
978 return a;
979 }
980 if ((aExp == 0) && aSig)
981 float_raise(status, float_flag_denormal);
982
983 if (bExp == 0) {
984 if (bSig) float_raise(status, float_flag_denormal);
985 --expDiff;
986 }
987 else bSig |= 0x40000000;
988
989 bSig = shift32RightJamming(bSig, expDiff);
990 aSig |= 0x40000000;
991 aBigger:
992 zSig = aSig - bSig;
993 zExp = aExp;
994 normalizeRoundAndPack:
995 --zExp;
996 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
997 }
998
999 /*----------------------------------------------------------------------------
1000 | Returns the result of adding the single-precision floating-point values `a'
1001 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1002 | Binary Floating-Point Arithmetic.
1003 *----------------------------------------------------------------------------*/
1004
float32_add(float32 a,float32 b,float_status_t & status)1005 float32 float32_add(float32 a, float32 b, float_status_t &status)
1006 {
1007 int aSign = extractFloat32Sign(a);
1008 int bSign = extractFloat32Sign(b);
1009
1010 if (aSign == bSign) {
1011 return addFloat32Sigs(a, b, aSign, status);
1012 }
1013 else {
1014 return subFloat32Sigs(a, b, aSign, status);
1015 }
1016 }
1017
1018 /*----------------------------------------------------------------------------
1019 | Returns the result of subtracting the single-precision floating-point values
1020 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1021 | for Binary Floating-Point Arithmetic.
1022 *----------------------------------------------------------------------------*/
1023
float32_sub(float32 a,float32 b,float_status_t & status)1024 float32 float32_sub(float32 a, float32 b, float_status_t &status)
1025 {
1026 int aSign = extractFloat32Sign(a);
1027 int bSign = extractFloat32Sign(b);
1028
1029 if (aSign == bSign) {
1030 return subFloat32Sigs(a, b, aSign, status);
1031 }
1032 else {
1033 return addFloat32Sigs(a, b, aSign, status);
1034 }
1035 }
1036
1037 /*----------------------------------------------------------------------------
1038 | Returns the result of multiplying the single-precision floating-point values
1039 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1040 | for Binary Floating-Point Arithmetic.
1041 *----------------------------------------------------------------------------*/
1042
float32_mul(float32 a,float32 b,float_status_t & status)1043 float32 float32_mul(float32 a, float32 b, float_status_t &status)
1044 {
1045 int aSign, bSign, zSign;
1046 Bit16s aExp, bExp, zExp;
1047 Bit32u aSig, bSig;
1048 Bit64u zSig64;
1049 Bit32u zSig;
1050
1051 aSig = extractFloat32Frac(a);
1052 aExp = extractFloat32Exp(a);
1053 aSign = extractFloat32Sign(a);
1054 bSig = extractFloat32Frac(b);
1055 bExp = extractFloat32Exp(b);
1056 bSign = extractFloat32Sign(b);
1057 zSign = aSign ^ bSign;
1058
1059 if (get_denormals_are_zeros(status)) {
1060 if (aExp == 0) aSig = 0;
1061 if (bExp == 0) bSig = 0;
1062 }
1063
1064 if (aExp == 0xFF) {
1065 if (aSig || ((bExp == 0xFF) && bSig))
1066 return propagateFloat32NaN(a, b, status);
1067
1068 if ((bExp | bSig) == 0) {
1069 float_raise(status, float_flag_invalid);
1070 return float32_default_nan;
1071 }
1072 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1073 return packFloat32(zSign, 0xFF, 0);
1074 }
1075 if (bExp == 0xFF) {
1076 if (bSig) return propagateFloat32NaN(a, b, status);
1077 if ((aExp | aSig) == 0) {
1078 float_raise(status, float_flag_invalid);
1079 return float32_default_nan;
1080 }
1081 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1082 return packFloat32(zSign, 0xFF, 0);
1083 }
1084 if (aExp == 0) {
1085 if (aSig == 0) {
1086 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1087 return packFloat32(zSign, 0, 0);
1088 }
1089 float_raise(status, float_flag_denormal);
1090 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
1091 }
1092 if (bExp == 0) {
1093 if (bSig == 0) return packFloat32(zSign, 0, 0);
1094 float_raise(status, float_flag_denormal);
1095 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
1096 }
1097 zExp = aExp + bExp - 0x7F;
1098 aSig = (aSig | 0x00800000)<<7;
1099 bSig = (bSig | 0x00800000)<<8;
1100 zSig64 = shift64RightJamming(((Bit64u) aSig) * bSig, 32);
1101 zSig = (Bit32u) zSig64;
1102 if (0 <= (Bit32s) (zSig<<1)) {
1103 zSig <<= 1;
1104 --zExp;
1105 }
1106 return roundAndPackFloat32(zSign, zExp, zSig, status);
1107 }
1108
1109 /*----------------------------------------------------------------------------
1110 | Returns the result of dividing the single-precision floating-point value `a'
1111 | by the corresponding value `b'. The operation is performed according to the
1112 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 *----------------------------------------------------------------------------*/
1114
float32_div(float32 a,float32 b,float_status_t & status)1115 float32 float32_div(float32 a, float32 b, float_status_t &status)
1116 {
1117 int aSign, bSign, zSign;
1118 Bit16s aExp, bExp, zExp;
1119 Bit32u aSig, bSig, zSig;
1120
1121 aSig = extractFloat32Frac(a);
1122 aExp = extractFloat32Exp(a);
1123 aSign = extractFloat32Sign(a);
1124 bSig = extractFloat32Frac(b);
1125 bExp = extractFloat32Exp(b);
1126 bSign = extractFloat32Sign(b);
1127 zSign = aSign ^ bSign;
1128
1129 if (get_denormals_are_zeros(status)) {
1130 if (aExp == 0) aSig = 0;
1131 if (bExp == 0) bSig = 0;
1132 }
1133
1134 if (aExp == 0xFF) {
1135 if (aSig) return propagateFloat32NaN(a, b, status);
1136 if (bExp == 0xFF) {
1137 if (bSig) return propagateFloat32NaN(a, b, status);
1138 float_raise(status, float_flag_invalid);
1139 return float32_default_nan;
1140 }
1141 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1142 return packFloat32(zSign, 0xFF, 0);
1143 }
1144 if (bExp == 0xFF) {
1145 if (bSig) return propagateFloat32NaN(a, b, status);
1146 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1147 return packFloat32(zSign, 0, 0);
1148 }
1149 if (bExp == 0) {
1150 if (bSig == 0) {
1151 if ((aExp | aSig) == 0) {
1152 float_raise(status, float_flag_invalid);
1153 return float32_default_nan;
1154 }
1155 float_raise(status, float_flag_divbyzero);
1156 return packFloat32(zSign, 0xFF, 0);
1157 }
1158 float_raise(status, float_flag_denormal);
1159 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
1160 }
1161 if (aExp == 0) {
1162 if (aSig == 0) return packFloat32(zSign, 0, 0);
1163 float_raise(status, float_flag_denormal);
1164 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
1165 }
1166 zExp = aExp - bExp + 0x7D;
1167 aSig = (aSig | 0x00800000)<<7;
1168 bSig = (bSig | 0x00800000)<<8;
1169 if (bSig <= (aSig + aSig)) {
1170 aSig >>= 1;
1171 ++zExp;
1172 }
1173 zSig = (((Bit64u) aSig)<<32) / bSig;
1174 if ((zSig & 0x3F) == 0) {
1175 zSig |= ((Bit64u) bSig * zSig != ((Bit64u) aSig)<<32);
1176 }
1177 return roundAndPackFloat32(zSign, zExp, zSig, status);
1178 }
1179
1180 /*----------------------------------------------------------------------------
1181 | Returns the square root of the single-precision floating-point value `a'.
1182 | The operation is performed according to the IEC/IEEE Standard for Binary
1183 | Floating-Point Arithmetic.
1184 *----------------------------------------------------------------------------*/
1185
float32_sqrt(float32 a,float_status_t & status)1186 float32 float32_sqrt(float32 a, float_status_t &status)
1187 {
1188 int aSign;
1189 Bit16s aExp, zExp;
1190 Bit32u aSig, zSig;
1191 Bit64u rem, term;
1192
1193 aSig = extractFloat32Frac(a);
1194 aExp = extractFloat32Exp(a);
1195 aSign = extractFloat32Sign(a);
1196
1197 if (aExp == 0xFF) {
1198 if (aSig) return propagateFloat32NaN(a, status);
1199 if (! aSign) return a;
1200 float_raise(status, float_flag_invalid);
1201 return float32_default_nan;
1202 }
1203
1204 if (get_denormals_are_zeros(status)) {
1205 if (aExp == 0) aSig = 0;
1206 }
1207
1208 if (aSign) {
1209 if ((aExp | aSig) == 0) return packFloat32(aSign, 0, 0);
1210 float_raise(status, float_flag_invalid);
1211 return float32_default_nan;
1212 }
1213 if (aExp == 0) {
1214 if (aSig == 0) return 0;
1215 float_raise(status, float_flag_denormal);
1216 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
1217 }
1218 zExp = ((aExp - 0x7F)>>1) + 0x7E;
1219 aSig = (aSig | 0x00800000)<<8;
1220 zSig = estimateSqrt32(aExp, aSig) + 2;
1221 if ((zSig & 0x7F) <= 5) {
1222 if (zSig < 2) {
1223 zSig = 0x7FFFFFFF;
1224 goto roundAndPack;
1225 }
1226 aSig >>= aExp & 1;
1227 term = ((Bit64u) zSig) * zSig;
1228 rem = (((Bit64u) aSig)<<32) - term;
1229 while ((Bit64s) rem < 0) {
1230 --zSig;
1231 rem += (((Bit64u) zSig)<<1) | 1;
1232 }
1233 zSig |= (rem != 0);
1234 }
1235 zSig = shift32RightJamming(zSig, 1);
1236 roundAndPack:
1237 return roundAndPackFloat32(0, zExp, zSig, status);
1238 }
1239
1240 /*----------------------------------------------------------------------------
1241 | Determine single-precision floating-point number class.
1242 *----------------------------------------------------------------------------*/
1243
float32_class(float32 a)1244 float_class_t float32_class(float32 a)
1245 {
1246 Bit16s aExp = extractFloat32Exp(a);
1247 Bit32u aSig = extractFloat32Frac(a);
1248 int aSign = extractFloat32Sign(a);
1249
1250 if(aExp == 0xFF) {
1251 if (aSig == 0)
1252 return (aSign) ? float_negative_inf : float_positive_inf;
1253
1254 return (aSig & 0x00400000) ? float_QNaN : float_SNaN;
1255 }
1256
1257 if(aExp == 0) {
1258 if (aSig == 0) return float_zero;
1259 return float_denormal;
1260 }
1261
1262 return float_normalized;
1263 }
1264
1265 /*----------------------------------------------------------------------------
1266 | Compare between two single precision floating point numbers. Returns
1267 | 'float_relation_equal' if the operands are equal, 'float_relation_less' if
1268 | the value 'a' is less than the corresponding value `b',
1269 | 'float_relation_greater' if the value 'a' is greater than the corresponding
1270 | value `b', or 'float_relation_unordered' otherwise.
1271 *----------------------------------------------------------------------------*/
1272
float32_compare(float32 a,float32 b,int quiet,float_status_t & status)1273 int float32_compare(float32 a, float32 b, int quiet, float_status_t &status)
1274 {
1275 if (get_denormals_are_zeros(status)) {
1276 a = float32_denormal_to_zero(a);
1277 b = float32_denormal_to_zero(b);
1278 }
1279
1280 float_class_t aClass = float32_class(a);
1281 float_class_t bClass = float32_class(b);
1282
1283 if (aClass == float_SNaN || bClass == float_SNaN) {
1284 float_raise(status, float_flag_invalid);
1285 return float_relation_unordered;
1286 }
1287
1288 if (aClass == float_QNaN || bClass == float_QNaN) {
1289 if (! quiet) float_raise(status, float_flag_invalid);
1290 return float_relation_unordered;
1291 }
1292
1293 if (aClass == float_denormal || bClass == float_denormal) {
1294 float_raise(status, float_flag_denormal);
1295 }
1296
1297 if ((a == b) || ((Bit32u) ((a | b)<<1) == 0)) return float_relation_equal;
1298
1299 int aSign = extractFloat32Sign(a);
1300 int bSign = extractFloat32Sign(b);
1301 if (aSign != bSign)
1302 return (aSign) ? float_relation_less : float_relation_greater;
1303
1304 if (aSign ^ (a < b)) return float_relation_less;
1305 return float_relation_greater;
1306 }
1307
1308 /*----------------------------------------------------------------------------
1309 | Compare between two single precision floating point numbers and return the
1310 | smaller of them.
1311 *----------------------------------------------------------------------------*/
1312
float32_min(float32 a,float32 b,float_status_t & status)1313 float32 float32_min(float32 a, float32 b, float_status_t &status)
1314 {
1315 if (get_denormals_are_zeros(status)) {
1316 a = float32_denormal_to_zero(a);
1317 b = float32_denormal_to_zero(b);
1318 }
1319
1320 return (float32_compare(a, b, status) == float_relation_less) ? a : b;
1321 }
1322
1323 /*----------------------------------------------------------------------------
1324 | Compare between two single precision floating point numbers and return the
1325 | larger of them.
1326 *----------------------------------------------------------------------------*/
1327
float32_max(float32 a,float32 b,float_status_t & status)1328 float32 float32_max(float32 a, float32 b, float_status_t &status)
1329 {
1330 if (get_denormals_are_zeros(status)) {
1331 a = float32_denormal_to_zero(a);
1332 b = float32_denormal_to_zero(b);
1333 }
1334
1335 return (float32_compare(a, b, status) == float_relation_greater) ? a : b;
1336 }
1337
1338 /*----------------------------------------------------------------------------
1339 | Compare between two single precision floating point numbers and return the
1340 | smaller/larger of them. The operation is performed according to the IEC/IEEE
1341 | Standard for Binary Floating-Point Arithmetic.
1342 *----------------------------------------------------------------------------*/
1343
float32_minmax(float32 a,float32 b,int is_max,int is_abs,float_status_t & status)1344 float32 float32_minmax(float32 a, float32 b, int is_max, int is_abs, float_status_t &status)
1345 {
1346 if (get_denormals_are_zeros(status)) {
1347 a = float32_denormal_to_zero(a);
1348 b = float32_denormal_to_zero(b);
1349 }
1350
1351 if (float32_is_nan(a) || float32_is_nan(b)) {
1352 if (float32_is_signaling_nan(a)) {
1353 return propagateFloat32NaN(a, status);
1354 }
1355 if (float32_is_signaling_nan(b) ) {
1356 return propagateFloat32NaN(b, status);
1357 }
1358 if (! float32_is_nan(b)) {
1359 if (float32_is_denormal(b))
1360 float_raise(status, float_flag_denormal);
1361 return b;
1362 }
1363 if (! float32_is_nan(a)) {
1364 if (float32_is_denormal(a))
1365 float_raise(status, float_flag_denormal);
1366 return a;
1367 }
1368 return propagateFloat32NaN(a, b, status);
1369 }
1370
1371 float32 tmp_a = a, tmp_b = b;
1372 if (is_abs) {
1373 tmp_a &= ~0x80000000; // clear the sign bit
1374 tmp_b &= ~0x80000000;
1375 }
1376
1377 int aSign = extractFloat32Sign(tmp_a);
1378 int bSign = extractFloat32Sign(tmp_b);
1379
1380 if (float32_is_denormal(a) || float32_is_denormal(b))
1381 float_raise(status, float_flag_denormal);
1382
1383 if (aSign != bSign) {
1384 if (! is_max) {
1385 return aSign ? a : b;
1386 } else {
1387 return aSign ? b : a;
1388 }
1389 } else {
1390 if (! is_max) {
1391 return (aSign ^ (tmp_a < tmp_b)) ? a : b;
1392 } else {
1393 return (aSign ^ (tmp_a < tmp_b)) ? b : a;
1394 }
1395 }
1396 }
1397
1398 /*----------------------------------------------------------------------------
1399 | Returns the result of converting the double-precision floating-point value
1400 | `a' to the 32-bit two's complement integer format. The conversion is
1401 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1402 | Arithmetic - which means in particular that the conversion is rounded
1403 | according to the current rounding mode. If `a' is a NaN or the
1404 | conversion overflows, the integer indefinite value is returned.
1405 *----------------------------------------------------------------------------*/
1406
float64_to_int32(float64 a,float_status_t & status)1407 Bit32s float64_to_int32(float64 a, float_status_t &status)
1408 {
1409 Bit64u aSig = extractFloat64Frac(a);
1410 Bit16s aExp = extractFloat64Exp(a);
1411 int aSign = extractFloat64Sign(a);
1412 if ((aExp == 0x7FF) && aSig) aSign = 0;
1413 if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1414 else {
1415 if (get_denormals_are_zeros(status)) aSig = 0;
1416 }
1417 int shiftCount = 0x42C - aExp;
1418 if (0 < shiftCount) aSig = shift64RightJamming(aSig, shiftCount);
1419 return roundAndPackInt32(aSign, aSig, status);
1420 }
1421
1422 /*----------------------------------------------------------------------------
1423 | Returns the result of converting the double-precision floating-point value
1424 | `a' to the 32-bit two's complement integer format. The conversion is
1425 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1426 | Arithmetic, except that the conversion is always rounded toward zero.
1427 | If `a' is a NaN or the conversion overflows, the integer indefinite
1428 | value is returned.
1429 *----------------------------------------------------------------------------*/
1430
float64_to_int32_round_to_zero(float64 a,float_status_t & status)1431 Bit32s float64_to_int32_round_to_zero(float64 a, float_status_t &status)
1432 {
1433 int aSign;
1434 Bit16s aExp;
1435 Bit64u aSig, savedASig;
1436 Bit32s z;
1437 int shiftCount;
1438
1439 aSig = extractFloat64Frac(a);
1440 aExp = extractFloat64Exp(a);
1441 aSign = extractFloat64Sign(a);
1442 if (0x41E < aExp) {
1443 float_raise(status, float_flag_invalid);
1444 return (Bit32s)(int32_indefinite);
1445 }
1446 else if (aExp < 0x3FF) {
1447 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1448 if (aExp || aSig) float_raise(status, float_flag_inexact);
1449 return 0;
1450 }
1451 aSig |= BX_CONST64(0x0010000000000000);
1452 shiftCount = 0x433 - aExp;
1453 savedASig = aSig;
1454 aSig >>= shiftCount;
1455 z = (Bit32s) aSig;
1456 if (aSign) z = -z;
1457 if ((z < 0) ^ aSign) {
1458 float_raise(status, float_flag_invalid);
1459 return (Bit32s)(int32_indefinite);
1460 }
1461 if ((aSig<<shiftCount) != savedASig) {
1462 float_raise(status, float_flag_inexact);
1463 }
1464 return z;
1465 }
1466
1467 /*----------------------------------------------------------------------------
1468 | Returns the result of converting the double-precision floating-point value
1469 | `a' to the 32-bit unsigned integer format. The conversion is performed
1470 | according to the IEC/IEEE Standard for Binary Floating-point Arithmetic,
1471 | except that the conversion is always rounded toward zero. If `a' is a NaN
1472 | or conversion overflows, the largest positive integer is returned.
1473 *----------------------------------------------------------------------------*/
1474
float64_to_uint32_round_to_zero(float64 a,float_status_t & status)1475 Bit32u float64_to_uint32_round_to_zero(float64 a, float_status_t &status)
1476 {
1477 Bit64u aSig = extractFloat64Frac(a);
1478 Bit16s aExp = extractFloat64Exp(a);
1479 int aSign = extractFloat64Sign(a);
1480
1481 if (aExp < 0x3FF) {
1482 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1483 if (aExp || aSig) float_raise(status, float_flag_inexact);
1484 return 0;
1485 }
1486
1487 if (0x41E < aExp || aSign) {
1488 float_raise(status, float_flag_invalid);
1489 return uint32_indefinite;
1490 }
1491
1492 aSig |= BX_CONST64(0x0010000000000000);
1493 int shiftCount = 0x433 - aExp;
1494 Bit64u savedASig = aSig;
1495 aSig >>= shiftCount;
1496 if ((aSig<<shiftCount) != savedASig) {
1497 float_raise(status, float_flag_inexact);
1498 }
1499 return (Bit32u) aSig;
1500 }
1501
1502 /*----------------------------------------------------------------------------
1503 | Returns the result of converting the double-precision floating-point value
1504 | `a' to the 64-bit two's complement integer format. The conversion is
1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1506 | Arithmetic - which means in particular that the conversion is rounded
1507 | according to the current rounding mode. If `a' is a NaN, the largest
1508 | positive integer is returned. Otherwise, if the conversion overflows, the
1509 | largest integer with the same sign as `a' is returned.
1510 *----------------------------------------------------------------------------*/
1511
float64_to_int64(float64 a,float_status_t & status)1512 Bit64s float64_to_int64(float64 a, float_status_t &status)
1513 {
1514 int aSign;
1515 Bit16s aExp;
1516 Bit64u aSig, aSigExtra;
1517
1518 aSig = extractFloat64Frac(a);
1519 aExp = extractFloat64Exp(a);
1520 aSign = extractFloat64Sign(a);
1521 if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1522 else {
1523 if (get_denormals_are_zeros(status)) aSig = 0;
1524 }
1525 int shiftCount = 0x433 - aExp;
1526 if (shiftCount <= 0) {
1527 if (0x43E < aExp) {
1528 float_raise(status, float_flag_invalid);
1529 return (Bit64s)(int64_indefinite);
1530 }
1531 aSigExtra = 0;
1532 aSig <<= -shiftCount;
1533 }
1534 else {
1535 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
1536 }
1537 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
1538 }
1539
1540 /*----------------------------------------------------------------------------
1541 | Returns the result of converting the double-precision floating-point value
1542 | `a' to the 64-bit two's complement integer format. The conversion is
1543 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1544 | Arithmetic, except that the conversion is always rounded toward zero.
1545 | If `a' is a NaN or the conversion overflows, the integer indefinite
1546 | value is returned.
1547 *----------------------------------------------------------------------------*/
1548
float64_to_int64_round_to_zero(float64 a,float_status_t & status)1549 Bit64s float64_to_int64_round_to_zero(float64 a, float_status_t &status)
1550 {
1551 int aSign;
1552 Bit16s aExp;
1553 Bit64u aSig;
1554 Bit64s z;
1555
1556 aSig = extractFloat64Frac(a);
1557 aExp = extractFloat64Exp(a);
1558 aSign = extractFloat64Sign(a);
1559 if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1560 int shiftCount = aExp - 0x433;
1561 if (0 <= shiftCount) {
1562 if (0x43E <= aExp) {
1563 if (a != BX_CONST64(0xC3E0000000000000)) {
1564 float_raise(status, float_flag_invalid);
1565 }
1566 return (Bit64s)(int64_indefinite);
1567 }
1568 z = aSig<<shiftCount;
1569 }
1570 else {
1571 if (aExp < 0x3FE) {
1572 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1573 if (aExp | aSig) float_raise(status, float_flag_inexact);
1574 return 0;
1575 }
1576 z = aSig>>(-shiftCount);
1577 if ((Bit64u) (aSig<<(shiftCount & 63))) {
1578 float_raise(status, float_flag_inexact);
1579 }
1580 }
1581 if (aSign) z = -z;
1582 return z;
1583 }
1584
1585 /*----------------------------------------------------------------------------
1586 | Returns the result of converting the double-precision floating-point value
1587 | `a' to the 64-bit unsigned integer format. The conversion is performed
1588 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
1589 | except that the conversion is always rounded toward zero. If `a' is a NaN
1590 | or the conversion overflows, the largest unsigned integer is returned.
1591 *----------------------------------------------------------------------------*/
1592
float64_to_uint64_round_to_zero(float64 a,float_status_t & status)1593 Bit64u float64_to_uint64_round_to_zero(float64 a, float_status_t &status)
1594 {
1595 int aSign;
1596 Bit16s aExp;
1597 Bit64u aSig, z;
1598
1599 aSig = extractFloat64Frac(a);
1600 aExp = extractFloat64Exp(a);
1601 aSign = extractFloat64Sign(a);
1602
1603 if (aExp < 0x3FE) {
1604 if (get_denormals_are_zeros(status) && aExp == 0) aSig = 0;
1605 if (aExp | aSig) float_raise(status, float_flag_inexact);
1606 return 0;
1607 }
1608
1609 if (0x43E <= aExp || aSign) {
1610 float_raise(status, float_flag_invalid);
1611 return uint64_indefinite;
1612 }
1613
1614 if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1615 int shiftCount = aExp - 0x433;
1616
1617 if (0 <= shiftCount) {
1618 z = aSig<<shiftCount;
1619 }
1620 else {
1621 z = aSig>>(-shiftCount);
1622 if ((Bit64u) (aSig<<(shiftCount & 63))) {
1623 float_raise(status, float_flag_inexact);
1624 }
1625 }
1626 return z;
1627 }
1628
1629 /*----------------------------------------------------------------------------
1630 | Returns the result of converting the double-precision floating-point value
1631 | `a' to the 32-bit unsigned integer format. The conversion is
1632 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1633 | Arithmetic---which means in particular that the conversion is rounded
1634 | according to the current rounding mode. If `a' is a NaN or the conversion
1635 | overflows, the largest unsigned integer is returned.
1636 *----------------------------------------------------------------------------*/
1637
float64_to_uint32(float64 a,float_status_t & status)1638 Bit32u float64_to_uint32(float64 a, float_status_t &status)
1639 {
1640 Bit64u val_64 = float64_to_uint64(a, status);
1641
1642 if (val_64 > 0xffffffff) {
1643 status.float_exception_flags = float_flag_invalid; // throw away other flags
1644 return uint32_indefinite;
1645 }
1646
1647 return (Bit32u) val_64;
1648 }
1649
1650 /*----------------------------------------------------------------------------
1651 | Returns the result of converting the double-precision floating-point value
1652 | `a' to the 64-bit unsigned integer format. The conversion is
1653 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1654 | Arithmetic---which means in particular that the conversion is rounded
1655 | according to the current rounding mode. If `a' is a NaN or the conversion
1656 | overflows, the largest unsigned integer is returned.
1657 *----------------------------------------------------------------------------*/
1658
float64_to_uint64(float64 a,float_status_t & status)1659 Bit64u float64_to_uint64(float64 a, float_status_t &status)
1660 {
1661 int aSign;
1662 Bit16s aExp, shiftCount;
1663 Bit64u aSig, aSigExtra;
1664
1665 aSig = extractFloat64Frac(a);
1666 aExp = extractFloat64Exp(a);
1667 aSign = extractFloat64Sign(a);
1668
1669 if (get_denormals_are_zeros(status)) {
1670 if (aExp == 0) aSig = 0;
1671 }
1672
1673 if (aSign && (aExp > 0x3FE)) {
1674 float_raise(status, float_flag_invalid);
1675 return uint64_indefinite;
1676 }
1677
1678 if (aExp) {
1679 aSig |= BX_CONST64(0x0010000000000000);
1680 }
1681 shiftCount = 0x433 - aExp;
1682 if (shiftCount <= 0) {
1683 if (0x43E < aExp) {
1684 float_raise(status, float_flag_invalid);
1685 return uint64_indefinite;
1686 }
1687 aSigExtra = 0;
1688 aSig <<= -shiftCount;
1689 } else {
1690 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
1691 }
1692
1693 return roundAndPackUint64(aSign, aSig, aSigExtra, status);
1694 }
1695
1696 /*----------------------------------------------------------------------------
1697 | Returns the result of converting the double-precision floating-point value
1698 | `a' to the single-precision floating-point format. The conversion is
1699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1700 | Arithmetic.
1701 *----------------------------------------------------------------------------*/
1702
float64_to_float32(float64 a,float_status_t & status)1703 float32 float64_to_float32(float64 a, float_status_t &status)
1704 {
1705 int aSign;
1706 Bit16s aExp;
1707 Bit64u aSig;
1708 Bit32u zSig;
1709
1710 aSig = extractFloat64Frac(a);
1711 aExp = extractFloat64Exp(a);
1712 aSign = extractFloat64Sign(a);
1713 if (aExp == 0x7FF) {
1714 if (aSig) return commonNaNToFloat32(float64ToCommonNaN(a, status));
1715 return packFloat32(aSign, 0xFF, 0);
1716 }
1717 if (aExp == 0) {
1718 if (aSig == 0 || get_denormals_are_zeros(status))
1719 return packFloat32(aSign, 0, 0);
1720 float_raise(status, float_flag_denormal);
1721 }
1722 aSig = shift64RightJamming(aSig, 22);
1723 zSig = (Bit32u) aSig;
1724 if (aExp || zSig) {
1725 zSig |= 0x40000000;
1726 aExp -= 0x381;
1727 }
1728 return roundAndPackFloat32(aSign, aExp, zSig, status);
1729 }
1730
1731 /*----------------------------------------------------------------------------
1732 | Rounds the double-precision floating-point value `a' to an integer, and
1733 | returns the result as a double-precision floating-point value. The
1734 | operation is performed according to the IEC/IEEE Standard for Binary
1735 | Floating-Point Arithmetic.
1736 *----------------------------------------------------------------------------*/
1737
float64_round_to_int(float64 a,Bit8u scale,float_status_t & status)1738 float64 float64_round_to_int(float64 a, Bit8u scale, float_status_t &status)
1739 {
1740 Bit64u lastBitMask, roundBitsMask;
1741 int roundingMode = get_float_rounding_mode(status);
1742 Bit16s aExp = extractFloat64Exp(a);
1743 scale &= 0xf;
1744
1745 if ((aExp == 0x7FF) && extractFloat64Frac(a)) {
1746 return propagateFloat64NaN(a, status);
1747 }
1748
1749 aExp += scale; // scale the exponent
1750
1751 if (0x433 <= aExp) {
1752 return a;
1753 }
1754
1755 if (get_denormals_are_zeros(status)) {
1756 a = float64_denormal_to_zero(a);
1757 }
1758
1759 if (aExp < 0x3FF) {
1760 if ((Bit64u) (a<<1) == 0) return a;
1761 float_raise(status, float_flag_inexact);
1762 int aSign = extractFloat64Sign(a);
1763 switch (roundingMode) {
1764 case float_round_nearest_even:
1765 if ((aExp == 0x3FE) && extractFloat64Frac(a)) {
1766 return packFloat64(aSign, 0x3FF - scale, 0);
1767 }
1768 break;
1769 case float_round_down:
1770 return aSign ? packFloat64(1, 0x3FF - scale, 0) : float64_positive_zero;
1771 case float_round_up:
1772 return aSign ? float64_negative_zero : packFloat64(0, 0x3FF - scale, 0);
1773 }
1774 return packFloat64(aSign, 0, 0);
1775 }
1776
1777 lastBitMask = 1;
1778 lastBitMask <<= 0x433 - aExp;
1779 roundBitsMask = lastBitMask - 1;
1780 float64 z = a;
1781 if (roundingMode == float_round_nearest_even) {
1782 z += lastBitMask>>1;
1783 if ((z & roundBitsMask) == 0) z &= ~lastBitMask;
1784 }
1785 else if (roundingMode != float_round_to_zero) {
1786 if (extractFloat64Sign(z) ^ (roundingMode == float_round_up)) {
1787 z += roundBitsMask;
1788 }
1789 }
1790 z &= ~roundBitsMask;
1791 if (z != a) float_raise(status, float_flag_inexact);
1792 return z;
1793 }
1794
1795 /*----------------------------------------------------------------------------
1796 | Extracts the fractional portion of double-precision floating-point value `a',
1797 | and returns the result as a double-precision floating-point value. The
1798 | fractional results are precise. The operation is performed according to the
1799 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1800 *----------------------------------------------------------------------------*/
1801
float64_frc(float64 a,float_status_t & status)1802 float64 float64_frc(float64 a, float_status_t &status)
1803 {
1804 int roundingMode = get_float_rounding_mode(status);
1805
1806 Bit64u aSig = extractFloat64Frac(a);
1807 Bit16s aExp = extractFloat64Exp(a);
1808 int aSign = extractFloat64Sign(a);
1809
1810 if (aExp == 0x7FF) {
1811 if (aSig) return propagateFloat64NaN(a, status);
1812 float_raise(status, float_flag_invalid);
1813 return float64_default_nan;
1814 }
1815
1816 if (aExp >= 0x433) {
1817 return packFloat64(roundingMode == float_round_down, 0, 0);
1818 }
1819
1820 if (aExp < 0x3FF) {
1821 if (aExp == 0) {
1822 if (aSig == 0 || get_denormals_are_zeros(status))
1823 return packFloat64(roundingMode == float_round_down, 0, 0);
1824
1825 float_raise(status, float_flag_denormal);
1826 if (! float_exception_masked(status, float_flag_underflow))
1827 float_raise(status, float_flag_underflow);
1828
1829 if(get_flush_underflow_to_zero(status)) {
1830 float_raise(status, float_flag_underflow | float_flag_inexact);
1831 return packFloat64(aSign, 0, 0);
1832 }
1833 }
1834 return a;
1835 }
1836
1837 Bit64u lastBitMask = BX_CONST64(1) << (0x433 - aExp);
1838 Bit64u roundBitsMask = lastBitMask - 1;
1839
1840 aSig &= roundBitsMask;
1841 aSig <<= 10;
1842 aExp--;
1843
1844 if (aSig == 0)
1845 return packFloat64(roundingMode == float_round_down, 0, 0);
1846
1847 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
1848 }
1849
1850 /*----------------------------------------------------------------------------
1851 | Extracts the exponent portion of double-precision floating-point value 'a',
1852 | and returns the result as a double-precision floating-point value
1853 | representing unbiased integer exponent. The operation is performed according
1854 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1855 *----------------------------------------------------------------------------*/
1856
float64_getexp(float64 a,float_status_t & status)1857 float64 float64_getexp(float64 a, float_status_t &status)
1858 {
1859 Bit16s aExp = extractFloat64Exp(a);
1860 Bit64u aSig = extractFloat64Frac(a);
1861
1862 if (aExp == 0x7FF) {
1863 if (aSig) return propagateFloat64NaN(a, status);
1864 return float64_positive_inf;
1865 }
1866
1867 if (aExp == 0) {
1868 if (aSig == 0 || get_denormals_are_zeros(status))
1869 return float64_negative_inf;
1870
1871 float_raise(status, float_flag_denormal);
1872 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1873 }
1874
1875 return int32_to_float64(aExp - 0x3FF);
1876 }
1877
1878 /*----------------------------------------------------------------------------
1879 | Extracts the mantissa of double-precision floating-point value 'a' and
1880 | returns the result as a double-precision floating-point after applying
1881 | the mantissa interval normalization and sign control. The operation is
1882 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1883 | Arithmetic.
1884 *----------------------------------------------------------------------------*/
1885
float64_getmant(float64 a,float_status_t & status,int sign_ctrl,int interv)1886 float64 float64_getmant(float64 a, float_status_t &status, int sign_ctrl, int interv)
1887 {
1888 Bit16s aExp = extractFloat64Exp(a);
1889 Bit64u aSig = extractFloat64Frac(a);
1890 int aSign = extractFloat64Sign(a);
1891
1892 if (aExp == 0x7FF) {
1893 if (aSig) return propagateFloat64NaN(a, status);
1894 if (aSign) {
1895 if (sign_ctrl & 0x2) {
1896 float_raise(status, float_flag_invalid);
1897 return float64_default_nan;
1898 }
1899 }
1900 return packFloat64(~sign_ctrl & aSign, 0x3FF, 0);
1901 }
1902
1903 if (aExp == 0 && (aSig == 0 || get_denormals_are_zeros(status))) {
1904 return packFloat64(~sign_ctrl & aSign, 0x3FF, 0);
1905 }
1906
1907 if (aSign) {
1908 if (sign_ctrl & 0x2) {
1909 float_raise(status, float_flag_invalid);
1910 return float64_default_nan;
1911 }
1912 }
1913
1914 if (aExp == 0) {
1915 float_raise(status, float_flag_denormal);
1916 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1917 // aExp += 0x3FE;
1918 aSig &= BX_CONST64(0xFFFFFFFFFFFFFFFF);
1919 }
1920
1921 switch(interv) {
1922 case 0x0: // interval [1,2)
1923 aExp = 0x3FF;
1924 break;
1925 case 0x1: // interval [1/2,2)
1926 aExp -= 0x3FF;
1927 aExp = 0x3FF - (aExp & 0x1);
1928 break;
1929 case 0x2: // interval [1/2,1)
1930 aExp = 0x3FE;
1931 break;
1932 case 0x3: // interval [3/4,3/2)
1933 aExp = 0x3FF - ((aSig >> 51) & 0x1);
1934 break;
1935 }
1936
1937 return packFloat64(~sign_ctrl & aSign, aExp, aSig);
1938 }
1939
1940 /*----------------------------------------------------------------------------
1941 | Return the result of a floating point scale of the double-precision floating
1942 | point value `a' by multiplying it by 2 power of the double-precision
1943 | floating point value 'b' converted to integral value. If the result cannot
1944 | be represented in double precision, then the proper overflow response (for
1945 | positive scaling operand), or the proper underflow response (for negative
1946 | scaling operand) is issued. The operation is performed according to the
1947 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1948 *----------------------------------------------------------------------------*/
1949
float64_scalef(float64 a,float64 b,float_status_t & status)1950 float64 float64_scalef(float64 a, float64 b, float_status_t &status)
1951 {
1952 Bit64u aSig = extractFloat64Frac(a);
1953 Bit16s aExp = extractFloat64Exp(a);
1954 int aSign = extractFloat64Sign(a);
1955 Bit64u bSig = extractFloat64Frac(b);
1956 Bit16s bExp = extractFloat64Exp(b);
1957 int bSign = extractFloat64Sign(b);
1958
1959 if (get_denormals_are_zeros(status)) {
1960 if (aExp == 0) aSig = 0;
1961 if (bExp == 0) bSig = 0;
1962 }
1963
1964 if (bExp == 0x7FF) {
1965 if (bSig) return propagateFloat64NaN(a, b, status);
1966 }
1967
1968 if (aExp == 0x7FF) {
1969 if (aSig) {
1970 int aIsSignalingNaN = (aSig & BX_CONST64(0x0008000000000000)) == 0;
1971 if (aIsSignalingNaN || bExp != 0x7FF || bSig)
1972 return propagateFloat64NaN(a, b, status);
1973
1974 return bSign ? 0 : float64_positive_inf;
1975 }
1976
1977 if (bExp == 0x7FF && bSign) {
1978 float_raise(status, float_flag_invalid);
1979 return float64_default_nan;
1980 }
1981 return a;
1982 }
1983
1984 if (aExp == 0) {
1985 if (aSig == 0) {
1986 if (bExp == 0x7FF && ! bSign) {
1987 float_raise(status, float_flag_invalid);
1988 return float64_default_nan;
1989 }
1990 return a;
1991 }
1992 float_raise(status, float_flag_denormal);
1993 }
1994
1995 if ((bExp | bSig) == 0) return a;
1996
1997 if (bExp == 0x7FF) {
1998 if (bSign) return packFloat64(aSign, 0, 0);
1999 return packFloat64(aSign, 0x7FF, 0);
2000 }
2001
2002 if (0x40F <= bExp) {
2003 // handle obvious overflow/underflow result
2004 return roundAndPackFloat64(aSign, bSign ? -0x3FF : 0x7FF, aSig, status);
2005 }
2006
2007 int scale = 0;
2008
2009 if (bExp < 0x3FF) {
2010 if (bExp == 0)
2011 float_raise(status, float_flag_denormal);
2012 scale = -bSign;
2013 }
2014 else {
2015 bSig |= BX_CONST64(0x0010000000000000);
2016 int shiftCount = 0x433 - bExp;
2017 Bit64u savedBSig = bSig;
2018 bSig >>= shiftCount;
2019 scale = (Bit32s) bSig;
2020 if (bSign) {
2021 if ((bSig<<shiftCount) != savedBSig) scale++;
2022 scale = -scale;
2023 }
2024
2025 if (scale > 0x1000) scale = 0x1000;
2026 if (scale < -0x1000) scale = -0x1000;
2027 }
2028
2029 if (aExp != 0) {
2030 aSig |= BX_CONST64(0x0010000000000000);
2031 } else {
2032 aExp++;
2033 }
2034
2035 aExp += scale - 1;
2036 aSig <<= 10;
2037 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
2038 }
2039
2040 /*----------------------------------------------------------------------------
2041 | Returns the result of adding the absolute values of the double-precision
2042 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2043 | before being returned. `zSign' is ignored if the result is a NaN.
2044 | The addition is performed according to the IEC/IEEE Standard for Binary
2045 | Floating-Point Arithmetic.
2046 *----------------------------------------------------------------------------*/
2047
addFloat64Sigs(float64 a,float64 b,int zSign,float_status_t & status)2048 static float64 addFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status)
2049 {
2050 Bit16s aExp, bExp, zExp;
2051 Bit64u aSig, bSig, zSig;
2052 Bit16s expDiff;
2053
2054 aSig = extractFloat64Frac(a);
2055 aExp = extractFloat64Exp(a);
2056 bSig = extractFloat64Frac(b);
2057 bExp = extractFloat64Exp(b);
2058
2059 if (get_denormals_are_zeros(status)) {
2060 if (aExp == 0) aSig = 0;
2061 if (bExp == 0) bSig = 0;
2062 }
2063
2064 expDiff = aExp - bExp;
2065 aSig <<= 9;
2066 bSig <<= 9;
2067 if (0 < expDiff) {
2068 if (aExp == 0x7FF) {
2069 if (aSig) return propagateFloat64NaN(a, b, status);
2070 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2071 return a;
2072 }
2073 if ((aExp == 0) && aSig)
2074 float_raise(status, float_flag_denormal);
2075
2076 if (bExp == 0) {
2077 if (bSig) float_raise(status, float_flag_denormal);
2078 --expDiff;
2079 }
2080 else bSig |= BX_CONST64(0x2000000000000000);
2081
2082 bSig = shift64RightJamming(bSig, expDiff);
2083 zExp = aExp;
2084 }
2085 else if (expDiff < 0) {
2086 if (bExp == 0x7FF) {
2087 if (bSig) return propagateFloat64NaN(a, b, status);
2088 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2089 return packFloat64(zSign, 0x7FF, 0);
2090 }
2091 if ((bExp == 0) && bSig)
2092 float_raise(status, float_flag_denormal);
2093
2094 if (aExp == 0) {
2095 if (aSig) float_raise(status, float_flag_denormal);
2096 ++expDiff;
2097 }
2098 else aSig |= BX_CONST64(0x2000000000000000);
2099
2100 aSig = shift64RightJamming(aSig, -expDiff);
2101 zExp = bExp;
2102 }
2103 else {
2104 if (aExp == 0x7FF) {
2105 if (aSig | bSig) return propagateFloat64NaN(a, b, status);
2106 return a;
2107 }
2108 if (aExp == 0) {
2109 zSig = (aSig + bSig) >> 9;
2110 if (aSig | bSig) {
2111 float_raise(status, float_flag_denormal);
2112 if (get_flush_underflow_to_zero(status) && (extractFloat64Frac(zSig) == zSig)) {
2113 float_raise(status, float_flag_underflow | float_flag_inexact);
2114 return packFloat64(zSign, 0, 0);
2115 }
2116 if (! float_exception_masked(status, float_flag_underflow)) {
2117 if (extractFloat64Frac(zSig) == zSig)
2118 float_raise(status, float_flag_underflow);
2119 }
2120 }
2121 return packFloat64(zSign, 0, zSig);
2122 }
2123 zSig = BX_CONST64(0x4000000000000000) + aSig + bSig;
2124 return roundAndPackFloat64(zSign, aExp, zSig, status);
2125 }
2126 aSig |= BX_CONST64(0x2000000000000000);
2127 zSig = (aSig + bSig)<<1;
2128 --zExp;
2129 if ((Bit64s) zSig < 0) {
2130 zSig = aSig + bSig;
2131 ++zExp;
2132 }
2133 return roundAndPackFloat64(zSign, zExp, zSig, status);
2134 }
2135
2136 /*----------------------------------------------------------------------------
2137 | Returns the result of subtracting the absolute values of the double-
2138 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2139 | difference is negated before being returned. `zSign' is ignored if the
2140 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2141 | Standard for Binary Floating-Point Arithmetic.
2142 *----------------------------------------------------------------------------*/
2143
subFloat64Sigs(float64 a,float64 b,int zSign,float_status_t & status)2144 static float64 subFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status)
2145 {
2146 Bit16s aExp, bExp, zExp;
2147 Bit64u aSig, bSig, zSig;
2148 Bit16s expDiff;
2149
2150 aSig = extractFloat64Frac(a);
2151 aExp = extractFloat64Exp(a);
2152 bSig = extractFloat64Frac(b);
2153 bExp = extractFloat64Exp(b);
2154
2155 if (get_denormals_are_zeros(status)) {
2156 if (aExp == 0) aSig = 0;
2157 if (bExp == 0) bSig = 0;
2158 }
2159
2160 expDiff = aExp - bExp;
2161 aSig <<= 10;
2162 bSig <<= 10;
2163 if (0 < expDiff) goto aExpBigger;
2164 if (expDiff < 0) goto bExpBigger;
2165 if (aExp == 0x7FF) {
2166 if (aSig | bSig) return propagateFloat64NaN(a, b, status);
2167 float_raise(status, float_flag_invalid);
2168 return float64_default_nan;
2169 }
2170 if (aExp == 0) {
2171 if (aSig | bSig) float_raise(status, float_flag_denormal);
2172 aExp = 1;
2173 bExp = 1;
2174 }
2175 if (bSig < aSig) goto aBigger;
2176 if (aSig < bSig) goto bBigger;
2177 return packFloat64(get_float_rounding_mode(status) == float_round_down, 0, 0);
2178 bExpBigger:
2179 if (bExp == 0x7FF) {
2180 if (bSig) return propagateFloat64NaN(a, b, status);
2181 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2182 return packFloat64(zSign ^ 1, 0x7FF, 0);
2183 }
2184 if ((bExp == 0) && bSig)
2185 float_raise(status, float_flag_denormal);
2186
2187 if (aExp == 0) {
2188 if (aSig) float_raise(status, float_flag_denormal);
2189 ++expDiff;
2190 }
2191 else aSig |= BX_CONST64(0x4000000000000000);
2192
2193 aSig = shift64RightJamming(aSig, -expDiff);
2194 bSig |= BX_CONST64(0x4000000000000000);
2195 bBigger:
2196 zSig = bSig - aSig;
2197 zExp = bExp;
2198 zSign ^= 1;
2199 goto normalizeRoundAndPack;
2200 aExpBigger:
2201 if (aExp == 0x7FF) {
2202 if (aSig) return propagateFloat64NaN(a, b, status);
2203 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2204 return a;
2205 }
2206 if ((aExp == 0) && aSig)
2207 float_raise(status, float_flag_denormal);
2208
2209 if (bExp == 0) {
2210 if (bSig) float_raise(status, float_flag_denormal);
2211 --expDiff;
2212 }
2213 else bSig |= BX_CONST64(0x4000000000000000);
2214
2215 bSig = shift64RightJamming(bSig, expDiff);
2216 aSig |= BX_CONST64(0x4000000000000000);
2217 aBigger:
2218 zSig = aSig - bSig;
2219 zExp = aExp;
2220 normalizeRoundAndPack:
2221 --zExp;
2222 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
2223 }
2224
2225 /*----------------------------------------------------------------------------
2226 | Returns the result of adding the double-precision floating-point values `a'
2227 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2228 | Binary Floating-Point Arithmetic.
2229 *----------------------------------------------------------------------------*/
2230
float64_add(float64 a,float64 b,float_status_t & status)2231 float64 float64_add(float64 a, float64 b, float_status_t &status)
2232 {
2233 int aSign = extractFloat64Sign(a);
2234 int bSign = extractFloat64Sign(b);
2235
2236 if (aSign == bSign) {
2237 return addFloat64Sigs(a, b, aSign, status);
2238 }
2239 else {
2240 return subFloat64Sigs(a, b, aSign, status);
2241 }
2242 }
2243
2244 /*----------------------------------------------------------------------------
2245 | Returns the result of subtracting the double-precision floating-point values
2246 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2247 | for Binary Floating-Point Arithmetic.
2248 *----------------------------------------------------------------------------*/
2249
float64_sub(float64 a,float64 b,float_status_t & status)2250 float64 float64_sub(float64 a, float64 b, float_status_t &status)
2251 {
2252 int aSign = extractFloat64Sign(a);
2253 int bSign = extractFloat64Sign(b);
2254
2255 if (aSign == bSign) {
2256 return subFloat64Sigs(a, b, aSign, status);
2257 }
2258 else {
2259 return addFloat64Sigs(a, b, aSign, status);
2260 }
2261 }
2262
2263 /*----------------------------------------------------------------------------
2264 | Returns the result of multiplying the double-precision floating-point values
2265 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2266 | for Binary Floating-Point Arithmetic.
2267 *----------------------------------------------------------------------------*/
2268
float64_mul(float64 a,float64 b,float_status_t & status)2269 float64 float64_mul(float64 a, float64 b, float_status_t &status)
2270 {
2271 int aSign, bSign, zSign;
2272 Bit16s aExp, bExp, zExp;
2273 Bit64u aSig, bSig, zSig0, zSig1;
2274
2275 aSig = extractFloat64Frac(a);
2276 aExp = extractFloat64Exp(a);
2277 aSign = extractFloat64Sign(a);
2278 bSig = extractFloat64Frac(b);
2279 bExp = extractFloat64Exp(b);
2280 bSign = extractFloat64Sign(b);
2281 zSign = aSign ^ bSign;
2282
2283 if (get_denormals_are_zeros(status)) {
2284 if (aExp == 0) aSig = 0;
2285 if (bExp == 0) bSig = 0;
2286 }
2287
2288 if (aExp == 0x7FF) {
2289 if (aSig || ((bExp == 0x7FF) && bSig)) {
2290 return propagateFloat64NaN(a, b, status);
2291 }
2292 if ((bExp | bSig) == 0) {
2293 float_raise(status, float_flag_invalid);
2294 return float64_default_nan;
2295 }
2296 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2297 return packFloat64(zSign, 0x7FF, 0);
2298 }
2299 if (bExp == 0x7FF) {
2300 if (bSig) return propagateFloat64NaN(a, b, status);
2301 if ((aExp | aSig) == 0) {
2302 float_raise(status, float_flag_invalid);
2303 return float64_default_nan;
2304 }
2305 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2306 return packFloat64(zSign, 0x7FF, 0);
2307 }
2308 if (aExp == 0) {
2309 if (aSig == 0) {
2310 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2311 return packFloat64(zSign, 0, 0);
2312 }
2313 float_raise(status, float_flag_denormal);
2314 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2315 }
2316 if (bExp == 0) {
2317 if (bSig == 0) return packFloat64(zSign, 0, 0);
2318 float_raise(status, float_flag_denormal);
2319 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
2320 }
2321 zExp = aExp + bExp - 0x3FF;
2322 aSig = (aSig | BX_CONST64(0x0010000000000000))<<10;
2323 bSig = (bSig | BX_CONST64(0x0010000000000000))<<11;
2324 mul64To128(aSig, bSig, &zSig0, &zSig1);
2325 zSig0 |= (zSig1 != 0);
2326 if (0 <= (Bit64s) (zSig0<<1)) {
2327 zSig0 <<= 1;
2328 --zExp;
2329 }
2330 return roundAndPackFloat64(zSign, zExp, zSig0, status);
2331 }
2332
2333 /*----------------------------------------------------------------------------
2334 | Returns the result of dividing the double-precision floating-point value `a'
2335 | by the corresponding value `b'. The operation is performed according to
2336 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2337 *----------------------------------------------------------------------------*/
2338
float64_div(float64 a,float64 b,float_status_t & status)2339 float64 float64_div(float64 a, float64 b, float_status_t &status)
2340 {
2341 int aSign, bSign, zSign;
2342 Bit16s aExp, bExp, zExp;
2343 Bit64u aSig, bSig, zSig;
2344 Bit64u rem0, rem1;
2345 Bit64u term0, term1;
2346
2347 aSig = extractFloat64Frac(a);
2348 aExp = extractFloat64Exp(a);
2349 aSign = extractFloat64Sign(a);
2350 bSig = extractFloat64Frac(b);
2351 bExp = extractFloat64Exp(b);
2352 bSign = extractFloat64Sign(b);
2353 zSign = aSign ^ bSign;
2354
2355 if (get_denormals_are_zeros(status)) {
2356 if (aExp == 0) aSig = 0;
2357 if (bExp == 0) bSig = 0;
2358 }
2359
2360 if (aExp == 0x7FF) {
2361 if (aSig) return propagateFloat64NaN(a, b, status);
2362 if (bExp == 0x7FF) {
2363 if (bSig) return propagateFloat64NaN(a, b, status);
2364 float_raise(status, float_flag_invalid);
2365 return float64_default_nan;
2366 }
2367 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2368 return packFloat64(zSign, 0x7FF, 0);
2369 }
2370 if (bExp == 0x7FF) {
2371 if (bSig) return propagateFloat64NaN(a, b, status);
2372 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2373 return packFloat64(zSign, 0, 0);
2374 }
2375 if (bExp == 0) {
2376 if (bSig == 0) {
2377 if ((aExp | aSig) == 0) {
2378 float_raise(status, float_flag_invalid);
2379 return float64_default_nan;
2380 }
2381 float_raise(status, float_flag_divbyzero);
2382 return packFloat64(zSign, 0x7FF, 0);
2383 }
2384 float_raise(status, float_flag_denormal);
2385 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
2386 }
2387 if (aExp == 0) {
2388 if (aSig == 0) return packFloat64(zSign, 0, 0);
2389 float_raise(status, float_flag_denormal);
2390 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2391 }
2392 zExp = aExp - bExp + 0x3FD;
2393 aSig = (aSig | BX_CONST64(0x0010000000000000))<<10;
2394 bSig = (bSig | BX_CONST64(0x0010000000000000))<<11;
2395 if (bSig <= (aSig + aSig)) {
2396 aSig >>= 1;
2397 ++zExp;
2398 }
2399 zSig = estimateDiv128To64(aSig, 0, bSig);
2400 if ((zSig & 0x1FF) <= 2) {
2401 mul64To128(bSig, zSig, &term0, &term1);
2402 sub128(aSig, 0, term0, term1, &rem0, &rem1);
2403 while ((Bit64s) rem0 < 0) {
2404 --zSig;
2405 add128(rem0, rem1, 0, bSig, &rem0, &rem1);
2406 }
2407 zSig |= (rem1 != 0);
2408 }
2409 return roundAndPackFloat64(zSign, zExp, zSig, status);
2410 }
2411
2412 /*----------------------------------------------------------------------------
2413 | Returns the square root of the double-precision floating-point value `a'.
2414 | The operation is performed according to the IEC/IEEE Standard for Binary
2415 | Floating-Point Arithmetic.
2416 *----------------------------------------------------------------------------*/
2417
float64_sqrt(float64 a,float_status_t & status)2418 float64 float64_sqrt(float64 a, float_status_t &status)
2419 {
2420 int aSign;
2421 Bit16s aExp, zExp;
2422 Bit64u aSig, zSig, doubleZSig;
2423 Bit64u rem0, rem1, term0, term1;
2424
2425 aSig = extractFloat64Frac(a);
2426 aExp = extractFloat64Exp(a);
2427 aSign = extractFloat64Sign(a);
2428
2429 if (aExp == 0x7FF) {
2430 if (aSig) return propagateFloat64NaN(a, status);
2431 if (! aSign) return a;
2432 float_raise(status, float_flag_invalid);
2433 return float64_default_nan;
2434 }
2435
2436 if (get_denormals_are_zeros(status)) {
2437 if (aExp == 0) aSig = 0;
2438 }
2439
2440 if (aSign) {
2441 if ((aExp | aSig) == 0) return packFloat64(aSign, 0, 0);
2442 float_raise(status, float_flag_invalid);
2443 return float64_default_nan;
2444 }
2445 if (aExp == 0) {
2446 if (aSig == 0) return 0;
2447 float_raise(status, float_flag_denormal);
2448 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2449 }
2450 zExp = ((aExp - 0x3FF)>>1) + 0x3FE;
2451 aSig |= BX_CONST64(0x0010000000000000);
2452 zSig = estimateSqrt32(aExp, (Bit32u)(aSig>>21));
2453 aSig <<= 9 - (aExp & 1);
2454 zSig = estimateDiv128To64(aSig, 0, zSig<<32) + (zSig<<30);
2455 if ((zSig & 0x1FF) <= 5) {
2456 doubleZSig = zSig<<1;
2457 mul64To128(zSig, zSig, &term0, &term1);
2458 sub128(aSig, 0, term0, term1, &rem0, &rem1);
2459 while ((Bit64s) rem0 < 0) {
2460 --zSig;
2461 doubleZSig -= 2;
2462 add128(rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1);
2463 }
2464 zSig |= ((rem0 | rem1) != 0);
2465 }
2466 return roundAndPackFloat64(0, zExp, zSig, status);
2467 }
2468
2469 /*----------------------------------------------------------------------------
2470 | Determine double-precision floating-point number class
2471 *----------------------------------------------------------------------------*/
2472
float64_class(float64 a)2473 float_class_t float64_class(float64 a)
2474 {
2475 Bit16s aExp = extractFloat64Exp(a);
2476 Bit64u aSig = extractFloat64Frac(a);
2477 int aSign = extractFloat64Sign(a);
2478
2479 if(aExp == 0x7FF) {
2480 if (aSig == 0)
2481 return (aSign) ? float_negative_inf : float_positive_inf;
2482
2483 return (aSig & BX_CONST64(0x0008000000000000)) ? float_QNaN : float_SNaN;
2484 }
2485
2486 if(aExp == 0) {
2487 if (aSig == 0)
2488 return float_zero;
2489 return float_denormal;
2490 }
2491
2492 return float_normalized;
2493 }
2494
2495 /*----------------------------------------------------------------------------
2496 | Compare between two double precision floating point numbers. Returns
2497 | 'float_relation_equal' if the operands are equal, 'float_relation_less' if
2498 | the value 'a' is less than the corresponding value `b',
2499 | 'float_relation_greater' if the value 'a' is greater than the corresponding
2500 | value `b', or 'float_relation_unordered' otherwise.
2501 *----------------------------------------------------------------------------*/
2502
float64_compare(float64 a,float64 b,int quiet,float_status_t & status)2503 int float64_compare(float64 a, float64 b, int quiet, float_status_t &status)
2504 {
2505 if (get_denormals_are_zeros(status)) {
2506 a = float64_denormal_to_zero(a);
2507 b = float64_denormal_to_zero(b);
2508 }
2509
2510 float_class_t aClass = float64_class(a);
2511 float_class_t bClass = float64_class(b);
2512
2513 if (aClass == float_SNaN || bClass == float_SNaN) {
2514 float_raise(status, float_flag_invalid);
2515 return float_relation_unordered;
2516 }
2517
2518 if (aClass == float_QNaN || bClass == float_QNaN) {
2519 if (! quiet) float_raise(status, float_flag_invalid);
2520 return float_relation_unordered;
2521 }
2522
2523 if (aClass == float_denormal || bClass == float_denormal) {
2524 float_raise(status, float_flag_denormal);
2525 }
2526
2527 if ((a == b) || ((Bit64u) ((a | b)<<1) == 0)) return float_relation_equal;
2528
2529 int aSign = extractFloat64Sign(a);
2530 int bSign = extractFloat64Sign(b);
2531 if (aSign != bSign)
2532 return (aSign) ? float_relation_less : float_relation_greater;
2533
2534 if (aSign ^ (a < b)) return float_relation_less;
2535 return float_relation_greater;
2536 }
2537
2538 /*----------------------------------------------------------------------------
2539 | Compare between two double precision floating point numbers and return the
2540 | smaller of them.
2541 *----------------------------------------------------------------------------*/
2542
float64_min(float64 a,float64 b,float_status_t & status)2543 float64 float64_min(float64 a, float64 b, float_status_t &status)
2544 {
2545 if (get_denormals_are_zeros(status)) {
2546 a = float64_denormal_to_zero(a);
2547 b = float64_denormal_to_zero(b);
2548 }
2549
2550 return (float64_compare(a, b, status) == float_relation_less) ? a : b;
2551 }
2552
2553 /*----------------------------------------------------------------------------
2554 | Compare between two double precision floating point numbers and return the
2555 | larger of them.
2556 *----------------------------------------------------------------------------*/
2557
float64_max(float64 a,float64 b,float_status_t & status)2558 float64 float64_max(float64 a, float64 b, float_status_t &status)
2559 {
2560 if (get_denormals_are_zeros(status)) {
2561 a = float64_denormal_to_zero(a);
2562 b = float64_denormal_to_zero(b);
2563 }
2564
2565 return (float64_compare(a, b, status) == float_relation_greater) ? a : b;
2566 }
2567
2568 /*----------------------------------------------------------------------------
2569 | Compare between two double precision floating point numbers and return the
2570 | smaller/larger of them. The operation is performed according to the IEC/IEEE
2571 | Standard for Binary Floating-Point Arithmetic.
2572 *----------------------------------------------------------------------------*/
2573
float64_minmax(float64 a,float64 b,int is_max,int is_abs,float_status_t & status)2574 float64 float64_minmax(float64 a, float64 b, int is_max, int is_abs, float_status_t &status)
2575 {
2576 if (get_denormals_are_zeros(status)) {
2577 a = float64_denormal_to_zero(a);
2578 b = float64_denormal_to_zero(b);
2579 }
2580
2581 if (float64_is_nan(a) || float64_is_nan(b)) {
2582 if (float64_is_signaling_nan(a)) {
2583 return propagateFloat64NaN(a, status);
2584 }
2585 if (float64_is_signaling_nan(b)) {
2586 return propagateFloat64NaN(b, status);
2587 }
2588 if (! float64_is_nan(b)) {
2589 if (float64_is_denormal(b))
2590 float_raise(status, float_flag_denormal);
2591 return b;
2592 }
2593 if (! float64_is_nan(a)) {
2594 if (float64_is_denormal(a))
2595 float_raise(status, float_flag_denormal);
2596 return a;
2597 }
2598 return propagateFloat64NaN(a, b, status);
2599 }
2600
2601 float64 tmp_a = a, tmp_b = b;
2602 if (is_abs) {
2603 tmp_a &= ~BX_CONST64(0x8000000000000000); // clear the sign bit
2604 tmp_b &= ~BX_CONST64(0x8000000000000000);
2605 }
2606
2607 int aSign = extractFloat64Sign(tmp_a);
2608 int bSign = extractFloat64Sign(tmp_b);
2609
2610 if (float64_is_denormal(a) || float64_is_denormal(b))
2611 float_raise(status, float_flag_denormal);
2612
2613 if (aSign != bSign) {
2614 if (! is_max) {
2615 return aSign ? a : b;
2616 } else {
2617 return aSign ? b : a;
2618 }
2619 } else {
2620 if (! is_max) {
2621 return (aSign ^ (tmp_a < tmp_b)) ? a : b;
2622 } else {
2623 return (aSign ^ (tmp_a < tmp_b)) ? b : a;
2624 }
2625 }
2626 }
2627
2628 #ifdef FLOATX80
2629
2630 /*----------------------------------------------------------------------------
2631 | Returns the result of converting the 32-bit two's complement integer `a'
2632 | to the extended double-precision floating-point format. The conversion
2633 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2634 | Arithmetic.
2635 *----------------------------------------------------------------------------*/
2636
int32_to_floatx80(Bit32s a)2637 floatx80 int32_to_floatx80(Bit32s a)
2638 {
2639 if (a == 0) return packFloatx80(0, 0, 0);
2640 int zSign = (a < 0);
2641 Bit32u absA = zSign ? -a : a;
2642 int shiftCount = countLeadingZeros32(absA) + 32;
2643 Bit64u zSig = absA;
2644 return packFloatx80(zSign, 0x403E - shiftCount, zSig<<shiftCount);
2645 }
2646
2647 /*----------------------------------------------------------------------------
2648 | Returns the result of converting the 64-bit two's complement integer `a'
2649 | to the extended double-precision floating-point format. The conversion
2650 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2651 | Arithmetic.
2652 *----------------------------------------------------------------------------*/
2653
int64_to_floatx80(Bit64s a)2654 floatx80 int64_to_floatx80(Bit64s a)
2655 {
2656 if (a == 0) return packFloatx80(0, 0, 0);
2657 int zSign = (a < 0);
2658 Bit64u absA = zSign ? -a : a;
2659 int shiftCount = countLeadingZeros64(absA);
2660 return packFloatx80(zSign, 0x403E - shiftCount, absA<<shiftCount);
2661 }
2662
2663 /*----------------------------------------------------------------------------
2664 | Returns the result of converting the single-precision floating-point value
2665 | `a' to the extended double-precision floating-point format. The conversion
2666 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2667 | Arithmetic.
2668 *----------------------------------------------------------------------------*/
2669
float32_to_floatx80(float32 a,float_status_t & status)2670 floatx80 float32_to_floatx80(float32 a, float_status_t &status)
2671 {
2672 Bit32u aSig = extractFloat32Frac(a);
2673 Bit16s aExp = extractFloat32Exp(a);
2674 int aSign = extractFloat32Sign(a);
2675 if (aExp == 0xFF) {
2676 if (aSig) return commonNaNToFloatx80(float32ToCommonNaN(a, status));
2677 return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2678 }
2679 if (aExp == 0) {
2680 if (aSig == 0) return packFloatx80(aSign, 0, 0);
2681 float_raise(status, float_flag_denormal);
2682 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2683 }
2684 aSig |= 0x00800000;
2685 return packFloatx80(aSign, aExp + 0x3F80, ((Bit64u) aSig)<<40);
2686 }
2687
2688 /*----------------------------------------------------------------------------
2689 | Returns the result of converting the double-precision floating-point value
2690 | `a' to the extended double-precision floating-point format. The conversion
2691 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2692 | Arithmetic.
2693 *----------------------------------------------------------------------------*/
2694
float64_to_floatx80(float64 a,float_status_t & status)2695 floatx80 float64_to_floatx80(float64 a, float_status_t &status)
2696 {
2697 Bit64u aSig = extractFloat64Frac(a);
2698 Bit16s aExp = extractFloat64Exp(a);
2699 int aSign = extractFloat64Sign(a);
2700
2701 if (aExp == 0x7FF) {
2702 if (aSig) return commonNaNToFloatx80(float64ToCommonNaN(a, status));
2703 return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2704 }
2705 if (aExp == 0) {
2706 if (aSig == 0) return packFloatx80(aSign, 0, 0);
2707 float_raise(status, float_flag_denormal);
2708 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2709 }
2710 return
2711 packFloatx80(
2712 aSign, aExp + 0x3C00, (aSig | BX_CONST64(0x0010000000000000))<<11);
2713 }
2714
2715 /*----------------------------------------------------------------------------
2716 | Returns the result of converting the extended double-precision floating-
2717 | point value `a' to the 32-bit two's complement integer format. The
2718 | conversion is performed according to the IEC/IEEE Standard for Binary
2719 | Floating-Point Arithmetic - which means in particular that the conversion
2720 | is rounded according to the current rounding mode. If `a' is a NaN or the
2721 | conversion overflows, the integer indefinite value is returned.
2722 *----------------------------------------------------------------------------*/
2723
floatx80_to_int32(floatx80 a,float_status_t & status)2724 Bit32s floatx80_to_int32(floatx80 a, float_status_t &status)
2725 {
2726 Bit64u aSig = extractFloatx80Frac(a);
2727 Bit32s aExp = extractFloatx80Exp(a);
2728 int aSign = extractFloatx80Sign(a);
2729
2730 // handle unsupported extended double-precision floating encodings
2731 if (floatx80_is_unsupported(a))
2732 {
2733 float_raise(status, float_flag_invalid);
2734 return int32_indefinite;
2735 }
2736
2737 if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1)) aSign = 0;
2738 int shiftCount = 0x4037 - aExp;
2739 if (shiftCount <= 0) shiftCount = 1;
2740 aSig = shift64RightJamming(aSig, shiftCount);
2741 return roundAndPackInt32(aSign, aSig, status);
2742 }
2743
2744 /*----------------------------------------------------------------------------
2745 | Returns the result of converting the extended double-precision floating-
2746 | point value `a' to the 32-bit two's complement integer format. The
2747 | conversion is performed according to the IEC/IEEE Standard for Binary
2748 | Floating-Point Arithmetic, except that the conversion is always rounded
2749 | toward zero. If `a' is a NaN or the conversion overflows, the integer
2750 | indefinite value is returned.
2751 *----------------------------------------------------------------------------*/
2752
floatx80_to_int32_round_to_zero(floatx80 a,float_status_t & status)2753 Bit32s floatx80_to_int32_round_to_zero(floatx80 a, float_status_t &status)
2754 {
2755 Bit32s aExp;
2756 Bit64u aSig, savedASig;
2757 Bit32s z;
2758 int shiftCount;
2759
2760 // handle unsupported extended double-precision floating encodings
2761 if (floatx80_is_unsupported(a))
2762 {
2763 float_raise(status, float_flag_invalid);
2764 return int32_indefinite;
2765 }
2766
2767 aSig = extractFloatx80Frac(a);
2768 aExp = extractFloatx80Exp(a);
2769 int aSign = extractFloatx80Sign(a);
2770
2771 if (aExp > 0x401E) {
2772 float_raise(status, float_flag_invalid);
2773 return (Bit32s)(int32_indefinite);
2774 }
2775 if (aExp < 0x3FFF) {
2776 if (aExp || aSig) float_raise(status, float_flag_inexact);
2777 return 0;
2778 }
2779 shiftCount = 0x403E - aExp;
2780 savedASig = aSig;
2781 aSig >>= shiftCount;
2782 z = (Bit32s) aSig;
2783 if (aSign) z = -z;
2784 if ((z < 0) ^ aSign) {
2785 float_raise(status, float_flag_invalid);
2786 return (Bit32s)(int32_indefinite);
2787 }
2788 if ((aSig<<shiftCount) != savedASig)
2789 {
2790 float_raise(status, float_flag_inexact);
2791 }
2792 return z;
2793 }
2794
2795 /*----------------------------------------------------------------------------
2796 | Returns the result of converting the extended double-precision floating-
2797 | point value `a' to the 64-bit two's complement integer format. The
2798 | conversion is performed according to the IEC/IEEE Standard for Binary
2799 | Floating-Point Arithmetic - which means in particular that the conversion
2800 | is rounded according to the current rounding mode. If `a' is a NaN or the
2801 | conversion overflows, the integer indefinite value is returned.
2802 *----------------------------------------------------------------------------*/
2803
floatx80_to_int64(floatx80 a,float_status_t & status)2804 Bit64s floatx80_to_int64(floatx80 a, float_status_t &status)
2805 {
2806 Bit32s aExp;
2807 Bit64u aSig, aSigExtra;
2808
2809 // handle unsupported extended double-precision floating encodings
2810 if (floatx80_is_unsupported(a))
2811 {
2812 float_raise(status, float_flag_invalid);
2813 return int64_indefinite;
2814 }
2815
2816 aSig = extractFloatx80Frac(a);
2817 aExp = extractFloatx80Exp(a);
2818 int aSign = extractFloatx80Sign(a);
2819
2820 int shiftCount = 0x403E - aExp;
2821 if (shiftCount <= 0)
2822 {
2823 if (shiftCount)
2824 {
2825 float_raise(status, float_flag_invalid);
2826 return (Bit64s)(int64_indefinite);
2827 }
2828 aSigExtra = 0;
2829 }
2830 else {
2831 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
2832 }
2833
2834 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
2835 }
2836
2837 /*----------------------------------------------------------------------------
2838 | Returns the result of converting the extended double-precision floating-
2839 | point value `a' to the 64-bit two's complement integer format. The
2840 | conversion is performed according to the IEC/IEEE Standard for Binary
2841 | Floating-Point Arithmetic, except that the conversion is always rounded
2842 | toward zero. If `a' is a NaN or the conversion overflows, the integer
2843 | indefinite value is returned.
2844 *----------------------------------------------------------------------------*/
2845
floatx80_to_int64_round_to_zero(floatx80 a,float_status_t & status)2846 Bit64s floatx80_to_int64_round_to_zero(floatx80 a, float_status_t &status)
2847 {
2848 int aSign;
2849 Bit32s aExp;
2850 Bit64u aSig;
2851 Bit64s z;
2852
2853 // handle unsupported extended double-precision floating encodings
2854 if (floatx80_is_unsupported(a))
2855 {
2856 float_raise(status, float_flag_invalid);
2857 return int64_indefinite;
2858 }
2859
2860 aSig = extractFloatx80Frac(a);
2861 aExp = extractFloatx80Exp(a);
2862 aSign = extractFloatx80Sign(a);
2863 int shiftCount = aExp - 0x403E;
2864 if (0 <= shiftCount) {
2865 aSig &= BX_CONST64(0x7FFFFFFFFFFFFFFF);
2866 if ((a.exp != 0xC03E) || aSig) {
2867 float_raise(status, float_flag_invalid);
2868 }
2869 return (Bit64s)(int64_indefinite);
2870 }
2871 else if (aExp < 0x3FFF) {
2872 if (aExp | aSig) float_raise(status, float_flag_inexact);
2873 return 0;
2874 }
2875 z = aSig>>(-shiftCount);
2876 if ((Bit64u) (aSig<<(shiftCount & 63))) {
2877 float_raise(status, float_flag_inexact);
2878 }
2879 if (aSign) z = -z;
2880 return z;
2881 }
2882
2883 /*----------------------------------------------------------------------------
2884 | Returns the result of converting the extended double-precision floating-
2885 | point value `a' to the single-precision floating-point format. The
2886 | conversion is performed according to the IEC/IEEE Standard for Binary
2887 | Floating-Point Arithmetic.
2888 *----------------------------------------------------------------------------*/
2889
floatx80_to_float32(floatx80 a,float_status_t & status)2890 float32 floatx80_to_float32(floatx80 a, float_status_t &status)
2891 {
2892 Bit64u aSig = extractFloatx80Frac(a);
2893 Bit32s aExp = extractFloatx80Exp(a);
2894 int aSign = extractFloatx80Sign(a);
2895
2896 // handle unsupported extended double-precision floating encodings
2897 if (floatx80_is_unsupported(a))
2898 {
2899 float_raise(status, float_flag_invalid);
2900 return float32_default_nan;
2901 }
2902
2903 if (aExp == 0x7FFF) {
2904 if ((Bit64u) (aSig<<1))
2905 return commonNaNToFloat32(floatx80ToCommonNaN(a, status));
2906
2907 return packFloat32(aSign, 0xFF, 0);
2908 }
2909 aSig = shift64RightJamming(aSig, 33);
2910 if (aExp || aSig) aExp -= 0x3F81;
2911 return roundAndPackFloat32(aSign, aExp, (Bit32u) aSig, status);
2912 }
2913
2914 /*----------------------------------------------------------------------------
2915 | Returns the result of converting the extended double-precision floating-
2916 | point value `a' to the double-precision floating-point format. The
2917 | conversion is performed according to the IEC/IEEE Standard for Binary
2918 | Floating-Point Arithmetic.
2919 *----------------------------------------------------------------------------*/
2920
floatx80_to_float64(floatx80 a,float_status_t & status)2921 float64 floatx80_to_float64(floatx80 a, float_status_t &status)
2922 {
2923 Bit32s aExp;
2924 Bit64u aSig, zSig;
2925
2926 // handle unsupported extended double-precision floating encodings
2927 if (floatx80_is_unsupported(a))
2928 {
2929 float_raise(status, float_flag_invalid);
2930 return float64_default_nan;
2931 }
2932
2933 aSig = extractFloatx80Frac(a);
2934 aExp = extractFloatx80Exp(a);
2935 int aSign = extractFloatx80Sign(a);
2936
2937 if (aExp == 0x7FFF) {
2938 if ((Bit64u) (aSig<<1)) {
2939 return commonNaNToFloat64(floatx80ToCommonNaN(a, status));
2940 }
2941 return packFloat64(aSign, 0x7FF, 0);
2942 }
2943 zSig = shift64RightJamming(aSig, 1);
2944 if (aExp || aSig) aExp -= 0x3C01;
2945 return roundAndPackFloat64(aSign, aExp, zSig, status);
2946 }
2947
2948 /*----------------------------------------------------------------------------
2949 | Rounds the extended double-precision floating-point value `a' to an integer,
2950 | and returns the result as an extended double-precision floating-point
2951 | value. The operation is performed according to the IEC/IEEE Standard for
2952 | Binary Floating-Point Arithmetic.
2953 *----------------------------------------------------------------------------*/
2954
floatx80_round_to_int(floatx80 a,float_status_t & status)2955 floatx80 floatx80_round_to_int(floatx80 a, float_status_t &status)
2956 {
2957 int aSign;
2958 Bit64u lastBitMask, roundBitsMask;
2959 int roundingMode = get_float_rounding_mode(status);
2960 floatx80 z;
2961
2962 // handle unsupported extended double-precision floating encodings
2963 if (floatx80_is_unsupported(a))
2964 {
2965 float_raise(status, float_flag_invalid);
2966 return floatx80_default_nan;
2967 }
2968
2969 Bit32s aExp = extractFloatx80Exp(a);
2970 Bit64u aSig = extractFloatx80Frac(a);
2971 if (0x403E <= aExp) {
2972 if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1)) {
2973 return propagateFloatx80NaN(a, status);
2974 }
2975 return a;
2976 }
2977 if (aExp < 0x3FFF) {
2978 if (aExp == 0) {
2979 if ((aSig<<1) == 0) return a;
2980 float_raise(status, float_flag_denormal);
2981 }
2982 float_raise(status, float_flag_inexact);
2983 aSign = extractFloatx80Sign(a);
2984 switch (roundingMode) {
2985 case float_round_nearest_even:
2986 if ((aExp == 0x3FFE) && (Bit64u) (aSig<<1)) {
2987 set_float_rounding_up(status);
2988 return packFloatx80(aSign, 0x3FFF, BX_CONST64(0x8000000000000000));
2989 }
2990 break;
2991 case float_round_down:
2992 if (aSign) {
2993 set_float_rounding_up(status);
2994 return packFloatx80(1, 0x3FFF, BX_CONST64(0x8000000000000000));
2995 }
2996 else {
2997 return packFloatx80(0, 0, 0);
2998 }
2999 case float_round_up:
3000 if (aSign) {
3001 return packFloatx80(1, 0, 0);
3002 }
3003 else {
3004 set_float_rounding_up(status);
3005 return packFloatx80(0, 0x3FFF, BX_CONST64(0x8000000000000000));
3006 }
3007 }
3008 return packFloatx80(aSign, 0, 0);
3009 }
3010 lastBitMask = 1;
3011 lastBitMask <<= 0x403E - aExp;
3012 roundBitsMask = lastBitMask - 1;
3013 z = a;
3014 if (roundingMode == float_round_nearest_even) {
3015 z.fraction += lastBitMask>>1;
3016 if ((z.fraction & roundBitsMask) == 0) z.fraction &= ~lastBitMask;
3017 }
3018 else if (roundingMode != float_round_to_zero) {
3019 if (extractFloatx80Sign(z) ^ (roundingMode == float_round_up))
3020 z.fraction += roundBitsMask;
3021 }
3022 z.fraction &= ~roundBitsMask;
3023 if (z.fraction == 0) {
3024 z.exp++;
3025 z.fraction = BX_CONST64(0x8000000000000000);
3026 }
3027 if (z.fraction != a.fraction) {
3028 float_raise(status, float_flag_inexact);
3029 if (z.fraction > a.fraction || z.exp > a.exp)
3030 set_float_rounding_up(status);
3031 }
3032 return z;
3033 }
3034
3035 /*----------------------------------------------------------------------------
3036 | Returns the result of adding the absolute values of the extended double-
3037 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3038 | negated before being returned. `zSign' is ignored if the result is a NaN.
3039 | The addition is performed according to the IEC/IEEE Standard for Binary
3040 | Floating-Point Arithmetic.
3041 *----------------------------------------------------------------------------*/
3042
addFloatx80Sigs(floatx80 a,floatx80 b,int zSign,float_status_t & status)3043 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status)
3044 {
3045 Bit32s aExp, bExp, zExp;
3046 Bit64u aSig, bSig, zSig0, zSig1;
3047
3048 // handle unsupported extended double-precision floating encodings
3049 if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3050 {
3051 float_raise(status, float_flag_invalid);
3052 return floatx80_default_nan;
3053 }
3054
3055 aSig = extractFloatx80Frac(a);
3056 aExp = extractFloatx80Exp(a);
3057 bSig = extractFloatx80Frac(b);
3058 bExp = extractFloatx80Exp(b);
3059
3060 if (aExp == 0x7FFF) {
3061 if ((Bit64u) (aSig<<1) || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
3062 return propagateFloatx80NaN(a, b, status);
3063 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3064 return a;
3065 }
3066 if (bExp == 0x7FFF) {
3067 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3068 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
3069 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3070 }
3071 if (aExp == 0) {
3072 if (aSig == 0) {
3073 if ((bExp == 0) && bSig) {
3074 float_raise(status, float_flag_denormal);
3075 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3076 }
3077 return roundAndPackFloatx80(get_float_rounding_precision(status),
3078 zSign, bExp, bSig, 0, status);
3079 }
3080 float_raise(status, float_flag_denormal);
3081 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3082 }
3083 if (bExp == 0) {
3084 if (bSig == 0)
3085 return roundAndPackFloatx80(get_float_rounding_precision(status),
3086 zSign, aExp, aSig, 0, status);
3087
3088 float_raise(status, float_flag_denormal);
3089 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3090 }
3091 Bit32s expDiff = aExp - bExp;
3092 zExp = aExp;
3093 if (0 < expDiff) {
3094 shift64ExtraRightJamming(bSig, 0, expDiff, &bSig, &zSig1);
3095 }
3096 else if (expDiff < 0) {
3097 shift64ExtraRightJamming(aSig, 0, -expDiff, &aSig, &zSig1);
3098 zExp = bExp;
3099 }
3100 else {
3101 zSig0 = aSig + bSig;
3102 zSig1 = 0;
3103 goto shiftRight1;
3104 }
3105 zSig0 = aSig + bSig;
3106 if ((Bit64s) zSig0 < 0) goto roundAndPack;
3107 shiftRight1:
3108 shift64ExtraRightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3109 zSig0 |= BX_CONST64(0x8000000000000000);
3110 zExp++;
3111 roundAndPack:
3112 return
3113 roundAndPackFloatx80(get_float_rounding_precision(status),
3114 zSign, zExp, zSig0, zSig1, status);
3115 }
3116
3117 /*----------------------------------------------------------------------------
3118 | Returns the result of subtracting the absolute values of the extended
3119 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3120 | difference is negated before being returned. `zSign' is ignored if the
3121 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3122 | Standard for Binary Floating-Point Arithmetic.
3123 *----------------------------------------------------------------------------*/
3124
subFloatx80Sigs(floatx80 a,floatx80 b,int zSign,float_status_t & status)3125 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status)
3126 {
3127 Bit32s aExp, bExp, zExp;
3128 Bit64u aSig, bSig, zSig0, zSig1;
3129
3130 // handle unsupported extended double-precision floating encodings
3131 if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3132 {
3133 float_raise(status, float_flag_invalid);
3134 return floatx80_default_nan;
3135 }
3136
3137 aSig = extractFloatx80Frac(a);
3138 aExp = extractFloatx80Exp(a);
3139 bSig = extractFloatx80Frac(b);
3140 bExp = extractFloatx80Exp(b);
3141
3142 if (aExp == 0x7FFF) {
3143 if ((Bit64u) (aSig<<1)) return propagateFloatx80NaN(a, b, status);
3144 if (bExp == 0x7FFF) {
3145 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3146 float_raise(status, float_flag_invalid);
3147 return floatx80_default_nan;
3148 }
3149 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3150 return a;
3151 }
3152 if (bExp == 0x7FFF) {
3153 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3154 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
3155 return packFloatx80(zSign ^ 1, 0x7FFF, BX_CONST64(0x8000000000000000));
3156 }
3157 if (aExp == 0) {
3158 if (aSig == 0) {
3159 if (bExp == 0) {
3160 if (bSig) {
3161 float_raise(status, float_flag_denormal);
3162 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3163 return roundAndPackFloatx80(get_float_rounding_precision(status),
3164 zSign ^ 1, bExp, bSig, 0, status);
3165 }
3166 return packFloatx80(get_float_rounding_mode(status) == float_round_down, 0, 0);
3167 }
3168 return roundAndPackFloatx80(get_float_rounding_precision(status),
3169 zSign ^ 1, bExp, bSig, 0, status);
3170 }
3171 float_raise(status, float_flag_denormal);
3172 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3173 }
3174 if (bExp == 0) {
3175 if (bSig == 0)
3176 return roundAndPackFloatx80(get_float_rounding_precision(status),
3177 zSign, aExp, aSig, 0, status);
3178
3179 float_raise(status, float_flag_denormal);
3180 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3181 }
3182 Bit32s expDiff = aExp - bExp;
3183 if (0 < expDiff) {
3184 shift128RightJamming(bSig, 0, expDiff, &bSig, &zSig1);
3185 goto aBigger;
3186 }
3187 if (expDiff < 0) {
3188 shift128RightJamming(aSig, 0, -expDiff, &aSig, &zSig1);
3189 goto bBigger;
3190 }
3191 zSig1 = 0;
3192 if (bSig < aSig) goto aBigger;
3193 if (aSig < bSig) goto bBigger;
3194 return packFloatx80(get_float_rounding_mode(status) == float_round_down, 0, 0);
3195 bBigger:
3196 sub128(bSig, 0, aSig, zSig1, &zSig0, &zSig1);
3197 zExp = bExp;
3198 zSign ^= 1;
3199 goto normalizeRoundAndPack;
3200 aBigger:
3201 sub128(aSig, 0, bSig, zSig1, &zSig0, &zSig1);
3202 zExp = aExp;
3203 normalizeRoundAndPack:
3204 return
3205 normalizeRoundAndPackFloatx80(get_float_rounding_precision(status),
3206 zSign, zExp, zSig0, zSig1, status);
3207 }
3208
3209 /*----------------------------------------------------------------------------
3210 | Returns the result of adding the extended double-precision floating-point
3211 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3212 | Standard for Binary Floating-Point Arithmetic.
3213 *----------------------------------------------------------------------------*/
3214
floatx80_add(floatx80 a,floatx80 b,float_status_t & status)3215 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status_t &status)
3216 {
3217 int aSign = extractFloatx80Sign(a);
3218 int bSign = extractFloatx80Sign(b);
3219
3220 if (aSign == bSign)
3221 return addFloatx80Sigs(a, b, aSign, status);
3222 else
3223 return subFloatx80Sigs(a, b, aSign, status);
3224 }
3225
3226 /*----------------------------------------------------------------------------
3227 | Returns the result of subtracting the extended double-precision floating-
3228 | point values `a' and `b'. The operation is performed according to the
3229 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3230 *----------------------------------------------------------------------------*/
3231
floatx80_sub(floatx80 a,floatx80 b,float_status_t & status)3232 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status_t &status)
3233 {
3234 int aSign = extractFloatx80Sign(a);
3235 int bSign = extractFloatx80Sign(b);
3236
3237 if (aSign == bSign)
3238 return subFloatx80Sigs(a, b, aSign, status);
3239 else
3240 return addFloatx80Sigs(a, b, aSign, status);
3241 }
3242
3243 /*----------------------------------------------------------------------------
3244 | Returns the result of multiplying the extended double-precision floating-
3245 | point values `a' and `b'. The operation is performed according to the
3246 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3247 *----------------------------------------------------------------------------*/
3248
floatx80_mul(floatx80 a,floatx80 b,float_status_t & status)3249 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status_t &status)
3250 {
3251 int aSign, bSign, zSign;
3252 Bit32s aExp, bExp, zExp;
3253 Bit64u aSig, bSig, zSig0, zSig1;
3254
3255 // handle unsupported extended double-precision floating encodings
3256 if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3257 {
3258 invalid:
3259 float_raise(status, float_flag_invalid);
3260 return floatx80_default_nan;
3261 }
3262
3263 aSig = extractFloatx80Frac(a);
3264 aExp = extractFloatx80Exp(a);
3265 aSign = extractFloatx80Sign(a);
3266 bSig = extractFloatx80Frac(b);
3267 bExp = extractFloatx80Exp(b);
3268 bSign = extractFloatx80Sign(b);
3269 zSign = aSign ^ bSign;
3270
3271 if (aExp == 0x7FFF) {
3272 if ((Bit64u) (aSig<<1) || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1))) {
3273 return propagateFloatx80NaN(a, b, status);
3274 }
3275 if (bExp == 0) {
3276 if (bSig == 0) goto invalid;
3277 float_raise(status, float_flag_denormal);
3278 }
3279 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3280 }
3281 if (bExp == 0x7FFF) {
3282 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3283 if (aExp == 0) {
3284 if (aSig == 0) goto invalid;
3285 float_raise(status, float_flag_denormal);
3286 }
3287 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3288 }
3289 if (aExp == 0) {
3290 if (aSig == 0) {
3291 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3292 return packFloatx80(zSign, 0, 0);
3293 }
3294 float_raise(status, float_flag_denormal);
3295 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3296 }
3297 if (bExp == 0) {
3298 if (bSig == 0) return packFloatx80(zSign, 0, 0);
3299 float_raise(status, float_flag_denormal);
3300 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3301 }
3302 zExp = aExp + bExp - 0x3FFE;
3303 mul64To128(aSig, bSig, &zSig0, &zSig1);
3304 if (0 < (Bit64s) zSig0) {
3305 shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
3306 --zExp;
3307 }
3308 return
3309 roundAndPackFloatx80(get_float_rounding_precision(status),
3310 zSign, zExp, zSig0, zSig1, status);
3311 }
3312
3313 /*----------------------------------------------------------------------------
3314 | Returns the result of dividing the extended double-precision floating-point
3315 | value `a' by the corresponding value `b'. The operation is performed
3316 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3317 *----------------------------------------------------------------------------*/
3318
floatx80_div(floatx80 a,floatx80 b,float_status_t & status)3319 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status_t &status)
3320 {
3321 int aSign, bSign, zSign;
3322 Bit32s aExp, bExp, zExp;
3323 Bit64u aSig, bSig, zSig0, zSig1;
3324 Bit64u rem0, rem1, rem2, term0, term1, term2;
3325
3326 // handle unsupported extended double-precision floating encodings
3327 if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
3328 {
3329 float_raise(status, float_flag_invalid);
3330 return floatx80_default_nan;
3331 }
3332
3333 aSig = extractFloatx80Frac(a);
3334 aExp = extractFloatx80Exp(a);
3335 aSign = extractFloatx80Sign(a);
3336 bSig = extractFloatx80Frac(b);
3337 bExp = extractFloatx80Exp(b);
3338 bSign = extractFloatx80Sign(b);
3339
3340 zSign = aSign ^ bSign;
3341 if (aExp == 0x7FFF) {
3342 if ((Bit64u) (aSig<<1)) return propagateFloatx80NaN(a, b, status);
3343 if (bExp == 0x7FFF) {
3344 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3345 float_raise(status, float_flag_invalid);
3346 return floatx80_default_nan;
3347 }
3348 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
3349 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3350 }
3351 if (bExp == 0x7FFF) {
3352 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
3353 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
3354 return packFloatx80(zSign, 0, 0);
3355 }
3356 if (bExp == 0) {
3357 if (bSig == 0) {
3358 if ((aExp | aSig) == 0) {
3359 float_raise(status, float_flag_invalid);
3360 return floatx80_default_nan;
3361 }
3362 float_raise(status, float_flag_divbyzero);
3363 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3364 }
3365 float_raise(status, float_flag_denormal);
3366 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
3367 }
3368 if (aExp == 0) {
3369 if (aSig == 0) return packFloatx80(zSign, 0, 0);
3370 float_raise(status, float_flag_denormal);
3371 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3372 }
3373 zExp = aExp - bExp + 0x3FFE;
3374 rem1 = 0;
3375 if (bSig <= aSig) {
3376 shift128Right(aSig, 0, 1, &aSig, &rem1);
3377 ++zExp;
3378 }
3379 zSig0 = estimateDiv128To64(aSig, rem1, bSig);
3380 mul64To128(bSig, zSig0, &term0, &term1);
3381 sub128(aSig, rem1, term0, term1, &rem0, &rem1);
3382 while ((Bit64s) rem0 < 0) {
3383 --zSig0;
3384 add128(rem0, rem1, 0, bSig, &rem0, &rem1);
3385 }
3386 zSig1 = estimateDiv128To64(rem1, 0, bSig);
3387 if ((Bit64u) (zSig1<<1) <= 8) {
3388 mul64To128(bSig, zSig1, &term1, &term2);
3389 sub128(rem1, 0, term1, term2, &rem1, &rem2);
3390 while ((Bit64s) rem1 < 0) {
3391 --zSig1;
3392 add128(rem1, rem2, 0, bSig, &rem1, &rem2);
3393 }
3394 zSig1 |= ((rem1 | rem2) != 0);
3395 }
3396 return
3397 roundAndPackFloatx80(get_float_rounding_precision(status),
3398 zSign, zExp, zSig0, zSig1, status);
3399 }
3400
3401 /*----------------------------------------------------------------------------
3402 | Returns the square root of the extended double-precision floating-point
3403 | value `a'. The operation is performed according to the IEC/IEEE Standard
3404 | for Binary Floating-Point Arithmetic.
3405 *----------------------------------------------------------------------------*/
3406
floatx80_sqrt(floatx80 a,float_status_t & status)3407 floatx80 floatx80_sqrt(floatx80 a, float_status_t &status)
3408 {
3409 int aSign;
3410 Bit32s aExp, zExp;
3411 Bit64u aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3412 Bit64u rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3413
3414 // handle unsupported extended double-precision floating encodings
3415 if (floatx80_is_unsupported(a))
3416 {
3417 float_raise(status, float_flag_invalid);
3418 return floatx80_default_nan;
3419 }
3420
3421 aSig0 = extractFloatx80Frac(a);
3422 aExp = extractFloatx80Exp(a);
3423 aSign = extractFloatx80Sign(a);
3424 if (aExp == 0x7FFF) {
3425 if ((Bit64u) (aSig0<<1)) return propagateFloatx80NaN(a, status);
3426 if (! aSign) return a;
3427 float_raise(status, float_flag_invalid);
3428 return floatx80_default_nan;
3429 }
3430 if (aSign) {
3431 if ((aExp | aSig0) == 0) return a;
3432 float_raise(status, float_flag_invalid);
3433 return floatx80_default_nan;
3434 }
3435 if (aExp == 0) {
3436 if (aSig0 == 0) return packFloatx80(0, 0, 0);
3437 float_raise(status, float_flag_denormal);
3438 normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
3439 }
3440 zExp = ((aExp - 0x3FFF)>>1) + 0x3FFF;
3441 zSig0 = estimateSqrt32(aExp, aSig0>>32);
3442 shift128Right(aSig0, 0, 2 + (aExp & 1), &aSig0, &aSig1);
3443 zSig0 = estimateDiv128To64(aSig0, aSig1, zSig0<<32) + (zSig0<<30);
3444 doubleZSig0 = zSig0<<1;
3445 mul64To128(zSig0, zSig0, &term0, &term1);
3446 sub128(aSig0, aSig1, term0, term1, &rem0, &rem1);
3447 while ((Bit64s) rem0 < 0) {
3448 --zSig0;
3449 doubleZSig0 -= 2;
3450 add128(rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1);
3451 }
3452 zSig1 = estimateDiv128To64(rem1, 0, doubleZSig0);
3453 if ((zSig1 & BX_CONST64(0x3FFFFFFFFFFFFFFF)) <= 5) {
3454 if (zSig1 == 0) zSig1 = 1;
3455 mul64To128(doubleZSig0, zSig1, &term1, &term2);
3456 sub128(rem1, 0, term1, term2, &rem1, &rem2);
3457 mul64To128(zSig1, zSig1, &term2, &term3);
3458 sub192(rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3);
3459 while ((Bit64s) rem1 < 0) {
3460 --zSig1;
3461 shortShift128Left(0, zSig1, 1, &term2, &term3);
3462 term3 |= 1;
3463 term2 |= doubleZSig0;
3464 add192(rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3);
3465 }
3466 zSig1 |= ((rem1 | rem2 | rem3) != 0);
3467 }
3468 shortShift128Left(0, zSig1, 1, &zSig0, &zSig1);
3469 zSig0 |= doubleZSig0;
3470 return
3471 roundAndPackFloatx80(get_float_rounding_precision(status),
3472 0, zExp, zSig0, zSig1, status);
3473 }
3474
3475 #endif
3476
3477 #ifdef FLOAT128
3478
3479 /*----------------------------------------------------------------------------
3480 | Returns the result of converting the extended double-precision floating-
3481 | point value `a' to the quadruple-precision floating-point format. The
3482 | conversion is performed according to the IEC/IEEE Standard for Binary
3483 | Floating-Point Arithmetic.
3484 *----------------------------------------------------------------------------*/
3485
floatx80_to_float128(floatx80 a,float_status_t & status)3486 float128 floatx80_to_float128(floatx80 a, float_status_t &status)
3487 {
3488 Bit64u zSig0, zSig1;
3489
3490 Bit64u aSig = extractFloatx80Frac(a);
3491 Bit32s aExp = extractFloatx80Exp(a);
3492 int aSign = extractFloatx80Sign(a);
3493
3494 if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1))
3495 return commonNaNToFloat128(floatx80ToCommonNaN(a, status));
3496
3497 shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1);
3498 return packFloat128(aSign, aExp, zSig0, zSig1);
3499 }
3500
3501 /*----------------------------------------------------------------------------
3502 | Returns the result of converting the quadruple-precision floating-point
3503 | value `a' to the extended double-precision floating-point format. The
3504 | conversion is performed according to the IEC/IEEE Standard for Binary
3505 | Floating-Point Arithmetic.
3506 *----------------------------------------------------------------------------*/
3507
float128_to_floatx80(float128 a,float_status_t & status)3508 floatx80 float128_to_floatx80(float128 a, float_status_t &status)
3509 {
3510 Bit32s aExp;
3511 Bit64u aSig0, aSig1;
3512
3513 aSig1 = extractFloat128Frac1(a);
3514 aSig0 = extractFloat128Frac0(a);
3515 aExp = extractFloat128Exp(a);
3516 int aSign = extractFloat128Sign(a);
3517
3518 if (aExp == 0x7FFF) {
3519 if (aSig0 | aSig1)
3520 return commonNaNToFloatx80(float128ToCommonNaN(a, status));
3521
3522 return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3523 }
3524
3525 if (aExp == 0) {
3526 if ((aSig0 | aSig1) == 0) return packFloatx80(aSign, 0, 0);
3527 float_raise(status, float_flag_denormal);
3528 normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3529 }
3530 else aSig0 |= BX_CONST64(0x0001000000000000);
3531
3532 shortShift128Left(aSig0, aSig1, 15, &aSig0, &aSig1);
3533 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
3534 }
3535
3536 /*----------------------------------------------------------------------------
3537 | Returns the result of multiplying the extended double-precision floating-
3538 | point value `a' and quadruple-precision floating point value `b'. The
3539 | operation is performed according to the IEC/IEEE Standard for Binary
3540 | Floating-Point Arithmetic.
3541 *----------------------------------------------------------------------------*/
3542
floatx80_mul(floatx80 a,float128 b,float_status_t & status)3543 floatx80 floatx80_mul(floatx80 a, float128 b, float_status_t &status)
3544 {
3545 Bit32s aExp, bExp, zExp;
3546 Bit64u aSig, bSig0, bSig1, zSig0, zSig1, zSig2;
3547 int aSign, bSign, zSign;
3548
3549 // handle unsupported extended double-precision floating encodings
3550 if (floatx80_is_unsupported(a))
3551 {
3552 invalid:
3553 float_raise(status, float_flag_invalid);
3554 return floatx80_default_nan;
3555 }
3556
3557 aSig = extractFloatx80Frac(a);
3558 aExp = extractFloatx80Exp(a);
3559 aSign = extractFloatx80Sign(a);
3560 bSig0 = extractFloat128Frac0(b);
3561 bSig1 = extractFloat128Frac1(b);
3562 bExp = extractFloat128Exp(b);
3563 bSign = extractFloat128Sign(b);
3564
3565 zSign = aSign ^ bSign;
3566
3567 if (aExp == 0x7FFF) {
3568 if ((Bit64u) (aSig<<1)
3569 || ((bExp == 0x7FFF) && (bSig0 | bSig1)))
3570 {
3571 floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status));
3572 return propagateFloatx80NaN(a, r, status);
3573 }
3574 if (bExp == 0) {
3575 if ((bSig0 | bSig1) == 0) goto invalid;
3576 float_raise(status, float_flag_denormal);
3577 }
3578 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3579 }
3580 if (bExp == 0x7FFF) {
3581 if (bSig0 | bSig1) {
3582 floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status));
3583 return propagateFloatx80NaN(a, r, status);
3584 }
3585 if (aExp == 0) {
3586 if (aSig == 0) goto invalid;
3587 float_raise(status, float_flag_denormal);
3588 }
3589 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
3590 }
3591 if (aExp == 0) {
3592 if (aSig == 0) {
3593 if ((bExp == 0) && (bSig0 | bSig1)) float_raise(status, float_flag_denormal);
3594 return packFloatx80(zSign, 0, 0);
3595 }
3596 float_raise(status, float_flag_denormal);
3597 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
3598 }
3599 if (bExp == 0) {
3600 if ((bSig0 | bSig1) == 0) return packFloatx80(zSign, 0, 0);
3601 float_raise(status, float_flag_denormal);
3602 normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3603 }
3604 else bSig0 |= BX_CONST64(0x0001000000000000);
3605
3606 zExp = aExp + bExp - 0x3FFE;
3607 shortShift128Left(bSig0, bSig1, 15, &bSig0, &bSig1);
3608 mul128By64To192(bSig0, bSig1, aSig, &zSig0, &zSig1, &zSig2);
3609 if (0 < (Bit64s) zSig0) {
3610 shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
3611 --zExp;
3612 }
3613 return
3614 roundAndPackFloatx80(get_float_rounding_precision(status),
3615 zSign, zExp, zSig0, zSig1, status);
3616 }
3617
3618 /*----------------------------------------------------------------------------
3619 | Returns the result of adding the absolute values of the quadruple-precision
3620 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3621 | before being returned. `zSign' is ignored if the result is a NaN.
3622 | The addition is performed according to the IEC/IEEE Standard for Binary
3623 | Floating-Point Arithmetic.
3624 *----------------------------------------------------------------------------*/
3625
addFloat128Sigs(float128 a,float128 b,int zSign,float_status_t & status)3626 static float128 addFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status)
3627 {
3628 Bit32s aExp, bExp, zExp;
3629 Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
3630 Bit32s expDiff;
3631
3632 aSig1 = extractFloat128Frac1(a);
3633 aSig0 = extractFloat128Frac0(a);
3634 aExp = extractFloat128Exp(a);
3635 bSig1 = extractFloat128Frac1(b);
3636 bSig0 = extractFloat128Frac0(b);
3637 bExp = extractFloat128Exp(b);
3638 expDiff = aExp - bExp;
3639
3640 if (0 < expDiff) {
3641 if (aExp == 0x7FFF) {
3642 if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3643 return a;
3644 }
3645 if (bExp == 0) --expDiff;
3646 else bSig0 |= BX_CONST64(0x0001000000000000);
3647 shift128ExtraRightJamming(bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2);
3648 zExp = aExp;
3649 }
3650 else if (expDiff < 0) {
3651 if (bExp == 0x7FFF) {
3652 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3653 return packFloat128(zSign, 0x7FFF, 0, 0);
3654 }
3655 if (aExp == 0) ++expDiff;
3656 else aSig0 |= BX_CONST64(0x0001000000000000);
3657 shift128ExtraRightJamming(aSig0, aSig1, 0, -expDiff, &aSig0, &aSig1, &zSig2);
3658 zExp = bExp;
3659 }
3660 else {
3661 if (aExp == 0x7FFF) {
3662 if (aSig0 | aSig1 | bSig0 | bSig1)
3663 return propagateFloat128NaN(a, b, status);
3664
3665 return a;
3666 }
3667 add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3668 if (aExp == 0) return packFloat128(zSign, 0, zSig0, zSig1);
3669 zSig2 = 0;
3670 zSig0 |= BX_CONST64(0x0002000000000000);
3671 zExp = aExp;
3672 goto shiftRight1;
3673 }
3674 aSig0 |= BX_CONST64(0x0001000000000000);
3675 add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3676 --zExp;
3677 if (zSig0 < BX_CONST64(0x0002000000000000)) goto roundAndPack;
3678 ++zExp;
3679 shiftRight1:
3680 shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2);
3681 roundAndPack:
3682 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3683 }
3684
3685 /*----------------------------------------------------------------------------
3686 | Returns the result of subtracting the absolute values of the quadruple-
3687 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3688 | difference is negated before being returned. `zSign' is ignored if the
3689 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3690 | Standard for Binary Floating-Point Arithmetic.
3691 *----------------------------------------------------------------------------*/
3692
subFloat128Sigs(float128 a,float128 b,int zSign,float_status_t & status)3693 static float128 subFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status)
3694 {
3695 Bit32s aExp, bExp, zExp;
3696 Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
3697 Bit32s expDiff;
3698
3699 aSig1 = extractFloat128Frac1(a);
3700 aSig0 = extractFloat128Frac0(a);
3701 aExp = extractFloat128Exp(a);
3702 bSig1 = extractFloat128Frac1(b);
3703 bSig0 = extractFloat128Frac0(b);
3704 bExp = extractFloat128Exp(b);
3705
3706 expDiff = aExp - bExp;
3707 shortShift128Left(aSig0, aSig1, 14, &aSig0, &aSig1);
3708 shortShift128Left(bSig0, bSig1, 14, &bSig0, &bSig1);
3709 if (0 < expDiff) goto aExpBigger;
3710 if (expDiff < 0) goto bExpBigger;
3711 if (aExp == 0x7FFF) {
3712 if (aSig0 | aSig1 | bSig0 | bSig1)
3713 return propagateFloat128NaN(a, b, status);
3714
3715 float_raise(status, float_flag_invalid);
3716 return float128_default_nan;
3717 }
3718 if (aExp == 0) {
3719 aExp = 1;
3720 bExp = 1;
3721 }
3722 if (bSig0 < aSig0) goto aBigger;
3723 if (aSig0 < bSig0) goto bBigger;
3724 if (bSig1 < aSig1) goto aBigger;
3725 if (aSig1 < bSig1) goto bBigger;
3726 return packFloat128(0, 0);
3727
3728 bExpBigger:
3729 if (bExp == 0x7FFF) {
3730 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3731 return packFloat128(zSign ^ 1, 0x7FFF, 0, 0);
3732 }
3733 if (aExp == 0) ++expDiff;
3734 else {
3735 aSig0 |= BX_CONST64(0x4000000000000000);
3736 }
3737 shift128RightJamming(aSig0, aSig1, - expDiff, &aSig0, &aSig1);
3738 bSig0 |= BX_CONST64(0x4000000000000000);
3739 bBigger:
3740 sub128(bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1);
3741 zExp = bExp;
3742 zSign ^= 1;
3743 goto normalizeRoundAndPack;
3744 aExpBigger:
3745 if (aExp == 0x7FFF) {
3746 if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3747 return a;
3748 }
3749 if (bExp == 0) --expDiff;
3750 else {
3751 bSig0 |= BX_CONST64(0x4000000000000000);
3752 }
3753 shift128RightJamming(bSig0, bSig1, expDiff, &bSig0, &bSig1);
3754 aSig0 |= BX_CONST64(0x4000000000000000);
3755 aBigger:
3756 sub128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3757 zExp = aExp;
3758 normalizeRoundAndPack:
3759 --zExp;
3760 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, status);
3761 }
3762
3763 /*----------------------------------------------------------------------------
3764 | Returns the result of adding the quadruple-precision floating-point values
3765 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3766 | for Binary Floating-Point Arithmetic.
3767 *----------------------------------------------------------------------------*/
3768
float128_add(float128 a,float128 b,float_status_t & status)3769 float128 float128_add(float128 a, float128 b, float_status_t &status)
3770 {
3771 int aSign = extractFloat128Sign(a);
3772 int bSign = extractFloat128Sign(b);
3773
3774 if (aSign == bSign) {
3775 return addFloat128Sigs(a, b, aSign, status);
3776 }
3777 else {
3778 return subFloat128Sigs(a, b, aSign, status);
3779 }
3780 }
3781
3782 /*----------------------------------------------------------------------------
3783 | Returns the result of subtracting the quadruple-precision floating-point
3784 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3785 | Standard for Binary Floating-Point Arithmetic.
3786 *----------------------------------------------------------------------------*/
3787
float128_sub(float128 a,float128 b,float_status_t & status)3788 float128 float128_sub(float128 a, float128 b, float_status_t &status)
3789 {
3790 int aSign = extractFloat128Sign(a);
3791 int bSign = extractFloat128Sign(b);
3792
3793 if (aSign == bSign) {
3794 return subFloat128Sigs(a, b, aSign, status);
3795 }
3796 else {
3797 return addFloat128Sigs(a, b, aSign, status);
3798 }
3799 }
3800
3801 /*----------------------------------------------------------------------------
3802 | Returns the result of multiplying the quadruple-precision floating-point
3803 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3804 | Standard for Binary Floating-Point Arithmetic.
3805 *----------------------------------------------------------------------------*/
3806
float128_mul(float128 a,float128 b,float_status_t & status)3807 float128 float128_mul(float128 a, float128 b, float_status_t &status)
3808 {
3809 int aSign, bSign, zSign;
3810 Bit32s aExp, bExp, zExp;
3811 Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
3812
3813 aSig1 = extractFloat128Frac1(a);
3814 aSig0 = extractFloat128Frac0(a);
3815 aExp = extractFloat128Exp(a);
3816 aSign = extractFloat128Sign(a);
3817 bSig1 = extractFloat128Frac1(b);
3818 bSig0 = extractFloat128Frac0(b);
3819 bExp = extractFloat128Exp(b);
3820 bSign = extractFloat128Sign(b);
3821
3822 zSign = aSign ^ bSign;
3823 if (aExp == 0x7FFF) {
3824 if ((aSig0 | aSig1) || ((bExp == 0x7FFF) && (bSig0 | bSig1))) {
3825 return propagateFloat128NaN(a, b, status);
3826 }
3827 if ((bExp | bSig0 | bSig1) == 0) {
3828 float_raise(status, float_flag_invalid);
3829 return float128_default_nan;
3830 }
3831 return packFloat128(zSign, 0x7FFF, 0, 0);
3832 }
3833 if (bExp == 0x7FFF) {
3834 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3835 if ((aExp | aSig0 | aSig1) == 0) {
3836 float_raise(status, float_flag_invalid);
3837 return float128_default_nan;
3838 }
3839 return packFloat128(zSign, 0x7FFF, 0, 0);
3840 }
3841 if (aExp == 0) {
3842 if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3843 float_raise(status, float_flag_denormal);
3844 normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3845 }
3846 if (bExp == 0) {
3847 if ((bSig0 | bSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3848 float_raise(status, float_flag_denormal);
3849 normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3850 }
3851 zExp = aExp + bExp - 0x4000;
3852 aSig0 |= BX_CONST64(0x0001000000000000);
3853 shortShift128Left(bSig0, bSig1, 16, &bSig0, &bSig1);
3854 mul128To256(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3);
3855 add128(zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1);
3856 zSig2 |= (zSig3 != 0);
3857 if (BX_CONST64(0x0002000000000000) <= zSig0) {
3858 shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2);
3859 ++zExp;
3860 }
3861 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3862 }
3863
3864 /*----------------------------------------------------------------------------
3865 | Returns the result of dividing the quadruple-precision floating-point value
3866 | `a' by the corresponding value `b'. The operation is performed according to
3867 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3868 *----------------------------------------------------------------------------*/
3869
float128_div(float128 a,float128 b,float_status_t & status)3870 float128 float128_div(float128 a, float128 b, float_status_t &status)
3871 {
3872 int aSign, bSign, zSign;
3873 Bit32s aExp, bExp, zExp;
3874 Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
3875 Bit64u rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3876
3877 aSig1 = extractFloat128Frac1(a);
3878 aSig0 = extractFloat128Frac0(a);
3879 aExp = extractFloat128Exp(a);
3880 aSign = extractFloat128Sign(a);
3881 bSig1 = extractFloat128Frac1(b);
3882 bSig0 = extractFloat128Frac0(b);
3883 bExp = extractFloat128Exp(b);
3884 bSign = extractFloat128Sign(b);
3885
3886 zSign = aSign ^ bSign;
3887 if (aExp == 0x7FFF) {
3888 if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3889 if (bExp == 0x7FFF) {
3890 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3891 float_raise(status, float_flag_invalid);
3892 return float128_default_nan;
3893 }
3894 return packFloat128(zSign, 0x7FFF, 0, 0);
3895 }
3896 if (bExp == 0x7FFF) {
3897 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3898 return packFloat128(zSign, 0, 0, 0);
3899 }
3900 if (bExp == 0) {
3901 if ((bSig0 | bSig1) == 0) {
3902 if ((aExp | aSig0 | aSig1) == 0) {
3903 float_raise(status, float_flag_invalid);
3904 return float128_default_nan;
3905 }
3906 float_raise(status, float_flag_divbyzero);
3907 return packFloat128(zSign, 0x7FFF, 0, 0);
3908 }
3909 float_raise(status, float_flag_denormal);
3910 normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3911 }
3912 if (aExp == 0) {
3913 if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3914 float_raise(status, float_flag_denormal);
3915 normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3916 }
3917 zExp = aExp - bExp + 0x3FFD;
3918 shortShift128Left(
3919 aSig0 | BX_CONST64(0x0001000000000000), aSig1, 15, &aSig0, &aSig1);
3920 shortShift128Left(
3921 bSig0 | BX_CONST64(0x0001000000000000), bSig1, 15, &bSig0, &bSig1);
3922 if (le128(bSig0, bSig1, aSig0, aSig1)) {
3923 shift128Right(aSig0, aSig1, 1, &aSig0, &aSig1);
3924 ++zExp;
3925 }
3926 zSig0 = estimateDiv128To64(aSig0, aSig1, bSig0);
3927 mul128By64To192(bSig0, bSig1, zSig0, &term0, &term1, &term2);
3928 sub192(aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2);
3929 while ((Bit64s) rem0 < 0) {
3930 --zSig0;
3931 add192(rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2);
3932 }
3933 zSig1 = estimateDiv128To64(rem1, rem2, bSig0);
3934 if ((zSig1 & 0x3FFF) <= 4) {
3935 mul128By64To192(bSig0, bSig1, zSig1, &term1, &term2, &term3);
3936 sub192(rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3);
3937 while ((Bit64s) rem1 < 0) {
3938 --zSig1;
3939 add192(rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3);
3940 }
3941 zSig1 |= ((rem1 | rem2 | rem3) != 0);
3942 }
3943 shift128ExtraRightJamming(zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2);
3944 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3945 }
3946
3947 /*----------------------------------------------------------------------------
3948 | Returns the result of converting the 64-bit two's complement integer `a' to
3949 | the quadruple-precision floating-point format. The conversion is performed
3950 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3951 *----------------------------------------------------------------------------*/
3952
int64_to_float128(Bit64s a)3953 float128 int64_to_float128(Bit64s a)
3954 {
3955 Bit64u zSig0, zSig1;
3956
3957 if (a == 0) return packFloat128(0, 0, 0, 0);
3958 int zSign = (a < 0);
3959 Bit64u absA = zSign ? - a : a;
3960 Bit8u shiftCount = countLeadingZeros64(absA) + 49;
3961 Bit32s zExp = 0x406E - shiftCount;
3962 if (64 <= shiftCount) {
3963 zSig1 = 0;
3964 zSig0 = absA;
3965 shiftCount -= 64;
3966 }
3967 else {
3968 zSig1 = absA;
3969 zSig0 = 0;
3970 }
3971 shortShift128Left(zSig0, zSig1, shiftCount, &zSig0, &zSig1);
3972 return packFloat128(zSign, zExp, zSig0, zSig1);
3973 }
3974
3975 #endif
3976