1 /* $NetBSD: softfloat.c,v 1.2 2002/12/05 17:12:06 thorpej Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /* 18 =============================================================================== 19 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* If you need this in a boot program, you have bigger problems... */ 48 #ifndef _STANDALONE 49 50 #include <sys/cdefs.h> 51 #if defined(LIBC_SCCS) && !defined(lint) 52 __RCSID("$NetBSD: softfloat.c,v 1.2 2002/12/05 17:12:06 thorpej Exp $"); 53 #endif /* LIBC_SCCS and not lint */ 54 55 #ifdef SOFTFLOAT_FOR_GCC 56 #include "softfloat-for-gcc.h" 57 #endif 58 59 #include "milieu.h" 60 #include "softfloat.h" 61 62 /* 63 * Conversions between floats as stored in memory and floats as 64 * SoftFloat uses them 65 */ 66 #ifndef FLOAT64_DEMANGLE 67 #define FLOAT64_DEMANGLE(a) (a) 68 #endif 69 #ifndef FLOAT64_MANGLE 70 #define FLOAT64_MANGLE(a) (a) 71 #endif 72 73 /* 74 ------------------------------------------------------------------------------- 75 Floating-point rounding mode, extended double-precision rounding precision, 76 and exception flags. 77 ------------------------------------------------------------------------------- 78 */ 79 80 /* 81 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday. 82 * Right now, it does not. I've removed all other dynamic global 83 * variables. [ross] 84 */ 85 #ifdef FLOATX80 86 int8 floatx80_rounding_precision = 80; 87 #endif 88 89 /* 90 ------------------------------------------------------------------------------- 91 Primitive arithmetic functions, including multi-word arithmetic, and 92 division and square root approximations. (Can be specialized to target if 93 desired.) 94 ------------------------------------------------------------------------------- 95 */ 96 #include "softfloat-macros.h" 97 98 /* 99 ------------------------------------------------------------------------------- 100 Functions and definitions to determine: (1) whether tininess for underflow 101 is detected before or after rounding by default, (2) what (if anything) 102 happens when exceptions are raised, (3) how signaling NaNs are distinguished 103 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 104 are propagated from function inputs to output. These details are target- 105 specific. 106 ------------------------------------------------------------------------------- 107 */ 108 #include "softfloat-specialize.h" 109 110 #ifndef SOFTFLOAT_FOR_GCC /* Not used */ 111 /* 112 ------------------------------------------------------------------------------- 113 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 114 and 7, and returns the properly rounded 32-bit integer corresponding to the 115 input. If `zSign' is 1, the input is negated before being converted to an 116 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 117 is simply rounded to an integer, with the inexact exception raised if the 118 input cannot be represented exactly as an integer. However, if the fixed- 119 point input is too large, the invalid exception is raised and the largest 120 positive or negative integer is returned. 121 ------------------------------------------------------------------------------- 122 */ 123 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 124 { 125 int8 roundingMode; 126 flag roundNearestEven; 127 int8 roundIncrement, roundBits; 128 int32 z; 129 130 roundingMode = float_rounding_mode(); 131 roundNearestEven = ( roundingMode == float_round_nearest_even ); 132 roundIncrement = 0x40; 133 if ( ! roundNearestEven ) { 134 if ( roundingMode == float_round_to_zero ) { 135 roundIncrement = 0; 136 } 137 else { 138 roundIncrement = 0x7F; 139 if ( zSign ) { 140 if ( roundingMode == float_round_up ) roundIncrement = 0; 141 } 142 else { 143 if ( roundingMode == float_round_down ) roundIncrement = 0; 144 } 145 } 146 } 147 roundBits = absZ & 0x7F; 148 absZ = ( absZ + roundIncrement )>>7; 149 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 150 z = absZ; 151 if ( zSign ) z = - z; 152 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 153 float_raise( float_flag_invalid ); 154 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 155 } 156 if ( roundBits ) float_set_inexact(); 157 return z; 158 159 } 160 161 /* 162 ------------------------------------------------------------------------------- 163 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 164 `absZ1', with binary point between bits 63 and 64 (between the input words), 165 and returns the properly rounded 64-bit integer corresponding to the input. 166 If `zSign' is 1, the input is negated before being converted to an integer. 167 Ordinarily, the fixed-point input is simply rounded to an integer, with 168 the inexact exception raised if the input cannot be represented exactly as 169 an integer. However, if the fixed-point input is too large, the invalid 170 exception is raised and the largest positive or negative integer is 171 returned. 172 ------------------------------------------------------------------------------- 173 */ 174 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 175 { 176 int8 roundingMode; 177 flag roundNearestEven, increment; 178 int64 z; 179 180 roundingMode = float_rounding_mode(); 181 roundNearestEven = ( roundingMode == float_round_nearest_even ); 182 increment = ( (sbits64) absZ1 < 0 ); 183 if ( ! roundNearestEven ) { 184 if ( roundingMode == float_round_to_zero ) { 185 increment = 0; 186 } 187 else { 188 if ( zSign ) { 189 increment = ( roundingMode == float_round_down ) && absZ1; 190 } 191 else { 192 increment = ( roundingMode == float_round_up ) && absZ1; 193 } 194 } 195 } 196 if ( increment ) { 197 ++absZ0; 198 if ( absZ0 == 0 ) goto overflow; 199 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 200 } 201 z = absZ0; 202 if ( zSign ) z = - z; 203 if ( z && ( ( z < 0 ) ^ zSign ) ) { 204 overflow: 205 float_raise( float_flag_invalid ); 206 return 207 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 208 : LIT64( 0x7FFFFFFFFFFFFFFF ); 209 } 210 if ( absZ1 ) float_set_inexact(); 211 return z; 212 213 } 214 #endif 215 216 /* 217 ------------------------------------------------------------------------------- 218 Returns the fraction bits of the single-precision floating-point value `a'. 219 ------------------------------------------------------------------------------- 220 */ 221 INLINE bits32 extractFloat32Frac( float32 a ) 222 { 223 224 return a & 0x007FFFFF; 225 226 } 227 228 /* 229 ------------------------------------------------------------------------------- 230 Returns the exponent bits of the single-precision floating-point value `a'. 231 ------------------------------------------------------------------------------- 232 */ 233 INLINE int16 extractFloat32Exp( float32 a ) 234 { 235 236 return ( a>>23 ) & 0xFF; 237 238 } 239 240 /* 241 ------------------------------------------------------------------------------- 242 Returns the sign bit of the single-precision floating-point value `a'. 243 ------------------------------------------------------------------------------- 244 */ 245 INLINE flag extractFloat32Sign( float32 a ) 246 { 247 248 return a>>31; 249 250 } 251 252 /* 253 ------------------------------------------------------------------------------- 254 Normalizes the subnormal single-precision floating-point value represented 255 by the denormalized significand `aSig'. The normalized exponent and 256 significand are stored at the locations pointed to by `zExpPtr' and 257 `zSigPtr', respectively. 258 ------------------------------------------------------------------------------- 259 */ 260 static void 261 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 262 { 263 int8 shiftCount; 264 265 shiftCount = countLeadingZeros32( aSig ) - 8; 266 *zSigPtr = aSig<<shiftCount; 267 *zExpPtr = 1 - shiftCount; 268 269 } 270 271 /* 272 ------------------------------------------------------------------------------- 273 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 274 single-precision floating-point value, returning the result. After being 275 shifted into the proper positions, the three fields are simply added 276 together to form the result. This means that any integer portion of `zSig' 277 will be added into the exponent. Since a properly normalized significand 278 will have an integer portion equal to 1, the `zExp' input should be 1 less 279 than the desired result exponent whenever `zSig' is a complete, normalized 280 significand. 281 ------------------------------------------------------------------------------- 282 */ 283 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 284 { 285 286 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 287 288 } 289 290 /* 291 ------------------------------------------------------------------------------- 292 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 293 and significand `zSig', and returns the proper single-precision floating- 294 point value corresponding to the abstract input. Ordinarily, the abstract 295 value is simply rounded and packed into the single-precision format, with 296 the inexact exception raised if the abstract input cannot be represented 297 exactly. However, if the abstract value is too large, the overflow and 298 inexact exceptions are raised and an infinity or maximal finite value is 299 returned. If the abstract value is too small, the input value is rounded to 300 a subnormal number, and the underflow and inexact exceptions are raised if 301 the abstract input cannot be represented exactly as a subnormal single- 302 precision floating-point number. 303 The input significand `zSig' has its binary point between bits 30 304 and 29, which is 7 bits to the left of the usual location. This shifted 305 significand must be normalized or smaller. If `zSig' is not normalized, 306 `zExp' must be 0; in that case, the result returned is a subnormal number, 307 and it must not require rounding. In the usual case that `zSig' is 308 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 309 The handling of underflow and overflow follows the IEC/IEEE Standard for 310 Binary Floating-Point Arithmetic. 311 ------------------------------------------------------------------------------- 312 */ 313 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 314 { 315 int8 roundingMode; 316 flag roundNearestEven; 317 int8 roundIncrement, roundBits; 318 flag isTiny; 319 320 roundingMode = float_rounding_mode(); 321 roundNearestEven = ( roundingMode == float_round_nearest_even ); 322 roundIncrement = 0x40; 323 if ( ! roundNearestEven ) { 324 if ( roundingMode == float_round_to_zero ) { 325 roundIncrement = 0; 326 } 327 else { 328 roundIncrement = 0x7F; 329 if ( zSign ) { 330 if ( roundingMode == float_round_up ) roundIncrement = 0; 331 } 332 else { 333 if ( roundingMode == float_round_down ) roundIncrement = 0; 334 } 335 } 336 } 337 roundBits = zSig & 0x7F; 338 if ( 0xFD <= (bits16) zExp ) { 339 if ( ( 0xFD < zExp ) 340 || ( ( zExp == 0xFD ) 341 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 342 ) { 343 float_raise( float_flag_overflow | float_flag_inexact ); 344 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 345 } 346 if ( zExp < 0 ) { 347 isTiny = 348 ( float_detect_tininess == float_tininess_before_rounding ) 349 || ( zExp < -1 ) 350 || ( zSig + roundIncrement < 0x80000000 ); 351 shift32RightJamming( zSig, - zExp, &zSig ); 352 zExp = 0; 353 roundBits = zSig & 0x7F; 354 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 355 } 356 } 357 if ( roundBits ) float_set_inexact(); 358 zSig = ( zSig + roundIncrement )>>7; 359 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 360 if ( zSig == 0 ) zExp = 0; 361 return packFloat32( zSign, zExp, zSig ); 362 363 } 364 365 /* 366 ------------------------------------------------------------------------------- 367 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 368 and significand `zSig', and returns the proper single-precision floating- 369 point value corresponding to the abstract input. This routine is just like 370 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 371 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 372 floating-point exponent. 373 ------------------------------------------------------------------------------- 374 */ 375 static float32 376 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 377 { 378 int8 shiftCount; 379 380 shiftCount = countLeadingZeros32( zSig ) - 1; 381 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 382 383 } 384 385 /* 386 ------------------------------------------------------------------------------- 387 Returns the fraction bits of the double-precision floating-point value `a'. 388 ------------------------------------------------------------------------------- 389 */ 390 INLINE bits64 extractFloat64Frac( float64 a ) 391 { 392 393 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 394 395 } 396 397 /* 398 ------------------------------------------------------------------------------- 399 Returns the exponent bits of the double-precision floating-point value `a'. 400 ------------------------------------------------------------------------------- 401 */ 402 INLINE int16 extractFloat64Exp( float64 a ) 403 { 404 405 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 406 407 } 408 409 /* 410 ------------------------------------------------------------------------------- 411 Returns the sign bit of the double-precision floating-point value `a'. 412 ------------------------------------------------------------------------------- 413 */ 414 INLINE flag extractFloat64Sign( float64 a ) 415 { 416 417 return FLOAT64_DEMANGLE(a)>>63; 418 419 } 420 421 /* 422 ------------------------------------------------------------------------------- 423 Normalizes the subnormal double-precision floating-point value represented 424 by the denormalized significand `aSig'. The normalized exponent and 425 significand are stored at the locations pointed to by `zExpPtr' and 426 `zSigPtr', respectively. 427 ------------------------------------------------------------------------------- 428 */ 429 static void 430 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 431 { 432 int8 shiftCount; 433 434 shiftCount = countLeadingZeros64( aSig ) - 11; 435 *zSigPtr = aSig<<shiftCount; 436 *zExpPtr = 1 - shiftCount; 437 438 } 439 440 /* 441 ------------------------------------------------------------------------------- 442 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 443 double-precision floating-point value, returning the result. After being 444 shifted into the proper positions, the three fields are simply added 445 together to form the result. This means that any integer portion of `zSig' 446 will be added into the exponent. Since a properly normalized significand 447 will have an integer portion equal to 1, the `zExp' input should be 1 less 448 than the desired result exponent whenever `zSig' is a complete, normalized 449 significand. 450 ------------------------------------------------------------------------------- 451 */ 452 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 453 { 454 455 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 456 ( ( (bits64) zExp )<<52 ) + zSig ); 457 458 } 459 460 /* 461 ------------------------------------------------------------------------------- 462 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 463 and significand `zSig', and returns the proper double-precision floating- 464 point value corresponding to the abstract input. Ordinarily, the abstract 465 value is simply rounded and packed into the double-precision format, with 466 the inexact exception raised if the abstract input cannot be represented 467 exactly. However, if the abstract value is too large, the overflow and 468 inexact exceptions are raised and an infinity or maximal finite value is 469 returned. If the abstract value is too small, the input value is rounded to 470 a subnormal number, and the underflow and inexact exceptions are raised if 471 the abstract input cannot be represented exactly as a subnormal double- 472 precision floating-point number. 473 The input significand `zSig' has its binary point between bits 62 474 and 61, which is 10 bits to the left of the usual location. This shifted 475 significand must be normalized or smaller. If `zSig' is not normalized, 476 `zExp' must be 0; in that case, the result returned is a subnormal number, 477 and it must not require rounding. In the usual case that `zSig' is 478 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 479 The handling of underflow and overflow follows the IEC/IEEE Standard for 480 Binary Floating-Point Arithmetic. 481 ------------------------------------------------------------------------------- 482 */ 483 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 484 { 485 int8 roundingMode; 486 flag roundNearestEven; 487 int16 roundIncrement, roundBits; 488 flag isTiny; 489 490 roundingMode = float_rounding_mode(); 491 roundNearestEven = ( roundingMode == float_round_nearest_even ); 492 roundIncrement = 0x200; 493 if ( ! roundNearestEven ) { 494 if ( roundingMode == float_round_to_zero ) { 495 roundIncrement = 0; 496 } 497 else { 498 roundIncrement = 0x3FF; 499 if ( zSign ) { 500 if ( roundingMode == float_round_up ) roundIncrement = 0; 501 } 502 else { 503 if ( roundingMode == float_round_down ) roundIncrement = 0; 504 } 505 } 506 } 507 roundBits = zSig & 0x3FF; 508 if ( 0x7FD <= (bits16) zExp ) { 509 if ( ( 0x7FD < zExp ) 510 || ( ( zExp == 0x7FD ) 511 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 512 ) { 513 float_raise( float_flag_overflow | float_flag_inexact ); 514 return FLOAT64_MANGLE( 515 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 516 ( roundIncrement == 0 )); 517 } 518 if ( zExp < 0 ) { 519 isTiny = 520 ( float_detect_tininess == float_tininess_before_rounding ) 521 || ( zExp < -1 ) 522 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 523 shift64RightJamming( zSig, - zExp, &zSig ); 524 zExp = 0; 525 roundBits = zSig & 0x3FF; 526 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 527 } 528 } 529 if ( roundBits ) float_set_inexact(); 530 zSig = ( zSig + roundIncrement )>>10; 531 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 532 if ( zSig == 0 ) zExp = 0; 533 return packFloat64( zSign, zExp, zSig ); 534 535 } 536 537 /* 538 ------------------------------------------------------------------------------- 539 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 540 and significand `zSig', and returns the proper double-precision floating- 541 point value corresponding to the abstract input. This routine is just like 542 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 543 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 544 floating-point exponent. 545 ------------------------------------------------------------------------------- 546 */ 547 static float64 548 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 549 { 550 int8 shiftCount; 551 552 shiftCount = countLeadingZeros64( zSig ) - 1; 553 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 554 555 } 556 557 #ifdef FLOATX80 558 559 /* 560 ------------------------------------------------------------------------------- 561 Returns the fraction bits of the extended double-precision floating-point 562 value `a'. 563 ------------------------------------------------------------------------------- 564 */ 565 INLINE bits64 extractFloatx80Frac( floatx80 a ) 566 { 567 568 return a.low; 569 570 } 571 572 /* 573 ------------------------------------------------------------------------------- 574 Returns the exponent bits of the extended double-precision floating-point 575 value `a'. 576 ------------------------------------------------------------------------------- 577 */ 578 INLINE int32 extractFloatx80Exp( floatx80 a ) 579 { 580 581 return a.high & 0x7FFF; 582 583 } 584 585 /* 586 ------------------------------------------------------------------------------- 587 Returns the sign bit of the extended double-precision floating-point value 588 `a'. 589 ------------------------------------------------------------------------------- 590 */ 591 INLINE flag extractFloatx80Sign( floatx80 a ) 592 { 593 594 return a.high>>15; 595 596 } 597 598 /* 599 ------------------------------------------------------------------------------- 600 Normalizes the subnormal extended double-precision floating-point value 601 represented by the denormalized significand `aSig'. The normalized exponent 602 and significand are stored at the locations pointed to by `zExpPtr' and 603 `zSigPtr', respectively. 604 ------------------------------------------------------------------------------- 605 */ 606 static void 607 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 608 { 609 int8 shiftCount; 610 611 shiftCount = countLeadingZeros64( aSig ); 612 *zSigPtr = aSig<<shiftCount; 613 *zExpPtr = 1 - shiftCount; 614 615 } 616 617 /* 618 ------------------------------------------------------------------------------- 619 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 620 extended double-precision floating-point value, returning the result. 621 ------------------------------------------------------------------------------- 622 */ 623 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 624 { 625 floatx80 z; 626 627 z.low = zSig; 628 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 629 return z; 630 631 } 632 633 /* 634 ------------------------------------------------------------------------------- 635 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 636 and extended significand formed by the concatenation of `zSig0' and `zSig1', 637 and returns the proper extended double-precision floating-point value 638 corresponding to the abstract input. Ordinarily, the abstract value is 639 rounded and packed into the extended double-precision format, with the 640 inexact exception raised if the abstract input cannot be represented 641 exactly. However, if the abstract value is too large, the overflow and 642 inexact exceptions are raised and an infinity or maximal finite value is 643 returned. If the abstract value is too small, the input value is rounded to 644 a subnormal number, and the underflow and inexact exceptions are raised if 645 the abstract input cannot be represented exactly as a subnormal extended 646 double-precision floating-point number. 647 If `roundingPrecision' is 32 or 64, the result is rounded to the same 648 number of bits as single or double precision, respectively. Otherwise, the 649 result is rounded to the full precision of the extended double-precision 650 format. 651 The input significand must be normalized or smaller. If the input 652 significand is not normalized, `zExp' must be 0; in that case, the result 653 returned is a subnormal number, and it must not require rounding. The 654 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 655 Floating-Point Arithmetic. 656 ------------------------------------------------------------------------------- 657 */ 658 static floatx80 659 roundAndPackFloatx80( 660 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 661 ) 662 { 663 int8 roundingMode; 664 flag roundNearestEven, increment, isTiny; 665 int64 roundIncrement, roundMask, roundBits; 666 667 roundingMode = float_rounding_mode(); 668 roundNearestEven = ( roundingMode == float_round_nearest_even ); 669 if ( roundingPrecision == 80 ) goto precision80; 670 if ( roundingPrecision == 64 ) { 671 roundIncrement = LIT64( 0x0000000000000400 ); 672 roundMask = LIT64( 0x00000000000007FF ); 673 } 674 else if ( roundingPrecision == 32 ) { 675 roundIncrement = LIT64( 0x0000008000000000 ); 676 roundMask = LIT64( 0x000000FFFFFFFFFF ); 677 } 678 else { 679 goto precision80; 680 } 681 zSig0 |= ( zSig1 != 0 ); 682 if ( ! roundNearestEven ) { 683 if ( roundingMode == float_round_to_zero ) { 684 roundIncrement = 0; 685 } 686 else { 687 roundIncrement = roundMask; 688 if ( zSign ) { 689 if ( roundingMode == float_round_up ) roundIncrement = 0; 690 } 691 else { 692 if ( roundingMode == float_round_down ) roundIncrement = 0; 693 } 694 } 695 } 696 roundBits = zSig0 & roundMask; 697 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 698 if ( ( 0x7FFE < zExp ) 699 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 700 ) { 701 goto overflow; 702 } 703 if ( zExp <= 0 ) { 704 isTiny = 705 ( float_detect_tininess == float_tininess_before_rounding ) 706 || ( zExp < 0 ) 707 || ( zSig0 <= zSig0 + roundIncrement ); 708 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 709 zExp = 0; 710 roundBits = zSig0 & roundMask; 711 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 712 if ( roundBits ) float_set_inexact(); 713 zSig0 += roundIncrement; 714 if ( (sbits64) zSig0 < 0 ) zExp = 1; 715 roundIncrement = roundMask + 1; 716 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 717 roundMask |= roundIncrement; 718 } 719 zSig0 &= ~ roundMask; 720 return packFloatx80( zSign, zExp, zSig0 ); 721 } 722 } 723 if ( roundBits ) float_set_inexact(); 724 zSig0 += roundIncrement; 725 if ( zSig0 < roundIncrement ) { 726 ++zExp; 727 zSig0 = LIT64( 0x8000000000000000 ); 728 } 729 roundIncrement = roundMask + 1; 730 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 731 roundMask |= roundIncrement; 732 } 733 zSig0 &= ~ roundMask; 734 if ( zSig0 == 0 ) zExp = 0; 735 return packFloatx80( zSign, zExp, zSig0 ); 736 precision80: 737 increment = ( (sbits64) zSig1 < 0 ); 738 if ( ! roundNearestEven ) { 739 if ( roundingMode == float_round_to_zero ) { 740 increment = 0; 741 } 742 else { 743 if ( zSign ) { 744 increment = ( roundingMode == float_round_down ) && zSig1; 745 } 746 else { 747 increment = ( roundingMode == float_round_up ) && zSig1; 748 } 749 } 750 } 751 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 752 if ( ( 0x7FFE < zExp ) 753 || ( ( zExp == 0x7FFE ) 754 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 755 && increment 756 ) 757 ) { 758 roundMask = 0; 759 overflow: 760 float_raise( float_flag_overflow | float_flag_inexact ); 761 if ( ( roundingMode == float_round_to_zero ) 762 || ( zSign && ( roundingMode == float_round_up ) ) 763 || ( ! zSign && ( roundingMode == float_round_down ) ) 764 ) { 765 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 766 } 767 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 768 } 769 if ( zExp <= 0 ) { 770 isTiny = 771 ( float_detect_tininess == float_tininess_before_rounding ) 772 || ( zExp < 0 ) 773 || ! increment 774 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 775 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 776 zExp = 0; 777 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 778 if ( zSig1 ) float_set_inexact(); 779 if ( roundNearestEven ) { 780 increment = ( (sbits64) zSig1 < 0 ); 781 } 782 else { 783 if ( zSign ) { 784 increment = ( roundingMode == float_round_down ) && zSig1; 785 } 786 else { 787 increment = ( roundingMode == float_round_up ) && zSig1; 788 } 789 } 790 if ( increment ) { 791 ++zSig0; 792 zSig0 &= 793 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 794 if ( (sbits64) zSig0 < 0 ) zExp = 1; 795 } 796 return packFloatx80( zSign, zExp, zSig0 ); 797 } 798 } 799 if ( zSig1 ) float_set_inexact(); 800 if ( increment ) { 801 ++zSig0; 802 if ( zSig0 == 0 ) { 803 ++zExp; 804 zSig0 = LIT64( 0x8000000000000000 ); 805 } 806 else { 807 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 808 } 809 } 810 else { 811 if ( zSig0 == 0 ) zExp = 0; 812 } 813 return packFloatx80( zSign, zExp, zSig0 ); 814 815 } 816 817 /* 818 ------------------------------------------------------------------------------- 819 Takes an abstract floating-point value having sign `zSign', exponent 820 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 821 and returns the proper extended double-precision floating-point value 822 corresponding to the abstract input. This routine is just like 823 `roundAndPackFloatx80' except that the input significand does not have to be 824 normalized. 825 ------------------------------------------------------------------------------- 826 */ 827 static floatx80 828 normalizeRoundAndPackFloatx80( 829 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 830 ) 831 { 832 int8 shiftCount; 833 834 if ( zSig0 == 0 ) { 835 zSig0 = zSig1; 836 zSig1 = 0; 837 zExp -= 64; 838 } 839 shiftCount = countLeadingZeros64( zSig0 ); 840 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 841 zExp -= shiftCount; 842 return 843 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 844 845 } 846 847 #endif 848 849 #ifdef FLOAT128 850 851 /* 852 ------------------------------------------------------------------------------- 853 Returns the least-significant 64 fraction bits of the quadruple-precision 854 floating-point value `a'. 855 ------------------------------------------------------------------------------- 856 */ 857 INLINE bits64 extractFloat128Frac1( float128 a ) 858 { 859 860 return a.low; 861 862 } 863 864 /* 865 ------------------------------------------------------------------------------- 866 Returns the most-significant 48 fraction bits of the quadruple-precision 867 floating-point value `a'. 868 ------------------------------------------------------------------------------- 869 */ 870 INLINE bits64 extractFloat128Frac0( float128 a ) 871 { 872 873 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 874 875 } 876 877 /* 878 ------------------------------------------------------------------------------- 879 Returns the exponent bits of the quadruple-precision floating-point value 880 `a'. 881 ------------------------------------------------------------------------------- 882 */ 883 INLINE int32 extractFloat128Exp( float128 a ) 884 { 885 886 return ( a.high>>48 ) & 0x7FFF; 887 888 } 889 890 /* 891 ------------------------------------------------------------------------------- 892 Returns the sign bit of the quadruple-precision floating-point value `a'. 893 ------------------------------------------------------------------------------- 894 */ 895 INLINE flag extractFloat128Sign( float128 a ) 896 { 897 898 return a.high>>63; 899 900 } 901 902 /* 903 ------------------------------------------------------------------------------- 904 Normalizes the subnormal quadruple-precision floating-point value 905 represented by the denormalized significand formed by the concatenation of 906 `aSig0' and `aSig1'. The normalized exponent is stored at the location 907 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 908 significand are stored at the location pointed to by `zSig0Ptr', and the 909 least significant 64 bits of the normalized significand are stored at the 910 location pointed to by `zSig1Ptr'. 911 ------------------------------------------------------------------------------- 912 */ 913 static void 914 normalizeFloat128Subnormal( 915 bits64 aSig0, 916 bits64 aSig1, 917 int32 *zExpPtr, 918 bits64 *zSig0Ptr, 919 bits64 *zSig1Ptr 920 ) 921 { 922 int8 shiftCount; 923 924 if ( aSig0 == 0 ) { 925 shiftCount = countLeadingZeros64( aSig1 ) - 15; 926 if ( shiftCount < 0 ) { 927 *zSig0Ptr = aSig1>>( - shiftCount ); 928 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 929 } 930 else { 931 *zSig0Ptr = aSig1<<shiftCount; 932 *zSig1Ptr = 0; 933 } 934 *zExpPtr = - shiftCount - 63; 935 } 936 else { 937 shiftCount = countLeadingZeros64( aSig0 ) - 15; 938 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 939 *zExpPtr = 1 - shiftCount; 940 } 941 942 } 943 944 /* 945 ------------------------------------------------------------------------------- 946 Packs the sign `zSign', the exponent `zExp', and the significand formed 947 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 948 floating-point value, returning the result. After being shifted into the 949 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 950 added together to form the most significant 32 bits of the result. This 951 means that any integer portion of `zSig0' will be added into the exponent. 952 Since a properly normalized significand will have an integer portion equal 953 to 1, the `zExp' input should be 1 less than the desired result exponent 954 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 955 significand. 956 ------------------------------------------------------------------------------- 957 */ 958 INLINE float128 959 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 960 { 961 float128 z; 962 963 z.low = zSig1; 964 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 965 return z; 966 967 } 968 969 /* 970 ------------------------------------------------------------------------------- 971 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 972 and extended significand formed by the concatenation of `zSig0', `zSig1', 973 and `zSig2', and returns the proper quadruple-precision floating-point value 974 corresponding to the abstract input. Ordinarily, the abstract value is 975 simply rounded and packed into the quadruple-precision format, with the 976 inexact exception raised if the abstract input cannot be represented 977 exactly. However, if the abstract value is too large, the overflow and 978 inexact exceptions are raised and an infinity or maximal finite value is 979 returned. If the abstract value is too small, the input value is rounded to 980 a subnormal number, and the underflow and inexact exceptions are raised if 981 the abstract input cannot be represented exactly as a subnormal quadruple- 982 precision floating-point number. 983 The input significand must be normalized or smaller. If the input 984 significand is not normalized, `zExp' must be 0; in that case, the result 985 returned is a subnormal number, and it must not require rounding. In the 986 usual case that the input significand is normalized, `zExp' must be 1 less 987 than the ``true'' floating-point exponent. The handling of underflow and 988 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 989 ------------------------------------------------------------------------------- 990 */ 991 static float128 992 roundAndPackFloat128( 993 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 994 { 995 int8 roundingMode; 996 flag roundNearestEven, increment, isTiny; 997 998 roundingMode = float_rounding_mode(); 999 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1000 increment = ( (sbits64) zSig2 < 0 ); 1001 if ( ! roundNearestEven ) { 1002 if ( roundingMode == float_round_to_zero ) { 1003 increment = 0; 1004 } 1005 else { 1006 if ( zSign ) { 1007 increment = ( roundingMode == float_round_down ) && zSig2; 1008 } 1009 else { 1010 increment = ( roundingMode == float_round_up ) && zSig2; 1011 } 1012 } 1013 } 1014 if ( 0x7FFD <= (bits32) zExp ) { 1015 if ( ( 0x7FFD < zExp ) 1016 || ( ( zExp == 0x7FFD ) 1017 && eq128( 1018 LIT64( 0x0001FFFFFFFFFFFF ), 1019 LIT64( 0xFFFFFFFFFFFFFFFF ), 1020 zSig0, 1021 zSig1 1022 ) 1023 && increment 1024 ) 1025 ) { 1026 float_raise( float_flag_overflow | float_flag_inexact ); 1027 if ( ( roundingMode == float_round_to_zero ) 1028 || ( zSign && ( roundingMode == float_round_up ) ) 1029 || ( ! zSign && ( roundingMode == float_round_down ) ) 1030 ) { 1031 return 1032 packFloat128( 1033 zSign, 1034 0x7FFE, 1035 LIT64( 0x0000FFFFFFFFFFFF ), 1036 LIT64( 0xFFFFFFFFFFFFFFFF ) 1037 ); 1038 } 1039 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1040 } 1041 if ( zExp < 0 ) { 1042 isTiny = 1043 ( float_detect_tininess == float_tininess_before_rounding ) 1044 || ( zExp < -1 ) 1045 || ! increment 1046 || lt128( 1047 zSig0, 1048 zSig1, 1049 LIT64( 0x0001FFFFFFFFFFFF ), 1050 LIT64( 0xFFFFFFFFFFFFFFFF ) 1051 ); 1052 shift128ExtraRightJamming( 1053 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1054 zExp = 0; 1055 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1056 if ( roundNearestEven ) { 1057 increment = ( (sbits64) zSig2 < 0 ); 1058 } 1059 else { 1060 if ( zSign ) { 1061 increment = ( roundingMode == float_round_down ) && zSig2; 1062 } 1063 else { 1064 increment = ( roundingMode == float_round_up ) && zSig2; 1065 } 1066 } 1067 } 1068 } 1069 if ( zSig2 ) float_set_inexact(); 1070 if ( increment ) { 1071 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1072 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1073 } 1074 else { 1075 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1076 } 1077 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1078 1079 } 1080 1081 /* 1082 ------------------------------------------------------------------------------- 1083 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1084 and significand formed by the concatenation of `zSig0' and `zSig1', and 1085 returns the proper quadruple-precision floating-point value corresponding 1086 to the abstract input. This routine is just like `roundAndPackFloat128' 1087 except that the input significand has fewer bits and does not have to be 1088 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1089 point exponent. 1090 ------------------------------------------------------------------------------- 1091 */ 1092 static float128 1093 normalizeRoundAndPackFloat128( 1094 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1095 { 1096 int8 shiftCount; 1097 bits64 zSig2; 1098 1099 if ( zSig0 == 0 ) { 1100 zSig0 = zSig1; 1101 zSig1 = 0; 1102 zExp -= 64; 1103 } 1104 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1105 if ( 0 <= shiftCount ) { 1106 zSig2 = 0; 1107 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1108 } 1109 else { 1110 shift128ExtraRightJamming( 1111 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1112 } 1113 zExp -= shiftCount; 1114 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1115 1116 } 1117 1118 #endif 1119 1120 /* 1121 ------------------------------------------------------------------------------- 1122 Returns the result of converting the 32-bit two's complement integer `a' 1123 to the single-precision floating-point format. The conversion is performed 1124 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1125 ------------------------------------------------------------------------------- 1126 */ 1127 float32 int32_to_float32( int32 a ) 1128 { 1129 flag zSign; 1130 1131 if ( a == 0 ) return 0; 1132 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1133 zSign = ( a < 0 ); 1134 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1135 1136 } 1137 1138 /* 1139 ------------------------------------------------------------------------------- 1140 Returns the result of converting the 32-bit two's complement integer `a' 1141 to the double-precision floating-point format. The conversion is performed 1142 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1143 ------------------------------------------------------------------------------- 1144 */ 1145 float64 int32_to_float64( int32 a ) 1146 { 1147 flag zSign; 1148 uint32 absA; 1149 int8 shiftCount; 1150 bits64 zSig; 1151 1152 if ( a == 0 ) return 0; 1153 zSign = ( a < 0 ); 1154 absA = zSign ? - a : a; 1155 shiftCount = countLeadingZeros32( absA ) + 21; 1156 zSig = absA; 1157 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1158 1159 } 1160 1161 #ifdef FLOATX80 1162 1163 /* 1164 ------------------------------------------------------------------------------- 1165 Returns the result of converting the 32-bit two's complement integer `a' 1166 to the extended double-precision floating-point format. The conversion 1167 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1168 Arithmetic. 1169 ------------------------------------------------------------------------------- 1170 */ 1171 floatx80 int32_to_floatx80( int32 a ) 1172 { 1173 flag zSign; 1174 uint32 absA; 1175 int8 shiftCount; 1176 bits64 zSig; 1177 1178 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1179 zSign = ( a < 0 ); 1180 absA = zSign ? - a : a; 1181 shiftCount = countLeadingZeros32( absA ) + 32; 1182 zSig = absA; 1183 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1184 1185 } 1186 1187 #endif 1188 1189 #ifdef FLOAT128 1190 1191 /* 1192 ------------------------------------------------------------------------------- 1193 Returns the result of converting the 32-bit two's complement integer `a' to 1194 the quadruple-precision floating-point format. The conversion is performed 1195 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1196 ------------------------------------------------------------------------------- 1197 */ 1198 float128 int32_to_float128( int32 a ) 1199 { 1200 flag zSign; 1201 uint32 absA; 1202 int8 shiftCount; 1203 bits64 zSig0; 1204 1205 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1206 zSign = ( a < 0 ); 1207 absA = zSign ? - a : a; 1208 shiftCount = countLeadingZeros32( absA ) + 17; 1209 zSig0 = absA; 1210 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1211 1212 } 1213 1214 #endif 1215 1216 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1217 /* 1218 ------------------------------------------------------------------------------- 1219 Returns the result of converting the 64-bit two's complement integer `a' 1220 to the single-precision floating-point format. The conversion is performed 1221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1222 ------------------------------------------------------------------------------- 1223 */ 1224 float32 int64_to_float32( int64 a ) 1225 { 1226 flag zSign; 1227 uint64 absA; 1228 int8 shiftCount; 1229 1230 if ( a == 0 ) return 0; 1231 zSign = ( a < 0 ); 1232 absA = zSign ? - a : a; 1233 shiftCount = countLeadingZeros64( absA ) - 40; 1234 if ( 0 <= shiftCount ) { 1235 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1236 } 1237 else { 1238 shiftCount += 7; 1239 if ( shiftCount < 0 ) { 1240 shift64RightJamming( absA, - shiftCount, &absA ); 1241 } 1242 else { 1243 absA <<= shiftCount; 1244 } 1245 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1246 } 1247 1248 } 1249 1250 /* 1251 ------------------------------------------------------------------------------- 1252 Returns the result of converting the 64-bit two's complement integer `a' 1253 to the double-precision floating-point format. The conversion is performed 1254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1255 ------------------------------------------------------------------------------- 1256 */ 1257 float64 int64_to_float64( int64 a ) 1258 { 1259 flag zSign; 1260 1261 if ( a == 0 ) return 0; 1262 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1263 return packFloat64( 1, 0x43E, 0 ); 1264 } 1265 zSign = ( a < 0 ); 1266 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1267 1268 } 1269 1270 #ifdef FLOATX80 1271 1272 /* 1273 ------------------------------------------------------------------------------- 1274 Returns the result of converting the 64-bit two's complement integer `a' 1275 to the extended double-precision floating-point format. The conversion 1276 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1277 Arithmetic. 1278 ------------------------------------------------------------------------------- 1279 */ 1280 floatx80 int64_to_floatx80( int64 a ) 1281 { 1282 flag zSign; 1283 uint64 absA; 1284 int8 shiftCount; 1285 1286 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1287 zSign = ( a < 0 ); 1288 absA = zSign ? - a : a; 1289 shiftCount = countLeadingZeros64( absA ); 1290 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1291 1292 } 1293 1294 #endif 1295 1296 #ifdef FLOAT128 1297 1298 /* 1299 ------------------------------------------------------------------------------- 1300 Returns the result of converting the 64-bit two's complement integer `a' to 1301 the quadruple-precision floating-point format. The conversion is performed 1302 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1303 ------------------------------------------------------------------------------- 1304 */ 1305 float128 int64_to_float128( int64 a ) 1306 { 1307 flag zSign; 1308 uint64 absA; 1309 int8 shiftCount; 1310 int32 zExp; 1311 bits64 zSig0, zSig1; 1312 1313 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1314 zSign = ( a < 0 ); 1315 absA = zSign ? - a : a; 1316 shiftCount = countLeadingZeros64( absA ) + 49; 1317 zExp = 0x406E - shiftCount; 1318 if ( 64 <= shiftCount ) { 1319 zSig1 = 0; 1320 zSig0 = absA; 1321 shiftCount -= 64; 1322 } 1323 else { 1324 zSig1 = absA; 1325 zSig0 = 0; 1326 } 1327 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1328 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1329 1330 } 1331 1332 #endif 1333 #endif /* !SOFTFLOAT_FOR_GCC */ 1334 1335 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1336 /* 1337 ------------------------------------------------------------------------------- 1338 Returns the result of converting the single-precision floating-point value 1339 `a' to the 32-bit two's complement integer format. The conversion is 1340 performed according to the IEC/IEEE Standard for Binary Floating-Point 1341 Arithmetic---which means in particular that the conversion is rounded 1342 according to the current rounding mode. If `a' is a NaN, the largest 1343 positive integer is returned. Otherwise, if the conversion overflows, the 1344 largest integer with the same sign as `a' is returned. 1345 ------------------------------------------------------------------------------- 1346 */ 1347 int32 float32_to_int32( float32 a ) 1348 { 1349 flag aSign; 1350 int16 aExp, shiftCount; 1351 bits32 aSig; 1352 bits64 aSig64; 1353 1354 aSig = extractFloat32Frac( a ); 1355 aExp = extractFloat32Exp( a ); 1356 aSign = extractFloat32Sign( a ); 1357 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1358 if ( aExp ) aSig |= 0x00800000; 1359 shiftCount = 0xAF - aExp; 1360 aSig64 = aSig; 1361 aSig64 <<= 32; 1362 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1363 return roundAndPackInt32( aSign, aSig64 ); 1364 1365 } 1366 #endif /* !SOFTFLOAT_FOR_GCC */ 1367 1368 /* 1369 ------------------------------------------------------------------------------- 1370 Returns the result of converting the single-precision floating-point value 1371 `a' to the 32-bit two's complement integer format. The conversion is 1372 performed according to the IEC/IEEE Standard for Binary Floating-Point 1373 Arithmetic, except that the conversion is always rounded toward zero. 1374 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1375 the conversion overflows, the largest integer with the same sign as `a' is 1376 returned. 1377 ------------------------------------------------------------------------------- 1378 */ 1379 int32 float32_to_int32_round_to_zero( float32 a ) 1380 { 1381 flag aSign; 1382 int16 aExp, shiftCount; 1383 bits32 aSig; 1384 int32 z; 1385 1386 aSig = extractFloat32Frac( a ); 1387 aExp = extractFloat32Exp( a ); 1388 aSign = extractFloat32Sign( a ); 1389 shiftCount = aExp - 0x9E; 1390 if ( 0 <= shiftCount ) { 1391 if ( a != 0xCF000000 ) { 1392 float_raise( float_flag_invalid ); 1393 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1394 } 1395 return (sbits32) 0x80000000; 1396 } 1397 else if ( aExp <= 0x7E ) { 1398 if ( aExp | aSig ) float_set_inexact(); 1399 return 0; 1400 } 1401 aSig = ( aSig | 0x00800000 )<<8; 1402 z = aSig>>( - shiftCount ); 1403 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1404 float_set_inexact(); 1405 } 1406 if ( aSign ) z = - z; 1407 return z; 1408 1409 } 1410 1411 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1412 /* 1413 ------------------------------------------------------------------------------- 1414 Returns the result of converting the single-precision floating-point value 1415 `a' to the 64-bit two's complement integer format. The conversion is 1416 performed according to the IEC/IEEE Standard for Binary Floating-Point 1417 Arithmetic---which means in particular that the conversion is rounded 1418 according to the current rounding mode. If `a' is a NaN, the largest 1419 positive integer is returned. Otherwise, if the conversion overflows, the 1420 largest integer with the same sign as `a' is returned. 1421 ------------------------------------------------------------------------------- 1422 */ 1423 int64 float32_to_int64( float32 a ) 1424 { 1425 flag aSign; 1426 int16 aExp, shiftCount; 1427 bits32 aSig; 1428 bits64 aSig64, aSigExtra; 1429 1430 aSig = extractFloat32Frac( a ); 1431 aExp = extractFloat32Exp( a ); 1432 aSign = extractFloat32Sign( a ); 1433 shiftCount = 0xBE - aExp; 1434 if ( shiftCount < 0 ) { 1435 float_raise( float_flag_invalid ); 1436 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1437 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1438 } 1439 return (sbits64) LIT64( 0x8000000000000000 ); 1440 } 1441 if ( aExp ) aSig |= 0x00800000; 1442 aSig64 = aSig; 1443 aSig64 <<= 40; 1444 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1445 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1446 1447 } 1448 1449 /* 1450 ------------------------------------------------------------------------------- 1451 Returns the result of converting the single-precision floating-point value 1452 `a' to the 64-bit two's complement integer format. The conversion is 1453 performed according to the IEC/IEEE Standard for Binary Floating-Point 1454 Arithmetic, except that the conversion is always rounded toward zero. If 1455 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1456 conversion overflows, the largest integer with the same sign as `a' is 1457 returned. 1458 ------------------------------------------------------------------------------- 1459 */ 1460 int64 float32_to_int64_round_to_zero( float32 a ) 1461 { 1462 flag aSign; 1463 int16 aExp, shiftCount; 1464 bits32 aSig; 1465 bits64 aSig64; 1466 int64 z; 1467 1468 aSig = extractFloat32Frac( a ); 1469 aExp = extractFloat32Exp( a ); 1470 aSign = extractFloat32Sign( a ); 1471 shiftCount = aExp - 0xBE; 1472 if ( 0 <= shiftCount ) { 1473 if ( a != 0xDF000000 ) { 1474 float_raise( float_flag_invalid ); 1475 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1476 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1477 } 1478 } 1479 return (sbits64) LIT64( 0x8000000000000000 ); 1480 } 1481 else if ( aExp <= 0x7E ) { 1482 if ( aExp | aSig ) float_set_inexact(); 1483 return 0; 1484 } 1485 aSig64 = aSig | 0x00800000; 1486 aSig64 <<= 40; 1487 z = aSig64>>( - shiftCount ); 1488 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1489 float_set_inexact(); 1490 } 1491 if ( aSign ) z = - z; 1492 return z; 1493 1494 } 1495 #endif /* !SOFTFLOAT_FOR_GCC */ 1496 1497 /* 1498 ------------------------------------------------------------------------------- 1499 Returns the result of converting the single-precision floating-point value 1500 `a' to the double-precision floating-point format. The conversion is 1501 performed according to the IEC/IEEE Standard for Binary Floating-Point 1502 Arithmetic. 1503 ------------------------------------------------------------------------------- 1504 */ 1505 float64 float32_to_float64( float32 a ) 1506 { 1507 flag aSign; 1508 int16 aExp; 1509 bits32 aSig; 1510 1511 aSig = extractFloat32Frac( a ); 1512 aExp = extractFloat32Exp( a ); 1513 aSign = extractFloat32Sign( a ); 1514 if ( aExp == 0xFF ) { 1515 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1516 return packFloat64( aSign, 0x7FF, 0 ); 1517 } 1518 if ( aExp == 0 ) { 1519 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1520 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1521 --aExp; 1522 } 1523 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1524 1525 } 1526 1527 #ifdef FLOATX80 1528 1529 /* 1530 ------------------------------------------------------------------------------- 1531 Returns the result of converting the single-precision floating-point value 1532 `a' to the extended double-precision floating-point format. The conversion 1533 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1534 Arithmetic. 1535 ------------------------------------------------------------------------------- 1536 */ 1537 floatx80 float32_to_floatx80( float32 a ) 1538 { 1539 flag aSign; 1540 int16 aExp; 1541 bits32 aSig; 1542 1543 aSig = extractFloat32Frac( a ); 1544 aExp = extractFloat32Exp( a ); 1545 aSign = extractFloat32Sign( a ); 1546 if ( aExp == 0xFF ) { 1547 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1548 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1549 } 1550 if ( aExp == 0 ) { 1551 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1552 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1553 } 1554 aSig |= 0x00800000; 1555 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1556 1557 } 1558 1559 #endif 1560 1561 #ifdef FLOAT128 1562 1563 /* 1564 ------------------------------------------------------------------------------- 1565 Returns the result of converting the single-precision floating-point value 1566 `a' to the double-precision floating-point format. The conversion is 1567 performed according to the IEC/IEEE Standard for Binary Floating-Point 1568 Arithmetic. 1569 ------------------------------------------------------------------------------- 1570 */ 1571 float128 float32_to_float128( float32 a ) 1572 { 1573 flag aSign; 1574 int16 aExp; 1575 bits32 aSig; 1576 1577 aSig = extractFloat32Frac( a ); 1578 aExp = extractFloat32Exp( a ); 1579 aSign = extractFloat32Sign( a ); 1580 if ( aExp == 0xFF ) { 1581 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1582 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1583 } 1584 if ( aExp == 0 ) { 1585 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1586 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1587 --aExp; 1588 } 1589 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1590 1591 } 1592 1593 #endif 1594 1595 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1596 /* 1597 ------------------------------------------------------------------------------- 1598 Rounds the single-precision floating-point value `a' to an integer, and 1599 returns the result as a single-precision floating-point value. The 1600 operation is performed according to the IEC/IEEE Standard for Binary 1601 Floating-Point Arithmetic. 1602 ------------------------------------------------------------------------------- 1603 */ 1604 float32 float32_round_to_int( float32 a ) 1605 { 1606 flag aSign; 1607 int16 aExp; 1608 bits32 lastBitMask, roundBitsMask; 1609 int8 roundingMode; 1610 float32 z; 1611 1612 aExp = extractFloat32Exp( a ); 1613 if ( 0x96 <= aExp ) { 1614 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1615 return propagateFloat32NaN( a, a ); 1616 } 1617 return a; 1618 } 1619 if ( aExp <= 0x7E ) { 1620 if ( (bits32) ( a<<1 ) == 0 ) return a; 1621 float_set_inexact(); 1622 aSign = extractFloat32Sign( a ); 1623 switch ( float_rounding_mode() ) { 1624 case float_round_nearest_even: 1625 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1626 return packFloat32( aSign, 0x7F, 0 ); 1627 } 1628 break; 1629 case float_round_down: 1630 return aSign ? 0xBF800000 : 0; 1631 case float_round_up: 1632 return aSign ? 0x80000000 : 0x3F800000; 1633 } 1634 return packFloat32( aSign, 0, 0 ); 1635 } 1636 lastBitMask = 1; 1637 lastBitMask <<= 0x96 - aExp; 1638 roundBitsMask = lastBitMask - 1; 1639 z = a; 1640 roundingMode = float_rounding_mode(); 1641 if ( roundingMode == float_round_nearest_even ) { 1642 z += lastBitMask>>1; 1643 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1644 } 1645 else if ( roundingMode != float_round_to_zero ) { 1646 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1647 z += roundBitsMask; 1648 } 1649 } 1650 z &= ~ roundBitsMask; 1651 if ( z != a ) float_set_inexact(); 1652 return z; 1653 1654 } 1655 #endif /* !SOFTFLOAT_FOR_GCC */ 1656 1657 /* 1658 ------------------------------------------------------------------------------- 1659 Returns the result of adding the absolute values of the single-precision 1660 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1661 before being returned. `zSign' is ignored if the result is a NaN. 1662 The addition is performed according to the IEC/IEEE Standard for Binary 1663 Floating-Point Arithmetic. 1664 ------------------------------------------------------------------------------- 1665 */ 1666 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1667 { 1668 int16 aExp, bExp, zExp; 1669 bits32 aSig, bSig, zSig; 1670 int16 expDiff; 1671 1672 aSig = extractFloat32Frac( a ); 1673 aExp = extractFloat32Exp( a ); 1674 bSig = extractFloat32Frac( b ); 1675 bExp = extractFloat32Exp( b ); 1676 expDiff = aExp - bExp; 1677 aSig <<= 6; 1678 bSig <<= 6; 1679 if ( 0 < expDiff ) { 1680 if ( aExp == 0xFF ) { 1681 if ( aSig ) return propagateFloat32NaN( a, b ); 1682 return a; 1683 } 1684 if ( bExp == 0 ) { 1685 --expDiff; 1686 } 1687 else { 1688 bSig |= 0x20000000; 1689 } 1690 shift32RightJamming( bSig, expDiff, &bSig ); 1691 zExp = aExp; 1692 } 1693 else if ( expDiff < 0 ) { 1694 if ( bExp == 0xFF ) { 1695 if ( bSig ) return propagateFloat32NaN( a, b ); 1696 return packFloat32( zSign, 0xFF, 0 ); 1697 } 1698 if ( aExp == 0 ) { 1699 ++expDiff; 1700 } 1701 else { 1702 aSig |= 0x20000000; 1703 } 1704 shift32RightJamming( aSig, - expDiff, &aSig ); 1705 zExp = bExp; 1706 } 1707 else { 1708 if ( aExp == 0xFF ) { 1709 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1710 return a; 1711 } 1712 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1713 zSig = 0x40000000 + aSig + bSig; 1714 zExp = aExp; 1715 goto roundAndPack; 1716 } 1717 aSig |= 0x20000000; 1718 zSig = ( aSig + bSig )<<1; 1719 --zExp; 1720 if ( (sbits32) zSig < 0 ) { 1721 zSig = aSig + bSig; 1722 ++zExp; 1723 } 1724 roundAndPack: 1725 return roundAndPackFloat32( zSign, zExp, zSig ); 1726 1727 } 1728 1729 /* 1730 ------------------------------------------------------------------------------- 1731 Returns the result of subtracting the absolute values of the single- 1732 precision floating-point values `a' and `b'. If `zSign' is 1, the 1733 difference is negated before being returned. `zSign' is ignored if the 1734 result is a NaN. The subtraction is performed according to the IEC/IEEE 1735 Standard for Binary Floating-Point Arithmetic. 1736 ------------------------------------------------------------------------------- 1737 */ 1738 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1739 { 1740 int16 aExp, bExp, zExp; 1741 bits32 aSig, bSig, zSig; 1742 int16 expDiff; 1743 1744 aSig = extractFloat32Frac( a ); 1745 aExp = extractFloat32Exp( a ); 1746 bSig = extractFloat32Frac( b ); 1747 bExp = extractFloat32Exp( b ); 1748 expDiff = aExp - bExp; 1749 aSig <<= 7; 1750 bSig <<= 7; 1751 if ( 0 < expDiff ) goto aExpBigger; 1752 if ( expDiff < 0 ) goto bExpBigger; 1753 if ( aExp == 0xFF ) { 1754 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1755 float_raise( float_flag_invalid ); 1756 return float32_default_nan; 1757 } 1758 if ( aExp == 0 ) { 1759 aExp = 1; 1760 bExp = 1; 1761 } 1762 if ( bSig < aSig ) goto aBigger; 1763 if ( aSig < bSig ) goto bBigger; 1764 return packFloat32( float_rounding_mode() == float_round_down, 0, 0 ); 1765 bExpBigger: 1766 if ( bExp == 0xFF ) { 1767 if ( bSig ) return propagateFloat32NaN( a, b ); 1768 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1769 } 1770 if ( aExp == 0 ) { 1771 ++expDiff; 1772 } 1773 else { 1774 aSig |= 0x40000000; 1775 } 1776 shift32RightJamming( aSig, - expDiff, &aSig ); 1777 bSig |= 0x40000000; 1778 bBigger: 1779 zSig = bSig - aSig; 1780 zExp = bExp; 1781 zSign ^= 1; 1782 goto normalizeRoundAndPack; 1783 aExpBigger: 1784 if ( aExp == 0xFF ) { 1785 if ( aSig ) return propagateFloat32NaN( a, b ); 1786 return a; 1787 } 1788 if ( bExp == 0 ) { 1789 --expDiff; 1790 } 1791 else { 1792 bSig |= 0x40000000; 1793 } 1794 shift32RightJamming( bSig, expDiff, &bSig ); 1795 aSig |= 0x40000000; 1796 aBigger: 1797 zSig = aSig - bSig; 1798 zExp = aExp; 1799 normalizeRoundAndPack: 1800 --zExp; 1801 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1802 1803 } 1804 1805 /* 1806 ------------------------------------------------------------------------------- 1807 Returns the result of adding the single-precision floating-point values `a' 1808 and `b'. The operation is performed according to the IEC/IEEE Standard for 1809 Binary Floating-Point Arithmetic. 1810 ------------------------------------------------------------------------------- 1811 */ 1812 float32 float32_add( float32 a, float32 b ) 1813 { 1814 flag aSign, bSign; 1815 1816 aSign = extractFloat32Sign( a ); 1817 bSign = extractFloat32Sign( b ); 1818 if ( aSign == bSign ) { 1819 return addFloat32Sigs( a, b, aSign ); 1820 } 1821 else { 1822 return subFloat32Sigs( a, b, aSign ); 1823 } 1824 1825 } 1826 1827 /* 1828 ------------------------------------------------------------------------------- 1829 Returns the result of subtracting the single-precision floating-point values 1830 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1831 for Binary Floating-Point Arithmetic. 1832 ------------------------------------------------------------------------------- 1833 */ 1834 float32 float32_sub( float32 a, float32 b ) 1835 { 1836 flag aSign, bSign; 1837 1838 aSign = extractFloat32Sign( a ); 1839 bSign = extractFloat32Sign( b ); 1840 if ( aSign == bSign ) { 1841 return subFloat32Sigs( a, b, aSign ); 1842 } 1843 else { 1844 return addFloat32Sigs( a, b, aSign ); 1845 } 1846 1847 } 1848 1849 /* 1850 ------------------------------------------------------------------------------- 1851 Returns the result of multiplying the single-precision floating-point values 1852 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1853 for Binary Floating-Point Arithmetic. 1854 ------------------------------------------------------------------------------- 1855 */ 1856 float32 float32_mul( float32 a, float32 b ) 1857 { 1858 flag aSign, bSign, zSign; 1859 int16 aExp, bExp, zExp; 1860 bits32 aSig, bSig; 1861 bits64 zSig64; 1862 bits32 zSig; 1863 1864 aSig = extractFloat32Frac( a ); 1865 aExp = extractFloat32Exp( a ); 1866 aSign = extractFloat32Sign( a ); 1867 bSig = extractFloat32Frac( b ); 1868 bExp = extractFloat32Exp( b ); 1869 bSign = extractFloat32Sign( b ); 1870 zSign = aSign ^ bSign; 1871 if ( aExp == 0xFF ) { 1872 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1873 return propagateFloat32NaN( a, b ); 1874 } 1875 if ( ( bExp | bSig ) == 0 ) { 1876 float_raise( float_flag_invalid ); 1877 return float32_default_nan; 1878 } 1879 return packFloat32( zSign, 0xFF, 0 ); 1880 } 1881 if ( bExp == 0xFF ) { 1882 if ( bSig ) return propagateFloat32NaN( a, b ); 1883 if ( ( aExp | aSig ) == 0 ) { 1884 float_raise( float_flag_invalid ); 1885 return float32_default_nan; 1886 } 1887 return packFloat32( zSign, 0xFF, 0 ); 1888 } 1889 if ( aExp == 0 ) { 1890 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1891 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1892 } 1893 if ( bExp == 0 ) { 1894 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1895 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1896 } 1897 zExp = aExp + bExp - 0x7F; 1898 aSig = ( aSig | 0x00800000 )<<7; 1899 bSig = ( bSig | 0x00800000 )<<8; 1900 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1901 zSig = zSig64; 1902 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1903 zSig <<= 1; 1904 --zExp; 1905 } 1906 return roundAndPackFloat32( zSign, zExp, zSig ); 1907 1908 } 1909 1910 /* 1911 ------------------------------------------------------------------------------- 1912 Returns the result of dividing the single-precision floating-point value `a' 1913 by the corresponding value `b'. The operation is performed according to the 1914 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1915 ------------------------------------------------------------------------------- 1916 */ 1917 float32 float32_div( float32 a, float32 b ) 1918 { 1919 flag aSign, bSign, zSign; 1920 int16 aExp, bExp, zExp; 1921 bits32 aSig, bSig, zSig; 1922 1923 aSig = extractFloat32Frac( a ); 1924 aExp = extractFloat32Exp( a ); 1925 aSign = extractFloat32Sign( a ); 1926 bSig = extractFloat32Frac( b ); 1927 bExp = extractFloat32Exp( b ); 1928 bSign = extractFloat32Sign( b ); 1929 zSign = aSign ^ bSign; 1930 if ( aExp == 0xFF ) { 1931 if ( aSig ) return propagateFloat32NaN( a, b ); 1932 if ( bExp == 0xFF ) { 1933 if ( bSig ) return propagateFloat32NaN( a, b ); 1934 float_raise( float_flag_invalid ); 1935 return float32_default_nan; 1936 } 1937 return packFloat32( zSign, 0xFF, 0 ); 1938 } 1939 if ( bExp == 0xFF ) { 1940 if ( bSig ) return propagateFloat32NaN( a, b ); 1941 return packFloat32( zSign, 0, 0 ); 1942 } 1943 if ( bExp == 0 ) { 1944 if ( bSig == 0 ) { 1945 if ( ( aExp | aSig ) == 0 ) { 1946 float_raise( float_flag_invalid ); 1947 return float32_default_nan; 1948 } 1949 float_raise( float_flag_divbyzero ); 1950 return packFloat32( zSign, 0xFF, 0 ); 1951 } 1952 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1953 } 1954 if ( aExp == 0 ) { 1955 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1956 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1957 } 1958 zExp = aExp - bExp + 0x7D; 1959 aSig = ( aSig | 0x00800000 )<<7; 1960 bSig = ( bSig | 0x00800000 )<<8; 1961 if ( bSig <= ( aSig + aSig ) ) { 1962 aSig >>= 1; 1963 ++zExp; 1964 } 1965 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 1966 if ( ( zSig & 0x3F ) == 0 ) { 1967 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 1968 } 1969 return roundAndPackFloat32( zSign, zExp, zSig ); 1970 1971 } 1972 1973 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1974 /* 1975 ------------------------------------------------------------------------------- 1976 Returns the remainder of the single-precision floating-point value `a' 1977 with respect to the corresponding value `b'. The operation is performed 1978 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1979 ------------------------------------------------------------------------------- 1980 */ 1981 float32 float32_rem( float32 a, float32 b ) 1982 { 1983 flag aSign, bSign, zSign; 1984 int16 aExp, bExp, expDiff; 1985 bits32 aSig, bSig; 1986 bits32 q; 1987 bits64 aSig64, bSig64, q64; 1988 bits32 alternateASig; 1989 sbits32 sigMean; 1990 1991 aSig = extractFloat32Frac( a ); 1992 aExp = extractFloat32Exp( a ); 1993 aSign = extractFloat32Sign( a ); 1994 bSig = extractFloat32Frac( b ); 1995 bExp = extractFloat32Exp( b ); 1996 bSign = extractFloat32Sign( b ); 1997 if ( aExp == 0xFF ) { 1998 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1999 return propagateFloat32NaN( a, b ); 2000 } 2001 float_raise( float_flag_invalid ); 2002 return float32_default_nan; 2003 } 2004 if ( bExp == 0xFF ) { 2005 if ( bSig ) return propagateFloat32NaN( a, b ); 2006 return a; 2007 } 2008 if ( bExp == 0 ) { 2009 if ( bSig == 0 ) { 2010 float_raise( float_flag_invalid ); 2011 return float32_default_nan; 2012 } 2013 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2014 } 2015 if ( aExp == 0 ) { 2016 if ( aSig == 0 ) return a; 2017 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2018 } 2019 expDiff = aExp - bExp; 2020 aSig |= 0x00800000; 2021 bSig |= 0x00800000; 2022 if ( expDiff < 32 ) { 2023 aSig <<= 8; 2024 bSig <<= 8; 2025 if ( expDiff < 0 ) { 2026 if ( expDiff < -1 ) return a; 2027 aSig >>= 1; 2028 } 2029 q = ( bSig <= aSig ); 2030 if ( q ) aSig -= bSig; 2031 if ( 0 < expDiff ) { 2032 q = ( ( (bits64) aSig )<<32 ) / bSig; 2033 q >>= 32 - expDiff; 2034 bSig >>= 2; 2035 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2036 } 2037 else { 2038 aSig >>= 2; 2039 bSig >>= 2; 2040 } 2041 } 2042 else { 2043 if ( bSig <= aSig ) aSig -= bSig; 2044 aSig64 = ( (bits64) aSig )<<40; 2045 bSig64 = ( (bits64) bSig )<<40; 2046 expDiff -= 64; 2047 while ( 0 < expDiff ) { 2048 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2049 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2050 aSig64 = - ( ( bSig * q64 )<<38 ); 2051 expDiff -= 62; 2052 } 2053 expDiff += 64; 2054 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2055 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2056 q = q64>>( 64 - expDiff ); 2057 bSig <<= 6; 2058 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2059 } 2060 do { 2061 alternateASig = aSig; 2062 ++q; 2063 aSig -= bSig; 2064 } while ( 0 <= (sbits32) aSig ); 2065 sigMean = aSig + alternateASig; 2066 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2067 aSig = alternateASig; 2068 } 2069 zSign = ( (sbits32) aSig < 0 ); 2070 if ( zSign ) aSig = - aSig; 2071 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2072 2073 } 2074 #endif /* !SOFTFLOAT_FOR_GCC */ 2075 2076 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2077 /* 2078 ------------------------------------------------------------------------------- 2079 Returns the square root of the single-precision floating-point value `a'. 2080 The operation is performed according to the IEC/IEEE Standard for Binary 2081 Floating-Point Arithmetic. 2082 ------------------------------------------------------------------------------- 2083 */ 2084 float32 float32_sqrt( float32 a ) 2085 { 2086 flag aSign; 2087 int16 aExp, zExp; 2088 bits32 aSig, zSig; 2089 bits64 rem, term; 2090 2091 aSig = extractFloat32Frac( a ); 2092 aExp = extractFloat32Exp( a ); 2093 aSign = extractFloat32Sign( a ); 2094 if ( aExp == 0xFF ) { 2095 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2096 if ( ! aSign ) return a; 2097 float_raise( float_flag_invalid ); 2098 return float32_default_nan; 2099 } 2100 if ( aSign ) { 2101 if ( ( aExp | aSig ) == 0 ) return a; 2102 float_raise( float_flag_invalid ); 2103 return float32_default_nan; 2104 } 2105 if ( aExp == 0 ) { 2106 if ( aSig == 0 ) return 0; 2107 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2108 } 2109 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2110 aSig = ( aSig | 0x00800000 )<<8; 2111 zSig = estimateSqrt32( aExp, aSig ) + 2; 2112 if ( ( zSig & 0x7F ) <= 5 ) { 2113 if ( zSig < 2 ) { 2114 zSig = 0x7FFFFFFF; 2115 goto roundAndPack; 2116 } 2117 aSig >>= aExp & 1; 2118 term = ( (bits64) zSig ) * zSig; 2119 rem = ( ( (bits64) aSig )<<32 ) - term; 2120 while ( (sbits64) rem < 0 ) { 2121 --zSig; 2122 rem += ( ( (bits64) zSig )<<1 ) | 1; 2123 } 2124 zSig |= ( rem != 0 ); 2125 } 2126 shift32RightJamming( zSig, 1, &zSig ); 2127 roundAndPack: 2128 return roundAndPackFloat32( 0, zExp, zSig ); 2129 2130 } 2131 #endif /* !SOFTFLOAT_FOR_GCC */ 2132 2133 /* 2134 ------------------------------------------------------------------------------- 2135 Returns 1 if the single-precision floating-point value `a' is equal to 2136 the corresponding value `b', and 0 otherwise. The comparison is performed 2137 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2138 ------------------------------------------------------------------------------- 2139 */ 2140 flag float32_eq( float32 a, float32 b ) 2141 { 2142 2143 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2144 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2145 ) { 2146 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2147 float_raise( float_flag_invalid ); 2148 } 2149 return 0; 2150 } 2151 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2152 2153 } 2154 2155 /* 2156 ------------------------------------------------------------------------------- 2157 Returns 1 if the single-precision floating-point value `a' is less than 2158 or equal to the corresponding value `b', and 0 otherwise. The comparison 2159 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2160 Arithmetic. 2161 ------------------------------------------------------------------------------- 2162 */ 2163 flag float32_le( float32 a, float32 b ) 2164 { 2165 flag aSign, bSign; 2166 2167 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2168 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2169 ) { 2170 float_raise( float_flag_invalid ); 2171 return 0; 2172 } 2173 aSign = extractFloat32Sign( a ); 2174 bSign = extractFloat32Sign( b ); 2175 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2176 return ( a == b ) || ( aSign ^ ( a < b ) ); 2177 2178 } 2179 2180 /* 2181 ------------------------------------------------------------------------------- 2182 Returns 1 if the single-precision floating-point value `a' is less than 2183 the corresponding value `b', and 0 otherwise. The comparison is performed 2184 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2185 ------------------------------------------------------------------------------- 2186 */ 2187 flag float32_lt( float32 a, float32 b ) 2188 { 2189 flag aSign, bSign; 2190 2191 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2192 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2193 ) { 2194 float_raise( float_flag_invalid ); 2195 return 0; 2196 } 2197 aSign = extractFloat32Sign( a ); 2198 bSign = extractFloat32Sign( b ); 2199 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2200 return ( a != b ) && ( aSign ^ ( a < b ) ); 2201 2202 } 2203 2204 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2205 /* 2206 ------------------------------------------------------------------------------- 2207 Returns 1 if the single-precision floating-point value `a' is equal to 2208 the corresponding value `b', and 0 otherwise. The invalid exception is 2209 raised if either operand is a NaN. Otherwise, the comparison is performed 2210 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2211 ------------------------------------------------------------------------------- 2212 */ 2213 flag float32_eq_signaling( float32 a, float32 b ) 2214 { 2215 2216 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2217 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2218 ) { 2219 float_raise( float_flag_invalid ); 2220 return 0; 2221 } 2222 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2223 2224 } 2225 2226 /* 2227 ------------------------------------------------------------------------------- 2228 Returns 1 if the single-precision floating-point value `a' is less than or 2229 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2230 cause an exception. Otherwise, the comparison is performed according to the 2231 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2232 ------------------------------------------------------------------------------- 2233 */ 2234 flag float32_le_quiet( float32 a, float32 b ) 2235 { 2236 flag aSign, bSign; 2237 2238 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2239 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2240 ) { 2241 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2242 float_raise( float_flag_invalid ); 2243 } 2244 return 0; 2245 } 2246 aSign = extractFloat32Sign( a ); 2247 bSign = extractFloat32Sign( b ); 2248 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2249 return ( a == b ) || ( aSign ^ ( a < b ) ); 2250 2251 } 2252 2253 /* 2254 ------------------------------------------------------------------------------- 2255 Returns 1 if the single-precision floating-point value `a' is less than 2256 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2257 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2258 Standard for Binary Floating-Point Arithmetic. 2259 ------------------------------------------------------------------------------- 2260 */ 2261 flag float32_lt_quiet( float32 a, float32 b ) 2262 { 2263 flag aSign, bSign; 2264 2265 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2266 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2267 ) { 2268 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2269 float_raise( float_flag_invalid ); 2270 } 2271 return 0; 2272 } 2273 aSign = extractFloat32Sign( a ); 2274 bSign = extractFloat32Sign( b ); 2275 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2276 return ( a != b ) && ( aSign ^ ( a < b ) ); 2277 2278 } 2279 #endif /* !SOFTFLOAT_FOR_GCC */ 2280 2281 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2282 /* 2283 ------------------------------------------------------------------------------- 2284 Returns the result of converting the double-precision floating-point value 2285 `a' to the 32-bit two's complement integer format. The conversion is 2286 performed according to the IEC/IEEE Standard for Binary Floating-Point 2287 Arithmetic---which means in particular that the conversion is rounded 2288 according to the current rounding mode. If `a' is a NaN, the largest 2289 positive integer is returned. Otherwise, if the conversion overflows, the 2290 largest integer with the same sign as `a' is returned. 2291 ------------------------------------------------------------------------------- 2292 */ 2293 int32 float64_to_int32( float64 a ) 2294 { 2295 flag aSign; 2296 int16 aExp, shiftCount; 2297 bits64 aSig; 2298 2299 aSig = extractFloat64Frac( a ); 2300 aExp = extractFloat64Exp( a ); 2301 aSign = extractFloat64Sign( a ); 2302 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2303 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2304 shiftCount = 0x42C - aExp; 2305 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2306 return roundAndPackInt32( aSign, aSig ); 2307 2308 } 2309 #endif /* !SOFTFLOAT_FOR_GCC */ 2310 2311 /* 2312 ------------------------------------------------------------------------------- 2313 Returns the result of converting the double-precision floating-point value 2314 `a' to the 32-bit two's complement integer format. The conversion is 2315 performed according to the IEC/IEEE Standard for Binary Floating-Point 2316 Arithmetic, except that the conversion is always rounded toward zero. 2317 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2318 the conversion overflows, the largest integer with the same sign as `a' is 2319 returned. 2320 ------------------------------------------------------------------------------- 2321 */ 2322 int32 float64_to_int32_round_to_zero( float64 a ) 2323 { 2324 flag aSign; 2325 int16 aExp, shiftCount; 2326 bits64 aSig, savedASig; 2327 int32 z; 2328 2329 aSig = extractFloat64Frac( a ); 2330 aExp = extractFloat64Exp( a ); 2331 aSign = extractFloat64Sign( a ); 2332 if ( 0x41E < aExp ) { 2333 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2334 goto invalid; 2335 } 2336 else if ( aExp < 0x3FF ) { 2337 if ( aExp || aSig ) float_set_inexact(); 2338 return 0; 2339 } 2340 aSig |= LIT64( 0x0010000000000000 ); 2341 shiftCount = 0x433 - aExp; 2342 savedASig = aSig; 2343 aSig >>= shiftCount; 2344 z = aSig; 2345 if ( aSign ) z = - z; 2346 if ( ( z < 0 ) ^ aSign ) { 2347 invalid: 2348 float_raise( float_flag_invalid ); 2349 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2350 } 2351 if ( ( aSig<<shiftCount ) != savedASig ) { 2352 float_set_inexact(); 2353 } 2354 return z; 2355 2356 } 2357 2358 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2359 /* 2360 ------------------------------------------------------------------------------- 2361 Returns the result of converting the double-precision floating-point value 2362 `a' to the 64-bit two's complement integer format. The conversion is 2363 performed according to the IEC/IEEE Standard for Binary Floating-Point 2364 Arithmetic---which means in particular that the conversion is rounded 2365 according to the current rounding mode. If `a' is a NaN, the largest 2366 positive integer is returned. Otherwise, if the conversion overflows, the 2367 largest integer with the same sign as `a' is returned. 2368 ------------------------------------------------------------------------------- 2369 */ 2370 int64 float64_to_int64( float64 a ) 2371 { 2372 flag aSign; 2373 int16 aExp, shiftCount; 2374 bits64 aSig, aSigExtra; 2375 2376 aSig = extractFloat64Frac( a ); 2377 aExp = extractFloat64Exp( a ); 2378 aSign = extractFloat64Sign( a ); 2379 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2380 shiftCount = 0x433 - aExp; 2381 if ( shiftCount <= 0 ) { 2382 if ( 0x43E < aExp ) { 2383 float_raise( float_flag_invalid ); 2384 if ( ! aSign 2385 || ( ( aExp == 0x7FF ) 2386 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2387 ) { 2388 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2389 } 2390 return (sbits64) LIT64( 0x8000000000000000 ); 2391 } 2392 aSigExtra = 0; 2393 aSig <<= - shiftCount; 2394 } 2395 else { 2396 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2397 } 2398 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2399 2400 } 2401 2402 /* 2403 ------------------------------------------------------------------------------- 2404 Returns the result of converting the double-precision floating-point value 2405 `a' to the 64-bit two's complement integer format. The conversion is 2406 performed according to the IEC/IEEE Standard for Binary Floating-Point 2407 Arithmetic, except that the conversion is always rounded toward zero. 2408 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2409 the conversion overflows, the largest integer with the same sign as `a' is 2410 returned. 2411 ------------------------------------------------------------------------------- 2412 */ 2413 int64 float64_to_int64_round_to_zero( float64 a ) 2414 { 2415 flag aSign; 2416 int16 aExp, shiftCount; 2417 bits64 aSig; 2418 int64 z; 2419 2420 aSig = extractFloat64Frac( a ); 2421 aExp = extractFloat64Exp( a ); 2422 aSign = extractFloat64Sign( a ); 2423 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2424 shiftCount = aExp - 0x433; 2425 if ( 0 <= shiftCount ) { 2426 if ( 0x43E <= aExp ) { 2427 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2428 float_raise( float_flag_invalid ); 2429 if ( ! aSign 2430 || ( ( aExp == 0x7FF ) 2431 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2432 ) { 2433 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2434 } 2435 } 2436 return (sbits64) LIT64( 0x8000000000000000 ); 2437 } 2438 z = aSig<<shiftCount; 2439 } 2440 else { 2441 if ( aExp < 0x3FE ) { 2442 if ( aExp | aSig ) float_set_inexact(); 2443 return 0; 2444 } 2445 z = aSig>>( - shiftCount ); 2446 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2447 float_set_inexact(); 2448 } 2449 } 2450 if ( aSign ) z = - z; 2451 return z; 2452 2453 } 2454 #endif /* !SOFTFLOAT_FOR_GCC */ 2455 2456 /* 2457 ------------------------------------------------------------------------------- 2458 Returns the result of converting the double-precision floating-point value 2459 `a' to the single-precision floating-point format. The conversion is 2460 performed according to the IEC/IEEE Standard for Binary Floating-Point 2461 Arithmetic. 2462 ------------------------------------------------------------------------------- 2463 */ 2464 float32 float64_to_float32( float64 a ) 2465 { 2466 flag aSign; 2467 int16 aExp; 2468 bits64 aSig; 2469 bits32 zSig; 2470 2471 aSig = extractFloat64Frac( a ); 2472 aExp = extractFloat64Exp( a ); 2473 aSign = extractFloat64Sign( a ); 2474 if ( aExp == 0x7FF ) { 2475 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2476 return packFloat32( aSign, 0xFF, 0 ); 2477 } 2478 shift64RightJamming( aSig, 22, &aSig ); 2479 zSig = aSig; 2480 if ( aExp || zSig ) { 2481 zSig |= 0x40000000; 2482 aExp -= 0x381; 2483 } 2484 return roundAndPackFloat32( aSign, aExp, zSig ); 2485 2486 } 2487 2488 #ifdef FLOATX80 2489 2490 /* 2491 ------------------------------------------------------------------------------- 2492 Returns the result of converting the double-precision floating-point value 2493 `a' to the extended double-precision floating-point format. The conversion 2494 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2495 Arithmetic. 2496 ------------------------------------------------------------------------------- 2497 */ 2498 floatx80 float64_to_floatx80( float64 a ) 2499 { 2500 flag aSign; 2501 int16 aExp; 2502 bits64 aSig; 2503 2504 aSig = extractFloat64Frac( a ); 2505 aExp = extractFloat64Exp( a ); 2506 aSign = extractFloat64Sign( a ); 2507 if ( aExp == 0x7FF ) { 2508 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2509 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2510 } 2511 if ( aExp == 0 ) { 2512 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2513 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2514 } 2515 return 2516 packFloatx80( 2517 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2518 2519 } 2520 2521 #endif 2522 2523 #ifdef FLOAT128 2524 2525 /* 2526 ------------------------------------------------------------------------------- 2527 Returns the result of converting the double-precision floating-point value 2528 `a' to the quadruple-precision floating-point format. The conversion is 2529 performed according to the IEC/IEEE Standard for Binary Floating-Point 2530 Arithmetic. 2531 ------------------------------------------------------------------------------- 2532 */ 2533 float128 float64_to_float128( float64 a ) 2534 { 2535 flag aSign; 2536 int16 aExp; 2537 bits64 aSig, zSig0, zSig1; 2538 2539 aSig = extractFloat64Frac( a ); 2540 aExp = extractFloat64Exp( a ); 2541 aSign = extractFloat64Sign( a ); 2542 if ( aExp == 0x7FF ) { 2543 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2544 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2545 } 2546 if ( aExp == 0 ) { 2547 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2548 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2549 --aExp; 2550 } 2551 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2552 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2553 2554 } 2555 2556 #endif 2557 2558 #ifndef SOFTFLOAT_FOR_GCC 2559 /* 2560 ------------------------------------------------------------------------------- 2561 Rounds the double-precision floating-point value `a' to an integer, and 2562 returns the result as a double-precision floating-point value. The 2563 operation is performed according to the IEC/IEEE Standard for Binary 2564 Floating-Point Arithmetic. 2565 ------------------------------------------------------------------------------- 2566 */ 2567 float64 float64_round_to_int( float64 a ) 2568 { 2569 flag aSign; 2570 int16 aExp; 2571 bits64 lastBitMask, roundBitsMask; 2572 int8 roundingMode; 2573 float64 z; 2574 2575 aExp = extractFloat64Exp( a ); 2576 if ( 0x433 <= aExp ) { 2577 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2578 return propagateFloat64NaN( a, a ); 2579 } 2580 return a; 2581 } 2582 if ( aExp < 0x3FF ) { 2583 if ( (bits64) ( a<<1 ) == 0 ) return a; 2584 float_set_inexact(); 2585 aSign = extractFloat64Sign( a ); 2586 switch ( float_rounding_mode() ) { 2587 case float_round_nearest_even: 2588 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2589 return packFloat64( aSign, 0x3FF, 0 ); 2590 } 2591 break; 2592 case float_round_down: 2593 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2594 case float_round_up: 2595 return 2596 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2597 } 2598 return packFloat64( aSign, 0, 0 ); 2599 } 2600 lastBitMask = 1; 2601 lastBitMask <<= 0x433 - aExp; 2602 roundBitsMask = lastBitMask - 1; 2603 z = a; 2604 roundingMode = float_rounding_mode(); 2605 if ( roundingMode == float_round_nearest_even ) { 2606 z += lastBitMask>>1; 2607 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2608 } 2609 else if ( roundingMode != float_round_to_zero ) { 2610 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2611 z += roundBitsMask; 2612 } 2613 } 2614 z &= ~ roundBitsMask; 2615 if ( z != a ) float_set_inexact(); 2616 return z; 2617 2618 } 2619 #endif 2620 2621 /* 2622 ------------------------------------------------------------------------------- 2623 Returns the result of adding the absolute values of the double-precision 2624 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2625 before being returned. `zSign' is ignored if the result is a NaN. 2626 The addition is performed according to the IEC/IEEE Standard for Binary 2627 Floating-Point Arithmetic. 2628 ------------------------------------------------------------------------------- 2629 */ 2630 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2631 { 2632 int16 aExp, bExp, zExp; 2633 bits64 aSig, bSig, zSig; 2634 int16 expDiff; 2635 2636 aSig = extractFloat64Frac( a ); 2637 aExp = extractFloat64Exp( a ); 2638 bSig = extractFloat64Frac( b ); 2639 bExp = extractFloat64Exp( b ); 2640 expDiff = aExp - bExp; 2641 aSig <<= 9; 2642 bSig <<= 9; 2643 if ( 0 < expDiff ) { 2644 if ( aExp == 0x7FF ) { 2645 if ( aSig ) return propagateFloat64NaN( a, b ); 2646 return a; 2647 } 2648 if ( bExp == 0 ) { 2649 --expDiff; 2650 } 2651 else { 2652 bSig |= LIT64( 0x2000000000000000 ); 2653 } 2654 shift64RightJamming( bSig, expDiff, &bSig ); 2655 zExp = aExp; 2656 } 2657 else if ( expDiff < 0 ) { 2658 if ( bExp == 0x7FF ) { 2659 if ( bSig ) return propagateFloat64NaN( a, b ); 2660 return packFloat64( zSign, 0x7FF, 0 ); 2661 } 2662 if ( aExp == 0 ) { 2663 ++expDiff; 2664 } 2665 else { 2666 aSig |= LIT64( 0x2000000000000000 ); 2667 } 2668 shift64RightJamming( aSig, - expDiff, &aSig ); 2669 zExp = bExp; 2670 } 2671 else { 2672 if ( aExp == 0x7FF ) { 2673 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2674 return a; 2675 } 2676 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2677 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2678 zExp = aExp; 2679 goto roundAndPack; 2680 } 2681 aSig |= LIT64( 0x2000000000000000 ); 2682 zSig = ( aSig + bSig )<<1; 2683 --zExp; 2684 if ( (sbits64) zSig < 0 ) { 2685 zSig = aSig + bSig; 2686 ++zExp; 2687 } 2688 roundAndPack: 2689 return roundAndPackFloat64( zSign, zExp, zSig ); 2690 2691 } 2692 2693 /* 2694 ------------------------------------------------------------------------------- 2695 Returns the result of subtracting the absolute values of the double- 2696 precision floating-point values `a' and `b'. If `zSign' is 1, the 2697 difference is negated before being returned. `zSign' is ignored if the 2698 result is a NaN. The subtraction is performed according to the IEC/IEEE 2699 Standard for Binary Floating-Point Arithmetic. 2700 ------------------------------------------------------------------------------- 2701 */ 2702 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2703 { 2704 int16 aExp, bExp, zExp; 2705 bits64 aSig, bSig, zSig; 2706 int16 expDiff; 2707 2708 aSig = extractFloat64Frac( a ); 2709 aExp = extractFloat64Exp( a ); 2710 bSig = extractFloat64Frac( b ); 2711 bExp = extractFloat64Exp( b ); 2712 expDiff = aExp - bExp; 2713 aSig <<= 10; 2714 bSig <<= 10; 2715 if ( 0 < expDiff ) goto aExpBigger; 2716 if ( expDiff < 0 ) goto bExpBigger; 2717 if ( aExp == 0x7FF ) { 2718 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2719 float_raise( float_flag_invalid ); 2720 return float64_default_nan; 2721 } 2722 if ( aExp == 0 ) { 2723 aExp = 1; 2724 bExp = 1; 2725 } 2726 if ( bSig < aSig ) goto aBigger; 2727 if ( aSig < bSig ) goto bBigger; 2728 return packFloat64( float_rounding_mode() == float_round_down, 0, 0 ); 2729 bExpBigger: 2730 if ( bExp == 0x7FF ) { 2731 if ( bSig ) return propagateFloat64NaN( a, b ); 2732 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2733 } 2734 if ( aExp == 0 ) { 2735 ++expDiff; 2736 } 2737 else { 2738 aSig |= LIT64( 0x4000000000000000 ); 2739 } 2740 shift64RightJamming( aSig, - expDiff, &aSig ); 2741 bSig |= LIT64( 0x4000000000000000 ); 2742 bBigger: 2743 zSig = bSig - aSig; 2744 zExp = bExp; 2745 zSign ^= 1; 2746 goto normalizeRoundAndPack; 2747 aExpBigger: 2748 if ( aExp == 0x7FF ) { 2749 if ( aSig ) return propagateFloat64NaN( a, b ); 2750 return a; 2751 } 2752 if ( bExp == 0 ) { 2753 --expDiff; 2754 } 2755 else { 2756 bSig |= LIT64( 0x4000000000000000 ); 2757 } 2758 shift64RightJamming( bSig, expDiff, &bSig ); 2759 aSig |= LIT64( 0x4000000000000000 ); 2760 aBigger: 2761 zSig = aSig - bSig; 2762 zExp = aExp; 2763 normalizeRoundAndPack: 2764 --zExp; 2765 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2766 2767 } 2768 2769 /* 2770 ------------------------------------------------------------------------------- 2771 Returns the result of adding the double-precision floating-point values `a' 2772 and `b'. The operation is performed according to the IEC/IEEE Standard for 2773 Binary Floating-Point Arithmetic. 2774 ------------------------------------------------------------------------------- 2775 */ 2776 float64 float64_add( float64 a, float64 b ) 2777 { 2778 flag aSign, bSign; 2779 2780 aSign = extractFloat64Sign( a ); 2781 bSign = extractFloat64Sign( b ); 2782 if ( aSign == bSign ) { 2783 return addFloat64Sigs( a, b, aSign ); 2784 } 2785 else { 2786 return subFloat64Sigs( a, b, aSign ); 2787 } 2788 2789 } 2790 2791 /* 2792 ------------------------------------------------------------------------------- 2793 Returns the result of subtracting the double-precision floating-point values 2794 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2795 for Binary Floating-Point Arithmetic. 2796 ------------------------------------------------------------------------------- 2797 */ 2798 float64 float64_sub( float64 a, float64 b ) 2799 { 2800 flag aSign, bSign; 2801 2802 aSign = extractFloat64Sign( a ); 2803 bSign = extractFloat64Sign( b ); 2804 if ( aSign == bSign ) { 2805 return subFloat64Sigs( a, b, aSign ); 2806 } 2807 else { 2808 return addFloat64Sigs( a, b, aSign ); 2809 } 2810 2811 } 2812 2813 /* 2814 ------------------------------------------------------------------------------- 2815 Returns the result of multiplying the double-precision floating-point values 2816 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2817 for Binary Floating-Point Arithmetic. 2818 ------------------------------------------------------------------------------- 2819 */ 2820 float64 float64_mul( float64 a, float64 b ) 2821 { 2822 flag aSign, bSign, zSign; 2823 int16 aExp, bExp, zExp; 2824 bits64 aSig, bSig, zSig0, zSig1; 2825 2826 aSig = extractFloat64Frac( a ); 2827 aExp = extractFloat64Exp( a ); 2828 aSign = extractFloat64Sign( a ); 2829 bSig = extractFloat64Frac( b ); 2830 bExp = extractFloat64Exp( b ); 2831 bSign = extractFloat64Sign( b ); 2832 zSign = aSign ^ bSign; 2833 if ( aExp == 0x7FF ) { 2834 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2835 return propagateFloat64NaN( a, b ); 2836 } 2837 if ( ( bExp | bSig ) == 0 ) { 2838 float_raise( float_flag_invalid ); 2839 return float64_default_nan; 2840 } 2841 return packFloat64( zSign, 0x7FF, 0 ); 2842 } 2843 if ( bExp == 0x7FF ) { 2844 if ( bSig ) return propagateFloat64NaN( a, b ); 2845 if ( ( aExp | aSig ) == 0 ) { 2846 float_raise( float_flag_invalid ); 2847 return float64_default_nan; 2848 } 2849 return packFloat64( zSign, 0x7FF, 0 ); 2850 } 2851 if ( aExp == 0 ) { 2852 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2853 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2854 } 2855 if ( bExp == 0 ) { 2856 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2857 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2858 } 2859 zExp = aExp + bExp - 0x3FF; 2860 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2861 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2862 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2863 zSig0 |= ( zSig1 != 0 ); 2864 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2865 zSig0 <<= 1; 2866 --zExp; 2867 } 2868 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2869 2870 } 2871 2872 /* 2873 ------------------------------------------------------------------------------- 2874 Returns the result of dividing the double-precision floating-point value `a' 2875 by the corresponding value `b'. The operation is performed according to 2876 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2877 ------------------------------------------------------------------------------- 2878 */ 2879 float64 float64_div( float64 a, float64 b ) 2880 { 2881 flag aSign, bSign, zSign; 2882 int16 aExp, bExp, zExp; 2883 bits64 aSig, bSig, zSig; 2884 bits64 rem0, rem1; 2885 bits64 term0, term1; 2886 2887 aSig = extractFloat64Frac( a ); 2888 aExp = extractFloat64Exp( a ); 2889 aSign = extractFloat64Sign( a ); 2890 bSig = extractFloat64Frac( b ); 2891 bExp = extractFloat64Exp( b ); 2892 bSign = extractFloat64Sign( b ); 2893 zSign = aSign ^ bSign; 2894 if ( aExp == 0x7FF ) { 2895 if ( aSig ) return propagateFloat64NaN( a, b ); 2896 if ( bExp == 0x7FF ) { 2897 if ( bSig ) return propagateFloat64NaN( a, b ); 2898 float_raise( float_flag_invalid ); 2899 return float64_default_nan; 2900 } 2901 return packFloat64( zSign, 0x7FF, 0 ); 2902 } 2903 if ( bExp == 0x7FF ) { 2904 if ( bSig ) return propagateFloat64NaN( a, b ); 2905 return packFloat64( zSign, 0, 0 ); 2906 } 2907 if ( bExp == 0 ) { 2908 if ( bSig == 0 ) { 2909 if ( ( aExp | aSig ) == 0 ) { 2910 float_raise( float_flag_invalid ); 2911 return float64_default_nan; 2912 } 2913 float_raise( float_flag_divbyzero ); 2914 return packFloat64( zSign, 0x7FF, 0 ); 2915 } 2916 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2917 } 2918 if ( aExp == 0 ) { 2919 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2920 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2921 } 2922 zExp = aExp - bExp + 0x3FD; 2923 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2924 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2925 if ( bSig <= ( aSig + aSig ) ) { 2926 aSig >>= 1; 2927 ++zExp; 2928 } 2929 zSig = estimateDiv128To64( aSig, 0, bSig ); 2930 if ( ( zSig & 0x1FF ) <= 2 ) { 2931 mul64To128( bSig, zSig, &term0, &term1 ); 2932 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2933 while ( (sbits64) rem0 < 0 ) { 2934 --zSig; 2935 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2936 } 2937 zSig |= ( rem1 != 0 ); 2938 } 2939 return roundAndPackFloat64( zSign, zExp, zSig ); 2940 2941 } 2942 2943 #ifndef SOFTFLOAT_FOR_GCC 2944 /* 2945 ------------------------------------------------------------------------------- 2946 Returns the remainder of the double-precision floating-point value `a' 2947 with respect to the corresponding value `b'. The operation is performed 2948 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2949 ------------------------------------------------------------------------------- 2950 */ 2951 float64 float64_rem( float64 a, float64 b ) 2952 { 2953 flag aSign, bSign, zSign; 2954 int16 aExp, bExp, expDiff; 2955 bits64 aSig, bSig; 2956 bits64 q, alternateASig; 2957 sbits64 sigMean; 2958 2959 aSig = extractFloat64Frac( a ); 2960 aExp = extractFloat64Exp( a ); 2961 aSign = extractFloat64Sign( a ); 2962 bSig = extractFloat64Frac( b ); 2963 bExp = extractFloat64Exp( b ); 2964 bSign = extractFloat64Sign( b ); 2965 if ( aExp == 0x7FF ) { 2966 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2967 return propagateFloat64NaN( a, b ); 2968 } 2969 float_raise( float_flag_invalid ); 2970 return float64_default_nan; 2971 } 2972 if ( bExp == 0x7FF ) { 2973 if ( bSig ) return propagateFloat64NaN( a, b ); 2974 return a; 2975 } 2976 if ( bExp == 0 ) { 2977 if ( bSig == 0 ) { 2978 float_raise( float_flag_invalid ); 2979 return float64_default_nan; 2980 } 2981 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2982 } 2983 if ( aExp == 0 ) { 2984 if ( aSig == 0 ) return a; 2985 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2986 } 2987 expDiff = aExp - bExp; 2988 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 2989 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2990 if ( expDiff < 0 ) { 2991 if ( expDiff < -1 ) return a; 2992 aSig >>= 1; 2993 } 2994 q = ( bSig <= aSig ); 2995 if ( q ) aSig -= bSig; 2996 expDiff -= 64; 2997 while ( 0 < expDiff ) { 2998 q = estimateDiv128To64( aSig, 0, bSig ); 2999 q = ( 2 < q ) ? q - 2 : 0; 3000 aSig = - ( ( bSig>>2 ) * q ); 3001 expDiff -= 62; 3002 } 3003 expDiff += 64; 3004 if ( 0 < expDiff ) { 3005 q = estimateDiv128To64( aSig, 0, bSig ); 3006 q = ( 2 < q ) ? q - 2 : 0; 3007 q >>= 64 - expDiff; 3008 bSig >>= 2; 3009 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3010 } 3011 else { 3012 aSig >>= 2; 3013 bSig >>= 2; 3014 } 3015 do { 3016 alternateASig = aSig; 3017 ++q; 3018 aSig -= bSig; 3019 } while ( 0 <= (sbits64) aSig ); 3020 sigMean = aSig + alternateASig; 3021 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3022 aSig = alternateASig; 3023 } 3024 zSign = ( (sbits64) aSig < 0 ); 3025 if ( zSign ) aSig = - aSig; 3026 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3027 3028 } 3029 3030 /* 3031 ------------------------------------------------------------------------------- 3032 Returns the square root of the double-precision floating-point value `a'. 3033 The operation is performed according to the IEC/IEEE Standard for Binary 3034 Floating-Point Arithmetic. 3035 ------------------------------------------------------------------------------- 3036 */ 3037 float64 float64_sqrt( float64 a ) 3038 { 3039 flag aSign; 3040 int16 aExp, zExp; 3041 bits64 aSig, zSig, doubleZSig; 3042 bits64 rem0, rem1, term0, term1; 3043 3044 aSig = extractFloat64Frac( a ); 3045 aExp = extractFloat64Exp( a ); 3046 aSign = extractFloat64Sign( a ); 3047 if ( aExp == 0x7FF ) { 3048 if ( aSig ) return propagateFloat64NaN( a, a ); 3049 if ( ! aSign ) return a; 3050 float_raise( float_flag_invalid ); 3051 return float64_default_nan; 3052 } 3053 if ( aSign ) { 3054 if ( ( aExp | aSig ) == 0 ) return a; 3055 float_raise( float_flag_invalid ); 3056 return float64_default_nan; 3057 } 3058 if ( aExp == 0 ) { 3059 if ( aSig == 0 ) return 0; 3060 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3061 } 3062 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3063 aSig |= LIT64( 0x0010000000000000 ); 3064 zSig = estimateSqrt32( aExp, aSig>>21 ); 3065 aSig <<= 9 - ( aExp & 1 ); 3066 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3067 if ( ( zSig & 0x1FF ) <= 5 ) { 3068 doubleZSig = zSig<<1; 3069 mul64To128( zSig, zSig, &term0, &term1 ); 3070 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3071 while ( (sbits64) rem0 < 0 ) { 3072 --zSig; 3073 doubleZSig -= 2; 3074 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3075 } 3076 zSig |= ( ( rem0 | rem1 ) != 0 ); 3077 } 3078 return roundAndPackFloat64( 0, zExp, zSig ); 3079 3080 } 3081 #endif 3082 3083 /* 3084 ------------------------------------------------------------------------------- 3085 Returns 1 if the double-precision floating-point value `a' is equal to the 3086 corresponding value `b', and 0 otherwise. The comparison is performed 3087 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3088 ------------------------------------------------------------------------------- 3089 */ 3090 flag float64_eq( float64 a, float64 b ) 3091 { 3092 3093 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3094 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3095 ) { 3096 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3097 float_raise( float_flag_invalid ); 3098 } 3099 return 0; 3100 } 3101 return ( a == b ) || 3102 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3103 3104 } 3105 3106 /* 3107 ------------------------------------------------------------------------------- 3108 Returns 1 if the double-precision floating-point value `a' is less than or 3109 equal to the corresponding value `b', and 0 otherwise. The comparison is 3110 performed according to the IEC/IEEE Standard for Binary Floating-Point 3111 Arithmetic. 3112 ------------------------------------------------------------------------------- 3113 */ 3114 flag float64_le( float64 a, float64 b ) 3115 { 3116 flag aSign, bSign; 3117 3118 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3119 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3120 ) { 3121 float_raise( float_flag_invalid ); 3122 return 0; 3123 } 3124 aSign = extractFloat64Sign( a ); 3125 bSign = extractFloat64Sign( b ); 3126 if ( aSign != bSign ) 3127 return aSign || 3128 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3129 0 ); 3130 return ( a == b ) || 3131 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3132 3133 } 3134 3135 /* 3136 ------------------------------------------------------------------------------- 3137 Returns 1 if the double-precision floating-point value `a' is less than 3138 the corresponding value `b', and 0 otherwise. The comparison is performed 3139 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3140 ------------------------------------------------------------------------------- 3141 */ 3142 flag float64_lt( float64 a, float64 b ) 3143 { 3144 flag aSign, bSign; 3145 3146 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3147 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3148 ) { 3149 float_raise( float_flag_invalid ); 3150 return 0; 3151 } 3152 aSign = extractFloat64Sign( a ); 3153 bSign = extractFloat64Sign( b ); 3154 if ( aSign != bSign ) 3155 return aSign && 3156 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3157 0 ); 3158 return ( a != b ) && 3159 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3160 3161 } 3162 3163 #ifndef SOFTFLOAT_FOR_GCC 3164 /* 3165 ------------------------------------------------------------------------------- 3166 Returns 1 if the double-precision floating-point value `a' is equal to the 3167 corresponding value `b', and 0 otherwise. The invalid exception is raised 3168 if either operand is a NaN. Otherwise, the comparison is performed 3169 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3170 ------------------------------------------------------------------------------- 3171 */ 3172 flag float64_eq_signaling( float64 a, float64 b ) 3173 { 3174 3175 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3176 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3177 ) { 3178 float_raise( float_flag_invalid ); 3179 return 0; 3180 } 3181 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3182 3183 } 3184 3185 /* 3186 ------------------------------------------------------------------------------- 3187 Returns 1 if the double-precision floating-point value `a' is less than or 3188 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3189 cause an exception. Otherwise, the comparison is performed according to the 3190 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3191 ------------------------------------------------------------------------------- 3192 */ 3193 flag float64_le_quiet( float64 a, float64 b ) 3194 { 3195 flag aSign, bSign; 3196 3197 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3198 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3199 ) { 3200 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3201 float_raise( float_flag_invalid ); 3202 } 3203 return 0; 3204 } 3205 aSign = extractFloat64Sign( a ); 3206 bSign = extractFloat64Sign( b ); 3207 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3208 return ( a == b ) || ( aSign ^ ( a < b ) ); 3209 3210 } 3211 3212 /* 3213 ------------------------------------------------------------------------------- 3214 Returns 1 if the double-precision floating-point value `a' is less than 3215 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3216 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3217 Standard for Binary Floating-Point Arithmetic. 3218 ------------------------------------------------------------------------------- 3219 */ 3220 flag float64_lt_quiet( float64 a, float64 b ) 3221 { 3222 flag aSign, bSign; 3223 3224 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3225 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3226 ) { 3227 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3228 float_raise( float_flag_invalid ); 3229 } 3230 return 0; 3231 } 3232 aSign = extractFloat64Sign( a ); 3233 bSign = extractFloat64Sign( b ); 3234 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3235 return ( a != b ) && ( aSign ^ ( a < b ) ); 3236 3237 } 3238 #endif 3239 3240 #ifdef FLOATX80 3241 3242 /* 3243 ------------------------------------------------------------------------------- 3244 Returns the result of converting the extended double-precision floating- 3245 point value `a' to the 32-bit two's complement integer format. The 3246 conversion is performed according to the IEC/IEEE Standard for Binary 3247 Floating-Point Arithmetic---which means in particular that the conversion 3248 is rounded according to the current rounding mode. If `a' is a NaN, the 3249 largest positive integer is returned. Otherwise, if the conversion 3250 overflows, the largest integer with the same sign as `a' is returned. 3251 ------------------------------------------------------------------------------- 3252 */ 3253 int32 floatx80_to_int32( floatx80 a ) 3254 { 3255 flag aSign; 3256 int32 aExp, shiftCount; 3257 bits64 aSig; 3258 3259 aSig = extractFloatx80Frac( a ); 3260 aExp = extractFloatx80Exp( a ); 3261 aSign = extractFloatx80Sign( a ); 3262 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3263 shiftCount = 0x4037 - aExp; 3264 if ( shiftCount <= 0 ) shiftCount = 1; 3265 shift64RightJamming( aSig, shiftCount, &aSig ); 3266 return roundAndPackInt32( aSign, aSig ); 3267 3268 } 3269 3270 /* 3271 ------------------------------------------------------------------------------- 3272 Returns the result of converting the extended double-precision floating- 3273 point value `a' to the 32-bit two's complement integer format. The 3274 conversion is performed according to the IEC/IEEE Standard for Binary 3275 Floating-Point Arithmetic, except that the conversion is always rounded 3276 toward zero. If `a' is a NaN, the largest positive integer is returned. 3277 Otherwise, if the conversion overflows, the largest integer with the same 3278 sign as `a' is returned. 3279 ------------------------------------------------------------------------------- 3280 */ 3281 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3282 { 3283 flag aSign; 3284 int32 aExp, shiftCount; 3285 bits64 aSig, savedASig; 3286 int32 z; 3287 3288 aSig = extractFloatx80Frac( a ); 3289 aExp = extractFloatx80Exp( a ); 3290 aSign = extractFloatx80Sign( a ); 3291 if ( 0x401E < aExp ) { 3292 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3293 goto invalid; 3294 } 3295 else if ( aExp < 0x3FFF ) { 3296 if ( aExp || aSig ) float_set_inexact(); 3297 return 0; 3298 } 3299 shiftCount = 0x403E - aExp; 3300 savedASig = aSig; 3301 aSig >>= shiftCount; 3302 z = aSig; 3303 if ( aSign ) z = - z; 3304 if ( ( z < 0 ) ^ aSign ) { 3305 invalid: 3306 float_raise( float_flag_invalid ); 3307 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3308 } 3309 if ( ( aSig<<shiftCount ) != savedASig ) { 3310 float_set_inexact(); 3311 } 3312 return z; 3313 3314 } 3315 3316 /* 3317 ------------------------------------------------------------------------------- 3318 Returns the result of converting the extended double-precision floating- 3319 point value `a' to the 64-bit two's complement integer format. The 3320 conversion is performed according to the IEC/IEEE Standard for Binary 3321 Floating-Point Arithmetic---which means in particular that the conversion 3322 is rounded according to the current rounding mode. If `a' is a NaN, 3323 the largest positive integer is returned. Otherwise, if the conversion 3324 overflows, the largest integer with the same sign as `a' is returned. 3325 ------------------------------------------------------------------------------- 3326 */ 3327 int64 floatx80_to_int64( floatx80 a ) 3328 { 3329 flag aSign; 3330 int32 aExp, shiftCount; 3331 bits64 aSig, aSigExtra; 3332 3333 aSig = extractFloatx80Frac( a ); 3334 aExp = extractFloatx80Exp( a ); 3335 aSign = extractFloatx80Sign( a ); 3336 shiftCount = 0x403E - aExp; 3337 if ( shiftCount <= 0 ) { 3338 if ( shiftCount ) { 3339 float_raise( float_flag_invalid ); 3340 if ( ! aSign 3341 || ( ( aExp == 0x7FFF ) 3342 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3343 ) { 3344 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3345 } 3346 return (sbits64) LIT64( 0x8000000000000000 ); 3347 } 3348 aSigExtra = 0; 3349 } 3350 else { 3351 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3352 } 3353 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3354 3355 } 3356 3357 /* 3358 ------------------------------------------------------------------------------- 3359 Returns the result of converting the extended double-precision floating- 3360 point value `a' to the 64-bit two's complement integer format. The 3361 conversion is performed according to the IEC/IEEE Standard for Binary 3362 Floating-Point Arithmetic, except that the conversion is always rounded 3363 toward zero. If `a' is a NaN, the largest positive integer is returned. 3364 Otherwise, if the conversion overflows, the largest integer with the same 3365 sign as `a' is returned. 3366 ------------------------------------------------------------------------------- 3367 */ 3368 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3369 { 3370 flag aSign; 3371 int32 aExp, shiftCount; 3372 bits64 aSig; 3373 int64 z; 3374 3375 aSig = extractFloatx80Frac( a ); 3376 aExp = extractFloatx80Exp( a ); 3377 aSign = extractFloatx80Sign( a ); 3378 shiftCount = aExp - 0x403E; 3379 if ( 0 <= shiftCount ) { 3380 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3381 if ( ( a.high != 0xC03E ) || aSig ) { 3382 float_raise( float_flag_invalid ); 3383 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3384 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3385 } 3386 } 3387 return (sbits64) LIT64( 0x8000000000000000 ); 3388 } 3389 else if ( aExp < 0x3FFF ) { 3390 if ( aExp | aSig ) float_set_inexact(); 3391 return 0; 3392 } 3393 z = aSig>>( - shiftCount ); 3394 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3395 float_set_inexact(); 3396 } 3397 if ( aSign ) z = - z; 3398 return z; 3399 3400 } 3401 3402 /* 3403 ------------------------------------------------------------------------------- 3404 Returns the result of converting the extended double-precision floating- 3405 point value `a' to the single-precision floating-point format. The 3406 conversion is performed according to the IEC/IEEE Standard for Binary 3407 Floating-Point Arithmetic. 3408 ------------------------------------------------------------------------------- 3409 */ 3410 float32 floatx80_to_float32( floatx80 a ) 3411 { 3412 flag aSign; 3413 int32 aExp; 3414 bits64 aSig; 3415 3416 aSig = extractFloatx80Frac( a ); 3417 aExp = extractFloatx80Exp( a ); 3418 aSign = extractFloatx80Sign( a ); 3419 if ( aExp == 0x7FFF ) { 3420 if ( (bits64) ( aSig<<1 ) ) { 3421 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3422 } 3423 return packFloat32( aSign, 0xFF, 0 ); 3424 } 3425 shift64RightJamming( aSig, 33, &aSig ); 3426 if ( aExp || aSig ) aExp -= 0x3F81; 3427 return roundAndPackFloat32( aSign, aExp, aSig ); 3428 3429 } 3430 3431 /* 3432 ------------------------------------------------------------------------------- 3433 Returns the result of converting the extended double-precision floating- 3434 point value `a' to the double-precision floating-point format. The 3435 conversion is performed according to the IEC/IEEE Standard for Binary 3436 Floating-Point Arithmetic. 3437 ------------------------------------------------------------------------------- 3438 */ 3439 float64 floatx80_to_float64( floatx80 a ) 3440 { 3441 flag aSign; 3442 int32 aExp; 3443 bits64 aSig, zSig; 3444 3445 aSig = extractFloatx80Frac( a ); 3446 aExp = extractFloatx80Exp( a ); 3447 aSign = extractFloatx80Sign( a ); 3448 if ( aExp == 0x7FFF ) { 3449 if ( (bits64) ( aSig<<1 ) ) { 3450 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3451 } 3452 return packFloat64( aSign, 0x7FF, 0 ); 3453 } 3454 shift64RightJamming( aSig, 1, &zSig ); 3455 if ( aExp || aSig ) aExp -= 0x3C01; 3456 return roundAndPackFloat64( aSign, aExp, zSig ); 3457 3458 } 3459 3460 #ifdef FLOAT128 3461 3462 /* 3463 ------------------------------------------------------------------------------- 3464 Returns the result of converting the extended double-precision floating- 3465 point value `a' to the quadruple-precision floating-point format. The 3466 conversion is performed according to the IEC/IEEE Standard for Binary 3467 Floating-Point Arithmetic. 3468 ------------------------------------------------------------------------------- 3469 */ 3470 float128 floatx80_to_float128( floatx80 a ) 3471 { 3472 flag aSign; 3473 int16 aExp; 3474 bits64 aSig, zSig0, zSig1; 3475 3476 aSig = extractFloatx80Frac( a ); 3477 aExp = extractFloatx80Exp( a ); 3478 aSign = extractFloatx80Sign( a ); 3479 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3480 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3481 } 3482 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3483 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3484 3485 } 3486 3487 #endif 3488 3489 /* 3490 ------------------------------------------------------------------------------- 3491 Rounds the extended double-precision floating-point value `a' to an integer, 3492 and returns the result as an extended quadruple-precision floating-point 3493 value. The operation is performed according to the IEC/IEEE Standard for 3494 Binary Floating-Point Arithmetic. 3495 ------------------------------------------------------------------------------- 3496 */ 3497 floatx80 floatx80_round_to_int( floatx80 a ) 3498 { 3499 flag aSign; 3500 int32 aExp; 3501 bits64 lastBitMask, roundBitsMask; 3502 int8 roundingMode; 3503 floatx80 z; 3504 3505 aExp = extractFloatx80Exp( a ); 3506 if ( 0x403E <= aExp ) { 3507 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3508 return propagateFloatx80NaN( a, a ); 3509 } 3510 return a; 3511 } 3512 if ( aExp < 0x3FFF ) { 3513 if ( ( aExp == 0 ) 3514 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3515 return a; 3516 } 3517 float_set_inexact(); 3518 aSign = extractFloatx80Sign( a ); 3519 switch ( float_rounding_mode() ) { 3520 case float_round_nearest_even: 3521 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3522 ) { 3523 return 3524 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3525 } 3526 break; 3527 case float_round_down: 3528 return 3529 aSign ? 3530 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3531 : packFloatx80( 0, 0, 0 ); 3532 case float_round_up: 3533 return 3534 aSign ? packFloatx80( 1, 0, 0 ) 3535 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3536 } 3537 return packFloatx80( aSign, 0, 0 ); 3538 } 3539 lastBitMask = 1; 3540 lastBitMask <<= 0x403E - aExp; 3541 roundBitsMask = lastBitMask - 1; 3542 z = a; 3543 roundingMode = float_rounding_mode(); 3544 if ( roundingMode == float_round_nearest_even ) { 3545 z.low += lastBitMask>>1; 3546 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3547 } 3548 else if ( roundingMode != float_round_to_zero ) { 3549 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3550 z.low += roundBitsMask; 3551 } 3552 } 3553 z.low &= ~ roundBitsMask; 3554 if ( z.low == 0 ) { 3555 ++z.high; 3556 z.low = LIT64( 0x8000000000000000 ); 3557 } 3558 if ( z.low != a.low ) float_set_inexact(); 3559 return z; 3560 3561 } 3562 3563 /* 3564 ------------------------------------------------------------------------------- 3565 Returns the result of adding the absolute values of the extended double- 3566 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3567 negated before being returned. `zSign' is ignored if the result is a NaN. 3568 The addition is performed according to the IEC/IEEE Standard for Binary 3569 Floating-Point Arithmetic. 3570 ------------------------------------------------------------------------------- 3571 */ 3572 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3573 { 3574 int32 aExp, bExp, zExp; 3575 bits64 aSig, bSig, zSig0, zSig1; 3576 int32 expDiff; 3577 3578 aSig = extractFloatx80Frac( a ); 3579 aExp = extractFloatx80Exp( a ); 3580 bSig = extractFloatx80Frac( b ); 3581 bExp = extractFloatx80Exp( b ); 3582 expDiff = aExp - bExp; 3583 if ( 0 < expDiff ) { 3584 if ( aExp == 0x7FFF ) { 3585 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3586 return a; 3587 } 3588 if ( bExp == 0 ) --expDiff; 3589 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3590 zExp = aExp; 3591 } 3592 else if ( expDiff < 0 ) { 3593 if ( bExp == 0x7FFF ) { 3594 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3595 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3596 } 3597 if ( aExp == 0 ) ++expDiff; 3598 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3599 zExp = bExp; 3600 } 3601 else { 3602 if ( aExp == 0x7FFF ) { 3603 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3604 return propagateFloatx80NaN( a, b ); 3605 } 3606 return a; 3607 } 3608 zSig1 = 0; 3609 zSig0 = aSig + bSig; 3610 if ( aExp == 0 ) { 3611 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3612 goto roundAndPack; 3613 } 3614 zExp = aExp; 3615 goto shiftRight1; 3616 } 3617 zSig0 = aSig + bSig; 3618 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3619 shiftRight1: 3620 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3621 zSig0 |= LIT64( 0x8000000000000000 ); 3622 ++zExp; 3623 roundAndPack: 3624 return 3625 roundAndPackFloatx80( 3626 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3627 3628 } 3629 3630 /* 3631 ------------------------------------------------------------------------------- 3632 Returns the result of subtracting the absolute values of the extended 3633 double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3634 difference is negated before being returned. `zSign' is ignored if the 3635 result is a NaN. The subtraction is performed according to the IEC/IEEE 3636 Standard for Binary Floating-Point Arithmetic. 3637 ------------------------------------------------------------------------------- 3638 */ 3639 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3640 { 3641 int32 aExp, bExp, zExp; 3642 bits64 aSig, bSig, zSig0, zSig1; 3643 int32 expDiff; 3644 floatx80 z; 3645 3646 aSig = extractFloatx80Frac( a ); 3647 aExp = extractFloatx80Exp( a ); 3648 bSig = extractFloatx80Frac( b ); 3649 bExp = extractFloatx80Exp( b ); 3650 expDiff = aExp - bExp; 3651 if ( 0 < expDiff ) goto aExpBigger; 3652 if ( expDiff < 0 ) goto bExpBigger; 3653 if ( aExp == 0x7FFF ) { 3654 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3655 return propagateFloatx80NaN( a, b ); 3656 } 3657 float_raise( float_flag_invalid ); 3658 z.low = floatx80_default_nan_low; 3659 z.high = floatx80_default_nan_high; 3660 return z; 3661 } 3662 if ( aExp == 0 ) { 3663 aExp = 1; 3664 bExp = 1; 3665 } 3666 zSig1 = 0; 3667 if ( bSig < aSig ) goto aBigger; 3668 if ( aSig < bSig ) goto bBigger; 3669 return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 ); 3670 bExpBigger: 3671 if ( bExp == 0x7FFF ) { 3672 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3673 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3674 } 3675 if ( aExp == 0 ) ++expDiff; 3676 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3677 bBigger: 3678 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3679 zExp = bExp; 3680 zSign ^= 1; 3681 goto normalizeRoundAndPack; 3682 aExpBigger: 3683 if ( aExp == 0x7FFF ) { 3684 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3685 return a; 3686 } 3687 if ( bExp == 0 ) --expDiff; 3688 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3689 aBigger: 3690 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3691 zExp = aExp; 3692 normalizeRoundAndPack: 3693 return 3694 normalizeRoundAndPackFloatx80( 3695 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3696 3697 } 3698 3699 /* 3700 ------------------------------------------------------------------------------- 3701 Returns the result of adding the extended double-precision floating-point 3702 values `a' and `b'. The operation is performed according to the IEC/IEEE 3703 Standard for Binary Floating-Point Arithmetic. 3704 ------------------------------------------------------------------------------- 3705 */ 3706 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3707 { 3708 flag aSign, bSign; 3709 3710 aSign = extractFloatx80Sign( a ); 3711 bSign = extractFloatx80Sign( b ); 3712 if ( aSign == bSign ) { 3713 return addFloatx80Sigs( a, b, aSign ); 3714 } 3715 else { 3716 return subFloatx80Sigs( a, b, aSign ); 3717 } 3718 3719 } 3720 3721 /* 3722 ------------------------------------------------------------------------------- 3723 Returns the result of subtracting the extended double-precision floating- 3724 point values `a' and `b'. The operation is performed according to the 3725 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3726 ------------------------------------------------------------------------------- 3727 */ 3728 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3729 { 3730 flag aSign, bSign; 3731 3732 aSign = extractFloatx80Sign( a ); 3733 bSign = extractFloatx80Sign( b ); 3734 if ( aSign == bSign ) { 3735 return subFloatx80Sigs( a, b, aSign ); 3736 } 3737 else { 3738 return addFloatx80Sigs( a, b, aSign ); 3739 } 3740 3741 } 3742 3743 /* 3744 ------------------------------------------------------------------------------- 3745 Returns the result of multiplying the extended double-precision floating- 3746 point values `a' and `b'. The operation is performed according to the 3747 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3748 ------------------------------------------------------------------------------- 3749 */ 3750 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3751 { 3752 flag aSign, bSign, zSign; 3753 int32 aExp, bExp, zExp; 3754 bits64 aSig, bSig, zSig0, zSig1; 3755 floatx80 z; 3756 3757 aSig = extractFloatx80Frac( a ); 3758 aExp = extractFloatx80Exp( a ); 3759 aSign = extractFloatx80Sign( a ); 3760 bSig = extractFloatx80Frac( b ); 3761 bExp = extractFloatx80Exp( b ); 3762 bSign = extractFloatx80Sign( b ); 3763 zSign = aSign ^ bSign; 3764 if ( aExp == 0x7FFF ) { 3765 if ( (bits64) ( aSig<<1 ) 3766 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3767 return propagateFloatx80NaN( a, b ); 3768 } 3769 if ( ( bExp | bSig ) == 0 ) goto invalid; 3770 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3771 } 3772 if ( bExp == 0x7FFF ) { 3773 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3774 if ( ( aExp | aSig ) == 0 ) { 3775 invalid: 3776 float_raise( float_flag_invalid ); 3777 z.low = floatx80_default_nan_low; 3778 z.high = floatx80_default_nan_high; 3779 return z; 3780 } 3781 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3782 } 3783 if ( aExp == 0 ) { 3784 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3785 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3786 } 3787 if ( bExp == 0 ) { 3788 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3789 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3790 } 3791 zExp = aExp + bExp - 0x3FFE; 3792 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3793 if ( 0 < (sbits64) zSig0 ) { 3794 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3795 --zExp; 3796 } 3797 return 3798 roundAndPackFloatx80( 3799 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3800 3801 } 3802 3803 /* 3804 ------------------------------------------------------------------------------- 3805 Returns the result of dividing the extended double-precision floating-point 3806 value `a' by the corresponding value `b'. The operation is performed 3807 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3808 ------------------------------------------------------------------------------- 3809 */ 3810 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3811 { 3812 flag aSign, bSign, zSign; 3813 int32 aExp, bExp, zExp; 3814 bits64 aSig, bSig, zSig0, zSig1; 3815 bits64 rem0, rem1, rem2, term0, term1, term2; 3816 floatx80 z; 3817 3818 aSig = extractFloatx80Frac( a ); 3819 aExp = extractFloatx80Exp( a ); 3820 aSign = extractFloatx80Sign( a ); 3821 bSig = extractFloatx80Frac( b ); 3822 bExp = extractFloatx80Exp( b ); 3823 bSign = extractFloatx80Sign( b ); 3824 zSign = aSign ^ bSign; 3825 if ( aExp == 0x7FFF ) { 3826 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3827 if ( bExp == 0x7FFF ) { 3828 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3829 goto invalid; 3830 } 3831 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3832 } 3833 if ( bExp == 0x7FFF ) { 3834 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3835 return packFloatx80( zSign, 0, 0 ); 3836 } 3837 if ( bExp == 0 ) { 3838 if ( bSig == 0 ) { 3839 if ( ( aExp | aSig ) == 0 ) { 3840 invalid: 3841 float_raise( float_flag_invalid ); 3842 z.low = floatx80_default_nan_low; 3843 z.high = floatx80_default_nan_high; 3844 return z; 3845 } 3846 float_raise( float_flag_divbyzero ); 3847 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3848 } 3849 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3850 } 3851 if ( aExp == 0 ) { 3852 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3853 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3854 } 3855 zExp = aExp - bExp + 0x3FFE; 3856 rem1 = 0; 3857 if ( bSig <= aSig ) { 3858 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3859 ++zExp; 3860 } 3861 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3862 mul64To128( bSig, zSig0, &term0, &term1 ); 3863 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3864 while ( (sbits64) rem0 < 0 ) { 3865 --zSig0; 3866 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3867 } 3868 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3869 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3870 mul64To128( bSig, zSig1, &term1, &term2 ); 3871 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3872 while ( (sbits64) rem1 < 0 ) { 3873 --zSig1; 3874 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3875 } 3876 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3877 } 3878 return 3879 roundAndPackFloatx80( 3880 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3881 3882 } 3883 3884 /* 3885 ------------------------------------------------------------------------------- 3886 Returns the remainder of the extended double-precision floating-point value 3887 `a' with respect to the corresponding value `b'. The operation is performed 3888 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3889 ------------------------------------------------------------------------------- 3890 */ 3891 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3892 { 3893 flag aSign, bSign, zSign; 3894 int32 aExp, bExp, expDiff; 3895 bits64 aSig0, aSig1, bSig; 3896 bits64 q, term0, term1, alternateASig0, alternateASig1; 3897 floatx80 z; 3898 3899 aSig0 = extractFloatx80Frac( a ); 3900 aExp = extractFloatx80Exp( a ); 3901 aSign = extractFloatx80Sign( a ); 3902 bSig = extractFloatx80Frac( b ); 3903 bExp = extractFloatx80Exp( b ); 3904 bSign = extractFloatx80Sign( b ); 3905 if ( aExp == 0x7FFF ) { 3906 if ( (bits64) ( aSig0<<1 ) 3907 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3908 return propagateFloatx80NaN( a, b ); 3909 } 3910 goto invalid; 3911 } 3912 if ( bExp == 0x7FFF ) { 3913 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3914 return a; 3915 } 3916 if ( bExp == 0 ) { 3917 if ( bSig == 0 ) { 3918 invalid: 3919 float_raise( float_flag_invalid ); 3920 z.low = floatx80_default_nan_low; 3921 z.high = floatx80_default_nan_high; 3922 return z; 3923 } 3924 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3925 } 3926 if ( aExp == 0 ) { 3927 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3928 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3929 } 3930 bSig |= LIT64( 0x8000000000000000 ); 3931 zSign = aSign; 3932 expDiff = aExp - bExp; 3933 aSig1 = 0; 3934 if ( expDiff < 0 ) { 3935 if ( expDiff < -1 ) return a; 3936 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3937 expDiff = 0; 3938 } 3939 q = ( bSig <= aSig0 ); 3940 if ( q ) aSig0 -= bSig; 3941 expDiff -= 64; 3942 while ( 0 < expDiff ) { 3943 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3944 q = ( 2 < q ) ? q - 2 : 0; 3945 mul64To128( bSig, q, &term0, &term1 ); 3946 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3947 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3948 expDiff -= 62; 3949 } 3950 expDiff += 64; 3951 if ( 0 < expDiff ) { 3952 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3953 q = ( 2 < q ) ? q - 2 : 0; 3954 q >>= 64 - expDiff; 3955 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3956 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3957 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3958 while ( le128( term0, term1, aSig0, aSig1 ) ) { 3959 ++q; 3960 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3961 } 3962 } 3963 else { 3964 term1 = 0; 3965 term0 = bSig; 3966 } 3967 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 3968 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3969 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3970 && ( q & 1 ) ) 3971 ) { 3972 aSig0 = alternateASig0; 3973 aSig1 = alternateASig1; 3974 zSign = ! zSign; 3975 } 3976 return 3977 normalizeRoundAndPackFloatx80( 3978 80, zSign, bExp + expDiff, aSig0, aSig1 ); 3979 3980 } 3981 3982 /* 3983 ------------------------------------------------------------------------------- 3984 Returns the square root of the extended double-precision floating-point 3985 value `a'. The operation is performed according to the IEC/IEEE Standard 3986 for Binary Floating-Point Arithmetic. 3987 ------------------------------------------------------------------------------- 3988 */ 3989 floatx80 floatx80_sqrt( floatx80 a ) 3990 { 3991 flag aSign; 3992 int32 aExp, zExp; 3993 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 3994 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 3995 floatx80 z; 3996 3997 aSig0 = extractFloatx80Frac( a ); 3998 aExp = extractFloatx80Exp( a ); 3999 aSign = extractFloatx80Sign( a ); 4000 if ( aExp == 0x7FFF ) { 4001 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4002 if ( ! aSign ) return a; 4003 goto invalid; 4004 } 4005 if ( aSign ) { 4006 if ( ( aExp | aSig0 ) == 0 ) return a; 4007 invalid: 4008 float_raise( float_flag_invalid ); 4009 z.low = floatx80_default_nan_low; 4010 z.high = floatx80_default_nan_high; 4011 return z; 4012 } 4013 if ( aExp == 0 ) { 4014 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4015 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4016 } 4017 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4018 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4019 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4020 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4021 doubleZSig0 = zSig0<<1; 4022 mul64To128( zSig0, zSig0, &term0, &term1 ); 4023 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4024 while ( (sbits64) rem0 < 0 ) { 4025 --zSig0; 4026 doubleZSig0 -= 2; 4027 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4028 } 4029 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4030 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4031 if ( zSig1 == 0 ) zSig1 = 1; 4032 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4033 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4034 mul64To128( zSig1, zSig1, &term2, &term3 ); 4035 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4036 while ( (sbits64) rem1 < 0 ) { 4037 --zSig1; 4038 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4039 term3 |= 1; 4040 term2 |= doubleZSig0; 4041 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4042 } 4043 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4044 } 4045 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4046 zSig0 |= doubleZSig0; 4047 return 4048 roundAndPackFloatx80( 4049 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4050 4051 } 4052 4053 /* 4054 ------------------------------------------------------------------------------- 4055 Returns 1 if the extended double-precision floating-point value `a' is 4056 equal to the corresponding value `b', and 0 otherwise. The comparison is 4057 performed according to the IEC/IEEE Standard for Binary Floating-Point 4058 Arithmetic. 4059 ------------------------------------------------------------------------------- 4060 */ 4061 flag floatx80_eq( floatx80 a, floatx80 b ) 4062 { 4063 4064 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4065 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4066 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4067 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4068 ) { 4069 if ( floatx80_is_signaling_nan( a ) 4070 || floatx80_is_signaling_nan( b ) ) { 4071 float_raise( float_flag_invalid ); 4072 } 4073 return 0; 4074 } 4075 return 4076 ( a.low == b.low ) 4077 && ( ( a.high == b.high ) 4078 || ( ( a.low == 0 ) 4079 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4080 ); 4081 4082 } 4083 4084 /* 4085 ------------------------------------------------------------------------------- 4086 Returns 1 if the extended double-precision floating-point value `a' is 4087 less than or equal to the corresponding value `b', and 0 otherwise. The 4088 comparison is performed according to the IEC/IEEE Standard for Binary 4089 Floating-Point Arithmetic. 4090 ------------------------------------------------------------------------------- 4091 */ 4092 flag floatx80_le( floatx80 a, floatx80 b ) 4093 { 4094 flag aSign, bSign; 4095 4096 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4097 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4098 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4099 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4100 ) { 4101 float_raise( float_flag_invalid ); 4102 return 0; 4103 } 4104 aSign = extractFloatx80Sign( a ); 4105 bSign = extractFloatx80Sign( b ); 4106 if ( aSign != bSign ) { 4107 return 4108 aSign 4109 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4110 == 0 ); 4111 } 4112 return 4113 aSign ? le128( b.high, b.low, a.high, a.low ) 4114 : le128( a.high, a.low, b.high, b.low ); 4115 4116 } 4117 4118 /* 4119 ------------------------------------------------------------------------------- 4120 Returns 1 if the extended double-precision floating-point value `a' is 4121 less than the corresponding value `b', and 0 otherwise. The comparison 4122 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4123 Arithmetic. 4124 ------------------------------------------------------------------------------- 4125 */ 4126 flag floatx80_lt( floatx80 a, floatx80 b ) 4127 { 4128 flag aSign, bSign; 4129 4130 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4131 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4132 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4133 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4134 ) { 4135 float_raise( float_flag_invalid ); 4136 return 0; 4137 } 4138 aSign = extractFloatx80Sign( a ); 4139 bSign = extractFloatx80Sign( b ); 4140 if ( aSign != bSign ) { 4141 return 4142 aSign 4143 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4144 != 0 ); 4145 } 4146 return 4147 aSign ? lt128( b.high, b.low, a.high, a.low ) 4148 : lt128( a.high, a.low, b.high, b.low ); 4149 4150 } 4151 4152 /* 4153 ------------------------------------------------------------------------------- 4154 Returns 1 if the extended double-precision floating-point value `a' is equal 4155 to the corresponding value `b', and 0 otherwise. The invalid exception is 4156 raised if either operand is a NaN. Otherwise, the comparison is performed 4157 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4158 ------------------------------------------------------------------------------- 4159 */ 4160 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4161 { 4162 4163 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4164 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4165 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4166 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4167 ) { 4168 float_raise( float_flag_invalid ); 4169 return 0; 4170 } 4171 return 4172 ( a.low == b.low ) 4173 && ( ( a.high == b.high ) 4174 || ( ( a.low == 0 ) 4175 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4176 ); 4177 4178 } 4179 4180 /* 4181 ------------------------------------------------------------------------------- 4182 Returns 1 if the extended double-precision floating-point value `a' is less 4183 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4184 do not cause an exception. Otherwise, the comparison is performed according 4185 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4186 ------------------------------------------------------------------------------- 4187 */ 4188 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4189 { 4190 flag aSign, bSign; 4191 4192 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4193 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4194 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4195 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4196 ) { 4197 if ( floatx80_is_signaling_nan( a ) 4198 || floatx80_is_signaling_nan( b ) ) { 4199 float_raise( float_flag_invalid ); 4200 } 4201 return 0; 4202 } 4203 aSign = extractFloatx80Sign( a ); 4204 bSign = extractFloatx80Sign( b ); 4205 if ( aSign != bSign ) { 4206 return 4207 aSign 4208 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4209 == 0 ); 4210 } 4211 return 4212 aSign ? le128( b.high, b.low, a.high, a.low ) 4213 : le128( a.high, a.low, b.high, b.low ); 4214 4215 } 4216 4217 /* 4218 ------------------------------------------------------------------------------- 4219 Returns 1 if the extended double-precision floating-point value `a' is less 4220 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4221 an exception. Otherwise, the comparison is performed according to the 4222 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4223 ------------------------------------------------------------------------------- 4224 */ 4225 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4226 { 4227 flag aSign, bSign; 4228 4229 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4230 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4231 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4232 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4233 ) { 4234 if ( floatx80_is_signaling_nan( a ) 4235 || floatx80_is_signaling_nan( b ) ) { 4236 float_raise( float_flag_invalid ); 4237 } 4238 return 0; 4239 } 4240 aSign = extractFloatx80Sign( a ); 4241 bSign = extractFloatx80Sign( b ); 4242 if ( aSign != bSign ) { 4243 return 4244 aSign 4245 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4246 != 0 ); 4247 } 4248 return 4249 aSign ? lt128( b.high, b.low, a.high, a.low ) 4250 : lt128( a.high, a.low, b.high, b.low ); 4251 4252 } 4253 4254 #endif 4255 4256 #ifdef FLOAT128 4257 4258 /* 4259 ------------------------------------------------------------------------------- 4260 Returns the result of converting the quadruple-precision floating-point 4261 value `a' to the 32-bit two's complement integer format. The conversion 4262 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4263 Arithmetic---which means in particular that the conversion is rounded 4264 according to the current rounding mode. If `a' is a NaN, the largest 4265 positive integer is returned. Otherwise, if the conversion overflows, the 4266 largest integer with the same sign as `a' is returned. 4267 ------------------------------------------------------------------------------- 4268 */ 4269 int32 float128_to_int32( float128 a ) 4270 { 4271 flag aSign; 4272 int32 aExp, shiftCount; 4273 bits64 aSig0, aSig1; 4274 4275 aSig1 = extractFloat128Frac1( a ); 4276 aSig0 = extractFloat128Frac0( a ); 4277 aExp = extractFloat128Exp( a ); 4278 aSign = extractFloat128Sign( a ); 4279 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4280 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4281 aSig0 |= ( aSig1 != 0 ); 4282 shiftCount = 0x4028 - aExp; 4283 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4284 return roundAndPackInt32( aSign, aSig0 ); 4285 4286 } 4287 4288 /* 4289 ------------------------------------------------------------------------------- 4290 Returns the result of converting the quadruple-precision floating-point 4291 value `a' to the 32-bit two's complement integer format. The conversion 4292 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4293 Arithmetic, except that the conversion is always rounded toward zero. If 4294 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4295 conversion overflows, the largest integer with the same sign as `a' is 4296 returned. 4297 ------------------------------------------------------------------------------- 4298 */ 4299 int32 float128_to_int32_round_to_zero( float128 a ) 4300 { 4301 flag aSign; 4302 int32 aExp, shiftCount; 4303 bits64 aSig0, aSig1, savedASig; 4304 int32 z; 4305 4306 aSig1 = extractFloat128Frac1( a ); 4307 aSig0 = extractFloat128Frac0( a ); 4308 aExp = extractFloat128Exp( a ); 4309 aSign = extractFloat128Sign( a ); 4310 aSig0 |= ( aSig1 != 0 ); 4311 if ( 0x401E < aExp ) { 4312 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4313 goto invalid; 4314 } 4315 else if ( aExp < 0x3FFF ) { 4316 if ( aExp || aSig0 ) float_set_inexact(); 4317 return 0; 4318 } 4319 aSig0 |= LIT64( 0x0001000000000000 ); 4320 shiftCount = 0x402F - aExp; 4321 savedASig = aSig0; 4322 aSig0 >>= shiftCount; 4323 z = aSig0; 4324 if ( aSign ) z = - z; 4325 if ( ( z < 0 ) ^ aSign ) { 4326 invalid: 4327 float_raise( float_flag_invalid ); 4328 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4329 } 4330 if ( ( aSig0<<shiftCount ) != savedASig ) { 4331 float_set_inexact(); 4332 } 4333 return z; 4334 4335 } 4336 4337 /* 4338 ------------------------------------------------------------------------------- 4339 Returns the result of converting the quadruple-precision floating-point 4340 value `a' to the 64-bit two's complement integer format. The conversion 4341 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4342 Arithmetic---which means in particular that the conversion is rounded 4343 according to the current rounding mode. If `a' is a NaN, the largest 4344 positive integer is returned. Otherwise, if the conversion overflows, the 4345 largest integer with the same sign as `a' is returned. 4346 ------------------------------------------------------------------------------- 4347 */ 4348 int64 float128_to_int64( float128 a ) 4349 { 4350 flag aSign; 4351 int32 aExp, shiftCount; 4352 bits64 aSig0, aSig1; 4353 4354 aSig1 = extractFloat128Frac1( a ); 4355 aSig0 = extractFloat128Frac0( a ); 4356 aExp = extractFloat128Exp( a ); 4357 aSign = extractFloat128Sign( a ); 4358 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4359 shiftCount = 0x402F - aExp; 4360 if ( shiftCount <= 0 ) { 4361 if ( 0x403E < aExp ) { 4362 float_raise( float_flag_invalid ); 4363 if ( ! aSign 4364 || ( ( aExp == 0x7FFF ) 4365 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4366 ) 4367 ) { 4368 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4369 } 4370 return (sbits64) LIT64( 0x8000000000000000 ); 4371 } 4372 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4373 } 4374 else { 4375 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4376 } 4377 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4378 4379 } 4380 4381 /* 4382 ------------------------------------------------------------------------------- 4383 Returns the result of converting the quadruple-precision floating-point 4384 value `a' to the 64-bit two's complement integer format. The conversion 4385 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4386 Arithmetic, except that the conversion is always rounded toward zero. 4387 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4388 the conversion overflows, the largest integer with the same sign as `a' is 4389 returned. 4390 ------------------------------------------------------------------------------- 4391 */ 4392 int64 float128_to_int64_round_to_zero( float128 a ) 4393 { 4394 flag aSign; 4395 int32 aExp, shiftCount; 4396 bits64 aSig0, aSig1; 4397 int64 z; 4398 4399 aSig1 = extractFloat128Frac1( a ); 4400 aSig0 = extractFloat128Frac0( a ); 4401 aExp = extractFloat128Exp( a ); 4402 aSign = extractFloat128Sign( a ); 4403 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4404 shiftCount = aExp - 0x402F; 4405 if ( 0 < shiftCount ) { 4406 if ( 0x403E <= aExp ) { 4407 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4408 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4409 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4410 if ( aSig1 ) float_set_inexact(); 4411 } 4412 else { 4413 float_raise( float_flag_invalid ); 4414 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4415 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4416 } 4417 } 4418 return (sbits64) LIT64( 0x8000000000000000 ); 4419 } 4420 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4421 if ( (bits64) ( aSig1<<shiftCount ) ) { 4422 float_set_inexact(); 4423 } 4424 } 4425 else { 4426 if ( aExp < 0x3FFF ) { 4427 if ( aExp | aSig0 | aSig1 ) { 4428 float_set_inexact(); 4429 } 4430 return 0; 4431 } 4432 z = aSig0>>( - shiftCount ); 4433 if ( aSig1 4434 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4435 float_set_inexact(); 4436 } 4437 } 4438 if ( aSign ) z = - z; 4439 return z; 4440 4441 } 4442 4443 /* 4444 ------------------------------------------------------------------------------- 4445 Returns the result of converting the quadruple-precision floating-point 4446 value `a' to the single-precision floating-point format. The conversion 4447 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4448 Arithmetic. 4449 ------------------------------------------------------------------------------- 4450 */ 4451 float32 float128_to_float32( float128 a ) 4452 { 4453 flag aSign; 4454 int32 aExp; 4455 bits64 aSig0, aSig1; 4456 bits32 zSig; 4457 4458 aSig1 = extractFloat128Frac1( a ); 4459 aSig0 = extractFloat128Frac0( a ); 4460 aExp = extractFloat128Exp( a ); 4461 aSign = extractFloat128Sign( a ); 4462 if ( aExp == 0x7FFF ) { 4463 if ( aSig0 | aSig1 ) { 4464 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4465 } 4466 return packFloat32( aSign, 0xFF, 0 ); 4467 } 4468 aSig0 |= ( aSig1 != 0 ); 4469 shift64RightJamming( aSig0, 18, &aSig0 ); 4470 zSig = aSig0; 4471 if ( aExp || zSig ) { 4472 zSig |= 0x40000000; 4473 aExp -= 0x3F81; 4474 } 4475 return roundAndPackFloat32( aSign, aExp, zSig ); 4476 4477 } 4478 4479 /* 4480 ------------------------------------------------------------------------------- 4481 Returns the result of converting the quadruple-precision floating-point 4482 value `a' to the double-precision floating-point format. The conversion 4483 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4484 Arithmetic. 4485 ------------------------------------------------------------------------------- 4486 */ 4487 float64 float128_to_float64( float128 a ) 4488 { 4489 flag aSign; 4490 int32 aExp; 4491 bits64 aSig0, aSig1; 4492 4493 aSig1 = extractFloat128Frac1( a ); 4494 aSig0 = extractFloat128Frac0( a ); 4495 aExp = extractFloat128Exp( a ); 4496 aSign = extractFloat128Sign( a ); 4497 if ( aExp == 0x7FFF ) { 4498 if ( aSig0 | aSig1 ) { 4499 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4500 } 4501 return packFloat64( aSign, 0x7FF, 0 ); 4502 } 4503 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4504 aSig0 |= ( aSig1 != 0 ); 4505 if ( aExp || aSig0 ) { 4506 aSig0 |= LIT64( 0x4000000000000000 ); 4507 aExp -= 0x3C01; 4508 } 4509 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4510 4511 } 4512 4513 #ifdef FLOATX80 4514 4515 /* 4516 ------------------------------------------------------------------------------- 4517 Returns the result of converting the quadruple-precision floating-point 4518 value `a' to the extended double-precision floating-point format. The 4519 conversion is performed according to the IEC/IEEE Standard for Binary 4520 Floating-Point Arithmetic. 4521 ------------------------------------------------------------------------------- 4522 */ 4523 floatx80 float128_to_floatx80( float128 a ) 4524 { 4525 flag aSign; 4526 int32 aExp; 4527 bits64 aSig0, aSig1; 4528 4529 aSig1 = extractFloat128Frac1( a ); 4530 aSig0 = extractFloat128Frac0( a ); 4531 aExp = extractFloat128Exp( a ); 4532 aSign = extractFloat128Sign( a ); 4533 if ( aExp == 0x7FFF ) { 4534 if ( aSig0 | aSig1 ) { 4535 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4536 } 4537 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4538 } 4539 if ( aExp == 0 ) { 4540 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4541 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4542 } 4543 else { 4544 aSig0 |= LIT64( 0x0001000000000000 ); 4545 } 4546 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4547 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4548 4549 } 4550 4551 #endif 4552 4553 /* 4554 ------------------------------------------------------------------------------- 4555 Rounds the quadruple-precision floating-point value `a' to an integer, and 4556 returns the result as a quadruple-precision floating-point value. The 4557 operation is performed according to the IEC/IEEE Standard for Binary 4558 Floating-Point Arithmetic. 4559 ------------------------------------------------------------------------------- 4560 */ 4561 float128 float128_round_to_int( float128 a ) 4562 { 4563 flag aSign; 4564 int32 aExp; 4565 bits64 lastBitMask, roundBitsMask; 4566 int8 roundingMode; 4567 float128 z; 4568 4569 aExp = extractFloat128Exp( a ); 4570 if ( 0x402F <= aExp ) { 4571 if ( 0x406F <= aExp ) { 4572 if ( ( aExp == 0x7FFF ) 4573 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4574 ) { 4575 return propagateFloat128NaN( a, a ); 4576 } 4577 return a; 4578 } 4579 lastBitMask = 1; 4580 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4581 roundBitsMask = lastBitMask - 1; 4582 z = a; 4583 roundingMode = float_rounding_mode(); 4584 if ( roundingMode == float_round_nearest_even ) { 4585 if ( lastBitMask ) { 4586 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4587 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4588 } 4589 else { 4590 if ( (sbits64) z.low < 0 ) { 4591 ++z.high; 4592 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4593 } 4594 } 4595 } 4596 else if ( roundingMode != float_round_to_zero ) { 4597 if ( extractFloat128Sign( z ) 4598 ^ ( roundingMode == float_round_up ) ) { 4599 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4600 } 4601 } 4602 z.low &= ~ roundBitsMask; 4603 } 4604 else { 4605 if ( aExp < 0x3FFF ) { 4606 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4607 float_set_inexact(); 4608 aSign = extractFloat128Sign( a ); 4609 switch ( float_rounding_mode() ) { 4610 case float_round_nearest_even: 4611 if ( ( aExp == 0x3FFE ) 4612 && ( extractFloat128Frac0( a ) 4613 | extractFloat128Frac1( a ) ) 4614 ) { 4615 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4616 } 4617 break; 4618 case float_round_down: 4619 return 4620 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4621 : packFloat128( 0, 0, 0, 0 ); 4622 case float_round_up: 4623 return 4624 aSign ? packFloat128( 1, 0, 0, 0 ) 4625 : packFloat128( 0, 0x3FFF, 0, 0 ); 4626 } 4627 return packFloat128( aSign, 0, 0, 0 ); 4628 } 4629 lastBitMask = 1; 4630 lastBitMask <<= 0x402F - aExp; 4631 roundBitsMask = lastBitMask - 1; 4632 z.low = 0; 4633 z.high = a.high; 4634 roundingMode = float_rounding_mode(); 4635 if ( roundingMode == float_round_nearest_even ) { 4636 z.high += lastBitMask>>1; 4637 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4638 z.high &= ~ lastBitMask; 4639 } 4640 } 4641 else if ( roundingMode != float_round_to_zero ) { 4642 if ( extractFloat128Sign( z ) 4643 ^ ( roundingMode == float_round_up ) ) { 4644 z.high |= ( a.low != 0 ); 4645 z.high += roundBitsMask; 4646 } 4647 } 4648 z.high &= ~ roundBitsMask; 4649 } 4650 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4651 float_set_inexact(); 4652 } 4653 return z; 4654 4655 } 4656 4657 /* 4658 ------------------------------------------------------------------------------- 4659 Returns the result of adding the absolute values of the quadruple-precision 4660 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4661 before being returned. `zSign' is ignored if the result is a NaN. 4662 The addition is performed according to the IEC/IEEE Standard for Binary 4663 Floating-Point Arithmetic. 4664 ------------------------------------------------------------------------------- 4665 */ 4666 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4667 { 4668 int32 aExp, bExp, zExp; 4669 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4670 int32 expDiff; 4671 4672 aSig1 = extractFloat128Frac1( a ); 4673 aSig0 = extractFloat128Frac0( a ); 4674 aExp = extractFloat128Exp( a ); 4675 bSig1 = extractFloat128Frac1( b ); 4676 bSig0 = extractFloat128Frac0( b ); 4677 bExp = extractFloat128Exp( b ); 4678 expDiff = aExp - bExp; 4679 if ( 0 < expDiff ) { 4680 if ( aExp == 0x7FFF ) { 4681 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4682 return a; 4683 } 4684 if ( bExp == 0 ) { 4685 --expDiff; 4686 } 4687 else { 4688 bSig0 |= LIT64( 0x0001000000000000 ); 4689 } 4690 shift128ExtraRightJamming( 4691 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4692 zExp = aExp; 4693 } 4694 else if ( expDiff < 0 ) { 4695 if ( bExp == 0x7FFF ) { 4696 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4697 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4698 } 4699 if ( aExp == 0 ) { 4700 ++expDiff; 4701 } 4702 else { 4703 aSig0 |= LIT64( 0x0001000000000000 ); 4704 } 4705 shift128ExtraRightJamming( 4706 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4707 zExp = bExp; 4708 } 4709 else { 4710 if ( aExp == 0x7FFF ) { 4711 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4712 return propagateFloat128NaN( a, b ); 4713 } 4714 return a; 4715 } 4716 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4717 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4718 zSig2 = 0; 4719 zSig0 |= LIT64( 0x0002000000000000 ); 4720 zExp = aExp; 4721 goto shiftRight1; 4722 } 4723 aSig0 |= LIT64( 0x0001000000000000 ); 4724 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4725 --zExp; 4726 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4727 ++zExp; 4728 shiftRight1: 4729 shift128ExtraRightJamming( 4730 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4731 roundAndPack: 4732 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4733 4734 } 4735 4736 /* 4737 ------------------------------------------------------------------------------- 4738 Returns the result of subtracting the absolute values of the quadruple- 4739 precision floating-point values `a' and `b'. If `zSign' is 1, the 4740 difference is negated before being returned. `zSign' is ignored if the 4741 result is a NaN. The subtraction is performed according to the IEC/IEEE 4742 Standard for Binary Floating-Point Arithmetic. 4743 ------------------------------------------------------------------------------- 4744 */ 4745 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4746 { 4747 int32 aExp, bExp, zExp; 4748 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4749 int32 expDiff; 4750 float128 z; 4751 4752 aSig1 = extractFloat128Frac1( a ); 4753 aSig0 = extractFloat128Frac0( a ); 4754 aExp = extractFloat128Exp( a ); 4755 bSig1 = extractFloat128Frac1( b ); 4756 bSig0 = extractFloat128Frac0( b ); 4757 bExp = extractFloat128Exp( b ); 4758 expDiff = aExp - bExp; 4759 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4760 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4761 if ( 0 < expDiff ) goto aExpBigger; 4762 if ( expDiff < 0 ) goto bExpBigger; 4763 if ( aExp == 0x7FFF ) { 4764 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4765 return propagateFloat128NaN( a, b ); 4766 } 4767 float_raise( float_flag_invalid ); 4768 z.low = float128_default_nan_low; 4769 z.high = float128_default_nan_high; 4770 return z; 4771 } 4772 if ( aExp == 0 ) { 4773 aExp = 1; 4774 bExp = 1; 4775 } 4776 if ( bSig0 < aSig0 ) goto aBigger; 4777 if ( aSig0 < bSig0 ) goto bBigger; 4778 if ( bSig1 < aSig1 ) goto aBigger; 4779 if ( aSig1 < bSig1 ) goto bBigger; 4780 return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 ); 4781 bExpBigger: 4782 if ( bExp == 0x7FFF ) { 4783 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4784 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4785 } 4786 if ( aExp == 0 ) { 4787 ++expDiff; 4788 } 4789 else { 4790 aSig0 |= LIT64( 0x4000000000000000 ); 4791 } 4792 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4793 bSig0 |= LIT64( 0x4000000000000000 ); 4794 bBigger: 4795 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4796 zExp = bExp; 4797 zSign ^= 1; 4798 goto normalizeRoundAndPack; 4799 aExpBigger: 4800 if ( aExp == 0x7FFF ) { 4801 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4802 return a; 4803 } 4804 if ( bExp == 0 ) { 4805 --expDiff; 4806 } 4807 else { 4808 bSig0 |= LIT64( 0x4000000000000000 ); 4809 } 4810 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4811 aSig0 |= LIT64( 0x4000000000000000 ); 4812 aBigger: 4813 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4814 zExp = aExp; 4815 normalizeRoundAndPack: 4816 --zExp; 4817 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4818 4819 } 4820 4821 /* 4822 ------------------------------------------------------------------------------- 4823 Returns the result of adding the quadruple-precision floating-point values 4824 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4825 for Binary Floating-Point Arithmetic. 4826 ------------------------------------------------------------------------------- 4827 */ 4828 float128 float128_add( float128 a, float128 b ) 4829 { 4830 flag aSign, bSign; 4831 4832 aSign = extractFloat128Sign( a ); 4833 bSign = extractFloat128Sign( b ); 4834 if ( aSign == bSign ) { 4835 return addFloat128Sigs( a, b, aSign ); 4836 } 4837 else { 4838 return subFloat128Sigs( a, b, aSign ); 4839 } 4840 4841 } 4842 4843 /* 4844 ------------------------------------------------------------------------------- 4845 Returns the result of subtracting the quadruple-precision floating-point 4846 values `a' and `b'. The operation is performed according to the IEC/IEEE 4847 Standard for Binary Floating-Point Arithmetic. 4848 ------------------------------------------------------------------------------- 4849 */ 4850 float128 float128_sub( float128 a, float128 b ) 4851 { 4852 flag aSign, bSign; 4853 4854 aSign = extractFloat128Sign( a ); 4855 bSign = extractFloat128Sign( b ); 4856 if ( aSign == bSign ) { 4857 return subFloat128Sigs( a, b, aSign ); 4858 } 4859 else { 4860 return addFloat128Sigs( a, b, aSign ); 4861 } 4862 4863 } 4864 4865 /* 4866 ------------------------------------------------------------------------------- 4867 Returns the result of multiplying the quadruple-precision floating-point 4868 values `a' and `b'. The operation is performed according to the IEC/IEEE 4869 Standard for Binary Floating-Point Arithmetic. 4870 ------------------------------------------------------------------------------- 4871 */ 4872 float128 float128_mul( float128 a, float128 b ) 4873 { 4874 flag aSign, bSign, zSign; 4875 int32 aExp, bExp, zExp; 4876 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4877 float128 z; 4878 4879 aSig1 = extractFloat128Frac1( a ); 4880 aSig0 = extractFloat128Frac0( a ); 4881 aExp = extractFloat128Exp( a ); 4882 aSign = extractFloat128Sign( a ); 4883 bSig1 = extractFloat128Frac1( b ); 4884 bSig0 = extractFloat128Frac0( b ); 4885 bExp = extractFloat128Exp( b ); 4886 bSign = extractFloat128Sign( b ); 4887 zSign = aSign ^ bSign; 4888 if ( aExp == 0x7FFF ) { 4889 if ( ( aSig0 | aSig1 ) 4890 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4891 return propagateFloat128NaN( a, b ); 4892 } 4893 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4894 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4895 } 4896 if ( bExp == 0x7FFF ) { 4897 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4898 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4899 invalid: 4900 float_raise( float_flag_invalid ); 4901 z.low = float128_default_nan_low; 4902 z.high = float128_default_nan_high; 4903 return z; 4904 } 4905 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4906 } 4907 if ( aExp == 0 ) { 4908 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4909 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4910 } 4911 if ( bExp == 0 ) { 4912 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4913 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4914 } 4915 zExp = aExp + bExp - 0x4000; 4916 aSig0 |= LIT64( 0x0001000000000000 ); 4917 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 4918 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 4919 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4920 zSig2 |= ( zSig3 != 0 ); 4921 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 4922 shift128ExtraRightJamming( 4923 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4924 ++zExp; 4925 } 4926 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4927 4928 } 4929 4930 /* 4931 ------------------------------------------------------------------------------- 4932 Returns the result of dividing the quadruple-precision floating-point value 4933 `a' by the corresponding value `b'. The operation is performed according to 4934 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4935 ------------------------------------------------------------------------------- 4936 */ 4937 float128 float128_div( float128 a, float128 b ) 4938 { 4939 flag aSign, bSign, zSign; 4940 int32 aExp, bExp, zExp; 4941 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4942 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4943 float128 z; 4944 4945 aSig1 = extractFloat128Frac1( a ); 4946 aSig0 = extractFloat128Frac0( a ); 4947 aExp = extractFloat128Exp( a ); 4948 aSign = extractFloat128Sign( a ); 4949 bSig1 = extractFloat128Frac1( b ); 4950 bSig0 = extractFloat128Frac0( b ); 4951 bExp = extractFloat128Exp( b ); 4952 bSign = extractFloat128Sign( b ); 4953 zSign = aSign ^ bSign; 4954 if ( aExp == 0x7FFF ) { 4955 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4956 if ( bExp == 0x7FFF ) { 4957 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4958 goto invalid; 4959 } 4960 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4961 } 4962 if ( bExp == 0x7FFF ) { 4963 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4964 return packFloat128( zSign, 0, 0, 0 ); 4965 } 4966 if ( bExp == 0 ) { 4967 if ( ( bSig0 | bSig1 ) == 0 ) { 4968 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4969 invalid: 4970 float_raise( float_flag_invalid ); 4971 z.low = float128_default_nan_low; 4972 z.high = float128_default_nan_high; 4973 return z; 4974 } 4975 float_raise( float_flag_divbyzero ); 4976 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4977 } 4978 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4979 } 4980 if ( aExp == 0 ) { 4981 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4982 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4983 } 4984 zExp = aExp - bExp + 0x3FFD; 4985 shortShift128Left( 4986 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 4987 shortShift128Left( 4988 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 4989 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 4990 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 4991 ++zExp; 4992 } 4993 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 4994 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 4995 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 4996 while ( (sbits64) rem0 < 0 ) { 4997 --zSig0; 4998 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 4999 } 5000 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5001 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5002 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5003 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5004 while ( (sbits64) rem1 < 0 ) { 5005 --zSig1; 5006 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5007 } 5008 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5009 } 5010 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5011 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5012 5013 } 5014 5015 /* 5016 ------------------------------------------------------------------------------- 5017 Returns the remainder of the quadruple-precision floating-point value `a' 5018 with respect to the corresponding value `b'. The operation is performed 5019 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5020 ------------------------------------------------------------------------------- 5021 */ 5022 float128 float128_rem( float128 a, float128 b ) 5023 { 5024 flag aSign, bSign, zSign; 5025 int32 aExp, bExp, expDiff; 5026 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5027 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5028 sbits64 sigMean0; 5029 float128 z; 5030 5031 aSig1 = extractFloat128Frac1( a ); 5032 aSig0 = extractFloat128Frac0( a ); 5033 aExp = extractFloat128Exp( a ); 5034 aSign = extractFloat128Sign( a ); 5035 bSig1 = extractFloat128Frac1( b ); 5036 bSig0 = extractFloat128Frac0( b ); 5037 bExp = extractFloat128Exp( b ); 5038 bSign = extractFloat128Sign( b ); 5039 if ( aExp == 0x7FFF ) { 5040 if ( ( aSig0 | aSig1 ) 5041 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5042 return propagateFloat128NaN( a, b ); 5043 } 5044 goto invalid; 5045 } 5046 if ( bExp == 0x7FFF ) { 5047 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5048 return a; 5049 } 5050 if ( bExp == 0 ) { 5051 if ( ( bSig0 | bSig1 ) == 0 ) { 5052 invalid: 5053 float_raise( float_flag_invalid ); 5054 z.low = float128_default_nan_low; 5055 z.high = float128_default_nan_high; 5056 return z; 5057 } 5058 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5059 } 5060 if ( aExp == 0 ) { 5061 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5062 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5063 } 5064 expDiff = aExp - bExp; 5065 if ( expDiff < -1 ) return a; 5066 shortShift128Left( 5067 aSig0 | LIT64( 0x0001000000000000 ), 5068 aSig1, 5069 15 - ( expDiff < 0 ), 5070 &aSig0, 5071 &aSig1 5072 ); 5073 shortShift128Left( 5074 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5075 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5076 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5077 expDiff -= 64; 5078 while ( 0 < expDiff ) { 5079 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5080 q = ( 4 < q ) ? q - 4 : 0; 5081 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5082 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5083 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5084 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5085 expDiff -= 61; 5086 } 5087 if ( -64 < expDiff ) { 5088 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5089 q = ( 4 < q ) ? q - 4 : 0; 5090 q >>= - expDiff; 5091 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5092 expDiff += 52; 5093 if ( expDiff < 0 ) { 5094 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5095 } 5096 else { 5097 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5098 } 5099 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5100 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5101 } 5102 else { 5103 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5104 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5105 } 5106 do { 5107 alternateASig0 = aSig0; 5108 alternateASig1 = aSig1; 5109 ++q; 5110 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5111 } while ( 0 <= (sbits64) aSig0 ); 5112 add128( 5113 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 5114 if ( ( sigMean0 < 0 ) 5115 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5116 aSig0 = alternateASig0; 5117 aSig1 = alternateASig1; 5118 } 5119 zSign = ( (sbits64) aSig0 < 0 ); 5120 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5121 return 5122 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5123 5124 } 5125 5126 /* 5127 ------------------------------------------------------------------------------- 5128 Returns the square root of the quadruple-precision floating-point value `a'. 5129 The operation is performed according to the IEC/IEEE Standard for Binary 5130 Floating-Point Arithmetic. 5131 ------------------------------------------------------------------------------- 5132 */ 5133 float128 float128_sqrt( float128 a ) 5134 { 5135 flag aSign; 5136 int32 aExp, zExp; 5137 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5138 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5139 float128 z; 5140 5141 aSig1 = extractFloat128Frac1( a ); 5142 aSig0 = extractFloat128Frac0( a ); 5143 aExp = extractFloat128Exp( a ); 5144 aSign = extractFloat128Sign( a ); 5145 if ( aExp == 0x7FFF ) { 5146 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5147 if ( ! aSign ) return a; 5148 goto invalid; 5149 } 5150 if ( aSign ) { 5151 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5152 invalid: 5153 float_raise( float_flag_invalid ); 5154 z.low = float128_default_nan_low; 5155 z.high = float128_default_nan_high; 5156 return z; 5157 } 5158 if ( aExp == 0 ) { 5159 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5160 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5161 } 5162 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5163 aSig0 |= LIT64( 0x0001000000000000 ); 5164 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5165 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5166 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5167 doubleZSig0 = zSig0<<1; 5168 mul64To128( zSig0, zSig0, &term0, &term1 ); 5169 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5170 while ( (sbits64) rem0 < 0 ) { 5171 --zSig0; 5172 doubleZSig0 -= 2; 5173 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5174 } 5175 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5176 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5177 if ( zSig1 == 0 ) zSig1 = 1; 5178 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5179 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5180 mul64To128( zSig1, zSig1, &term2, &term3 ); 5181 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5182 while ( (sbits64) rem1 < 0 ) { 5183 --zSig1; 5184 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5185 term3 |= 1; 5186 term2 |= doubleZSig0; 5187 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5188 } 5189 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5190 } 5191 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5192 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5193 5194 } 5195 5196 /* 5197 ------------------------------------------------------------------------------- 5198 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5199 the corresponding value `b', and 0 otherwise. The comparison is performed 5200 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5201 ------------------------------------------------------------------------------- 5202 */ 5203 flag float128_eq( float128 a, float128 b ) 5204 { 5205 5206 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5207 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5208 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5209 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5210 ) { 5211 if ( float128_is_signaling_nan( a ) 5212 || float128_is_signaling_nan( b ) ) { 5213 float_raise( float_flag_invalid ); 5214 } 5215 return 0; 5216 } 5217 return 5218 ( a.low == b.low ) 5219 && ( ( a.high == b.high ) 5220 || ( ( a.low == 0 ) 5221 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5222 ); 5223 5224 } 5225 5226 /* 5227 ------------------------------------------------------------------------------- 5228 Returns 1 if the quadruple-precision floating-point value `a' is less than 5229 or equal to the corresponding value `b', and 0 otherwise. The comparison 5230 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5231 Arithmetic. 5232 ------------------------------------------------------------------------------- 5233 */ 5234 flag float128_le( float128 a, float128 b ) 5235 { 5236 flag aSign, bSign; 5237 5238 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5239 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5240 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5241 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5242 ) { 5243 float_raise( float_flag_invalid ); 5244 return 0; 5245 } 5246 aSign = extractFloat128Sign( a ); 5247 bSign = extractFloat128Sign( b ); 5248 if ( aSign != bSign ) { 5249 return 5250 aSign 5251 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5252 == 0 ); 5253 } 5254 return 5255 aSign ? le128( b.high, b.low, a.high, a.low ) 5256 : le128( a.high, a.low, b.high, b.low ); 5257 5258 } 5259 5260 /* 5261 ------------------------------------------------------------------------------- 5262 Returns 1 if the quadruple-precision floating-point value `a' is less than 5263 the corresponding value `b', and 0 otherwise. The comparison is performed 5264 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5265 ------------------------------------------------------------------------------- 5266 */ 5267 flag float128_lt( float128 a, float128 b ) 5268 { 5269 flag aSign, bSign; 5270 5271 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5272 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5273 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5274 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5275 ) { 5276 float_raise( float_flag_invalid ); 5277 return 0; 5278 } 5279 aSign = extractFloat128Sign( a ); 5280 bSign = extractFloat128Sign( b ); 5281 if ( aSign != bSign ) { 5282 return 5283 aSign 5284 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5285 != 0 ); 5286 } 5287 return 5288 aSign ? lt128( b.high, b.low, a.high, a.low ) 5289 : lt128( a.high, a.low, b.high, b.low ); 5290 5291 } 5292 5293 /* 5294 ------------------------------------------------------------------------------- 5295 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5296 the corresponding value `b', and 0 otherwise. The invalid exception is 5297 raised if either operand is a NaN. Otherwise, the comparison is performed 5298 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5299 ------------------------------------------------------------------------------- 5300 */ 5301 flag float128_eq_signaling( float128 a, float128 b ) 5302 { 5303 5304 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5305 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5306 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5307 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5308 ) { 5309 float_raise( float_flag_invalid ); 5310 return 0; 5311 } 5312 return 5313 ( a.low == b.low ) 5314 && ( ( a.high == b.high ) 5315 || ( ( a.low == 0 ) 5316 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5317 ); 5318 5319 } 5320 5321 /* 5322 ------------------------------------------------------------------------------- 5323 Returns 1 if the quadruple-precision floating-point value `a' is less than 5324 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5325 cause an exception. Otherwise, the comparison is performed according to the 5326 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5327 ------------------------------------------------------------------------------- 5328 */ 5329 flag float128_le_quiet( float128 a, float128 b ) 5330 { 5331 flag aSign, bSign; 5332 5333 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5334 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5335 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5336 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5337 ) { 5338 if ( float128_is_signaling_nan( a ) 5339 || float128_is_signaling_nan( b ) ) { 5340 float_raise( float_flag_invalid ); 5341 } 5342 return 0; 5343 } 5344 aSign = extractFloat128Sign( a ); 5345 bSign = extractFloat128Sign( b ); 5346 if ( aSign != bSign ) { 5347 return 5348 aSign 5349 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5350 == 0 ); 5351 } 5352 return 5353 aSign ? le128( b.high, b.low, a.high, a.low ) 5354 : le128( a.high, a.low, b.high, b.low ); 5355 5356 } 5357 5358 /* 5359 ------------------------------------------------------------------------------- 5360 Returns 1 if the quadruple-precision floating-point value `a' is less than 5361 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5362 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5363 Standard for Binary Floating-Point Arithmetic. 5364 ------------------------------------------------------------------------------- 5365 */ 5366 flag float128_lt_quiet( float128 a, float128 b ) 5367 { 5368 flag aSign, bSign; 5369 5370 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5371 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5372 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5373 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5374 ) { 5375 if ( float128_is_signaling_nan( a ) 5376 || float128_is_signaling_nan( b ) ) { 5377 float_raise( float_flag_invalid ); 5378 } 5379 return 0; 5380 } 5381 aSign = extractFloat128Sign( a ); 5382 bSign = extractFloat128Sign( b ); 5383 if ( aSign != bSign ) { 5384 return 5385 aSign 5386 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5387 != 0 ); 5388 } 5389 return 5390 aSign ? lt128( b.high, b.low, a.high, a.low ) 5391 : lt128( a.high, a.low, b.high, b.low ); 5392 5393 } 5394 5395 #endif 5396 5397 5398 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5399 5400 /* 5401 * These two routines are not part of the original softfloat distribution. 5402 * 5403 * They are based on the corresponding conversions to integer but return 5404 * unsigned numbers instead since these functions are required by GCC. 5405 * 5406 * Added by Mark Brinicombe <mark@netbsd.org> 27/09/97 5407 * 5408 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5409 */ 5410 5411 /* 5412 ------------------------------------------------------------------------------- 5413 Returns the result of converting the double-precision floating-point value 5414 `a' to the 32-bit unsigned integer format. The conversion is 5415 performed according to the IEC/IEEE Standard for Binary Floating-point 5416 Arithmetic, except that the conversion is always rounded toward zero. If 5417 `a' is a NaN, the largest positive integer is returned. If the conversion 5418 overflows, the largest integer positive is returned. 5419 ------------------------------------------------------------------------------- 5420 */ 5421 uint32 float64_to_uint32_round_to_zero( float64 a ) 5422 { 5423 flag aSign; 5424 int16 aExp, shiftCount; 5425 bits64 aSig, savedASig; 5426 uint32 z; 5427 5428 aSig = extractFloat64Frac( a ); 5429 aExp = extractFloat64Exp( a ); 5430 aSign = extractFloat64Sign( a ); 5431 5432 if (aSign) { 5433 float_raise( float_flag_invalid ); 5434 return(0); 5435 } 5436 5437 if ( 0x41E < aExp ) { 5438 float_raise( float_flag_invalid ); 5439 return 0xffffffff; 5440 } 5441 else if ( aExp < 0x3FF ) { 5442 if ( aExp || aSig ) float_set_inexact(); 5443 return 0; 5444 } 5445 aSig |= LIT64( 0x0010000000000000 ); 5446 shiftCount = 0x433 - aExp; 5447 savedASig = aSig; 5448 aSig >>= shiftCount; 5449 z = aSig; 5450 if ( ( aSig<<shiftCount ) != savedASig ) { 5451 float_set_inexact(); 5452 } 5453 return z; 5454 5455 } 5456 5457 /* 5458 ------------------------------------------------------------------------------- 5459 Returns the result of converting the single-precision floating-point value 5460 `a' to the 32-bit unsigned integer format. The conversion is 5461 performed according to the IEC/IEEE Standard for Binary Floating-point 5462 Arithmetic, except that the conversion is always rounded toward zero. If 5463 `a' is a NaN, the largest positive integer is returned. If the conversion 5464 overflows, the largest positive integer is returned. 5465 ------------------------------------------------------------------------------- 5466 */ 5467 uint32 float32_to_uint32_round_to_zero( float32 a ) 5468 { 5469 flag aSign; 5470 int16 aExp, shiftCount; 5471 bits32 aSig; 5472 uint32 z; 5473 5474 aSig = extractFloat32Frac( a ); 5475 aExp = extractFloat32Exp( a ); 5476 aSign = extractFloat32Sign( a ); 5477 shiftCount = aExp - 0x9E; 5478 5479 if (aSign) { 5480 float_raise( float_flag_invalid ); 5481 return(0); 5482 } 5483 if ( 0 < shiftCount ) { 5484 float_raise( float_flag_invalid ); 5485 return 0xFFFFFFFF; 5486 } 5487 else if ( aExp <= 0x7E ) { 5488 if ( aExp | aSig ) float_set_inexact(); 5489 return 0; 5490 } 5491 aSig = ( aSig | 0x800000 )<<8; 5492 z = aSig>>( - shiftCount ); 5493 if ( aSig<<( shiftCount & 31 ) ) { 5494 float_set_inexact(); 5495 } 5496 return z; 5497 5498 } 5499 5500 #endif 5501 5502 #endif /* _STANDALONE */ 5503