1 /* $NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /* 18 =============================================================================== 19 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 #include <sys/cdefs.h> 48 #if defined(LIBC_SCCS) && !defined(lint) 49 __RCSID("$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $"); 50 #endif /* LIBC_SCCS and not lint */ 51 52 #ifdef SOFTFLOAT_FOR_GCC 53 #include "softfloat-for-gcc.h" 54 #endif 55 56 #include "milieu.h" 57 #include "softfloat.h" 58 59 /* 60 * Conversions between floats as stored in memory and floats as 61 * SoftFloat uses them 62 */ 63 #ifndef FLOAT64_DEMANGLE 64 #define FLOAT64_DEMANGLE(a) (a) 65 #endif 66 #ifndef FLOAT64_MANGLE 67 #define FLOAT64_MANGLE(a) (a) 68 #endif 69 70 /* 71 ------------------------------------------------------------------------------- 72 Floating-point rounding mode, extended double-precision rounding precision, 73 and exception flags. 74 ------------------------------------------------------------------------------- 75 */ 76 77 /* 78 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday. 79 * Right now, it does not. I've removed all other dynamic global 80 * variables. [ross] 81 */ 82 #ifdef FLOATX80 83 int8 floatx80_rounding_precision = 80; 84 #endif 85 86 /* 87 ------------------------------------------------------------------------------- 88 Primitive arithmetic functions, including multi-word arithmetic, and 89 division and square root approximations. (Can be specialized to target if 90 desired.) 91 ------------------------------------------------------------------------------- 92 */ 93 #include "softfloat-macros.h" 94 95 /* 96 ------------------------------------------------------------------------------- 97 Functions and definitions to determine: (1) whether tininess for underflow 98 is detected before or after rounding by default, (2) what (if anything) 99 happens when exceptions are raised, (3) how signaling NaNs are distinguished 100 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 101 are propagated from function inputs to output. These details are target- 102 specific. 103 ------------------------------------------------------------------------------- 104 */ 105 #include "softfloat-specialize.h" 106 107 #ifndef SOFTFLOAT_FOR_GCC /* Not used */ 108 /* 109 ------------------------------------------------------------------------------- 110 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 111 and 7, and returns the properly rounded 32-bit integer corresponding to the 112 input. If `zSign' is 1, the input is negated before being converted to an 113 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 114 is simply rounded to an integer, with the inexact exception raised if the 115 input cannot be represented exactly as an integer. However, if the fixed- 116 point input is too large, the invalid exception is raised and the largest 117 positive or negative integer is returned. 118 ------------------------------------------------------------------------------- 119 */ 120 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 121 { 122 int8 roundingMode; 123 flag roundNearestEven; 124 int8 roundIncrement, roundBits; 125 int32 z; 126 127 roundingMode = float_rounding_mode(); 128 roundNearestEven = ( roundingMode == float_round_nearest_even ); 129 roundIncrement = 0x40; 130 if ( ! roundNearestEven ) { 131 if ( roundingMode == float_round_to_zero ) { 132 roundIncrement = 0; 133 } 134 else { 135 roundIncrement = 0x7F; 136 if ( zSign ) { 137 if ( roundingMode == float_round_up ) roundIncrement = 0; 138 } 139 else { 140 if ( roundingMode == float_round_down ) roundIncrement = 0; 141 } 142 } 143 } 144 roundBits = absZ & 0x7F; 145 absZ = ( absZ + roundIncrement )>>7; 146 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 147 z = absZ; 148 if ( zSign ) z = - z; 149 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 150 float_raise( float_flag_invalid ); 151 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 152 } 153 if ( roundBits ) float_set_inexact(); 154 return z; 155 156 } 157 158 /* 159 ------------------------------------------------------------------------------- 160 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 161 `absZ1', with binary point between bits 63 and 64 (between the input words), 162 and returns the properly rounded 64-bit integer corresponding to the input. 163 If `zSign' is 1, the input is negated before being converted to an integer. 164 Ordinarily, the fixed-point input is simply rounded to an integer, with 165 the inexact exception raised if the input cannot be represented exactly as 166 an integer. However, if the fixed-point input is too large, the invalid 167 exception is raised and the largest positive or negative integer is 168 returned. 169 ------------------------------------------------------------------------------- 170 */ 171 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 172 { 173 int8 roundingMode; 174 flag roundNearestEven, increment; 175 int64 z; 176 177 roundingMode = float_rounding_mode(); 178 roundNearestEven = ( roundingMode == float_round_nearest_even ); 179 increment = ( (sbits64) absZ1 < 0 ); 180 if ( ! roundNearestEven ) { 181 if ( roundingMode == float_round_to_zero ) { 182 increment = 0; 183 } 184 else { 185 if ( zSign ) { 186 increment = ( roundingMode == float_round_down ) && absZ1; 187 } 188 else { 189 increment = ( roundingMode == float_round_up ) && absZ1; 190 } 191 } 192 } 193 if ( increment ) { 194 ++absZ0; 195 if ( absZ0 == 0 ) goto overflow; 196 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 197 } 198 z = absZ0; 199 if ( zSign ) z = - z; 200 if ( z && ( ( z < 0 ) ^ zSign ) ) { 201 overflow: 202 float_raise( float_flag_invalid ); 203 return 204 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 205 : LIT64( 0x7FFFFFFFFFFFFFFF ); 206 } 207 if ( absZ1 ) float_set_inexact(); 208 return z; 209 210 } 211 #endif 212 213 /* 214 ------------------------------------------------------------------------------- 215 Returns the fraction bits of the single-precision floating-point value `a'. 216 ------------------------------------------------------------------------------- 217 */ 218 INLINE bits32 extractFloat32Frac( float32 a ) 219 { 220 221 return a & 0x007FFFFF; 222 223 } 224 225 /* 226 ------------------------------------------------------------------------------- 227 Returns the exponent bits of the single-precision floating-point value `a'. 228 ------------------------------------------------------------------------------- 229 */ 230 INLINE int16 extractFloat32Exp( float32 a ) 231 { 232 233 return ( a>>23 ) & 0xFF; 234 235 } 236 237 /* 238 ------------------------------------------------------------------------------- 239 Returns the sign bit of the single-precision floating-point value `a'. 240 ------------------------------------------------------------------------------- 241 */ 242 INLINE flag extractFloat32Sign( float32 a ) 243 { 244 245 return a>>31; 246 247 } 248 249 /* 250 ------------------------------------------------------------------------------- 251 Normalizes the subnormal single-precision floating-point value represented 252 by the denormalized significand `aSig'. The normalized exponent and 253 significand are stored at the locations pointed to by `zExpPtr' and 254 `zSigPtr', respectively. 255 ------------------------------------------------------------------------------- 256 */ 257 static void 258 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 259 { 260 int8 shiftCount; 261 262 shiftCount = countLeadingZeros32( aSig ) - 8; 263 *zSigPtr = aSig<<shiftCount; 264 *zExpPtr = 1 - shiftCount; 265 266 } 267 268 /* 269 ------------------------------------------------------------------------------- 270 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 271 single-precision floating-point value, returning the result. After being 272 shifted into the proper positions, the three fields are simply added 273 together to form the result. This means that any integer portion of `zSig' 274 will be added into the exponent. Since a properly normalized significand 275 will have an integer portion equal to 1, the `zExp' input should be 1 less 276 than the desired result exponent whenever `zSig' is a complete, normalized 277 significand. 278 ------------------------------------------------------------------------------- 279 */ 280 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 281 { 282 283 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 284 285 } 286 287 /* 288 ------------------------------------------------------------------------------- 289 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 290 and significand `zSig', and returns the proper single-precision floating- 291 point value corresponding to the abstract input. Ordinarily, the abstract 292 value is simply rounded and packed into the single-precision format, with 293 the inexact exception raised if the abstract input cannot be represented 294 exactly. However, if the abstract value is too large, the overflow and 295 inexact exceptions are raised and an infinity or maximal finite value is 296 returned. If the abstract value is too small, the input value is rounded to 297 a subnormal number, and the underflow and inexact exceptions are raised if 298 the abstract input cannot be represented exactly as a subnormal single- 299 precision floating-point number. 300 The input significand `zSig' has its binary point between bits 30 301 and 29, which is 7 bits to the left of the usual location. This shifted 302 significand must be normalized or smaller. If `zSig' is not normalized, 303 `zExp' must be 0; in that case, the result returned is a subnormal number, 304 and it must not require rounding. In the usual case that `zSig' is 305 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 306 The handling of underflow and overflow follows the IEC/IEEE Standard for 307 Binary Floating-Point Arithmetic. 308 ------------------------------------------------------------------------------- 309 */ 310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 311 { 312 int8 roundingMode; 313 flag roundNearestEven; 314 int8 roundIncrement, roundBits; 315 flag isTiny; 316 317 roundingMode = float_rounding_mode(); 318 roundNearestEven = ( roundingMode == float_round_nearest_even ); 319 roundIncrement = 0x40; 320 if ( ! roundNearestEven ) { 321 if ( roundingMode == float_round_to_zero ) { 322 roundIncrement = 0; 323 } 324 else { 325 roundIncrement = 0x7F; 326 if ( zSign ) { 327 if ( roundingMode == float_round_up ) roundIncrement = 0; 328 } 329 else { 330 if ( roundingMode == float_round_down ) roundIncrement = 0; 331 } 332 } 333 } 334 roundBits = zSig & 0x7F; 335 if ( 0xFD <= (bits16) zExp ) { 336 if ( ( 0xFD < zExp ) 337 || ( ( zExp == 0xFD ) 338 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 339 ) { 340 float_raise( float_flag_overflow | float_flag_inexact ); 341 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 342 } 343 if ( zExp < 0 ) { 344 isTiny = 345 ( float_detect_tininess == float_tininess_before_rounding ) 346 || ( zExp < -1 ) 347 || ( zSig + roundIncrement < 0x80000000 ); 348 shift32RightJamming( zSig, - zExp, &zSig ); 349 zExp = 0; 350 roundBits = zSig & 0x7F; 351 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 352 } 353 } 354 if ( roundBits ) float_set_inexact(); 355 zSig = ( zSig + roundIncrement )>>7; 356 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 357 if ( zSig == 0 ) zExp = 0; 358 return packFloat32( zSign, zExp, zSig ); 359 360 } 361 362 /* 363 ------------------------------------------------------------------------------- 364 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 365 and significand `zSig', and returns the proper single-precision floating- 366 point value corresponding to the abstract input. This routine is just like 367 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 368 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 369 floating-point exponent. 370 ------------------------------------------------------------------------------- 371 */ 372 static float32 373 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 374 { 375 int8 shiftCount; 376 377 shiftCount = countLeadingZeros32( zSig ) - 1; 378 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 379 380 } 381 382 /* 383 ------------------------------------------------------------------------------- 384 Returns the fraction bits of the double-precision floating-point value `a'. 385 ------------------------------------------------------------------------------- 386 */ 387 INLINE bits64 extractFloat64Frac( float64 a ) 388 { 389 390 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 391 392 } 393 394 /* 395 ------------------------------------------------------------------------------- 396 Returns the exponent bits of the double-precision floating-point value `a'. 397 ------------------------------------------------------------------------------- 398 */ 399 INLINE int16 extractFloat64Exp( float64 a ) 400 { 401 402 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 403 404 } 405 406 /* 407 ------------------------------------------------------------------------------- 408 Returns the sign bit of the double-precision floating-point value `a'. 409 ------------------------------------------------------------------------------- 410 */ 411 INLINE flag extractFloat64Sign( float64 a ) 412 { 413 414 return FLOAT64_DEMANGLE(a)>>63; 415 416 } 417 418 /* 419 ------------------------------------------------------------------------------- 420 Normalizes the subnormal double-precision floating-point value represented 421 by the denormalized significand `aSig'. The normalized exponent and 422 significand are stored at the locations pointed to by `zExpPtr' and 423 `zSigPtr', respectively. 424 ------------------------------------------------------------------------------- 425 */ 426 static void 427 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 428 { 429 int8 shiftCount; 430 431 shiftCount = countLeadingZeros64( aSig ) - 11; 432 *zSigPtr = aSig<<shiftCount; 433 *zExpPtr = 1 - shiftCount; 434 435 } 436 437 /* 438 ------------------------------------------------------------------------------- 439 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 440 double-precision floating-point value, returning the result. After being 441 shifted into the proper positions, the three fields are simply added 442 together to form the result. This means that any integer portion of `zSig' 443 will be added into the exponent. Since a properly normalized significand 444 will have an integer portion equal to 1, the `zExp' input should be 1 less 445 than the desired result exponent whenever `zSig' is a complete, normalized 446 significand. 447 ------------------------------------------------------------------------------- 448 */ 449 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 450 { 451 452 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 453 ( ( (bits64) zExp )<<52 ) + zSig ); 454 455 } 456 457 /* 458 ------------------------------------------------------------------------------- 459 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 460 and significand `zSig', and returns the proper double-precision floating- 461 point value corresponding to the abstract input. Ordinarily, the abstract 462 value is simply rounded and packed into the double-precision format, with 463 the inexact exception raised if the abstract input cannot be represented 464 exactly. However, if the abstract value is too large, the overflow and 465 inexact exceptions are raised and an infinity or maximal finite value is 466 returned. If the abstract value is too small, the input value is rounded to 467 a subnormal number, and the underflow and inexact exceptions are raised if 468 the abstract input cannot be represented exactly as a subnormal double- 469 precision floating-point number. 470 The input significand `zSig' has its binary point between bits 62 471 and 61, which is 10 bits to the left of the usual location. This shifted 472 significand must be normalized or smaller. If `zSig' is not normalized, 473 `zExp' must be 0; in that case, the result returned is a subnormal number, 474 and it must not require rounding. In the usual case that `zSig' is 475 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 476 The handling of underflow and overflow follows the IEC/IEEE Standard for 477 Binary Floating-Point Arithmetic. 478 ------------------------------------------------------------------------------- 479 */ 480 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 481 { 482 int8 roundingMode; 483 flag roundNearestEven; 484 int16 roundIncrement, roundBits; 485 flag isTiny; 486 487 roundingMode = float_rounding_mode(); 488 roundNearestEven = ( roundingMode == float_round_nearest_even ); 489 roundIncrement = 0x200; 490 if ( ! roundNearestEven ) { 491 if ( roundingMode == float_round_to_zero ) { 492 roundIncrement = 0; 493 } 494 else { 495 roundIncrement = 0x3FF; 496 if ( zSign ) { 497 if ( roundingMode == float_round_up ) roundIncrement = 0; 498 } 499 else { 500 if ( roundingMode == float_round_down ) roundIncrement = 0; 501 } 502 } 503 } 504 roundBits = zSig & 0x3FF; 505 if ( 0x7FD <= (bits16) zExp ) { 506 if ( ( 0x7FD < zExp ) 507 || ( ( zExp == 0x7FD ) 508 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 509 ) { 510 float_raise( float_flag_overflow | float_flag_inexact ); 511 return FLOAT64_MANGLE( 512 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 513 ( roundIncrement == 0 )); 514 } 515 if ( zExp < 0 ) { 516 isTiny = 517 ( float_detect_tininess == float_tininess_before_rounding ) 518 || ( zExp < -1 ) 519 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 520 shift64RightJamming( zSig, - zExp, &zSig ); 521 zExp = 0; 522 roundBits = zSig & 0x3FF; 523 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 524 } 525 } 526 if ( roundBits ) float_set_inexact(); 527 zSig = ( zSig + roundIncrement )>>10; 528 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 529 if ( zSig == 0 ) zExp = 0; 530 return packFloat64( zSign, zExp, zSig ); 531 532 } 533 534 /* 535 ------------------------------------------------------------------------------- 536 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 537 and significand `zSig', and returns the proper double-precision floating- 538 point value corresponding to the abstract input. This routine is just like 539 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 540 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 541 floating-point exponent. 542 ------------------------------------------------------------------------------- 543 */ 544 static float64 545 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 546 { 547 int8 shiftCount; 548 549 shiftCount = countLeadingZeros64( zSig ) - 1; 550 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 551 552 } 553 554 #ifdef FLOATX80 555 556 /* 557 ------------------------------------------------------------------------------- 558 Returns the fraction bits of the extended double-precision floating-point 559 value `a'. 560 ------------------------------------------------------------------------------- 561 */ 562 INLINE bits64 extractFloatx80Frac( floatx80 a ) 563 { 564 565 return a.low; 566 567 } 568 569 /* 570 ------------------------------------------------------------------------------- 571 Returns the exponent bits of the extended double-precision floating-point 572 value `a'. 573 ------------------------------------------------------------------------------- 574 */ 575 INLINE int32 extractFloatx80Exp( floatx80 a ) 576 { 577 578 return a.high & 0x7FFF; 579 580 } 581 582 /* 583 ------------------------------------------------------------------------------- 584 Returns the sign bit of the extended double-precision floating-point value 585 `a'. 586 ------------------------------------------------------------------------------- 587 */ 588 INLINE flag extractFloatx80Sign( floatx80 a ) 589 { 590 591 return a.high>>15; 592 593 } 594 595 /* 596 ------------------------------------------------------------------------------- 597 Normalizes the subnormal extended double-precision floating-point value 598 represented by the denormalized significand `aSig'. The normalized exponent 599 and significand are stored at the locations pointed to by `zExpPtr' and 600 `zSigPtr', respectively. 601 ------------------------------------------------------------------------------- 602 */ 603 static void 604 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 605 { 606 int8 shiftCount; 607 608 shiftCount = countLeadingZeros64( aSig ); 609 *zSigPtr = aSig<<shiftCount; 610 *zExpPtr = 1 - shiftCount; 611 612 } 613 614 /* 615 ------------------------------------------------------------------------------- 616 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 617 extended double-precision floating-point value, returning the result. 618 ------------------------------------------------------------------------------- 619 */ 620 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 621 { 622 floatx80 z; 623 624 z.low = zSig; 625 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 626 return z; 627 628 } 629 630 /* 631 ------------------------------------------------------------------------------- 632 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 633 and extended significand formed by the concatenation of `zSig0' and `zSig1', 634 and returns the proper extended double-precision floating-point value 635 corresponding to the abstract input. Ordinarily, the abstract value is 636 rounded and packed into the extended double-precision format, with the 637 inexact exception raised if the abstract input cannot be represented 638 exactly. However, if the abstract value is too large, the overflow and 639 inexact exceptions are raised and an infinity or maximal finite value is 640 returned. If the abstract value is too small, the input value is rounded to 641 a subnormal number, and the underflow and inexact exceptions are raised if 642 the abstract input cannot be represented exactly as a subnormal extended 643 double-precision floating-point number. 644 If `roundingPrecision' is 32 or 64, the result is rounded to the same 645 number of bits as single or double precision, respectively. Otherwise, the 646 result is rounded to the full precision of the extended double-precision 647 format. 648 The input significand must be normalized or smaller. If the input 649 significand is not normalized, `zExp' must be 0; in that case, the result 650 returned is a subnormal number, and it must not require rounding. The 651 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 652 Floating-Point Arithmetic. 653 ------------------------------------------------------------------------------- 654 */ 655 static floatx80 656 roundAndPackFloatx80( 657 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 658 ) 659 { 660 int8 roundingMode; 661 flag roundNearestEven, increment, isTiny; 662 int64 roundIncrement, roundMask, roundBits; 663 664 roundingMode = float_rounding_mode(); 665 roundNearestEven = ( roundingMode == float_round_nearest_even ); 666 if ( roundingPrecision == 80 ) goto precision80; 667 if ( roundingPrecision == 64 ) { 668 roundIncrement = LIT64( 0x0000000000000400 ); 669 roundMask = LIT64( 0x00000000000007FF ); 670 } 671 else if ( roundingPrecision == 32 ) { 672 roundIncrement = LIT64( 0x0000008000000000 ); 673 roundMask = LIT64( 0x000000FFFFFFFFFF ); 674 } 675 else { 676 goto precision80; 677 } 678 zSig0 |= ( zSig1 != 0 ); 679 if ( ! roundNearestEven ) { 680 if ( roundingMode == float_round_to_zero ) { 681 roundIncrement = 0; 682 } 683 else { 684 roundIncrement = roundMask; 685 if ( zSign ) { 686 if ( roundingMode == float_round_up ) roundIncrement = 0; 687 } 688 else { 689 if ( roundingMode == float_round_down ) roundIncrement = 0; 690 } 691 } 692 } 693 roundBits = zSig0 & roundMask; 694 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 695 if ( ( 0x7FFE < zExp ) 696 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 697 ) { 698 goto overflow; 699 } 700 if ( zExp <= 0 ) { 701 isTiny = 702 ( float_detect_tininess == float_tininess_before_rounding ) 703 || ( zExp < 0 ) 704 || ( zSig0 <= zSig0 + roundIncrement ); 705 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 706 zExp = 0; 707 roundBits = zSig0 & roundMask; 708 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 709 if ( roundBits ) float_set_inexact(); 710 zSig0 += roundIncrement; 711 if ( (sbits64) zSig0 < 0 ) zExp = 1; 712 roundIncrement = roundMask + 1; 713 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 714 roundMask |= roundIncrement; 715 } 716 zSig0 &= ~ roundMask; 717 return packFloatx80( zSign, zExp, zSig0 ); 718 } 719 } 720 if ( roundBits ) float_set_inexact(); 721 zSig0 += roundIncrement; 722 if ( zSig0 < roundIncrement ) { 723 ++zExp; 724 zSig0 = LIT64( 0x8000000000000000 ); 725 } 726 roundIncrement = roundMask + 1; 727 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 728 roundMask |= roundIncrement; 729 } 730 zSig0 &= ~ roundMask; 731 if ( zSig0 == 0 ) zExp = 0; 732 return packFloatx80( zSign, zExp, zSig0 ); 733 precision80: 734 increment = ( (sbits64) zSig1 < 0 ); 735 if ( ! roundNearestEven ) { 736 if ( roundingMode == float_round_to_zero ) { 737 increment = 0; 738 } 739 else { 740 if ( zSign ) { 741 increment = ( roundingMode == float_round_down ) && zSig1; 742 } 743 else { 744 increment = ( roundingMode == float_round_up ) && zSig1; 745 } 746 } 747 } 748 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 749 if ( ( 0x7FFE < zExp ) 750 || ( ( zExp == 0x7FFE ) 751 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 752 && increment 753 ) 754 ) { 755 roundMask = 0; 756 overflow: 757 float_raise( float_flag_overflow | float_flag_inexact ); 758 if ( ( roundingMode == float_round_to_zero ) 759 || ( zSign && ( roundingMode == float_round_up ) ) 760 || ( ! zSign && ( roundingMode == float_round_down ) ) 761 ) { 762 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 763 } 764 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 765 } 766 if ( zExp <= 0 ) { 767 isTiny = 768 ( float_detect_tininess == float_tininess_before_rounding ) 769 || ( zExp < 0 ) 770 || ! increment 771 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 772 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 773 zExp = 0; 774 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 775 if ( zSig1 ) float_set_inexact(); 776 if ( roundNearestEven ) { 777 increment = ( (sbits64) zSig1 < 0 ); 778 } 779 else { 780 if ( zSign ) { 781 increment = ( roundingMode == float_round_down ) && zSig1; 782 } 783 else { 784 increment = ( roundingMode == float_round_up ) && zSig1; 785 } 786 } 787 if ( increment ) { 788 ++zSig0; 789 zSig0 &= 790 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 791 if ( (sbits64) zSig0 < 0 ) zExp = 1; 792 } 793 return packFloatx80( zSign, zExp, zSig0 ); 794 } 795 } 796 if ( zSig1 ) float_set_inexact(); 797 if ( increment ) { 798 ++zSig0; 799 if ( zSig0 == 0 ) { 800 ++zExp; 801 zSig0 = LIT64( 0x8000000000000000 ); 802 } 803 else { 804 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 805 } 806 } 807 else { 808 if ( zSig0 == 0 ) zExp = 0; 809 } 810 return packFloatx80( zSign, zExp, zSig0 ); 811 812 } 813 814 /* 815 ------------------------------------------------------------------------------- 816 Takes an abstract floating-point value having sign `zSign', exponent 817 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 818 and returns the proper extended double-precision floating-point value 819 corresponding to the abstract input. This routine is just like 820 `roundAndPackFloatx80' except that the input significand does not have to be 821 normalized. 822 ------------------------------------------------------------------------------- 823 */ 824 static floatx80 825 normalizeRoundAndPackFloatx80( 826 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 827 ) 828 { 829 int8 shiftCount; 830 831 if ( zSig0 == 0 ) { 832 zSig0 = zSig1; 833 zSig1 = 0; 834 zExp -= 64; 835 } 836 shiftCount = countLeadingZeros64( zSig0 ); 837 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 838 zExp -= shiftCount; 839 return 840 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 841 842 } 843 844 #endif 845 846 #ifdef FLOAT128 847 848 /* 849 ------------------------------------------------------------------------------- 850 Returns the least-significant 64 fraction bits of the quadruple-precision 851 floating-point value `a'. 852 ------------------------------------------------------------------------------- 853 */ 854 INLINE bits64 extractFloat128Frac1( float128 a ) 855 { 856 857 return a.low; 858 859 } 860 861 /* 862 ------------------------------------------------------------------------------- 863 Returns the most-significant 48 fraction bits of the quadruple-precision 864 floating-point value `a'. 865 ------------------------------------------------------------------------------- 866 */ 867 INLINE bits64 extractFloat128Frac0( float128 a ) 868 { 869 870 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 871 872 } 873 874 /* 875 ------------------------------------------------------------------------------- 876 Returns the exponent bits of the quadruple-precision floating-point value 877 `a'. 878 ------------------------------------------------------------------------------- 879 */ 880 INLINE int32 extractFloat128Exp( float128 a ) 881 { 882 883 return ( a.high>>48 ) & 0x7FFF; 884 885 } 886 887 /* 888 ------------------------------------------------------------------------------- 889 Returns the sign bit of the quadruple-precision floating-point value `a'. 890 ------------------------------------------------------------------------------- 891 */ 892 INLINE flag extractFloat128Sign( float128 a ) 893 { 894 895 return a.high>>63; 896 897 } 898 899 /* 900 ------------------------------------------------------------------------------- 901 Normalizes the subnormal quadruple-precision floating-point value 902 represented by the denormalized significand formed by the concatenation of 903 `aSig0' and `aSig1'. The normalized exponent is stored at the location 904 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 905 significand are stored at the location pointed to by `zSig0Ptr', and the 906 least significant 64 bits of the normalized significand are stored at the 907 location pointed to by `zSig1Ptr'. 908 ------------------------------------------------------------------------------- 909 */ 910 static void 911 normalizeFloat128Subnormal( 912 bits64 aSig0, 913 bits64 aSig1, 914 int32 *zExpPtr, 915 bits64 *zSig0Ptr, 916 bits64 *zSig1Ptr 917 ) 918 { 919 int8 shiftCount; 920 921 if ( aSig0 == 0 ) { 922 shiftCount = countLeadingZeros64( aSig1 ) - 15; 923 if ( shiftCount < 0 ) { 924 *zSig0Ptr = aSig1>>( - shiftCount ); 925 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 926 } 927 else { 928 *zSig0Ptr = aSig1<<shiftCount; 929 *zSig1Ptr = 0; 930 } 931 *zExpPtr = - shiftCount - 63; 932 } 933 else { 934 shiftCount = countLeadingZeros64( aSig0 ) - 15; 935 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 936 *zExpPtr = 1 - shiftCount; 937 } 938 939 } 940 941 /* 942 ------------------------------------------------------------------------------- 943 Packs the sign `zSign', the exponent `zExp', and the significand formed 944 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 945 floating-point value, returning the result. After being shifted into the 946 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 947 added together to form the most significant 32 bits of the result. This 948 means that any integer portion of `zSig0' will be added into the exponent. 949 Since a properly normalized significand will have an integer portion equal 950 to 1, the `zExp' input should be 1 less than the desired result exponent 951 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 952 significand. 953 ------------------------------------------------------------------------------- 954 */ 955 INLINE float128 956 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 957 { 958 float128 z; 959 960 z.low = zSig1; 961 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 962 return z; 963 964 } 965 966 /* 967 ------------------------------------------------------------------------------- 968 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 969 and extended significand formed by the concatenation of `zSig0', `zSig1', 970 and `zSig2', and returns the proper quadruple-precision floating-point value 971 corresponding to the abstract input. Ordinarily, the abstract value is 972 simply rounded and packed into the quadruple-precision format, with the 973 inexact exception raised if the abstract input cannot be represented 974 exactly. However, if the abstract value is too large, the overflow and 975 inexact exceptions are raised and an infinity or maximal finite value is 976 returned. If the abstract value is too small, the input value is rounded to 977 a subnormal number, and the underflow and inexact exceptions are raised if 978 the abstract input cannot be represented exactly as a subnormal quadruple- 979 precision floating-point number. 980 The input significand must be normalized or smaller. If the input 981 significand is not normalized, `zExp' must be 0; in that case, the result 982 returned is a subnormal number, and it must not require rounding. In the 983 usual case that the input significand is normalized, `zExp' must be 1 less 984 than the ``true'' floating-point exponent. The handling of underflow and 985 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 986 ------------------------------------------------------------------------------- 987 */ 988 static float128 989 roundAndPackFloat128( 990 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 991 { 992 int8 roundingMode; 993 flag roundNearestEven, increment, isTiny; 994 995 roundingMode = float_rounding_mode(); 996 roundNearestEven = ( roundingMode == float_round_nearest_even ); 997 increment = ( (sbits64) zSig2 < 0 ); 998 if ( ! roundNearestEven ) { 999 if ( roundingMode == float_round_to_zero ) { 1000 increment = 0; 1001 } 1002 else { 1003 if ( zSign ) { 1004 increment = ( roundingMode == float_round_down ) && zSig2; 1005 } 1006 else { 1007 increment = ( roundingMode == float_round_up ) && zSig2; 1008 } 1009 } 1010 } 1011 if ( 0x7FFD <= (bits32) zExp ) { 1012 if ( ( 0x7FFD < zExp ) 1013 || ( ( zExp == 0x7FFD ) 1014 && eq128( 1015 LIT64( 0x0001FFFFFFFFFFFF ), 1016 LIT64( 0xFFFFFFFFFFFFFFFF ), 1017 zSig0, 1018 zSig1 1019 ) 1020 && increment 1021 ) 1022 ) { 1023 float_raise( float_flag_overflow | float_flag_inexact ); 1024 if ( ( roundingMode == float_round_to_zero ) 1025 || ( zSign && ( roundingMode == float_round_up ) ) 1026 || ( ! zSign && ( roundingMode == float_round_down ) ) 1027 ) { 1028 return 1029 packFloat128( 1030 zSign, 1031 0x7FFE, 1032 LIT64( 0x0000FFFFFFFFFFFF ), 1033 LIT64( 0xFFFFFFFFFFFFFFFF ) 1034 ); 1035 } 1036 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1037 } 1038 if ( zExp < 0 ) { 1039 isTiny = 1040 ( float_detect_tininess == float_tininess_before_rounding ) 1041 || ( zExp < -1 ) 1042 || ! increment 1043 || lt128( 1044 zSig0, 1045 zSig1, 1046 LIT64( 0x0001FFFFFFFFFFFF ), 1047 LIT64( 0xFFFFFFFFFFFFFFFF ) 1048 ); 1049 shift128ExtraRightJamming( 1050 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1051 zExp = 0; 1052 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1053 if ( roundNearestEven ) { 1054 increment = ( (sbits64) zSig2 < 0 ); 1055 } 1056 else { 1057 if ( zSign ) { 1058 increment = ( roundingMode == float_round_down ) && zSig2; 1059 } 1060 else { 1061 increment = ( roundingMode == float_round_up ) && zSig2; 1062 } 1063 } 1064 } 1065 } 1066 if ( zSig2 ) float_set_inexact(); 1067 if ( increment ) { 1068 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1069 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1070 } 1071 else { 1072 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1073 } 1074 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1075 1076 } 1077 1078 /* 1079 ------------------------------------------------------------------------------- 1080 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1081 and significand formed by the concatenation of `zSig0' and `zSig1', and 1082 returns the proper quadruple-precision floating-point value corresponding 1083 to the abstract input. This routine is just like `roundAndPackFloat128' 1084 except that the input significand has fewer bits and does not have to be 1085 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1086 point exponent. 1087 ------------------------------------------------------------------------------- 1088 */ 1089 static float128 1090 normalizeRoundAndPackFloat128( 1091 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1092 { 1093 int8 shiftCount; 1094 bits64 zSig2; 1095 1096 if ( zSig0 == 0 ) { 1097 zSig0 = zSig1; 1098 zSig1 = 0; 1099 zExp -= 64; 1100 } 1101 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1102 if ( 0 <= shiftCount ) { 1103 zSig2 = 0; 1104 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1105 } 1106 else { 1107 shift128ExtraRightJamming( 1108 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1109 } 1110 zExp -= shiftCount; 1111 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1112 1113 } 1114 1115 #endif 1116 1117 /* 1118 ------------------------------------------------------------------------------- 1119 Returns the result of converting the 32-bit two's complement integer `a' 1120 to the single-precision floating-point format. The conversion is performed 1121 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1122 ------------------------------------------------------------------------------- 1123 */ 1124 float32 int32_to_float32( int32 a ) 1125 { 1126 flag zSign; 1127 1128 if ( a == 0 ) return 0; 1129 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1130 zSign = ( a < 0 ); 1131 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1132 1133 } 1134 1135 /* 1136 ------------------------------------------------------------------------------- 1137 Returns the result of converting the 32-bit two's complement integer `a' 1138 to the double-precision floating-point format. The conversion is performed 1139 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1140 ------------------------------------------------------------------------------- 1141 */ 1142 float64 int32_to_float64( int32 a ) 1143 { 1144 flag zSign; 1145 uint32 absA; 1146 int8 shiftCount; 1147 bits64 zSig; 1148 1149 if ( a == 0 ) return 0; 1150 zSign = ( a < 0 ); 1151 absA = zSign ? - a : a; 1152 shiftCount = countLeadingZeros32( absA ) + 21; 1153 zSig = absA; 1154 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1155 1156 } 1157 1158 #ifdef FLOATX80 1159 1160 /* 1161 ------------------------------------------------------------------------------- 1162 Returns the result of converting the 32-bit two's complement integer `a' 1163 to the extended double-precision floating-point format. The conversion 1164 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1165 Arithmetic. 1166 ------------------------------------------------------------------------------- 1167 */ 1168 floatx80 int32_to_floatx80( int32 a ) 1169 { 1170 flag zSign; 1171 uint32 absA; 1172 int8 shiftCount; 1173 bits64 zSig; 1174 1175 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1176 zSign = ( a < 0 ); 1177 absA = zSign ? - a : a; 1178 shiftCount = countLeadingZeros32( absA ) + 32; 1179 zSig = absA; 1180 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1181 1182 } 1183 1184 #endif 1185 1186 #ifdef FLOAT128 1187 1188 /* 1189 ------------------------------------------------------------------------------- 1190 Returns the result of converting the 32-bit two's complement integer `a' to 1191 the quadruple-precision floating-point format. The conversion is performed 1192 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1193 ------------------------------------------------------------------------------- 1194 */ 1195 float128 int32_to_float128( int32 a ) 1196 { 1197 flag zSign; 1198 uint32 absA; 1199 int8 shiftCount; 1200 bits64 zSig0; 1201 1202 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1203 zSign = ( a < 0 ); 1204 absA = zSign ? - a : a; 1205 shiftCount = countLeadingZeros32( absA ) + 17; 1206 zSig0 = absA; 1207 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1208 1209 } 1210 1211 #endif 1212 1213 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1214 /* 1215 ------------------------------------------------------------------------------- 1216 Returns the result of converting the 64-bit two's complement integer `a' 1217 to the single-precision floating-point format. The conversion is performed 1218 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1219 ------------------------------------------------------------------------------- 1220 */ 1221 float32 int64_to_float32( int64 a ) 1222 { 1223 flag zSign; 1224 uint64 absA; 1225 int8 shiftCount; 1226 1227 if ( a == 0 ) return 0; 1228 zSign = ( a < 0 ); 1229 absA = zSign ? - a : a; 1230 shiftCount = countLeadingZeros64( absA ) - 40; 1231 if ( 0 <= shiftCount ) { 1232 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1233 } 1234 else { 1235 shiftCount += 7; 1236 if ( shiftCount < 0 ) { 1237 shift64RightJamming( absA, - shiftCount, &absA ); 1238 } 1239 else { 1240 absA <<= shiftCount; 1241 } 1242 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1243 } 1244 1245 } 1246 1247 /* 1248 ------------------------------------------------------------------------------- 1249 Returns the result of converting the 64-bit two's complement integer `a' 1250 to the double-precision floating-point format. The conversion is performed 1251 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1252 ------------------------------------------------------------------------------- 1253 */ 1254 float64 int64_to_float64( int64 a ) 1255 { 1256 flag zSign; 1257 1258 if ( a == 0 ) return 0; 1259 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1260 return packFloat64( 1, 0x43E, 0 ); 1261 } 1262 zSign = ( a < 0 ); 1263 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1264 1265 } 1266 1267 #ifdef FLOATX80 1268 1269 /* 1270 ------------------------------------------------------------------------------- 1271 Returns the result of converting the 64-bit two's complement integer `a' 1272 to the extended double-precision floating-point format. The conversion 1273 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1274 Arithmetic. 1275 ------------------------------------------------------------------------------- 1276 */ 1277 floatx80 int64_to_floatx80( int64 a ) 1278 { 1279 flag zSign; 1280 uint64 absA; 1281 int8 shiftCount; 1282 1283 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1284 zSign = ( a < 0 ); 1285 absA = zSign ? - a : a; 1286 shiftCount = countLeadingZeros64( absA ); 1287 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1288 1289 } 1290 1291 #endif 1292 1293 #ifdef FLOAT128 1294 1295 /* 1296 ------------------------------------------------------------------------------- 1297 Returns the result of converting the 64-bit two's complement integer `a' to 1298 the quadruple-precision floating-point format. The conversion is performed 1299 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1300 ------------------------------------------------------------------------------- 1301 */ 1302 float128 int64_to_float128( int64 a ) 1303 { 1304 flag zSign; 1305 uint64 absA; 1306 int8 shiftCount; 1307 int32 zExp; 1308 bits64 zSig0, zSig1; 1309 1310 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1311 zSign = ( a < 0 ); 1312 absA = zSign ? - a : a; 1313 shiftCount = countLeadingZeros64( absA ) + 49; 1314 zExp = 0x406E - shiftCount; 1315 if ( 64 <= shiftCount ) { 1316 zSig1 = 0; 1317 zSig0 = absA; 1318 shiftCount -= 64; 1319 } 1320 else { 1321 zSig1 = absA; 1322 zSig0 = 0; 1323 } 1324 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1325 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1326 1327 } 1328 1329 #endif 1330 #endif /* !SOFTFLOAT_FOR_GCC */ 1331 1332 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1333 /* 1334 ------------------------------------------------------------------------------- 1335 Returns the result of converting the single-precision floating-point value 1336 `a' to the 32-bit two's complement integer format. The conversion is 1337 performed according to the IEC/IEEE Standard for Binary Floating-Point 1338 Arithmetic---which means in particular that the conversion is rounded 1339 according to the current rounding mode. If `a' is a NaN, the largest 1340 positive integer is returned. Otherwise, if the conversion overflows, the 1341 largest integer with the same sign as `a' is returned. 1342 ------------------------------------------------------------------------------- 1343 */ 1344 int32 float32_to_int32( float32 a ) 1345 { 1346 flag aSign; 1347 int16 aExp, shiftCount; 1348 bits32 aSig; 1349 bits64 aSig64; 1350 1351 aSig = extractFloat32Frac( a ); 1352 aExp = extractFloat32Exp( a ); 1353 aSign = extractFloat32Sign( a ); 1354 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1355 if ( aExp ) aSig |= 0x00800000; 1356 shiftCount = 0xAF - aExp; 1357 aSig64 = aSig; 1358 aSig64 <<= 32; 1359 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1360 return roundAndPackInt32( aSign, aSig64 ); 1361 1362 } 1363 #endif /* !SOFTFLOAT_FOR_GCC */ 1364 1365 /* 1366 ------------------------------------------------------------------------------- 1367 Returns the result of converting the single-precision floating-point value 1368 `a' to the 32-bit two's complement integer format. The conversion is 1369 performed according to the IEC/IEEE Standard for Binary Floating-Point 1370 Arithmetic, except that the conversion is always rounded toward zero. 1371 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1372 the conversion overflows, the largest integer with the same sign as `a' is 1373 returned. 1374 ------------------------------------------------------------------------------- 1375 */ 1376 int32 float32_to_int32_round_to_zero( float32 a ) 1377 { 1378 flag aSign; 1379 int16 aExp, shiftCount; 1380 bits32 aSig; 1381 int32 z; 1382 1383 aSig = extractFloat32Frac( a ); 1384 aExp = extractFloat32Exp( a ); 1385 aSign = extractFloat32Sign( a ); 1386 shiftCount = aExp - 0x9E; 1387 if ( 0 <= shiftCount ) { 1388 if ( a != 0xCF000000 ) { 1389 float_raise( float_flag_invalid ); 1390 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1391 } 1392 return (sbits32) 0x80000000; 1393 } 1394 else if ( aExp <= 0x7E ) { 1395 if ( aExp | aSig ) float_set_inexact(); 1396 return 0; 1397 } 1398 aSig = ( aSig | 0x00800000 )<<8; 1399 z = aSig>>( - shiftCount ); 1400 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1401 float_set_inexact(); 1402 } 1403 if ( aSign ) z = - z; 1404 return z; 1405 1406 } 1407 1408 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1409 /* 1410 ------------------------------------------------------------------------------- 1411 Returns the result of converting the single-precision floating-point value 1412 `a' to the 64-bit two's complement integer format. The conversion is 1413 performed according to the IEC/IEEE Standard for Binary Floating-Point 1414 Arithmetic---which means in particular that the conversion is rounded 1415 according to the current rounding mode. If `a' is a NaN, the largest 1416 positive integer is returned. Otherwise, if the conversion overflows, the 1417 largest integer with the same sign as `a' is returned. 1418 ------------------------------------------------------------------------------- 1419 */ 1420 int64 float32_to_int64( float32 a ) 1421 { 1422 flag aSign; 1423 int16 aExp, shiftCount; 1424 bits32 aSig; 1425 bits64 aSig64, aSigExtra; 1426 1427 aSig = extractFloat32Frac( a ); 1428 aExp = extractFloat32Exp( a ); 1429 aSign = extractFloat32Sign( a ); 1430 shiftCount = 0xBE - aExp; 1431 if ( shiftCount < 0 ) { 1432 float_raise( float_flag_invalid ); 1433 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1434 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1435 } 1436 return (sbits64) LIT64( 0x8000000000000000 ); 1437 } 1438 if ( aExp ) aSig |= 0x00800000; 1439 aSig64 = aSig; 1440 aSig64 <<= 40; 1441 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1442 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1443 1444 } 1445 1446 /* 1447 ------------------------------------------------------------------------------- 1448 Returns the result of converting the single-precision floating-point value 1449 `a' to the 64-bit two's complement integer format. The conversion is 1450 performed according to the IEC/IEEE Standard for Binary Floating-Point 1451 Arithmetic, except that the conversion is always rounded toward zero. If 1452 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1453 conversion overflows, the largest integer with the same sign as `a' is 1454 returned. 1455 ------------------------------------------------------------------------------- 1456 */ 1457 int64 float32_to_int64_round_to_zero( float32 a ) 1458 { 1459 flag aSign; 1460 int16 aExp, shiftCount; 1461 bits32 aSig; 1462 bits64 aSig64; 1463 int64 z; 1464 1465 aSig = extractFloat32Frac( a ); 1466 aExp = extractFloat32Exp( a ); 1467 aSign = extractFloat32Sign( a ); 1468 shiftCount = aExp - 0xBE; 1469 if ( 0 <= shiftCount ) { 1470 if ( a != 0xDF000000 ) { 1471 float_raise( float_flag_invalid ); 1472 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1473 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1474 } 1475 } 1476 return (sbits64) LIT64( 0x8000000000000000 ); 1477 } 1478 else if ( aExp <= 0x7E ) { 1479 if ( aExp | aSig ) float_set_inexact(); 1480 return 0; 1481 } 1482 aSig64 = aSig | 0x00800000; 1483 aSig64 <<= 40; 1484 z = aSig64>>( - shiftCount ); 1485 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1486 float_set_inexact(); 1487 } 1488 if ( aSign ) z = - z; 1489 return z; 1490 1491 } 1492 #endif /* !SOFTFLOAT_FOR_GCC */ 1493 1494 /* 1495 ------------------------------------------------------------------------------- 1496 Returns the result of converting the single-precision floating-point value 1497 `a' to the double-precision floating-point format. The conversion is 1498 performed according to the IEC/IEEE Standard for Binary Floating-Point 1499 Arithmetic. 1500 ------------------------------------------------------------------------------- 1501 */ 1502 float64 float32_to_float64( float32 a ) 1503 { 1504 flag aSign; 1505 int16 aExp; 1506 bits32 aSig; 1507 1508 aSig = extractFloat32Frac( a ); 1509 aExp = extractFloat32Exp( a ); 1510 aSign = extractFloat32Sign( a ); 1511 if ( aExp == 0xFF ) { 1512 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1513 return packFloat64( aSign, 0x7FF, 0 ); 1514 } 1515 if ( aExp == 0 ) { 1516 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1517 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1518 --aExp; 1519 } 1520 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1521 1522 } 1523 1524 #ifdef FLOATX80 1525 1526 /* 1527 ------------------------------------------------------------------------------- 1528 Returns the result of converting the single-precision floating-point value 1529 `a' to the extended double-precision floating-point format. The conversion 1530 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1531 Arithmetic. 1532 ------------------------------------------------------------------------------- 1533 */ 1534 floatx80 float32_to_floatx80( float32 a ) 1535 { 1536 flag aSign; 1537 int16 aExp; 1538 bits32 aSig; 1539 1540 aSig = extractFloat32Frac( a ); 1541 aExp = extractFloat32Exp( a ); 1542 aSign = extractFloat32Sign( a ); 1543 if ( aExp == 0xFF ) { 1544 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1545 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1546 } 1547 if ( aExp == 0 ) { 1548 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1549 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1550 } 1551 aSig |= 0x00800000; 1552 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1553 1554 } 1555 1556 #endif 1557 1558 #ifdef FLOAT128 1559 1560 /* 1561 ------------------------------------------------------------------------------- 1562 Returns the result of converting the single-precision floating-point value 1563 `a' to the double-precision floating-point format. The conversion is 1564 performed according to the IEC/IEEE Standard for Binary Floating-Point 1565 Arithmetic. 1566 ------------------------------------------------------------------------------- 1567 */ 1568 float128 float32_to_float128( float32 a ) 1569 { 1570 flag aSign; 1571 int16 aExp; 1572 bits32 aSig; 1573 1574 aSig = extractFloat32Frac( a ); 1575 aExp = extractFloat32Exp( a ); 1576 aSign = extractFloat32Sign( a ); 1577 if ( aExp == 0xFF ) { 1578 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1579 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1580 } 1581 if ( aExp == 0 ) { 1582 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1583 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1584 --aExp; 1585 } 1586 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1587 1588 } 1589 1590 #endif 1591 1592 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1593 /* 1594 ------------------------------------------------------------------------------- 1595 Rounds the single-precision floating-point value `a' to an integer, and 1596 returns the result as a single-precision floating-point value. The 1597 operation is performed according to the IEC/IEEE Standard for Binary 1598 Floating-Point Arithmetic. 1599 ------------------------------------------------------------------------------- 1600 */ 1601 float32 float32_round_to_int( float32 a ) 1602 { 1603 flag aSign; 1604 int16 aExp; 1605 bits32 lastBitMask, roundBitsMask; 1606 int8 roundingMode; 1607 float32 z; 1608 1609 aExp = extractFloat32Exp( a ); 1610 if ( 0x96 <= aExp ) { 1611 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1612 return propagateFloat32NaN( a, a ); 1613 } 1614 return a; 1615 } 1616 if ( aExp <= 0x7E ) { 1617 if ( (bits32) ( a<<1 ) == 0 ) return a; 1618 float_set_inexact(); 1619 aSign = extractFloat32Sign( a ); 1620 switch ( float_rounding_mode() ) { 1621 case float_round_nearest_even: 1622 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1623 return packFloat32( aSign, 0x7F, 0 ); 1624 } 1625 break; 1626 case float_round_down: 1627 return aSign ? 0xBF800000 : 0; 1628 case float_round_up: 1629 return aSign ? 0x80000000 : 0x3F800000; 1630 } 1631 return packFloat32( aSign, 0, 0 ); 1632 } 1633 lastBitMask = 1; 1634 lastBitMask <<= 0x96 - aExp; 1635 roundBitsMask = lastBitMask - 1; 1636 z = a; 1637 roundingMode = float_rounding_mode(); 1638 if ( roundingMode == float_round_nearest_even ) { 1639 z += lastBitMask>>1; 1640 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1641 } 1642 else if ( roundingMode != float_round_to_zero ) { 1643 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1644 z += roundBitsMask; 1645 } 1646 } 1647 z &= ~ roundBitsMask; 1648 if ( z != a ) float_set_inexact(); 1649 return z; 1650 1651 } 1652 #endif /* !SOFTFLOAT_FOR_GCC */ 1653 1654 /* 1655 ------------------------------------------------------------------------------- 1656 Returns the result of adding the absolute values of the single-precision 1657 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1658 before being returned. `zSign' is ignored if the result is a NaN. 1659 The addition is performed according to the IEC/IEEE Standard for Binary 1660 Floating-Point Arithmetic. 1661 ------------------------------------------------------------------------------- 1662 */ 1663 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1664 { 1665 int16 aExp, bExp, zExp; 1666 bits32 aSig, bSig, zSig; 1667 int16 expDiff; 1668 1669 aSig = extractFloat32Frac( a ); 1670 aExp = extractFloat32Exp( a ); 1671 bSig = extractFloat32Frac( b ); 1672 bExp = extractFloat32Exp( b ); 1673 expDiff = aExp - bExp; 1674 aSig <<= 6; 1675 bSig <<= 6; 1676 if ( 0 < expDiff ) { 1677 if ( aExp == 0xFF ) { 1678 if ( aSig ) return propagateFloat32NaN( a, b ); 1679 return a; 1680 } 1681 if ( bExp == 0 ) { 1682 --expDiff; 1683 } 1684 else { 1685 bSig |= 0x20000000; 1686 } 1687 shift32RightJamming( bSig, expDiff, &bSig ); 1688 zExp = aExp; 1689 } 1690 else if ( expDiff < 0 ) { 1691 if ( bExp == 0xFF ) { 1692 if ( bSig ) return propagateFloat32NaN( a, b ); 1693 return packFloat32( zSign, 0xFF, 0 ); 1694 } 1695 if ( aExp == 0 ) { 1696 ++expDiff; 1697 } 1698 else { 1699 aSig |= 0x20000000; 1700 } 1701 shift32RightJamming( aSig, - expDiff, &aSig ); 1702 zExp = bExp; 1703 } 1704 else { 1705 if ( aExp == 0xFF ) { 1706 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1707 return a; 1708 } 1709 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1710 zSig = 0x40000000 + aSig + bSig; 1711 zExp = aExp; 1712 goto roundAndPack; 1713 } 1714 aSig |= 0x20000000; 1715 zSig = ( aSig + bSig )<<1; 1716 --zExp; 1717 if ( (sbits32) zSig < 0 ) { 1718 zSig = aSig + bSig; 1719 ++zExp; 1720 } 1721 roundAndPack: 1722 return roundAndPackFloat32( zSign, zExp, zSig ); 1723 1724 } 1725 1726 /* 1727 ------------------------------------------------------------------------------- 1728 Returns the result of subtracting the absolute values of the single- 1729 precision floating-point values `a' and `b'. If `zSign' is 1, the 1730 difference is negated before being returned. `zSign' is ignored if the 1731 result is a NaN. The subtraction is performed according to the IEC/IEEE 1732 Standard for Binary Floating-Point Arithmetic. 1733 ------------------------------------------------------------------------------- 1734 */ 1735 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1736 { 1737 int16 aExp, bExp, zExp; 1738 bits32 aSig, bSig, zSig; 1739 int16 expDiff; 1740 1741 aSig = extractFloat32Frac( a ); 1742 aExp = extractFloat32Exp( a ); 1743 bSig = extractFloat32Frac( b ); 1744 bExp = extractFloat32Exp( b ); 1745 expDiff = aExp - bExp; 1746 aSig <<= 7; 1747 bSig <<= 7; 1748 if ( 0 < expDiff ) goto aExpBigger; 1749 if ( expDiff < 0 ) goto bExpBigger; 1750 if ( aExp == 0xFF ) { 1751 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1752 float_raise( float_flag_invalid ); 1753 return float32_default_nan; 1754 } 1755 if ( aExp == 0 ) { 1756 aExp = 1; 1757 bExp = 1; 1758 } 1759 if ( bSig < aSig ) goto aBigger; 1760 if ( aSig < bSig ) goto bBigger; 1761 return packFloat32( float_rounding_mode() == float_round_down, 0, 0 ); 1762 bExpBigger: 1763 if ( bExp == 0xFF ) { 1764 if ( bSig ) return propagateFloat32NaN( a, b ); 1765 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1766 } 1767 if ( aExp == 0 ) { 1768 ++expDiff; 1769 } 1770 else { 1771 aSig |= 0x40000000; 1772 } 1773 shift32RightJamming( aSig, - expDiff, &aSig ); 1774 bSig |= 0x40000000; 1775 bBigger: 1776 zSig = bSig - aSig; 1777 zExp = bExp; 1778 zSign ^= 1; 1779 goto normalizeRoundAndPack; 1780 aExpBigger: 1781 if ( aExp == 0xFF ) { 1782 if ( aSig ) return propagateFloat32NaN( a, b ); 1783 return a; 1784 } 1785 if ( bExp == 0 ) { 1786 --expDiff; 1787 } 1788 else { 1789 bSig |= 0x40000000; 1790 } 1791 shift32RightJamming( bSig, expDiff, &bSig ); 1792 aSig |= 0x40000000; 1793 aBigger: 1794 zSig = aSig - bSig; 1795 zExp = aExp; 1796 normalizeRoundAndPack: 1797 --zExp; 1798 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1799 1800 } 1801 1802 /* 1803 ------------------------------------------------------------------------------- 1804 Returns the result of adding the single-precision floating-point values `a' 1805 and `b'. The operation is performed according to the IEC/IEEE Standard for 1806 Binary Floating-Point Arithmetic. 1807 ------------------------------------------------------------------------------- 1808 */ 1809 float32 float32_add( float32 a, float32 b ) 1810 { 1811 flag aSign, bSign; 1812 1813 aSign = extractFloat32Sign( a ); 1814 bSign = extractFloat32Sign( b ); 1815 if ( aSign == bSign ) { 1816 return addFloat32Sigs( a, b, aSign ); 1817 } 1818 else { 1819 return subFloat32Sigs( a, b, aSign ); 1820 } 1821 1822 } 1823 1824 /* 1825 ------------------------------------------------------------------------------- 1826 Returns the result of subtracting the single-precision floating-point values 1827 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1828 for Binary Floating-Point Arithmetic. 1829 ------------------------------------------------------------------------------- 1830 */ 1831 float32 float32_sub( float32 a, float32 b ) 1832 { 1833 flag aSign, bSign; 1834 1835 aSign = extractFloat32Sign( a ); 1836 bSign = extractFloat32Sign( b ); 1837 if ( aSign == bSign ) { 1838 return subFloat32Sigs( a, b, aSign ); 1839 } 1840 else { 1841 return addFloat32Sigs( a, b, aSign ); 1842 } 1843 1844 } 1845 1846 /* 1847 ------------------------------------------------------------------------------- 1848 Returns the result of multiplying the single-precision floating-point values 1849 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1850 for Binary Floating-Point Arithmetic. 1851 ------------------------------------------------------------------------------- 1852 */ 1853 float32 float32_mul( float32 a, float32 b ) 1854 { 1855 flag aSign, bSign, zSign; 1856 int16 aExp, bExp, zExp; 1857 bits32 aSig, bSig; 1858 bits64 zSig64; 1859 bits32 zSig; 1860 1861 aSig = extractFloat32Frac( a ); 1862 aExp = extractFloat32Exp( a ); 1863 aSign = extractFloat32Sign( a ); 1864 bSig = extractFloat32Frac( b ); 1865 bExp = extractFloat32Exp( b ); 1866 bSign = extractFloat32Sign( b ); 1867 zSign = aSign ^ bSign; 1868 if ( aExp == 0xFF ) { 1869 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1870 return propagateFloat32NaN( a, b ); 1871 } 1872 if ( ( bExp | bSig ) == 0 ) { 1873 float_raise( float_flag_invalid ); 1874 return float32_default_nan; 1875 } 1876 return packFloat32( zSign, 0xFF, 0 ); 1877 } 1878 if ( bExp == 0xFF ) { 1879 if ( bSig ) return propagateFloat32NaN( a, b ); 1880 if ( ( aExp | aSig ) == 0 ) { 1881 float_raise( float_flag_invalid ); 1882 return float32_default_nan; 1883 } 1884 return packFloat32( zSign, 0xFF, 0 ); 1885 } 1886 if ( aExp == 0 ) { 1887 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1888 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1889 } 1890 if ( bExp == 0 ) { 1891 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1892 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1893 } 1894 zExp = aExp + bExp - 0x7F; 1895 aSig = ( aSig | 0x00800000 )<<7; 1896 bSig = ( bSig | 0x00800000 )<<8; 1897 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1898 zSig = zSig64; 1899 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1900 zSig <<= 1; 1901 --zExp; 1902 } 1903 return roundAndPackFloat32( zSign, zExp, zSig ); 1904 1905 } 1906 1907 /* 1908 ------------------------------------------------------------------------------- 1909 Returns the result of dividing the single-precision floating-point value `a' 1910 by the corresponding value `b'. The operation is performed according to the 1911 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1912 ------------------------------------------------------------------------------- 1913 */ 1914 float32 float32_div( float32 a, float32 b ) 1915 { 1916 flag aSign, bSign, zSign; 1917 int16 aExp, bExp, zExp; 1918 bits32 aSig, bSig, zSig; 1919 1920 aSig = extractFloat32Frac( a ); 1921 aExp = extractFloat32Exp( a ); 1922 aSign = extractFloat32Sign( a ); 1923 bSig = extractFloat32Frac( b ); 1924 bExp = extractFloat32Exp( b ); 1925 bSign = extractFloat32Sign( b ); 1926 zSign = aSign ^ bSign; 1927 if ( aExp == 0xFF ) { 1928 if ( aSig ) return propagateFloat32NaN( a, b ); 1929 if ( bExp == 0xFF ) { 1930 if ( bSig ) return propagateFloat32NaN( a, b ); 1931 float_raise( float_flag_invalid ); 1932 return float32_default_nan; 1933 } 1934 return packFloat32( zSign, 0xFF, 0 ); 1935 } 1936 if ( bExp == 0xFF ) { 1937 if ( bSig ) return propagateFloat32NaN( a, b ); 1938 return packFloat32( zSign, 0, 0 ); 1939 } 1940 if ( bExp == 0 ) { 1941 if ( bSig == 0 ) { 1942 if ( ( aExp | aSig ) == 0 ) { 1943 float_raise( float_flag_invalid ); 1944 return float32_default_nan; 1945 } 1946 float_raise( float_flag_divbyzero ); 1947 return packFloat32( zSign, 0xFF, 0 ); 1948 } 1949 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1950 } 1951 if ( aExp == 0 ) { 1952 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1953 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1954 } 1955 zExp = aExp - bExp + 0x7D; 1956 aSig = ( aSig | 0x00800000 )<<7; 1957 bSig = ( bSig | 0x00800000 )<<8; 1958 if ( bSig <= ( aSig + aSig ) ) { 1959 aSig >>= 1; 1960 ++zExp; 1961 } 1962 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 1963 if ( ( zSig & 0x3F ) == 0 ) { 1964 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 1965 } 1966 return roundAndPackFloat32( zSign, zExp, zSig ); 1967 1968 } 1969 1970 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1971 /* 1972 ------------------------------------------------------------------------------- 1973 Returns the remainder of the single-precision floating-point value `a' 1974 with respect to the corresponding value `b'. The operation is performed 1975 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1976 ------------------------------------------------------------------------------- 1977 */ 1978 float32 float32_rem( float32 a, float32 b ) 1979 { 1980 flag aSign, bSign, zSign; 1981 int16 aExp, bExp, expDiff; 1982 bits32 aSig, bSig; 1983 bits32 q; 1984 bits64 aSig64, bSig64, q64; 1985 bits32 alternateASig; 1986 sbits32 sigMean; 1987 1988 aSig = extractFloat32Frac( a ); 1989 aExp = extractFloat32Exp( a ); 1990 aSign = extractFloat32Sign( a ); 1991 bSig = extractFloat32Frac( b ); 1992 bExp = extractFloat32Exp( b ); 1993 bSign = extractFloat32Sign( b ); 1994 if ( aExp == 0xFF ) { 1995 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1996 return propagateFloat32NaN( a, b ); 1997 } 1998 float_raise( float_flag_invalid ); 1999 return float32_default_nan; 2000 } 2001 if ( bExp == 0xFF ) { 2002 if ( bSig ) return propagateFloat32NaN( a, b ); 2003 return a; 2004 } 2005 if ( bExp == 0 ) { 2006 if ( bSig == 0 ) { 2007 float_raise( float_flag_invalid ); 2008 return float32_default_nan; 2009 } 2010 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2011 } 2012 if ( aExp == 0 ) { 2013 if ( aSig == 0 ) return a; 2014 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2015 } 2016 expDiff = aExp - bExp; 2017 aSig |= 0x00800000; 2018 bSig |= 0x00800000; 2019 if ( expDiff < 32 ) { 2020 aSig <<= 8; 2021 bSig <<= 8; 2022 if ( expDiff < 0 ) { 2023 if ( expDiff < -1 ) return a; 2024 aSig >>= 1; 2025 } 2026 q = ( bSig <= aSig ); 2027 if ( q ) aSig -= bSig; 2028 if ( 0 < expDiff ) { 2029 q = ( ( (bits64) aSig )<<32 ) / bSig; 2030 q >>= 32 - expDiff; 2031 bSig >>= 2; 2032 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2033 } 2034 else { 2035 aSig >>= 2; 2036 bSig >>= 2; 2037 } 2038 } 2039 else { 2040 if ( bSig <= aSig ) aSig -= bSig; 2041 aSig64 = ( (bits64) aSig )<<40; 2042 bSig64 = ( (bits64) bSig )<<40; 2043 expDiff -= 64; 2044 while ( 0 < expDiff ) { 2045 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2046 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2047 aSig64 = - ( ( bSig * q64 )<<38 ); 2048 expDiff -= 62; 2049 } 2050 expDiff += 64; 2051 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2052 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2053 q = q64>>( 64 - expDiff ); 2054 bSig <<= 6; 2055 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2056 } 2057 do { 2058 alternateASig = aSig; 2059 ++q; 2060 aSig -= bSig; 2061 } while ( 0 <= (sbits32) aSig ); 2062 sigMean = aSig + alternateASig; 2063 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2064 aSig = alternateASig; 2065 } 2066 zSign = ( (sbits32) aSig < 0 ); 2067 if ( zSign ) aSig = - aSig; 2068 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2069 2070 } 2071 #endif /* !SOFTFLOAT_FOR_GCC */ 2072 2073 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2074 /* 2075 ------------------------------------------------------------------------------- 2076 Returns the square root of the single-precision floating-point value `a'. 2077 The operation is performed according to the IEC/IEEE Standard for Binary 2078 Floating-Point Arithmetic. 2079 ------------------------------------------------------------------------------- 2080 */ 2081 float32 float32_sqrt( float32 a ) 2082 { 2083 flag aSign; 2084 int16 aExp, zExp; 2085 bits32 aSig, zSig; 2086 bits64 rem, term; 2087 2088 aSig = extractFloat32Frac( a ); 2089 aExp = extractFloat32Exp( a ); 2090 aSign = extractFloat32Sign( a ); 2091 if ( aExp == 0xFF ) { 2092 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2093 if ( ! aSign ) return a; 2094 float_raise( float_flag_invalid ); 2095 return float32_default_nan; 2096 } 2097 if ( aSign ) { 2098 if ( ( aExp | aSig ) == 0 ) return a; 2099 float_raise( float_flag_invalid ); 2100 return float32_default_nan; 2101 } 2102 if ( aExp == 0 ) { 2103 if ( aSig == 0 ) return 0; 2104 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2105 } 2106 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2107 aSig = ( aSig | 0x00800000 )<<8; 2108 zSig = estimateSqrt32( aExp, aSig ) + 2; 2109 if ( ( zSig & 0x7F ) <= 5 ) { 2110 if ( zSig < 2 ) { 2111 zSig = 0x7FFFFFFF; 2112 goto roundAndPack; 2113 } 2114 aSig >>= aExp & 1; 2115 term = ( (bits64) zSig ) * zSig; 2116 rem = ( ( (bits64) aSig )<<32 ) - term; 2117 while ( (sbits64) rem < 0 ) { 2118 --zSig; 2119 rem += ( ( (bits64) zSig )<<1 ) | 1; 2120 } 2121 zSig |= ( rem != 0 ); 2122 } 2123 shift32RightJamming( zSig, 1, &zSig ); 2124 roundAndPack: 2125 return roundAndPackFloat32( 0, zExp, zSig ); 2126 2127 } 2128 #endif /* !SOFTFLOAT_FOR_GCC */ 2129 2130 /* 2131 ------------------------------------------------------------------------------- 2132 Returns 1 if the single-precision floating-point value `a' is equal to 2133 the corresponding value `b', and 0 otherwise. The comparison is performed 2134 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2135 ------------------------------------------------------------------------------- 2136 */ 2137 flag float32_eq( float32 a, float32 b ) 2138 { 2139 2140 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2141 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2142 ) { 2143 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2144 float_raise( float_flag_invalid ); 2145 } 2146 return 0; 2147 } 2148 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2149 2150 } 2151 2152 /* 2153 ------------------------------------------------------------------------------- 2154 Returns 1 if the single-precision floating-point value `a' is less than 2155 or equal to the corresponding value `b', and 0 otherwise. The comparison 2156 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2157 Arithmetic. 2158 ------------------------------------------------------------------------------- 2159 */ 2160 flag float32_le( float32 a, float32 b ) 2161 { 2162 flag aSign, bSign; 2163 2164 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2165 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2166 ) { 2167 float_raise( float_flag_invalid ); 2168 return 0; 2169 } 2170 aSign = extractFloat32Sign( a ); 2171 bSign = extractFloat32Sign( b ); 2172 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2173 return ( a == b ) || ( aSign ^ ( a < b ) ); 2174 2175 } 2176 2177 /* 2178 ------------------------------------------------------------------------------- 2179 Returns 1 if the single-precision floating-point value `a' is less than 2180 the corresponding value `b', and 0 otherwise. The comparison is performed 2181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2182 ------------------------------------------------------------------------------- 2183 */ 2184 flag float32_lt( float32 a, float32 b ) 2185 { 2186 flag aSign, bSign; 2187 2188 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2189 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2190 ) { 2191 float_raise( float_flag_invalid ); 2192 return 0; 2193 } 2194 aSign = extractFloat32Sign( a ); 2195 bSign = extractFloat32Sign( b ); 2196 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2197 return ( a != b ) && ( aSign ^ ( a < b ) ); 2198 2199 } 2200 2201 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2202 /* 2203 ------------------------------------------------------------------------------- 2204 Returns 1 if the single-precision floating-point value `a' is equal to 2205 the corresponding value `b', and 0 otherwise. The invalid exception is 2206 raised if either operand is a NaN. Otherwise, the comparison is performed 2207 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2208 ------------------------------------------------------------------------------- 2209 */ 2210 flag float32_eq_signaling( float32 a, float32 b ) 2211 { 2212 2213 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2214 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2215 ) { 2216 float_raise( float_flag_invalid ); 2217 return 0; 2218 } 2219 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2220 2221 } 2222 2223 /* 2224 ------------------------------------------------------------------------------- 2225 Returns 1 if the single-precision floating-point value `a' is less than or 2226 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2227 cause an exception. Otherwise, the comparison is performed according to the 2228 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2229 ------------------------------------------------------------------------------- 2230 */ 2231 flag float32_le_quiet( float32 a, float32 b ) 2232 { 2233 flag aSign, bSign; 2234 2235 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2236 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2237 ) { 2238 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2239 float_raise( float_flag_invalid ); 2240 } 2241 return 0; 2242 } 2243 aSign = extractFloat32Sign( a ); 2244 bSign = extractFloat32Sign( b ); 2245 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2246 return ( a == b ) || ( aSign ^ ( a < b ) ); 2247 2248 } 2249 2250 /* 2251 ------------------------------------------------------------------------------- 2252 Returns 1 if the single-precision floating-point value `a' is less than 2253 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2254 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2255 Standard for Binary Floating-Point Arithmetic. 2256 ------------------------------------------------------------------------------- 2257 */ 2258 flag float32_lt_quiet( float32 a, float32 b ) 2259 { 2260 flag aSign, bSign; 2261 2262 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2263 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2264 ) { 2265 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2266 float_raise( float_flag_invalid ); 2267 } 2268 return 0; 2269 } 2270 aSign = extractFloat32Sign( a ); 2271 bSign = extractFloat32Sign( b ); 2272 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2273 return ( a != b ) && ( aSign ^ ( a < b ) ); 2274 2275 } 2276 #endif /* !SOFTFLOAT_FOR_GCC */ 2277 2278 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2279 /* 2280 ------------------------------------------------------------------------------- 2281 Returns the result of converting the double-precision floating-point value 2282 `a' to the 32-bit two's complement integer format. The conversion is 2283 performed according to the IEC/IEEE Standard for Binary Floating-Point 2284 Arithmetic---which means in particular that the conversion is rounded 2285 according to the current rounding mode. If `a' is a NaN, the largest 2286 positive integer is returned. Otherwise, if the conversion overflows, the 2287 largest integer with the same sign as `a' is returned. 2288 ------------------------------------------------------------------------------- 2289 */ 2290 int32 float64_to_int32( float64 a ) 2291 { 2292 flag aSign; 2293 int16 aExp, shiftCount; 2294 bits64 aSig; 2295 2296 aSig = extractFloat64Frac( a ); 2297 aExp = extractFloat64Exp( a ); 2298 aSign = extractFloat64Sign( a ); 2299 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2300 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2301 shiftCount = 0x42C - aExp; 2302 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2303 return roundAndPackInt32( aSign, aSig ); 2304 2305 } 2306 #endif /* !SOFTFLOAT_FOR_GCC */ 2307 2308 /* 2309 ------------------------------------------------------------------------------- 2310 Returns the result of converting the double-precision floating-point value 2311 `a' to the 32-bit two's complement integer format. The conversion is 2312 performed according to the IEC/IEEE Standard for Binary Floating-Point 2313 Arithmetic, except that the conversion is always rounded toward zero. 2314 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2315 the conversion overflows, the largest integer with the same sign as `a' is 2316 returned. 2317 ------------------------------------------------------------------------------- 2318 */ 2319 int32 float64_to_int32_round_to_zero( float64 a ) 2320 { 2321 flag aSign; 2322 int16 aExp, shiftCount; 2323 bits64 aSig, savedASig; 2324 int32 z; 2325 2326 aSig = extractFloat64Frac( a ); 2327 aExp = extractFloat64Exp( a ); 2328 aSign = extractFloat64Sign( a ); 2329 if ( 0x41E < aExp ) { 2330 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2331 goto invalid; 2332 } 2333 else if ( aExp < 0x3FF ) { 2334 if ( aExp || aSig ) float_set_inexact(); 2335 return 0; 2336 } 2337 aSig |= LIT64( 0x0010000000000000 ); 2338 shiftCount = 0x433 - aExp; 2339 savedASig = aSig; 2340 aSig >>= shiftCount; 2341 z = aSig; 2342 if ( aSign ) z = - z; 2343 if ( ( z < 0 ) ^ aSign ) { 2344 invalid: 2345 float_raise( float_flag_invalid ); 2346 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2347 } 2348 if ( ( aSig<<shiftCount ) != savedASig ) { 2349 float_set_inexact(); 2350 } 2351 return z; 2352 2353 } 2354 2355 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2356 /* 2357 ------------------------------------------------------------------------------- 2358 Returns the result of converting the double-precision floating-point value 2359 `a' to the 64-bit two's complement integer format. The conversion is 2360 performed according to the IEC/IEEE Standard for Binary Floating-Point 2361 Arithmetic---which means in particular that the conversion is rounded 2362 according to the current rounding mode. If `a' is a NaN, the largest 2363 positive integer is returned. Otherwise, if the conversion overflows, the 2364 largest integer with the same sign as `a' is returned. 2365 ------------------------------------------------------------------------------- 2366 */ 2367 int64 float64_to_int64( float64 a ) 2368 { 2369 flag aSign; 2370 int16 aExp, shiftCount; 2371 bits64 aSig, aSigExtra; 2372 2373 aSig = extractFloat64Frac( a ); 2374 aExp = extractFloat64Exp( a ); 2375 aSign = extractFloat64Sign( a ); 2376 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2377 shiftCount = 0x433 - aExp; 2378 if ( shiftCount <= 0 ) { 2379 if ( 0x43E < aExp ) { 2380 float_raise( float_flag_invalid ); 2381 if ( ! aSign 2382 || ( ( aExp == 0x7FF ) 2383 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2384 ) { 2385 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2386 } 2387 return (sbits64) LIT64( 0x8000000000000000 ); 2388 } 2389 aSigExtra = 0; 2390 aSig <<= - shiftCount; 2391 } 2392 else { 2393 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2394 } 2395 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2396 2397 } 2398 2399 /* 2400 ------------------------------------------------------------------------------- 2401 Returns the result of converting the double-precision floating-point value 2402 `a' to the 64-bit two's complement integer format. The conversion is 2403 performed according to the IEC/IEEE Standard for Binary Floating-Point 2404 Arithmetic, except that the conversion is always rounded toward zero. 2405 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2406 the conversion overflows, the largest integer with the same sign as `a' is 2407 returned. 2408 ------------------------------------------------------------------------------- 2409 */ 2410 int64 float64_to_int64_round_to_zero( float64 a ) 2411 { 2412 flag aSign; 2413 int16 aExp, shiftCount; 2414 bits64 aSig; 2415 int64 z; 2416 2417 aSig = extractFloat64Frac( a ); 2418 aExp = extractFloat64Exp( a ); 2419 aSign = extractFloat64Sign( a ); 2420 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2421 shiftCount = aExp - 0x433; 2422 if ( 0 <= shiftCount ) { 2423 if ( 0x43E <= aExp ) { 2424 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2425 float_raise( float_flag_invalid ); 2426 if ( ! aSign 2427 || ( ( aExp == 0x7FF ) 2428 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2429 ) { 2430 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2431 } 2432 } 2433 return (sbits64) LIT64( 0x8000000000000000 ); 2434 } 2435 z = aSig<<shiftCount; 2436 } 2437 else { 2438 if ( aExp < 0x3FE ) { 2439 if ( aExp | aSig ) float_set_inexact(); 2440 return 0; 2441 } 2442 z = aSig>>( - shiftCount ); 2443 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2444 float_set_inexact(); 2445 } 2446 } 2447 if ( aSign ) z = - z; 2448 return z; 2449 2450 } 2451 #endif /* !SOFTFLOAT_FOR_GCC */ 2452 2453 /* 2454 ------------------------------------------------------------------------------- 2455 Returns the result of converting the double-precision floating-point value 2456 `a' to the single-precision floating-point format. The conversion is 2457 performed according to the IEC/IEEE Standard for Binary Floating-Point 2458 Arithmetic. 2459 ------------------------------------------------------------------------------- 2460 */ 2461 float32 float64_to_float32( float64 a ) 2462 { 2463 flag aSign; 2464 int16 aExp; 2465 bits64 aSig; 2466 bits32 zSig; 2467 2468 aSig = extractFloat64Frac( a ); 2469 aExp = extractFloat64Exp( a ); 2470 aSign = extractFloat64Sign( a ); 2471 if ( aExp == 0x7FF ) { 2472 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2473 return packFloat32( aSign, 0xFF, 0 ); 2474 } 2475 shift64RightJamming( aSig, 22, &aSig ); 2476 zSig = aSig; 2477 if ( aExp || zSig ) { 2478 zSig |= 0x40000000; 2479 aExp -= 0x381; 2480 } 2481 return roundAndPackFloat32( aSign, aExp, zSig ); 2482 2483 } 2484 2485 #ifdef FLOATX80 2486 2487 /* 2488 ------------------------------------------------------------------------------- 2489 Returns the result of converting the double-precision floating-point value 2490 `a' to the extended double-precision floating-point format. The conversion 2491 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2492 Arithmetic. 2493 ------------------------------------------------------------------------------- 2494 */ 2495 floatx80 float64_to_floatx80( float64 a ) 2496 { 2497 flag aSign; 2498 int16 aExp; 2499 bits64 aSig; 2500 2501 aSig = extractFloat64Frac( a ); 2502 aExp = extractFloat64Exp( a ); 2503 aSign = extractFloat64Sign( a ); 2504 if ( aExp == 0x7FF ) { 2505 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2506 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2507 } 2508 if ( aExp == 0 ) { 2509 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2510 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2511 } 2512 return 2513 packFloatx80( 2514 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2515 2516 } 2517 2518 #endif 2519 2520 #ifdef FLOAT128 2521 2522 /* 2523 ------------------------------------------------------------------------------- 2524 Returns the result of converting the double-precision floating-point value 2525 `a' to the quadruple-precision floating-point format. The conversion is 2526 performed according to the IEC/IEEE Standard for Binary Floating-Point 2527 Arithmetic. 2528 ------------------------------------------------------------------------------- 2529 */ 2530 float128 float64_to_float128( float64 a ) 2531 { 2532 flag aSign; 2533 int16 aExp; 2534 bits64 aSig, zSig0, zSig1; 2535 2536 aSig = extractFloat64Frac( a ); 2537 aExp = extractFloat64Exp( a ); 2538 aSign = extractFloat64Sign( a ); 2539 if ( aExp == 0x7FF ) { 2540 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2541 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2542 } 2543 if ( aExp == 0 ) { 2544 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2545 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2546 --aExp; 2547 } 2548 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2549 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2550 2551 } 2552 2553 #endif 2554 2555 #ifndef SOFTFLOAT_FOR_GCC 2556 /* 2557 ------------------------------------------------------------------------------- 2558 Rounds the double-precision floating-point value `a' to an integer, and 2559 returns the result as a double-precision floating-point value. The 2560 operation is performed according to the IEC/IEEE Standard for Binary 2561 Floating-Point Arithmetic. 2562 ------------------------------------------------------------------------------- 2563 */ 2564 float64 float64_round_to_int( float64 a ) 2565 { 2566 flag aSign; 2567 int16 aExp; 2568 bits64 lastBitMask, roundBitsMask; 2569 int8 roundingMode; 2570 float64 z; 2571 2572 aExp = extractFloat64Exp( a ); 2573 if ( 0x433 <= aExp ) { 2574 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2575 return propagateFloat64NaN( a, a ); 2576 } 2577 return a; 2578 } 2579 if ( aExp < 0x3FF ) { 2580 if ( (bits64) ( a<<1 ) == 0 ) return a; 2581 float_set_inexact(); 2582 aSign = extractFloat64Sign( a ); 2583 switch ( float_rounding_mode() ) { 2584 case float_round_nearest_even: 2585 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2586 return packFloat64( aSign, 0x3FF, 0 ); 2587 } 2588 break; 2589 case float_round_down: 2590 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2591 case float_round_up: 2592 return 2593 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2594 } 2595 return packFloat64( aSign, 0, 0 ); 2596 } 2597 lastBitMask = 1; 2598 lastBitMask <<= 0x433 - aExp; 2599 roundBitsMask = lastBitMask - 1; 2600 z = a; 2601 roundingMode = float_rounding_mode(); 2602 if ( roundingMode == float_round_nearest_even ) { 2603 z += lastBitMask>>1; 2604 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2605 } 2606 else if ( roundingMode != float_round_to_zero ) { 2607 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2608 z += roundBitsMask; 2609 } 2610 } 2611 z &= ~ roundBitsMask; 2612 if ( z != a ) float_set_inexact(); 2613 return z; 2614 2615 } 2616 #endif 2617 2618 /* 2619 ------------------------------------------------------------------------------- 2620 Returns the result of adding the absolute values of the double-precision 2621 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2622 before being returned. `zSign' is ignored if the result is a NaN. 2623 The addition is performed according to the IEC/IEEE Standard for Binary 2624 Floating-Point Arithmetic. 2625 ------------------------------------------------------------------------------- 2626 */ 2627 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2628 { 2629 int16 aExp, bExp, zExp; 2630 bits64 aSig, bSig, zSig; 2631 int16 expDiff; 2632 2633 aSig = extractFloat64Frac( a ); 2634 aExp = extractFloat64Exp( a ); 2635 bSig = extractFloat64Frac( b ); 2636 bExp = extractFloat64Exp( b ); 2637 expDiff = aExp - bExp; 2638 aSig <<= 9; 2639 bSig <<= 9; 2640 if ( 0 < expDiff ) { 2641 if ( aExp == 0x7FF ) { 2642 if ( aSig ) return propagateFloat64NaN( a, b ); 2643 return a; 2644 } 2645 if ( bExp == 0 ) { 2646 --expDiff; 2647 } 2648 else { 2649 bSig |= LIT64( 0x2000000000000000 ); 2650 } 2651 shift64RightJamming( bSig, expDiff, &bSig ); 2652 zExp = aExp; 2653 } 2654 else if ( expDiff < 0 ) { 2655 if ( bExp == 0x7FF ) { 2656 if ( bSig ) return propagateFloat64NaN( a, b ); 2657 return packFloat64( zSign, 0x7FF, 0 ); 2658 } 2659 if ( aExp == 0 ) { 2660 ++expDiff; 2661 } 2662 else { 2663 aSig |= LIT64( 0x2000000000000000 ); 2664 } 2665 shift64RightJamming( aSig, - expDiff, &aSig ); 2666 zExp = bExp; 2667 } 2668 else { 2669 if ( aExp == 0x7FF ) { 2670 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2671 return a; 2672 } 2673 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2674 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2675 zExp = aExp; 2676 goto roundAndPack; 2677 } 2678 aSig |= LIT64( 0x2000000000000000 ); 2679 zSig = ( aSig + bSig )<<1; 2680 --zExp; 2681 if ( (sbits64) zSig < 0 ) { 2682 zSig = aSig + bSig; 2683 ++zExp; 2684 } 2685 roundAndPack: 2686 return roundAndPackFloat64( zSign, zExp, zSig ); 2687 2688 } 2689 2690 /* 2691 ------------------------------------------------------------------------------- 2692 Returns the result of subtracting the absolute values of the double- 2693 precision floating-point values `a' and `b'. If `zSign' is 1, the 2694 difference is negated before being returned. `zSign' is ignored if the 2695 result is a NaN. The subtraction is performed according to the IEC/IEEE 2696 Standard for Binary Floating-Point Arithmetic. 2697 ------------------------------------------------------------------------------- 2698 */ 2699 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2700 { 2701 int16 aExp, bExp, zExp; 2702 bits64 aSig, bSig, zSig; 2703 int16 expDiff; 2704 2705 aSig = extractFloat64Frac( a ); 2706 aExp = extractFloat64Exp( a ); 2707 bSig = extractFloat64Frac( b ); 2708 bExp = extractFloat64Exp( b ); 2709 expDiff = aExp - bExp; 2710 aSig <<= 10; 2711 bSig <<= 10; 2712 if ( 0 < expDiff ) goto aExpBigger; 2713 if ( expDiff < 0 ) goto bExpBigger; 2714 if ( aExp == 0x7FF ) { 2715 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2716 float_raise( float_flag_invalid ); 2717 return float64_default_nan; 2718 } 2719 if ( aExp == 0 ) { 2720 aExp = 1; 2721 bExp = 1; 2722 } 2723 if ( bSig < aSig ) goto aBigger; 2724 if ( aSig < bSig ) goto bBigger; 2725 return packFloat64( float_rounding_mode() == float_round_down, 0, 0 ); 2726 bExpBigger: 2727 if ( bExp == 0x7FF ) { 2728 if ( bSig ) return propagateFloat64NaN( a, b ); 2729 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2730 } 2731 if ( aExp == 0 ) { 2732 ++expDiff; 2733 } 2734 else { 2735 aSig |= LIT64( 0x4000000000000000 ); 2736 } 2737 shift64RightJamming( aSig, - expDiff, &aSig ); 2738 bSig |= LIT64( 0x4000000000000000 ); 2739 bBigger: 2740 zSig = bSig - aSig; 2741 zExp = bExp; 2742 zSign ^= 1; 2743 goto normalizeRoundAndPack; 2744 aExpBigger: 2745 if ( aExp == 0x7FF ) { 2746 if ( aSig ) return propagateFloat64NaN( a, b ); 2747 return a; 2748 } 2749 if ( bExp == 0 ) { 2750 --expDiff; 2751 } 2752 else { 2753 bSig |= LIT64( 0x4000000000000000 ); 2754 } 2755 shift64RightJamming( bSig, expDiff, &bSig ); 2756 aSig |= LIT64( 0x4000000000000000 ); 2757 aBigger: 2758 zSig = aSig - bSig; 2759 zExp = aExp; 2760 normalizeRoundAndPack: 2761 --zExp; 2762 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2763 2764 } 2765 2766 /* 2767 ------------------------------------------------------------------------------- 2768 Returns the result of adding the double-precision floating-point values `a' 2769 and `b'. The operation is performed according to the IEC/IEEE Standard for 2770 Binary Floating-Point Arithmetic. 2771 ------------------------------------------------------------------------------- 2772 */ 2773 float64 float64_add( float64 a, float64 b ) 2774 { 2775 flag aSign, bSign; 2776 2777 aSign = extractFloat64Sign( a ); 2778 bSign = extractFloat64Sign( b ); 2779 if ( aSign == bSign ) { 2780 return addFloat64Sigs( a, b, aSign ); 2781 } 2782 else { 2783 return subFloat64Sigs( a, b, aSign ); 2784 } 2785 2786 } 2787 2788 /* 2789 ------------------------------------------------------------------------------- 2790 Returns the result of subtracting the double-precision floating-point values 2791 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2792 for Binary Floating-Point Arithmetic. 2793 ------------------------------------------------------------------------------- 2794 */ 2795 float64 float64_sub( float64 a, float64 b ) 2796 { 2797 flag aSign, bSign; 2798 2799 aSign = extractFloat64Sign( a ); 2800 bSign = extractFloat64Sign( b ); 2801 if ( aSign == bSign ) { 2802 return subFloat64Sigs( a, b, aSign ); 2803 } 2804 else { 2805 return addFloat64Sigs( a, b, aSign ); 2806 } 2807 2808 } 2809 2810 /* 2811 ------------------------------------------------------------------------------- 2812 Returns the result of multiplying the double-precision floating-point values 2813 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2814 for Binary Floating-Point Arithmetic. 2815 ------------------------------------------------------------------------------- 2816 */ 2817 float64 float64_mul( float64 a, float64 b ) 2818 { 2819 flag aSign, bSign, zSign; 2820 int16 aExp, bExp, zExp; 2821 bits64 aSig, bSig, zSig0, zSig1; 2822 2823 aSig = extractFloat64Frac( a ); 2824 aExp = extractFloat64Exp( a ); 2825 aSign = extractFloat64Sign( a ); 2826 bSig = extractFloat64Frac( b ); 2827 bExp = extractFloat64Exp( b ); 2828 bSign = extractFloat64Sign( b ); 2829 zSign = aSign ^ bSign; 2830 if ( aExp == 0x7FF ) { 2831 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2832 return propagateFloat64NaN( a, b ); 2833 } 2834 if ( ( bExp | bSig ) == 0 ) { 2835 float_raise( float_flag_invalid ); 2836 return float64_default_nan; 2837 } 2838 return packFloat64( zSign, 0x7FF, 0 ); 2839 } 2840 if ( bExp == 0x7FF ) { 2841 if ( bSig ) return propagateFloat64NaN( a, b ); 2842 if ( ( aExp | aSig ) == 0 ) { 2843 float_raise( float_flag_invalid ); 2844 return float64_default_nan; 2845 } 2846 return packFloat64( zSign, 0x7FF, 0 ); 2847 } 2848 if ( aExp == 0 ) { 2849 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2850 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2851 } 2852 if ( bExp == 0 ) { 2853 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2854 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2855 } 2856 zExp = aExp + bExp - 0x3FF; 2857 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2858 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2859 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2860 zSig0 |= ( zSig1 != 0 ); 2861 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2862 zSig0 <<= 1; 2863 --zExp; 2864 } 2865 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2866 2867 } 2868 2869 /* 2870 ------------------------------------------------------------------------------- 2871 Returns the result of dividing the double-precision floating-point value `a' 2872 by the corresponding value `b'. The operation is performed according to 2873 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2874 ------------------------------------------------------------------------------- 2875 */ 2876 float64 float64_div( float64 a, float64 b ) 2877 { 2878 flag aSign, bSign, zSign; 2879 int16 aExp, bExp, zExp; 2880 bits64 aSig, bSig, zSig; 2881 bits64 rem0, rem1; 2882 bits64 term0, term1; 2883 2884 aSig = extractFloat64Frac( a ); 2885 aExp = extractFloat64Exp( a ); 2886 aSign = extractFloat64Sign( a ); 2887 bSig = extractFloat64Frac( b ); 2888 bExp = extractFloat64Exp( b ); 2889 bSign = extractFloat64Sign( b ); 2890 zSign = aSign ^ bSign; 2891 if ( aExp == 0x7FF ) { 2892 if ( aSig ) return propagateFloat64NaN( a, b ); 2893 if ( bExp == 0x7FF ) { 2894 if ( bSig ) return propagateFloat64NaN( a, b ); 2895 float_raise( float_flag_invalid ); 2896 return float64_default_nan; 2897 } 2898 return packFloat64( zSign, 0x7FF, 0 ); 2899 } 2900 if ( bExp == 0x7FF ) { 2901 if ( bSig ) return propagateFloat64NaN( a, b ); 2902 return packFloat64( zSign, 0, 0 ); 2903 } 2904 if ( bExp == 0 ) { 2905 if ( bSig == 0 ) { 2906 if ( ( aExp | aSig ) == 0 ) { 2907 float_raise( float_flag_invalid ); 2908 return float64_default_nan; 2909 } 2910 float_raise( float_flag_divbyzero ); 2911 return packFloat64( zSign, 0x7FF, 0 ); 2912 } 2913 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2914 } 2915 if ( aExp == 0 ) { 2916 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2917 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2918 } 2919 zExp = aExp - bExp + 0x3FD; 2920 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2921 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2922 if ( bSig <= ( aSig + aSig ) ) { 2923 aSig >>= 1; 2924 ++zExp; 2925 } 2926 zSig = estimateDiv128To64( aSig, 0, bSig ); 2927 if ( ( zSig & 0x1FF ) <= 2 ) { 2928 mul64To128( bSig, zSig, &term0, &term1 ); 2929 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2930 while ( (sbits64) rem0 < 0 ) { 2931 --zSig; 2932 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2933 } 2934 zSig |= ( rem1 != 0 ); 2935 } 2936 return roundAndPackFloat64( zSign, zExp, zSig ); 2937 2938 } 2939 2940 #ifndef SOFTFLOAT_FOR_GCC 2941 /* 2942 ------------------------------------------------------------------------------- 2943 Returns the remainder of the double-precision floating-point value `a' 2944 with respect to the corresponding value `b'. The operation is performed 2945 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2946 ------------------------------------------------------------------------------- 2947 */ 2948 float64 float64_rem( float64 a, float64 b ) 2949 { 2950 flag aSign, bSign, zSign; 2951 int16 aExp, bExp, expDiff; 2952 bits64 aSig, bSig; 2953 bits64 q, alternateASig; 2954 sbits64 sigMean; 2955 2956 aSig = extractFloat64Frac( a ); 2957 aExp = extractFloat64Exp( a ); 2958 aSign = extractFloat64Sign( a ); 2959 bSig = extractFloat64Frac( b ); 2960 bExp = extractFloat64Exp( b ); 2961 bSign = extractFloat64Sign( b ); 2962 if ( aExp == 0x7FF ) { 2963 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2964 return propagateFloat64NaN( a, b ); 2965 } 2966 float_raise( float_flag_invalid ); 2967 return float64_default_nan; 2968 } 2969 if ( bExp == 0x7FF ) { 2970 if ( bSig ) return propagateFloat64NaN( a, b ); 2971 return a; 2972 } 2973 if ( bExp == 0 ) { 2974 if ( bSig == 0 ) { 2975 float_raise( float_flag_invalid ); 2976 return float64_default_nan; 2977 } 2978 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2979 } 2980 if ( aExp == 0 ) { 2981 if ( aSig == 0 ) return a; 2982 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2983 } 2984 expDiff = aExp - bExp; 2985 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 2986 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2987 if ( expDiff < 0 ) { 2988 if ( expDiff < -1 ) return a; 2989 aSig >>= 1; 2990 } 2991 q = ( bSig <= aSig ); 2992 if ( q ) aSig -= bSig; 2993 expDiff -= 64; 2994 while ( 0 < expDiff ) { 2995 q = estimateDiv128To64( aSig, 0, bSig ); 2996 q = ( 2 < q ) ? q - 2 : 0; 2997 aSig = - ( ( bSig>>2 ) * q ); 2998 expDiff -= 62; 2999 } 3000 expDiff += 64; 3001 if ( 0 < expDiff ) { 3002 q = estimateDiv128To64( aSig, 0, bSig ); 3003 q = ( 2 < q ) ? q - 2 : 0; 3004 q >>= 64 - expDiff; 3005 bSig >>= 2; 3006 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3007 } 3008 else { 3009 aSig >>= 2; 3010 bSig >>= 2; 3011 } 3012 do { 3013 alternateASig = aSig; 3014 ++q; 3015 aSig -= bSig; 3016 } while ( 0 <= (sbits64) aSig ); 3017 sigMean = aSig + alternateASig; 3018 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3019 aSig = alternateASig; 3020 } 3021 zSign = ( (sbits64) aSig < 0 ); 3022 if ( zSign ) aSig = - aSig; 3023 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3024 3025 } 3026 3027 /* 3028 ------------------------------------------------------------------------------- 3029 Returns the square root of the double-precision floating-point value `a'. 3030 The operation is performed according to the IEC/IEEE Standard for Binary 3031 Floating-Point Arithmetic. 3032 ------------------------------------------------------------------------------- 3033 */ 3034 float64 float64_sqrt( float64 a ) 3035 { 3036 flag aSign; 3037 int16 aExp, zExp; 3038 bits64 aSig, zSig, doubleZSig; 3039 bits64 rem0, rem1, term0, term1; 3040 3041 aSig = extractFloat64Frac( a ); 3042 aExp = extractFloat64Exp( a ); 3043 aSign = extractFloat64Sign( a ); 3044 if ( aExp == 0x7FF ) { 3045 if ( aSig ) return propagateFloat64NaN( a, a ); 3046 if ( ! aSign ) return a; 3047 float_raise( float_flag_invalid ); 3048 return float64_default_nan; 3049 } 3050 if ( aSign ) { 3051 if ( ( aExp | aSig ) == 0 ) return a; 3052 float_raise( float_flag_invalid ); 3053 return float64_default_nan; 3054 } 3055 if ( aExp == 0 ) { 3056 if ( aSig == 0 ) return 0; 3057 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3058 } 3059 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3060 aSig |= LIT64( 0x0010000000000000 ); 3061 zSig = estimateSqrt32( aExp, aSig>>21 ); 3062 aSig <<= 9 - ( aExp & 1 ); 3063 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3064 if ( ( zSig & 0x1FF ) <= 5 ) { 3065 doubleZSig = zSig<<1; 3066 mul64To128( zSig, zSig, &term0, &term1 ); 3067 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3068 while ( (sbits64) rem0 < 0 ) { 3069 --zSig; 3070 doubleZSig -= 2; 3071 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3072 } 3073 zSig |= ( ( rem0 | rem1 ) != 0 ); 3074 } 3075 return roundAndPackFloat64( 0, zExp, zSig ); 3076 3077 } 3078 #endif 3079 3080 /* 3081 ------------------------------------------------------------------------------- 3082 Returns 1 if the double-precision floating-point value `a' is equal to the 3083 corresponding value `b', and 0 otherwise. The comparison is performed 3084 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3085 ------------------------------------------------------------------------------- 3086 */ 3087 flag float64_eq( float64 a, float64 b ) 3088 { 3089 3090 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3091 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3092 ) { 3093 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3094 float_raise( float_flag_invalid ); 3095 } 3096 return 0; 3097 } 3098 return ( a == b ) || 3099 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3100 3101 } 3102 3103 /* 3104 ------------------------------------------------------------------------------- 3105 Returns 1 if the double-precision floating-point value `a' is less than or 3106 equal to the corresponding value `b', and 0 otherwise. The comparison is 3107 performed according to the IEC/IEEE Standard for Binary Floating-Point 3108 Arithmetic. 3109 ------------------------------------------------------------------------------- 3110 */ 3111 flag float64_le( float64 a, float64 b ) 3112 { 3113 flag aSign, bSign; 3114 3115 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3116 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3117 ) { 3118 float_raise( float_flag_invalid ); 3119 return 0; 3120 } 3121 aSign = extractFloat64Sign( a ); 3122 bSign = extractFloat64Sign( b ); 3123 if ( aSign != bSign ) 3124 return aSign || 3125 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3126 0 ); 3127 return ( a == b ) || 3128 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3129 3130 } 3131 3132 /* 3133 ------------------------------------------------------------------------------- 3134 Returns 1 if the double-precision floating-point value `a' is less than 3135 the corresponding value `b', and 0 otherwise. The comparison is performed 3136 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3137 ------------------------------------------------------------------------------- 3138 */ 3139 flag float64_lt( float64 a, float64 b ) 3140 { 3141 flag aSign, bSign; 3142 3143 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3144 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3145 ) { 3146 float_raise( float_flag_invalid ); 3147 return 0; 3148 } 3149 aSign = extractFloat64Sign( a ); 3150 bSign = extractFloat64Sign( b ); 3151 if ( aSign != bSign ) 3152 return aSign && 3153 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3154 0 ); 3155 return ( a != b ) && 3156 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3157 3158 } 3159 3160 #ifndef SOFTFLOAT_FOR_GCC 3161 /* 3162 ------------------------------------------------------------------------------- 3163 Returns 1 if the double-precision floating-point value `a' is equal to the 3164 corresponding value `b', and 0 otherwise. The invalid exception is raised 3165 if either operand is a NaN. Otherwise, the comparison is performed 3166 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3167 ------------------------------------------------------------------------------- 3168 */ 3169 flag float64_eq_signaling( float64 a, float64 b ) 3170 { 3171 3172 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3173 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3174 ) { 3175 float_raise( float_flag_invalid ); 3176 return 0; 3177 } 3178 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3179 3180 } 3181 3182 /* 3183 ------------------------------------------------------------------------------- 3184 Returns 1 if the double-precision floating-point value `a' is less than or 3185 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3186 cause an exception. Otherwise, the comparison is performed according to the 3187 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3188 ------------------------------------------------------------------------------- 3189 */ 3190 flag float64_le_quiet( float64 a, float64 b ) 3191 { 3192 flag aSign, bSign; 3193 3194 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3195 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3196 ) { 3197 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3198 float_raise( float_flag_invalid ); 3199 } 3200 return 0; 3201 } 3202 aSign = extractFloat64Sign( a ); 3203 bSign = extractFloat64Sign( b ); 3204 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3205 return ( a == b ) || ( aSign ^ ( a < b ) ); 3206 3207 } 3208 3209 /* 3210 ------------------------------------------------------------------------------- 3211 Returns 1 if the double-precision floating-point value `a' is less than 3212 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3213 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3214 Standard for Binary Floating-Point Arithmetic. 3215 ------------------------------------------------------------------------------- 3216 */ 3217 flag float64_lt_quiet( float64 a, float64 b ) 3218 { 3219 flag aSign, bSign; 3220 3221 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3222 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3223 ) { 3224 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3225 float_raise( float_flag_invalid ); 3226 } 3227 return 0; 3228 } 3229 aSign = extractFloat64Sign( a ); 3230 bSign = extractFloat64Sign( b ); 3231 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3232 return ( a != b ) && ( aSign ^ ( a < b ) ); 3233 3234 } 3235 #endif 3236 3237 #ifdef FLOATX80 3238 3239 /* 3240 ------------------------------------------------------------------------------- 3241 Returns the result of converting the extended double-precision floating- 3242 point value `a' to the 32-bit two's complement integer format. The 3243 conversion is performed according to the IEC/IEEE Standard for Binary 3244 Floating-Point Arithmetic---which means in particular that the conversion 3245 is rounded according to the current rounding mode. If `a' is a NaN, the 3246 largest positive integer is returned. Otherwise, if the conversion 3247 overflows, the largest integer with the same sign as `a' is returned. 3248 ------------------------------------------------------------------------------- 3249 */ 3250 int32 floatx80_to_int32( floatx80 a ) 3251 { 3252 flag aSign; 3253 int32 aExp, shiftCount; 3254 bits64 aSig; 3255 3256 aSig = extractFloatx80Frac( a ); 3257 aExp = extractFloatx80Exp( a ); 3258 aSign = extractFloatx80Sign( a ); 3259 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3260 shiftCount = 0x4037 - aExp; 3261 if ( shiftCount <= 0 ) shiftCount = 1; 3262 shift64RightJamming( aSig, shiftCount, &aSig ); 3263 return roundAndPackInt32( aSign, aSig ); 3264 3265 } 3266 3267 /* 3268 ------------------------------------------------------------------------------- 3269 Returns the result of converting the extended double-precision floating- 3270 point value `a' to the 32-bit two's complement integer format. The 3271 conversion is performed according to the IEC/IEEE Standard for Binary 3272 Floating-Point Arithmetic, except that the conversion is always rounded 3273 toward zero. If `a' is a NaN, the largest positive integer is returned. 3274 Otherwise, if the conversion overflows, the largest integer with the same 3275 sign as `a' is returned. 3276 ------------------------------------------------------------------------------- 3277 */ 3278 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3279 { 3280 flag aSign; 3281 int32 aExp, shiftCount; 3282 bits64 aSig, savedASig; 3283 int32 z; 3284 3285 aSig = extractFloatx80Frac( a ); 3286 aExp = extractFloatx80Exp( a ); 3287 aSign = extractFloatx80Sign( a ); 3288 if ( 0x401E < aExp ) { 3289 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3290 goto invalid; 3291 } 3292 else if ( aExp < 0x3FFF ) { 3293 if ( aExp || aSig ) float_set_inexact(); 3294 return 0; 3295 } 3296 shiftCount = 0x403E - aExp; 3297 savedASig = aSig; 3298 aSig >>= shiftCount; 3299 z = aSig; 3300 if ( aSign ) z = - z; 3301 if ( ( z < 0 ) ^ aSign ) { 3302 invalid: 3303 float_raise( float_flag_invalid ); 3304 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3305 } 3306 if ( ( aSig<<shiftCount ) != savedASig ) { 3307 float_set_inexact(); 3308 } 3309 return z; 3310 3311 } 3312 3313 /* 3314 ------------------------------------------------------------------------------- 3315 Returns the result of converting the extended double-precision floating- 3316 point value `a' to the 64-bit two's complement integer format. The 3317 conversion is performed according to the IEC/IEEE Standard for Binary 3318 Floating-Point Arithmetic---which means in particular that the conversion 3319 is rounded according to the current rounding mode. If `a' is a NaN, 3320 the largest positive integer is returned. Otherwise, if the conversion 3321 overflows, the largest integer with the same sign as `a' is returned. 3322 ------------------------------------------------------------------------------- 3323 */ 3324 int64 floatx80_to_int64( floatx80 a ) 3325 { 3326 flag aSign; 3327 int32 aExp, shiftCount; 3328 bits64 aSig, aSigExtra; 3329 3330 aSig = extractFloatx80Frac( a ); 3331 aExp = extractFloatx80Exp( a ); 3332 aSign = extractFloatx80Sign( a ); 3333 shiftCount = 0x403E - aExp; 3334 if ( shiftCount <= 0 ) { 3335 if ( shiftCount ) { 3336 float_raise( float_flag_invalid ); 3337 if ( ! aSign 3338 || ( ( aExp == 0x7FFF ) 3339 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3340 ) { 3341 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3342 } 3343 return (sbits64) LIT64( 0x8000000000000000 ); 3344 } 3345 aSigExtra = 0; 3346 } 3347 else { 3348 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3349 } 3350 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3351 3352 } 3353 3354 /* 3355 ------------------------------------------------------------------------------- 3356 Returns the result of converting the extended double-precision floating- 3357 point value `a' to the 64-bit two's complement integer format. The 3358 conversion is performed according to the IEC/IEEE Standard for Binary 3359 Floating-Point Arithmetic, except that the conversion is always rounded 3360 toward zero. If `a' is a NaN, the largest positive integer is returned. 3361 Otherwise, if the conversion overflows, the largest integer with the same 3362 sign as `a' is returned. 3363 ------------------------------------------------------------------------------- 3364 */ 3365 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3366 { 3367 flag aSign; 3368 int32 aExp, shiftCount; 3369 bits64 aSig; 3370 int64 z; 3371 3372 aSig = extractFloatx80Frac( a ); 3373 aExp = extractFloatx80Exp( a ); 3374 aSign = extractFloatx80Sign( a ); 3375 shiftCount = aExp - 0x403E; 3376 if ( 0 <= shiftCount ) { 3377 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3378 if ( ( a.high != 0xC03E ) || aSig ) { 3379 float_raise( float_flag_invalid ); 3380 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3381 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3382 } 3383 } 3384 return (sbits64) LIT64( 0x8000000000000000 ); 3385 } 3386 else if ( aExp < 0x3FFF ) { 3387 if ( aExp | aSig ) float_set_inexact(); 3388 return 0; 3389 } 3390 z = aSig>>( - shiftCount ); 3391 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3392 float_set_inexact(); 3393 } 3394 if ( aSign ) z = - z; 3395 return z; 3396 3397 } 3398 3399 /* 3400 ------------------------------------------------------------------------------- 3401 Returns the result of converting the extended double-precision floating- 3402 point value `a' to the single-precision floating-point format. The 3403 conversion is performed according to the IEC/IEEE Standard for Binary 3404 Floating-Point Arithmetic. 3405 ------------------------------------------------------------------------------- 3406 */ 3407 float32 floatx80_to_float32( floatx80 a ) 3408 { 3409 flag aSign; 3410 int32 aExp; 3411 bits64 aSig; 3412 3413 aSig = extractFloatx80Frac( a ); 3414 aExp = extractFloatx80Exp( a ); 3415 aSign = extractFloatx80Sign( a ); 3416 if ( aExp == 0x7FFF ) { 3417 if ( (bits64) ( aSig<<1 ) ) { 3418 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3419 } 3420 return packFloat32( aSign, 0xFF, 0 ); 3421 } 3422 shift64RightJamming( aSig, 33, &aSig ); 3423 if ( aExp || aSig ) aExp -= 0x3F81; 3424 return roundAndPackFloat32( aSign, aExp, aSig ); 3425 3426 } 3427 3428 /* 3429 ------------------------------------------------------------------------------- 3430 Returns the result of converting the extended double-precision floating- 3431 point value `a' to the double-precision floating-point format. The 3432 conversion is performed according to the IEC/IEEE Standard for Binary 3433 Floating-Point Arithmetic. 3434 ------------------------------------------------------------------------------- 3435 */ 3436 float64 floatx80_to_float64( floatx80 a ) 3437 { 3438 flag aSign; 3439 int32 aExp; 3440 bits64 aSig, zSig; 3441 3442 aSig = extractFloatx80Frac( a ); 3443 aExp = extractFloatx80Exp( a ); 3444 aSign = extractFloatx80Sign( a ); 3445 if ( aExp == 0x7FFF ) { 3446 if ( (bits64) ( aSig<<1 ) ) { 3447 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3448 } 3449 return packFloat64( aSign, 0x7FF, 0 ); 3450 } 3451 shift64RightJamming( aSig, 1, &zSig ); 3452 if ( aExp || aSig ) aExp -= 0x3C01; 3453 return roundAndPackFloat64( aSign, aExp, zSig ); 3454 3455 } 3456 3457 #ifdef FLOAT128 3458 3459 /* 3460 ------------------------------------------------------------------------------- 3461 Returns the result of converting the extended double-precision floating- 3462 point value `a' to the quadruple-precision floating-point format. The 3463 conversion is performed according to the IEC/IEEE Standard for Binary 3464 Floating-Point Arithmetic. 3465 ------------------------------------------------------------------------------- 3466 */ 3467 float128 floatx80_to_float128( floatx80 a ) 3468 { 3469 flag aSign; 3470 int16 aExp; 3471 bits64 aSig, zSig0, zSig1; 3472 3473 aSig = extractFloatx80Frac( a ); 3474 aExp = extractFloatx80Exp( a ); 3475 aSign = extractFloatx80Sign( a ); 3476 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3477 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3478 } 3479 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3480 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3481 3482 } 3483 3484 #endif 3485 3486 /* 3487 ------------------------------------------------------------------------------- 3488 Rounds the extended double-precision floating-point value `a' to an integer, 3489 and returns the result as an extended quadruple-precision floating-point 3490 value. The operation is performed according to the IEC/IEEE Standard for 3491 Binary Floating-Point Arithmetic. 3492 ------------------------------------------------------------------------------- 3493 */ 3494 floatx80 floatx80_round_to_int( floatx80 a ) 3495 { 3496 flag aSign; 3497 int32 aExp; 3498 bits64 lastBitMask, roundBitsMask; 3499 int8 roundingMode; 3500 floatx80 z; 3501 3502 aExp = extractFloatx80Exp( a ); 3503 if ( 0x403E <= aExp ) { 3504 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3505 return propagateFloatx80NaN( a, a ); 3506 } 3507 return a; 3508 } 3509 if ( aExp < 0x3FFF ) { 3510 if ( ( aExp == 0 ) 3511 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3512 return a; 3513 } 3514 float_set_inexact(); 3515 aSign = extractFloatx80Sign( a ); 3516 switch ( float_rounding_mode() ) { 3517 case float_round_nearest_even: 3518 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3519 ) { 3520 return 3521 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3522 } 3523 break; 3524 case float_round_down: 3525 return 3526 aSign ? 3527 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3528 : packFloatx80( 0, 0, 0 ); 3529 case float_round_up: 3530 return 3531 aSign ? packFloatx80( 1, 0, 0 ) 3532 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3533 } 3534 return packFloatx80( aSign, 0, 0 ); 3535 } 3536 lastBitMask = 1; 3537 lastBitMask <<= 0x403E - aExp; 3538 roundBitsMask = lastBitMask - 1; 3539 z = a; 3540 roundingMode = float_rounding_mode(); 3541 if ( roundingMode == float_round_nearest_even ) { 3542 z.low += lastBitMask>>1; 3543 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3544 } 3545 else if ( roundingMode != float_round_to_zero ) { 3546 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3547 z.low += roundBitsMask; 3548 } 3549 } 3550 z.low &= ~ roundBitsMask; 3551 if ( z.low == 0 ) { 3552 ++z.high; 3553 z.low = LIT64( 0x8000000000000000 ); 3554 } 3555 if ( z.low != a.low ) float_set_inexact(); 3556 return z; 3557 3558 } 3559 3560 /* 3561 ------------------------------------------------------------------------------- 3562 Returns the result of adding the absolute values of the extended double- 3563 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3564 negated before being returned. `zSign' is ignored if the result is a NaN. 3565 The addition is performed according to the IEC/IEEE Standard for Binary 3566 Floating-Point Arithmetic. 3567 ------------------------------------------------------------------------------- 3568 */ 3569 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3570 { 3571 int32 aExp, bExp, zExp; 3572 bits64 aSig, bSig, zSig0, zSig1; 3573 int32 expDiff; 3574 3575 aSig = extractFloatx80Frac( a ); 3576 aExp = extractFloatx80Exp( a ); 3577 bSig = extractFloatx80Frac( b ); 3578 bExp = extractFloatx80Exp( b ); 3579 expDiff = aExp - bExp; 3580 if ( 0 < expDiff ) { 3581 if ( aExp == 0x7FFF ) { 3582 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3583 return a; 3584 } 3585 if ( bExp == 0 ) --expDiff; 3586 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3587 zExp = aExp; 3588 } 3589 else if ( expDiff < 0 ) { 3590 if ( bExp == 0x7FFF ) { 3591 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3592 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3593 } 3594 if ( aExp == 0 ) ++expDiff; 3595 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3596 zExp = bExp; 3597 } 3598 else { 3599 if ( aExp == 0x7FFF ) { 3600 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3601 return propagateFloatx80NaN( a, b ); 3602 } 3603 return a; 3604 } 3605 zSig1 = 0; 3606 zSig0 = aSig + bSig; 3607 if ( aExp == 0 ) { 3608 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3609 goto roundAndPack; 3610 } 3611 zExp = aExp; 3612 goto shiftRight1; 3613 } 3614 zSig0 = aSig + bSig; 3615 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3616 shiftRight1: 3617 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3618 zSig0 |= LIT64( 0x8000000000000000 ); 3619 ++zExp; 3620 roundAndPack: 3621 return 3622 roundAndPackFloatx80( 3623 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3624 3625 } 3626 3627 /* 3628 ------------------------------------------------------------------------------- 3629 Returns the result of subtracting the absolute values of the extended 3630 double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3631 difference is negated before being returned. `zSign' is ignored if the 3632 result is a NaN. The subtraction is performed according to the IEC/IEEE 3633 Standard for Binary Floating-Point Arithmetic. 3634 ------------------------------------------------------------------------------- 3635 */ 3636 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3637 { 3638 int32 aExp, bExp, zExp; 3639 bits64 aSig, bSig, zSig0, zSig1; 3640 int32 expDiff; 3641 floatx80 z; 3642 3643 aSig = extractFloatx80Frac( a ); 3644 aExp = extractFloatx80Exp( a ); 3645 bSig = extractFloatx80Frac( b ); 3646 bExp = extractFloatx80Exp( b ); 3647 expDiff = aExp - bExp; 3648 if ( 0 < expDiff ) goto aExpBigger; 3649 if ( expDiff < 0 ) goto bExpBigger; 3650 if ( aExp == 0x7FFF ) { 3651 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3652 return propagateFloatx80NaN( a, b ); 3653 } 3654 float_raise( float_flag_invalid ); 3655 z.low = floatx80_default_nan_low; 3656 z.high = floatx80_default_nan_high; 3657 return z; 3658 } 3659 if ( aExp == 0 ) { 3660 aExp = 1; 3661 bExp = 1; 3662 } 3663 zSig1 = 0; 3664 if ( bSig < aSig ) goto aBigger; 3665 if ( aSig < bSig ) goto bBigger; 3666 return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 ); 3667 bExpBigger: 3668 if ( bExp == 0x7FFF ) { 3669 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3670 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3671 } 3672 if ( aExp == 0 ) ++expDiff; 3673 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3674 bBigger: 3675 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3676 zExp = bExp; 3677 zSign ^= 1; 3678 goto normalizeRoundAndPack; 3679 aExpBigger: 3680 if ( aExp == 0x7FFF ) { 3681 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3682 return a; 3683 } 3684 if ( bExp == 0 ) --expDiff; 3685 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3686 aBigger: 3687 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3688 zExp = aExp; 3689 normalizeRoundAndPack: 3690 return 3691 normalizeRoundAndPackFloatx80( 3692 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3693 3694 } 3695 3696 /* 3697 ------------------------------------------------------------------------------- 3698 Returns the result of adding the extended double-precision floating-point 3699 values `a' and `b'. The operation is performed according to the IEC/IEEE 3700 Standard for Binary Floating-Point Arithmetic. 3701 ------------------------------------------------------------------------------- 3702 */ 3703 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3704 { 3705 flag aSign, bSign; 3706 3707 aSign = extractFloatx80Sign( a ); 3708 bSign = extractFloatx80Sign( b ); 3709 if ( aSign == bSign ) { 3710 return addFloatx80Sigs( a, b, aSign ); 3711 } 3712 else { 3713 return subFloatx80Sigs( a, b, aSign ); 3714 } 3715 3716 } 3717 3718 /* 3719 ------------------------------------------------------------------------------- 3720 Returns the result of subtracting the extended double-precision floating- 3721 point values `a' and `b'. The operation is performed according to the 3722 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3723 ------------------------------------------------------------------------------- 3724 */ 3725 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3726 { 3727 flag aSign, bSign; 3728 3729 aSign = extractFloatx80Sign( a ); 3730 bSign = extractFloatx80Sign( b ); 3731 if ( aSign == bSign ) { 3732 return subFloatx80Sigs( a, b, aSign ); 3733 } 3734 else { 3735 return addFloatx80Sigs( a, b, aSign ); 3736 } 3737 3738 } 3739 3740 /* 3741 ------------------------------------------------------------------------------- 3742 Returns the result of multiplying the extended double-precision floating- 3743 point values `a' and `b'. The operation is performed according to the 3744 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3745 ------------------------------------------------------------------------------- 3746 */ 3747 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3748 { 3749 flag aSign, bSign, zSign; 3750 int32 aExp, bExp, zExp; 3751 bits64 aSig, bSig, zSig0, zSig1; 3752 floatx80 z; 3753 3754 aSig = extractFloatx80Frac( a ); 3755 aExp = extractFloatx80Exp( a ); 3756 aSign = extractFloatx80Sign( a ); 3757 bSig = extractFloatx80Frac( b ); 3758 bExp = extractFloatx80Exp( b ); 3759 bSign = extractFloatx80Sign( b ); 3760 zSign = aSign ^ bSign; 3761 if ( aExp == 0x7FFF ) { 3762 if ( (bits64) ( aSig<<1 ) 3763 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3764 return propagateFloatx80NaN( a, b ); 3765 } 3766 if ( ( bExp | bSig ) == 0 ) goto invalid; 3767 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3768 } 3769 if ( bExp == 0x7FFF ) { 3770 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3771 if ( ( aExp | aSig ) == 0 ) { 3772 invalid: 3773 float_raise( float_flag_invalid ); 3774 z.low = floatx80_default_nan_low; 3775 z.high = floatx80_default_nan_high; 3776 return z; 3777 } 3778 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3779 } 3780 if ( aExp == 0 ) { 3781 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3782 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3783 } 3784 if ( bExp == 0 ) { 3785 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3786 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3787 } 3788 zExp = aExp + bExp - 0x3FFE; 3789 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3790 if ( 0 < (sbits64) zSig0 ) { 3791 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3792 --zExp; 3793 } 3794 return 3795 roundAndPackFloatx80( 3796 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3797 3798 } 3799 3800 /* 3801 ------------------------------------------------------------------------------- 3802 Returns the result of dividing the extended double-precision floating-point 3803 value `a' by the corresponding value `b'. The operation is performed 3804 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3805 ------------------------------------------------------------------------------- 3806 */ 3807 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3808 { 3809 flag aSign, bSign, zSign; 3810 int32 aExp, bExp, zExp; 3811 bits64 aSig, bSig, zSig0, zSig1; 3812 bits64 rem0, rem1, rem2, term0, term1, term2; 3813 floatx80 z; 3814 3815 aSig = extractFloatx80Frac( a ); 3816 aExp = extractFloatx80Exp( a ); 3817 aSign = extractFloatx80Sign( a ); 3818 bSig = extractFloatx80Frac( b ); 3819 bExp = extractFloatx80Exp( b ); 3820 bSign = extractFloatx80Sign( b ); 3821 zSign = aSign ^ bSign; 3822 if ( aExp == 0x7FFF ) { 3823 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3824 if ( bExp == 0x7FFF ) { 3825 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3826 goto invalid; 3827 } 3828 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3829 } 3830 if ( bExp == 0x7FFF ) { 3831 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3832 return packFloatx80( zSign, 0, 0 ); 3833 } 3834 if ( bExp == 0 ) { 3835 if ( bSig == 0 ) { 3836 if ( ( aExp | aSig ) == 0 ) { 3837 invalid: 3838 float_raise( float_flag_invalid ); 3839 z.low = floatx80_default_nan_low; 3840 z.high = floatx80_default_nan_high; 3841 return z; 3842 } 3843 float_raise( float_flag_divbyzero ); 3844 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3845 } 3846 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3847 } 3848 if ( aExp == 0 ) { 3849 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3850 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3851 } 3852 zExp = aExp - bExp + 0x3FFE; 3853 rem1 = 0; 3854 if ( bSig <= aSig ) { 3855 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3856 ++zExp; 3857 } 3858 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3859 mul64To128( bSig, zSig0, &term0, &term1 ); 3860 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3861 while ( (sbits64) rem0 < 0 ) { 3862 --zSig0; 3863 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3864 } 3865 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3866 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3867 mul64To128( bSig, zSig1, &term1, &term2 ); 3868 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3869 while ( (sbits64) rem1 < 0 ) { 3870 --zSig1; 3871 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3872 } 3873 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3874 } 3875 return 3876 roundAndPackFloatx80( 3877 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3878 3879 } 3880 3881 /* 3882 ------------------------------------------------------------------------------- 3883 Returns the remainder of the extended double-precision floating-point value 3884 `a' with respect to the corresponding value `b'. The operation is performed 3885 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3886 ------------------------------------------------------------------------------- 3887 */ 3888 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3889 { 3890 flag aSign, bSign, zSign; 3891 int32 aExp, bExp, expDiff; 3892 bits64 aSig0, aSig1, bSig; 3893 bits64 q, term0, term1, alternateASig0, alternateASig1; 3894 floatx80 z; 3895 3896 aSig0 = extractFloatx80Frac( a ); 3897 aExp = extractFloatx80Exp( a ); 3898 aSign = extractFloatx80Sign( a ); 3899 bSig = extractFloatx80Frac( b ); 3900 bExp = extractFloatx80Exp( b ); 3901 bSign = extractFloatx80Sign( b ); 3902 if ( aExp == 0x7FFF ) { 3903 if ( (bits64) ( aSig0<<1 ) 3904 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3905 return propagateFloatx80NaN( a, b ); 3906 } 3907 goto invalid; 3908 } 3909 if ( bExp == 0x7FFF ) { 3910 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3911 return a; 3912 } 3913 if ( bExp == 0 ) { 3914 if ( bSig == 0 ) { 3915 invalid: 3916 float_raise( float_flag_invalid ); 3917 z.low = floatx80_default_nan_low; 3918 z.high = floatx80_default_nan_high; 3919 return z; 3920 } 3921 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3922 } 3923 if ( aExp == 0 ) { 3924 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3925 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3926 } 3927 bSig |= LIT64( 0x8000000000000000 ); 3928 zSign = aSign; 3929 expDiff = aExp - bExp; 3930 aSig1 = 0; 3931 if ( expDiff < 0 ) { 3932 if ( expDiff < -1 ) return a; 3933 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3934 expDiff = 0; 3935 } 3936 q = ( bSig <= aSig0 ); 3937 if ( q ) aSig0 -= bSig; 3938 expDiff -= 64; 3939 while ( 0 < expDiff ) { 3940 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3941 q = ( 2 < q ) ? q - 2 : 0; 3942 mul64To128( bSig, q, &term0, &term1 ); 3943 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3944 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3945 expDiff -= 62; 3946 } 3947 expDiff += 64; 3948 if ( 0 < expDiff ) { 3949 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3950 q = ( 2 < q ) ? q - 2 : 0; 3951 q >>= 64 - expDiff; 3952 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3953 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3954 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3955 while ( le128( term0, term1, aSig0, aSig1 ) ) { 3956 ++q; 3957 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3958 } 3959 } 3960 else { 3961 term1 = 0; 3962 term0 = bSig; 3963 } 3964 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 3965 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3966 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3967 && ( q & 1 ) ) 3968 ) { 3969 aSig0 = alternateASig0; 3970 aSig1 = alternateASig1; 3971 zSign = ! zSign; 3972 } 3973 return 3974 normalizeRoundAndPackFloatx80( 3975 80, zSign, bExp + expDiff, aSig0, aSig1 ); 3976 3977 } 3978 3979 /* 3980 ------------------------------------------------------------------------------- 3981 Returns the square root of the extended double-precision floating-point 3982 value `a'. The operation is performed according to the IEC/IEEE Standard 3983 for Binary Floating-Point Arithmetic. 3984 ------------------------------------------------------------------------------- 3985 */ 3986 floatx80 floatx80_sqrt( floatx80 a ) 3987 { 3988 flag aSign; 3989 int32 aExp, zExp; 3990 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 3991 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 3992 floatx80 z; 3993 3994 aSig0 = extractFloatx80Frac( a ); 3995 aExp = extractFloatx80Exp( a ); 3996 aSign = extractFloatx80Sign( a ); 3997 if ( aExp == 0x7FFF ) { 3998 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 3999 if ( ! aSign ) return a; 4000 goto invalid; 4001 } 4002 if ( aSign ) { 4003 if ( ( aExp | aSig0 ) == 0 ) return a; 4004 invalid: 4005 float_raise( float_flag_invalid ); 4006 z.low = floatx80_default_nan_low; 4007 z.high = floatx80_default_nan_high; 4008 return z; 4009 } 4010 if ( aExp == 0 ) { 4011 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4012 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4013 } 4014 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4015 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4016 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4017 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4018 doubleZSig0 = zSig0<<1; 4019 mul64To128( zSig0, zSig0, &term0, &term1 ); 4020 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4021 while ( (sbits64) rem0 < 0 ) { 4022 --zSig0; 4023 doubleZSig0 -= 2; 4024 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4025 } 4026 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4027 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4028 if ( zSig1 == 0 ) zSig1 = 1; 4029 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4030 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4031 mul64To128( zSig1, zSig1, &term2, &term3 ); 4032 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4033 while ( (sbits64) rem1 < 0 ) { 4034 --zSig1; 4035 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4036 term3 |= 1; 4037 term2 |= doubleZSig0; 4038 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4039 } 4040 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4041 } 4042 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4043 zSig0 |= doubleZSig0; 4044 return 4045 roundAndPackFloatx80( 4046 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4047 4048 } 4049 4050 /* 4051 ------------------------------------------------------------------------------- 4052 Returns 1 if the extended double-precision floating-point value `a' is 4053 equal to the corresponding value `b', and 0 otherwise. The comparison is 4054 performed according to the IEC/IEEE Standard for Binary Floating-Point 4055 Arithmetic. 4056 ------------------------------------------------------------------------------- 4057 */ 4058 flag floatx80_eq( floatx80 a, floatx80 b ) 4059 { 4060 4061 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4062 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4063 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4064 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4065 ) { 4066 if ( floatx80_is_signaling_nan( a ) 4067 || floatx80_is_signaling_nan( b ) ) { 4068 float_raise( float_flag_invalid ); 4069 } 4070 return 0; 4071 } 4072 return 4073 ( a.low == b.low ) 4074 && ( ( a.high == b.high ) 4075 || ( ( a.low == 0 ) 4076 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4077 ); 4078 4079 } 4080 4081 /* 4082 ------------------------------------------------------------------------------- 4083 Returns 1 if the extended double-precision floating-point value `a' is 4084 less than or equal to the corresponding value `b', and 0 otherwise. The 4085 comparison is performed according to the IEC/IEEE Standard for Binary 4086 Floating-Point Arithmetic. 4087 ------------------------------------------------------------------------------- 4088 */ 4089 flag floatx80_le( floatx80 a, floatx80 b ) 4090 { 4091 flag aSign, bSign; 4092 4093 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4094 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4095 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4096 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4097 ) { 4098 float_raise( float_flag_invalid ); 4099 return 0; 4100 } 4101 aSign = extractFloatx80Sign( a ); 4102 bSign = extractFloatx80Sign( b ); 4103 if ( aSign != bSign ) { 4104 return 4105 aSign 4106 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4107 == 0 ); 4108 } 4109 return 4110 aSign ? le128( b.high, b.low, a.high, a.low ) 4111 : le128( a.high, a.low, b.high, b.low ); 4112 4113 } 4114 4115 /* 4116 ------------------------------------------------------------------------------- 4117 Returns 1 if the extended double-precision floating-point value `a' is 4118 less than the corresponding value `b', and 0 otherwise. The comparison 4119 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4120 Arithmetic. 4121 ------------------------------------------------------------------------------- 4122 */ 4123 flag floatx80_lt( floatx80 a, floatx80 b ) 4124 { 4125 flag aSign, bSign; 4126 4127 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4128 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4129 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4130 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4131 ) { 4132 float_raise( float_flag_invalid ); 4133 return 0; 4134 } 4135 aSign = extractFloatx80Sign( a ); 4136 bSign = extractFloatx80Sign( b ); 4137 if ( aSign != bSign ) { 4138 return 4139 aSign 4140 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4141 != 0 ); 4142 } 4143 return 4144 aSign ? lt128( b.high, b.low, a.high, a.low ) 4145 : lt128( a.high, a.low, b.high, b.low ); 4146 4147 } 4148 4149 /* 4150 ------------------------------------------------------------------------------- 4151 Returns 1 if the extended double-precision floating-point value `a' is equal 4152 to the corresponding value `b', and 0 otherwise. The invalid exception is 4153 raised if either operand is a NaN. Otherwise, the comparison is performed 4154 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4155 ------------------------------------------------------------------------------- 4156 */ 4157 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4158 { 4159 4160 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4161 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4162 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4163 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4164 ) { 4165 float_raise( float_flag_invalid ); 4166 return 0; 4167 } 4168 return 4169 ( a.low == b.low ) 4170 && ( ( a.high == b.high ) 4171 || ( ( a.low == 0 ) 4172 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4173 ); 4174 4175 } 4176 4177 /* 4178 ------------------------------------------------------------------------------- 4179 Returns 1 if the extended double-precision floating-point value `a' is less 4180 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4181 do not cause an exception. Otherwise, the comparison is performed according 4182 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4183 ------------------------------------------------------------------------------- 4184 */ 4185 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4186 { 4187 flag aSign, bSign; 4188 4189 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4190 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4191 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4192 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4193 ) { 4194 if ( floatx80_is_signaling_nan( a ) 4195 || floatx80_is_signaling_nan( b ) ) { 4196 float_raise( float_flag_invalid ); 4197 } 4198 return 0; 4199 } 4200 aSign = extractFloatx80Sign( a ); 4201 bSign = extractFloatx80Sign( b ); 4202 if ( aSign != bSign ) { 4203 return 4204 aSign 4205 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4206 == 0 ); 4207 } 4208 return 4209 aSign ? le128( b.high, b.low, a.high, a.low ) 4210 : le128( a.high, a.low, b.high, b.low ); 4211 4212 } 4213 4214 /* 4215 ------------------------------------------------------------------------------- 4216 Returns 1 if the extended double-precision floating-point value `a' is less 4217 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4218 an exception. Otherwise, the comparison is performed according to the 4219 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4220 ------------------------------------------------------------------------------- 4221 */ 4222 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4223 { 4224 flag aSign, bSign; 4225 4226 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4227 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4228 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4229 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4230 ) { 4231 if ( floatx80_is_signaling_nan( a ) 4232 || floatx80_is_signaling_nan( b ) ) { 4233 float_raise( float_flag_invalid ); 4234 } 4235 return 0; 4236 } 4237 aSign = extractFloatx80Sign( a ); 4238 bSign = extractFloatx80Sign( b ); 4239 if ( aSign != bSign ) { 4240 return 4241 aSign 4242 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4243 != 0 ); 4244 } 4245 return 4246 aSign ? lt128( b.high, b.low, a.high, a.low ) 4247 : lt128( a.high, a.low, b.high, b.low ); 4248 4249 } 4250 4251 #endif 4252 4253 #ifdef FLOAT128 4254 4255 /* 4256 ------------------------------------------------------------------------------- 4257 Returns the result of converting the quadruple-precision floating-point 4258 value `a' to the 32-bit two's complement integer format. The conversion 4259 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4260 Arithmetic---which means in particular that the conversion is rounded 4261 according to the current rounding mode. If `a' is a NaN, the largest 4262 positive integer is returned. Otherwise, if the conversion overflows, the 4263 largest integer with the same sign as `a' is returned. 4264 ------------------------------------------------------------------------------- 4265 */ 4266 int32 float128_to_int32( float128 a ) 4267 { 4268 flag aSign; 4269 int32 aExp, shiftCount; 4270 bits64 aSig0, aSig1; 4271 4272 aSig1 = extractFloat128Frac1( a ); 4273 aSig0 = extractFloat128Frac0( a ); 4274 aExp = extractFloat128Exp( a ); 4275 aSign = extractFloat128Sign( a ); 4276 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4277 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4278 aSig0 |= ( aSig1 != 0 ); 4279 shiftCount = 0x4028 - aExp; 4280 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4281 return roundAndPackInt32( aSign, aSig0 ); 4282 4283 } 4284 4285 /* 4286 ------------------------------------------------------------------------------- 4287 Returns the result of converting the quadruple-precision floating-point 4288 value `a' to the 32-bit two's complement integer format. The conversion 4289 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4290 Arithmetic, except that the conversion is always rounded toward zero. If 4291 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4292 conversion overflows, the largest integer with the same sign as `a' is 4293 returned. 4294 ------------------------------------------------------------------------------- 4295 */ 4296 int32 float128_to_int32_round_to_zero( float128 a ) 4297 { 4298 flag aSign; 4299 int32 aExp, shiftCount; 4300 bits64 aSig0, aSig1, savedASig; 4301 int32 z; 4302 4303 aSig1 = extractFloat128Frac1( a ); 4304 aSig0 = extractFloat128Frac0( a ); 4305 aExp = extractFloat128Exp( a ); 4306 aSign = extractFloat128Sign( a ); 4307 aSig0 |= ( aSig1 != 0 ); 4308 if ( 0x401E < aExp ) { 4309 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4310 goto invalid; 4311 } 4312 else if ( aExp < 0x3FFF ) { 4313 if ( aExp || aSig0 ) float_set_inexact(); 4314 return 0; 4315 } 4316 aSig0 |= LIT64( 0x0001000000000000 ); 4317 shiftCount = 0x402F - aExp; 4318 savedASig = aSig0; 4319 aSig0 >>= shiftCount; 4320 z = aSig0; 4321 if ( aSign ) z = - z; 4322 if ( ( z < 0 ) ^ aSign ) { 4323 invalid: 4324 float_raise( float_flag_invalid ); 4325 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4326 } 4327 if ( ( aSig0<<shiftCount ) != savedASig ) { 4328 float_set_inexact(); 4329 } 4330 return z; 4331 4332 } 4333 4334 /* 4335 ------------------------------------------------------------------------------- 4336 Returns the result of converting the quadruple-precision floating-point 4337 value `a' to the 64-bit two's complement integer format. The conversion 4338 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4339 Arithmetic---which means in particular that the conversion is rounded 4340 according to the current rounding mode. If `a' is a NaN, the largest 4341 positive integer is returned. Otherwise, if the conversion overflows, the 4342 largest integer with the same sign as `a' is returned. 4343 ------------------------------------------------------------------------------- 4344 */ 4345 int64 float128_to_int64( float128 a ) 4346 { 4347 flag aSign; 4348 int32 aExp, shiftCount; 4349 bits64 aSig0, aSig1; 4350 4351 aSig1 = extractFloat128Frac1( a ); 4352 aSig0 = extractFloat128Frac0( a ); 4353 aExp = extractFloat128Exp( a ); 4354 aSign = extractFloat128Sign( a ); 4355 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4356 shiftCount = 0x402F - aExp; 4357 if ( shiftCount <= 0 ) { 4358 if ( 0x403E < aExp ) { 4359 float_raise( float_flag_invalid ); 4360 if ( ! aSign 4361 || ( ( aExp == 0x7FFF ) 4362 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4363 ) 4364 ) { 4365 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4366 } 4367 return (sbits64) LIT64( 0x8000000000000000 ); 4368 } 4369 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4370 } 4371 else { 4372 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4373 } 4374 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4375 4376 } 4377 4378 /* 4379 ------------------------------------------------------------------------------- 4380 Returns the result of converting the quadruple-precision floating-point 4381 value `a' to the 64-bit two's complement integer format. The conversion 4382 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4383 Arithmetic, except that the conversion is always rounded toward zero. 4384 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4385 the conversion overflows, the largest integer with the same sign as `a' is 4386 returned. 4387 ------------------------------------------------------------------------------- 4388 */ 4389 int64 float128_to_int64_round_to_zero( float128 a ) 4390 { 4391 flag aSign; 4392 int32 aExp, shiftCount; 4393 bits64 aSig0, aSig1; 4394 int64 z; 4395 4396 aSig1 = extractFloat128Frac1( a ); 4397 aSig0 = extractFloat128Frac0( a ); 4398 aExp = extractFloat128Exp( a ); 4399 aSign = extractFloat128Sign( a ); 4400 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4401 shiftCount = aExp - 0x402F; 4402 if ( 0 < shiftCount ) { 4403 if ( 0x403E <= aExp ) { 4404 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4405 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4406 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4407 if ( aSig1 ) float_set_inexact(); 4408 } 4409 else { 4410 float_raise( float_flag_invalid ); 4411 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4412 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4413 } 4414 } 4415 return (sbits64) LIT64( 0x8000000000000000 ); 4416 } 4417 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4418 if ( (bits64) ( aSig1<<shiftCount ) ) { 4419 float_set_inexact(); 4420 } 4421 } 4422 else { 4423 if ( aExp < 0x3FFF ) { 4424 if ( aExp | aSig0 | aSig1 ) { 4425 float_set_inexact(); 4426 } 4427 return 0; 4428 } 4429 z = aSig0>>( - shiftCount ); 4430 if ( aSig1 4431 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4432 float_set_inexact(); 4433 } 4434 } 4435 if ( aSign ) z = - z; 4436 return z; 4437 4438 } 4439 4440 /* 4441 ------------------------------------------------------------------------------- 4442 Returns the result of converting the quadruple-precision floating-point 4443 value `a' to the single-precision floating-point format. The conversion 4444 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4445 Arithmetic. 4446 ------------------------------------------------------------------------------- 4447 */ 4448 float32 float128_to_float32( float128 a ) 4449 { 4450 flag aSign; 4451 int32 aExp; 4452 bits64 aSig0, aSig1; 4453 bits32 zSig; 4454 4455 aSig1 = extractFloat128Frac1( a ); 4456 aSig0 = extractFloat128Frac0( a ); 4457 aExp = extractFloat128Exp( a ); 4458 aSign = extractFloat128Sign( a ); 4459 if ( aExp == 0x7FFF ) { 4460 if ( aSig0 | aSig1 ) { 4461 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4462 } 4463 return packFloat32( aSign, 0xFF, 0 ); 4464 } 4465 aSig0 |= ( aSig1 != 0 ); 4466 shift64RightJamming( aSig0, 18, &aSig0 ); 4467 zSig = aSig0; 4468 if ( aExp || zSig ) { 4469 zSig |= 0x40000000; 4470 aExp -= 0x3F81; 4471 } 4472 return roundAndPackFloat32( aSign, aExp, zSig ); 4473 4474 } 4475 4476 /* 4477 ------------------------------------------------------------------------------- 4478 Returns the result of converting the quadruple-precision floating-point 4479 value `a' to the double-precision floating-point format. The conversion 4480 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4481 Arithmetic. 4482 ------------------------------------------------------------------------------- 4483 */ 4484 float64 float128_to_float64( float128 a ) 4485 { 4486 flag aSign; 4487 int32 aExp; 4488 bits64 aSig0, aSig1; 4489 4490 aSig1 = extractFloat128Frac1( a ); 4491 aSig0 = extractFloat128Frac0( a ); 4492 aExp = extractFloat128Exp( a ); 4493 aSign = extractFloat128Sign( a ); 4494 if ( aExp == 0x7FFF ) { 4495 if ( aSig0 | aSig1 ) { 4496 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4497 } 4498 return packFloat64( aSign, 0x7FF, 0 ); 4499 } 4500 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4501 aSig0 |= ( aSig1 != 0 ); 4502 if ( aExp || aSig0 ) { 4503 aSig0 |= LIT64( 0x4000000000000000 ); 4504 aExp -= 0x3C01; 4505 } 4506 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4507 4508 } 4509 4510 #ifdef FLOATX80 4511 4512 /* 4513 ------------------------------------------------------------------------------- 4514 Returns the result of converting the quadruple-precision floating-point 4515 value `a' to the extended double-precision floating-point format. The 4516 conversion is performed according to the IEC/IEEE Standard for Binary 4517 Floating-Point Arithmetic. 4518 ------------------------------------------------------------------------------- 4519 */ 4520 floatx80 float128_to_floatx80( float128 a ) 4521 { 4522 flag aSign; 4523 int32 aExp; 4524 bits64 aSig0, aSig1; 4525 4526 aSig1 = extractFloat128Frac1( a ); 4527 aSig0 = extractFloat128Frac0( a ); 4528 aExp = extractFloat128Exp( a ); 4529 aSign = extractFloat128Sign( a ); 4530 if ( aExp == 0x7FFF ) { 4531 if ( aSig0 | aSig1 ) { 4532 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4533 } 4534 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4535 } 4536 if ( aExp == 0 ) { 4537 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4538 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4539 } 4540 else { 4541 aSig0 |= LIT64( 0x0001000000000000 ); 4542 } 4543 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4544 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4545 4546 } 4547 4548 #endif 4549 4550 /* 4551 ------------------------------------------------------------------------------- 4552 Rounds the quadruple-precision floating-point value `a' to an integer, and 4553 returns the result as a quadruple-precision floating-point value. The 4554 operation is performed according to the IEC/IEEE Standard for Binary 4555 Floating-Point Arithmetic. 4556 ------------------------------------------------------------------------------- 4557 */ 4558 float128 float128_round_to_int( float128 a ) 4559 { 4560 flag aSign; 4561 int32 aExp; 4562 bits64 lastBitMask, roundBitsMask; 4563 int8 roundingMode; 4564 float128 z; 4565 4566 aExp = extractFloat128Exp( a ); 4567 if ( 0x402F <= aExp ) { 4568 if ( 0x406F <= aExp ) { 4569 if ( ( aExp == 0x7FFF ) 4570 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4571 ) { 4572 return propagateFloat128NaN( a, a ); 4573 } 4574 return a; 4575 } 4576 lastBitMask = 1; 4577 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4578 roundBitsMask = lastBitMask - 1; 4579 z = a; 4580 roundingMode = float_rounding_mode(); 4581 if ( roundingMode == float_round_nearest_even ) { 4582 if ( lastBitMask ) { 4583 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4584 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4585 } 4586 else { 4587 if ( (sbits64) z.low < 0 ) { 4588 ++z.high; 4589 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4590 } 4591 } 4592 } 4593 else if ( roundingMode != float_round_to_zero ) { 4594 if ( extractFloat128Sign( z ) 4595 ^ ( roundingMode == float_round_up ) ) { 4596 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4597 } 4598 } 4599 z.low &= ~ roundBitsMask; 4600 } 4601 else { 4602 if ( aExp < 0x3FFF ) { 4603 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4604 float_set_inexact(); 4605 aSign = extractFloat128Sign( a ); 4606 switch ( float_rounding_mode() ) { 4607 case float_round_nearest_even: 4608 if ( ( aExp == 0x3FFE ) 4609 && ( extractFloat128Frac0( a ) 4610 | extractFloat128Frac1( a ) ) 4611 ) { 4612 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4613 } 4614 break; 4615 case float_round_down: 4616 return 4617 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4618 : packFloat128( 0, 0, 0, 0 ); 4619 case float_round_up: 4620 return 4621 aSign ? packFloat128( 1, 0, 0, 0 ) 4622 : packFloat128( 0, 0x3FFF, 0, 0 ); 4623 } 4624 return packFloat128( aSign, 0, 0, 0 ); 4625 } 4626 lastBitMask = 1; 4627 lastBitMask <<= 0x402F - aExp; 4628 roundBitsMask = lastBitMask - 1; 4629 z.low = 0; 4630 z.high = a.high; 4631 roundingMode = float_rounding_mode(); 4632 if ( roundingMode == float_round_nearest_even ) { 4633 z.high += lastBitMask>>1; 4634 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4635 z.high &= ~ lastBitMask; 4636 } 4637 } 4638 else if ( roundingMode != float_round_to_zero ) { 4639 if ( extractFloat128Sign( z ) 4640 ^ ( roundingMode == float_round_up ) ) { 4641 z.high |= ( a.low != 0 ); 4642 z.high += roundBitsMask; 4643 } 4644 } 4645 z.high &= ~ roundBitsMask; 4646 } 4647 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4648 float_set_inexact(); 4649 } 4650 return z; 4651 4652 } 4653 4654 /* 4655 ------------------------------------------------------------------------------- 4656 Returns the result of adding the absolute values of the quadruple-precision 4657 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4658 before being returned. `zSign' is ignored if the result is a NaN. 4659 The addition is performed according to the IEC/IEEE Standard for Binary 4660 Floating-Point Arithmetic. 4661 ------------------------------------------------------------------------------- 4662 */ 4663 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4664 { 4665 int32 aExp, bExp, zExp; 4666 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4667 int32 expDiff; 4668 4669 aSig1 = extractFloat128Frac1( a ); 4670 aSig0 = extractFloat128Frac0( a ); 4671 aExp = extractFloat128Exp( a ); 4672 bSig1 = extractFloat128Frac1( b ); 4673 bSig0 = extractFloat128Frac0( b ); 4674 bExp = extractFloat128Exp( b ); 4675 expDiff = aExp - bExp; 4676 if ( 0 < expDiff ) { 4677 if ( aExp == 0x7FFF ) { 4678 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4679 return a; 4680 } 4681 if ( bExp == 0 ) { 4682 --expDiff; 4683 } 4684 else { 4685 bSig0 |= LIT64( 0x0001000000000000 ); 4686 } 4687 shift128ExtraRightJamming( 4688 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4689 zExp = aExp; 4690 } 4691 else if ( expDiff < 0 ) { 4692 if ( bExp == 0x7FFF ) { 4693 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4694 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4695 } 4696 if ( aExp == 0 ) { 4697 ++expDiff; 4698 } 4699 else { 4700 aSig0 |= LIT64( 0x0001000000000000 ); 4701 } 4702 shift128ExtraRightJamming( 4703 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4704 zExp = bExp; 4705 } 4706 else { 4707 if ( aExp == 0x7FFF ) { 4708 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4709 return propagateFloat128NaN( a, b ); 4710 } 4711 return a; 4712 } 4713 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4714 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4715 zSig2 = 0; 4716 zSig0 |= LIT64( 0x0002000000000000 ); 4717 zExp = aExp; 4718 goto shiftRight1; 4719 } 4720 aSig0 |= LIT64( 0x0001000000000000 ); 4721 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4722 --zExp; 4723 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4724 ++zExp; 4725 shiftRight1: 4726 shift128ExtraRightJamming( 4727 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4728 roundAndPack: 4729 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4730 4731 } 4732 4733 /* 4734 ------------------------------------------------------------------------------- 4735 Returns the result of subtracting the absolute values of the quadruple- 4736 precision floating-point values `a' and `b'. If `zSign' is 1, the 4737 difference is negated before being returned. `zSign' is ignored if the 4738 result is a NaN. The subtraction is performed according to the IEC/IEEE 4739 Standard for Binary Floating-Point Arithmetic. 4740 ------------------------------------------------------------------------------- 4741 */ 4742 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4743 { 4744 int32 aExp, bExp, zExp; 4745 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4746 int32 expDiff; 4747 float128 z; 4748 4749 aSig1 = extractFloat128Frac1( a ); 4750 aSig0 = extractFloat128Frac0( a ); 4751 aExp = extractFloat128Exp( a ); 4752 bSig1 = extractFloat128Frac1( b ); 4753 bSig0 = extractFloat128Frac0( b ); 4754 bExp = extractFloat128Exp( b ); 4755 expDiff = aExp - bExp; 4756 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4757 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4758 if ( 0 < expDiff ) goto aExpBigger; 4759 if ( expDiff < 0 ) goto bExpBigger; 4760 if ( aExp == 0x7FFF ) { 4761 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4762 return propagateFloat128NaN( a, b ); 4763 } 4764 float_raise( float_flag_invalid ); 4765 z.low = float128_default_nan_low; 4766 z.high = float128_default_nan_high; 4767 return z; 4768 } 4769 if ( aExp == 0 ) { 4770 aExp = 1; 4771 bExp = 1; 4772 } 4773 if ( bSig0 < aSig0 ) goto aBigger; 4774 if ( aSig0 < bSig0 ) goto bBigger; 4775 if ( bSig1 < aSig1 ) goto aBigger; 4776 if ( aSig1 < bSig1 ) goto bBigger; 4777 return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 ); 4778 bExpBigger: 4779 if ( bExp == 0x7FFF ) { 4780 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4781 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4782 } 4783 if ( aExp == 0 ) { 4784 ++expDiff; 4785 } 4786 else { 4787 aSig0 |= LIT64( 0x4000000000000000 ); 4788 } 4789 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4790 bSig0 |= LIT64( 0x4000000000000000 ); 4791 bBigger: 4792 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4793 zExp = bExp; 4794 zSign ^= 1; 4795 goto normalizeRoundAndPack; 4796 aExpBigger: 4797 if ( aExp == 0x7FFF ) { 4798 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4799 return a; 4800 } 4801 if ( bExp == 0 ) { 4802 --expDiff; 4803 } 4804 else { 4805 bSig0 |= LIT64( 0x4000000000000000 ); 4806 } 4807 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4808 aSig0 |= LIT64( 0x4000000000000000 ); 4809 aBigger: 4810 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4811 zExp = aExp; 4812 normalizeRoundAndPack: 4813 --zExp; 4814 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4815 4816 } 4817 4818 /* 4819 ------------------------------------------------------------------------------- 4820 Returns the result of adding the quadruple-precision floating-point values 4821 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4822 for Binary Floating-Point Arithmetic. 4823 ------------------------------------------------------------------------------- 4824 */ 4825 float128 float128_add( float128 a, float128 b ) 4826 { 4827 flag aSign, bSign; 4828 4829 aSign = extractFloat128Sign( a ); 4830 bSign = extractFloat128Sign( b ); 4831 if ( aSign == bSign ) { 4832 return addFloat128Sigs( a, b, aSign ); 4833 } 4834 else { 4835 return subFloat128Sigs( a, b, aSign ); 4836 } 4837 4838 } 4839 4840 /* 4841 ------------------------------------------------------------------------------- 4842 Returns the result of subtracting the quadruple-precision floating-point 4843 values `a' and `b'. The operation is performed according to the IEC/IEEE 4844 Standard for Binary Floating-Point Arithmetic. 4845 ------------------------------------------------------------------------------- 4846 */ 4847 float128 float128_sub( float128 a, float128 b ) 4848 { 4849 flag aSign, bSign; 4850 4851 aSign = extractFloat128Sign( a ); 4852 bSign = extractFloat128Sign( b ); 4853 if ( aSign == bSign ) { 4854 return subFloat128Sigs( a, b, aSign ); 4855 } 4856 else { 4857 return addFloat128Sigs( a, b, aSign ); 4858 } 4859 4860 } 4861 4862 /* 4863 ------------------------------------------------------------------------------- 4864 Returns the result of multiplying the quadruple-precision floating-point 4865 values `a' and `b'. The operation is performed according to the IEC/IEEE 4866 Standard for Binary Floating-Point Arithmetic. 4867 ------------------------------------------------------------------------------- 4868 */ 4869 float128 float128_mul( float128 a, float128 b ) 4870 { 4871 flag aSign, bSign, zSign; 4872 int32 aExp, bExp, zExp; 4873 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4874 float128 z; 4875 4876 aSig1 = extractFloat128Frac1( a ); 4877 aSig0 = extractFloat128Frac0( a ); 4878 aExp = extractFloat128Exp( a ); 4879 aSign = extractFloat128Sign( a ); 4880 bSig1 = extractFloat128Frac1( b ); 4881 bSig0 = extractFloat128Frac0( b ); 4882 bExp = extractFloat128Exp( b ); 4883 bSign = extractFloat128Sign( b ); 4884 zSign = aSign ^ bSign; 4885 if ( aExp == 0x7FFF ) { 4886 if ( ( aSig0 | aSig1 ) 4887 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4888 return propagateFloat128NaN( a, b ); 4889 } 4890 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4891 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4892 } 4893 if ( bExp == 0x7FFF ) { 4894 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4895 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4896 invalid: 4897 float_raise( float_flag_invalid ); 4898 z.low = float128_default_nan_low; 4899 z.high = float128_default_nan_high; 4900 return z; 4901 } 4902 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4903 } 4904 if ( aExp == 0 ) { 4905 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4906 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4907 } 4908 if ( bExp == 0 ) { 4909 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4910 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4911 } 4912 zExp = aExp + bExp - 0x4000; 4913 aSig0 |= LIT64( 0x0001000000000000 ); 4914 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 4915 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 4916 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4917 zSig2 |= ( zSig3 != 0 ); 4918 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 4919 shift128ExtraRightJamming( 4920 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4921 ++zExp; 4922 } 4923 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4924 4925 } 4926 4927 /* 4928 ------------------------------------------------------------------------------- 4929 Returns the result of dividing the quadruple-precision floating-point value 4930 `a' by the corresponding value `b'. The operation is performed according to 4931 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4932 ------------------------------------------------------------------------------- 4933 */ 4934 float128 float128_div( float128 a, float128 b ) 4935 { 4936 flag aSign, bSign, zSign; 4937 int32 aExp, bExp, zExp; 4938 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4939 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4940 float128 z; 4941 4942 aSig1 = extractFloat128Frac1( a ); 4943 aSig0 = extractFloat128Frac0( a ); 4944 aExp = extractFloat128Exp( a ); 4945 aSign = extractFloat128Sign( a ); 4946 bSig1 = extractFloat128Frac1( b ); 4947 bSig0 = extractFloat128Frac0( b ); 4948 bExp = extractFloat128Exp( b ); 4949 bSign = extractFloat128Sign( b ); 4950 zSign = aSign ^ bSign; 4951 if ( aExp == 0x7FFF ) { 4952 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4953 if ( bExp == 0x7FFF ) { 4954 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4955 goto invalid; 4956 } 4957 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4958 } 4959 if ( bExp == 0x7FFF ) { 4960 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4961 return packFloat128( zSign, 0, 0, 0 ); 4962 } 4963 if ( bExp == 0 ) { 4964 if ( ( bSig0 | bSig1 ) == 0 ) { 4965 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4966 invalid: 4967 float_raise( float_flag_invalid ); 4968 z.low = float128_default_nan_low; 4969 z.high = float128_default_nan_high; 4970 return z; 4971 } 4972 float_raise( float_flag_divbyzero ); 4973 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4974 } 4975 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4976 } 4977 if ( aExp == 0 ) { 4978 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4979 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4980 } 4981 zExp = aExp - bExp + 0x3FFD; 4982 shortShift128Left( 4983 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 4984 shortShift128Left( 4985 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 4986 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 4987 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 4988 ++zExp; 4989 } 4990 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 4991 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 4992 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 4993 while ( (sbits64) rem0 < 0 ) { 4994 --zSig0; 4995 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 4996 } 4997 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 4998 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 4999 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5000 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5001 while ( (sbits64) rem1 < 0 ) { 5002 --zSig1; 5003 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5004 } 5005 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5006 } 5007 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5008 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5009 5010 } 5011 5012 /* 5013 ------------------------------------------------------------------------------- 5014 Returns the remainder of the quadruple-precision floating-point value `a' 5015 with respect to the corresponding value `b'. The operation is performed 5016 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5017 ------------------------------------------------------------------------------- 5018 */ 5019 float128 float128_rem( float128 a, float128 b ) 5020 { 5021 flag aSign, bSign, zSign; 5022 int32 aExp, bExp, expDiff; 5023 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5024 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5025 sbits64 sigMean0; 5026 float128 z; 5027 5028 aSig1 = extractFloat128Frac1( a ); 5029 aSig0 = extractFloat128Frac0( a ); 5030 aExp = extractFloat128Exp( a ); 5031 aSign = extractFloat128Sign( a ); 5032 bSig1 = extractFloat128Frac1( b ); 5033 bSig0 = extractFloat128Frac0( b ); 5034 bExp = extractFloat128Exp( b ); 5035 bSign = extractFloat128Sign( b ); 5036 if ( aExp == 0x7FFF ) { 5037 if ( ( aSig0 | aSig1 ) 5038 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5039 return propagateFloat128NaN( a, b ); 5040 } 5041 goto invalid; 5042 } 5043 if ( bExp == 0x7FFF ) { 5044 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5045 return a; 5046 } 5047 if ( bExp == 0 ) { 5048 if ( ( bSig0 | bSig1 ) == 0 ) { 5049 invalid: 5050 float_raise( float_flag_invalid ); 5051 z.low = float128_default_nan_low; 5052 z.high = float128_default_nan_high; 5053 return z; 5054 } 5055 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5056 } 5057 if ( aExp == 0 ) { 5058 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5059 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5060 } 5061 expDiff = aExp - bExp; 5062 if ( expDiff < -1 ) return a; 5063 shortShift128Left( 5064 aSig0 | LIT64( 0x0001000000000000 ), 5065 aSig1, 5066 15 - ( expDiff < 0 ), 5067 &aSig0, 5068 &aSig1 5069 ); 5070 shortShift128Left( 5071 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5072 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5073 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5074 expDiff -= 64; 5075 while ( 0 < expDiff ) { 5076 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5077 q = ( 4 < q ) ? q - 4 : 0; 5078 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5079 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5080 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5081 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5082 expDiff -= 61; 5083 } 5084 if ( -64 < expDiff ) { 5085 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5086 q = ( 4 < q ) ? q - 4 : 0; 5087 q >>= - expDiff; 5088 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5089 expDiff += 52; 5090 if ( expDiff < 0 ) { 5091 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5092 } 5093 else { 5094 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5095 } 5096 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5097 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5098 } 5099 else { 5100 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5101 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5102 } 5103 do { 5104 alternateASig0 = aSig0; 5105 alternateASig1 = aSig1; 5106 ++q; 5107 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5108 } while ( 0 <= (sbits64) aSig0 ); 5109 add128( 5110 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 5111 if ( ( sigMean0 < 0 ) 5112 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5113 aSig0 = alternateASig0; 5114 aSig1 = alternateASig1; 5115 } 5116 zSign = ( (sbits64) aSig0 < 0 ); 5117 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5118 return 5119 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5120 5121 } 5122 5123 /* 5124 ------------------------------------------------------------------------------- 5125 Returns the square root of the quadruple-precision floating-point value `a'. 5126 The operation is performed according to the IEC/IEEE Standard for Binary 5127 Floating-Point Arithmetic. 5128 ------------------------------------------------------------------------------- 5129 */ 5130 float128 float128_sqrt( float128 a ) 5131 { 5132 flag aSign; 5133 int32 aExp, zExp; 5134 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5135 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5136 float128 z; 5137 5138 aSig1 = extractFloat128Frac1( a ); 5139 aSig0 = extractFloat128Frac0( a ); 5140 aExp = extractFloat128Exp( a ); 5141 aSign = extractFloat128Sign( a ); 5142 if ( aExp == 0x7FFF ) { 5143 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5144 if ( ! aSign ) return a; 5145 goto invalid; 5146 } 5147 if ( aSign ) { 5148 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5149 invalid: 5150 float_raise( float_flag_invalid ); 5151 z.low = float128_default_nan_low; 5152 z.high = float128_default_nan_high; 5153 return z; 5154 } 5155 if ( aExp == 0 ) { 5156 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5157 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5158 } 5159 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5160 aSig0 |= LIT64( 0x0001000000000000 ); 5161 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5162 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5163 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5164 doubleZSig0 = zSig0<<1; 5165 mul64To128( zSig0, zSig0, &term0, &term1 ); 5166 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5167 while ( (sbits64) rem0 < 0 ) { 5168 --zSig0; 5169 doubleZSig0 -= 2; 5170 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5171 } 5172 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5173 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5174 if ( zSig1 == 0 ) zSig1 = 1; 5175 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5176 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5177 mul64To128( zSig1, zSig1, &term2, &term3 ); 5178 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5179 while ( (sbits64) rem1 < 0 ) { 5180 --zSig1; 5181 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5182 term3 |= 1; 5183 term2 |= doubleZSig0; 5184 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5185 } 5186 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5187 } 5188 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5189 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5190 5191 } 5192 5193 /* 5194 ------------------------------------------------------------------------------- 5195 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5196 the corresponding value `b', and 0 otherwise. The comparison is performed 5197 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5198 ------------------------------------------------------------------------------- 5199 */ 5200 flag float128_eq( float128 a, float128 b ) 5201 { 5202 5203 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5204 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5205 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5206 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5207 ) { 5208 if ( float128_is_signaling_nan( a ) 5209 || float128_is_signaling_nan( b ) ) { 5210 float_raise( float_flag_invalid ); 5211 } 5212 return 0; 5213 } 5214 return 5215 ( a.low == b.low ) 5216 && ( ( a.high == b.high ) 5217 || ( ( a.low == 0 ) 5218 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5219 ); 5220 5221 } 5222 5223 /* 5224 ------------------------------------------------------------------------------- 5225 Returns 1 if the quadruple-precision floating-point value `a' is less than 5226 or equal to the corresponding value `b', and 0 otherwise. The comparison 5227 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5228 Arithmetic. 5229 ------------------------------------------------------------------------------- 5230 */ 5231 flag float128_le( float128 a, float128 b ) 5232 { 5233 flag aSign, bSign; 5234 5235 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5236 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5237 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5238 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5239 ) { 5240 float_raise( float_flag_invalid ); 5241 return 0; 5242 } 5243 aSign = extractFloat128Sign( a ); 5244 bSign = extractFloat128Sign( b ); 5245 if ( aSign != bSign ) { 5246 return 5247 aSign 5248 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5249 == 0 ); 5250 } 5251 return 5252 aSign ? le128( b.high, b.low, a.high, a.low ) 5253 : le128( a.high, a.low, b.high, b.low ); 5254 5255 } 5256 5257 /* 5258 ------------------------------------------------------------------------------- 5259 Returns 1 if the quadruple-precision floating-point value `a' is less than 5260 the corresponding value `b', and 0 otherwise. The comparison is performed 5261 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5262 ------------------------------------------------------------------------------- 5263 */ 5264 flag float128_lt( float128 a, float128 b ) 5265 { 5266 flag aSign, bSign; 5267 5268 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5269 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5270 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5271 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5272 ) { 5273 float_raise( float_flag_invalid ); 5274 return 0; 5275 } 5276 aSign = extractFloat128Sign( a ); 5277 bSign = extractFloat128Sign( b ); 5278 if ( aSign != bSign ) { 5279 return 5280 aSign 5281 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5282 != 0 ); 5283 } 5284 return 5285 aSign ? lt128( b.high, b.low, a.high, a.low ) 5286 : lt128( a.high, a.low, b.high, b.low ); 5287 5288 } 5289 5290 /* 5291 ------------------------------------------------------------------------------- 5292 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5293 the corresponding value `b', and 0 otherwise. The invalid exception is 5294 raised if either operand is a NaN. Otherwise, the comparison is performed 5295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5296 ------------------------------------------------------------------------------- 5297 */ 5298 flag float128_eq_signaling( float128 a, float128 b ) 5299 { 5300 5301 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5302 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5303 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5304 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5305 ) { 5306 float_raise( float_flag_invalid ); 5307 return 0; 5308 } 5309 return 5310 ( a.low == b.low ) 5311 && ( ( a.high == b.high ) 5312 || ( ( a.low == 0 ) 5313 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5314 ); 5315 5316 } 5317 5318 /* 5319 ------------------------------------------------------------------------------- 5320 Returns 1 if the quadruple-precision floating-point value `a' is less than 5321 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5322 cause an exception. Otherwise, the comparison is performed according to the 5323 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5324 ------------------------------------------------------------------------------- 5325 */ 5326 flag float128_le_quiet( float128 a, float128 b ) 5327 { 5328 flag aSign, bSign; 5329 5330 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5331 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5332 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5333 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5334 ) { 5335 if ( float128_is_signaling_nan( a ) 5336 || float128_is_signaling_nan( b ) ) { 5337 float_raise( float_flag_invalid ); 5338 } 5339 return 0; 5340 } 5341 aSign = extractFloat128Sign( a ); 5342 bSign = extractFloat128Sign( b ); 5343 if ( aSign != bSign ) { 5344 return 5345 aSign 5346 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5347 == 0 ); 5348 } 5349 return 5350 aSign ? le128( b.high, b.low, a.high, a.low ) 5351 : le128( a.high, a.low, b.high, b.low ); 5352 5353 } 5354 5355 /* 5356 ------------------------------------------------------------------------------- 5357 Returns 1 if the quadruple-precision floating-point value `a' is less than 5358 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5359 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5360 Standard for Binary Floating-Point Arithmetic. 5361 ------------------------------------------------------------------------------- 5362 */ 5363 flag float128_lt_quiet( float128 a, float128 b ) 5364 { 5365 flag aSign, bSign; 5366 5367 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5368 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5369 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5370 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5371 ) { 5372 if ( float128_is_signaling_nan( a ) 5373 || float128_is_signaling_nan( b ) ) { 5374 float_raise( float_flag_invalid ); 5375 } 5376 return 0; 5377 } 5378 aSign = extractFloat128Sign( a ); 5379 bSign = extractFloat128Sign( b ); 5380 if ( aSign != bSign ) { 5381 return 5382 aSign 5383 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5384 != 0 ); 5385 } 5386 return 5387 aSign ? lt128( b.high, b.low, a.high, a.low ) 5388 : lt128( a.high, a.low, b.high, b.low ); 5389 5390 } 5391 5392 #endif 5393 5394 5395 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5396 5397 /* 5398 * These two routines are not part of the original softfloat distribution. 5399 * 5400 * They are based on the corresponding conversions to integer but return 5401 * unsigned numbers instead since these functions are required by GCC. 5402 * 5403 * Added by Mark Brinicombe <mark@netbsd.org> 27/09/97 5404 * 5405 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5406 */ 5407 5408 /* 5409 ------------------------------------------------------------------------------- 5410 Returns the result of converting the double-precision floating-point value 5411 `a' to the 32-bit unsigned integer format. The conversion is 5412 performed according to the IEC/IEEE Standard for Binary Floating-point 5413 Arithmetic, except that the conversion is always rounded toward zero. If 5414 `a' is a NaN, the largest positive integer is returned. If the conversion 5415 overflows, the largest integer positive is returned. 5416 ------------------------------------------------------------------------------- 5417 */ 5418 uint32 float64_to_uint32_round_to_zero( float64 a ) 5419 { 5420 flag aSign; 5421 int16 aExp, shiftCount; 5422 bits64 aSig, savedASig; 5423 uint32 z; 5424 5425 aSig = extractFloat64Frac( a ); 5426 aExp = extractFloat64Exp( a ); 5427 aSign = extractFloat64Sign( a ); 5428 5429 if (aSign) { 5430 float_raise( float_flag_invalid ); 5431 return(0); 5432 } 5433 5434 if ( 0x41E < aExp ) { 5435 float_raise( float_flag_invalid ); 5436 return 0xffffffff; 5437 } 5438 else if ( aExp < 0x3FF ) { 5439 if ( aExp || aSig ) float_set_inexact(); 5440 return 0; 5441 } 5442 aSig |= LIT64( 0x0010000000000000 ); 5443 shiftCount = 0x433 - aExp; 5444 savedASig = aSig; 5445 aSig >>= shiftCount; 5446 z = aSig; 5447 if ( ( aSig<<shiftCount ) != savedASig ) { 5448 float_set_inexact(); 5449 } 5450 return z; 5451 5452 } 5453 5454 /* 5455 ------------------------------------------------------------------------------- 5456 Returns the result of converting the single-precision floating-point value 5457 `a' to the 32-bit unsigned integer format. The conversion is 5458 performed according to the IEC/IEEE Standard for Binary Floating-point 5459 Arithmetic, except that the conversion is always rounded toward zero. If 5460 `a' is a NaN, the largest positive integer is returned. If the conversion 5461 overflows, the largest positive integer is returned. 5462 ------------------------------------------------------------------------------- 5463 */ 5464 uint32 float32_to_uint32_round_to_zero( float32 a ) 5465 { 5466 flag aSign; 5467 int16 aExp, shiftCount; 5468 bits32 aSig; 5469 uint32 z; 5470 5471 aSig = extractFloat32Frac( a ); 5472 aExp = extractFloat32Exp( a ); 5473 aSign = extractFloat32Sign( a ); 5474 shiftCount = aExp - 0x9E; 5475 5476 if (aSign) { 5477 float_raise( float_flag_invalid ); 5478 return(0); 5479 } 5480 if ( 0 < shiftCount ) { 5481 float_raise( float_flag_invalid ); 5482 return 0xFFFFFFFF; 5483 } 5484 else if ( aExp <= 0x7E ) { 5485 if ( aExp | aSig ) float_set_inexact(); 5486 return 0; 5487 } 5488 aSig = ( aSig | 0x800000 )<<8; 5489 z = aSig>>( - shiftCount ); 5490 if ( aSig<<( shiftCount & 31 ) ) { 5491 float_set_inexact(); 5492 } 5493 return z; 5494 5495 } 5496 5497 #endif 5498