1 /* $NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:08 bjh21 Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /* 18 =============================================================================== 19 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 #include <sys/cdefs.h> 48 #if defined(LIBC_SCCS) && !defined(lint) 49 __RCSID("$NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:08 bjh21 Exp $"); 50 #endif /* LIBC_SCCS and not lint */ 51 52 #ifdef SOFTFLOAT_FOR_GCC 53 #include "softfloat-for-gcc.h" 54 #endif 55 56 #include "milieu.h" 57 #include "softfloat.h" 58 59 /* 60 * Conversions between floats as stored in memory and floats as 61 * SoftFloat uses them 62 */ 63 #ifndef FLOAT64_DEMANGLE 64 #define FLOAT64_DEMANGLE(a) (a) 65 #endif 66 #ifndef FLOAT64_MANGLE 67 #define FLOAT64_MANGLE(a) (a) 68 #endif 69 70 /* 71 ------------------------------------------------------------------------------- 72 Floating-point rounding mode, extended double-precision rounding precision, 73 and exception flags. 74 ------------------------------------------------------------------------------- 75 */ 76 fp_rnd float_rounding_mode = float_round_nearest_even; 77 fp_except float_exception_flags = 0; 78 #ifdef FLOATX80 79 int8 floatx80_rounding_precision = 80; 80 #endif 81 82 /* 83 ------------------------------------------------------------------------------- 84 Primitive arithmetic functions, including multi-word arithmetic, and 85 division and square root approximations. (Can be specialized to target if 86 desired.) 87 ------------------------------------------------------------------------------- 88 */ 89 #include "softfloat-macros" 90 91 /* 92 ------------------------------------------------------------------------------- 93 Functions and definitions to determine: (1) whether tininess for underflow 94 is detected before or after rounding by default, (2) what (if anything) 95 happens when exceptions are raised, (3) how signaling NaNs are distinguished 96 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 97 are propagated from function inputs to output. These details are target- 98 specific. 99 ------------------------------------------------------------------------------- 100 */ 101 #include "softfloat-specialize" 102 103 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128) 104 /* 105 ------------------------------------------------------------------------------- 106 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 107 and 7, and returns the properly rounded 32-bit integer corresponding to the 108 input. If `zSign' is 1, the input is negated before being converted to an 109 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 110 is simply rounded to an integer, with the inexact exception raised if the 111 input cannot be represented exactly as an integer. However, if the fixed- 112 point input is too large, the invalid exception is raised and the largest 113 positive or negative integer is returned. 114 ------------------------------------------------------------------------------- 115 */ 116 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 117 { 118 int8 roundingMode; 119 flag roundNearestEven; 120 int8 roundIncrement, roundBits; 121 int32 z; 122 123 roundingMode = float_rounding_mode; 124 roundNearestEven = ( roundingMode == float_round_nearest_even ); 125 roundIncrement = 0x40; 126 if ( ! roundNearestEven ) { 127 if ( roundingMode == float_round_to_zero ) { 128 roundIncrement = 0; 129 } 130 else { 131 roundIncrement = 0x7F; 132 if ( zSign ) { 133 if ( roundingMode == float_round_up ) roundIncrement = 0; 134 } 135 else { 136 if ( roundingMode == float_round_down ) roundIncrement = 0; 137 } 138 } 139 } 140 roundBits = absZ & 0x7F; 141 absZ = ( absZ + roundIncrement )>>7; 142 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 143 z = absZ; 144 if ( zSign ) z = - z; 145 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 146 float_raise( float_flag_invalid ); 147 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 148 } 149 if ( roundBits ) float_exception_flags |= float_flag_inexact; 150 return z; 151 152 } 153 154 /* 155 ------------------------------------------------------------------------------- 156 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 157 `absZ1', with binary point between bits 63 and 64 (between the input words), 158 and returns the properly rounded 64-bit integer corresponding to the input. 159 If `zSign' is 1, the input is negated before being converted to an integer. 160 Ordinarily, the fixed-point input is simply rounded to an integer, with 161 the inexact exception raised if the input cannot be represented exactly as 162 an integer. However, if the fixed-point input is too large, the invalid 163 exception is raised and the largest positive or negative integer is 164 returned. 165 ------------------------------------------------------------------------------- 166 */ 167 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 168 { 169 int8 roundingMode; 170 flag roundNearestEven, increment; 171 int64 z; 172 173 roundingMode = float_rounding_mode; 174 roundNearestEven = ( roundingMode == float_round_nearest_even ); 175 increment = ( (sbits64) absZ1 < 0 ); 176 if ( ! roundNearestEven ) { 177 if ( roundingMode == float_round_to_zero ) { 178 increment = 0; 179 } 180 else { 181 if ( zSign ) { 182 increment = ( roundingMode == float_round_down ) && absZ1; 183 } 184 else { 185 increment = ( roundingMode == float_round_up ) && absZ1; 186 } 187 } 188 } 189 if ( increment ) { 190 ++absZ0; 191 if ( absZ0 == 0 ) goto overflow; 192 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 193 } 194 z = absZ0; 195 if ( zSign ) z = - z; 196 if ( z && ( ( z < 0 ) ^ zSign ) ) { 197 overflow: 198 float_raise( float_flag_invalid ); 199 return 200 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 201 : LIT64( 0x7FFFFFFFFFFFFFFF ); 202 } 203 if ( absZ1 ) float_exception_flags |= float_flag_inexact; 204 return z; 205 206 } 207 #endif 208 209 /* 210 ------------------------------------------------------------------------------- 211 Returns the fraction bits of the single-precision floating-point value `a'. 212 ------------------------------------------------------------------------------- 213 */ 214 INLINE bits32 extractFloat32Frac( float32 a ) 215 { 216 217 return a & 0x007FFFFF; 218 219 } 220 221 /* 222 ------------------------------------------------------------------------------- 223 Returns the exponent bits of the single-precision floating-point value `a'. 224 ------------------------------------------------------------------------------- 225 */ 226 INLINE int16 extractFloat32Exp( float32 a ) 227 { 228 229 return ( a>>23 ) & 0xFF; 230 231 } 232 233 /* 234 ------------------------------------------------------------------------------- 235 Returns the sign bit of the single-precision floating-point value `a'. 236 ------------------------------------------------------------------------------- 237 */ 238 INLINE flag extractFloat32Sign( float32 a ) 239 { 240 241 return a>>31; 242 243 } 244 245 /* 246 ------------------------------------------------------------------------------- 247 Normalizes the subnormal single-precision floating-point value represented 248 by the denormalized significand `aSig'. The normalized exponent and 249 significand are stored at the locations pointed to by `zExpPtr' and 250 `zSigPtr', respectively. 251 ------------------------------------------------------------------------------- 252 */ 253 static void 254 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 255 { 256 int8 shiftCount; 257 258 shiftCount = countLeadingZeros32( aSig ) - 8; 259 *zSigPtr = aSig<<shiftCount; 260 *zExpPtr = 1 - shiftCount; 261 262 } 263 264 /* 265 ------------------------------------------------------------------------------- 266 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 267 single-precision floating-point value, returning the result. After being 268 shifted into the proper positions, the three fields are simply added 269 together to form the result. This means that any integer portion of `zSig' 270 will be added into the exponent. Since a properly normalized significand 271 will have an integer portion equal to 1, the `zExp' input should be 1 less 272 than the desired result exponent whenever `zSig' is a complete, normalized 273 significand. 274 ------------------------------------------------------------------------------- 275 */ 276 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 277 { 278 279 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 280 281 } 282 283 /* 284 ------------------------------------------------------------------------------- 285 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 286 and significand `zSig', and returns the proper single-precision floating- 287 point value corresponding to the abstract input. Ordinarily, the abstract 288 value is simply rounded and packed into the single-precision format, with 289 the inexact exception raised if the abstract input cannot be represented 290 exactly. However, if the abstract value is too large, the overflow and 291 inexact exceptions are raised and an infinity or maximal finite value is 292 returned. If the abstract value is too small, the input value is rounded to 293 a subnormal number, and the underflow and inexact exceptions are raised if 294 the abstract input cannot be represented exactly as a subnormal single- 295 precision floating-point number. 296 The input significand `zSig' has its binary point between bits 30 297 and 29, which is 7 bits to the left of the usual location. This shifted 298 significand must be normalized or smaller. If `zSig' is not normalized, 299 `zExp' must be 0; in that case, the result returned is a subnormal number, 300 and it must not require rounding. In the usual case that `zSig' is 301 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 302 The handling of underflow and overflow follows the IEC/IEEE Standard for 303 Binary Floating-Point Arithmetic. 304 ------------------------------------------------------------------------------- 305 */ 306 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 307 { 308 int8 roundingMode; 309 flag roundNearestEven; 310 int8 roundIncrement, roundBits; 311 flag isTiny; 312 313 roundingMode = float_rounding_mode; 314 roundNearestEven = ( roundingMode == float_round_nearest_even ); 315 roundIncrement = 0x40; 316 if ( ! roundNearestEven ) { 317 if ( roundingMode == float_round_to_zero ) { 318 roundIncrement = 0; 319 } 320 else { 321 roundIncrement = 0x7F; 322 if ( zSign ) { 323 if ( roundingMode == float_round_up ) roundIncrement = 0; 324 } 325 else { 326 if ( roundingMode == float_round_down ) roundIncrement = 0; 327 } 328 } 329 } 330 roundBits = zSig & 0x7F; 331 if ( 0xFD <= (bits16) zExp ) { 332 if ( ( 0xFD < zExp ) 333 || ( ( zExp == 0xFD ) 334 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 335 ) { 336 float_raise( float_flag_overflow | float_flag_inexact ); 337 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 338 } 339 if ( zExp < 0 ) { 340 isTiny = 341 ( float_detect_tininess == float_tininess_before_rounding ) 342 || ( zExp < -1 ) 343 || ( zSig + roundIncrement < 0x80000000 ); 344 shift32RightJamming( zSig, - zExp, &zSig ); 345 zExp = 0; 346 roundBits = zSig & 0x7F; 347 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 348 } 349 } 350 if ( roundBits ) float_exception_flags |= float_flag_inexact; 351 zSig = ( zSig + roundIncrement )>>7; 352 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 353 if ( zSig == 0 ) zExp = 0; 354 return packFloat32( zSign, zExp, zSig ); 355 356 } 357 358 /* 359 ------------------------------------------------------------------------------- 360 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 361 and significand `zSig', and returns the proper single-precision floating- 362 point value corresponding to the abstract input. This routine is just like 363 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 364 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 365 floating-point exponent. 366 ------------------------------------------------------------------------------- 367 */ 368 static float32 369 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 370 { 371 int8 shiftCount; 372 373 shiftCount = countLeadingZeros32( zSig ) - 1; 374 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 375 376 } 377 378 /* 379 ------------------------------------------------------------------------------- 380 Returns the fraction bits of the double-precision floating-point value `a'. 381 ------------------------------------------------------------------------------- 382 */ 383 INLINE bits64 extractFloat64Frac( float64 a ) 384 { 385 386 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 387 388 } 389 390 /* 391 ------------------------------------------------------------------------------- 392 Returns the exponent bits of the double-precision floating-point value `a'. 393 ------------------------------------------------------------------------------- 394 */ 395 INLINE int16 extractFloat64Exp( float64 a ) 396 { 397 398 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 399 400 } 401 402 /* 403 ------------------------------------------------------------------------------- 404 Returns the sign bit of the double-precision floating-point value `a'. 405 ------------------------------------------------------------------------------- 406 */ 407 INLINE flag extractFloat64Sign( float64 a ) 408 { 409 410 return FLOAT64_DEMANGLE(a)>>63; 411 412 } 413 414 /* 415 ------------------------------------------------------------------------------- 416 Normalizes the subnormal double-precision floating-point value represented 417 by the denormalized significand `aSig'. The normalized exponent and 418 significand are stored at the locations pointed to by `zExpPtr' and 419 `zSigPtr', respectively. 420 ------------------------------------------------------------------------------- 421 */ 422 static void 423 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 424 { 425 int8 shiftCount; 426 427 shiftCount = countLeadingZeros64( aSig ) - 11; 428 *zSigPtr = aSig<<shiftCount; 429 *zExpPtr = 1 - shiftCount; 430 431 } 432 433 /* 434 ------------------------------------------------------------------------------- 435 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 436 double-precision floating-point value, returning the result. After being 437 shifted into the proper positions, the three fields are simply added 438 together to form the result. This means that any integer portion of `zSig' 439 will be added into the exponent. Since a properly normalized significand 440 will have an integer portion equal to 1, the `zExp' input should be 1 less 441 than the desired result exponent whenever `zSig' is a complete, normalized 442 significand. 443 ------------------------------------------------------------------------------- 444 */ 445 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 446 { 447 448 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 449 ( ( (bits64) zExp )<<52 ) + zSig ); 450 451 } 452 453 /* 454 ------------------------------------------------------------------------------- 455 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 456 and significand `zSig', and returns the proper double-precision floating- 457 point value corresponding to the abstract input. Ordinarily, the abstract 458 value is simply rounded and packed into the double-precision format, with 459 the inexact exception raised if the abstract input cannot be represented 460 exactly. However, if the abstract value is too large, the overflow and 461 inexact exceptions are raised and an infinity or maximal finite value is 462 returned. If the abstract value is too small, the input value is rounded to 463 a subnormal number, and the underflow and inexact exceptions are raised if 464 the abstract input cannot be represented exactly as a subnormal double- 465 precision floating-point number. 466 The input significand `zSig' has its binary point between bits 62 467 and 61, which is 10 bits to the left of the usual location. This shifted 468 significand must be normalized or smaller. If `zSig' is not normalized, 469 `zExp' must be 0; in that case, the result returned is a subnormal number, 470 and it must not require rounding. In the usual case that `zSig' is 471 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 472 The handling of underflow and overflow follows the IEC/IEEE Standard for 473 Binary Floating-Point Arithmetic. 474 ------------------------------------------------------------------------------- 475 */ 476 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 477 { 478 int8 roundingMode; 479 flag roundNearestEven; 480 int16 roundIncrement, roundBits; 481 flag isTiny; 482 483 roundingMode = float_rounding_mode; 484 roundNearestEven = ( roundingMode == float_round_nearest_even ); 485 roundIncrement = 0x200; 486 if ( ! roundNearestEven ) { 487 if ( roundingMode == float_round_to_zero ) { 488 roundIncrement = 0; 489 } 490 else { 491 roundIncrement = 0x3FF; 492 if ( zSign ) { 493 if ( roundingMode == float_round_up ) roundIncrement = 0; 494 } 495 else { 496 if ( roundingMode == float_round_down ) roundIncrement = 0; 497 } 498 } 499 } 500 roundBits = zSig & 0x3FF; 501 if ( 0x7FD <= (bits16) zExp ) { 502 if ( ( 0x7FD < zExp ) 503 || ( ( zExp == 0x7FD ) 504 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 505 ) { 506 float_raise( float_flag_overflow | float_flag_inexact ); 507 return FLOAT64_MANGLE( 508 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 509 ( roundIncrement == 0 )); 510 } 511 if ( zExp < 0 ) { 512 isTiny = 513 ( float_detect_tininess == float_tininess_before_rounding ) 514 || ( zExp < -1 ) 515 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 516 shift64RightJamming( zSig, - zExp, &zSig ); 517 zExp = 0; 518 roundBits = zSig & 0x3FF; 519 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 520 } 521 } 522 if ( roundBits ) float_exception_flags |= float_flag_inexact; 523 zSig = ( zSig + roundIncrement )>>10; 524 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 525 if ( zSig == 0 ) zExp = 0; 526 return packFloat64( zSign, zExp, zSig ); 527 528 } 529 530 /* 531 ------------------------------------------------------------------------------- 532 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 533 and significand `zSig', and returns the proper double-precision floating- 534 point value corresponding to the abstract input. This routine is just like 535 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 536 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 537 floating-point exponent. 538 ------------------------------------------------------------------------------- 539 */ 540 static float64 541 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 542 { 543 int8 shiftCount; 544 545 shiftCount = countLeadingZeros64( zSig ) - 1; 546 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 547 548 } 549 550 #ifdef FLOATX80 551 552 /* 553 ------------------------------------------------------------------------------- 554 Returns the fraction bits of the extended double-precision floating-point 555 value `a'. 556 ------------------------------------------------------------------------------- 557 */ 558 INLINE bits64 extractFloatx80Frac( floatx80 a ) 559 { 560 561 return a.low; 562 563 } 564 565 /* 566 ------------------------------------------------------------------------------- 567 Returns the exponent bits of the extended double-precision floating-point 568 value `a'. 569 ------------------------------------------------------------------------------- 570 */ 571 INLINE int32 extractFloatx80Exp( floatx80 a ) 572 { 573 574 return a.high & 0x7FFF; 575 576 } 577 578 /* 579 ------------------------------------------------------------------------------- 580 Returns the sign bit of the extended double-precision floating-point value 581 `a'. 582 ------------------------------------------------------------------------------- 583 */ 584 INLINE flag extractFloatx80Sign( floatx80 a ) 585 { 586 587 return a.high>>15; 588 589 } 590 591 /* 592 ------------------------------------------------------------------------------- 593 Normalizes the subnormal extended double-precision floating-point value 594 represented by the denormalized significand `aSig'. The normalized exponent 595 and significand are stored at the locations pointed to by `zExpPtr' and 596 `zSigPtr', respectively. 597 ------------------------------------------------------------------------------- 598 */ 599 static void 600 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 601 { 602 int8 shiftCount; 603 604 shiftCount = countLeadingZeros64( aSig ); 605 *zSigPtr = aSig<<shiftCount; 606 *zExpPtr = 1 - shiftCount; 607 608 } 609 610 /* 611 ------------------------------------------------------------------------------- 612 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 613 extended double-precision floating-point value, returning the result. 614 ------------------------------------------------------------------------------- 615 */ 616 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 617 { 618 floatx80 z; 619 620 z.low = zSig; 621 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 622 return z; 623 624 } 625 626 /* 627 ------------------------------------------------------------------------------- 628 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 629 and extended significand formed by the concatenation of `zSig0' and `zSig1', 630 and returns the proper extended double-precision floating-point value 631 corresponding to the abstract input. Ordinarily, the abstract value is 632 rounded and packed into the extended double-precision format, with the 633 inexact exception raised if the abstract input cannot be represented 634 exactly. However, if the abstract value is too large, the overflow and 635 inexact exceptions are raised and an infinity or maximal finite value is 636 returned. If the abstract value is too small, the input value is rounded to 637 a subnormal number, and the underflow and inexact exceptions are raised if 638 the abstract input cannot be represented exactly as a subnormal extended 639 double-precision floating-point number. 640 If `roundingPrecision' is 32 or 64, the result is rounded to the same 641 number of bits as single or double precision, respectively. Otherwise, the 642 result is rounded to the full precision of the extended double-precision 643 format. 644 The input significand must be normalized or smaller. If the input 645 significand is not normalized, `zExp' must be 0; in that case, the result 646 returned is a subnormal number, and it must not require rounding. The 647 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 648 Floating-Point Arithmetic. 649 ------------------------------------------------------------------------------- 650 */ 651 static floatx80 652 roundAndPackFloatx80( 653 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 654 ) 655 { 656 int8 roundingMode; 657 flag roundNearestEven, increment, isTiny; 658 int64 roundIncrement, roundMask, roundBits; 659 660 roundingMode = float_rounding_mode; 661 roundNearestEven = ( roundingMode == float_round_nearest_even ); 662 if ( roundingPrecision == 80 ) goto precision80; 663 if ( roundingPrecision == 64 ) { 664 roundIncrement = LIT64( 0x0000000000000400 ); 665 roundMask = LIT64( 0x00000000000007FF ); 666 } 667 else if ( roundingPrecision == 32 ) { 668 roundIncrement = LIT64( 0x0000008000000000 ); 669 roundMask = LIT64( 0x000000FFFFFFFFFF ); 670 } 671 else { 672 goto precision80; 673 } 674 zSig0 |= ( zSig1 != 0 ); 675 if ( ! roundNearestEven ) { 676 if ( roundingMode == float_round_to_zero ) { 677 roundIncrement = 0; 678 } 679 else { 680 roundIncrement = roundMask; 681 if ( zSign ) { 682 if ( roundingMode == float_round_up ) roundIncrement = 0; 683 } 684 else { 685 if ( roundingMode == float_round_down ) roundIncrement = 0; 686 } 687 } 688 } 689 roundBits = zSig0 & roundMask; 690 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 691 if ( ( 0x7FFE < zExp ) 692 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 693 ) { 694 goto overflow; 695 } 696 if ( zExp <= 0 ) { 697 isTiny = 698 ( float_detect_tininess == float_tininess_before_rounding ) 699 || ( zExp < 0 ) 700 || ( zSig0 <= zSig0 + roundIncrement ); 701 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 702 zExp = 0; 703 roundBits = zSig0 & roundMask; 704 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 705 if ( roundBits ) float_exception_flags |= float_flag_inexact; 706 zSig0 += roundIncrement; 707 if ( (sbits64) zSig0 < 0 ) zExp = 1; 708 roundIncrement = roundMask + 1; 709 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 710 roundMask |= roundIncrement; 711 } 712 zSig0 &= ~ roundMask; 713 return packFloatx80( zSign, zExp, zSig0 ); 714 } 715 } 716 if ( roundBits ) float_exception_flags |= float_flag_inexact; 717 zSig0 += roundIncrement; 718 if ( zSig0 < roundIncrement ) { 719 ++zExp; 720 zSig0 = LIT64( 0x8000000000000000 ); 721 } 722 roundIncrement = roundMask + 1; 723 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 724 roundMask |= roundIncrement; 725 } 726 zSig0 &= ~ roundMask; 727 if ( zSig0 == 0 ) zExp = 0; 728 return packFloatx80( zSign, zExp, zSig0 ); 729 precision80: 730 increment = ( (sbits64) zSig1 < 0 ); 731 if ( ! roundNearestEven ) { 732 if ( roundingMode == float_round_to_zero ) { 733 increment = 0; 734 } 735 else { 736 if ( zSign ) { 737 increment = ( roundingMode == float_round_down ) && zSig1; 738 } 739 else { 740 increment = ( roundingMode == float_round_up ) && zSig1; 741 } 742 } 743 } 744 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 745 if ( ( 0x7FFE < zExp ) 746 || ( ( zExp == 0x7FFE ) 747 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 748 && increment 749 ) 750 ) { 751 roundMask = 0; 752 overflow: 753 float_raise( float_flag_overflow | float_flag_inexact ); 754 if ( ( roundingMode == float_round_to_zero ) 755 || ( zSign && ( roundingMode == float_round_up ) ) 756 || ( ! zSign && ( roundingMode == float_round_down ) ) 757 ) { 758 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 759 } 760 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 761 } 762 if ( zExp <= 0 ) { 763 isTiny = 764 ( float_detect_tininess == float_tininess_before_rounding ) 765 || ( zExp < 0 ) 766 || ! increment 767 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 768 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 769 zExp = 0; 770 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 771 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 772 if ( roundNearestEven ) { 773 increment = ( (sbits64) zSig1 < 0 ); 774 } 775 else { 776 if ( zSign ) { 777 increment = ( roundingMode == float_round_down ) && zSig1; 778 } 779 else { 780 increment = ( roundingMode == float_round_up ) && zSig1; 781 } 782 } 783 if ( increment ) { 784 ++zSig0; 785 zSig0 &= 786 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 787 if ( (sbits64) zSig0 < 0 ) zExp = 1; 788 } 789 return packFloatx80( zSign, zExp, zSig0 ); 790 } 791 } 792 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 793 if ( increment ) { 794 ++zSig0; 795 if ( zSig0 == 0 ) { 796 ++zExp; 797 zSig0 = LIT64( 0x8000000000000000 ); 798 } 799 else { 800 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 801 } 802 } 803 else { 804 if ( zSig0 == 0 ) zExp = 0; 805 } 806 return packFloatx80( zSign, zExp, zSig0 ); 807 808 } 809 810 /* 811 ------------------------------------------------------------------------------- 812 Takes an abstract floating-point value having sign `zSign', exponent 813 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 814 and returns the proper extended double-precision floating-point value 815 corresponding to the abstract input. This routine is just like 816 `roundAndPackFloatx80' except that the input significand does not have to be 817 normalized. 818 ------------------------------------------------------------------------------- 819 */ 820 static floatx80 821 normalizeRoundAndPackFloatx80( 822 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 823 ) 824 { 825 int8 shiftCount; 826 827 if ( zSig0 == 0 ) { 828 zSig0 = zSig1; 829 zSig1 = 0; 830 zExp -= 64; 831 } 832 shiftCount = countLeadingZeros64( zSig0 ); 833 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 834 zExp -= shiftCount; 835 return 836 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 837 838 } 839 840 #endif 841 842 #ifdef FLOAT128 843 844 /* 845 ------------------------------------------------------------------------------- 846 Returns the least-significant 64 fraction bits of the quadruple-precision 847 floating-point value `a'. 848 ------------------------------------------------------------------------------- 849 */ 850 INLINE bits64 extractFloat128Frac1( float128 a ) 851 { 852 853 return a.low; 854 855 } 856 857 /* 858 ------------------------------------------------------------------------------- 859 Returns the most-significant 48 fraction bits of the quadruple-precision 860 floating-point value `a'. 861 ------------------------------------------------------------------------------- 862 */ 863 INLINE bits64 extractFloat128Frac0( float128 a ) 864 { 865 866 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 867 868 } 869 870 /* 871 ------------------------------------------------------------------------------- 872 Returns the exponent bits of the quadruple-precision floating-point value 873 `a'. 874 ------------------------------------------------------------------------------- 875 */ 876 INLINE int32 extractFloat128Exp( float128 a ) 877 { 878 879 return ( a.high>>48 ) & 0x7FFF; 880 881 } 882 883 /* 884 ------------------------------------------------------------------------------- 885 Returns the sign bit of the quadruple-precision floating-point value `a'. 886 ------------------------------------------------------------------------------- 887 */ 888 INLINE flag extractFloat128Sign( float128 a ) 889 { 890 891 return a.high>>63; 892 893 } 894 895 /* 896 ------------------------------------------------------------------------------- 897 Normalizes the subnormal quadruple-precision floating-point value 898 represented by the denormalized significand formed by the concatenation of 899 `aSig0' and `aSig1'. The normalized exponent is stored at the location 900 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 901 significand are stored at the location pointed to by `zSig0Ptr', and the 902 least significant 64 bits of the normalized significand are stored at the 903 location pointed to by `zSig1Ptr'. 904 ------------------------------------------------------------------------------- 905 */ 906 static void 907 normalizeFloat128Subnormal( 908 bits64 aSig0, 909 bits64 aSig1, 910 int32 *zExpPtr, 911 bits64 *zSig0Ptr, 912 bits64 *zSig1Ptr 913 ) 914 { 915 int8 shiftCount; 916 917 if ( aSig0 == 0 ) { 918 shiftCount = countLeadingZeros64( aSig1 ) - 15; 919 if ( shiftCount < 0 ) { 920 *zSig0Ptr = aSig1>>( - shiftCount ); 921 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 922 } 923 else { 924 *zSig0Ptr = aSig1<<shiftCount; 925 *zSig1Ptr = 0; 926 } 927 *zExpPtr = - shiftCount - 63; 928 } 929 else { 930 shiftCount = countLeadingZeros64( aSig0 ) - 15; 931 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 932 *zExpPtr = 1 - shiftCount; 933 } 934 935 } 936 937 /* 938 ------------------------------------------------------------------------------- 939 Packs the sign `zSign', the exponent `zExp', and the significand formed 940 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 941 floating-point value, returning the result. After being shifted into the 942 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 943 added together to form the most significant 32 bits of the result. This 944 means that any integer portion of `zSig0' will be added into the exponent. 945 Since a properly normalized significand will have an integer portion equal 946 to 1, the `zExp' input should be 1 less than the desired result exponent 947 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 948 significand. 949 ------------------------------------------------------------------------------- 950 */ 951 INLINE float128 952 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 953 { 954 float128 z; 955 956 z.low = zSig1; 957 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 958 return z; 959 960 } 961 962 /* 963 ------------------------------------------------------------------------------- 964 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 965 and extended significand formed by the concatenation of `zSig0', `zSig1', 966 and `zSig2', and returns the proper quadruple-precision floating-point value 967 corresponding to the abstract input. Ordinarily, the abstract value is 968 simply rounded and packed into the quadruple-precision format, with the 969 inexact exception raised if the abstract input cannot be represented 970 exactly. However, if the abstract value is too large, the overflow and 971 inexact exceptions are raised and an infinity or maximal finite value is 972 returned. If the abstract value is too small, the input value is rounded to 973 a subnormal number, and the underflow and inexact exceptions are raised if 974 the abstract input cannot be represented exactly as a subnormal quadruple- 975 precision floating-point number. 976 The input significand must be normalized or smaller. If the input 977 significand is not normalized, `zExp' must be 0; in that case, the result 978 returned is a subnormal number, and it must not require rounding. In the 979 usual case that the input significand is normalized, `zExp' must be 1 less 980 than the ``true'' floating-point exponent. The handling of underflow and 981 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 982 ------------------------------------------------------------------------------- 983 */ 984 static float128 985 roundAndPackFloat128( 986 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 987 { 988 int8 roundingMode; 989 flag roundNearestEven, increment, isTiny; 990 991 roundingMode = float_rounding_mode; 992 roundNearestEven = ( roundingMode == float_round_nearest_even ); 993 increment = ( (sbits64) zSig2 < 0 ); 994 if ( ! roundNearestEven ) { 995 if ( roundingMode == float_round_to_zero ) { 996 increment = 0; 997 } 998 else { 999 if ( zSign ) { 1000 increment = ( roundingMode == float_round_down ) && zSig2; 1001 } 1002 else { 1003 increment = ( roundingMode == float_round_up ) && zSig2; 1004 } 1005 } 1006 } 1007 if ( 0x7FFD <= (bits32) zExp ) { 1008 if ( ( 0x7FFD < zExp ) 1009 || ( ( zExp == 0x7FFD ) 1010 && eq128( 1011 LIT64( 0x0001FFFFFFFFFFFF ), 1012 LIT64( 0xFFFFFFFFFFFFFFFF ), 1013 zSig0, 1014 zSig1 1015 ) 1016 && increment 1017 ) 1018 ) { 1019 float_raise( float_flag_overflow | float_flag_inexact ); 1020 if ( ( roundingMode == float_round_to_zero ) 1021 || ( zSign && ( roundingMode == float_round_up ) ) 1022 || ( ! zSign && ( roundingMode == float_round_down ) ) 1023 ) { 1024 return 1025 packFloat128( 1026 zSign, 1027 0x7FFE, 1028 LIT64( 0x0000FFFFFFFFFFFF ), 1029 LIT64( 0xFFFFFFFFFFFFFFFF ) 1030 ); 1031 } 1032 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1033 } 1034 if ( zExp < 0 ) { 1035 isTiny = 1036 ( float_detect_tininess == float_tininess_before_rounding ) 1037 || ( zExp < -1 ) 1038 || ! increment 1039 || lt128( 1040 zSig0, 1041 zSig1, 1042 LIT64( 0x0001FFFFFFFFFFFF ), 1043 LIT64( 0xFFFFFFFFFFFFFFFF ) 1044 ); 1045 shift128ExtraRightJamming( 1046 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1047 zExp = 0; 1048 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1049 if ( roundNearestEven ) { 1050 increment = ( (sbits64) zSig2 < 0 ); 1051 } 1052 else { 1053 if ( zSign ) { 1054 increment = ( roundingMode == float_round_down ) && zSig2; 1055 } 1056 else { 1057 increment = ( roundingMode == float_round_up ) && zSig2; 1058 } 1059 } 1060 } 1061 } 1062 if ( zSig2 ) float_exception_flags |= float_flag_inexact; 1063 if ( increment ) { 1064 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1065 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1066 } 1067 else { 1068 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1069 } 1070 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1071 1072 } 1073 1074 /* 1075 ------------------------------------------------------------------------------- 1076 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1077 and significand formed by the concatenation of `zSig0' and `zSig1', and 1078 returns the proper quadruple-precision floating-point value corresponding 1079 to the abstract input. This routine is just like `roundAndPackFloat128' 1080 except that the input significand has fewer bits and does not have to be 1081 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1082 point exponent. 1083 ------------------------------------------------------------------------------- 1084 */ 1085 static float128 1086 normalizeRoundAndPackFloat128( 1087 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1088 { 1089 int8 shiftCount; 1090 bits64 zSig2; 1091 1092 if ( zSig0 == 0 ) { 1093 zSig0 = zSig1; 1094 zSig1 = 0; 1095 zExp -= 64; 1096 } 1097 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1098 if ( 0 <= shiftCount ) { 1099 zSig2 = 0; 1100 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1101 } 1102 else { 1103 shift128ExtraRightJamming( 1104 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1105 } 1106 zExp -= shiftCount; 1107 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1108 1109 } 1110 1111 #endif 1112 1113 /* 1114 ------------------------------------------------------------------------------- 1115 Returns the result of converting the 32-bit two's complement integer `a' 1116 to the single-precision floating-point format. The conversion is performed 1117 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1118 ------------------------------------------------------------------------------- 1119 */ 1120 float32 int32_to_float32( int32 a ) 1121 { 1122 flag zSign; 1123 1124 if ( a == 0 ) return 0; 1125 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1126 zSign = ( a < 0 ); 1127 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1128 1129 } 1130 1131 /* 1132 ------------------------------------------------------------------------------- 1133 Returns the result of converting the 32-bit two's complement integer `a' 1134 to the double-precision floating-point format. The conversion is performed 1135 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1136 ------------------------------------------------------------------------------- 1137 */ 1138 float64 int32_to_float64( int32 a ) 1139 { 1140 flag zSign; 1141 uint32 absA; 1142 int8 shiftCount; 1143 bits64 zSig; 1144 1145 if ( a == 0 ) return 0; 1146 zSign = ( a < 0 ); 1147 absA = zSign ? - a : a; 1148 shiftCount = countLeadingZeros32( absA ) + 21; 1149 zSig = absA; 1150 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1151 1152 } 1153 1154 #ifdef FLOATX80 1155 1156 /* 1157 ------------------------------------------------------------------------------- 1158 Returns the result of converting the 32-bit two's complement integer `a' 1159 to the extended double-precision floating-point format. The conversion 1160 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1161 Arithmetic. 1162 ------------------------------------------------------------------------------- 1163 */ 1164 floatx80 int32_to_floatx80( int32 a ) 1165 { 1166 flag zSign; 1167 uint32 absA; 1168 int8 shiftCount; 1169 bits64 zSig; 1170 1171 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1172 zSign = ( a < 0 ); 1173 absA = zSign ? - a : a; 1174 shiftCount = countLeadingZeros32( absA ) + 32; 1175 zSig = absA; 1176 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1177 1178 } 1179 1180 #endif 1181 1182 #ifdef FLOAT128 1183 1184 /* 1185 ------------------------------------------------------------------------------- 1186 Returns the result of converting the 32-bit two's complement integer `a' to 1187 the quadruple-precision floating-point format. The conversion is performed 1188 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1189 ------------------------------------------------------------------------------- 1190 */ 1191 float128 int32_to_float128( int32 a ) 1192 { 1193 flag zSign; 1194 uint32 absA; 1195 int8 shiftCount; 1196 bits64 zSig0; 1197 1198 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1199 zSign = ( a < 0 ); 1200 absA = zSign ? - a : a; 1201 shiftCount = countLeadingZeros32( absA ) + 17; 1202 zSig0 = absA; 1203 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1204 1205 } 1206 1207 #endif 1208 1209 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1210 /* 1211 ------------------------------------------------------------------------------- 1212 Returns the result of converting the 64-bit two's complement integer `a' 1213 to the single-precision floating-point format. The conversion is performed 1214 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1215 ------------------------------------------------------------------------------- 1216 */ 1217 float32 int64_to_float32( int64 a ) 1218 { 1219 flag zSign; 1220 uint64 absA; 1221 int8 shiftCount; 1222 1223 if ( a == 0 ) return 0; 1224 zSign = ( a < 0 ); 1225 absA = zSign ? - a : a; 1226 shiftCount = countLeadingZeros64( absA ) - 40; 1227 if ( 0 <= shiftCount ) { 1228 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1229 } 1230 else { 1231 shiftCount += 7; 1232 if ( shiftCount < 0 ) { 1233 shift64RightJamming( absA, - shiftCount, &absA ); 1234 } 1235 else { 1236 absA <<= shiftCount; 1237 } 1238 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1239 } 1240 1241 } 1242 1243 /* 1244 ------------------------------------------------------------------------------- 1245 Returns the result of converting the 64-bit two's complement integer `a' 1246 to the double-precision floating-point format. The conversion is performed 1247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1248 ------------------------------------------------------------------------------- 1249 */ 1250 float64 int64_to_float64( int64 a ) 1251 { 1252 flag zSign; 1253 1254 if ( a == 0 ) return 0; 1255 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1256 return packFloat64( 1, 0x43E, 0 ); 1257 } 1258 zSign = ( a < 0 ); 1259 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1260 1261 } 1262 1263 #ifdef FLOATX80 1264 1265 /* 1266 ------------------------------------------------------------------------------- 1267 Returns the result of converting the 64-bit two's complement integer `a' 1268 to the extended double-precision floating-point format. The conversion 1269 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1270 Arithmetic. 1271 ------------------------------------------------------------------------------- 1272 */ 1273 floatx80 int64_to_floatx80( int64 a ) 1274 { 1275 flag zSign; 1276 uint64 absA; 1277 int8 shiftCount; 1278 1279 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1280 zSign = ( a < 0 ); 1281 absA = zSign ? - a : a; 1282 shiftCount = countLeadingZeros64( absA ); 1283 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1284 1285 } 1286 1287 #endif 1288 1289 #endif /* !SOFTFLOAT_FOR_GCC */ 1290 1291 #ifdef FLOAT128 1292 1293 /* 1294 ------------------------------------------------------------------------------- 1295 Returns the result of converting the 64-bit two's complement integer `a' to 1296 the quadruple-precision floating-point format. The conversion is performed 1297 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1298 ------------------------------------------------------------------------------- 1299 */ 1300 float128 int64_to_float128( int64 a ) 1301 { 1302 flag zSign; 1303 uint64 absA; 1304 int8 shiftCount; 1305 int32 zExp; 1306 bits64 zSig0, zSig1; 1307 1308 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1309 zSign = ( a < 0 ); 1310 absA = zSign ? - a : a; 1311 shiftCount = countLeadingZeros64( absA ) + 49; 1312 zExp = 0x406E - shiftCount; 1313 if ( 64 <= shiftCount ) { 1314 zSig1 = 0; 1315 zSig0 = absA; 1316 shiftCount -= 64; 1317 } 1318 else { 1319 zSig1 = absA; 1320 zSig0 = 0; 1321 } 1322 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1323 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1324 1325 } 1326 1327 #endif 1328 1329 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1330 /* 1331 ------------------------------------------------------------------------------- 1332 Returns the result of converting the single-precision floating-point value 1333 `a' to the 32-bit two's complement integer format. The conversion is 1334 performed according to the IEC/IEEE Standard for Binary Floating-Point 1335 Arithmetic---which means in particular that the conversion is rounded 1336 according to the current rounding mode. If `a' is a NaN, the largest 1337 positive integer is returned. Otherwise, if the conversion overflows, the 1338 largest integer with the same sign as `a' is returned. 1339 ------------------------------------------------------------------------------- 1340 */ 1341 int32 float32_to_int32( float32 a ) 1342 { 1343 flag aSign; 1344 int16 aExp, shiftCount; 1345 bits32 aSig; 1346 bits64 aSig64; 1347 1348 aSig = extractFloat32Frac( a ); 1349 aExp = extractFloat32Exp( a ); 1350 aSign = extractFloat32Sign( a ); 1351 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1352 if ( aExp ) aSig |= 0x00800000; 1353 shiftCount = 0xAF - aExp; 1354 aSig64 = aSig; 1355 aSig64 <<= 32; 1356 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1357 return roundAndPackInt32( aSign, aSig64 ); 1358 1359 } 1360 #endif /* !SOFTFLOAT_FOR_GCC */ 1361 1362 /* 1363 ------------------------------------------------------------------------------- 1364 Returns the result of converting the single-precision floating-point value 1365 `a' to the 32-bit two's complement integer format. The conversion is 1366 performed according to the IEC/IEEE Standard for Binary Floating-Point 1367 Arithmetic, except that the conversion is always rounded toward zero. 1368 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1369 the conversion overflows, the largest integer with the same sign as `a' is 1370 returned. 1371 ------------------------------------------------------------------------------- 1372 */ 1373 int32 float32_to_int32_round_to_zero( float32 a ) 1374 { 1375 flag aSign; 1376 int16 aExp, shiftCount; 1377 bits32 aSig; 1378 int32 z; 1379 1380 aSig = extractFloat32Frac( a ); 1381 aExp = extractFloat32Exp( a ); 1382 aSign = extractFloat32Sign( a ); 1383 shiftCount = aExp - 0x9E; 1384 if ( 0 <= shiftCount ) { 1385 if ( a != 0xCF000000 ) { 1386 float_raise( float_flag_invalid ); 1387 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1388 } 1389 return (sbits32) 0x80000000; 1390 } 1391 else if ( aExp <= 0x7E ) { 1392 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 1393 return 0; 1394 } 1395 aSig = ( aSig | 0x00800000 )<<8; 1396 z = aSig>>( - shiftCount ); 1397 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1398 float_exception_flags |= float_flag_inexact; 1399 } 1400 if ( aSign ) z = - z; 1401 return z; 1402 1403 } 1404 1405 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1406 /* 1407 ------------------------------------------------------------------------------- 1408 Returns the result of converting the single-precision floating-point value 1409 `a' to the 64-bit two's complement integer format. The conversion is 1410 performed according to the IEC/IEEE Standard for Binary Floating-Point 1411 Arithmetic---which means in particular that the conversion is rounded 1412 according to the current rounding mode. If `a' is a NaN, the largest 1413 positive integer is returned. Otherwise, if the conversion overflows, the 1414 largest integer with the same sign as `a' is returned. 1415 ------------------------------------------------------------------------------- 1416 */ 1417 int64 float32_to_int64( float32 a ) 1418 { 1419 flag aSign; 1420 int16 aExp, shiftCount; 1421 bits32 aSig; 1422 bits64 aSig64, aSigExtra; 1423 1424 aSig = extractFloat32Frac( a ); 1425 aExp = extractFloat32Exp( a ); 1426 aSign = extractFloat32Sign( a ); 1427 shiftCount = 0xBE - aExp; 1428 if ( shiftCount < 0 ) { 1429 float_raise( float_flag_invalid ); 1430 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1431 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1432 } 1433 return (sbits64) LIT64( 0x8000000000000000 ); 1434 } 1435 if ( aExp ) aSig |= 0x00800000; 1436 aSig64 = aSig; 1437 aSig64 <<= 40; 1438 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1439 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1440 1441 } 1442 1443 /* 1444 ------------------------------------------------------------------------------- 1445 Returns the result of converting the single-precision floating-point value 1446 `a' to the 64-bit two's complement integer format. The conversion is 1447 performed according to the IEC/IEEE Standard for Binary Floating-Point 1448 Arithmetic, except that the conversion is always rounded toward zero. If 1449 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1450 conversion overflows, the largest integer with the same sign as `a' is 1451 returned. 1452 ------------------------------------------------------------------------------- 1453 */ 1454 int64 float32_to_int64_round_to_zero( float32 a ) 1455 { 1456 flag aSign; 1457 int16 aExp, shiftCount; 1458 bits32 aSig; 1459 bits64 aSig64; 1460 int64 z; 1461 1462 aSig = extractFloat32Frac( a ); 1463 aExp = extractFloat32Exp( a ); 1464 aSign = extractFloat32Sign( a ); 1465 shiftCount = aExp - 0xBE; 1466 if ( 0 <= shiftCount ) { 1467 if ( a != 0xDF000000 ) { 1468 float_raise( float_flag_invalid ); 1469 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1470 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1471 } 1472 } 1473 return (sbits64) LIT64( 0x8000000000000000 ); 1474 } 1475 else if ( aExp <= 0x7E ) { 1476 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 1477 return 0; 1478 } 1479 aSig64 = aSig | 0x00800000; 1480 aSig64 <<= 40; 1481 z = aSig64>>( - shiftCount ); 1482 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1483 float_exception_flags |= float_flag_inexact; 1484 } 1485 if ( aSign ) z = - z; 1486 return z; 1487 1488 } 1489 #endif /* !SOFTFLOAT_FOR_GCC */ 1490 1491 /* 1492 ------------------------------------------------------------------------------- 1493 Returns the result of converting the single-precision floating-point value 1494 `a' to the double-precision floating-point format. The conversion is 1495 performed according to the IEC/IEEE Standard for Binary Floating-Point 1496 Arithmetic. 1497 ------------------------------------------------------------------------------- 1498 */ 1499 float64 float32_to_float64( float32 a ) 1500 { 1501 flag aSign; 1502 int16 aExp; 1503 bits32 aSig; 1504 1505 aSig = extractFloat32Frac( a ); 1506 aExp = extractFloat32Exp( a ); 1507 aSign = extractFloat32Sign( a ); 1508 if ( aExp == 0xFF ) { 1509 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1510 return packFloat64( aSign, 0x7FF, 0 ); 1511 } 1512 if ( aExp == 0 ) { 1513 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1514 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1515 --aExp; 1516 } 1517 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1518 1519 } 1520 1521 #ifdef FLOATX80 1522 1523 /* 1524 ------------------------------------------------------------------------------- 1525 Returns the result of converting the single-precision floating-point value 1526 `a' to the extended double-precision floating-point format. The conversion 1527 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1528 Arithmetic. 1529 ------------------------------------------------------------------------------- 1530 */ 1531 floatx80 float32_to_floatx80( float32 a ) 1532 { 1533 flag aSign; 1534 int16 aExp; 1535 bits32 aSig; 1536 1537 aSig = extractFloat32Frac( a ); 1538 aExp = extractFloat32Exp( a ); 1539 aSign = extractFloat32Sign( a ); 1540 if ( aExp == 0xFF ) { 1541 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1542 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1543 } 1544 if ( aExp == 0 ) { 1545 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1546 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1547 } 1548 aSig |= 0x00800000; 1549 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1550 1551 } 1552 1553 #endif 1554 1555 #ifdef FLOAT128 1556 1557 /* 1558 ------------------------------------------------------------------------------- 1559 Returns the result of converting the single-precision floating-point value 1560 `a' to the double-precision floating-point format. The conversion is 1561 performed according to the IEC/IEEE Standard for Binary Floating-Point 1562 Arithmetic. 1563 ------------------------------------------------------------------------------- 1564 */ 1565 float128 float32_to_float128( float32 a ) 1566 { 1567 flag aSign; 1568 int16 aExp; 1569 bits32 aSig; 1570 1571 aSig = extractFloat32Frac( a ); 1572 aExp = extractFloat32Exp( a ); 1573 aSign = extractFloat32Sign( a ); 1574 if ( aExp == 0xFF ) { 1575 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1576 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1577 } 1578 if ( aExp == 0 ) { 1579 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1580 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1581 --aExp; 1582 } 1583 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1584 1585 } 1586 1587 #endif 1588 1589 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1590 /* 1591 ------------------------------------------------------------------------------- 1592 Rounds the single-precision floating-point value `a' to an integer, and 1593 returns the result as a single-precision floating-point value. The 1594 operation is performed according to the IEC/IEEE Standard for Binary 1595 Floating-Point Arithmetic. 1596 ------------------------------------------------------------------------------- 1597 */ 1598 float32 float32_round_to_int( float32 a ) 1599 { 1600 flag aSign; 1601 int16 aExp; 1602 bits32 lastBitMask, roundBitsMask; 1603 int8 roundingMode; 1604 float32 z; 1605 1606 aExp = extractFloat32Exp( a ); 1607 if ( 0x96 <= aExp ) { 1608 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1609 return propagateFloat32NaN( a, a ); 1610 } 1611 return a; 1612 } 1613 if ( aExp <= 0x7E ) { 1614 if ( (bits32) ( a<<1 ) == 0 ) return a; 1615 float_exception_flags |= float_flag_inexact; 1616 aSign = extractFloat32Sign( a ); 1617 switch ( float_rounding_mode ) { 1618 case float_round_nearest_even: 1619 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1620 return packFloat32( aSign, 0x7F, 0 ); 1621 } 1622 break; 1623 case float_round_to_zero: 1624 break; 1625 case float_round_down: 1626 return aSign ? 0xBF800000 : 0; 1627 case float_round_up: 1628 return aSign ? 0x80000000 : 0x3F800000; 1629 } 1630 return packFloat32( aSign, 0, 0 ); 1631 } 1632 lastBitMask = 1; 1633 lastBitMask <<= 0x96 - aExp; 1634 roundBitsMask = lastBitMask - 1; 1635 z = a; 1636 roundingMode = float_rounding_mode; 1637 if ( roundingMode == float_round_nearest_even ) { 1638 z += lastBitMask>>1; 1639 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1640 } 1641 else if ( roundingMode != float_round_to_zero ) { 1642 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1643 z += roundBitsMask; 1644 } 1645 } 1646 z &= ~ roundBitsMask; 1647 if ( z != a ) float_exception_flags |= float_flag_inexact; 1648 return z; 1649 1650 } 1651 #endif /* !SOFTFLOAT_FOR_GCC */ 1652 1653 /* 1654 ------------------------------------------------------------------------------- 1655 Returns the result of adding the absolute values of the single-precision 1656 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1657 before being returned. `zSign' is ignored if the result is a NaN. 1658 The addition is performed according to the IEC/IEEE Standard for Binary 1659 Floating-Point Arithmetic. 1660 ------------------------------------------------------------------------------- 1661 */ 1662 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1663 { 1664 int16 aExp, bExp, zExp; 1665 bits32 aSig, bSig, zSig; 1666 int16 expDiff; 1667 1668 aSig = extractFloat32Frac( a ); 1669 aExp = extractFloat32Exp( a ); 1670 bSig = extractFloat32Frac( b ); 1671 bExp = extractFloat32Exp( b ); 1672 expDiff = aExp - bExp; 1673 aSig <<= 6; 1674 bSig <<= 6; 1675 if ( 0 < expDiff ) { 1676 if ( aExp == 0xFF ) { 1677 if ( aSig ) return propagateFloat32NaN( a, b ); 1678 return a; 1679 } 1680 if ( bExp == 0 ) { 1681 --expDiff; 1682 } 1683 else { 1684 bSig |= 0x20000000; 1685 } 1686 shift32RightJamming( bSig, expDiff, &bSig ); 1687 zExp = aExp; 1688 } 1689 else if ( expDiff < 0 ) { 1690 if ( bExp == 0xFF ) { 1691 if ( bSig ) return propagateFloat32NaN( a, b ); 1692 return packFloat32( zSign, 0xFF, 0 ); 1693 } 1694 if ( aExp == 0 ) { 1695 ++expDiff; 1696 } 1697 else { 1698 aSig |= 0x20000000; 1699 } 1700 shift32RightJamming( aSig, - expDiff, &aSig ); 1701 zExp = bExp; 1702 } 1703 else { 1704 if ( aExp == 0xFF ) { 1705 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1706 return a; 1707 } 1708 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1709 zSig = 0x40000000 + aSig + bSig; 1710 zExp = aExp; 1711 goto roundAndPack; 1712 } 1713 aSig |= 0x20000000; 1714 zSig = ( aSig + bSig )<<1; 1715 --zExp; 1716 if ( (sbits32) zSig < 0 ) { 1717 zSig = aSig + bSig; 1718 ++zExp; 1719 } 1720 roundAndPack: 1721 return roundAndPackFloat32( zSign, zExp, zSig ); 1722 1723 } 1724 1725 /* 1726 ------------------------------------------------------------------------------- 1727 Returns the result of subtracting the absolute values of the single- 1728 precision floating-point values `a' and `b'. If `zSign' is 1, the 1729 difference is negated before being returned. `zSign' is ignored if the 1730 result is a NaN. The subtraction is performed according to the IEC/IEEE 1731 Standard for Binary Floating-Point Arithmetic. 1732 ------------------------------------------------------------------------------- 1733 */ 1734 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1735 { 1736 int16 aExp, bExp, zExp; 1737 bits32 aSig, bSig, zSig; 1738 int16 expDiff; 1739 1740 aSig = extractFloat32Frac( a ); 1741 aExp = extractFloat32Exp( a ); 1742 bSig = extractFloat32Frac( b ); 1743 bExp = extractFloat32Exp( b ); 1744 expDiff = aExp - bExp; 1745 aSig <<= 7; 1746 bSig <<= 7; 1747 if ( 0 < expDiff ) goto aExpBigger; 1748 if ( expDiff < 0 ) goto bExpBigger; 1749 if ( aExp == 0xFF ) { 1750 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1751 float_raise( float_flag_invalid ); 1752 return float32_default_nan; 1753 } 1754 if ( aExp == 0 ) { 1755 aExp = 1; 1756 bExp = 1; 1757 } 1758 if ( bSig < aSig ) goto aBigger; 1759 if ( aSig < bSig ) goto bBigger; 1760 return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); 1761 bExpBigger: 1762 if ( bExp == 0xFF ) { 1763 if ( bSig ) return propagateFloat32NaN( a, b ); 1764 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1765 } 1766 if ( aExp == 0 ) { 1767 ++expDiff; 1768 } 1769 else { 1770 aSig |= 0x40000000; 1771 } 1772 shift32RightJamming( aSig, - expDiff, &aSig ); 1773 bSig |= 0x40000000; 1774 bBigger: 1775 zSig = bSig - aSig; 1776 zExp = bExp; 1777 zSign ^= 1; 1778 goto normalizeRoundAndPack; 1779 aExpBigger: 1780 if ( aExp == 0xFF ) { 1781 if ( aSig ) return propagateFloat32NaN( a, b ); 1782 return a; 1783 } 1784 if ( bExp == 0 ) { 1785 --expDiff; 1786 } 1787 else { 1788 bSig |= 0x40000000; 1789 } 1790 shift32RightJamming( bSig, expDiff, &bSig ); 1791 aSig |= 0x40000000; 1792 aBigger: 1793 zSig = aSig - bSig; 1794 zExp = aExp; 1795 normalizeRoundAndPack: 1796 --zExp; 1797 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1798 1799 } 1800 1801 /* 1802 ------------------------------------------------------------------------------- 1803 Returns the result of adding the single-precision floating-point values `a' 1804 and `b'. The operation is performed according to the IEC/IEEE Standard for 1805 Binary Floating-Point Arithmetic. 1806 ------------------------------------------------------------------------------- 1807 */ 1808 float32 float32_add( float32 a, float32 b ) 1809 { 1810 flag aSign, bSign; 1811 1812 aSign = extractFloat32Sign( a ); 1813 bSign = extractFloat32Sign( b ); 1814 if ( aSign == bSign ) { 1815 return addFloat32Sigs( a, b, aSign ); 1816 } 1817 else { 1818 return subFloat32Sigs( a, b, aSign ); 1819 } 1820 1821 } 1822 1823 /* 1824 ------------------------------------------------------------------------------- 1825 Returns the result of subtracting the single-precision floating-point values 1826 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1827 for Binary Floating-Point Arithmetic. 1828 ------------------------------------------------------------------------------- 1829 */ 1830 float32 float32_sub( float32 a, float32 b ) 1831 { 1832 flag aSign, bSign; 1833 1834 aSign = extractFloat32Sign( a ); 1835 bSign = extractFloat32Sign( b ); 1836 if ( aSign == bSign ) { 1837 return subFloat32Sigs( a, b, aSign ); 1838 } 1839 else { 1840 return addFloat32Sigs( a, b, aSign ); 1841 } 1842 1843 } 1844 1845 /* 1846 ------------------------------------------------------------------------------- 1847 Returns the result of multiplying the single-precision floating-point values 1848 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1849 for Binary Floating-Point Arithmetic. 1850 ------------------------------------------------------------------------------- 1851 */ 1852 float32 float32_mul( float32 a, float32 b ) 1853 { 1854 flag aSign, bSign, zSign; 1855 int16 aExp, bExp, zExp; 1856 bits32 aSig, bSig; 1857 bits64 zSig64; 1858 bits32 zSig; 1859 1860 aSig = extractFloat32Frac( a ); 1861 aExp = extractFloat32Exp( a ); 1862 aSign = extractFloat32Sign( a ); 1863 bSig = extractFloat32Frac( b ); 1864 bExp = extractFloat32Exp( b ); 1865 bSign = extractFloat32Sign( b ); 1866 zSign = aSign ^ bSign; 1867 if ( aExp == 0xFF ) { 1868 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1869 return propagateFloat32NaN( a, b ); 1870 } 1871 if ( ( bExp | bSig ) == 0 ) { 1872 float_raise( float_flag_invalid ); 1873 return float32_default_nan; 1874 } 1875 return packFloat32( zSign, 0xFF, 0 ); 1876 } 1877 if ( bExp == 0xFF ) { 1878 if ( bSig ) return propagateFloat32NaN( a, b ); 1879 if ( ( aExp | aSig ) == 0 ) { 1880 float_raise( float_flag_invalid ); 1881 return float32_default_nan; 1882 } 1883 return packFloat32( zSign, 0xFF, 0 ); 1884 } 1885 if ( aExp == 0 ) { 1886 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1887 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1888 } 1889 if ( bExp == 0 ) { 1890 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1891 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1892 } 1893 zExp = aExp + bExp - 0x7F; 1894 aSig = ( aSig | 0x00800000 )<<7; 1895 bSig = ( bSig | 0x00800000 )<<8; 1896 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1897 zSig = zSig64; 1898 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1899 zSig <<= 1; 1900 --zExp; 1901 } 1902 return roundAndPackFloat32( zSign, zExp, zSig ); 1903 1904 } 1905 1906 /* 1907 ------------------------------------------------------------------------------- 1908 Returns the result of dividing the single-precision floating-point value `a' 1909 by the corresponding value `b'. The operation is performed according to the 1910 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1911 ------------------------------------------------------------------------------- 1912 */ 1913 float32 float32_div( float32 a, float32 b ) 1914 { 1915 flag aSign, bSign, zSign; 1916 int16 aExp, bExp, zExp; 1917 bits32 aSig, bSig, zSig; 1918 1919 aSig = extractFloat32Frac( a ); 1920 aExp = extractFloat32Exp( a ); 1921 aSign = extractFloat32Sign( a ); 1922 bSig = extractFloat32Frac( b ); 1923 bExp = extractFloat32Exp( b ); 1924 bSign = extractFloat32Sign( b ); 1925 zSign = aSign ^ bSign; 1926 if ( aExp == 0xFF ) { 1927 if ( aSig ) return propagateFloat32NaN( a, b ); 1928 if ( bExp == 0xFF ) { 1929 if ( bSig ) return propagateFloat32NaN( a, b ); 1930 float_raise( float_flag_invalid ); 1931 return float32_default_nan; 1932 } 1933 return packFloat32( zSign, 0xFF, 0 ); 1934 } 1935 if ( bExp == 0xFF ) { 1936 if ( bSig ) return propagateFloat32NaN( a, b ); 1937 return packFloat32( zSign, 0, 0 ); 1938 } 1939 if ( bExp == 0 ) { 1940 if ( bSig == 0 ) { 1941 if ( ( aExp | aSig ) == 0 ) { 1942 float_raise( float_flag_invalid ); 1943 return float32_default_nan; 1944 } 1945 float_raise( float_flag_divbyzero ); 1946 return packFloat32( zSign, 0xFF, 0 ); 1947 } 1948 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1949 } 1950 if ( aExp == 0 ) { 1951 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1952 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1953 } 1954 zExp = aExp - bExp + 0x7D; 1955 aSig = ( aSig | 0x00800000 )<<7; 1956 bSig = ( bSig | 0x00800000 )<<8; 1957 if ( bSig <= ( aSig + aSig ) ) { 1958 aSig >>= 1; 1959 ++zExp; 1960 } 1961 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 1962 if ( ( zSig & 0x3F ) == 0 ) { 1963 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 1964 } 1965 return roundAndPackFloat32( zSign, zExp, zSig ); 1966 1967 } 1968 1969 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1970 /* 1971 ------------------------------------------------------------------------------- 1972 Returns the remainder of the single-precision floating-point value `a' 1973 with respect to the corresponding value `b'. The operation is performed 1974 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1975 ------------------------------------------------------------------------------- 1976 */ 1977 float32 float32_rem( float32 a, float32 b ) 1978 { 1979 flag aSign, bSign, zSign; 1980 int16 aExp, bExp, expDiff; 1981 bits32 aSig, bSig; 1982 bits32 q; 1983 bits64 aSig64, bSig64, q64; 1984 bits32 alternateASig; 1985 sbits32 sigMean; 1986 1987 aSig = extractFloat32Frac( a ); 1988 aExp = extractFloat32Exp( a ); 1989 aSign = extractFloat32Sign( a ); 1990 bSig = extractFloat32Frac( b ); 1991 bExp = extractFloat32Exp( b ); 1992 bSign = extractFloat32Sign( b ); 1993 if ( aExp == 0xFF ) { 1994 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1995 return propagateFloat32NaN( a, b ); 1996 } 1997 float_raise( float_flag_invalid ); 1998 return float32_default_nan; 1999 } 2000 if ( bExp == 0xFF ) { 2001 if ( bSig ) return propagateFloat32NaN( a, b ); 2002 return a; 2003 } 2004 if ( bExp == 0 ) { 2005 if ( bSig == 0 ) { 2006 float_raise( float_flag_invalid ); 2007 return float32_default_nan; 2008 } 2009 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2010 } 2011 if ( aExp == 0 ) { 2012 if ( aSig == 0 ) return a; 2013 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2014 } 2015 expDiff = aExp - bExp; 2016 aSig |= 0x00800000; 2017 bSig |= 0x00800000; 2018 if ( expDiff < 32 ) { 2019 aSig <<= 8; 2020 bSig <<= 8; 2021 if ( expDiff < 0 ) { 2022 if ( expDiff < -1 ) return a; 2023 aSig >>= 1; 2024 } 2025 q = ( bSig <= aSig ); 2026 if ( q ) aSig -= bSig; 2027 if ( 0 < expDiff ) { 2028 q = ( ( (bits64) aSig )<<32 ) / bSig; 2029 q >>= 32 - expDiff; 2030 bSig >>= 2; 2031 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2032 } 2033 else { 2034 aSig >>= 2; 2035 bSig >>= 2; 2036 } 2037 } 2038 else { 2039 if ( bSig <= aSig ) aSig -= bSig; 2040 aSig64 = ( (bits64) aSig )<<40; 2041 bSig64 = ( (bits64) bSig )<<40; 2042 expDiff -= 64; 2043 while ( 0 < expDiff ) { 2044 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2045 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2046 aSig64 = - ( ( bSig * q64 )<<38 ); 2047 expDiff -= 62; 2048 } 2049 expDiff += 64; 2050 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2051 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2052 q = q64>>( 64 - expDiff ); 2053 bSig <<= 6; 2054 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2055 } 2056 do { 2057 alternateASig = aSig; 2058 ++q; 2059 aSig -= bSig; 2060 } while ( 0 <= (sbits32) aSig ); 2061 sigMean = aSig + alternateASig; 2062 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2063 aSig = alternateASig; 2064 } 2065 zSign = ( (sbits32) aSig < 0 ); 2066 if ( zSign ) aSig = - aSig; 2067 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2068 2069 } 2070 #endif /* !SOFTFLOAT_FOR_GCC */ 2071 2072 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2073 /* 2074 ------------------------------------------------------------------------------- 2075 Returns the square root of the single-precision floating-point value `a'. 2076 The operation is performed according to the IEC/IEEE Standard for Binary 2077 Floating-Point Arithmetic. 2078 ------------------------------------------------------------------------------- 2079 */ 2080 float32 float32_sqrt( float32 a ) 2081 { 2082 flag aSign; 2083 int16 aExp, zExp; 2084 bits32 aSig, zSig; 2085 bits64 rem, term; 2086 2087 aSig = extractFloat32Frac( a ); 2088 aExp = extractFloat32Exp( a ); 2089 aSign = extractFloat32Sign( a ); 2090 if ( aExp == 0xFF ) { 2091 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2092 if ( ! aSign ) return a; 2093 float_raise( float_flag_invalid ); 2094 return float32_default_nan; 2095 } 2096 if ( aSign ) { 2097 if ( ( aExp | aSig ) == 0 ) return a; 2098 float_raise( float_flag_invalid ); 2099 return float32_default_nan; 2100 } 2101 if ( aExp == 0 ) { 2102 if ( aSig == 0 ) return 0; 2103 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2104 } 2105 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2106 aSig = ( aSig | 0x00800000 )<<8; 2107 zSig = estimateSqrt32( aExp, aSig ) + 2; 2108 if ( ( zSig & 0x7F ) <= 5 ) { 2109 if ( zSig < 2 ) { 2110 zSig = 0x7FFFFFFF; 2111 goto roundAndPack; 2112 } 2113 aSig >>= aExp & 1; 2114 term = ( (bits64) zSig ) * zSig; 2115 rem = ( ( (bits64) aSig )<<32 ) - term; 2116 while ( (sbits64) rem < 0 ) { 2117 --zSig; 2118 rem += ( ( (bits64) zSig )<<1 ) | 1; 2119 } 2120 zSig |= ( rem != 0 ); 2121 } 2122 shift32RightJamming( zSig, 1, &zSig ); 2123 roundAndPack: 2124 return roundAndPackFloat32( 0, zExp, zSig ); 2125 2126 } 2127 #endif /* !SOFTFLOAT_FOR_GCC */ 2128 2129 /* 2130 ------------------------------------------------------------------------------- 2131 Returns 1 if the single-precision floating-point value `a' is equal to 2132 the corresponding value `b', and 0 otherwise. The comparison is performed 2133 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2134 ------------------------------------------------------------------------------- 2135 */ 2136 flag float32_eq( float32 a, float32 b ) 2137 { 2138 2139 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2140 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2141 ) { 2142 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2143 float_raise( float_flag_invalid ); 2144 } 2145 return 0; 2146 } 2147 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2148 2149 } 2150 2151 /* 2152 ------------------------------------------------------------------------------- 2153 Returns 1 if the single-precision floating-point value `a' is less than 2154 or equal to the corresponding value `b', and 0 otherwise. The comparison 2155 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2156 Arithmetic. 2157 ------------------------------------------------------------------------------- 2158 */ 2159 flag float32_le( float32 a, float32 b ) 2160 { 2161 flag aSign, bSign; 2162 2163 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2164 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2165 ) { 2166 float_raise( float_flag_invalid ); 2167 return 0; 2168 } 2169 aSign = extractFloat32Sign( a ); 2170 bSign = extractFloat32Sign( b ); 2171 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2172 return ( a == b ) || ( aSign ^ ( a < b ) ); 2173 2174 } 2175 2176 /* 2177 ------------------------------------------------------------------------------- 2178 Returns 1 if the single-precision floating-point value `a' is less than 2179 the corresponding value `b', and 0 otherwise. The comparison is performed 2180 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2181 ------------------------------------------------------------------------------- 2182 */ 2183 flag float32_lt( float32 a, float32 b ) 2184 { 2185 flag aSign, bSign; 2186 2187 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2188 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2189 ) { 2190 float_raise( float_flag_invalid ); 2191 return 0; 2192 } 2193 aSign = extractFloat32Sign( a ); 2194 bSign = extractFloat32Sign( b ); 2195 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2196 return ( a != b ) && ( aSign ^ ( a < b ) ); 2197 2198 } 2199 2200 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2201 /* 2202 ------------------------------------------------------------------------------- 2203 Returns 1 if the single-precision floating-point value `a' is equal to 2204 the corresponding value `b', and 0 otherwise. The invalid exception is 2205 raised if either operand is a NaN. Otherwise, the comparison is performed 2206 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2207 ------------------------------------------------------------------------------- 2208 */ 2209 flag float32_eq_signaling( float32 a, float32 b ) 2210 { 2211 2212 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2213 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2214 ) { 2215 float_raise( float_flag_invalid ); 2216 return 0; 2217 } 2218 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2219 2220 } 2221 2222 /* 2223 ------------------------------------------------------------------------------- 2224 Returns 1 if the single-precision floating-point value `a' is less than or 2225 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2226 cause an exception. Otherwise, the comparison is performed according to the 2227 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2228 ------------------------------------------------------------------------------- 2229 */ 2230 flag float32_le_quiet( float32 a, float32 b ) 2231 { 2232 flag aSign, bSign; 2233 2234 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2235 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2236 ) { 2237 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2238 float_raise( float_flag_invalid ); 2239 } 2240 return 0; 2241 } 2242 aSign = extractFloat32Sign( a ); 2243 bSign = extractFloat32Sign( b ); 2244 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2245 return ( a == b ) || ( aSign ^ ( a < b ) ); 2246 2247 } 2248 2249 /* 2250 ------------------------------------------------------------------------------- 2251 Returns 1 if the single-precision floating-point value `a' is less than 2252 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2253 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2254 Standard for Binary Floating-Point Arithmetic. 2255 ------------------------------------------------------------------------------- 2256 */ 2257 flag float32_lt_quiet( float32 a, float32 b ) 2258 { 2259 flag aSign, bSign; 2260 2261 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2262 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2263 ) { 2264 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2265 float_raise( float_flag_invalid ); 2266 } 2267 return 0; 2268 } 2269 aSign = extractFloat32Sign( a ); 2270 bSign = extractFloat32Sign( b ); 2271 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2272 return ( a != b ) && ( aSign ^ ( a < b ) ); 2273 2274 } 2275 #endif /* !SOFTFLOAT_FOR_GCC */ 2276 2277 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2278 /* 2279 ------------------------------------------------------------------------------- 2280 Returns the result of converting the double-precision floating-point value 2281 `a' to the 32-bit two's complement integer format. The conversion is 2282 performed according to the IEC/IEEE Standard for Binary Floating-Point 2283 Arithmetic---which means in particular that the conversion is rounded 2284 according to the current rounding mode. If `a' is a NaN, the largest 2285 positive integer is returned. Otherwise, if the conversion overflows, the 2286 largest integer with the same sign as `a' is returned. 2287 ------------------------------------------------------------------------------- 2288 */ 2289 int32 float64_to_int32( float64 a ) 2290 { 2291 flag aSign; 2292 int16 aExp, shiftCount; 2293 bits64 aSig; 2294 2295 aSig = extractFloat64Frac( a ); 2296 aExp = extractFloat64Exp( a ); 2297 aSign = extractFloat64Sign( a ); 2298 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2299 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2300 shiftCount = 0x42C - aExp; 2301 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2302 return roundAndPackInt32( aSign, aSig ); 2303 2304 } 2305 #endif /* !SOFTFLOAT_FOR_GCC */ 2306 2307 /* 2308 ------------------------------------------------------------------------------- 2309 Returns the result of converting the double-precision floating-point value 2310 `a' to the 32-bit two's complement integer format. The conversion is 2311 performed according to the IEC/IEEE Standard for Binary Floating-Point 2312 Arithmetic, except that the conversion is always rounded toward zero. 2313 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2314 the conversion overflows, the largest integer with the same sign as `a' is 2315 returned. 2316 ------------------------------------------------------------------------------- 2317 */ 2318 int32 float64_to_int32_round_to_zero( float64 a ) 2319 { 2320 flag aSign; 2321 int16 aExp, shiftCount; 2322 bits64 aSig, savedASig; 2323 int32 z; 2324 2325 aSig = extractFloat64Frac( a ); 2326 aExp = extractFloat64Exp( a ); 2327 aSign = extractFloat64Sign( a ); 2328 if ( 0x41E < aExp ) { 2329 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2330 goto invalid; 2331 } 2332 else if ( aExp < 0x3FF ) { 2333 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 2334 return 0; 2335 } 2336 aSig |= LIT64( 0x0010000000000000 ); 2337 shiftCount = 0x433 - aExp; 2338 savedASig = aSig; 2339 aSig >>= shiftCount; 2340 z = aSig; 2341 if ( aSign ) z = - z; 2342 if ( ( z < 0 ) ^ aSign ) { 2343 invalid: 2344 float_raise( float_flag_invalid ); 2345 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2346 } 2347 if ( ( aSig<<shiftCount ) != savedASig ) { 2348 float_exception_flags |= float_flag_inexact; 2349 } 2350 return z; 2351 2352 } 2353 2354 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2355 /* 2356 ------------------------------------------------------------------------------- 2357 Returns the result of converting the double-precision floating-point value 2358 `a' to the 64-bit two's complement integer format. The conversion is 2359 performed according to the IEC/IEEE Standard for Binary Floating-Point 2360 Arithmetic---which means in particular that the conversion is rounded 2361 according to the current rounding mode. If `a' is a NaN, the largest 2362 positive integer is returned. Otherwise, if the conversion overflows, the 2363 largest integer with the same sign as `a' is returned. 2364 ------------------------------------------------------------------------------- 2365 */ 2366 int64 float64_to_int64( float64 a ) 2367 { 2368 flag aSign; 2369 int16 aExp, shiftCount; 2370 bits64 aSig, aSigExtra; 2371 2372 aSig = extractFloat64Frac( a ); 2373 aExp = extractFloat64Exp( a ); 2374 aSign = extractFloat64Sign( a ); 2375 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2376 shiftCount = 0x433 - aExp; 2377 if ( shiftCount <= 0 ) { 2378 if ( 0x43E < aExp ) { 2379 float_raise( float_flag_invalid ); 2380 if ( ! aSign 2381 || ( ( aExp == 0x7FF ) 2382 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2383 ) { 2384 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2385 } 2386 return (sbits64) LIT64( 0x8000000000000000 ); 2387 } 2388 aSigExtra = 0; 2389 aSig <<= - shiftCount; 2390 } 2391 else { 2392 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2393 } 2394 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2395 2396 } 2397 2398 /* 2399 ------------------------------------------------------------------------------- 2400 Returns the result of converting the double-precision floating-point value 2401 `a' to the 64-bit two's complement integer format. The conversion is 2402 performed according to the IEC/IEEE Standard for Binary Floating-Point 2403 Arithmetic, except that the conversion is always rounded toward zero. 2404 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2405 the conversion overflows, the largest integer with the same sign as `a' is 2406 returned. 2407 ------------------------------------------------------------------------------- 2408 */ 2409 int64 float64_to_int64_round_to_zero( float64 a ) 2410 { 2411 flag aSign; 2412 int16 aExp, shiftCount; 2413 bits64 aSig; 2414 int64 z; 2415 2416 aSig = extractFloat64Frac( a ); 2417 aExp = extractFloat64Exp( a ); 2418 aSign = extractFloat64Sign( a ); 2419 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2420 shiftCount = aExp - 0x433; 2421 if ( 0 <= shiftCount ) { 2422 if ( 0x43E <= aExp ) { 2423 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2424 float_raise( float_flag_invalid ); 2425 if ( ! aSign 2426 || ( ( aExp == 0x7FF ) 2427 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2428 ) { 2429 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2430 } 2431 } 2432 return (sbits64) LIT64( 0x8000000000000000 ); 2433 } 2434 z = aSig<<shiftCount; 2435 } 2436 else { 2437 if ( aExp < 0x3FE ) { 2438 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 2439 return 0; 2440 } 2441 z = aSig>>( - shiftCount ); 2442 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2443 float_exception_flags |= float_flag_inexact; 2444 } 2445 } 2446 if ( aSign ) z = - z; 2447 return z; 2448 2449 } 2450 #endif /* !SOFTFLOAT_FOR_GCC */ 2451 2452 /* 2453 ------------------------------------------------------------------------------- 2454 Returns the result of converting the double-precision floating-point value 2455 `a' to the single-precision floating-point format. The conversion is 2456 performed according to the IEC/IEEE Standard for Binary Floating-Point 2457 Arithmetic. 2458 ------------------------------------------------------------------------------- 2459 */ 2460 float32 float64_to_float32( float64 a ) 2461 { 2462 flag aSign; 2463 int16 aExp; 2464 bits64 aSig; 2465 bits32 zSig; 2466 2467 aSig = extractFloat64Frac( a ); 2468 aExp = extractFloat64Exp( a ); 2469 aSign = extractFloat64Sign( a ); 2470 if ( aExp == 0x7FF ) { 2471 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2472 return packFloat32( aSign, 0xFF, 0 ); 2473 } 2474 shift64RightJamming( aSig, 22, &aSig ); 2475 zSig = aSig; 2476 if ( aExp || zSig ) { 2477 zSig |= 0x40000000; 2478 aExp -= 0x381; 2479 } 2480 return roundAndPackFloat32( aSign, aExp, zSig ); 2481 2482 } 2483 2484 #ifdef FLOATX80 2485 2486 /* 2487 ------------------------------------------------------------------------------- 2488 Returns the result of converting the double-precision floating-point value 2489 `a' to the extended double-precision floating-point format. The conversion 2490 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2491 Arithmetic. 2492 ------------------------------------------------------------------------------- 2493 */ 2494 floatx80 float64_to_floatx80( float64 a ) 2495 { 2496 flag aSign; 2497 int16 aExp; 2498 bits64 aSig; 2499 2500 aSig = extractFloat64Frac( a ); 2501 aExp = extractFloat64Exp( a ); 2502 aSign = extractFloat64Sign( a ); 2503 if ( aExp == 0x7FF ) { 2504 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2505 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2506 } 2507 if ( aExp == 0 ) { 2508 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2509 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2510 } 2511 return 2512 packFloatx80( 2513 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2514 2515 } 2516 2517 #endif 2518 2519 #ifdef FLOAT128 2520 2521 /* 2522 ------------------------------------------------------------------------------- 2523 Returns the result of converting the double-precision floating-point value 2524 `a' to the quadruple-precision floating-point format. The conversion is 2525 performed according to the IEC/IEEE Standard for Binary Floating-Point 2526 Arithmetic. 2527 ------------------------------------------------------------------------------- 2528 */ 2529 float128 float64_to_float128( float64 a ) 2530 { 2531 flag aSign; 2532 int16 aExp; 2533 bits64 aSig, zSig0, zSig1; 2534 2535 aSig = extractFloat64Frac( a ); 2536 aExp = extractFloat64Exp( a ); 2537 aSign = extractFloat64Sign( a ); 2538 if ( aExp == 0x7FF ) { 2539 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2540 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2541 } 2542 if ( aExp == 0 ) { 2543 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2544 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2545 --aExp; 2546 } 2547 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2548 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2549 2550 } 2551 2552 #endif 2553 2554 #ifndef SOFTFLOAT_FOR_GCC 2555 /* 2556 ------------------------------------------------------------------------------- 2557 Rounds the double-precision floating-point value `a' to an integer, and 2558 returns the result as a double-precision floating-point value. The 2559 operation is performed according to the IEC/IEEE Standard for Binary 2560 Floating-Point Arithmetic. 2561 ------------------------------------------------------------------------------- 2562 */ 2563 float64 float64_round_to_int( float64 a ) 2564 { 2565 flag aSign; 2566 int16 aExp; 2567 bits64 lastBitMask, roundBitsMask; 2568 int8 roundingMode; 2569 float64 z; 2570 2571 aExp = extractFloat64Exp( a ); 2572 if ( 0x433 <= aExp ) { 2573 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2574 return propagateFloat64NaN( a, a ); 2575 } 2576 return a; 2577 } 2578 if ( aExp < 0x3FF ) { 2579 if ( (bits64) ( a<<1 ) == 0 ) return a; 2580 float_exception_flags |= float_flag_inexact; 2581 aSign = extractFloat64Sign( a ); 2582 switch ( float_rounding_mode ) { 2583 case float_round_nearest_even: 2584 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2585 return packFloat64( aSign, 0x3FF, 0 ); 2586 } 2587 break; 2588 case float_round_to_zero: 2589 break; 2590 case float_round_down: 2591 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2592 case float_round_up: 2593 return 2594 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2595 } 2596 return packFloat64( aSign, 0, 0 ); 2597 } 2598 lastBitMask = 1; 2599 lastBitMask <<= 0x433 - aExp; 2600 roundBitsMask = lastBitMask - 1; 2601 z = a; 2602 roundingMode = float_rounding_mode; 2603 if ( roundingMode == float_round_nearest_even ) { 2604 z += lastBitMask>>1; 2605 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2606 } 2607 else if ( roundingMode != float_round_to_zero ) { 2608 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2609 z += roundBitsMask; 2610 } 2611 } 2612 z &= ~ roundBitsMask; 2613 if ( z != a ) float_exception_flags |= float_flag_inexact; 2614 return z; 2615 2616 } 2617 #endif 2618 2619 /* 2620 ------------------------------------------------------------------------------- 2621 Returns the result of adding the absolute values of the double-precision 2622 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2623 before being returned. `zSign' is ignored if the result is a NaN. 2624 The addition is performed according to the IEC/IEEE Standard for Binary 2625 Floating-Point Arithmetic. 2626 ------------------------------------------------------------------------------- 2627 */ 2628 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2629 { 2630 int16 aExp, bExp, zExp; 2631 bits64 aSig, bSig, zSig; 2632 int16 expDiff; 2633 2634 aSig = extractFloat64Frac( a ); 2635 aExp = extractFloat64Exp( a ); 2636 bSig = extractFloat64Frac( b ); 2637 bExp = extractFloat64Exp( b ); 2638 expDiff = aExp - bExp; 2639 aSig <<= 9; 2640 bSig <<= 9; 2641 if ( 0 < expDiff ) { 2642 if ( aExp == 0x7FF ) { 2643 if ( aSig ) return propagateFloat64NaN( a, b ); 2644 return a; 2645 } 2646 if ( bExp == 0 ) { 2647 --expDiff; 2648 } 2649 else { 2650 bSig |= LIT64( 0x2000000000000000 ); 2651 } 2652 shift64RightJamming( bSig, expDiff, &bSig ); 2653 zExp = aExp; 2654 } 2655 else if ( expDiff < 0 ) { 2656 if ( bExp == 0x7FF ) { 2657 if ( bSig ) return propagateFloat64NaN( a, b ); 2658 return packFloat64( zSign, 0x7FF, 0 ); 2659 } 2660 if ( aExp == 0 ) { 2661 ++expDiff; 2662 } 2663 else { 2664 aSig |= LIT64( 0x2000000000000000 ); 2665 } 2666 shift64RightJamming( aSig, - expDiff, &aSig ); 2667 zExp = bExp; 2668 } 2669 else { 2670 if ( aExp == 0x7FF ) { 2671 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2672 return a; 2673 } 2674 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2675 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2676 zExp = aExp; 2677 goto roundAndPack; 2678 } 2679 aSig |= LIT64( 0x2000000000000000 ); 2680 zSig = ( aSig + bSig )<<1; 2681 --zExp; 2682 if ( (sbits64) zSig < 0 ) { 2683 zSig = aSig + bSig; 2684 ++zExp; 2685 } 2686 roundAndPack: 2687 return roundAndPackFloat64( zSign, zExp, zSig ); 2688 2689 } 2690 2691 /* 2692 ------------------------------------------------------------------------------- 2693 Returns the result of subtracting the absolute values of the double- 2694 precision floating-point values `a' and `b'. If `zSign' is 1, the 2695 difference is negated before being returned. `zSign' is ignored if the 2696 result is a NaN. The subtraction is performed according to the IEC/IEEE 2697 Standard for Binary Floating-Point Arithmetic. 2698 ------------------------------------------------------------------------------- 2699 */ 2700 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2701 { 2702 int16 aExp, bExp, zExp; 2703 bits64 aSig, bSig, zSig; 2704 int16 expDiff; 2705 2706 aSig = extractFloat64Frac( a ); 2707 aExp = extractFloat64Exp( a ); 2708 bSig = extractFloat64Frac( b ); 2709 bExp = extractFloat64Exp( b ); 2710 expDiff = aExp - bExp; 2711 aSig <<= 10; 2712 bSig <<= 10; 2713 if ( 0 < expDiff ) goto aExpBigger; 2714 if ( expDiff < 0 ) goto bExpBigger; 2715 if ( aExp == 0x7FF ) { 2716 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2717 float_raise( float_flag_invalid ); 2718 return float64_default_nan; 2719 } 2720 if ( aExp == 0 ) { 2721 aExp = 1; 2722 bExp = 1; 2723 } 2724 if ( bSig < aSig ) goto aBigger; 2725 if ( aSig < bSig ) goto bBigger; 2726 return packFloat64( float_rounding_mode == float_round_down, 0, 0 ); 2727 bExpBigger: 2728 if ( bExp == 0x7FF ) { 2729 if ( bSig ) return propagateFloat64NaN( a, b ); 2730 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2731 } 2732 if ( aExp == 0 ) { 2733 ++expDiff; 2734 } 2735 else { 2736 aSig |= LIT64( 0x4000000000000000 ); 2737 } 2738 shift64RightJamming( aSig, - expDiff, &aSig ); 2739 bSig |= LIT64( 0x4000000000000000 ); 2740 bBigger: 2741 zSig = bSig - aSig; 2742 zExp = bExp; 2743 zSign ^= 1; 2744 goto normalizeRoundAndPack; 2745 aExpBigger: 2746 if ( aExp == 0x7FF ) { 2747 if ( aSig ) return propagateFloat64NaN( a, b ); 2748 return a; 2749 } 2750 if ( bExp == 0 ) { 2751 --expDiff; 2752 } 2753 else { 2754 bSig |= LIT64( 0x4000000000000000 ); 2755 } 2756 shift64RightJamming( bSig, expDiff, &bSig ); 2757 aSig |= LIT64( 0x4000000000000000 ); 2758 aBigger: 2759 zSig = aSig - bSig; 2760 zExp = aExp; 2761 normalizeRoundAndPack: 2762 --zExp; 2763 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2764 2765 } 2766 2767 /* 2768 ------------------------------------------------------------------------------- 2769 Returns the result of adding the double-precision floating-point values `a' 2770 and `b'. The operation is performed according to the IEC/IEEE Standard for 2771 Binary Floating-Point Arithmetic. 2772 ------------------------------------------------------------------------------- 2773 */ 2774 float64 float64_add( float64 a, float64 b ) 2775 { 2776 flag aSign, bSign; 2777 2778 aSign = extractFloat64Sign( a ); 2779 bSign = extractFloat64Sign( b ); 2780 if ( aSign == bSign ) { 2781 return addFloat64Sigs( a, b, aSign ); 2782 } 2783 else { 2784 return subFloat64Sigs( a, b, aSign ); 2785 } 2786 2787 } 2788 2789 /* 2790 ------------------------------------------------------------------------------- 2791 Returns the result of subtracting the double-precision floating-point values 2792 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2793 for Binary Floating-Point Arithmetic. 2794 ------------------------------------------------------------------------------- 2795 */ 2796 float64 float64_sub( float64 a, float64 b ) 2797 { 2798 flag aSign, bSign; 2799 2800 aSign = extractFloat64Sign( a ); 2801 bSign = extractFloat64Sign( b ); 2802 if ( aSign == bSign ) { 2803 return subFloat64Sigs( a, b, aSign ); 2804 } 2805 else { 2806 return addFloat64Sigs( a, b, aSign ); 2807 } 2808 2809 } 2810 2811 /* 2812 ------------------------------------------------------------------------------- 2813 Returns the result of multiplying the double-precision floating-point values 2814 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2815 for Binary Floating-Point Arithmetic. 2816 ------------------------------------------------------------------------------- 2817 */ 2818 float64 float64_mul( float64 a, float64 b ) 2819 { 2820 flag aSign, bSign, zSign; 2821 int16 aExp, bExp, zExp; 2822 bits64 aSig, bSig, zSig0, zSig1; 2823 2824 aSig = extractFloat64Frac( a ); 2825 aExp = extractFloat64Exp( a ); 2826 aSign = extractFloat64Sign( a ); 2827 bSig = extractFloat64Frac( b ); 2828 bExp = extractFloat64Exp( b ); 2829 bSign = extractFloat64Sign( b ); 2830 zSign = aSign ^ bSign; 2831 if ( aExp == 0x7FF ) { 2832 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2833 return propagateFloat64NaN( a, b ); 2834 } 2835 if ( ( bExp | bSig ) == 0 ) { 2836 float_raise( float_flag_invalid ); 2837 return float64_default_nan; 2838 } 2839 return packFloat64( zSign, 0x7FF, 0 ); 2840 } 2841 if ( bExp == 0x7FF ) { 2842 if ( bSig ) return propagateFloat64NaN( a, b ); 2843 if ( ( aExp | aSig ) == 0 ) { 2844 float_raise( float_flag_invalid ); 2845 return float64_default_nan; 2846 } 2847 return packFloat64( zSign, 0x7FF, 0 ); 2848 } 2849 if ( aExp == 0 ) { 2850 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2851 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2852 } 2853 if ( bExp == 0 ) { 2854 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2855 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2856 } 2857 zExp = aExp + bExp - 0x3FF; 2858 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2859 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2860 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2861 zSig0 |= ( zSig1 != 0 ); 2862 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2863 zSig0 <<= 1; 2864 --zExp; 2865 } 2866 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2867 2868 } 2869 2870 /* 2871 ------------------------------------------------------------------------------- 2872 Returns the result of dividing the double-precision floating-point value `a' 2873 by the corresponding value `b'. The operation is performed according to 2874 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2875 ------------------------------------------------------------------------------- 2876 */ 2877 float64 float64_div( float64 a, float64 b ) 2878 { 2879 flag aSign, bSign, zSign; 2880 int16 aExp, bExp, zExp; 2881 bits64 aSig, bSig, zSig; 2882 bits64 rem0, rem1; 2883 bits64 term0, term1; 2884 2885 aSig = extractFloat64Frac( a ); 2886 aExp = extractFloat64Exp( a ); 2887 aSign = extractFloat64Sign( a ); 2888 bSig = extractFloat64Frac( b ); 2889 bExp = extractFloat64Exp( b ); 2890 bSign = extractFloat64Sign( b ); 2891 zSign = aSign ^ bSign; 2892 if ( aExp == 0x7FF ) { 2893 if ( aSig ) return propagateFloat64NaN( a, b ); 2894 if ( bExp == 0x7FF ) { 2895 if ( bSig ) return propagateFloat64NaN( a, b ); 2896 float_raise( float_flag_invalid ); 2897 return float64_default_nan; 2898 } 2899 return packFloat64( zSign, 0x7FF, 0 ); 2900 } 2901 if ( bExp == 0x7FF ) { 2902 if ( bSig ) return propagateFloat64NaN( a, b ); 2903 return packFloat64( zSign, 0, 0 ); 2904 } 2905 if ( bExp == 0 ) { 2906 if ( bSig == 0 ) { 2907 if ( ( aExp | aSig ) == 0 ) { 2908 float_raise( float_flag_invalid ); 2909 return float64_default_nan; 2910 } 2911 float_raise( float_flag_divbyzero ); 2912 return packFloat64( zSign, 0x7FF, 0 ); 2913 } 2914 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2915 } 2916 if ( aExp == 0 ) { 2917 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2918 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2919 } 2920 zExp = aExp - bExp + 0x3FD; 2921 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2922 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2923 if ( bSig <= ( aSig + aSig ) ) { 2924 aSig >>= 1; 2925 ++zExp; 2926 } 2927 zSig = estimateDiv128To64( aSig, 0, bSig ); 2928 if ( ( zSig & 0x1FF ) <= 2 ) { 2929 mul64To128( bSig, zSig, &term0, &term1 ); 2930 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2931 while ( (sbits64) rem0 < 0 ) { 2932 --zSig; 2933 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2934 } 2935 zSig |= ( rem1 != 0 ); 2936 } 2937 return roundAndPackFloat64( zSign, zExp, zSig ); 2938 2939 } 2940 2941 #ifndef SOFTFLOAT_FOR_GCC 2942 /* 2943 ------------------------------------------------------------------------------- 2944 Returns the remainder of the double-precision floating-point value `a' 2945 with respect to the corresponding value `b'. The operation is performed 2946 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2947 ------------------------------------------------------------------------------- 2948 */ 2949 float64 float64_rem( float64 a, float64 b ) 2950 { 2951 flag aSign, bSign, zSign; 2952 int16 aExp, bExp, expDiff; 2953 bits64 aSig, bSig; 2954 bits64 q, alternateASig; 2955 sbits64 sigMean; 2956 2957 aSig = extractFloat64Frac( a ); 2958 aExp = extractFloat64Exp( a ); 2959 aSign = extractFloat64Sign( a ); 2960 bSig = extractFloat64Frac( b ); 2961 bExp = extractFloat64Exp( b ); 2962 bSign = extractFloat64Sign( b ); 2963 if ( aExp == 0x7FF ) { 2964 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2965 return propagateFloat64NaN( a, b ); 2966 } 2967 float_raise( float_flag_invalid ); 2968 return float64_default_nan; 2969 } 2970 if ( bExp == 0x7FF ) { 2971 if ( bSig ) return propagateFloat64NaN( a, b ); 2972 return a; 2973 } 2974 if ( bExp == 0 ) { 2975 if ( bSig == 0 ) { 2976 float_raise( float_flag_invalid ); 2977 return float64_default_nan; 2978 } 2979 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2980 } 2981 if ( aExp == 0 ) { 2982 if ( aSig == 0 ) return a; 2983 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2984 } 2985 expDiff = aExp - bExp; 2986 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 2987 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2988 if ( expDiff < 0 ) { 2989 if ( expDiff < -1 ) return a; 2990 aSig >>= 1; 2991 } 2992 q = ( bSig <= aSig ); 2993 if ( q ) aSig -= bSig; 2994 expDiff -= 64; 2995 while ( 0 < expDiff ) { 2996 q = estimateDiv128To64( aSig, 0, bSig ); 2997 q = ( 2 < q ) ? q - 2 : 0; 2998 aSig = - ( ( bSig>>2 ) * q ); 2999 expDiff -= 62; 3000 } 3001 expDiff += 64; 3002 if ( 0 < expDiff ) { 3003 q = estimateDiv128To64( aSig, 0, bSig ); 3004 q = ( 2 < q ) ? q - 2 : 0; 3005 q >>= 64 - expDiff; 3006 bSig >>= 2; 3007 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3008 } 3009 else { 3010 aSig >>= 2; 3011 bSig >>= 2; 3012 } 3013 do { 3014 alternateASig = aSig; 3015 ++q; 3016 aSig -= bSig; 3017 } while ( 0 <= (sbits64) aSig ); 3018 sigMean = aSig + alternateASig; 3019 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3020 aSig = alternateASig; 3021 } 3022 zSign = ( (sbits64) aSig < 0 ); 3023 if ( zSign ) aSig = - aSig; 3024 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3025 3026 } 3027 3028 /* 3029 ------------------------------------------------------------------------------- 3030 Returns the square root of the double-precision floating-point value `a'. 3031 The operation is performed according to the IEC/IEEE Standard for Binary 3032 Floating-Point Arithmetic. 3033 ------------------------------------------------------------------------------- 3034 */ 3035 float64 float64_sqrt( float64 a ) 3036 { 3037 flag aSign; 3038 int16 aExp, zExp; 3039 bits64 aSig, zSig, doubleZSig; 3040 bits64 rem0, rem1, term0, term1; 3041 3042 aSig = extractFloat64Frac( a ); 3043 aExp = extractFloat64Exp( a ); 3044 aSign = extractFloat64Sign( a ); 3045 if ( aExp == 0x7FF ) { 3046 if ( aSig ) return propagateFloat64NaN( a, a ); 3047 if ( ! aSign ) return a; 3048 float_raise( float_flag_invalid ); 3049 return float64_default_nan; 3050 } 3051 if ( aSign ) { 3052 if ( ( aExp | aSig ) == 0 ) return a; 3053 float_raise( float_flag_invalid ); 3054 return float64_default_nan; 3055 } 3056 if ( aExp == 0 ) { 3057 if ( aSig == 0 ) return 0; 3058 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3059 } 3060 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3061 aSig |= LIT64( 0x0010000000000000 ); 3062 zSig = estimateSqrt32( aExp, aSig>>21 ); 3063 aSig <<= 9 - ( aExp & 1 ); 3064 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3065 if ( ( zSig & 0x1FF ) <= 5 ) { 3066 doubleZSig = zSig<<1; 3067 mul64To128( zSig, zSig, &term0, &term1 ); 3068 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3069 while ( (sbits64) rem0 < 0 ) { 3070 --zSig; 3071 doubleZSig -= 2; 3072 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3073 } 3074 zSig |= ( ( rem0 | rem1 ) != 0 ); 3075 } 3076 return roundAndPackFloat64( 0, zExp, zSig ); 3077 3078 } 3079 #endif 3080 3081 /* 3082 ------------------------------------------------------------------------------- 3083 Returns 1 if the double-precision floating-point value `a' is equal to the 3084 corresponding value `b', and 0 otherwise. The comparison is performed 3085 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3086 ------------------------------------------------------------------------------- 3087 */ 3088 flag float64_eq( float64 a, float64 b ) 3089 { 3090 3091 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3092 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3093 ) { 3094 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3095 float_raise( float_flag_invalid ); 3096 } 3097 return 0; 3098 } 3099 return ( a == b ) || 3100 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3101 3102 } 3103 3104 /* 3105 ------------------------------------------------------------------------------- 3106 Returns 1 if the double-precision floating-point value `a' is less than or 3107 equal to the corresponding value `b', and 0 otherwise. The comparison is 3108 performed according to the IEC/IEEE Standard for Binary Floating-Point 3109 Arithmetic. 3110 ------------------------------------------------------------------------------- 3111 */ 3112 flag float64_le( float64 a, float64 b ) 3113 { 3114 flag aSign, bSign; 3115 3116 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3117 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3118 ) { 3119 float_raise( float_flag_invalid ); 3120 return 0; 3121 } 3122 aSign = extractFloat64Sign( a ); 3123 bSign = extractFloat64Sign( b ); 3124 if ( aSign != bSign ) 3125 return aSign || 3126 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3127 0 ); 3128 return ( a == b ) || 3129 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3130 3131 } 3132 3133 /* 3134 ------------------------------------------------------------------------------- 3135 Returns 1 if the double-precision floating-point value `a' is less than 3136 the corresponding value `b', and 0 otherwise. The comparison is performed 3137 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3138 ------------------------------------------------------------------------------- 3139 */ 3140 flag float64_lt( float64 a, float64 b ) 3141 { 3142 flag aSign, bSign; 3143 3144 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3145 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3146 ) { 3147 float_raise( float_flag_invalid ); 3148 return 0; 3149 } 3150 aSign = extractFloat64Sign( a ); 3151 bSign = extractFloat64Sign( b ); 3152 if ( aSign != bSign ) 3153 return aSign && 3154 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3155 0 ); 3156 return ( a != b ) && 3157 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3158 3159 } 3160 3161 #ifndef SOFTFLOAT_FOR_GCC 3162 /* 3163 ------------------------------------------------------------------------------- 3164 Returns 1 if the double-precision floating-point value `a' is equal to the 3165 corresponding value `b', and 0 otherwise. The invalid exception is raised 3166 if either operand is a NaN. Otherwise, the comparison is performed 3167 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3168 ------------------------------------------------------------------------------- 3169 */ 3170 flag float64_eq_signaling( float64 a, float64 b ) 3171 { 3172 3173 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3174 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3175 ) { 3176 float_raise( float_flag_invalid ); 3177 return 0; 3178 } 3179 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3180 3181 } 3182 3183 /* 3184 ------------------------------------------------------------------------------- 3185 Returns 1 if the double-precision floating-point value `a' is less than or 3186 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3187 cause an exception. Otherwise, the comparison is performed according to the 3188 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3189 ------------------------------------------------------------------------------- 3190 */ 3191 flag float64_le_quiet( float64 a, float64 b ) 3192 { 3193 flag aSign, bSign; 3194 3195 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3196 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3197 ) { 3198 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3199 float_raise( float_flag_invalid ); 3200 } 3201 return 0; 3202 } 3203 aSign = extractFloat64Sign( a ); 3204 bSign = extractFloat64Sign( b ); 3205 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3206 return ( a == b ) || ( aSign ^ ( a < b ) ); 3207 3208 } 3209 3210 /* 3211 ------------------------------------------------------------------------------- 3212 Returns 1 if the double-precision floating-point value `a' is less than 3213 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3214 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3215 Standard for Binary Floating-Point Arithmetic. 3216 ------------------------------------------------------------------------------- 3217 */ 3218 flag float64_lt_quiet( float64 a, float64 b ) 3219 { 3220 flag aSign, bSign; 3221 3222 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3223 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3224 ) { 3225 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3226 float_raise( float_flag_invalid ); 3227 } 3228 return 0; 3229 } 3230 aSign = extractFloat64Sign( a ); 3231 bSign = extractFloat64Sign( b ); 3232 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3233 return ( a != b ) && ( aSign ^ ( a < b ) ); 3234 3235 } 3236 #endif 3237 3238 #ifdef FLOATX80 3239 3240 /* 3241 ------------------------------------------------------------------------------- 3242 Returns the result of converting the extended double-precision floating- 3243 point value `a' to the 32-bit two's complement integer format. The 3244 conversion is performed according to the IEC/IEEE Standard for Binary 3245 Floating-Point Arithmetic---which means in particular that the conversion 3246 is rounded according to the current rounding mode. If `a' is a NaN, the 3247 largest positive integer is returned. Otherwise, if the conversion 3248 overflows, the largest integer with the same sign as `a' is returned. 3249 ------------------------------------------------------------------------------- 3250 */ 3251 int32 floatx80_to_int32( floatx80 a ) 3252 { 3253 flag aSign; 3254 int32 aExp, shiftCount; 3255 bits64 aSig; 3256 3257 aSig = extractFloatx80Frac( a ); 3258 aExp = extractFloatx80Exp( a ); 3259 aSign = extractFloatx80Sign( a ); 3260 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3261 shiftCount = 0x4037 - aExp; 3262 if ( shiftCount <= 0 ) shiftCount = 1; 3263 shift64RightJamming( aSig, shiftCount, &aSig ); 3264 return roundAndPackInt32( aSign, aSig ); 3265 3266 } 3267 3268 /* 3269 ------------------------------------------------------------------------------- 3270 Returns the result of converting the extended double-precision floating- 3271 point value `a' to the 32-bit two's complement integer format. The 3272 conversion is performed according to the IEC/IEEE Standard for Binary 3273 Floating-Point Arithmetic, except that the conversion is always rounded 3274 toward zero. If `a' is a NaN, the largest positive integer is returned. 3275 Otherwise, if the conversion overflows, the largest integer with the same 3276 sign as `a' is returned. 3277 ------------------------------------------------------------------------------- 3278 */ 3279 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3280 { 3281 flag aSign; 3282 int32 aExp, shiftCount; 3283 bits64 aSig, savedASig; 3284 int32 z; 3285 3286 aSig = extractFloatx80Frac( a ); 3287 aExp = extractFloatx80Exp( a ); 3288 aSign = extractFloatx80Sign( a ); 3289 if ( 0x401E < aExp ) { 3290 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3291 goto invalid; 3292 } 3293 else if ( aExp < 0x3FFF ) { 3294 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 3295 return 0; 3296 } 3297 shiftCount = 0x403E - aExp; 3298 savedASig = aSig; 3299 aSig >>= shiftCount; 3300 z = aSig; 3301 if ( aSign ) z = - z; 3302 if ( ( z < 0 ) ^ aSign ) { 3303 invalid: 3304 float_raise( float_flag_invalid ); 3305 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3306 } 3307 if ( ( aSig<<shiftCount ) != savedASig ) { 3308 float_exception_flags |= float_flag_inexact; 3309 } 3310 return z; 3311 3312 } 3313 3314 /* 3315 ------------------------------------------------------------------------------- 3316 Returns the result of converting the extended double-precision floating- 3317 point value `a' to the 64-bit two's complement integer format. The 3318 conversion is performed according to the IEC/IEEE Standard for Binary 3319 Floating-Point Arithmetic---which means in particular that the conversion 3320 is rounded according to the current rounding mode. If `a' is a NaN, 3321 the largest positive integer is returned. Otherwise, if the conversion 3322 overflows, the largest integer with the same sign as `a' is returned. 3323 ------------------------------------------------------------------------------- 3324 */ 3325 int64 floatx80_to_int64( floatx80 a ) 3326 { 3327 flag aSign; 3328 int32 aExp, shiftCount; 3329 bits64 aSig, aSigExtra; 3330 3331 aSig = extractFloatx80Frac( a ); 3332 aExp = extractFloatx80Exp( a ); 3333 aSign = extractFloatx80Sign( a ); 3334 shiftCount = 0x403E - aExp; 3335 if ( shiftCount <= 0 ) { 3336 if ( shiftCount ) { 3337 float_raise( float_flag_invalid ); 3338 if ( ! aSign 3339 || ( ( aExp == 0x7FFF ) 3340 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3341 ) { 3342 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3343 } 3344 return (sbits64) LIT64( 0x8000000000000000 ); 3345 } 3346 aSigExtra = 0; 3347 } 3348 else { 3349 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3350 } 3351 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3352 3353 } 3354 3355 /* 3356 ------------------------------------------------------------------------------- 3357 Returns the result of converting the extended double-precision floating- 3358 point value `a' to the 64-bit two's complement integer format. The 3359 conversion is performed according to the IEC/IEEE Standard for Binary 3360 Floating-Point Arithmetic, except that the conversion is always rounded 3361 toward zero. If `a' is a NaN, the largest positive integer is returned. 3362 Otherwise, if the conversion overflows, the largest integer with the same 3363 sign as `a' is returned. 3364 ------------------------------------------------------------------------------- 3365 */ 3366 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3367 { 3368 flag aSign; 3369 int32 aExp, shiftCount; 3370 bits64 aSig; 3371 int64 z; 3372 3373 aSig = extractFloatx80Frac( a ); 3374 aExp = extractFloatx80Exp( a ); 3375 aSign = extractFloatx80Sign( a ); 3376 shiftCount = aExp - 0x403E; 3377 if ( 0 <= shiftCount ) { 3378 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3379 if ( ( a.high != 0xC03E ) || aSig ) { 3380 float_raise( float_flag_invalid ); 3381 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3382 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3383 } 3384 } 3385 return (sbits64) LIT64( 0x8000000000000000 ); 3386 } 3387 else if ( aExp < 0x3FFF ) { 3388 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 3389 return 0; 3390 } 3391 z = aSig>>( - shiftCount ); 3392 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3393 float_exception_flags |= float_flag_inexact; 3394 } 3395 if ( aSign ) z = - z; 3396 return z; 3397 3398 } 3399 3400 /* 3401 ------------------------------------------------------------------------------- 3402 Returns the result of converting the extended double-precision floating- 3403 point value `a' to the single-precision floating-point format. The 3404 conversion is performed according to the IEC/IEEE Standard for Binary 3405 Floating-Point Arithmetic. 3406 ------------------------------------------------------------------------------- 3407 */ 3408 float32 floatx80_to_float32( floatx80 a ) 3409 { 3410 flag aSign; 3411 int32 aExp; 3412 bits64 aSig; 3413 3414 aSig = extractFloatx80Frac( a ); 3415 aExp = extractFloatx80Exp( a ); 3416 aSign = extractFloatx80Sign( a ); 3417 if ( aExp == 0x7FFF ) { 3418 if ( (bits64) ( aSig<<1 ) ) { 3419 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3420 } 3421 return packFloat32( aSign, 0xFF, 0 ); 3422 } 3423 shift64RightJamming( aSig, 33, &aSig ); 3424 if ( aExp || aSig ) aExp -= 0x3F81; 3425 return roundAndPackFloat32( aSign, aExp, aSig ); 3426 3427 } 3428 3429 /* 3430 ------------------------------------------------------------------------------- 3431 Returns the result of converting the extended double-precision floating- 3432 point value `a' to the double-precision floating-point format. The 3433 conversion is performed according to the IEC/IEEE Standard for Binary 3434 Floating-Point Arithmetic. 3435 ------------------------------------------------------------------------------- 3436 */ 3437 float64 floatx80_to_float64( floatx80 a ) 3438 { 3439 flag aSign; 3440 int32 aExp; 3441 bits64 aSig, zSig; 3442 3443 aSig = extractFloatx80Frac( a ); 3444 aExp = extractFloatx80Exp( a ); 3445 aSign = extractFloatx80Sign( a ); 3446 if ( aExp == 0x7FFF ) { 3447 if ( (bits64) ( aSig<<1 ) ) { 3448 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3449 } 3450 return packFloat64( aSign, 0x7FF, 0 ); 3451 } 3452 shift64RightJamming( aSig, 1, &zSig ); 3453 if ( aExp || aSig ) aExp -= 0x3C01; 3454 return roundAndPackFloat64( aSign, aExp, zSig ); 3455 3456 } 3457 3458 #ifdef FLOAT128 3459 3460 /* 3461 ------------------------------------------------------------------------------- 3462 Returns the result of converting the extended double-precision floating- 3463 point value `a' to the quadruple-precision floating-point format. The 3464 conversion is performed according to the IEC/IEEE Standard for Binary 3465 Floating-Point Arithmetic. 3466 ------------------------------------------------------------------------------- 3467 */ 3468 float128 floatx80_to_float128( floatx80 a ) 3469 { 3470 flag aSign; 3471 int16 aExp; 3472 bits64 aSig, zSig0, zSig1; 3473 3474 aSig = extractFloatx80Frac( a ); 3475 aExp = extractFloatx80Exp( a ); 3476 aSign = extractFloatx80Sign( a ); 3477 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3478 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3479 } 3480 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3481 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3482 3483 } 3484 3485 #endif 3486 3487 /* 3488 ------------------------------------------------------------------------------- 3489 Rounds the extended double-precision floating-point value `a' to an integer, 3490 and returns the result as an extended quadruple-precision floating-point 3491 value. The operation is performed according to the IEC/IEEE Standard for 3492 Binary Floating-Point Arithmetic. 3493 ------------------------------------------------------------------------------- 3494 */ 3495 floatx80 floatx80_round_to_int( floatx80 a ) 3496 { 3497 flag aSign; 3498 int32 aExp; 3499 bits64 lastBitMask, roundBitsMask; 3500 int8 roundingMode; 3501 floatx80 z; 3502 3503 aExp = extractFloatx80Exp( a ); 3504 if ( 0x403E <= aExp ) { 3505 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3506 return propagateFloatx80NaN( a, a ); 3507 } 3508 return a; 3509 } 3510 if ( aExp < 0x3FFF ) { 3511 if ( ( aExp == 0 ) 3512 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3513 return a; 3514 } 3515 float_exception_flags |= float_flag_inexact; 3516 aSign = extractFloatx80Sign( a ); 3517 switch ( float_rounding_mode ) { 3518 case float_round_nearest_even: 3519 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3520 ) { 3521 return 3522 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3523 } 3524 break; 3525 case float_round_to_zero: 3526 break; 3527 case float_round_down: 3528 return 3529 aSign ? 3530 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3531 : packFloatx80( 0, 0, 0 ); 3532 case float_round_up: 3533 return 3534 aSign ? packFloatx80( 1, 0, 0 ) 3535 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3536 } 3537 return packFloatx80( aSign, 0, 0 ); 3538 } 3539 lastBitMask = 1; 3540 lastBitMask <<= 0x403E - aExp; 3541 roundBitsMask = lastBitMask - 1; 3542 z = a; 3543 roundingMode = float_rounding_mode; 3544 if ( roundingMode == float_round_nearest_even ) { 3545 z.low += lastBitMask>>1; 3546 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3547 } 3548 else if ( roundingMode != float_round_to_zero ) { 3549 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3550 z.low += roundBitsMask; 3551 } 3552 } 3553 z.low &= ~ roundBitsMask; 3554 if ( z.low == 0 ) { 3555 ++z.high; 3556 z.low = LIT64( 0x8000000000000000 ); 3557 } 3558 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact; 3559 return z; 3560 3561 } 3562 3563 /* 3564 ------------------------------------------------------------------------------- 3565 Returns the result of adding the absolute values of the extended double- 3566 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3567 negated before being returned. `zSign' is ignored if the result is a NaN. 3568 The addition is performed according to the IEC/IEEE Standard for Binary 3569 Floating-Point Arithmetic. 3570 ------------------------------------------------------------------------------- 3571 */ 3572 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3573 { 3574 int32 aExp, bExp, zExp; 3575 bits64 aSig, bSig, zSig0, zSig1; 3576 int32 expDiff; 3577 3578 aSig = extractFloatx80Frac( a ); 3579 aExp = extractFloatx80Exp( a ); 3580 bSig = extractFloatx80Frac( b ); 3581 bExp = extractFloatx80Exp( b ); 3582 expDiff = aExp - bExp; 3583 if ( 0 < expDiff ) { 3584 if ( aExp == 0x7FFF ) { 3585 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3586 return a; 3587 } 3588 if ( bExp == 0 ) --expDiff; 3589 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3590 zExp = aExp; 3591 } 3592 else if ( expDiff < 0 ) { 3593 if ( bExp == 0x7FFF ) { 3594 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3595 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3596 } 3597 if ( aExp == 0 ) ++expDiff; 3598 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3599 zExp = bExp; 3600 } 3601 else { 3602 if ( aExp == 0x7FFF ) { 3603 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3604 return propagateFloatx80NaN( a, b ); 3605 } 3606 return a; 3607 } 3608 zSig1 = 0; 3609 zSig0 = aSig + bSig; 3610 if ( aExp == 0 ) { 3611 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3612 goto roundAndPack; 3613 } 3614 zExp = aExp; 3615 goto shiftRight1; 3616 } 3617 zSig0 = aSig + bSig; 3618 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3619 shiftRight1: 3620 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3621 zSig0 |= LIT64( 0x8000000000000000 ); 3622 ++zExp; 3623 roundAndPack: 3624 return 3625 roundAndPackFloatx80( 3626 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3627 3628 } 3629 3630 /* 3631 ------------------------------------------------------------------------------- 3632 Returns the result of subtracting the absolute values of the extended 3633 double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3634 difference is negated before being returned. `zSign' is ignored if the 3635 result is a NaN. The subtraction is performed according to the IEC/IEEE 3636 Standard for Binary Floating-Point Arithmetic. 3637 ------------------------------------------------------------------------------- 3638 */ 3639 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3640 { 3641 int32 aExp, bExp, zExp; 3642 bits64 aSig, bSig, zSig0, zSig1; 3643 int32 expDiff; 3644 floatx80 z; 3645 3646 aSig = extractFloatx80Frac( a ); 3647 aExp = extractFloatx80Exp( a ); 3648 bSig = extractFloatx80Frac( b ); 3649 bExp = extractFloatx80Exp( b ); 3650 expDiff = aExp - bExp; 3651 if ( 0 < expDiff ) goto aExpBigger; 3652 if ( expDiff < 0 ) goto bExpBigger; 3653 if ( aExp == 0x7FFF ) { 3654 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3655 return propagateFloatx80NaN( a, b ); 3656 } 3657 float_raise( float_flag_invalid ); 3658 z.low = floatx80_default_nan_low; 3659 z.high = floatx80_default_nan_high; 3660 return z; 3661 } 3662 if ( aExp == 0 ) { 3663 aExp = 1; 3664 bExp = 1; 3665 } 3666 zSig1 = 0; 3667 if ( bSig < aSig ) goto aBigger; 3668 if ( aSig < bSig ) goto bBigger; 3669 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); 3670 bExpBigger: 3671 if ( bExp == 0x7FFF ) { 3672 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3673 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3674 } 3675 if ( aExp == 0 ) ++expDiff; 3676 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3677 bBigger: 3678 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3679 zExp = bExp; 3680 zSign ^= 1; 3681 goto normalizeRoundAndPack; 3682 aExpBigger: 3683 if ( aExp == 0x7FFF ) { 3684 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3685 return a; 3686 } 3687 if ( bExp == 0 ) --expDiff; 3688 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3689 aBigger: 3690 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3691 zExp = aExp; 3692 normalizeRoundAndPack: 3693 return 3694 normalizeRoundAndPackFloatx80( 3695 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3696 3697 } 3698 3699 /* 3700 ------------------------------------------------------------------------------- 3701 Returns the result of adding the extended double-precision floating-point 3702 values `a' and `b'. The operation is performed according to the IEC/IEEE 3703 Standard for Binary Floating-Point Arithmetic. 3704 ------------------------------------------------------------------------------- 3705 */ 3706 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3707 { 3708 flag aSign, bSign; 3709 3710 aSign = extractFloatx80Sign( a ); 3711 bSign = extractFloatx80Sign( b ); 3712 if ( aSign == bSign ) { 3713 return addFloatx80Sigs( a, b, aSign ); 3714 } 3715 else { 3716 return subFloatx80Sigs( a, b, aSign ); 3717 } 3718 3719 } 3720 3721 /* 3722 ------------------------------------------------------------------------------- 3723 Returns the result of subtracting the extended double-precision floating- 3724 point values `a' and `b'. The operation is performed according to the 3725 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3726 ------------------------------------------------------------------------------- 3727 */ 3728 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3729 { 3730 flag aSign, bSign; 3731 3732 aSign = extractFloatx80Sign( a ); 3733 bSign = extractFloatx80Sign( b ); 3734 if ( aSign == bSign ) { 3735 return subFloatx80Sigs( a, b, aSign ); 3736 } 3737 else { 3738 return addFloatx80Sigs( a, b, aSign ); 3739 } 3740 3741 } 3742 3743 /* 3744 ------------------------------------------------------------------------------- 3745 Returns the result of multiplying the extended double-precision floating- 3746 point values `a' and `b'. The operation is performed according to the 3747 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3748 ------------------------------------------------------------------------------- 3749 */ 3750 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3751 { 3752 flag aSign, bSign, zSign; 3753 int32 aExp, bExp, zExp; 3754 bits64 aSig, bSig, zSig0, zSig1; 3755 floatx80 z; 3756 3757 aSig = extractFloatx80Frac( a ); 3758 aExp = extractFloatx80Exp( a ); 3759 aSign = extractFloatx80Sign( a ); 3760 bSig = extractFloatx80Frac( b ); 3761 bExp = extractFloatx80Exp( b ); 3762 bSign = extractFloatx80Sign( b ); 3763 zSign = aSign ^ bSign; 3764 if ( aExp == 0x7FFF ) { 3765 if ( (bits64) ( aSig<<1 ) 3766 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3767 return propagateFloatx80NaN( a, b ); 3768 } 3769 if ( ( bExp | bSig ) == 0 ) goto invalid; 3770 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3771 } 3772 if ( bExp == 0x7FFF ) { 3773 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3774 if ( ( aExp | aSig ) == 0 ) { 3775 invalid: 3776 float_raise( float_flag_invalid ); 3777 z.low = floatx80_default_nan_low; 3778 z.high = floatx80_default_nan_high; 3779 return z; 3780 } 3781 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3782 } 3783 if ( aExp == 0 ) { 3784 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3785 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3786 } 3787 if ( bExp == 0 ) { 3788 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3789 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3790 } 3791 zExp = aExp + bExp - 0x3FFE; 3792 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3793 if ( 0 < (sbits64) zSig0 ) { 3794 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3795 --zExp; 3796 } 3797 return 3798 roundAndPackFloatx80( 3799 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3800 3801 } 3802 3803 /* 3804 ------------------------------------------------------------------------------- 3805 Returns the result of dividing the extended double-precision floating-point 3806 value `a' by the corresponding value `b'. The operation is performed 3807 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3808 ------------------------------------------------------------------------------- 3809 */ 3810 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3811 { 3812 flag aSign, bSign, zSign; 3813 int32 aExp, bExp, zExp; 3814 bits64 aSig, bSig, zSig0, zSig1; 3815 bits64 rem0, rem1, rem2, term0, term1, term2; 3816 floatx80 z; 3817 3818 aSig = extractFloatx80Frac( a ); 3819 aExp = extractFloatx80Exp( a ); 3820 aSign = extractFloatx80Sign( a ); 3821 bSig = extractFloatx80Frac( b ); 3822 bExp = extractFloatx80Exp( b ); 3823 bSign = extractFloatx80Sign( b ); 3824 zSign = aSign ^ bSign; 3825 if ( aExp == 0x7FFF ) { 3826 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3827 if ( bExp == 0x7FFF ) { 3828 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3829 goto invalid; 3830 } 3831 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3832 } 3833 if ( bExp == 0x7FFF ) { 3834 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3835 return packFloatx80( zSign, 0, 0 ); 3836 } 3837 if ( bExp == 0 ) { 3838 if ( bSig == 0 ) { 3839 if ( ( aExp | aSig ) == 0 ) { 3840 invalid: 3841 float_raise( float_flag_invalid ); 3842 z.low = floatx80_default_nan_low; 3843 z.high = floatx80_default_nan_high; 3844 return z; 3845 } 3846 float_raise( float_flag_divbyzero ); 3847 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3848 } 3849 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3850 } 3851 if ( aExp == 0 ) { 3852 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3853 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3854 } 3855 zExp = aExp - bExp + 0x3FFE; 3856 rem1 = 0; 3857 if ( bSig <= aSig ) { 3858 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3859 ++zExp; 3860 } 3861 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3862 mul64To128( bSig, zSig0, &term0, &term1 ); 3863 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3864 while ( (sbits64) rem0 < 0 ) { 3865 --zSig0; 3866 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3867 } 3868 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3869 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3870 mul64To128( bSig, zSig1, &term1, &term2 ); 3871 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3872 while ( (sbits64) rem1 < 0 ) { 3873 --zSig1; 3874 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3875 } 3876 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3877 } 3878 return 3879 roundAndPackFloatx80( 3880 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3881 3882 } 3883 3884 /* 3885 ------------------------------------------------------------------------------- 3886 Returns the remainder of the extended double-precision floating-point value 3887 `a' with respect to the corresponding value `b'. The operation is performed 3888 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3889 ------------------------------------------------------------------------------- 3890 */ 3891 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3892 { 3893 flag aSign, bSign, zSign; 3894 int32 aExp, bExp, expDiff; 3895 bits64 aSig0, aSig1, bSig; 3896 bits64 q, term0, term1, alternateASig0, alternateASig1; 3897 floatx80 z; 3898 3899 aSig0 = extractFloatx80Frac( a ); 3900 aExp = extractFloatx80Exp( a ); 3901 aSign = extractFloatx80Sign( a ); 3902 bSig = extractFloatx80Frac( b ); 3903 bExp = extractFloatx80Exp( b ); 3904 bSign = extractFloatx80Sign( b ); 3905 if ( aExp == 0x7FFF ) { 3906 if ( (bits64) ( aSig0<<1 ) 3907 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3908 return propagateFloatx80NaN( a, b ); 3909 } 3910 goto invalid; 3911 } 3912 if ( bExp == 0x7FFF ) { 3913 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3914 return a; 3915 } 3916 if ( bExp == 0 ) { 3917 if ( bSig == 0 ) { 3918 invalid: 3919 float_raise( float_flag_invalid ); 3920 z.low = floatx80_default_nan_low; 3921 z.high = floatx80_default_nan_high; 3922 return z; 3923 } 3924 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3925 } 3926 if ( aExp == 0 ) { 3927 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3928 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3929 } 3930 bSig |= LIT64( 0x8000000000000000 ); 3931 zSign = aSign; 3932 expDiff = aExp - bExp; 3933 aSig1 = 0; 3934 if ( expDiff < 0 ) { 3935 if ( expDiff < -1 ) return a; 3936 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3937 expDiff = 0; 3938 } 3939 q = ( bSig <= aSig0 ); 3940 if ( q ) aSig0 -= bSig; 3941 expDiff -= 64; 3942 while ( 0 < expDiff ) { 3943 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3944 q = ( 2 < q ) ? q - 2 : 0; 3945 mul64To128( bSig, q, &term0, &term1 ); 3946 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3947 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3948 expDiff -= 62; 3949 } 3950 expDiff += 64; 3951 if ( 0 < expDiff ) { 3952 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3953 q = ( 2 < q ) ? q - 2 : 0; 3954 q >>= 64 - expDiff; 3955 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3956 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3957 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3958 while ( le128( term0, term1, aSig0, aSig1 ) ) { 3959 ++q; 3960 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3961 } 3962 } 3963 else { 3964 term1 = 0; 3965 term0 = bSig; 3966 } 3967 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 3968 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3969 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3970 && ( q & 1 ) ) 3971 ) { 3972 aSig0 = alternateASig0; 3973 aSig1 = alternateASig1; 3974 zSign = ! zSign; 3975 } 3976 return 3977 normalizeRoundAndPackFloatx80( 3978 80, zSign, bExp + expDiff, aSig0, aSig1 ); 3979 3980 } 3981 3982 /* 3983 ------------------------------------------------------------------------------- 3984 Returns the square root of the extended double-precision floating-point 3985 value `a'. The operation is performed according to the IEC/IEEE Standard 3986 for Binary Floating-Point Arithmetic. 3987 ------------------------------------------------------------------------------- 3988 */ 3989 floatx80 floatx80_sqrt( floatx80 a ) 3990 { 3991 flag aSign; 3992 int32 aExp, zExp; 3993 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 3994 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 3995 floatx80 z; 3996 3997 aSig0 = extractFloatx80Frac( a ); 3998 aExp = extractFloatx80Exp( a ); 3999 aSign = extractFloatx80Sign( a ); 4000 if ( aExp == 0x7FFF ) { 4001 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4002 if ( ! aSign ) return a; 4003 goto invalid; 4004 } 4005 if ( aSign ) { 4006 if ( ( aExp | aSig0 ) == 0 ) return a; 4007 invalid: 4008 float_raise( float_flag_invalid ); 4009 z.low = floatx80_default_nan_low; 4010 z.high = floatx80_default_nan_high; 4011 return z; 4012 } 4013 if ( aExp == 0 ) { 4014 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4015 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4016 } 4017 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4018 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4019 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4020 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4021 doubleZSig0 = zSig0<<1; 4022 mul64To128( zSig0, zSig0, &term0, &term1 ); 4023 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4024 while ( (sbits64) rem0 < 0 ) { 4025 --zSig0; 4026 doubleZSig0 -= 2; 4027 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4028 } 4029 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4030 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4031 if ( zSig1 == 0 ) zSig1 = 1; 4032 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4033 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4034 mul64To128( zSig1, zSig1, &term2, &term3 ); 4035 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4036 while ( (sbits64) rem1 < 0 ) { 4037 --zSig1; 4038 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4039 term3 |= 1; 4040 term2 |= doubleZSig0; 4041 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4042 } 4043 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4044 } 4045 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4046 zSig0 |= doubleZSig0; 4047 return 4048 roundAndPackFloatx80( 4049 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4050 4051 } 4052 4053 /* 4054 ------------------------------------------------------------------------------- 4055 Returns 1 if the extended double-precision floating-point value `a' is 4056 equal to the corresponding value `b', and 0 otherwise. The comparison is 4057 performed according to the IEC/IEEE Standard for Binary Floating-Point 4058 Arithmetic. 4059 ------------------------------------------------------------------------------- 4060 */ 4061 flag floatx80_eq( floatx80 a, floatx80 b ) 4062 { 4063 4064 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4065 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4066 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4067 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4068 ) { 4069 if ( floatx80_is_signaling_nan( a ) 4070 || floatx80_is_signaling_nan( b ) ) { 4071 float_raise( float_flag_invalid ); 4072 } 4073 return 0; 4074 } 4075 return 4076 ( a.low == b.low ) 4077 && ( ( a.high == b.high ) 4078 || ( ( a.low == 0 ) 4079 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4080 ); 4081 4082 } 4083 4084 /* 4085 ------------------------------------------------------------------------------- 4086 Returns 1 if the extended double-precision floating-point value `a' is 4087 less than or equal to the corresponding value `b', and 0 otherwise. The 4088 comparison is performed according to the IEC/IEEE Standard for Binary 4089 Floating-Point Arithmetic. 4090 ------------------------------------------------------------------------------- 4091 */ 4092 flag floatx80_le( floatx80 a, floatx80 b ) 4093 { 4094 flag aSign, bSign; 4095 4096 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4097 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4098 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4099 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4100 ) { 4101 float_raise( float_flag_invalid ); 4102 return 0; 4103 } 4104 aSign = extractFloatx80Sign( a ); 4105 bSign = extractFloatx80Sign( b ); 4106 if ( aSign != bSign ) { 4107 return 4108 aSign 4109 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4110 == 0 ); 4111 } 4112 return 4113 aSign ? le128( b.high, b.low, a.high, a.low ) 4114 : le128( a.high, a.low, b.high, b.low ); 4115 4116 } 4117 4118 /* 4119 ------------------------------------------------------------------------------- 4120 Returns 1 if the extended double-precision floating-point value `a' is 4121 less than the corresponding value `b', and 0 otherwise. The comparison 4122 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4123 Arithmetic. 4124 ------------------------------------------------------------------------------- 4125 */ 4126 flag floatx80_lt( floatx80 a, floatx80 b ) 4127 { 4128 flag aSign, bSign; 4129 4130 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4131 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4132 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4133 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4134 ) { 4135 float_raise( float_flag_invalid ); 4136 return 0; 4137 } 4138 aSign = extractFloatx80Sign( a ); 4139 bSign = extractFloatx80Sign( b ); 4140 if ( aSign != bSign ) { 4141 return 4142 aSign 4143 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4144 != 0 ); 4145 } 4146 return 4147 aSign ? lt128( b.high, b.low, a.high, a.low ) 4148 : lt128( a.high, a.low, b.high, b.low ); 4149 4150 } 4151 4152 /* 4153 ------------------------------------------------------------------------------- 4154 Returns 1 if the extended double-precision floating-point value `a' is equal 4155 to the corresponding value `b', and 0 otherwise. The invalid exception is 4156 raised if either operand is a NaN. Otherwise, the comparison is performed 4157 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4158 ------------------------------------------------------------------------------- 4159 */ 4160 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4161 { 4162 4163 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4164 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4165 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4166 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4167 ) { 4168 float_raise( float_flag_invalid ); 4169 return 0; 4170 } 4171 return 4172 ( a.low == b.low ) 4173 && ( ( a.high == b.high ) 4174 || ( ( a.low == 0 ) 4175 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4176 ); 4177 4178 } 4179 4180 /* 4181 ------------------------------------------------------------------------------- 4182 Returns 1 if the extended double-precision floating-point value `a' is less 4183 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4184 do not cause an exception. Otherwise, the comparison is performed according 4185 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4186 ------------------------------------------------------------------------------- 4187 */ 4188 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4189 { 4190 flag aSign, bSign; 4191 4192 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4193 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4194 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4195 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4196 ) { 4197 if ( floatx80_is_signaling_nan( a ) 4198 || floatx80_is_signaling_nan( b ) ) { 4199 float_raise( float_flag_invalid ); 4200 } 4201 return 0; 4202 } 4203 aSign = extractFloatx80Sign( a ); 4204 bSign = extractFloatx80Sign( b ); 4205 if ( aSign != bSign ) { 4206 return 4207 aSign 4208 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4209 == 0 ); 4210 } 4211 return 4212 aSign ? le128( b.high, b.low, a.high, a.low ) 4213 : le128( a.high, a.low, b.high, b.low ); 4214 4215 } 4216 4217 /* 4218 ------------------------------------------------------------------------------- 4219 Returns 1 if the extended double-precision floating-point value `a' is less 4220 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4221 an exception. Otherwise, the comparison is performed according to the 4222 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4223 ------------------------------------------------------------------------------- 4224 */ 4225 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4226 { 4227 flag aSign, bSign; 4228 4229 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4230 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4231 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4232 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4233 ) { 4234 if ( floatx80_is_signaling_nan( a ) 4235 || floatx80_is_signaling_nan( b ) ) { 4236 float_raise( float_flag_invalid ); 4237 } 4238 return 0; 4239 } 4240 aSign = extractFloatx80Sign( a ); 4241 bSign = extractFloatx80Sign( b ); 4242 if ( aSign != bSign ) { 4243 return 4244 aSign 4245 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4246 != 0 ); 4247 } 4248 return 4249 aSign ? lt128( b.high, b.low, a.high, a.low ) 4250 : lt128( a.high, a.low, b.high, b.low ); 4251 4252 } 4253 4254 #endif 4255 4256 #ifdef FLOAT128 4257 4258 /* 4259 ------------------------------------------------------------------------------- 4260 Returns the result of converting the quadruple-precision floating-point 4261 value `a' to the 32-bit two's complement integer format. The conversion 4262 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4263 Arithmetic---which means in particular that the conversion is rounded 4264 according to the current rounding mode. If `a' is a NaN, the largest 4265 positive integer is returned. Otherwise, if the conversion overflows, the 4266 largest integer with the same sign as `a' is returned. 4267 ------------------------------------------------------------------------------- 4268 */ 4269 int32 float128_to_int32( float128 a ) 4270 { 4271 flag aSign; 4272 int32 aExp, shiftCount; 4273 bits64 aSig0, aSig1; 4274 4275 aSig1 = extractFloat128Frac1( a ); 4276 aSig0 = extractFloat128Frac0( a ); 4277 aExp = extractFloat128Exp( a ); 4278 aSign = extractFloat128Sign( a ); 4279 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4280 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4281 aSig0 |= ( aSig1 != 0 ); 4282 shiftCount = 0x4028 - aExp; 4283 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4284 return roundAndPackInt32( aSign, aSig0 ); 4285 4286 } 4287 4288 /* 4289 ------------------------------------------------------------------------------- 4290 Returns the result of converting the quadruple-precision floating-point 4291 value `a' to the 32-bit two's complement integer format. The conversion 4292 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4293 Arithmetic, except that the conversion is always rounded toward zero. If 4294 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4295 conversion overflows, the largest integer with the same sign as `a' is 4296 returned. 4297 ------------------------------------------------------------------------------- 4298 */ 4299 int32 float128_to_int32_round_to_zero( float128 a ) 4300 { 4301 flag aSign; 4302 int32 aExp, shiftCount; 4303 bits64 aSig0, aSig1, savedASig; 4304 int32 z; 4305 4306 aSig1 = extractFloat128Frac1( a ); 4307 aSig0 = extractFloat128Frac0( a ); 4308 aExp = extractFloat128Exp( a ); 4309 aSign = extractFloat128Sign( a ); 4310 aSig0 |= ( aSig1 != 0 ); 4311 if ( 0x401E < aExp ) { 4312 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4313 goto invalid; 4314 } 4315 else if ( aExp < 0x3FFF ) { 4316 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact; 4317 return 0; 4318 } 4319 aSig0 |= LIT64( 0x0001000000000000 ); 4320 shiftCount = 0x402F - aExp; 4321 savedASig = aSig0; 4322 aSig0 >>= shiftCount; 4323 z = aSig0; 4324 if ( aSign ) z = - z; 4325 if ( ( z < 0 ) ^ aSign ) { 4326 invalid: 4327 float_raise( float_flag_invalid ); 4328 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4329 } 4330 if ( ( aSig0<<shiftCount ) != savedASig ) { 4331 float_exception_flags |= float_flag_inexact; 4332 } 4333 return z; 4334 4335 } 4336 4337 /* 4338 ------------------------------------------------------------------------------- 4339 Returns the result of converting the quadruple-precision floating-point 4340 value `a' to the 64-bit two's complement integer format. The conversion 4341 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4342 Arithmetic---which means in particular that the conversion is rounded 4343 according to the current rounding mode. If `a' is a NaN, the largest 4344 positive integer is returned. Otherwise, if the conversion overflows, the 4345 largest integer with the same sign as `a' is returned. 4346 ------------------------------------------------------------------------------- 4347 */ 4348 int64 float128_to_int64( float128 a ) 4349 { 4350 flag aSign; 4351 int32 aExp, shiftCount; 4352 bits64 aSig0, aSig1; 4353 4354 aSig1 = extractFloat128Frac1( a ); 4355 aSig0 = extractFloat128Frac0( a ); 4356 aExp = extractFloat128Exp( a ); 4357 aSign = extractFloat128Sign( a ); 4358 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4359 shiftCount = 0x402F - aExp; 4360 if ( shiftCount <= 0 ) { 4361 if ( 0x403E < aExp ) { 4362 float_raise( float_flag_invalid ); 4363 if ( ! aSign 4364 || ( ( aExp == 0x7FFF ) 4365 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4366 ) 4367 ) { 4368 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4369 } 4370 return (sbits64) LIT64( 0x8000000000000000 ); 4371 } 4372 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4373 } 4374 else { 4375 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4376 } 4377 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4378 4379 } 4380 4381 /* 4382 ------------------------------------------------------------------------------- 4383 Returns the result of converting the quadruple-precision floating-point 4384 value `a' to the 64-bit two's complement integer format. The conversion 4385 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4386 Arithmetic, except that the conversion is always rounded toward zero. 4387 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4388 the conversion overflows, the largest integer with the same sign as `a' is 4389 returned. 4390 ------------------------------------------------------------------------------- 4391 */ 4392 int64 float128_to_int64_round_to_zero( float128 a ) 4393 { 4394 flag aSign; 4395 int32 aExp, shiftCount; 4396 bits64 aSig0, aSig1; 4397 int64 z; 4398 4399 aSig1 = extractFloat128Frac1( a ); 4400 aSig0 = extractFloat128Frac0( a ); 4401 aExp = extractFloat128Exp( a ); 4402 aSign = extractFloat128Sign( a ); 4403 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4404 shiftCount = aExp - 0x402F; 4405 if ( 0 < shiftCount ) { 4406 if ( 0x403E <= aExp ) { 4407 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4408 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4409 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4410 if ( aSig1 ) float_exception_flags |= float_flag_inexact; 4411 } 4412 else { 4413 float_raise( float_flag_invalid ); 4414 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4415 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4416 } 4417 } 4418 return (sbits64) LIT64( 0x8000000000000000 ); 4419 } 4420 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4421 if ( (bits64) ( aSig1<<shiftCount ) ) { 4422 float_exception_flags |= float_flag_inexact; 4423 } 4424 } 4425 else { 4426 if ( aExp < 0x3FFF ) { 4427 if ( aExp | aSig0 | aSig1 ) { 4428 float_exception_flags |= float_flag_inexact; 4429 } 4430 return 0; 4431 } 4432 z = aSig0>>( - shiftCount ); 4433 if ( aSig1 4434 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4435 float_exception_flags |= float_flag_inexact; 4436 } 4437 } 4438 if ( aSign ) z = - z; 4439 return z; 4440 4441 } 4442 4443 /* 4444 ------------------------------------------------------------------------------- 4445 Returns the result of converting the quadruple-precision floating-point 4446 value `a' to the single-precision floating-point format. The conversion 4447 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4448 Arithmetic. 4449 ------------------------------------------------------------------------------- 4450 */ 4451 float32 float128_to_float32( float128 a ) 4452 { 4453 flag aSign; 4454 int32 aExp; 4455 bits64 aSig0, aSig1; 4456 bits32 zSig; 4457 4458 aSig1 = extractFloat128Frac1( a ); 4459 aSig0 = extractFloat128Frac0( a ); 4460 aExp = extractFloat128Exp( a ); 4461 aSign = extractFloat128Sign( a ); 4462 if ( aExp == 0x7FFF ) { 4463 if ( aSig0 | aSig1 ) { 4464 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4465 } 4466 return packFloat32( aSign, 0xFF, 0 ); 4467 } 4468 aSig0 |= ( aSig1 != 0 ); 4469 shift64RightJamming( aSig0, 18, &aSig0 ); 4470 zSig = aSig0; 4471 if ( aExp || zSig ) { 4472 zSig |= 0x40000000; 4473 aExp -= 0x3F81; 4474 } 4475 return roundAndPackFloat32( aSign, aExp, zSig ); 4476 4477 } 4478 4479 /* 4480 ------------------------------------------------------------------------------- 4481 Returns the result of converting the quadruple-precision floating-point 4482 value `a' to the double-precision floating-point format. The conversion 4483 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4484 Arithmetic. 4485 ------------------------------------------------------------------------------- 4486 */ 4487 float64 float128_to_float64( float128 a ) 4488 { 4489 flag aSign; 4490 int32 aExp; 4491 bits64 aSig0, aSig1; 4492 4493 aSig1 = extractFloat128Frac1( a ); 4494 aSig0 = extractFloat128Frac0( a ); 4495 aExp = extractFloat128Exp( a ); 4496 aSign = extractFloat128Sign( a ); 4497 if ( aExp == 0x7FFF ) { 4498 if ( aSig0 | aSig1 ) { 4499 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4500 } 4501 return packFloat64( aSign, 0x7FF, 0 ); 4502 } 4503 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4504 aSig0 |= ( aSig1 != 0 ); 4505 if ( aExp || aSig0 ) { 4506 aSig0 |= LIT64( 0x4000000000000000 ); 4507 aExp -= 0x3C01; 4508 } 4509 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4510 4511 } 4512 4513 #ifdef FLOATX80 4514 4515 /* 4516 ------------------------------------------------------------------------------- 4517 Returns the result of converting the quadruple-precision floating-point 4518 value `a' to the extended double-precision floating-point format. The 4519 conversion is performed according to the IEC/IEEE Standard for Binary 4520 Floating-Point Arithmetic. 4521 ------------------------------------------------------------------------------- 4522 */ 4523 floatx80 float128_to_floatx80( float128 a ) 4524 { 4525 flag aSign; 4526 int32 aExp; 4527 bits64 aSig0, aSig1; 4528 4529 aSig1 = extractFloat128Frac1( a ); 4530 aSig0 = extractFloat128Frac0( a ); 4531 aExp = extractFloat128Exp( a ); 4532 aSign = extractFloat128Sign( a ); 4533 if ( aExp == 0x7FFF ) { 4534 if ( aSig0 | aSig1 ) { 4535 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4536 } 4537 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4538 } 4539 if ( aExp == 0 ) { 4540 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4541 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4542 } 4543 else { 4544 aSig0 |= LIT64( 0x0001000000000000 ); 4545 } 4546 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4547 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4548 4549 } 4550 4551 #endif 4552 4553 /* 4554 ------------------------------------------------------------------------------- 4555 Rounds the quadruple-precision floating-point value `a' to an integer, and 4556 returns the result as a quadruple-precision floating-point value. The 4557 operation is performed according to the IEC/IEEE Standard for Binary 4558 Floating-Point Arithmetic. 4559 ------------------------------------------------------------------------------- 4560 */ 4561 float128 float128_round_to_int( float128 a ) 4562 { 4563 flag aSign; 4564 int32 aExp; 4565 bits64 lastBitMask, roundBitsMask; 4566 int8 roundingMode; 4567 float128 z; 4568 4569 aExp = extractFloat128Exp( a ); 4570 if ( 0x402F <= aExp ) { 4571 if ( 0x406F <= aExp ) { 4572 if ( ( aExp == 0x7FFF ) 4573 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4574 ) { 4575 return propagateFloat128NaN( a, a ); 4576 } 4577 return a; 4578 } 4579 lastBitMask = 1; 4580 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4581 roundBitsMask = lastBitMask - 1; 4582 z = a; 4583 roundingMode = float_rounding_mode; 4584 if ( roundingMode == float_round_nearest_even ) { 4585 if ( lastBitMask ) { 4586 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4587 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4588 } 4589 else { 4590 if ( (sbits64) z.low < 0 ) { 4591 ++z.high; 4592 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4593 } 4594 } 4595 } 4596 else if ( roundingMode != float_round_to_zero ) { 4597 if ( extractFloat128Sign( z ) 4598 ^ ( roundingMode == float_round_up ) ) { 4599 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4600 } 4601 } 4602 z.low &= ~ roundBitsMask; 4603 } 4604 else { 4605 if ( aExp < 0x3FFF ) { 4606 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4607 float_exception_flags |= float_flag_inexact; 4608 aSign = extractFloat128Sign( a ); 4609 switch ( float_rounding_mode ) { 4610 case float_round_nearest_even: 4611 if ( ( aExp == 0x3FFE ) 4612 && ( extractFloat128Frac0( a ) 4613 | extractFloat128Frac1( a ) ) 4614 ) { 4615 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4616 } 4617 break; 4618 case float_round_to_zero: 4619 break; 4620 case float_round_down: 4621 return 4622 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4623 : packFloat128( 0, 0, 0, 0 ); 4624 case float_round_up: 4625 return 4626 aSign ? packFloat128( 1, 0, 0, 0 ) 4627 : packFloat128( 0, 0x3FFF, 0, 0 ); 4628 } 4629 return packFloat128( aSign, 0, 0, 0 ); 4630 } 4631 lastBitMask = 1; 4632 lastBitMask <<= 0x402F - aExp; 4633 roundBitsMask = lastBitMask - 1; 4634 z.low = 0; 4635 z.high = a.high; 4636 roundingMode = float_rounding_mode; 4637 if ( roundingMode == float_round_nearest_even ) { 4638 z.high += lastBitMask>>1; 4639 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4640 z.high &= ~ lastBitMask; 4641 } 4642 } 4643 else if ( roundingMode != float_round_to_zero ) { 4644 if ( extractFloat128Sign( z ) 4645 ^ ( roundingMode == float_round_up ) ) { 4646 z.high |= ( a.low != 0 ); 4647 z.high += roundBitsMask; 4648 } 4649 } 4650 z.high &= ~ roundBitsMask; 4651 } 4652 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4653 float_exception_flags |= float_flag_inexact; 4654 } 4655 return z; 4656 4657 } 4658 4659 /* 4660 ------------------------------------------------------------------------------- 4661 Returns the result of adding the absolute values of the quadruple-precision 4662 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4663 before being returned. `zSign' is ignored if the result is a NaN. 4664 The addition is performed according to the IEC/IEEE Standard for Binary 4665 Floating-Point Arithmetic. 4666 ------------------------------------------------------------------------------- 4667 */ 4668 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4669 { 4670 int32 aExp, bExp, zExp; 4671 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4672 int32 expDiff; 4673 4674 aSig1 = extractFloat128Frac1( a ); 4675 aSig0 = extractFloat128Frac0( a ); 4676 aExp = extractFloat128Exp( a ); 4677 bSig1 = extractFloat128Frac1( b ); 4678 bSig0 = extractFloat128Frac0( b ); 4679 bExp = extractFloat128Exp( b ); 4680 expDiff = aExp - bExp; 4681 if ( 0 < expDiff ) { 4682 if ( aExp == 0x7FFF ) { 4683 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4684 return a; 4685 } 4686 if ( bExp == 0 ) { 4687 --expDiff; 4688 } 4689 else { 4690 bSig0 |= LIT64( 0x0001000000000000 ); 4691 } 4692 shift128ExtraRightJamming( 4693 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4694 zExp = aExp; 4695 } 4696 else if ( expDiff < 0 ) { 4697 if ( bExp == 0x7FFF ) { 4698 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4699 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4700 } 4701 if ( aExp == 0 ) { 4702 ++expDiff; 4703 } 4704 else { 4705 aSig0 |= LIT64( 0x0001000000000000 ); 4706 } 4707 shift128ExtraRightJamming( 4708 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4709 zExp = bExp; 4710 } 4711 else { 4712 if ( aExp == 0x7FFF ) { 4713 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4714 return propagateFloat128NaN( a, b ); 4715 } 4716 return a; 4717 } 4718 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4719 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4720 zSig2 = 0; 4721 zSig0 |= LIT64( 0x0002000000000000 ); 4722 zExp = aExp; 4723 goto shiftRight1; 4724 } 4725 aSig0 |= LIT64( 0x0001000000000000 ); 4726 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4727 --zExp; 4728 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4729 ++zExp; 4730 shiftRight1: 4731 shift128ExtraRightJamming( 4732 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4733 roundAndPack: 4734 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4735 4736 } 4737 4738 /* 4739 ------------------------------------------------------------------------------- 4740 Returns the result of subtracting the absolute values of the quadruple- 4741 precision floating-point values `a' and `b'. If `zSign' is 1, the 4742 difference is negated before being returned. `zSign' is ignored if the 4743 result is a NaN. The subtraction is performed according to the IEC/IEEE 4744 Standard for Binary Floating-Point Arithmetic. 4745 ------------------------------------------------------------------------------- 4746 */ 4747 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4748 { 4749 int32 aExp, bExp, zExp; 4750 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4751 int32 expDiff; 4752 float128 z; 4753 4754 aSig1 = extractFloat128Frac1( a ); 4755 aSig0 = extractFloat128Frac0( a ); 4756 aExp = extractFloat128Exp( a ); 4757 bSig1 = extractFloat128Frac1( b ); 4758 bSig0 = extractFloat128Frac0( b ); 4759 bExp = extractFloat128Exp( b ); 4760 expDiff = aExp - bExp; 4761 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4762 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4763 if ( 0 < expDiff ) goto aExpBigger; 4764 if ( expDiff < 0 ) goto bExpBigger; 4765 if ( aExp == 0x7FFF ) { 4766 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4767 return propagateFloat128NaN( a, b ); 4768 } 4769 float_raise( float_flag_invalid ); 4770 z.low = float128_default_nan_low; 4771 z.high = float128_default_nan_high; 4772 return z; 4773 } 4774 if ( aExp == 0 ) { 4775 aExp = 1; 4776 bExp = 1; 4777 } 4778 if ( bSig0 < aSig0 ) goto aBigger; 4779 if ( aSig0 < bSig0 ) goto bBigger; 4780 if ( bSig1 < aSig1 ) goto aBigger; 4781 if ( aSig1 < bSig1 ) goto bBigger; 4782 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 ); 4783 bExpBigger: 4784 if ( bExp == 0x7FFF ) { 4785 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4786 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4787 } 4788 if ( aExp == 0 ) { 4789 ++expDiff; 4790 } 4791 else { 4792 aSig0 |= LIT64( 0x4000000000000000 ); 4793 } 4794 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4795 bSig0 |= LIT64( 0x4000000000000000 ); 4796 bBigger: 4797 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4798 zExp = bExp; 4799 zSign ^= 1; 4800 goto normalizeRoundAndPack; 4801 aExpBigger: 4802 if ( aExp == 0x7FFF ) { 4803 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4804 return a; 4805 } 4806 if ( bExp == 0 ) { 4807 --expDiff; 4808 } 4809 else { 4810 bSig0 |= LIT64( 0x4000000000000000 ); 4811 } 4812 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4813 aSig0 |= LIT64( 0x4000000000000000 ); 4814 aBigger: 4815 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4816 zExp = aExp; 4817 normalizeRoundAndPack: 4818 --zExp; 4819 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4820 4821 } 4822 4823 /* 4824 ------------------------------------------------------------------------------- 4825 Returns the result of adding the quadruple-precision floating-point values 4826 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4827 for Binary Floating-Point Arithmetic. 4828 ------------------------------------------------------------------------------- 4829 */ 4830 float128 float128_add( float128 a, float128 b ) 4831 { 4832 flag aSign, bSign; 4833 4834 aSign = extractFloat128Sign( a ); 4835 bSign = extractFloat128Sign( b ); 4836 if ( aSign == bSign ) { 4837 return addFloat128Sigs( a, b, aSign ); 4838 } 4839 else { 4840 return subFloat128Sigs( a, b, aSign ); 4841 } 4842 4843 } 4844 4845 /* 4846 ------------------------------------------------------------------------------- 4847 Returns the result of subtracting the quadruple-precision floating-point 4848 values `a' and `b'. The operation is performed according to the IEC/IEEE 4849 Standard for Binary Floating-Point Arithmetic. 4850 ------------------------------------------------------------------------------- 4851 */ 4852 float128 float128_sub( float128 a, float128 b ) 4853 { 4854 flag aSign, bSign; 4855 4856 aSign = extractFloat128Sign( a ); 4857 bSign = extractFloat128Sign( b ); 4858 if ( aSign == bSign ) { 4859 return subFloat128Sigs( a, b, aSign ); 4860 } 4861 else { 4862 return addFloat128Sigs( a, b, aSign ); 4863 } 4864 4865 } 4866 4867 /* 4868 ------------------------------------------------------------------------------- 4869 Returns the result of multiplying the quadruple-precision floating-point 4870 values `a' and `b'. The operation is performed according to the IEC/IEEE 4871 Standard for Binary Floating-Point Arithmetic. 4872 ------------------------------------------------------------------------------- 4873 */ 4874 float128 float128_mul( float128 a, float128 b ) 4875 { 4876 flag aSign, bSign, zSign; 4877 int32 aExp, bExp, zExp; 4878 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4879 float128 z; 4880 4881 aSig1 = extractFloat128Frac1( a ); 4882 aSig0 = extractFloat128Frac0( a ); 4883 aExp = extractFloat128Exp( a ); 4884 aSign = extractFloat128Sign( a ); 4885 bSig1 = extractFloat128Frac1( b ); 4886 bSig0 = extractFloat128Frac0( b ); 4887 bExp = extractFloat128Exp( b ); 4888 bSign = extractFloat128Sign( b ); 4889 zSign = aSign ^ bSign; 4890 if ( aExp == 0x7FFF ) { 4891 if ( ( aSig0 | aSig1 ) 4892 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4893 return propagateFloat128NaN( a, b ); 4894 } 4895 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4896 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4897 } 4898 if ( bExp == 0x7FFF ) { 4899 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4900 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4901 invalid: 4902 float_raise( float_flag_invalid ); 4903 z.low = float128_default_nan_low; 4904 z.high = float128_default_nan_high; 4905 return z; 4906 } 4907 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4908 } 4909 if ( aExp == 0 ) { 4910 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4911 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4912 } 4913 if ( bExp == 0 ) { 4914 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4915 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4916 } 4917 zExp = aExp + bExp - 0x4000; 4918 aSig0 |= LIT64( 0x0001000000000000 ); 4919 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 4920 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 4921 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4922 zSig2 |= ( zSig3 != 0 ); 4923 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 4924 shift128ExtraRightJamming( 4925 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4926 ++zExp; 4927 } 4928 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4929 4930 } 4931 4932 /* 4933 ------------------------------------------------------------------------------- 4934 Returns the result of dividing the quadruple-precision floating-point value 4935 `a' by the corresponding value `b'. The operation is performed according to 4936 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4937 ------------------------------------------------------------------------------- 4938 */ 4939 float128 float128_div( float128 a, float128 b ) 4940 { 4941 flag aSign, bSign, zSign; 4942 int32 aExp, bExp, zExp; 4943 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4944 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4945 float128 z; 4946 4947 aSig1 = extractFloat128Frac1( a ); 4948 aSig0 = extractFloat128Frac0( a ); 4949 aExp = extractFloat128Exp( a ); 4950 aSign = extractFloat128Sign( a ); 4951 bSig1 = extractFloat128Frac1( b ); 4952 bSig0 = extractFloat128Frac0( b ); 4953 bExp = extractFloat128Exp( b ); 4954 bSign = extractFloat128Sign( b ); 4955 zSign = aSign ^ bSign; 4956 if ( aExp == 0x7FFF ) { 4957 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4958 if ( bExp == 0x7FFF ) { 4959 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4960 goto invalid; 4961 } 4962 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4963 } 4964 if ( bExp == 0x7FFF ) { 4965 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4966 return packFloat128( zSign, 0, 0, 0 ); 4967 } 4968 if ( bExp == 0 ) { 4969 if ( ( bSig0 | bSig1 ) == 0 ) { 4970 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4971 invalid: 4972 float_raise( float_flag_invalid ); 4973 z.low = float128_default_nan_low; 4974 z.high = float128_default_nan_high; 4975 return z; 4976 } 4977 float_raise( float_flag_divbyzero ); 4978 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4979 } 4980 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4981 } 4982 if ( aExp == 0 ) { 4983 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4984 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4985 } 4986 zExp = aExp - bExp + 0x3FFD; 4987 shortShift128Left( 4988 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 4989 shortShift128Left( 4990 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 4991 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 4992 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 4993 ++zExp; 4994 } 4995 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 4996 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 4997 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 4998 while ( (sbits64) rem0 < 0 ) { 4999 --zSig0; 5000 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5001 } 5002 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5003 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5004 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5005 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5006 while ( (sbits64) rem1 < 0 ) { 5007 --zSig1; 5008 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5009 } 5010 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5011 } 5012 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5013 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5014 5015 } 5016 5017 /* 5018 ------------------------------------------------------------------------------- 5019 Returns the remainder of the quadruple-precision floating-point value `a' 5020 with respect to the corresponding value `b'. The operation is performed 5021 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5022 ------------------------------------------------------------------------------- 5023 */ 5024 float128 float128_rem( float128 a, float128 b ) 5025 { 5026 flag aSign, bSign, zSign; 5027 int32 aExp, bExp, expDiff; 5028 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5029 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5030 sbits64 sigMean0; 5031 float128 z; 5032 5033 aSig1 = extractFloat128Frac1( a ); 5034 aSig0 = extractFloat128Frac0( a ); 5035 aExp = extractFloat128Exp( a ); 5036 aSign = extractFloat128Sign( a ); 5037 bSig1 = extractFloat128Frac1( b ); 5038 bSig0 = extractFloat128Frac0( b ); 5039 bExp = extractFloat128Exp( b ); 5040 bSign = extractFloat128Sign( b ); 5041 if ( aExp == 0x7FFF ) { 5042 if ( ( aSig0 | aSig1 ) 5043 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5044 return propagateFloat128NaN( a, b ); 5045 } 5046 goto invalid; 5047 } 5048 if ( bExp == 0x7FFF ) { 5049 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5050 return a; 5051 } 5052 if ( bExp == 0 ) { 5053 if ( ( bSig0 | bSig1 ) == 0 ) { 5054 invalid: 5055 float_raise( float_flag_invalid ); 5056 z.low = float128_default_nan_low; 5057 z.high = float128_default_nan_high; 5058 return z; 5059 } 5060 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5061 } 5062 if ( aExp == 0 ) { 5063 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5064 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5065 } 5066 expDiff = aExp - bExp; 5067 if ( expDiff < -1 ) return a; 5068 shortShift128Left( 5069 aSig0 | LIT64( 0x0001000000000000 ), 5070 aSig1, 5071 15 - ( expDiff < 0 ), 5072 &aSig0, 5073 &aSig1 5074 ); 5075 shortShift128Left( 5076 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5077 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5078 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5079 expDiff -= 64; 5080 while ( 0 < expDiff ) { 5081 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5082 q = ( 4 < q ) ? q - 4 : 0; 5083 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5084 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5085 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5086 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5087 expDiff -= 61; 5088 } 5089 if ( -64 < expDiff ) { 5090 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5091 q = ( 4 < q ) ? q - 4 : 0; 5092 q >>= - expDiff; 5093 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5094 expDiff += 52; 5095 if ( expDiff < 0 ) { 5096 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5097 } 5098 else { 5099 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5100 } 5101 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5102 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5103 } 5104 else { 5105 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5106 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5107 } 5108 do { 5109 alternateASig0 = aSig0; 5110 alternateASig1 = aSig1; 5111 ++q; 5112 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5113 } while ( 0 <= (sbits64) aSig0 ); 5114 add128( 5115 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 5116 if ( ( sigMean0 < 0 ) 5117 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5118 aSig0 = alternateASig0; 5119 aSig1 = alternateASig1; 5120 } 5121 zSign = ( (sbits64) aSig0 < 0 ); 5122 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5123 return 5124 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5125 5126 } 5127 5128 /* 5129 ------------------------------------------------------------------------------- 5130 Returns the square root of the quadruple-precision floating-point value `a'. 5131 The operation is performed according to the IEC/IEEE Standard for Binary 5132 Floating-Point Arithmetic. 5133 ------------------------------------------------------------------------------- 5134 */ 5135 float128 float128_sqrt( float128 a ) 5136 { 5137 flag aSign; 5138 int32 aExp, zExp; 5139 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5140 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5141 float128 z; 5142 5143 aSig1 = extractFloat128Frac1( a ); 5144 aSig0 = extractFloat128Frac0( a ); 5145 aExp = extractFloat128Exp( a ); 5146 aSign = extractFloat128Sign( a ); 5147 if ( aExp == 0x7FFF ) { 5148 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5149 if ( ! aSign ) return a; 5150 goto invalid; 5151 } 5152 if ( aSign ) { 5153 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5154 invalid: 5155 float_raise( float_flag_invalid ); 5156 z.low = float128_default_nan_low; 5157 z.high = float128_default_nan_high; 5158 return z; 5159 } 5160 if ( aExp == 0 ) { 5161 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5162 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5163 } 5164 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5165 aSig0 |= LIT64( 0x0001000000000000 ); 5166 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5167 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5168 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5169 doubleZSig0 = zSig0<<1; 5170 mul64To128( zSig0, zSig0, &term0, &term1 ); 5171 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5172 while ( (sbits64) rem0 < 0 ) { 5173 --zSig0; 5174 doubleZSig0 -= 2; 5175 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5176 } 5177 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5178 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5179 if ( zSig1 == 0 ) zSig1 = 1; 5180 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5181 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5182 mul64To128( zSig1, zSig1, &term2, &term3 ); 5183 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5184 while ( (sbits64) rem1 < 0 ) { 5185 --zSig1; 5186 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5187 term3 |= 1; 5188 term2 |= doubleZSig0; 5189 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5190 } 5191 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5192 } 5193 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5194 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5195 5196 } 5197 5198 /* 5199 ------------------------------------------------------------------------------- 5200 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5201 the corresponding value `b', and 0 otherwise. The comparison is performed 5202 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5203 ------------------------------------------------------------------------------- 5204 */ 5205 flag float128_eq( float128 a, float128 b ) 5206 { 5207 5208 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5209 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5210 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5211 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5212 ) { 5213 if ( float128_is_signaling_nan( a ) 5214 || float128_is_signaling_nan( b ) ) { 5215 float_raise( float_flag_invalid ); 5216 } 5217 return 0; 5218 } 5219 return 5220 ( a.low == b.low ) 5221 && ( ( a.high == b.high ) 5222 || ( ( a.low == 0 ) 5223 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5224 ); 5225 5226 } 5227 5228 /* 5229 ------------------------------------------------------------------------------- 5230 Returns 1 if the quadruple-precision floating-point value `a' is less than 5231 or equal to the corresponding value `b', and 0 otherwise. The comparison 5232 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5233 Arithmetic. 5234 ------------------------------------------------------------------------------- 5235 */ 5236 flag float128_le( float128 a, float128 b ) 5237 { 5238 flag aSign, bSign; 5239 5240 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5241 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5242 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5243 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5244 ) { 5245 float_raise( float_flag_invalid ); 5246 return 0; 5247 } 5248 aSign = extractFloat128Sign( a ); 5249 bSign = extractFloat128Sign( b ); 5250 if ( aSign != bSign ) { 5251 return 5252 aSign 5253 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5254 == 0 ); 5255 } 5256 return 5257 aSign ? le128( b.high, b.low, a.high, a.low ) 5258 : le128( a.high, a.low, b.high, b.low ); 5259 5260 } 5261 5262 /* 5263 ------------------------------------------------------------------------------- 5264 Returns 1 if the quadruple-precision floating-point value `a' is less than 5265 the corresponding value `b', and 0 otherwise. The comparison is performed 5266 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5267 ------------------------------------------------------------------------------- 5268 */ 5269 flag float128_lt( float128 a, float128 b ) 5270 { 5271 flag aSign, bSign; 5272 5273 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5274 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5275 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5276 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5277 ) { 5278 float_raise( float_flag_invalid ); 5279 return 0; 5280 } 5281 aSign = extractFloat128Sign( a ); 5282 bSign = extractFloat128Sign( b ); 5283 if ( aSign != bSign ) { 5284 return 5285 aSign 5286 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5287 != 0 ); 5288 } 5289 return 5290 aSign ? lt128( b.high, b.low, a.high, a.low ) 5291 : lt128( a.high, a.low, b.high, b.low ); 5292 5293 } 5294 5295 /* 5296 ------------------------------------------------------------------------------- 5297 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5298 the corresponding value `b', and 0 otherwise. The invalid exception is 5299 raised if either operand is a NaN. Otherwise, the comparison is performed 5300 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5301 ------------------------------------------------------------------------------- 5302 */ 5303 flag float128_eq_signaling( float128 a, float128 b ) 5304 { 5305 5306 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5307 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5308 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5309 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5310 ) { 5311 float_raise( float_flag_invalid ); 5312 return 0; 5313 } 5314 return 5315 ( a.low == b.low ) 5316 && ( ( a.high == b.high ) 5317 || ( ( a.low == 0 ) 5318 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5319 ); 5320 5321 } 5322 5323 /* 5324 ------------------------------------------------------------------------------- 5325 Returns 1 if the quadruple-precision floating-point value `a' is less than 5326 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5327 cause an exception. Otherwise, the comparison is performed according to the 5328 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5329 ------------------------------------------------------------------------------- 5330 */ 5331 flag float128_le_quiet( float128 a, float128 b ) 5332 { 5333 flag aSign, bSign; 5334 5335 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5336 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5337 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5338 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5339 ) { 5340 if ( float128_is_signaling_nan( a ) 5341 || float128_is_signaling_nan( b ) ) { 5342 float_raise( float_flag_invalid ); 5343 } 5344 return 0; 5345 } 5346 aSign = extractFloat128Sign( a ); 5347 bSign = extractFloat128Sign( b ); 5348 if ( aSign != bSign ) { 5349 return 5350 aSign 5351 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5352 == 0 ); 5353 } 5354 return 5355 aSign ? le128( b.high, b.low, a.high, a.low ) 5356 : le128( a.high, a.low, b.high, b.low ); 5357 5358 } 5359 5360 /* 5361 ------------------------------------------------------------------------------- 5362 Returns 1 if the quadruple-precision floating-point value `a' is less than 5363 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5364 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5365 Standard for Binary Floating-Point Arithmetic. 5366 ------------------------------------------------------------------------------- 5367 */ 5368 flag float128_lt_quiet( float128 a, float128 b ) 5369 { 5370 flag aSign, bSign; 5371 5372 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5373 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5374 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5375 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5376 ) { 5377 if ( float128_is_signaling_nan( a ) 5378 || float128_is_signaling_nan( b ) ) { 5379 float_raise( float_flag_invalid ); 5380 } 5381 return 0; 5382 } 5383 aSign = extractFloat128Sign( a ); 5384 bSign = extractFloat128Sign( b ); 5385 if ( aSign != bSign ) { 5386 return 5387 aSign 5388 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5389 != 0 ); 5390 } 5391 return 5392 aSign ? lt128( b.high, b.low, a.high, a.low ) 5393 : lt128( a.high, a.low, b.high, b.low ); 5394 5395 } 5396 5397 #endif 5398 5399 5400 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5401 5402 /* 5403 * These two routines are not part of the original softfloat distribution. 5404 * 5405 * They are based on the corresponding conversions to integer but return 5406 * unsigned numbers instead since these functions are required by GCC. 5407 * 5408 * Added by Mark Brinicombe <mark@netbsd.org> 27/09/97 5409 * 5410 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5411 */ 5412 5413 /* 5414 ------------------------------------------------------------------------------- 5415 Returns the result of converting the double-precision floating-point value 5416 `a' to the 32-bit unsigned integer format. The conversion is 5417 performed according to the IEC/IEEE Standard for Binary Floating-point 5418 Arithmetic, except that the conversion is always rounded toward zero. If 5419 `a' is a NaN, the largest positive integer is returned. If the conversion 5420 overflows, the largest integer positive is returned. 5421 ------------------------------------------------------------------------------- 5422 */ 5423 uint32 float64_to_uint32_round_to_zero( float64 a ) 5424 { 5425 flag aSign; 5426 int16 aExp, shiftCount; 5427 bits64 aSig, savedASig; 5428 uint32 z; 5429 5430 aSig = extractFloat64Frac( a ); 5431 aExp = extractFloat64Exp( a ); 5432 aSign = extractFloat64Sign( a ); 5433 5434 if (aSign) { 5435 float_raise( float_flag_invalid ); 5436 return(0); 5437 } 5438 5439 if ( 0x41E < aExp ) { 5440 float_raise( float_flag_invalid ); 5441 return 0xffffffff; 5442 } 5443 else if ( aExp < 0x3FF ) { 5444 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 5445 return 0; 5446 } 5447 aSig |= LIT64( 0x0010000000000000 ); 5448 shiftCount = 0x433 - aExp; 5449 savedASig = aSig; 5450 aSig >>= shiftCount; 5451 z = aSig; 5452 if ( ( aSig<<shiftCount ) != savedASig ) { 5453 float_exception_flags |= float_flag_inexact; 5454 } 5455 return z; 5456 5457 } 5458 5459 /* 5460 ------------------------------------------------------------------------------- 5461 Returns the result of converting the single-precision floating-point value 5462 `a' to the 32-bit unsigned integer format. The conversion is 5463 performed according to the IEC/IEEE Standard for Binary Floating-point 5464 Arithmetic, except that the conversion is always rounded toward zero. If 5465 `a' is a NaN, the largest positive integer is returned. If the conversion 5466 overflows, the largest positive integer is returned. 5467 ------------------------------------------------------------------------------- 5468 */ 5469 uint32 float32_to_uint32_round_to_zero( float32 a ) 5470 { 5471 flag aSign; 5472 int16 aExp, shiftCount; 5473 bits32 aSig; 5474 uint32 z; 5475 5476 aSig = extractFloat32Frac( a ); 5477 aExp = extractFloat32Exp( a ); 5478 aSign = extractFloat32Sign( a ); 5479 shiftCount = aExp - 0x9E; 5480 5481 if (aSign) { 5482 float_raise( float_flag_invalid ); 5483 return(0); 5484 } 5485 if ( 0 < shiftCount ) { 5486 float_raise( float_flag_invalid ); 5487 return 0xFFFFFFFF; 5488 } 5489 else if ( aExp <= 0x7E ) { 5490 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 5491 return 0; 5492 } 5493 aSig = ( aSig | 0x800000 )<<8; 5494 z = aSig>>( - shiftCount ); 5495 if ( aSig<<( shiftCount & 31 ) ) { 5496 float_exception_flags |= float_flag_inexact; 5497 } 5498 return z; 5499 5500 } 5501 5502 #endif 5503