1// Copyright 2014 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// This file implements multi-precision floating-point numbers. 6// Like in the GNU MPFR library (https://www.mpfr.org/), operands 7// can be of mixed precision. Unlike MPFR, the rounding mode is 8// not specified with each operation, but with each operand. The 9// rounding mode of the result operand determines the rounding 10// mode of an operation. This is a from-scratch implementation. 11 12package big 13 14import ( 15 "fmt" 16 "math" 17 "math/bits" 18) 19 20const debugFloat = false // enable for debugging 21 22// A nonzero finite Float represents a multi-precision floating point number 23// 24// sign × mantissa × 2**exponent 25// 26// with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp. 27// A Float may also be zero (+0, -0) or infinite (+Inf, -Inf). 28// All Floats are ordered, and the ordering of two Floats x and y 29// is defined by x.Cmp(y). 30// 31// Each Float value also has a precision, rounding mode, and accuracy. 32// The precision is the maximum number of mantissa bits available to 33// represent the value. The rounding mode specifies how a result should 34// be rounded to fit into the mantissa bits, and accuracy describes the 35// rounding error with respect to the exact result. 36// 37// Unless specified otherwise, all operations (including setters) that 38// specify a *Float variable for the result (usually via the receiver 39// with the exception of MantExp), round the numeric result according 40// to the precision and rounding mode of the result variable. 41// 42// If the provided result precision is 0 (see below), it is set to the 43// precision of the argument with the largest precision value before any 44// rounding takes place, and the rounding mode remains unchanged. Thus, 45// uninitialized Floats provided as result arguments will have their 46// precision set to a reasonable value determined by the operands, and 47// their mode is the zero value for RoundingMode (ToNearestEven). 48// 49// By setting the desired precision to 24 or 53 and using matching rounding 50// mode (typically ToNearestEven), Float operations produce the same results 51// as the corresponding float32 or float64 IEEE-754 arithmetic for operands 52// that correspond to normal (i.e., not denormal) float32 or float64 numbers. 53// Exponent underflow and overflow lead to a 0 or an Infinity for different 54// values than IEEE-754 because Float exponents have a much larger range. 55// 56// The zero (uninitialized) value for a Float is ready to use and represents 57// the number +0.0 exactly, with precision 0 and rounding mode ToNearestEven. 58// 59// Operations always take pointer arguments (*Float) rather 60// than Float values, and each unique Float value requires 61// its own unique *Float pointer. To "copy" a Float value, 62// an existing (or newly allocated) Float must be set to 63// a new value using the Float.Set method; shallow copies 64// of Floats are not supported and may lead to errors. 65type Float struct { 66 prec uint32 67 mode RoundingMode 68 acc Accuracy 69 form form 70 neg bool 71 mant nat 72 exp int32 73} 74 75// An ErrNaN panic is raised by a Float operation that would lead to 76// a NaN under IEEE-754 rules. An ErrNaN implements the error interface. 77type ErrNaN struct { 78 msg string 79} 80 81func (err ErrNaN) Error() string { 82 return err.msg 83} 84 85// NewFloat allocates and returns a new Float set to x, 86// with precision 53 and rounding mode ToNearestEven. 87// NewFloat panics with ErrNaN if x is a NaN. 88func NewFloat(x float64) *Float { 89 if math.IsNaN(x) { 90 panic(ErrNaN{"NewFloat(NaN)"}) 91 } 92 return new(Float).SetFloat64(x) 93} 94 95// Exponent and precision limits. 96const ( 97 MaxExp = math.MaxInt32 // largest supported exponent 98 MinExp = math.MinInt32 // smallest supported exponent 99 MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited 100) 101 102// Internal representation: The mantissa bits x.mant of a nonzero finite 103// Float x are stored in a nat slice long enough to hold up to x.prec bits; 104// the slice may (but doesn't have to) be shorter if the mantissa contains 105// trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e., 106// the msb is shifted all the way "to the left"). Thus, if the mantissa has 107// trailing 0 bits or x.prec is not a multiple of the Word size _W, 108// x.mant[0] has trailing zero bits. The msb of the mantissa corresponds 109// to the value 0.5; the exponent x.exp shifts the binary point as needed. 110// 111// A zero or non-finite Float x ignores x.mant and x.exp. 112// 113// x form neg mant exp 114// ---------------------------------------------------------- 115// ±0 zero sign - - 116// 0 < |x| < +Inf finite sign mantissa exponent 117// ±Inf inf sign - - 118 119// A form value describes the internal representation. 120type form byte 121 122// The form value order is relevant - do not change! 123const ( 124 zero form = iota 125 finite 126 inf 127) 128 129// RoundingMode determines how a Float value is rounded to the 130// desired precision. Rounding may change the Float value; the 131// rounding error is described by the Float's Accuracy. 132type RoundingMode byte 133 134// These constants define supported rounding modes. 135const ( 136 ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven 137 ToNearestAway // == IEEE 754-2008 roundTiesToAway 138 ToZero // == IEEE 754-2008 roundTowardZero 139 AwayFromZero // no IEEE 754-2008 equivalent 140 ToNegativeInf // == IEEE 754-2008 roundTowardNegative 141 ToPositiveInf // == IEEE 754-2008 roundTowardPositive 142) 143 144//go:generate stringer -type=RoundingMode 145 146// Accuracy describes the rounding error produced by the most recent 147// operation that generated a Float value, relative to the exact value. 148type Accuracy int8 149 150// Constants describing the Accuracy of a Float. 151const ( 152 Below Accuracy = -1 153 Exact Accuracy = 0 154 Above Accuracy = +1 155) 156 157//go:generate stringer -type=Accuracy 158 159// SetPrec sets z's precision to prec and returns the (possibly) rounded 160// value of z. Rounding occurs according to z's rounding mode if the mantissa 161// cannot be represented in prec bits without loss of precision. 162// SetPrec(0) maps all finite values to ±0; infinite values remain unchanged. 163// If prec > MaxPrec, it is set to MaxPrec. 164func (z *Float) SetPrec(prec uint) *Float { 165 z.acc = Exact // optimistically assume no rounding is needed 166 167 // special case 168 if prec == 0 { 169 z.prec = 0 170 if z.form == finite { 171 // truncate z to 0 172 z.acc = makeAcc(z.neg) 173 z.form = zero 174 } 175 return z 176 } 177 178 // general case 179 if prec > MaxPrec { 180 prec = MaxPrec 181 } 182 old := z.prec 183 z.prec = uint32(prec) 184 if z.prec < old { 185 z.round(0) 186 } 187 return z 188} 189 190func makeAcc(above bool) Accuracy { 191 if above { 192 return Above 193 } 194 return Below 195} 196 197// SetMode sets z's rounding mode to mode and returns an exact z. 198// z remains unchanged otherwise. 199// z.SetMode(z.Mode()) is a cheap way to set z's accuracy to Exact. 200func (z *Float) SetMode(mode RoundingMode) *Float { 201 z.mode = mode 202 z.acc = Exact 203 return z 204} 205 206// Prec returns the mantissa precision of x in bits. 207// The result may be 0 for |x| == 0 and |x| == Inf. 208func (x *Float) Prec() uint { 209 return uint(x.prec) 210} 211 212// MinPrec returns the minimum precision required to represent x exactly 213// (i.e., the smallest prec before x.SetPrec(prec) would start rounding x). 214// The result is 0 for |x| == 0 and |x| == Inf. 215func (x *Float) MinPrec() uint { 216 if x.form != finite { 217 return 0 218 } 219 return uint(len(x.mant))*_W - x.mant.trailingZeroBits() 220} 221 222// Mode returns the rounding mode of x. 223func (x *Float) Mode() RoundingMode { 224 return x.mode 225} 226 227// Acc returns the accuracy of x produced by the most recent operation. 228func (x *Float) Acc() Accuracy { 229 return x.acc 230} 231 232// Sign returns: 233// 234// -1 if x < 0 235// 0 if x is ±0 236// +1 if x > 0 237// 238func (x *Float) Sign() int { 239 if debugFloat { 240 x.validate() 241 } 242 if x.form == zero { 243 return 0 244 } 245 if x.neg { 246 return -1 247 } 248 return 1 249} 250 251// MantExp breaks x into its mantissa and exponent components 252// and returns the exponent. If a non-nil mant argument is 253// provided its value is set to the mantissa of x, with the 254// same precision and rounding mode as x. The components 255// satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0. 256// Calling MantExp with a nil argument is an efficient way to 257// get the exponent of the receiver. 258// 259// Special cases are: 260// 261// ( ±0).MantExp(mant) = 0, with mant set to ±0 262// (±Inf).MantExp(mant) = 0, with mant set to ±Inf 263// 264// x and mant may be the same in which case x is set to its 265// mantissa value. 266func (x *Float) MantExp(mant *Float) (exp int) { 267 if debugFloat { 268 x.validate() 269 } 270 if x.form == finite { 271 exp = int(x.exp) 272 } 273 if mant != nil { 274 mant.Copy(x) 275 if mant.form == finite { 276 mant.exp = 0 277 } 278 } 279 return 280} 281 282func (z *Float) setExpAndRound(exp int64, sbit uint) { 283 if exp < MinExp { 284 // underflow 285 z.acc = makeAcc(z.neg) 286 z.form = zero 287 return 288 } 289 290 if exp > MaxExp { 291 // overflow 292 z.acc = makeAcc(!z.neg) 293 z.form = inf 294 return 295 } 296 297 z.form = finite 298 z.exp = int32(exp) 299 z.round(sbit) 300} 301 302// SetMantExp sets z to mant × 2**exp and returns z. 303// The result z has the same precision and rounding mode 304// as mant. SetMantExp is an inverse of MantExp but does 305// not require 0.5 <= |mant| < 1.0. Specifically: 306// 307// mant := new(Float) 308// new(Float).SetMantExp(mant, x.MantExp(mant)).Cmp(x) == 0 309// 310// Special cases are: 311// 312// z.SetMantExp( ±0, exp) = ±0 313// z.SetMantExp(±Inf, exp) = ±Inf 314// 315// z and mant may be the same in which case z's exponent 316// is set to exp. 317func (z *Float) SetMantExp(mant *Float, exp int) *Float { 318 if debugFloat { 319 z.validate() 320 mant.validate() 321 } 322 z.Copy(mant) 323 if z.form != finite { 324 return z 325 } 326 z.setExpAndRound(int64(z.exp)+int64(exp), 0) 327 return z 328} 329 330// Signbit reports whether x is negative or negative zero. 331func (x *Float) Signbit() bool { 332 return x.neg 333} 334 335// IsInf reports whether x is +Inf or -Inf. 336func (x *Float) IsInf() bool { 337 return x.form == inf 338} 339 340// IsInt reports whether x is an integer. 341// ±Inf values are not integers. 342func (x *Float) IsInt() bool { 343 if debugFloat { 344 x.validate() 345 } 346 // special cases 347 if x.form != finite { 348 return x.form == zero 349 } 350 // x.form == finite 351 if x.exp <= 0 { 352 return false 353 } 354 // x.exp > 0 355 return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa 356} 357 358// debugging support 359func (x *Float) validate() { 360 if !debugFloat { 361 // avoid performance bugs 362 panic("validate called but debugFloat is not set") 363 } 364 if x.form != finite { 365 return 366 } 367 m := len(x.mant) 368 if m == 0 { 369 panic("nonzero finite number with empty mantissa") 370 } 371 const msb = 1 << (_W - 1) 372 if x.mant[m-1]&msb == 0 { 373 panic(fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Text('p', 0))) 374 } 375 if x.prec == 0 { 376 panic("zero precision finite number") 377 } 378} 379 380// round rounds z according to z.mode to z.prec bits and sets z.acc accordingly. 381// sbit must be 0 or 1 and summarizes any "sticky bit" information one might 382// have before calling round. z's mantissa must be normalized (with the msb set) 383// or empty. 384// 385// CAUTION: The rounding modes ToNegativeInf, ToPositiveInf are affected by the 386// sign of z. For correct rounding, the sign of z must be set correctly before 387// calling round. 388func (z *Float) round(sbit uint) { 389 if debugFloat { 390 z.validate() 391 } 392 393 z.acc = Exact 394 if z.form != finite { 395 // ±0 or ±Inf => nothing left to do 396 return 397 } 398 // z.form == finite && len(z.mant) > 0 399 // m > 0 implies z.prec > 0 (checked by validate) 400 401 m := uint32(len(z.mant)) // present mantissa length in words 402 bits := m * _W // present mantissa bits; bits > 0 403 if bits <= z.prec { 404 // mantissa fits => nothing to do 405 return 406 } 407 // bits > z.prec 408 409 // Rounding is based on two bits: the rounding bit (rbit) and the 410 // sticky bit (sbit). The rbit is the bit immediately before the 411 // z.prec leading mantissa bits (the "0.5"). The sbit is set if any 412 // of the bits before the rbit are set (the "0.25", "0.125", etc.): 413 // 414 // rbit sbit => "fractional part" 415 // 416 // 0 0 == 0 417 // 0 1 > 0 , < 0.5 418 // 1 0 == 0.5 419 // 1 1 > 0.5, < 1.0 420 421 // bits > z.prec: mantissa too large => round 422 r := uint(bits - z.prec - 1) // rounding bit position; r >= 0 423 rbit := z.mant.bit(r) & 1 // rounding bit; be safe and ensure it's a single bit 424 // The sticky bit is only needed for rounding ToNearestEven 425 // or when the rounding bit is zero. Avoid computation otherwise. 426 if sbit == 0 && (rbit == 0 || z.mode == ToNearestEven) { 427 sbit = z.mant.sticky(r) 428 } 429 sbit &= 1 // be safe and ensure it's a single bit 430 431 // cut off extra words 432 n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision 433 if m > n { 434 copy(z.mant, z.mant[m-n:]) // move n last words to front 435 z.mant = z.mant[:n] 436 } 437 438 // determine number of trailing zero bits (ntz) and compute lsb mask of mantissa's least-significant word 439 ntz := n*_W - z.prec // 0 <= ntz < _W 440 lsb := Word(1) << ntz 441 442 // round if result is inexact 443 if rbit|sbit != 0 { 444 // Make rounding decision: The result mantissa is truncated ("rounded down") 445 // by default. Decide if we need to increment, or "round up", the (unsigned) 446 // mantissa. 447 inc := false 448 switch z.mode { 449 case ToNegativeInf: 450 inc = z.neg 451 case ToZero: 452 // nothing to do 453 case ToNearestEven: 454 inc = rbit != 0 && (sbit != 0 || z.mant[0]&lsb != 0) 455 case ToNearestAway: 456 inc = rbit != 0 457 case AwayFromZero: 458 inc = true 459 case ToPositiveInf: 460 inc = !z.neg 461 default: 462 panic("unreachable") 463 } 464 465 // A positive result (!z.neg) is Above the exact result if we increment, 466 // and it's Below if we truncate (Exact results require no rounding). 467 // For a negative result (z.neg) it is exactly the opposite. 468 z.acc = makeAcc(inc != z.neg) 469 470 if inc { 471 // add 1 to mantissa 472 if addVW(z.mant, z.mant, lsb) != 0 { 473 // mantissa overflow => adjust exponent 474 if z.exp >= MaxExp { 475 // exponent overflow 476 z.form = inf 477 return 478 } 479 z.exp++ 480 // adjust mantissa: divide by 2 to compensate for exponent adjustment 481 shrVU(z.mant, z.mant, 1) 482 // set msb == carry == 1 from the mantissa overflow above 483 const msb = 1 << (_W - 1) 484 z.mant[n-1] |= msb 485 } 486 } 487 } 488 489 // zero out trailing bits in least-significant word 490 z.mant[0] &^= lsb - 1 491 492 if debugFloat { 493 z.validate() 494 } 495} 496 497func (z *Float) setBits64(neg bool, x uint64) *Float { 498 if z.prec == 0 { 499 z.prec = 64 500 } 501 z.acc = Exact 502 z.neg = neg 503 if x == 0 { 504 z.form = zero 505 return z 506 } 507 // x != 0 508 z.form = finite 509 s := bits.LeadingZeros64(x) 510 z.mant = z.mant.setUint64(x << uint(s)) 511 z.exp = int32(64 - s) // always fits 512 if z.prec < 64 { 513 z.round(0) 514 } 515 return z 516} 517 518// SetUint64 sets z to the (possibly rounded) value of x and returns z. 519// If z's precision is 0, it is changed to 64 (and rounding will have 520// no effect). 521func (z *Float) SetUint64(x uint64) *Float { 522 return z.setBits64(false, x) 523} 524 525// SetInt64 sets z to the (possibly rounded) value of x and returns z. 526// If z's precision is 0, it is changed to 64 (and rounding will have 527// no effect). 528func (z *Float) SetInt64(x int64) *Float { 529 u := x 530 if u < 0 { 531 u = -u 532 } 533 // We cannot simply call z.SetUint64(uint64(u)) and change 534 // the sign afterwards because the sign affects rounding. 535 return z.setBits64(x < 0, uint64(u)) 536} 537 538// SetFloat64 sets z to the (possibly rounded) value of x and returns z. 539// If z's precision is 0, it is changed to 53 (and rounding will have 540// no effect). SetFloat64 panics with ErrNaN if x is a NaN. 541func (z *Float) SetFloat64(x float64) *Float { 542 if z.prec == 0 { 543 z.prec = 53 544 } 545 if math.IsNaN(x) { 546 panic(ErrNaN{"Float.SetFloat64(NaN)"}) 547 } 548 z.acc = Exact 549 z.neg = math.Signbit(x) // handle -0, -Inf correctly 550 if x == 0 { 551 z.form = zero 552 return z 553 } 554 if math.IsInf(x, 0) { 555 z.form = inf 556 return z 557 } 558 // normalized x != 0 559 z.form = finite 560 fmant, exp := math.Frexp(x) // get normalized mantissa 561 z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11) 562 z.exp = int32(exp) // always fits 563 if z.prec < 53 { 564 z.round(0) 565 } 566 return z 567} 568 569// fnorm normalizes mantissa m by shifting it to the left 570// such that the msb of the most-significant word (msw) is 1. 571// It returns the shift amount. It assumes that len(m) != 0. 572func fnorm(m nat) int64 { 573 if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) { 574 panic("msw of mantissa is 0") 575 } 576 s := nlz(m[len(m)-1]) 577 if s > 0 { 578 c := shlVU(m, m, s) 579 if debugFloat && c != 0 { 580 panic("nlz or shlVU incorrect") 581 } 582 } 583 return int64(s) 584} 585 586// SetInt sets z to the (possibly rounded) value of x and returns z. 587// If z's precision is 0, it is changed to the larger of x.BitLen() 588// or 64 (and rounding will have no effect). 589func (z *Float) SetInt(x *Int) *Float { 590 // TODO(gri) can be more efficient if z.prec > 0 591 // but small compared to the size of x, or if there 592 // are many trailing 0's. 593 bits := uint32(x.BitLen()) 594 if z.prec == 0 { 595 z.prec = umax32(bits, 64) 596 } 597 z.acc = Exact 598 z.neg = x.neg 599 if len(x.abs) == 0 { 600 z.form = zero 601 return z 602 } 603 // x != 0 604 z.mant = z.mant.set(x.abs) 605 fnorm(z.mant) 606 z.setExpAndRound(int64(bits), 0) 607 return z 608} 609 610// SetRat sets z to the (possibly rounded) value of x and returns z. 611// If z's precision is 0, it is changed to the largest of a.BitLen(), 612// b.BitLen(), or 64; with x = a/b. 613func (z *Float) SetRat(x *Rat) *Float { 614 if x.IsInt() { 615 return z.SetInt(x.Num()) 616 } 617 var a, b Float 618 a.SetInt(x.Num()) 619 b.SetInt(x.Denom()) 620 if z.prec == 0 { 621 z.prec = umax32(a.prec, b.prec) 622 } 623 return z.Quo(&a, &b) 624} 625 626// SetInf sets z to the infinite Float -Inf if signbit is 627// set, or +Inf if signbit is not set, and returns z. The 628// precision of z is unchanged and the result is always 629// Exact. 630func (z *Float) SetInf(signbit bool) *Float { 631 z.acc = Exact 632 z.form = inf 633 z.neg = signbit 634 return z 635} 636 637// Set sets z to the (possibly rounded) value of x and returns z. 638// If z's precision is 0, it is changed to the precision of x 639// before setting z (and rounding will have no effect). 640// Rounding is performed according to z's precision and rounding 641// mode; and z's accuracy reports the result error relative to the 642// exact (not rounded) result. 643func (z *Float) Set(x *Float) *Float { 644 if debugFloat { 645 x.validate() 646 } 647 z.acc = Exact 648 if z != x { 649 z.form = x.form 650 z.neg = x.neg 651 if x.form == finite { 652 z.exp = x.exp 653 z.mant = z.mant.set(x.mant) 654 } 655 if z.prec == 0 { 656 z.prec = x.prec 657 } else if z.prec < x.prec { 658 z.round(0) 659 } 660 } 661 return z 662} 663 664// Copy sets z to x, with the same precision, rounding mode, and 665// accuracy as x, and returns z. x is not changed even if z and 666// x are the same. 667func (z *Float) Copy(x *Float) *Float { 668 if debugFloat { 669 x.validate() 670 } 671 if z != x { 672 z.prec = x.prec 673 z.mode = x.mode 674 z.acc = x.acc 675 z.form = x.form 676 z.neg = x.neg 677 if z.form == finite { 678 z.mant = z.mant.set(x.mant) 679 z.exp = x.exp 680 } 681 } 682 return z 683} 684 685// msb32 returns the 32 most significant bits of x. 686func msb32(x nat) uint32 { 687 i := len(x) - 1 688 if i < 0 { 689 return 0 690 } 691 if debugFloat && x[i]&(1<<(_W-1)) == 0 { 692 panic("x not normalized") 693 } 694 switch _W { 695 case 32: 696 return uint32(x[i]) 697 case 64: 698 return uint32(x[i] >> 32) 699 } 700 panic("unreachable") 701} 702 703// msb64 returns the 64 most significant bits of x. 704func msb64(x nat) uint64 { 705 i := len(x) - 1 706 if i < 0 { 707 return 0 708 } 709 if debugFloat && x[i]&(1<<(_W-1)) == 0 { 710 panic("x not normalized") 711 } 712 switch _W { 713 case 32: 714 v := uint64(x[i]) << 32 715 if i > 0 { 716 v |= uint64(x[i-1]) 717 } 718 return v 719 case 64: 720 return uint64(x[i]) 721 } 722 panic("unreachable") 723} 724 725// Uint64 returns the unsigned integer resulting from truncating x 726// towards zero. If 0 <= x <= math.MaxUint64, the result is Exact 727// if x is an integer and Below otherwise. 728// The result is (0, Above) for x < 0, and (math.MaxUint64, Below) 729// for x > math.MaxUint64. 730func (x *Float) Uint64() (uint64, Accuracy) { 731 if debugFloat { 732 x.validate() 733 } 734 735 switch x.form { 736 case finite: 737 if x.neg { 738 return 0, Above 739 } 740 // 0 < x < +Inf 741 if x.exp <= 0 { 742 // 0 < x < 1 743 return 0, Below 744 } 745 // 1 <= x < Inf 746 if x.exp <= 64 { 747 // u = trunc(x) fits into a uint64 748 u := msb64(x.mant) >> (64 - uint32(x.exp)) 749 if x.MinPrec() <= 64 { 750 return u, Exact 751 } 752 return u, Below // x truncated 753 } 754 // x too large 755 return math.MaxUint64, Below 756 757 case zero: 758 return 0, Exact 759 760 case inf: 761 if x.neg { 762 return 0, Above 763 } 764 return math.MaxUint64, Below 765 } 766 767 panic("unreachable") 768} 769 770// Int64 returns the integer resulting from truncating x towards zero. 771// If math.MinInt64 <= x <= math.MaxInt64, the result is Exact if x is 772// an integer, and Above (x < 0) or Below (x > 0) otherwise. 773// The result is (math.MinInt64, Above) for x < math.MinInt64, 774// and (math.MaxInt64, Below) for x > math.MaxInt64. 775func (x *Float) Int64() (int64, Accuracy) { 776 if debugFloat { 777 x.validate() 778 } 779 780 switch x.form { 781 case finite: 782 // 0 < |x| < +Inf 783 acc := makeAcc(x.neg) 784 if x.exp <= 0 { 785 // 0 < |x| < 1 786 return 0, acc 787 } 788 // x.exp > 0 789 790 // 1 <= |x| < +Inf 791 if x.exp <= 63 { 792 // i = trunc(x) fits into an int64 (excluding math.MinInt64) 793 i := int64(msb64(x.mant) >> (64 - uint32(x.exp))) 794 if x.neg { 795 i = -i 796 } 797 if x.MinPrec() <= uint(x.exp) { 798 return i, Exact 799 } 800 return i, acc // x truncated 801 } 802 if x.neg { 803 // check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64)) 804 if x.exp == 64 && x.MinPrec() == 1 { 805 acc = Exact 806 } 807 return math.MinInt64, acc 808 } 809 // x too large 810 return math.MaxInt64, Below 811 812 case zero: 813 return 0, Exact 814 815 case inf: 816 if x.neg { 817 return math.MinInt64, Above 818 } 819 return math.MaxInt64, Below 820 } 821 822 panic("unreachable") 823} 824 825// Float32 returns the float32 value nearest to x. If x is too small to be 826// represented by a float32 (|x| < math.SmallestNonzeroFloat32), the result 827// is (0, Below) or (-0, Above), respectively, depending on the sign of x. 828// If x is too large to be represented by a float32 (|x| > math.MaxFloat32), 829// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x. 830func (x *Float) Float32() (float32, Accuracy) { 831 if debugFloat { 832 x.validate() 833 } 834 835 switch x.form { 836 case finite: 837 // 0 < |x| < +Inf 838 839 const ( 840 fbits = 32 // float size 841 mbits = 23 // mantissa size (excluding implicit msb) 842 ebits = fbits - mbits - 1 // 8 exponent size 843 bias = 1<<(ebits-1) - 1 // 127 exponent bias 844 dmin = 1 - bias - mbits // -149 smallest unbiased exponent (denormal) 845 emin = 1 - bias // -126 smallest unbiased exponent (normal) 846 emax = bias // 127 largest unbiased exponent (normal) 847 ) 848 849 // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa. 850 e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0 851 852 // Compute precision p for float32 mantissa. 853 // If the exponent is too small, we have a denormal number before 854 // rounding and fewer than p mantissa bits of precision available 855 // (the exponent remains fixed but the mantissa gets shifted right). 856 p := mbits + 1 // precision of normal float 857 if e < emin { 858 // recompute precision 859 p = mbits + 1 - emin + int(e) 860 // If p == 0, the mantissa of x is shifted so much to the right 861 // that its msb falls immediately to the right of the float32 862 // mantissa space. In other words, if the smallest denormal is 863 // considered "1.0", for p == 0, the mantissa value m is >= 0.5. 864 // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal. 865 // If m == 0.5, it is rounded down to even, i.e., 0.0. 866 // If p < 0, the mantissa value m is <= "0.25" which is never rounded up. 867 if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ { 868 // underflow to ±0 869 if x.neg { 870 var z float32 871 return -z, Above 872 } 873 return 0.0, Below 874 } 875 // otherwise, round up 876 // We handle p == 0 explicitly because it's easy and because 877 // Float.round doesn't support rounding to 0 bits of precision. 878 if p == 0 { 879 if x.neg { 880 return -math.SmallestNonzeroFloat32, Below 881 } 882 return math.SmallestNonzeroFloat32, Above 883 } 884 } 885 // p > 0 886 887 // round 888 var r Float 889 r.prec = uint32(p) 890 r.Set(x) 891 e = r.exp - 1 892 893 // Rounding may have caused r to overflow to ±Inf 894 // (rounding never causes underflows to 0). 895 // If the exponent is too large, also overflow to ±Inf. 896 if r.form == inf || e > emax { 897 // overflow 898 if x.neg { 899 return float32(math.Inf(-1)), Below 900 } 901 return float32(math.Inf(+1)), Above 902 } 903 // e <= emax 904 905 // Determine sign, biased exponent, and mantissa. 906 var sign, bexp, mant uint32 907 if x.neg { 908 sign = 1 << (fbits - 1) 909 } 910 911 // Rounding may have caused a denormal number to 912 // become normal. Check again. 913 if e < emin { 914 // denormal number: recompute precision 915 // Since rounding may have at best increased precision 916 // and we have eliminated p <= 0 early, we know p > 0. 917 // bexp == 0 for denormals 918 p = mbits + 1 - emin + int(e) 919 mant = msb32(r.mant) >> uint(fbits-p) 920 } else { 921 // normal number: emin <= e <= emax 922 bexp = uint32(e+bias) << mbits 923 mant = msb32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit) 924 } 925 926 return math.Float32frombits(sign | bexp | mant), r.acc 927 928 case zero: 929 if x.neg { 930 var z float32 931 return -z, Exact 932 } 933 return 0.0, Exact 934 935 case inf: 936 if x.neg { 937 return float32(math.Inf(-1)), Exact 938 } 939 return float32(math.Inf(+1)), Exact 940 } 941 942 panic("unreachable") 943} 944 945// Float64 returns the float64 value nearest to x. If x is too small to be 946// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result 947// is (0, Below) or (-0, Above), respectively, depending on the sign of x. 948// If x is too large to be represented by a float64 (|x| > math.MaxFloat64), 949// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x. 950func (x *Float) Float64() (float64, Accuracy) { 951 if debugFloat { 952 x.validate() 953 } 954 955 switch x.form { 956 case finite: 957 // 0 < |x| < +Inf 958 959 const ( 960 fbits = 64 // float size 961 mbits = 52 // mantissa size (excluding implicit msb) 962 ebits = fbits - mbits - 1 // 11 exponent size 963 bias = 1<<(ebits-1) - 1 // 1023 exponent bias 964 dmin = 1 - bias - mbits // -1074 smallest unbiased exponent (denormal) 965 emin = 1 - bias // -1022 smallest unbiased exponent (normal) 966 emax = bias // 1023 largest unbiased exponent (normal) 967 ) 968 969 // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa. 970 e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0 971 972 // Compute precision p for float64 mantissa. 973 // If the exponent is too small, we have a denormal number before 974 // rounding and fewer than p mantissa bits of precision available 975 // (the exponent remains fixed but the mantissa gets shifted right). 976 p := mbits + 1 // precision of normal float 977 if e < emin { 978 // recompute precision 979 p = mbits + 1 - emin + int(e) 980 // If p == 0, the mantissa of x is shifted so much to the right 981 // that its msb falls immediately to the right of the float64 982 // mantissa space. In other words, if the smallest denormal is 983 // considered "1.0", for p == 0, the mantissa value m is >= 0.5. 984 // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal. 985 // If m == 0.5, it is rounded down to even, i.e., 0.0. 986 // If p < 0, the mantissa value m is <= "0.25" which is never rounded up. 987 if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ { 988 // underflow to ±0 989 if x.neg { 990 var z float64 991 return -z, Above 992 } 993 return 0.0, Below 994 } 995 // otherwise, round up 996 // We handle p == 0 explicitly because it's easy and because 997 // Float.round doesn't support rounding to 0 bits of precision. 998 if p == 0 { 999 if x.neg { 1000 return -math.SmallestNonzeroFloat64, Below 1001 } 1002 return math.SmallestNonzeroFloat64, Above 1003 } 1004 } 1005 // p > 0 1006 1007 // round 1008 var r Float 1009 r.prec = uint32(p) 1010 r.Set(x) 1011 e = r.exp - 1 1012 1013 // Rounding may have caused r to overflow to ±Inf 1014 // (rounding never causes underflows to 0). 1015 // If the exponent is too large, also overflow to ±Inf. 1016 if r.form == inf || e > emax { 1017 // overflow 1018 if x.neg { 1019 return math.Inf(-1), Below 1020 } 1021 return math.Inf(+1), Above 1022 } 1023 // e <= emax 1024 1025 // Determine sign, biased exponent, and mantissa. 1026 var sign, bexp, mant uint64 1027 if x.neg { 1028 sign = 1 << (fbits - 1) 1029 } 1030 1031 // Rounding may have caused a denormal number to 1032 // become normal. Check again. 1033 if e < emin { 1034 // denormal number: recompute precision 1035 // Since rounding may have at best increased precision 1036 // and we have eliminated p <= 0 early, we know p > 0. 1037 // bexp == 0 for denormals 1038 p = mbits + 1 - emin + int(e) 1039 mant = msb64(r.mant) >> uint(fbits-p) 1040 } else { 1041 // normal number: emin <= e <= emax 1042 bexp = uint64(e+bias) << mbits 1043 mant = msb64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit) 1044 } 1045 1046 return math.Float64frombits(sign | bexp | mant), r.acc 1047 1048 case zero: 1049 if x.neg { 1050 var z float64 1051 return -z, Exact 1052 } 1053 return 0.0, Exact 1054 1055 case inf: 1056 if x.neg { 1057 return math.Inf(-1), Exact 1058 } 1059 return math.Inf(+1), Exact 1060 } 1061 1062 panic("unreachable") 1063} 1064 1065// Int returns the result of truncating x towards zero; 1066// or nil if x is an infinity. 1067// The result is Exact if x.IsInt(); otherwise it is Below 1068// for x > 0, and Above for x < 0. 1069// If a non-nil *Int argument z is provided, Int stores 1070// the result in z instead of allocating a new Int. 1071func (x *Float) Int(z *Int) (*Int, Accuracy) { 1072 if debugFloat { 1073 x.validate() 1074 } 1075 1076 if z == nil && x.form <= finite { 1077 z = new(Int) 1078 } 1079 1080 switch x.form { 1081 case finite: 1082 // 0 < |x| < +Inf 1083 acc := makeAcc(x.neg) 1084 if x.exp <= 0 { 1085 // 0 < |x| < 1 1086 return z.SetInt64(0), acc 1087 } 1088 // x.exp > 0 1089 1090 // 1 <= |x| < +Inf 1091 // determine minimum required precision for x 1092 allBits := uint(len(x.mant)) * _W 1093 exp := uint(x.exp) 1094 if x.MinPrec() <= exp { 1095 acc = Exact 1096 } 1097 // shift mantissa as needed 1098 if z == nil { 1099 z = new(Int) 1100 } 1101 z.neg = x.neg 1102 switch { 1103 case exp > allBits: 1104 z.abs = z.abs.shl(x.mant, exp-allBits) 1105 default: 1106 z.abs = z.abs.set(x.mant) 1107 case exp < allBits: 1108 z.abs = z.abs.shr(x.mant, allBits-exp) 1109 } 1110 return z, acc 1111 1112 case zero: 1113 return z.SetInt64(0), Exact 1114 1115 case inf: 1116 return nil, makeAcc(x.neg) 1117 } 1118 1119 panic("unreachable") 1120} 1121 1122// Rat returns the rational number corresponding to x; 1123// or nil if x is an infinity. 1124// The result is Exact if x is not an Inf. 1125// If a non-nil *Rat argument z is provided, Rat stores 1126// the result in z instead of allocating a new Rat. 1127func (x *Float) Rat(z *Rat) (*Rat, Accuracy) { 1128 if debugFloat { 1129 x.validate() 1130 } 1131 1132 if z == nil && x.form <= finite { 1133 z = new(Rat) 1134 } 1135 1136 switch x.form { 1137 case finite: 1138 // 0 < |x| < +Inf 1139 allBits := int32(len(x.mant)) * _W 1140 // build up numerator and denominator 1141 z.a.neg = x.neg 1142 switch { 1143 case x.exp > allBits: 1144 z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits)) 1145 z.b.abs = z.b.abs[:0] // == 1 (see Rat) 1146 // z already in normal form 1147 default: 1148 z.a.abs = z.a.abs.set(x.mant) 1149 z.b.abs = z.b.abs[:0] // == 1 (see Rat) 1150 // z already in normal form 1151 case x.exp < allBits: 1152 z.a.abs = z.a.abs.set(x.mant) 1153 t := z.b.abs.setUint64(1) 1154 z.b.abs = t.shl(t, uint(allBits-x.exp)) 1155 z.norm() 1156 } 1157 return z, Exact 1158 1159 case zero: 1160 return z.SetInt64(0), Exact 1161 1162 case inf: 1163 return nil, makeAcc(x.neg) 1164 } 1165 1166 panic("unreachable") 1167} 1168 1169// Abs sets z to the (possibly rounded) value |x| (the absolute value of x) 1170// and returns z. 1171func (z *Float) Abs(x *Float) *Float { 1172 z.Set(x) 1173 z.neg = false 1174 return z 1175} 1176 1177// Neg sets z to the (possibly rounded) value of x with its sign negated, 1178// and returns z. 1179func (z *Float) Neg(x *Float) *Float { 1180 z.Set(x) 1181 z.neg = !z.neg 1182 return z 1183} 1184 1185func validateBinaryOperands(x, y *Float) { 1186 if !debugFloat { 1187 // avoid performance bugs 1188 panic("validateBinaryOperands called but debugFloat is not set") 1189 } 1190 if len(x.mant) == 0 { 1191 panic("empty mantissa for x") 1192 } 1193 if len(y.mant) == 0 { 1194 panic("empty mantissa for y") 1195 } 1196} 1197 1198// z = x + y, ignoring signs of x and y for the addition 1199// but using the sign of z for rounding the result. 1200// x and y must have a non-empty mantissa and valid exponent. 1201func (z *Float) uadd(x, y *Float) { 1202 // Note: This implementation requires 2 shifts most of the 1203 // time. It is also inefficient if exponents or precisions 1204 // differ by wide margins. The following article describes 1205 // an efficient (but much more complicated) implementation 1206 // compatible with the internal representation used here: 1207 // 1208 // Vincent Lefèvre: "The Generic Multiple-Precision Floating- 1209 // Point Addition With Exact Rounding (as in the MPFR Library)" 1210 // http://www.vinc17.net/research/papers/rnc6.pdf 1211 1212 if debugFloat { 1213 validateBinaryOperands(x, y) 1214 } 1215 1216 // compute exponents ex, ey for mantissa with "binary point" 1217 // on the right (mantissa.0) - use int64 to avoid overflow 1218 ex := int64(x.exp) - int64(len(x.mant))*_W 1219 ey := int64(y.exp) - int64(len(y.mant))*_W 1220 1221 al := alias(z.mant, x.mant) || alias(z.mant, y.mant) 1222 1223 // TODO(gri) having a combined add-and-shift primitive 1224 // could make this code significantly faster 1225 switch { 1226 case ex < ey: 1227 if al { 1228 t := nat(nil).shl(y.mant, uint(ey-ex)) 1229 z.mant = z.mant.add(x.mant, t) 1230 } else { 1231 z.mant = z.mant.shl(y.mant, uint(ey-ex)) 1232 z.mant = z.mant.add(x.mant, z.mant) 1233 } 1234 default: 1235 // ex == ey, no shift needed 1236 z.mant = z.mant.add(x.mant, y.mant) 1237 case ex > ey: 1238 if al { 1239 t := nat(nil).shl(x.mant, uint(ex-ey)) 1240 z.mant = z.mant.add(t, y.mant) 1241 } else { 1242 z.mant = z.mant.shl(x.mant, uint(ex-ey)) 1243 z.mant = z.mant.add(z.mant, y.mant) 1244 } 1245 ex = ey 1246 } 1247 // len(z.mant) > 0 1248 1249 z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0) 1250} 1251 1252// z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction 1253// but using the sign of z for rounding the result. 1254// x and y must have a non-empty mantissa and valid exponent. 1255func (z *Float) usub(x, y *Float) { 1256 // This code is symmetric to uadd. 1257 // We have not factored the common code out because 1258 // eventually uadd (and usub) should be optimized 1259 // by special-casing, and the code will diverge. 1260 1261 if debugFloat { 1262 validateBinaryOperands(x, y) 1263 } 1264 1265 ex := int64(x.exp) - int64(len(x.mant))*_W 1266 ey := int64(y.exp) - int64(len(y.mant))*_W 1267 1268 al := alias(z.mant, x.mant) || alias(z.mant, y.mant) 1269 1270 switch { 1271 case ex < ey: 1272 if al { 1273 t := nat(nil).shl(y.mant, uint(ey-ex)) 1274 z.mant = t.sub(x.mant, t) 1275 } else { 1276 z.mant = z.mant.shl(y.mant, uint(ey-ex)) 1277 z.mant = z.mant.sub(x.mant, z.mant) 1278 } 1279 default: 1280 // ex == ey, no shift needed 1281 z.mant = z.mant.sub(x.mant, y.mant) 1282 case ex > ey: 1283 if al { 1284 t := nat(nil).shl(x.mant, uint(ex-ey)) 1285 z.mant = t.sub(t, y.mant) 1286 } else { 1287 z.mant = z.mant.shl(x.mant, uint(ex-ey)) 1288 z.mant = z.mant.sub(z.mant, y.mant) 1289 } 1290 ex = ey 1291 } 1292 1293 // operands may have canceled each other out 1294 if len(z.mant) == 0 { 1295 z.acc = Exact 1296 z.form = zero 1297 z.neg = false 1298 return 1299 } 1300 // len(z.mant) > 0 1301 1302 z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0) 1303} 1304 1305// z = x * y, ignoring signs of x and y for the multiplication 1306// but using the sign of z for rounding the result. 1307// x and y must have a non-empty mantissa and valid exponent. 1308func (z *Float) umul(x, y *Float) { 1309 if debugFloat { 1310 validateBinaryOperands(x, y) 1311 } 1312 1313 // Note: This is doing too much work if the precision 1314 // of z is less than the sum of the precisions of x 1315 // and y which is often the case (e.g., if all floats 1316 // have the same precision). 1317 // TODO(gri) Optimize this for the common case. 1318 1319 e := int64(x.exp) + int64(y.exp) 1320 if x == y { 1321 z.mant = z.mant.sqr(x.mant) 1322 } else { 1323 z.mant = z.mant.mul(x.mant, y.mant) 1324 } 1325 z.setExpAndRound(e-fnorm(z.mant), 0) 1326} 1327 1328// z = x / y, ignoring signs of x and y for the division 1329// but using the sign of z for rounding the result. 1330// x and y must have a non-empty mantissa and valid exponent. 1331func (z *Float) uquo(x, y *Float) { 1332 if debugFloat { 1333 validateBinaryOperands(x, y) 1334 } 1335 1336 // mantissa length in words for desired result precision + 1 1337 // (at least one extra bit so we get the rounding bit after 1338 // the division) 1339 n := int(z.prec/_W) + 1 1340 1341 // compute adjusted x.mant such that we get enough result precision 1342 xadj := x.mant 1343 if d := n - len(x.mant) + len(y.mant); d > 0 { 1344 // d extra words needed => add d "0 digits" to x 1345 xadj = make(nat, len(x.mant)+d) 1346 copy(xadj[d:], x.mant) 1347 } 1348 // TODO(gri): If we have too many digits (d < 0), we should be able 1349 // to shorten x for faster division. But we must be extra careful 1350 // with rounding in that case. 1351 1352 // Compute d before division since there may be aliasing of x.mant 1353 // (via xadj) or y.mant with z.mant. 1354 d := len(xadj) - len(y.mant) 1355 1356 // divide 1357 var r nat 1358 z.mant, r = z.mant.div(nil, xadj, y.mant) 1359 e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W 1360 1361 // The result is long enough to include (at least) the rounding bit. 1362 // If there's a non-zero remainder, the corresponding fractional part 1363 // (if it were computed), would have a non-zero sticky bit (if it were 1364 // zero, it couldn't have a non-zero remainder). 1365 var sbit uint 1366 if len(r) > 0 { 1367 sbit = 1 1368 } 1369 1370 z.setExpAndRound(e-fnorm(z.mant), sbit) 1371} 1372 1373// ucmp returns -1, 0, or +1, depending on whether 1374// |x| < |y|, |x| == |y|, or |x| > |y|. 1375// x and y must have a non-empty mantissa and valid exponent. 1376func (x *Float) ucmp(y *Float) int { 1377 if debugFloat { 1378 validateBinaryOperands(x, y) 1379 } 1380 1381 switch { 1382 case x.exp < y.exp: 1383 return -1 1384 case x.exp > y.exp: 1385 return +1 1386 } 1387 // x.exp == y.exp 1388 1389 // compare mantissas 1390 i := len(x.mant) 1391 j := len(y.mant) 1392 for i > 0 || j > 0 { 1393 var xm, ym Word 1394 if i > 0 { 1395 i-- 1396 xm = x.mant[i] 1397 } 1398 if j > 0 { 1399 j-- 1400 ym = y.mant[j] 1401 } 1402 switch { 1403 case xm < ym: 1404 return -1 1405 case xm > ym: 1406 return +1 1407 } 1408 } 1409 1410 return 0 1411} 1412 1413// Handling of sign bit as defined by IEEE 754-2008, section 6.3: 1414// 1415// When neither the inputs nor result are NaN, the sign of a product or 1416// quotient is the exclusive OR of the operands’ signs; the sign of a sum, 1417// or of a difference x−y regarded as a sum x+(−y), differs from at most 1418// one of the addends’ signs; and the sign of the result of conversions, 1419// the quantize operation, the roundToIntegral operations, and the 1420// roundToIntegralExact (see 5.3.1) is the sign of the first or only operand. 1421// These rules shall apply even when operands or results are zero or infinite. 1422// 1423// When the sum of two operands with opposite signs (or the difference of 1424// two operands with like signs) is exactly zero, the sign of that sum (or 1425// difference) shall be +0 in all rounding-direction attributes except 1426// roundTowardNegative; under that attribute, the sign of an exact zero 1427// sum (or difference) shall be −0. However, x+x = x−(−x) retains the same 1428// sign as x even when x is zero. 1429// 1430// See also: https://play.golang.org/p/RtH3UCt5IH 1431 1432// Add sets z to the rounded sum x+y and returns z. If z's precision is 0, 1433// it is changed to the larger of x's or y's precision before the operation. 1434// Rounding is performed according to z's precision and rounding mode; and 1435// z's accuracy reports the result error relative to the exact (not rounded) 1436// result. Add panics with ErrNaN if x and y are infinities with opposite 1437// signs. The value of z is undefined in that case. 1438func (z *Float) Add(x, y *Float) *Float { 1439 if debugFloat { 1440 x.validate() 1441 y.validate() 1442 } 1443 1444 if z.prec == 0 { 1445 z.prec = umax32(x.prec, y.prec) 1446 } 1447 1448 if x.form == finite && y.form == finite { 1449 // x + y (common case) 1450 1451 // Below we set z.neg = x.neg, and when z aliases y this will 1452 // change the y operand's sign. This is fine, because if an 1453 // operand aliases the receiver it'll be overwritten, but we still 1454 // want the original x.neg and y.neg values when we evaluate 1455 // x.neg != y.neg, so we need to save y.neg before setting z.neg. 1456 yneg := y.neg 1457 1458 z.neg = x.neg 1459 if x.neg == yneg { 1460 // x + y == x + y 1461 // (-x) + (-y) == -(x + y) 1462 z.uadd(x, y) 1463 } else { 1464 // x + (-y) == x - y == -(y - x) 1465 // (-x) + y == y - x == -(x - y) 1466 if x.ucmp(y) > 0 { 1467 z.usub(x, y) 1468 } else { 1469 z.neg = !z.neg 1470 z.usub(y, x) 1471 } 1472 } 1473 if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact { 1474 z.neg = true 1475 } 1476 return z 1477 } 1478 1479 if x.form == inf && y.form == inf && x.neg != y.neg { 1480 // +Inf + -Inf 1481 // -Inf + +Inf 1482 // value of z is undefined but make sure it's valid 1483 z.acc = Exact 1484 z.form = zero 1485 z.neg = false 1486 panic(ErrNaN{"addition of infinities with opposite signs"}) 1487 } 1488 1489 if x.form == zero && y.form == zero { 1490 // ±0 + ±0 1491 z.acc = Exact 1492 z.form = zero 1493 z.neg = x.neg && y.neg // -0 + -0 == -0 1494 return z 1495 } 1496 1497 if x.form == inf || y.form == zero { 1498 // ±Inf + y 1499 // x + ±0 1500 return z.Set(x) 1501 } 1502 1503 // ±0 + y 1504 // x + ±Inf 1505 return z.Set(y) 1506} 1507 1508// Sub sets z to the rounded difference x-y and returns z. 1509// Precision, rounding, and accuracy reporting are as for Add. 1510// Sub panics with ErrNaN if x and y are infinities with equal 1511// signs. The value of z is undefined in that case. 1512func (z *Float) Sub(x, y *Float) *Float { 1513 if debugFloat { 1514 x.validate() 1515 y.validate() 1516 } 1517 1518 if z.prec == 0 { 1519 z.prec = umax32(x.prec, y.prec) 1520 } 1521 1522 if x.form == finite && y.form == finite { 1523 // x - y (common case) 1524 yneg := y.neg 1525 z.neg = x.neg 1526 if x.neg != yneg { 1527 // x - (-y) == x + y 1528 // (-x) - y == -(x + y) 1529 z.uadd(x, y) 1530 } else { 1531 // x - y == x - y == -(y - x) 1532 // (-x) - (-y) == y - x == -(x - y) 1533 if x.ucmp(y) > 0 { 1534 z.usub(x, y) 1535 } else { 1536 z.neg = !z.neg 1537 z.usub(y, x) 1538 } 1539 } 1540 if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact { 1541 z.neg = true 1542 } 1543 return z 1544 } 1545 1546 if x.form == inf && y.form == inf && x.neg == y.neg { 1547 // +Inf - +Inf 1548 // -Inf - -Inf 1549 // value of z is undefined but make sure it's valid 1550 z.acc = Exact 1551 z.form = zero 1552 z.neg = false 1553 panic(ErrNaN{"subtraction of infinities with equal signs"}) 1554 } 1555 1556 if x.form == zero && y.form == zero { 1557 // ±0 - ±0 1558 z.acc = Exact 1559 z.form = zero 1560 z.neg = x.neg && !y.neg // -0 - +0 == -0 1561 return z 1562 } 1563 1564 if x.form == inf || y.form == zero { 1565 // ±Inf - y 1566 // x - ±0 1567 return z.Set(x) 1568 } 1569 1570 // ±0 - y 1571 // x - ±Inf 1572 return z.Neg(y) 1573} 1574 1575// Mul sets z to the rounded product x*y and returns z. 1576// Precision, rounding, and accuracy reporting are as for Add. 1577// Mul panics with ErrNaN if one operand is zero and the other 1578// operand an infinity. The value of z is undefined in that case. 1579func (z *Float) Mul(x, y *Float) *Float { 1580 if debugFloat { 1581 x.validate() 1582 y.validate() 1583 } 1584 1585 if z.prec == 0 { 1586 z.prec = umax32(x.prec, y.prec) 1587 } 1588 1589 z.neg = x.neg != y.neg 1590 1591 if x.form == finite && y.form == finite { 1592 // x * y (common case) 1593 z.umul(x, y) 1594 return z 1595 } 1596 1597 z.acc = Exact 1598 if x.form == zero && y.form == inf || x.form == inf && y.form == zero { 1599 // ±0 * ±Inf 1600 // ±Inf * ±0 1601 // value of z is undefined but make sure it's valid 1602 z.form = zero 1603 z.neg = false 1604 panic(ErrNaN{"multiplication of zero with infinity"}) 1605 } 1606 1607 if x.form == inf || y.form == inf { 1608 // ±Inf * y 1609 // x * ±Inf 1610 z.form = inf 1611 return z 1612 } 1613 1614 // ±0 * y 1615 // x * ±0 1616 z.form = zero 1617 return z 1618} 1619 1620// Quo sets z to the rounded quotient x/y and returns z. 1621// Precision, rounding, and accuracy reporting are as for Add. 1622// Quo panics with ErrNaN if both operands are zero or infinities. 1623// The value of z is undefined in that case. 1624func (z *Float) Quo(x, y *Float) *Float { 1625 if debugFloat { 1626 x.validate() 1627 y.validate() 1628 } 1629 1630 if z.prec == 0 { 1631 z.prec = umax32(x.prec, y.prec) 1632 } 1633 1634 z.neg = x.neg != y.neg 1635 1636 if x.form == finite && y.form == finite { 1637 // x / y (common case) 1638 z.uquo(x, y) 1639 return z 1640 } 1641 1642 z.acc = Exact 1643 if x.form == zero && y.form == zero || x.form == inf && y.form == inf { 1644 // ±0 / ±0 1645 // ±Inf / ±Inf 1646 // value of z is undefined but make sure it's valid 1647 z.form = zero 1648 z.neg = false 1649 panic(ErrNaN{"division of zero by zero or infinity by infinity"}) 1650 } 1651 1652 if x.form == zero || y.form == inf { 1653 // ±0 / y 1654 // x / ±Inf 1655 z.form = zero 1656 return z 1657 } 1658 1659 // x / ±0 1660 // ±Inf / y 1661 z.form = inf 1662 return z 1663} 1664 1665// Cmp compares x and y and returns: 1666// 1667// -1 if x < y 1668// 0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf) 1669// +1 if x > y 1670// 1671func (x *Float) Cmp(y *Float) int { 1672 if debugFloat { 1673 x.validate() 1674 y.validate() 1675 } 1676 1677 mx := x.ord() 1678 my := y.ord() 1679 switch { 1680 case mx < my: 1681 return -1 1682 case mx > my: 1683 return +1 1684 } 1685 // mx == my 1686 1687 // only if |mx| == 1 we have to compare the mantissae 1688 switch mx { 1689 case -1: 1690 return y.ucmp(x) 1691 case +1: 1692 return x.ucmp(y) 1693 } 1694 1695 return 0 1696} 1697 1698// ord classifies x and returns: 1699// 1700// -2 if -Inf == x 1701// -1 if -Inf < x < 0 1702// 0 if x == 0 (signed or unsigned) 1703// +1 if 0 < x < +Inf 1704// +2 if x == +Inf 1705// 1706func (x *Float) ord() int { 1707 var m int 1708 switch x.form { 1709 case finite: 1710 m = 1 1711 case zero: 1712 return 0 1713 case inf: 1714 m = 2 1715 } 1716 if x.neg { 1717 m = -m 1718 } 1719 return m 1720} 1721 1722func umax32(x, y uint32) uint32 { 1723 if x > y { 1724 return x 1725 } 1726 return y 1727} 1728