1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// This file implements multi-precision floating-point numbers.
6// Like in the GNU MPFR library (https://www.mpfr.org/), operands
7// can be of mixed precision. Unlike MPFR, the rounding mode is
8// not specified with each operation, but with each operand. The
9// rounding mode of the result operand determines the rounding
10// mode of an operation. This is a from-scratch implementation.
11
12package big
13
14import (
15	"fmt"
16	"math"
17	"math/bits"
18)
19
20const debugFloat = false // enable for debugging
21
22// A nonzero finite Float represents a multi-precision floating point number
23//
24//   sign × mantissa × 2**exponent
25//
26// with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp.
27// A Float may also be zero (+0, -0) or infinite (+Inf, -Inf).
28// All Floats are ordered, and the ordering of two Floats x and y
29// is defined by x.Cmp(y).
30//
31// Each Float value also has a precision, rounding mode, and accuracy.
32// The precision is the maximum number of mantissa bits available to
33// represent the value. The rounding mode specifies how a result should
34// be rounded to fit into the mantissa bits, and accuracy describes the
35// rounding error with respect to the exact result.
36//
37// Unless specified otherwise, all operations (including setters) that
38// specify a *Float variable for the result (usually via the receiver
39// with the exception of MantExp), round the numeric result according
40// to the precision and rounding mode of the result variable.
41//
42// If the provided result precision is 0 (see below), it is set to the
43// precision of the argument with the largest precision value before any
44// rounding takes place, and the rounding mode remains unchanged. Thus,
45// uninitialized Floats provided as result arguments will have their
46// precision set to a reasonable value determined by the operands, and
47// their mode is the zero value for RoundingMode (ToNearestEven).
48//
49// By setting the desired precision to 24 or 53 and using matching rounding
50// mode (typically ToNearestEven), Float operations produce the same results
51// as the corresponding float32 or float64 IEEE-754 arithmetic for operands
52// that correspond to normal (i.e., not denormal) float32 or float64 numbers.
53// Exponent underflow and overflow lead to a 0 or an Infinity for different
54// values than IEEE-754 because Float exponents have a much larger range.
55//
56// The zero (uninitialized) value for a Float is ready to use and represents
57// the number +0.0 exactly, with precision 0 and rounding mode ToNearestEven.
58//
59// Operations always take pointer arguments (*Float) rather
60// than Float values, and each unique Float value requires
61// its own unique *Float pointer. To "copy" a Float value,
62// an existing (or newly allocated) Float must be set to
63// a new value using the Float.Set method; shallow copies
64// of Floats are not supported and may lead to errors.
65type Float struct {
66	prec uint32
67	mode RoundingMode
68	acc  Accuracy
69	form form
70	neg  bool
71	mant nat
72	exp  int32
73}
74
75// An ErrNaN panic is raised by a Float operation that would lead to
76// a NaN under IEEE-754 rules. An ErrNaN implements the error interface.
77type ErrNaN struct {
78	msg string
79}
80
81func (err ErrNaN) Error() string {
82	return err.msg
83}
84
85// NewFloat allocates and returns a new Float set to x,
86// with precision 53 and rounding mode ToNearestEven.
87// NewFloat panics with ErrNaN if x is a NaN.
88func NewFloat(x float64) *Float {
89	if math.IsNaN(x) {
90		panic(ErrNaN{"NewFloat(NaN)"})
91	}
92	return new(Float).SetFloat64(x)
93}
94
95// Exponent and precision limits.
96const (
97	MaxExp  = math.MaxInt32  // largest supported exponent
98	MinExp  = math.MinInt32  // smallest supported exponent
99	MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited
100)
101
102// Internal representation: The mantissa bits x.mant of a nonzero finite
103// Float x are stored in a nat slice long enough to hold up to x.prec bits;
104// the slice may (but doesn't have to) be shorter if the mantissa contains
105// trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e.,
106// the msb is shifted all the way "to the left"). Thus, if the mantissa has
107// trailing 0 bits or x.prec is not a multiple of the Word size _W,
108// x.mant[0] has trailing zero bits. The msb of the mantissa corresponds
109// to the value 0.5; the exponent x.exp shifts the binary point as needed.
110//
111// A zero or non-finite Float x ignores x.mant and x.exp.
112//
113// x                 form      neg      mant         exp
114// ----------------------------------------------------------
115// ±0                zero      sign     -            -
116// 0 < |x| < +Inf    finite    sign     mantissa     exponent
117// ±Inf              inf       sign     -            -
118
119// A form value describes the internal representation.
120type form byte
121
122// The form value order is relevant - do not change!
123const (
124	zero form = iota
125	finite
126	inf
127)
128
129// RoundingMode determines how a Float value is rounded to the
130// desired precision. Rounding may change the Float value; the
131// rounding error is described by the Float's Accuracy.
132type RoundingMode byte
133
134// These constants define supported rounding modes.
135const (
136	ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven
137	ToNearestAway                     // == IEEE 754-2008 roundTiesToAway
138	ToZero                            // == IEEE 754-2008 roundTowardZero
139	AwayFromZero                      // no IEEE 754-2008 equivalent
140	ToNegativeInf                     // == IEEE 754-2008 roundTowardNegative
141	ToPositiveInf                     // == IEEE 754-2008 roundTowardPositive
142)
143
144//go:generate stringer -type=RoundingMode
145
146// Accuracy describes the rounding error produced by the most recent
147// operation that generated a Float value, relative to the exact value.
148type Accuracy int8
149
150// Constants describing the Accuracy of a Float.
151const (
152	Below Accuracy = -1
153	Exact Accuracy = 0
154	Above Accuracy = +1
155)
156
157//go:generate stringer -type=Accuracy
158
159// SetPrec sets z's precision to prec and returns the (possibly) rounded
160// value of z. Rounding occurs according to z's rounding mode if the mantissa
161// cannot be represented in prec bits without loss of precision.
162// SetPrec(0) maps all finite values to ±0; infinite values remain unchanged.
163// If prec > MaxPrec, it is set to MaxPrec.
164func (z *Float) SetPrec(prec uint) *Float {
165	z.acc = Exact // optimistically assume no rounding is needed
166
167	// special case
168	if prec == 0 {
169		z.prec = 0
170		if z.form == finite {
171			// truncate z to 0
172			z.acc = makeAcc(z.neg)
173			z.form = zero
174		}
175		return z
176	}
177
178	// general case
179	if prec > MaxPrec {
180		prec = MaxPrec
181	}
182	old := z.prec
183	z.prec = uint32(prec)
184	if z.prec < old {
185		z.round(0)
186	}
187	return z
188}
189
190func makeAcc(above bool) Accuracy {
191	if above {
192		return Above
193	}
194	return Below
195}
196
197// SetMode sets z's rounding mode to mode and returns an exact z.
198// z remains unchanged otherwise.
199// z.SetMode(z.Mode()) is a cheap way to set z's accuracy to Exact.
200func (z *Float) SetMode(mode RoundingMode) *Float {
201	z.mode = mode
202	z.acc = Exact
203	return z
204}
205
206// Prec returns the mantissa precision of x in bits.
207// The result may be 0 for |x| == 0 and |x| == Inf.
208func (x *Float) Prec() uint {
209	return uint(x.prec)
210}
211
212// MinPrec returns the minimum precision required to represent x exactly
213// (i.e., the smallest prec before x.SetPrec(prec) would start rounding x).
214// The result is 0 for |x| == 0 and |x| == Inf.
215func (x *Float) MinPrec() uint {
216	if x.form != finite {
217		return 0
218	}
219	return uint(len(x.mant))*_W - x.mant.trailingZeroBits()
220}
221
222// Mode returns the rounding mode of x.
223func (x *Float) Mode() RoundingMode {
224	return x.mode
225}
226
227// Acc returns the accuracy of x produced by the most recent
228// operation, unless explicitly documented otherwise by that
229// operation.
230func (x *Float) Acc() Accuracy {
231	return x.acc
232}
233
234// Sign returns:
235//
236//	-1 if x <   0
237//	 0 if x is ±0
238//	+1 if x >   0
239//
240func (x *Float) Sign() int {
241	if debugFloat {
242		x.validate()
243	}
244	if x.form == zero {
245		return 0
246	}
247	if x.neg {
248		return -1
249	}
250	return 1
251}
252
253// MantExp breaks x into its mantissa and exponent components
254// and returns the exponent. If a non-nil mant argument is
255// provided its value is set to the mantissa of x, with the
256// same precision and rounding mode as x. The components
257// satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0.
258// Calling MantExp with a nil argument is an efficient way to
259// get the exponent of the receiver.
260//
261// Special cases are:
262//
263//	(  ±0).MantExp(mant) = 0, with mant set to   ±0
264//	(±Inf).MantExp(mant) = 0, with mant set to ±Inf
265//
266// x and mant may be the same in which case x is set to its
267// mantissa value.
268func (x *Float) MantExp(mant *Float) (exp int) {
269	if debugFloat {
270		x.validate()
271	}
272	if x.form == finite {
273		exp = int(x.exp)
274	}
275	if mant != nil {
276		mant.Copy(x)
277		if mant.form == finite {
278			mant.exp = 0
279		}
280	}
281	return
282}
283
284func (z *Float) setExpAndRound(exp int64, sbit uint) {
285	if exp < MinExp {
286		// underflow
287		z.acc = makeAcc(z.neg)
288		z.form = zero
289		return
290	}
291
292	if exp > MaxExp {
293		// overflow
294		z.acc = makeAcc(!z.neg)
295		z.form = inf
296		return
297	}
298
299	z.form = finite
300	z.exp = int32(exp)
301	z.round(sbit)
302}
303
304// SetMantExp sets z to mant × 2**exp and returns z.
305// The result z has the same precision and rounding mode
306// as mant. SetMantExp is an inverse of MantExp but does
307// not require 0.5 <= |mant| < 1.0. Specifically:
308//
309//	mant := new(Float)
310//	new(Float).SetMantExp(mant, x.MantExp(mant)).Cmp(x) == 0
311//
312// Special cases are:
313//
314//	z.SetMantExp(  ±0, exp) =   ±0
315//	z.SetMantExp(±Inf, exp) = ±Inf
316//
317// z and mant may be the same in which case z's exponent
318// is set to exp.
319func (z *Float) SetMantExp(mant *Float, exp int) *Float {
320	if debugFloat {
321		z.validate()
322		mant.validate()
323	}
324	z.Copy(mant)
325
326	if z.form == finite {
327		// 0 < |mant| < +Inf
328		z.setExpAndRound(int64(z.exp)+int64(exp), 0)
329	}
330	return z
331}
332
333// Signbit reports whether x is negative or negative zero.
334func (x *Float) Signbit() bool {
335	return x.neg
336}
337
338// IsInf reports whether x is +Inf or -Inf.
339func (x *Float) IsInf() bool {
340	return x.form == inf
341}
342
343// IsInt reports whether x is an integer.
344// ±Inf values are not integers.
345func (x *Float) IsInt() bool {
346	if debugFloat {
347		x.validate()
348	}
349	// special cases
350	if x.form != finite {
351		return x.form == zero
352	}
353	// x.form == finite
354	if x.exp <= 0 {
355		return false
356	}
357	// x.exp > 0
358	return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa
359}
360
361// debugging support
362func (x *Float) validate() {
363	if !debugFloat {
364		// avoid performance bugs
365		panic("validate called but debugFloat is not set")
366	}
367	if x.form != finite {
368		return
369	}
370	m := len(x.mant)
371	if m == 0 {
372		panic("nonzero finite number with empty mantissa")
373	}
374	const msb = 1 << (_W - 1)
375	if x.mant[m-1]&msb == 0 {
376		panic(fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Text('p', 0)))
377	}
378	if x.prec == 0 {
379		panic("zero precision finite number")
380	}
381}
382
383// round rounds z according to z.mode to z.prec bits and sets z.acc accordingly.
384// sbit must be 0 or 1 and summarizes any "sticky bit" information one might
385// have before calling round. z's mantissa must be normalized (with the msb set)
386// or empty.
387//
388// CAUTION: The rounding modes ToNegativeInf, ToPositiveInf are affected by the
389// sign of z. For correct rounding, the sign of z must be set correctly before
390// calling round.
391func (z *Float) round(sbit uint) {
392	if debugFloat {
393		z.validate()
394	}
395
396	z.acc = Exact
397	if z.form != finite {
398		// ±0 or ±Inf => nothing left to do
399		return
400	}
401	// z.form == finite && len(z.mant) > 0
402	// m > 0 implies z.prec > 0 (checked by validate)
403
404	m := uint32(len(z.mant)) // present mantissa length in words
405	bits := m * _W           // present mantissa bits; bits > 0
406	if bits <= z.prec {
407		// mantissa fits => nothing to do
408		return
409	}
410	// bits > z.prec
411
412	// Rounding is based on two bits: the rounding bit (rbit) and the
413	// sticky bit (sbit). The rbit is the bit immediately before the
414	// z.prec leading mantissa bits (the "0.5"). The sbit is set if any
415	// of the bits before the rbit are set (the "0.25", "0.125", etc.):
416	//
417	//   rbit  sbit  => "fractional part"
418	//
419	//   0     0        == 0
420	//   0     1        >  0  , < 0.5
421	//   1     0        == 0.5
422	//   1     1        >  0.5, < 1.0
423
424	// bits > z.prec: mantissa too large => round
425	r := uint(bits - z.prec - 1) // rounding bit position; r >= 0
426	rbit := z.mant.bit(r) & 1    // rounding bit; be safe and ensure it's a single bit
427	// The sticky bit is only needed for rounding ToNearestEven
428	// or when the rounding bit is zero. Avoid computation otherwise.
429	if sbit == 0 && (rbit == 0 || z.mode == ToNearestEven) {
430		sbit = z.mant.sticky(r)
431	}
432	sbit &= 1 // be safe and ensure it's a single bit
433
434	// cut off extra words
435	n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision
436	if m > n {
437		copy(z.mant, z.mant[m-n:]) // move n last words to front
438		z.mant = z.mant[:n]
439	}
440
441	// determine number of trailing zero bits (ntz) and compute lsb mask of mantissa's least-significant word
442	ntz := n*_W - z.prec // 0 <= ntz < _W
443	lsb := Word(1) << ntz
444
445	// round if result is inexact
446	if rbit|sbit != 0 {
447		// Make rounding decision: The result mantissa is truncated ("rounded down")
448		// by default. Decide if we need to increment, or "round up", the (unsigned)
449		// mantissa.
450		inc := false
451		switch z.mode {
452		case ToNegativeInf:
453			inc = z.neg
454		case ToZero:
455			// nothing to do
456		case ToNearestEven:
457			inc = rbit != 0 && (sbit != 0 || z.mant[0]&lsb != 0)
458		case ToNearestAway:
459			inc = rbit != 0
460		case AwayFromZero:
461			inc = true
462		case ToPositiveInf:
463			inc = !z.neg
464		default:
465			panic("unreachable")
466		}
467
468		// A positive result (!z.neg) is Above the exact result if we increment,
469		// and it's Below if we truncate (Exact results require no rounding).
470		// For a negative result (z.neg) it is exactly the opposite.
471		z.acc = makeAcc(inc != z.neg)
472
473		if inc {
474			// add 1 to mantissa
475			if addVW(z.mant, z.mant, lsb) != 0 {
476				// mantissa overflow => adjust exponent
477				if z.exp >= MaxExp {
478					// exponent overflow
479					z.form = inf
480					return
481				}
482				z.exp++
483				// adjust mantissa: divide by 2 to compensate for exponent adjustment
484				shrVU(z.mant, z.mant, 1)
485				// set msb == carry == 1 from the mantissa overflow above
486				const msb = 1 << (_W - 1)
487				z.mant[n-1] |= msb
488			}
489		}
490	}
491
492	// zero out trailing bits in least-significant word
493	z.mant[0] &^= lsb - 1
494
495	if debugFloat {
496		z.validate()
497	}
498}
499
500func (z *Float) setBits64(neg bool, x uint64) *Float {
501	if z.prec == 0 {
502		z.prec = 64
503	}
504	z.acc = Exact
505	z.neg = neg
506	if x == 0 {
507		z.form = zero
508		return z
509	}
510	// x != 0
511	z.form = finite
512	s := bits.LeadingZeros64(x)
513	z.mant = z.mant.setUint64(x << uint(s))
514	z.exp = int32(64 - s) // always fits
515	if z.prec < 64 {
516		z.round(0)
517	}
518	return z
519}
520
521// SetUint64 sets z to the (possibly rounded) value of x and returns z.
522// If z's precision is 0, it is changed to 64 (and rounding will have
523// no effect).
524func (z *Float) SetUint64(x uint64) *Float {
525	return z.setBits64(false, x)
526}
527
528// SetInt64 sets z to the (possibly rounded) value of x and returns z.
529// If z's precision is 0, it is changed to 64 (and rounding will have
530// no effect).
531func (z *Float) SetInt64(x int64) *Float {
532	u := x
533	if u < 0 {
534		u = -u
535	}
536	// We cannot simply call z.SetUint64(uint64(u)) and change
537	// the sign afterwards because the sign affects rounding.
538	return z.setBits64(x < 0, uint64(u))
539}
540
541// SetFloat64 sets z to the (possibly rounded) value of x and returns z.
542// If z's precision is 0, it is changed to 53 (and rounding will have
543// no effect). SetFloat64 panics with ErrNaN if x is a NaN.
544func (z *Float) SetFloat64(x float64) *Float {
545	if z.prec == 0 {
546		z.prec = 53
547	}
548	if math.IsNaN(x) {
549		panic(ErrNaN{"Float.SetFloat64(NaN)"})
550	}
551	z.acc = Exact
552	z.neg = math.Signbit(x) // handle -0, -Inf correctly
553	if x == 0 {
554		z.form = zero
555		return z
556	}
557	if math.IsInf(x, 0) {
558		z.form = inf
559		return z
560	}
561	// normalized x != 0
562	z.form = finite
563	fmant, exp := math.Frexp(x) // get normalized mantissa
564	z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11)
565	z.exp = int32(exp) // always fits
566	if z.prec < 53 {
567		z.round(0)
568	}
569	return z
570}
571
572// fnorm normalizes mantissa m by shifting it to the left
573// such that the msb of the most-significant word (msw) is 1.
574// It returns the shift amount. It assumes that len(m) != 0.
575func fnorm(m nat) int64 {
576	if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) {
577		panic("msw of mantissa is 0")
578	}
579	s := nlz(m[len(m)-1])
580	if s > 0 {
581		c := shlVU(m, m, s)
582		if debugFloat && c != 0 {
583			panic("nlz or shlVU incorrect")
584		}
585	}
586	return int64(s)
587}
588
589// SetInt sets z to the (possibly rounded) value of x and returns z.
590// If z's precision is 0, it is changed to the larger of x.BitLen()
591// or 64 (and rounding will have no effect).
592func (z *Float) SetInt(x *Int) *Float {
593	// TODO(gri) can be more efficient if z.prec > 0
594	// but small compared to the size of x, or if there
595	// are many trailing 0's.
596	bits := uint32(x.BitLen())
597	if z.prec == 0 {
598		z.prec = umax32(bits, 64)
599	}
600	z.acc = Exact
601	z.neg = x.neg
602	if len(x.abs) == 0 {
603		z.form = zero
604		return z
605	}
606	// x != 0
607	z.mant = z.mant.set(x.abs)
608	fnorm(z.mant)
609	z.setExpAndRound(int64(bits), 0)
610	return z
611}
612
613// SetRat sets z to the (possibly rounded) value of x and returns z.
614// If z's precision is 0, it is changed to the largest of a.BitLen(),
615// b.BitLen(), or 64; with x = a/b.
616func (z *Float) SetRat(x *Rat) *Float {
617	if x.IsInt() {
618		return z.SetInt(x.Num())
619	}
620	var a, b Float
621	a.SetInt(x.Num())
622	b.SetInt(x.Denom())
623	if z.prec == 0 {
624		z.prec = umax32(a.prec, b.prec)
625	}
626	return z.Quo(&a, &b)
627}
628
629// SetInf sets z to the infinite Float -Inf if signbit is
630// set, or +Inf if signbit is not set, and returns z. The
631// precision of z is unchanged and the result is always
632// Exact.
633func (z *Float) SetInf(signbit bool) *Float {
634	z.acc = Exact
635	z.form = inf
636	z.neg = signbit
637	return z
638}
639
640// Set sets z to the (possibly rounded) value of x and returns z.
641// If z's precision is 0, it is changed to the precision of x
642// before setting z (and rounding will have no effect).
643// Rounding is performed according to z's precision and rounding
644// mode; and z's accuracy reports the result error relative to the
645// exact (not rounded) result.
646func (z *Float) Set(x *Float) *Float {
647	if debugFloat {
648		x.validate()
649	}
650	z.acc = Exact
651	if z != x {
652		z.form = x.form
653		z.neg = x.neg
654		if x.form == finite {
655			z.exp = x.exp
656			z.mant = z.mant.set(x.mant)
657		}
658		if z.prec == 0 {
659			z.prec = x.prec
660		} else if z.prec < x.prec {
661			z.round(0)
662		}
663	}
664	return z
665}
666
667// Copy sets z to x, with the same precision, rounding mode, and
668// accuracy as x, and returns z. x is not changed even if z and
669// x are the same.
670func (z *Float) Copy(x *Float) *Float {
671	if debugFloat {
672		x.validate()
673	}
674	if z != x {
675		z.prec = x.prec
676		z.mode = x.mode
677		z.acc = x.acc
678		z.form = x.form
679		z.neg = x.neg
680		if z.form == finite {
681			z.mant = z.mant.set(x.mant)
682			z.exp = x.exp
683		}
684	}
685	return z
686}
687
688// msb32 returns the 32 most significant bits of x.
689func msb32(x nat) uint32 {
690	i := len(x) - 1
691	if i < 0 {
692		return 0
693	}
694	if debugFloat && x[i]&(1<<(_W-1)) == 0 {
695		panic("x not normalized")
696	}
697	switch _W {
698	case 32:
699		return uint32(x[i])
700	case 64:
701		return uint32(x[i] >> 32)
702	}
703	panic("unreachable")
704}
705
706// msb64 returns the 64 most significant bits of x.
707func msb64(x nat) uint64 {
708	i := len(x) - 1
709	if i < 0 {
710		return 0
711	}
712	if debugFloat && x[i]&(1<<(_W-1)) == 0 {
713		panic("x not normalized")
714	}
715	switch _W {
716	case 32:
717		v := uint64(x[i]) << 32
718		if i > 0 {
719			v |= uint64(x[i-1])
720		}
721		return v
722	case 64:
723		return uint64(x[i])
724	}
725	panic("unreachable")
726}
727
728// Uint64 returns the unsigned integer resulting from truncating x
729// towards zero. If 0 <= x <= math.MaxUint64, the result is Exact
730// if x is an integer and Below otherwise.
731// The result is (0, Above) for x < 0, and (math.MaxUint64, Below)
732// for x > math.MaxUint64.
733func (x *Float) Uint64() (uint64, Accuracy) {
734	if debugFloat {
735		x.validate()
736	}
737
738	switch x.form {
739	case finite:
740		if x.neg {
741			return 0, Above
742		}
743		// 0 < x < +Inf
744		if x.exp <= 0 {
745			// 0 < x < 1
746			return 0, Below
747		}
748		// 1 <= x < Inf
749		if x.exp <= 64 {
750			// u = trunc(x) fits into a uint64
751			u := msb64(x.mant) >> (64 - uint32(x.exp))
752			if x.MinPrec() <= 64 {
753				return u, Exact
754			}
755			return u, Below // x truncated
756		}
757		// x too large
758		return math.MaxUint64, Below
759
760	case zero:
761		return 0, Exact
762
763	case inf:
764		if x.neg {
765			return 0, Above
766		}
767		return math.MaxUint64, Below
768	}
769
770	panic("unreachable")
771}
772
773// Int64 returns the integer resulting from truncating x towards zero.
774// If math.MinInt64 <= x <= math.MaxInt64, the result is Exact if x is
775// an integer, and Above (x < 0) or Below (x > 0) otherwise.
776// The result is (math.MinInt64, Above) for x < math.MinInt64,
777// and (math.MaxInt64, Below) for x > math.MaxInt64.
778func (x *Float) Int64() (int64, Accuracy) {
779	if debugFloat {
780		x.validate()
781	}
782
783	switch x.form {
784	case finite:
785		// 0 < |x| < +Inf
786		acc := makeAcc(x.neg)
787		if x.exp <= 0 {
788			// 0 < |x| < 1
789			return 0, acc
790		}
791		// x.exp > 0
792
793		// 1 <= |x| < +Inf
794		if x.exp <= 63 {
795			// i = trunc(x) fits into an int64 (excluding math.MinInt64)
796			i := int64(msb64(x.mant) >> (64 - uint32(x.exp)))
797			if x.neg {
798				i = -i
799			}
800			if x.MinPrec() <= uint(x.exp) {
801				return i, Exact
802			}
803			return i, acc // x truncated
804		}
805		if x.neg {
806			// check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64))
807			if x.exp == 64 && x.MinPrec() == 1 {
808				acc = Exact
809			}
810			return math.MinInt64, acc
811		}
812		// x too large
813		return math.MaxInt64, Below
814
815	case zero:
816		return 0, Exact
817
818	case inf:
819		if x.neg {
820			return math.MinInt64, Above
821		}
822		return math.MaxInt64, Below
823	}
824
825	panic("unreachable")
826}
827
828// Float32 returns the float32 value nearest to x. If x is too small to be
829// represented by a float32 (|x| < math.SmallestNonzeroFloat32), the result
830// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
831// If x is too large to be represented by a float32 (|x| > math.MaxFloat32),
832// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
833func (x *Float) Float32() (float32, Accuracy) {
834	if debugFloat {
835		x.validate()
836	}
837
838	switch x.form {
839	case finite:
840		// 0 < |x| < +Inf
841
842		const (
843			fbits = 32                //        float size
844			mbits = 23                //        mantissa size (excluding implicit msb)
845			ebits = fbits - mbits - 1 //     8  exponent size
846			bias  = 1<<(ebits-1) - 1  //   127  exponent bias
847			dmin  = 1 - bias - mbits  //  -149  smallest unbiased exponent (denormal)
848			emin  = 1 - bias          //  -126  smallest unbiased exponent (normal)
849			emax  = bias              //   127  largest unbiased exponent (normal)
850		)
851
852		// Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa.
853		e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
854
855		// Compute precision p for float32 mantissa.
856		// If the exponent is too small, we have a denormal number before
857		// rounding and fewer than p mantissa bits of precision available
858		// (the exponent remains fixed but the mantissa gets shifted right).
859		p := mbits + 1 // precision of normal float
860		if e < emin {
861			// recompute precision
862			p = mbits + 1 - emin + int(e)
863			// If p == 0, the mantissa of x is shifted so much to the right
864			// that its msb falls immediately to the right of the float32
865			// mantissa space. In other words, if the smallest denormal is
866			// considered "1.0", for p == 0, the mantissa value m is >= 0.5.
867			// If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
868			// If m == 0.5, it is rounded down to even, i.e., 0.0.
869			// If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
870			if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
871				// underflow to ±0
872				if x.neg {
873					var z float32
874					return -z, Above
875				}
876				return 0.0, Below
877			}
878			// otherwise, round up
879			// We handle p == 0 explicitly because it's easy and because
880			// Float.round doesn't support rounding to 0 bits of precision.
881			if p == 0 {
882				if x.neg {
883					return -math.SmallestNonzeroFloat32, Below
884				}
885				return math.SmallestNonzeroFloat32, Above
886			}
887		}
888		// p > 0
889
890		// round
891		var r Float
892		r.prec = uint32(p)
893		r.Set(x)
894		e = r.exp - 1
895
896		// Rounding may have caused r to overflow to ±Inf
897		// (rounding never causes underflows to 0).
898		// If the exponent is too large, also overflow to ±Inf.
899		if r.form == inf || e > emax {
900			// overflow
901			if x.neg {
902				return float32(math.Inf(-1)), Below
903			}
904			return float32(math.Inf(+1)), Above
905		}
906		// e <= emax
907
908		// Determine sign, biased exponent, and mantissa.
909		var sign, bexp, mant uint32
910		if x.neg {
911			sign = 1 << (fbits - 1)
912		}
913
914		// Rounding may have caused a denormal number to
915		// become normal. Check again.
916		if e < emin {
917			// denormal number: recompute precision
918			// Since rounding may have at best increased precision
919			// and we have eliminated p <= 0 early, we know p > 0.
920			// bexp == 0 for denormals
921			p = mbits + 1 - emin + int(e)
922			mant = msb32(r.mant) >> uint(fbits-p)
923		} else {
924			// normal number: emin <= e <= emax
925			bexp = uint32(e+bias) << mbits
926			mant = msb32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
927		}
928
929		return math.Float32frombits(sign | bexp | mant), r.acc
930
931	case zero:
932		if x.neg {
933			var z float32
934			return -z, Exact
935		}
936		return 0.0, Exact
937
938	case inf:
939		if x.neg {
940			return float32(math.Inf(-1)), Exact
941		}
942		return float32(math.Inf(+1)), Exact
943	}
944
945	panic("unreachable")
946}
947
948// Float64 returns the float64 value nearest to x. If x is too small to be
949// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
950// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
951// If x is too large to be represented by a float64 (|x| > math.MaxFloat64),
952// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
953func (x *Float) Float64() (float64, Accuracy) {
954	if debugFloat {
955		x.validate()
956	}
957
958	switch x.form {
959	case finite:
960		// 0 < |x| < +Inf
961
962		const (
963			fbits = 64                //        float size
964			mbits = 52                //        mantissa size (excluding implicit msb)
965			ebits = fbits - mbits - 1 //    11  exponent size
966			bias  = 1<<(ebits-1) - 1  //  1023  exponent bias
967			dmin  = 1 - bias - mbits  // -1074  smallest unbiased exponent (denormal)
968			emin  = 1 - bias          // -1022  smallest unbiased exponent (normal)
969			emax  = bias              //  1023  largest unbiased exponent (normal)
970		)
971
972		// Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa.
973		e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
974
975		// Compute precision p for float64 mantissa.
976		// If the exponent is too small, we have a denormal number before
977		// rounding and fewer than p mantissa bits of precision available
978		// (the exponent remains fixed but the mantissa gets shifted right).
979		p := mbits + 1 // precision of normal float
980		if e < emin {
981			// recompute precision
982			p = mbits + 1 - emin + int(e)
983			// If p == 0, the mantissa of x is shifted so much to the right
984			// that its msb falls immediately to the right of the float64
985			// mantissa space. In other words, if the smallest denormal is
986			// considered "1.0", for p == 0, the mantissa value m is >= 0.5.
987			// If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
988			// If m == 0.5, it is rounded down to even, i.e., 0.0.
989			// If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
990			if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
991				// underflow to ±0
992				if x.neg {
993					var z float64
994					return -z, Above
995				}
996				return 0.0, Below
997			}
998			// otherwise, round up
999			// We handle p == 0 explicitly because it's easy and because
1000			// Float.round doesn't support rounding to 0 bits of precision.
1001			if p == 0 {
1002				if x.neg {
1003					return -math.SmallestNonzeroFloat64, Below
1004				}
1005				return math.SmallestNonzeroFloat64, Above
1006			}
1007		}
1008		// p > 0
1009
1010		// round
1011		var r Float
1012		r.prec = uint32(p)
1013		r.Set(x)
1014		e = r.exp - 1
1015
1016		// Rounding may have caused r to overflow to ±Inf
1017		// (rounding never causes underflows to 0).
1018		// If the exponent is too large, also overflow to ±Inf.
1019		if r.form == inf || e > emax {
1020			// overflow
1021			if x.neg {
1022				return math.Inf(-1), Below
1023			}
1024			return math.Inf(+1), Above
1025		}
1026		// e <= emax
1027
1028		// Determine sign, biased exponent, and mantissa.
1029		var sign, bexp, mant uint64
1030		if x.neg {
1031			sign = 1 << (fbits - 1)
1032		}
1033
1034		// Rounding may have caused a denormal number to
1035		// become normal. Check again.
1036		if e < emin {
1037			// denormal number: recompute precision
1038			// Since rounding may have at best increased precision
1039			// and we have eliminated p <= 0 early, we know p > 0.
1040			// bexp == 0 for denormals
1041			p = mbits + 1 - emin + int(e)
1042			mant = msb64(r.mant) >> uint(fbits-p)
1043		} else {
1044			// normal number: emin <= e <= emax
1045			bexp = uint64(e+bias) << mbits
1046			mant = msb64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
1047		}
1048
1049		return math.Float64frombits(sign | bexp | mant), r.acc
1050
1051	case zero:
1052		if x.neg {
1053			var z float64
1054			return -z, Exact
1055		}
1056		return 0.0, Exact
1057
1058	case inf:
1059		if x.neg {
1060			return math.Inf(-1), Exact
1061		}
1062		return math.Inf(+1), Exact
1063	}
1064
1065	panic("unreachable")
1066}
1067
1068// Int returns the result of truncating x towards zero;
1069// or nil if x is an infinity.
1070// The result is Exact if x.IsInt(); otherwise it is Below
1071// for x > 0, and Above for x < 0.
1072// If a non-nil *Int argument z is provided, Int stores
1073// the result in z instead of allocating a new Int.
1074func (x *Float) Int(z *Int) (*Int, Accuracy) {
1075	if debugFloat {
1076		x.validate()
1077	}
1078
1079	if z == nil && x.form <= finite {
1080		z = new(Int)
1081	}
1082
1083	switch x.form {
1084	case finite:
1085		// 0 < |x| < +Inf
1086		acc := makeAcc(x.neg)
1087		if x.exp <= 0 {
1088			// 0 < |x| < 1
1089			return z.SetInt64(0), acc
1090		}
1091		// x.exp > 0
1092
1093		// 1 <= |x| < +Inf
1094		// determine minimum required precision for x
1095		allBits := uint(len(x.mant)) * _W
1096		exp := uint(x.exp)
1097		if x.MinPrec() <= exp {
1098			acc = Exact
1099		}
1100		// shift mantissa as needed
1101		if z == nil {
1102			z = new(Int)
1103		}
1104		z.neg = x.neg
1105		switch {
1106		case exp > allBits:
1107			z.abs = z.abs.shl(x.mant, exp-allBits)
1108		default:
1109			z.abs = z.abs.set(x.mant)
1110		case exp < allBits:
1111			z.abs = z.abs.shr(x.mant, allBits-exp)
1112		}
1113		return z, acc
1114
1115	case zero:
1116		return z.SetInt64(0), Exact
1117
1118	case inf:
1119		return nil, makeAcc(x.neg)
1120	}
1121
1122	panic("unreachable")
1123}
1124
1125// Rat returns the rational number corresponding to x;
1126// or nil if x is an infinity.
1127// The result is Exact if x is not an Inf.
1128// If a non-nil *Rat argument z is provided, Rat stores
1129// the result in z instead of allocating a new Rat.
1130func (x *Float) Rat(z *Rat) (*Rat, Accuracy) {
1131	if debugFloat {
1132		x.validate()
1133	}
1134
1135	if z == nil && x.form <= finite {
1136		z = new(Rat)
1137	}
1138
1139	switch x.form {
1140	case finite:
1141		// 0 < |x| < +Inf
1142		allBits := int32(len(x.mant)) * _W
1143		// build up numerator and denominator
1144		z.a.neg = x.neg
1145		switch {
1146		case x.exp > allBits:
1147			z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits))
1148			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
1149			// z already in normal form
1150		default:
1151			z.a.abs = z.a.abs.set(x.mant)
1152			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
1153			// z already in normal form
1154		case x.exp < allBits:
1155			z.a.abs = z.a.abs.set(x.mant)
1156			t := z.b.abs.setUint64(1)
1157			z.b.abs = t.shl(t, uint(allBits-x.exp))
1158			z.norm()
1159		}
1160		return z, Exact
1161
1162	case zero:
1163		return z.SetInt64(0), Exact
1164
1165	case inf:
1166		return nil, makeAcc(x.neg)
1167	}
1168
1169	panic("unreachable")
1170}
1171
1172// Abs sets z to the (possibly rounded) value |x| (the absolute value of x)
1173// and returns z.
1174func (z *Float) Abs(x *Float) *Float {
1175	z.Set(x)
1176	z.neg = false
1177	return z
1178}
1179
1180// Neg sets z to the (possibly rounded) value of x with its sign negated,
1181// and returns z.
1182func (z *Float) Neg(x *Float) *Float {
1183	z.Set(x)
1184	z.neg = !z.neg
1185	return z
1186}
1187
1188func validateBinaryOperands(x, y *Float) {
1189	if !debugFloat {
1190		// avoid performance bugs
1191		panic("validateBinaryOperands called but debugFloat is not set")
1192	}
1193	if len(x.mant) == 0 {
1194		panic("empty mantissa for x")
1195	}
1196	if len(y.mant) == 0 {
1197		panic("empty mantissa for y")
1198	}
1199}
1200
1201// z = x + y, ignoring signs of x and y for the addition
1202// but using the sign of z for rounding the result.
1203// x and y must have a non-empty mantissa and valid exponent.
1204func (z *Float) uadd(x, y *Float) {
1205	// Note: This implementation requires 2 shifts most of the
1206	// time. It is also inefficient if exponents or precisions
1207	// differ by wide margins. The following article describes
1208	// an efficient (but much more complicated) implementation
1209	// compatible with the internal representation used here:
1210	//
1211	// Vincent Lefèvre: "The Generic Multiple-Precision Floating-
1212	// Point Addition With Exact Rounding (as in the MPFR Library)"
1213	// http://www.vinc17.net/research/papers/rnc6.pdf
1214
1215	if debugFloat {
1216		validateBinaryOperands(x, y)
1217	}
1218
1219	// compute exponents ex, ey for mantissa with "binary point"
1220	// on the right (mantissa.0) - use int64 to avoid overflow
1221	ex := int64(x.exp) - int64(len(x.mant))*_W
1222	ey := int64(y.exp) - int64(len(y.mant))*_W
1223
1224	al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
1225
1226	// TODO(gri) having a combined add-and-shift primitive
1227	//           could make this code significantly faster
1228	switch {
1229	case ex < ey:
1230		if al {
1231			t := nat(nil).shl(y.mant, uint(ey-ex))
1232			z.mant = z.mant.add(x.mant, t)
1233		} else {
1234			z.mant = z.mant.shl(y.mant, uint(ey-ex))
1235			z.mant = z.mant.add(x.mant, z.mant)
1236		}
1237	default:
1238		// ex == ey, no shift needed
1239		z.mant = z.mant.add(x.mant, y.mant)
1240	case ex > ey:
1241		if al {
1242			t := nat(nil).shl(x.mant, uint(ex-ey))
1243			z.mant = z.mant.add(t, y.mant)
1244		} else {
1245			z.mant = z.mant.shl(x.mant, uint(ex-ey))
1246			z.mant = z.mant.add(z.mant, y.mant)
1247		}
1248		ex = ey
1249	}
1250	// len(z.mant) > 0
1251
1252	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
1253}
1254
1255// z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction
1256// but using the sign of z for rounding the result.
1257// x and y must have a non-empty mantissa and valid exponent.
1258func (z *Float) usub(x, y *Float) {
1259	// This code is symmetric to uadd.
1260	// We have not factored the common code out because
1261	// eventually uadd (and usub) should be optimized
1262	// by special-casing, and the code will diverge.
1263
1264	if debugFloat {
1265		validateBinaryOperands(x, y)
1266	}
1267
1268	ex := int64(x.exp) - int64(len(x.mant))*_W
1269	ey := int64(y.exp) - int64(len(y.mant))*_W
1270
1271	al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
1272
1273	switch {
1274	case ex < ey:
1275		if al {
1276			t := nat(nil).shl(y.mant, uint(ey-ex))
1277			z.mant = t.sub(x.mant, t)
1278		} else {
1279			z.mant = z.mant.shl(y.mant, uint(ey-ex))
1280			z.mant = z.mant.sub(x.mant, z.mant)
1281		}
1282	default:
1283		// ex == ey, no shift needed
1284		z.mant = z.mant.sub(x.mant, y.mant)
1285	case ex > ey:
1286		if al {
1287			t := nat(nil).shl(x.mant, uint(ex-ey))
1288			z.mant = t.sub(t, y.mant)
1289		} else {
1290			z.mant = z.mant.shl(x.mant, uint(ex-ey))
1291			z.mant = z.mant.sub(z.mant, y.mant)
1292		}
1293		ex = ey
1294	}
1295
1296	// operands may have canceled each other out
1297	if len(z.mant) == 0 {
1298		z.acc = Exact
1299		z.form = zero
1300		z.neg = false
1301		return
1302	}
1303	// len(z.mant) > 0
1304
1305	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
1306}
1307
1308// z = x * y, ignoring signs of x and y for the multiplication
1309// but using the sign of z for rounding the result.
1310// x and y must have a non-empty mantissa and valid exponent.
1311func (z *Float) umul(x, y *Float) {
1312	if debugFloat {
1313		validateBinaryOperands(x, y)
1314	}
1315
1316	// Note: This is doing too much work if the precision
1317	// of z is less than the sum of the precisions of x
1318	// and y which is often the case (e.g., if all floats
1319	// have the same precision).
1320	// TODO(gri) Optimize this for the common case.
1321
1322	e := int64(x.exp) + int64(y.exp)
1323	if x == y {
1324		z.mant = z.mant.sqr(x.mant)
1325	} else {
1326		z.mant = z.mant.mul(x.mant, y.mant)
1327	}
1328	z.setExpAndRound(e-fnorm(z.mant), 0)
1329}
1330
1331// z = x / y, ignoring signs of x and y for the division
1332// but using the sign of z for rounding the result.
1333// x and y must have a non-empty mantissa and valid exponent.
1334func (z *Float) uquo(x, y *Float) {
1335	if debugFloat {
1336		validateBinaryOperands(x, y)
1337	}
1338
1339	// mantissa length in words for desired result precision + 1
1340	// (at least one extra bit so we get the rounding bit after
1341	// the division)
1342	n := int(z.prec/_W) + 1
1343
1344	// compute adjusted x.mant such that we get enough result precision
1345	xadj := x.mant
1346	if d := n - len(x.mant) + len(y.mant); d > 0 {
1347		// d extra words needed => add d "0 digits" to x
1348		xadj = make(nat, len(x.mant)+d)
1349		copy(xadj[d:], x.mant)
1350	}
1351	// TODO(gri): If we have too many digits (d < 0), we should be able
1352	// to shorten x for faster division. But we must be extra careful
1353	// with rounding in that case.
1354
1355	// Compute d before division since there may be aliasing of x.mant
1356	// (via xadj) or y.mant with z.mant.
1357	d := len(xadj) - len(y.mant)
1358
1359	// divide
1360	var r nat
1361	z.mant, r = z.mant.div(nil, xadj, y.mant)
1362	e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W
1363
1364	// The result is long enough to include (at least) the rounding bit.
1365	// If there's a non-zero remainder, the corresponding fractional part
1366	// (if it were computed), would have a non-zero sticky bit (if it were
1367	// zero, it couldn't have a non-zero remainder).
1368	var sbit uint
1369	if len(r) > 0 {
1370		sbit = 1
1371	}
1372
1373	z.setExpAndRound(e-fnorm(z.mant), sbit)
1374}
1375
1376// ucmp returns -1, 0, or +1, depending on whether
1377// |x| < |y|, |x| == |y|, or |x| > |y|.
1378// x and y must have a non-empty mantissa and valid exponent.
1379func (x *Float) ucmp(y *Float) int {
1380	if debugFloat {
1381		validateBinaryOperands(x, y)
1382	}
1383
1384	switch {
1385	case x.exp < y.exp:
1386		return -1
1387	case x.exp > y.exp:
1388		return +1
1389	}
1390	// x.exp == y.exp
1391
1392	// compare mantissas
1393	i := len(x.mant)
1394	j := len(y.mant)
1395	for i > 0 || j > 0 {
1396		var xm, ym Word
1397		if i > 0 {
1398			i--
1399			xm = x.mant[i]
1400		}
1401		if j > 0 {
1402			j--
1403			ym = y.mant[j]
1404		}
1405		switch {
1406		case xm < ym:
1407			return -1
1408		case xm > ym:
1409			return +1
1410		}
1411	}
1412
1413	return 0
1414}
1415
1416// Handling of sign bit as defined by IEEE 754-2008, section 6.3:
1417//
1418// When neither the inputs nor result are NaN, the sign of a product or
1419// quotient is the exclusive OR of the operands’ signs; the sign of a sum,
1420// or of a difference x−y regarded as a sum x+(−y), differs from at most
1421// one of the addends’ signs; and the sign of the result of conversions,
1422// the quantize operation, the roundToIntegral operations, and the
1423// roundToIntegralExact (see 5.3.1) is the sign of the first or only operand.
1424// These rules shall apply even when operands or results are zero or infinite.
1425//
1426// When the sum of two operands with opposite signs (or the difference of
1427// two operands with like signs) is exactly zero, the sign of that sum (or
1428// difference) shall be +0 in all rounding-direction attributes except
1429// roundTowardNegative; under that attribute, the sign of an exact zero
1430// sum (or difference) shall be −0. However, x+x = x−(−x) retains the same
1431// sign as x even when x is zero.
1432//
1433// See also: https://play.golang.org/p/RtH3UCt5IH
1434
1435// Add sets z to the rounded sum x+y and returns z. If z's precision is 0,
1436// it is changed to the larger of x's or y's precision before the operation.
1437// Rounding is performed according to z's precision and rounding mode; and
1438// z's accuracy reports the result error relative to the exact (not rounded)
1439// result. Add panics with ErrNaN if x and y are infinities with opposite
1440// signs. The value of z is undefined in that case.
1441func (z *Float) Add(x, y *Float) *Float {
1442	if debugFloat {
1443		x.validate()
1444		y.validate()
1445	}
1446
1447	if z.prec == 0 {
1448		z.prec = umax32(x.prec, y.prec)
1449	}
1450
1451	if x.form == finite && y.form == finite {
1452		// x + y (common case)
1453
1454		// Below we set z.neg = x.neg, and when z aliases y this will
1455		// change the y operand's sign. This is fine, because if an
1456		// operand aliases the receiver it'll be overwritten, but we still
1457		// want the original x.neg and y.neg values when we evaluate
1458		// x.neg != y.neg, so we need to save y.neg before setting z.neg.
1459		yneg := y.neg
1460
1461		z.neg = x.neg
1462		if x.neg == yneg {
1463			// x + y == x + y
1464			// (-x) + (-y) == -(x + y)
1465			z.uadd(x, y)
1466		} else {
1467			// x + (-y) == x - y == -(y - x)
1468			// (-x) + y == y - x == -(x - y)
1469			if x.ucmp(y) > 0 {
1470				z.usub(x, y)
1471			} else {
1472				z.neg = !z.neg
1473				z.usub(y, x)
1474			}
1475		}
1476		if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact {
1477			z.neg = true
1478		}
1479		return z
1480	}
1481
1482	if x.form == inf && y.form == inf && x.neg != y.neg {
1483		// +Inf + -Inf
1484		// -Inf + +Inf
1485		// value of z is undefined but make sure it's valid
1486		z.acc = Exact
1487		z.form = zero
1488		z.neg = false
1489		panic(ErrNaN{"addition of infinities with opposite signs"})
1490	}
1491
1492	if x.form == zero && y.form == zero {
1493		// ±0 + ±0
1494		z.acc = Exact
1495		z.form = zero
1496		z.neg = x.neg && y.neg // -0 + -0 == -0
1497		return z
1498	}
1499
1500	if x.form == inf || y.form == zero {
1501		// ±Inf + y
1502		// x + ±0
1503		return z.Set(x)
1504	}
1505
1506	// ±0 + y
1507	// x + ±Inf
1508	return z.Set(y)
1509}
1510
1511// Sub sets z to the rounded difference x-y and returns z.
1512// Precision, rounding, and accuracy reporting are as for Add.
1513// Sub panics with ErrNaN if x and y are infinities with equal
1514// signs. The value of z is undefined in that case.
1515func (z *Float) Sub(x, y *Float) *Float {
1516	if debugFloat {
1517		x.validate()
1518		y.validate()
1519	}
1520
1521	if z.prec == 0 {
1522		z.prec = umax32(x.prec, y.prec)
1523	}
1524
1525	if x.form == finite && y.form == finite {
1526		// x - y (common case)
1527		yneg := y.neg
1528		z.neg = x.neg
1529		if x.neg != yneg {
1530			// x - (-y) == x + y
1531			// (-x) - y == -(x + y)
1532			z.uadd(x, y)
1533		} else {
1534			// x - y == x - y == -(y - x)
1535			// (-x) - (-y) == y - x == -(x - y)
1536			if x.ucmp(y) > 0 {
1537				z.usub(x, y)
1538			} else {
1539				z.neg = !z.neg
1540				z.usub(y, x)
1541			}
1542		}
1543		if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact {
1544			z.neg = true
1545		}
1546		return z
1547	}
1548
1549	if x.form == inf && y.form == inf && x.neg == y.neg {
1550		// +Inf - +Inf
1551		// -Inf - -Inf
1552		// value of z is undefined but make sure it's valid
1553		z.acc = Exact
1554		z.form = zero
1555		z.neg = false
1556		panic(ErrNaN{"subtraction of infinities with equal signs"})
1557	}
1558
1559	if x.form == zero && y.form == zero {
1560		// ±0 - ±0
1561		z.acc = Exact
1562		z.form = zero
1563		z.neg = x.neg && !y.neg // -0 - +0 == -0
1564		return z
1565	}
1566
1567	if x.form == inf || y.form == zero {
1568		// ±Inf - y
1569		// x - ±0
1570		return z.Set(x)
1571	}
1572
1573	// ±0 - y
1574	// x - ±Inf
1575	return z.Neg(y)
1576}
1577
1578// Mul sets z to the rounded product x*y and returns z.
1579// Precision, rounding, and accuracy reporting are as for Add.
1580// Mul panics with ErrNaN if one operand is zero and the other
1581// operand an infinity. The value of z is undefined in that case.
1582func (z *Float) Mul(x, y *Float) *Float {
1583	if debugFloat {
1584		x.validate()
1585		y.validate()
1586	}
1587
1588	if z.prec == 0 {
1589		z.prec = umax32(x.prec, y.prec)
1590	}
1591
1592	z.neg = x.neg != y.neg
1593
1594	if x.form == finite && y.form == finite {
1595		// x * y (common case)
1596		z.umul(x, y)
1597		return z
1598	}
1599
1600	z.acc = Exact
1601	if x.form == zero && y.form == inf || x.form == inf && y.form == zero {
1602		// ±0 * ±Inf
1603		// ±Inf * ±0
1604		// value of z is undefined but make sure it's valid
1605		z.form = zero
1606		z.neg = false
1607		panic(ErrNaN{"multiplication of zero with infinity"})
1608	}
1609
1610	if x.form == inf || y.form == inf {
1611		// ±Inf * y
1612		// x * ±Inf
1613		z.form = inf
1614		return z
1615	}
1616
1617	// ±0 * y
1618	// x * ±0
1619	z.form = zero
1620	return z
1621}
1622
1623// Quo sets z to the rounded quotient x/y and returns z.
1624// Precision, rounding, and accuracy reporting are as for Add.
1625// Quo panics with ErrNaN if both operands are zero or infinities.
1626// The value of z is undefined in that case.
1627func (z *Float) Quo(x, y *Float) *Float {
1628	if debugFloat {
1629		x.validate()
1630		y.validate()
1631	}
1632
1633	if z.prec == 0 {
1634		z.prec = umax32(x.prec, y.prec)
1635	}
1636
1637	z.neg = x.neg != y.neg
1638
1639	if x.form == finite && y.form == finite {
1640		// x / y (common case)
1641		z.uquo(x, y)
1642		return z
1643	}
1644
1645	z.acc = Exact
1646	if x.form == zero && y.form == zero || x.form == inf && y.form == inf {
1647		// ±0 / ±0
1648		// ±Inf / ±Inf
1649		// value of z is undefined but make sure it's valid
1650		z.form = zero
1651		z.neg = false
1652		panic(ErrNaN{"division of zero by zero or infinity by infinity"})
1653	}
1654
1655	if x.form == zero || y.form == inf {
1656		// ±0 / y
1657		// x / ±Inf
1658		z.form = zero
1659		return z
1660	}
1661
1662	// x / ±0
1663	// ±Inf / y
1664	z.form = inf
1665	return z
1666}
1667
1668// Cmp compares x and y and returns:
1669//
1670//   -1 if x <  y
1671//    0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf)
1672//   +1 if x >  y
1673//
1674func (x *Float) Cmp(y *Float) int {
1675	if debugFloat {
1676		x.validate()
1677		y.validate()
1678	}
1679
1680	mx := x.ord()
1681	my := y.ord()
1682	switch {
1683	case mx < my:
1684		return -1
1685	case mx > my:
1686		return +1
1687	}
1688	// mx == my
1689
1690	// only if |mx| == 1 we have to compare the mantissae
1691	switch mx {
1692	case -1:
1693		return y.ucmp(x)
1694	case +1:
1695		return x.ucmp(y)
1696	}
1697
1698	return 0
1699}
1700
1701// ord classifies x and returns:
1702//
1703//	-2 if -Inf == x
1704//	-1 if -Inf < x < 0
1705//	 0 if x == 0 (signed or unsigned)
1706//	+1 if 0 < x < +Inf
1707//	+2 if x == +Inf
1708//
1709func (x *Float) ord() int {
1710	var m int
1711	switch x.form {
1712	case finite:
1713		m = 1
1714	case zero:
1715		return 0
1716	case inf:
1717		m = 2
1718	}
1719	if x.neg {
1720		m = -m
1721	}
1722	return m
1723}
1724
1725func umax32(x, y uint32) uint32 {
1726	if x > y {
1727		return x
1728	}
1729	return y
1730}
1731