1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// This file implements multi-precision floating-point numbers.
6// Like in the GNU MPFR library (http://www.mpfr.org/), operands
7// can be of mixed precision. Unlike MPFR, the rounding mode is
8// not specified with each operation, but with each operand. The
9// rounding mode of the result operand determines the rounding
10// mode of an operation. This is a from-scratch implementation.
11
12package big
13
14import (
15	"fmt"
16	"math"
17	"math/bits"
18)
19
20const debugFloat = false // enable for debugging
21
22// A nonzero finite Float represents a multi-precision floating point number
23//
24//   sign × mantissa × 2**exponent
25//
26// with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp.
27// A Float may also be zero (+0, -0) or infinite (+Inf, -Inf).
28// All Floats are ordered, and the ordering of two Floats x and y
29// is defined by x.Cmp(y).
30//
31// Each Float value also has a precision, rounding mode, and accuracy.
32// The precision is the maximum number of mantissa bits available to
33// represent the value. The rounding mode specifies how a result should
34// be rounded to fit into the mantissa bits, and accuracy describes the
35// rounding error with respect to the exact result.
36//
37// Unless specified otherwise, all operations (including setters) that
38// specify a *Float variable for the result (usually via the receiver
39// with the exception of MantExp), round the numeric result according
40// to the precision and rounding mode of the result variable.
41//
42// If the provided result precision is 0 (see below), it is set to the
43// precision of the argument with the largest precision value before any
44// rounding takes place, and the rounding mode remains unchanged. Thus,
45// uninitialized Floats provided as result arguments will have their
46// precision set to a reasonable value determined by the operands and
47// their mode is the zero value for RoundingMode (ToNearestEven).
48//
49// By setting the desired precision to 24 or 53 and using matching rounding
50// mode (typically ToNearestEven), Float operations produce the same results
51// as the corresponding float32 or float64 IEEE-754 arithmetic for operands
52// that correspond to normal (i.e., not denormal) float32 or float64 numbers.
53// Exponent underflow and overflow lead to a 0 or an Infinity for different
54// values than IEEE-754 because Float exponents have a much larger range.
55//
56// The zero (uninitialized) value for a Float is ready to use and represents
57// the number +0.0 exactly, with precision 0 and rounding mode ToNearestEven.
58//
59type Float struct {
60	prec uint32
61	mode RoundingMode
62	acc  Accuracy
63	form form
64	neg  bool
65	mant nat
66	exp  int32
67}
68
69// An ErrNaN panic is raised by a Float operation that would lead to
70// a NaN under IEEE-754 rules. An ErrNaN implements the error interface.
71type ErrNaN struct {
72	msg string
73}
74
75func (err ErrNaN) Error() string {
76	return err.msg
77}
78
79// NewFloat allocates and returns a new Float set to x,
80// with precision 53 and rounding mode ToNearestEven.
81// NewFloat panics with ErrNaN if x is a NaN.
82func NewFloat(x float64) *Float {
83	if math.IsNaN(x) {
84		panic(ErrNaN{"NewFloat(NaN)"})
85	}
86	return new(Float).SetFloat64(x)
87}
88
89// Exponent and precision limits.
90const (
91	MaxExp  = math.MaxInt32  // largest supported exponent
92	MinExp  = math.MinInt32  // smallest supported exponent
93	MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited
94)
95
96// Internal representation: The mantissa bits x.mant of a nonzero finite
97// Float x are stored in a nat slice long enough to hold up to x.prec bits;
98// the slice may (but doesn't have to) be shorter if the mantissa contains
99// trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e.,
100// the msb is shifted all the way "to the left"). Thus, if the mantissa has
101// trailing 0 bits or x.prec is not a multiple of the Word size _W,
102// x.mant[0] has trailing zero bits. The msb of the mantissa corresponds
103// to the value 0.5; the exponent x.exp shifts the binary point as needed.
104//
105// A zero or non-finite Float x ignores x.mant and x.exp.
106//
107// x                 form      neg      mant         exp
108// ----------------------------------------------------------
109// ±0                zero      sign     -            -
110// 0 < |x| < +Inf    finite    sign     mantissa     exponent
111// ±Inf              inf       sign     -            -
112
113// A form value describes the internal representation.
114type form byte
115
116// The form value order is relevant - do not change!
117const (
118	zero form = iota
119	finite
120	inf
121)
122
123// RoundingMode determines how a Float value is rounded to the
124// desired precision. Rounding may change the Float value; the
125// rounding error is described by the Float's Accuracy.
126type RoundingMode byte
127
128// These constants define supported rounding modes.
129const (
130	ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven
131	ToNearestAway                     // == IEEE 754-2008 roundTiesToAway
132	ToZero                            // == IEEE 754-2008 roundTowardZero
133	AwayFromZero                      // no IEEE 754-2008 equivalent
134	ToNegativeInf                     // == IEEE 754-2008 roundTowardNegative
135	ToPositiveInf                     // == IEEE 754-2008 roundTowardPositive
136)
137
138//go:generate stringer -type=RoundingMode
139
140// Accuracy describes the rounding error produced by the most recent
141// operation that generated a Float value, relative to the exact value.
142type Accuracy int8
143
144// Constants describing the Accuracy of a Float.
145const (
146	Below Accuracy = -1
147	Exact Accuracy = 0
148	Above Accuracy = +1
149)
150
151//go:generate stringer -type=Accuracy
152
153// SetPrec sets z's precision to prec and returns the (possibly) rounded
154// value of z. Rounding occurs according to z's rounding mode if the mantissa
155// cannot be represented in prec bits without loss of precision.
156// SetPrec(0) maps all finite values to ±0; infinite values remain unchanged.
157// If prec > MaxPrec, it is set to MaxPrec.
158func (z *Float) SetPrec(prec uint) *Float {
159	z.acc = Exact // optimistically assume no rounding is needed
160
161	// special case
162	if prec == 0 {
163		z.prec = 0
164		if z.form == finite {
165			// truncate z to 0
166			z.acc = makeAcc(z.neg)
167			z.form = zero
168		}
169		return z
170	}
171
172	// general case
173	if prec > MaxPrec {
174		prec = MaxPrec
175	}
176	old := z.prec
177	z.prec = uint32(prec)
178	if z.prec < old {
179		z.round(0)
180	}
181	return z
182}
183
184func makeAcc(above bool) Accuracy {
185	if above {
186		return Above
187	}
188	return Below
189}
190
191// SetMode sets z's rounding mode to mode and returns an exact z.
192// z remains unchanged otherwise.
193// z.SetMode(z.Mode()) is a cheap way to set z's accuracy to Exact.
194func (z *Float) SetMode(mode RoundingMode) *Float {
195	z.mode = mode
196	z.acc = Exact
197	return z
198}
199
200// Prec returns the mantissa precision of x in bits.
201// The result may be 0 for |x| == 0 and |x| == Inf.
202func (x *Float) Prec() uint {
203	return uint(x.prec)
204}
205
206// MinPrec returns the minimum precision required to represent x exactly
207// (i.e., the smallest prec before x.SetPrec(prec) would start rounding x).
208// The result is 0 for |x| == 0 and |x| == Inf.
209func (x *Float) MinPrec() uint {
210	if x.form != finite {
211		return 0
212	}
213	return uint(len(x.mant))*_W - x.mant.trailingZeroBits()
214}
215
216// Mode returns the rounding mode of x.
217func (x *Float) Mode() RoundingMode {
218	return x.mode
219}
220
221// Acc returns the accuracy of x produced by the most recent operation.
222func (x *Float) Acc() Accuracy {
223	return x.acc
224}
225
226// Sign returns:
227//
228//	-1 if x <   0
229//	 0 if x is ±0
230//	+1 if x >   0
231//
232func (x *Float) Sign() int {
233	if debugFloat {
234		x.validate()
235	}
236	if x.form == zero {
237		return 0
238	}
239	if x.neg {
240		return -1
241	}
242	return 1
243}
244
245// MantExp breaks x into its mantissa and exponent components
246// and returns the exponent. If a non-nil mant argument is
247// provided its value is set to the mantissa of x, with the
248// same precision and rounding mode as x. The components
249// satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0.
250// Calling MantExp with a nil argument is an efficient way to
251// get the exponent of the receiver.
252//
253// Special cases are:
254//
255//	(  ±0).MantExp(mant) = 0, with mant set to   ±0
256//	(±Inf).MantExp(mant) = 0, with mant set to ±Inf
257//
258// x and mant may be the same in which case x is set to its
259// mantissa value.
260func (x *Float) MantExp(mant *Float) (exp int) {
261	if debugFloat {
262		x.validate()
263	}
264	if x.form == finite {
265		exp = int(x.exp)
266	}
267	if mant != nil {
268		mant.Copy(x)
269		if mant.form == finite {
270			mant.exp = 0
271		}
272	}
273	return
274}
275
276func (z *Float) setExpAndRound(exp int64, sbit uint) {
277	if exp < MinExp {
278		// underflow
279		z.acc = makeAcc(z.neg)
280		z.form = zero
281		return
282	}
283
284	if exp > MaxExp {
285		// overflow
286		z.acc = makeAcc(!z.neg)
287		z.form = inf
288		return
289	}
290
291	z.form = finite
292	z.exp = int32(exp)
293	z.round(sbit)
294}
295
296// SetMantExp sets z to mant × 2**exp and and returns z.
297// The result z has the same precision and rounding mode
298// as mant. SetMantExp is an inverse of MantExp but does
299// not require 0.5 <= |mant| < 1.0. Specifically:
300//
301//	mant := new(Float)
302//	new(Float).SetMantExp(mant, x.MantExp(mant)).Cmp(x) == 0
303//
304// Special cases are:
305//
306//	z.SetMantExp(  ±0, exp) =   ±0
307//	z.SetMantExp(±Inf, exp) = ±Inf
308//
309// z and mant may be the same in which case z's exponent
310// is set to exp.
311func (z *Float) SetMantExp(mant *Float, exp int) *Float {
312	if debugFloat {
313		z.validate()
314		mant.validate()
315	}
316	z.Copy(mant)
317	if z.form != finite {
318		return z
319	}
320	z.setExpAndRound(int64(z.exp)+int64(exp), 0)
321	return z
322}
323
324// Signbit returns true if x is negative or negative zero.
325func (x *Float) Signbit() bool {
326	return x.neg
327}
328
329// IsInf reports whether x is +Inf or -Inf.
330func (x *Float) IsInf() bool {
331	return x.form == inf
332}
333
334// IsInt reports whether x is an integer.
335// ±Inf values are not integers.
336func (x *Float) IsInt() bool {
337	if debugFloat {
338		x.validate()
339	}
340	// special cases
341	if x.form != finite {
342		return x.form == zero
343	}
344	// x.form == finite
345	if x.exp <= 0 {
346		return false
347	}
348	// x.exp > 0
349	return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa
350}
351
352// debugging support
353func (x *Float) validate() {
354	if !debugFloat {
355		// avoid performance bugs
356		panic("validate called but debugFloat is not set")
357	}
358	if x.form != finite {
359		return
360	}
361	m := len(x.mant)
362	if m == 0 {
363		panic("nonzero finite number with empty mantissa")
364	}
365	const msb = 1 << (_W - 1)
366	if x.mant[m-1]&msb == 0 {
367		panic(fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Text('p', 0)))
368	}
369	if x.prec == 0 {
370		panic("zero precision finite number")
371	}
372}
373
374// round rounds z according to z.mode to z.prec bits and sets z.acc accordingly.
375// sbit must be 0 or 1 and summarizes any "sticky bit" information one might
376// have before calling round. z's mantissa must be normalized (with the msb set)
377// or empty.
378//
379// CAUTION: The rounding modes ToNegativeInf, ToPositiveInf are affected by the
380// sign of z. For correct rounding, the sign of z must be set correctly before
381// calling round.
382func (z *Float) round(sbit uint) {
383	if debugFloat {
384		z.validate()
385	}
386
387	z.acc = Exact
388	if z.form != finite {
389		// ±0 or ±Inf => nothing left to do
390		return
391	}
392	// z.form == finite && len(z.mant) > 0
393	// m > 0 implies z.prec > 0 (checked by validate)
394
395	m := uint32(len(z.mant)) // present mantissa length in words
396	bits := m * _W           // present mantissa bits; bits > 0
397	if bits <= z.prec {
398		// mantissa fits => nothing to do
399		return
400	}
401	// bits > z.prec
402
403	// Rounding is based on two bits: the rounding bit (rbit) and the
404	// sticky bit (sbit). The rbit is the bit immediately before the
405	// z.prec leading mantissa bits (the "0.5"). The sbit is set if any
406	// of the bits before the rbit are set (the "0.25", "0.125", etc.):
407	//
408	//   rbit  sbit  => "fractional part"
409	//
410	//   0     0        == 0
411	//   0     1        >  0  , < 0.5
412	//   1     0        == 0.5
413	//   1     1        >  0.5, < 1.0
414
415	// bits > z.prec: mantissa too large => round
416	r := uint(bits - z.prec - 1) // rounding bit position; r >= 0
417	rbit := z.mant.bit(r) & 1    // rounding bit; be safe and ensure it's a single bit
418	// The sticky bit is only needed for rounding ToNearestEven
419	// or when the rounding bit is zero. Avoid computation otherwise.
420	if sbit == 0 && (rbit == 0 || z.mode == ToNearestEven) {
421		sbit = z.mant.sticky(r)
422	}
423	sbit &= 1 // be safe and ensure it's a single bit
424
425	// cut off extra words
426	n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision
427	if m > n {
428		copy(z.mant, z.mant[m-n:]) // move n last words to front
429		z.mant = z.mant[:n]
430	}
431
432	// determine number of trailing zero bits (ntz) and compute lsb mask of mantissa's least-significant word
433	ntz := n*_W - z.prec // 0 <= ntz < _W
434	lsb := Word(1) << ntz
435
436	// round if result is inexact
437	if rbit|sbit != 0 {
438		// Make rounding decision: The result mantissa is truncated ("rounded down")
439		// by default. Decide if we need to increment, or "round up", the (unsigned)
440		// mantissa.
441		inc := false
442		switch z.mode {
443		case ToNegativeInf:
444			inc = z.neg
445		case ToZero:
446			// nothing to do
447		case ToNearestEven:
448			inc = rbit != 0 && (sbit != 0 || z.mant[0]&lsb != 0)
449		case ToNearestAway:
450			inc = rbit != 0
451		case AwayFromZero:
452			inc = true
453		case ToPositiveInf:
454			inc = !z.neg
455		default:
456			panic("unreachable")
457		}
458
459		// A positive result (!z.neg) is Above the exact result if we increment,
460		// and it's Below if we truncate (Exact results require no rounding).
461		// For a negative result (z.neg) it is exactly the opposite.
462		z.acc = makeAcc(inc != z.neg)
463
464		if inc {
465			// add 1 to mantissa
466			if addVW(z.mant, z.mant, lsb) != 0 {
467				// mantissa overflow => adjust exponent
468				if z.exp >= MaxExp {
469					// exponent overflow
470					z.form = inf
471					return
472				}
473				z.exp++
474				// adjust mantissa: divide by 2 to compensate for exponent adjustment
475				shrVU(z.mant, z.mant, 1)
476				// set msb == carry == 1 from the mantissa overflow above
477				const msb = 1 << (_W - 1)
478				z.mant[n-1] |= msb
479			}
480		}
481	}
482
483	// zero out trailing bits in least-significant word
484	z.mant[0] &^= lsb - 1
485
486	if debugFloat {
487		z.validate()
488	}
489}
490
491func (z *Float) setBits64(neg bool, x uint64) *Float {
492	if z.prec == 0 {
493		z.prec = 64
494	}
495	z.acc = Exact
496	z.neg = neg
497	if x == 0 {
498		z.form = zero
499		return z
500	}
501	// x != 0
502	z.form = finite
503	s := bits.LeadingZeros64(x)
504	z.mant = z.mant.setUint64(x << uint(s))
505	z.exp = int32(64 - s) // always fits
506	if z.prec < 64 {
507		z.round(0)
508	}
509	return z
510}
511
512// SetUint64 sets z to the (possibly rounded) value of x and returns z.
513// If z's precision is 0, it is changed to 64 (and rounding will have
514// no effect).
515func (z *Float) SetUint64(x uint64) *Float {
516	return z.setBits64(false, x)
517}
518
519// SetInt64 sets z to the (possibly rounded) value of x and returns z.
520// If z's precision is 0, it is changed to 64 (and rounding will have
521// no effect).
522func (z *Float) SetInt64(x int64) *Float {
523	u := x
524	if u < 0 {
525		u = -u
526	}
527	// We cannot simply call z.SetUint64(uint64(u)) and change
528	// the sign afterwards because the sign affects rounding.
529	return z.setBits64(x < 0, uint64(u))
530}
531
532// SetFloat64 sets z to the (possibly rounded) value of x and returns z.
533// If z's precision is 0, it is changed to 53 (and rounding will have
534// no effect). SetFloat64 panics with ErrNaN if x is a NaN.
535func (z *Float) SetFloat64(x float64) *Float {
536	if z.prec == 0 {
537		z.prec = 53
538	}
539	if math.IsNaN(x) {
540		panic(ErrNaN{"Float.SetFloat64(NaN)"})
541	}
542	z.acc = Exact
543	z.neg = math.Signbit(x) // handle -0, -Inf correctly
544	if x == 0 {
545		z.form = zero
546		return z
547	}
548	if math.IsInf(x, 0) {
549		z.form = inf
550		return z
551	}
552	// normalized x != 0
553	z.form = finite
554	fmant, exp := math.Frexp(x) // get normalized mantissa
555	z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11)
556	z.exp = int32(exp) // always fits
557	if z.prec < 53 {
558		z.round(0)
559	}
560	return z
561}
562
563// fnorm normalizes mantissa m by shifting it to the left
564// such that the msb of the most-significant word (msw) is 1.
565// It returns the shift amount. It assumes that len(m) != 0.
566func fnorm(m nat) int64 {
567	if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) {
568		panic("msw of mantissa is 0")
569	}
570	s := nlz(m[len(m)-1])
571	if s > 0 {
572		c := shlVU(m, m, s)
573		if debugFloat && c != 0 {
574			panic("nlz or shlVU incorrect")
575		}
576	}
577	return int64(s)
578}
579
580// SetInt sets z to the (possibly rounded) value of x and returns z.
581// If z's precision is 0, it is changed to the larger of x.BitLen()
582// or 64 (and rounding will have no effect).
583func (z *Float) SetInt(x *Int) *Float {
584	// TODO(gri) can be more efficient if z.prec > 0
585	// but small compared to the size of x, or if there
586	// are many trailing 0's.
587	bits := uint32(x.BitLen())
588	if z.prec == 0 {
589		z.prec = umax32(bits, 64)
590	}
591	z.acc = Exact
592	z.neg = x.neg
593	if len(x.abs) == 0 {
594		z.form = zero
595		return z
596	}
597	// x != 0
598	z.mant = z.mant.set(x.abs)
599	fnorm(z.mant)
600	z.setExpAndRound(int64(bits), 0)
601	return z
602}
603
604// SetRat sets z to the (possibly rounded) value of x and returns z.
605// If z's precision is 0, it is changed to the largest of a.BitLen(),
606// b.BitLen(), or 64; with x = a/b.
607func (z *Float) SetRat(x *Rat) *Float {
608	if x.IsInt() {
609		return z.SetInt(x.Num())
610	}
611	var a, b Float
612	a.SetInt(x.Num())
613	b.SetInt(x.Denom())
614	if z.prec == 0 {
615		z.prec = umax32(a.prec, b.prec)
616	}
617	return z.Quo(&a, &b)
618}
619
620// SetInf sets z to the infinite Float -Inf if signbit is
621// set, or +Inf if signbit is not set, and returns z. The
622// precision of z is unchanged and the result is always
623// Exact.
624func (z *Float) SetInf(signbit bool) *Float {
625	z.acc = Exact
626	z.form = inf
627	z.neg = signbit
628	return z
629}
630
631// Set sets z to the (possibly rounded) value of x and returns z.
632// If z's precision is 0, it is changed to the precision of x
633// before setting z (and rounding will have no effect).
634// Rounding is performed according to z's precision and rounding
635// mode; and z's accuracy reports the result error relative to the
636// exact (not rounded) result.
637func (z *Float) Set(x *Float) *Float {
638	if debugFloat {
639		x.validate()
640	}
641	z.acc = Exact
642	if z != x {
643		z.form = x.form
644		z.neg = x.neg
645		if x.form == finite {
646			z.exp = x.exp
647			z.mant = z.mant.set(x.mant)
648		}
649		if z.prec == 0 {
650			z.prec = x.prec
651		} else if z.prec < x.prec {
652			z.round(0)
653		}
654	}
655	return z
656}
657
658// Copy sets z to x, with the same precision, rounding mode, and
659// accuracy as x, and returns z. x is not changed even if z and
660// x are the same.
661func (z *Float) Copy(x *Float) *Float {
662	if debugFloat {
663		x.validate()
664	}
665	if z != x {
666		z.prec = x.prec
667		z.mode = x.mode
668		z.acc = x.acc
669		z.form = x.form
670		z.neg = x.neg
671		if z.form == finite {
672			z.mant = z.mant.set(x.mant)
673			z.exp = x.exp
674		}
675	}
676	return z
677}
678
679// msb32 returns the 32 most significant bits of x.
680func msb32(x nat) uint32 {
681	i := len(x) - 1
682	if i < 0 {
683		return 0
684	}
685	if debugFloat && x[i]&(1<<(_W-1)) == 0 {
686		panic("x not normalized")
687	}
688	switch _W {
689	case 32:
690		return uint32(x[i])
691	case 64:
692		return uint32(x[i] >> 32)
693	}
694	panic("unreachable")
695}
696
697// msb64 returns the 64 most significant bits of x.
698func msb64(x nat) uint64 {
699	i := len(x) - 1
700	if i < 0 {
701		return 0
702	}
703	if debugFloat && x[i]&(1<<(_W-1)) == 0 {
704		panic("x not normalized")
705	}
706	switch _W {
707	case 32:
708		v := uint64(x[i]) << 32
709		if i > 0 {
710			v |= uint64(x[i-1])
711		}
712		return v
713	case 64:
714		return uint64(x[i])
715	}
716	panic("unreachable")
717}
718
719// Uint64 returns the unsigned integer resulting from truncating x
720// towards zero. If 0 <= x <= math.MaxUint64, the result is Exact
721// if x is an integer and Below otherwise.
722// The result is (0, Above) for x < 0, and (math.MaxUint64, Below)
723// for x > math.MaxUint64.
724func (x *Float) Uint64() (uint64, Accuracy) {
725	if debugFloat {
726		x.validate()
727	}
728
729	switch x.form {
730	case finite:
731		if x.neg {
732			return 0, Above
733		}
734		// 0 < x < +Inf
735		if x.exp <= 0 {
736			// 0 < x < 1
737			return 0, Below
738		}
739		// 1 <= x < Inf
740		if x.exp <= 64 {
741			// u = trunc(x) fits into a uint64
742			u := msb64(x.mant) >> (64 - uint32(x.exp))
743			if x.MinPrec() <= 64 {
744				return u, Exact
745			}
746			return u, Below // x truncated
747		}
748		// x too large
749		return math.MaxUint64, Below
750
751	case zero:
752		return 0, Exact
753
754	case inf:
755		if x.neg {
756			return 0, Above
757		}
758		return math.MaxUint64, Below
759	}
760
761	panic("unreachable")
762}
763
764// Int64 returns the integer resulting from truncating x towards zero.
765// If math.MinInt64 <= x <= math.MaxInt64, the result is Exact if x is
766// an integer, and Above (x < 0) or Below (x > 0) otherwise.
767// The result is (math.MinInt64, Above) for x < math.MinInt64,
768// and (math.MaxInt64, Below) for x > math.MaxInt64.
769func (x *Float) Int64() (int64, Accuracy) {
770	if debugFloat {
771		x.validate()
772	}
773
774	switch x.form {
775	case finite:
776		// 0 < |x| < +Inf
777		acc := makeAcc(x.neg)
778		if x.exp <= 0 {
779			// 0 < |x| < 1
780			return 0, acc
781		}
782		// x.exp > 0
783
784		// 1 <= |x| < +Inf
785		if x.exp <= 63 {
786			// i = trunc(x) fits into an int64 (excluding math.MinInt64)
787			i := int64(msb64(x.mant) >> (64 - uint32(x.exp)))
788			if x.neg {
789				i = -i
790			}
791			if x.MinPrec() <= uint(x.exp) {
792				return i, Exact
793			}
794			return i, acc // x truncated
795		}
796		if x.neg {
797			// check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64))
798			if x.exp == 64 && x.MinPrec() == 1 {
799				acc = Exact
800			}
801			return math.MinInt64, acc
802		}
803		// x too large
804		return math.MaxInt64, Below
805
806	case zero:
807		return 0, Exact
808
809	case inf:
810		if x.neg {
811			return math.MinInt64, Above
812		}
813		return math.MaxInt64, Below
814	}
815
816	panic("unreachable")
817}
818
819// Float32 returns the float32 value nearest to x. If x is too small to be
820// represented by a float32 (|x| < math.SmallestNonzeroFloat32), the result
821// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
822// If x is too large to be represented by a float32 (|x| > math.MaxFloat32),
823// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
824func (x *Float) Float32() (float32, Accuracy) {
825	if debugFloat {
826		x.validate()
827	}
828
829	switch x.form {
830	case finite:
831		// 0 < |x| < +Inf
832
833		const (
834			fbits = 32                //        float size
835			mbits = 23                //        mantissa size (excluding implicit msb)
836			ebits = fbits - mbits - 1 //     8  exponent size
837			bias  = 1<<(ebits-1) - 1  //   127  exponent bias
838			dmin  = 1 - bias - mbits  //  -149  smallest unbiased exponent (denormal)
839			emin  = 1 - bias          //  -126  smallest unbiased exponent (normal)
840			emax  = bias              //   127  largest unbiased exponent (normal)
841		)
842
843		// Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa.
844		e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
845
846		// Compute precision p for float32 mantissa.
847		// If the exponent is too small, we have a denormal number before
848		// rounding and fewer than p mantissa bits of precision available
849		// (the exponent remains fixed but the mantissa gets shifted right).
850		p := mbits + 1 // precision of normal float
851		if e < emin {
852			// recompute precision
853			p = mbits + 1 - emin + int(e)
854			// If p == 0, the mantissa of x is shifted so much to the right
855			// that its msb falls immediately to the right of the float32
856			// mantissa space. In other words, if the smallest denormal is
857			// considered "1.0", for p == 0, the mantissa value m is >= 0.5.
858			// If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
859			// If m == 0.5, it is rounded down to even, i.e., 0.0.
860			// If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
861			if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
862				// underflow to ±0
863				if x.neg {
864					var z float32
865					return -z, Above
866				}
867				return 0.0, Below
868			}
869			// otherwise, round up
870			// We handle p == 0 explicitly because it's easy and because
871			// Float.round doesn't support rounding to 0 bits of precision.
872			if p == 0 {
873				if x.neg {
874					return -math.SmallestNonzeroFloat32, Below
875				}
876				return math.SmallestNonzeroFloat32, Above
877			}
878		}
879		// p > 0
880
881		// round
882		var r Float
883		r.prec = uint32(p)
884		r.Set(x)
885		e = r.exp - 1
886
887		// Rounding may have caused r to overflow to ±Inf
888		// (rounding never causes underflows to 0).
889		// If the exponent is too large, also overflow to ±Inf.
890		if r.form == inf || e > emax {
891			// overflow
892			if x.neg {
893				return float32(math.Inf(-1)), Below
894			}
895			return float32(math.Inf(+1)), Above
896		}
897		// e <= emax
898
899		// Determine sign, biased exponent, and mantissa.
900		var sign, bexp, mant uint32
901		if x.neg {
902			sign = 1 << (fbits - 1)
903		}
904
905		// Rounding may have caused a denormal number to
906		// become normal. Check again.
907		if e < emin {
908			// denormal number: recompute precision
909			// Since rounding may have at best increased precision
910			// and we have eliminated p <= 0 early, we know p > 0.
911			// bexp == 0 for denormals
912			p = mbits + 1 - emin + int(e)
913			mant = msb32(r.mant) >> uint(fbits-p)
914		} else {
915			// normal number: emin <= e <= emax
916			bexp = uint32(e+bias) << mbits
917			mant = msb32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
918		}
919
920		return math.Float32frombits(sign | bexp | mant), r.acc
921
922	case zero:
923		if x.neg {
924			var z float32
925			return -z, Exact
926		}
927		return 0.0, Exact
928
929	case inf:
930		if x.neg {
931			return float32(math.Inf(-1)), Exact
932		}
933		return float32(math.Inf(+1)), Exact
934	}
935
936	panic("unreachable")
937}
938
939// Float64 returns the float64 value nearest to x. If x is too small to be
940// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
941// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
942// If x is too large to be represented by a float64 (|x| > math.MaxFloat64),
943// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
944func (x *Float) Float64() (float64, Accuracy) {
945	if debugFloat {
946		x.validate()
947	}
948
949	switch x.form {
950	case finite:
951		// 0 < |x| < +Inf
952
953		const (
954			fbits = 64                //        float size
955			mbits = 52                //        mantissa size (excluding implicit msb)
956			ebits = fbits - mbits - 1 //    11  exponent size
957			bias  = 1<<(ebits-1) - 1  //  1023  exponent bias
958			dmin  = 1 - bias - mbits  // -1074  smallest unbiased exponent (denormal)
959			emin  = 1 - bias          // -1022  smallest unbiased exponent (normal)
960			emax  = bias              //  1023  largest unbiased exponent (normal)
961		)
962
963		// Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa.
964		e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
965
966		// Compute precision p for float64 mantissa.
967		// If the exponent is too small, we have a denormal number before
968		// rounding and fewer than p mantissa bits of precision available
969		// (the exponent remains fixed but the mantissa gets shifted right).
970		p := mbits + 1 // precision of normal float
971		if e < emin {
972			// recompute precision
973			p = mbits + 1 - emin + int(e)
974			// If p == 0, the mantissa of x is shifted so much to the right
975			// that its msb falls immediately to the right of the float64
976			// mantissa space. In other words, if the smallest denormal is
977			// considered "1.0", for p == 0, the mantissa value m is >= 0.5.
978			// If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
979			// If m == 0.5, it is rounded down to even, i.e., 0.0.
980			// If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
981			if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
982				// underflow to ±0
983				if x.neg {
984					var z float64
985					return -z, Above
986				}
987				return 0.0, Below
988			}
989			// otherwise, round up
990			// We handle p == 0 explicitly because it's easy and because
991			// Float.round doesn't support rounding to 0 bits of precision.
992			if p == 0 {
993				if x.neg {
994					return -math.SmallestNonzeroFloat64, Below
995				}
996				return math.SmallestNonzeroFloat64, Above
997			}
998		}
999		// p > 0
1000
1001		// round
1002		var r Float
1003		r.prec = uint32(p)
1004		r.Set(x)
1005		e = r.exp - 1
1006
1007		// Rounding may have caused r to overflow to ±Inf
1008		// (rounding never causes underflows to 0).
1009		// If the exponent is too large, also overflow to ±Inf.
1010		if r.form == inf || e > emax {
1011			// overflow
1012			if x.neg {
1013				return math.Inf(-1), Below
1014			}
1015			return math.Inf(+1), Above
1016		}
1017		// e <= emax
1018
1019		// Determine sign, biased exponent, and mantissa.
1020		var sign, bexp, mant uint64
1021		if x.neg {
1022			sign = 1 << (fbits - 1)
1023		}
1024
1025		// Rounding may have caused a denormal number to
1026		// become normal. Check again.
1027		if e < emin {
1028			// denormal number: recompute precision
1029			// Since rounding may have at best increased precision
1030			// and we have eliminated p <= 0 early, we know p > 0.
1031			// bexp == 0 for denormals
1032			p = mbits + 1 - emin + int(e)
1033			mant = msb64(r.mant) >> uint(fbits-p)
1034		} else {
1035			// normal number: emin <= e <= emax
1036			bexp = uint64(e+bias) << mbits
1037			mant = msb64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
1038		}
1039
1040		return math.Float64frombits(sign | bexp | mant), r.acc
1041
1042	case zero:
1043		if x.neg {
1044			var z float64
1045			return -z, Exact
1046		}
1047		return 0.0, Exact
1048
1049	case inf:
1050		if x.neg {
1051			return math.Inf(-1), Exact
1052		}
1053		return math.Inf(+1), Exact
1054	}
1055
1056	panic("unreachable")
1057}
1058
1059// Int returns the result of truncating x towards zero;
1060// or nil if x is an infinity.
1061// The result is Exact if x.IsInt(); otherwise it is Below
1062// for x > 0, and Above for x < 0.
1063// If a non-nil *Int argument z is provided, Int stores
1064// the result in z instead of allocating a new Int.
1065func (x *Float) Int(z *Int) (*Int, Accuracy) {
1066	if debugFloat {
1067		x.validate()
1068	}
1069
1070	if z == nil && x.form <= finite {
1071		z = new(Int)
1072	}
1073
1074	switch x.form {
1075	case finite:
1076		// 0 < |x| < +Inf
1077		acc := makeAcc(x.neg)
1078		if x.exp <= 0 {
1079			// 0 < |x| < 1
1080			return z.SetInt64(0), acc
1081		}
1082		// x.exp > 0
1083
1084		// 1 <= |x| < +Inf
1085		// determine minimum required precision for x
1086		allBits := uint(len(x.mant)) * _W
1087		exp := uint(x.exp)
1088		if x.MinPrec() <= exp {
1089			acc = Exact
1090		}
1091		// shift mantissa as needed
1092		if z == nil {
1093			z = new(Int)
1094		}
1095		z.neg = x.neg
1096		switch {
1097		case exp > allBits:
1098			z.abs = z.abs.shl(x.mant, exp-allBits)
1099		default:
1100			z.abs = z.abs.set(x.mant)
1101		case exp < allBits:
1102			z.abs = z.abs.shr(x.mant, allBits-exp)
1103		}
1104		return z, acc
1105
1106	case zero:
1107		return z.SetInt64(0), Exact
1108
1109	case inf:
1110		return nil, makeAcc(x.neg)
1111	}
1112
1113	panic("unreachable")
1114}
1115
1116// Rat returns the rational number corresponding to x;
1117// or nil if x is an infinity.
1118// The result is Exact if x is not an Inf.
1119// If a non-nil *Rat argument z is provided, Rat stores
1120// the result in z instead of allocating a new Rat.
1121func (x *Float) Rat(z *Rat) (*Rat, Accuracy) {
1122	if debugFloat {
1123		x.validate()
1124	}
1125
1126	if z == nil && x.form <= finite {
1127		z = new(Rat)
1128	}
1129
1130	switch x.form {
1131	case finite:
1132		// 0 < |x| < +Inf
1133		allBits := int32(len(x.mant)) * _W
1134		// build up numerator and denominator
1135		z.a.neg = x.neg
1136		switch {
1137		case x.exp > allBits:
1138			z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits))
1139			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
1140			// z already in normal form
1141		default:
1142			z.a.abs = z.a.abs.set(x.mant)
1143			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
1144			// z already in normal form
1145		case x.exp < allBits:
1146			z.a.abs = z.a.abs.set(x.mant)
1147			t := z.b.abs.setUint64(1)
1148			z.b.abs = t.shl(t, uint(allBits-x.exp))
1149			z.norm()
1150		}
1151		return z, Exact
1152
1153	case zero:
1154		return z.SetInt64(0), Exact
1155
1156	case inf:
1157		return nil, makeAcc(x.neg)
1158	}
1159
1160	panic("unreachable")
1161}
1162
1163// Abs sets z to the (possibly rounded) value |x| (the absolute value of x)
1164// and returns z.
1165func (z *Float) Abs(x *Float) *Float {
1166	z.Set(x)
1167	z.neg = false
1168	return z
1169}
1170
1171// Neg sets z to the (possibly rounded) value of x with its sign negated,
1172// and returns z.
1173func (z *Float) Neg(x *Float) *Float {
1174	z.Set(x)
1175	z.neg = !z.neg
1176	return z
1177}
1178
1179func validateBinaryOperands(x, y *Float) {
1180	if !debugFloat {
1181		// avoid performance bugs
1182		panic("validateBinaryOperands called but debugFloat is not set")
1183	}
1184	if len(x.mant) == 0 {
1185		panic("empty mantissa for x")
1186	}
1187	if len(y.mant) == 0 {
1188		panic("empty mantissa for y")
1189	}
1190}
1191
1192// z = x + y, ignoring signs of x and y for the addition
1193// but using the sign of z for rounding the result.
1194// x and y must have a non-empty mantissa and valid exponent.
1195func (z *Float) uadd(x, y *Float) {
1196	// Note: This implementation requires 2 shifts most of the
1197	// time. It is also inefficient if exponents or precisions
1198	// differ by wide margins. The following article describes
1199	// an efficient (but much more complicated) implementation
1200	// compatible with the internal representation used here:
1201	//
1202	// Vincent Lefèvre: "The Generic Multiple-Precision Floating-
1203	// Point Addition With Exact Rounding (as in the MPFR Library)"
1204	// http://www.vinc17.net/research/papers/rnc6.pdf
1205
1206	if debugFloat {
1207		validateBinaryOperands(x, y)
1208	}
1209
1210	// compute exponents ex, ey for mantissa with "binary point"
1211	// on the right (mantissa.0) - use int64 to avoid overflow
1212	ex := int64(x.exp) - int64(len(x.mant))*_W
1213	ey := int64(y.exp) - int64(len(y.mant))*_W
1214
1215	al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
1216
1217	// TODO(gri) having a combined add-and-shift primitive
1218	//           could make this code significantly faster
1219	switch {
1220	case ex < ey:
1221		if al {
1222			t := nat(nil).shl(y.mant, uint(ey-ex))
1223			z.mant = z.mant.add(x.mant, t)
1224		} else {
1225			z.mant = z.mant.shl(y.mant, uint(ey-ex))
1226			z.mant = z.mant.add(x.mant, z.mant)
1227		}
1228	default:
1229		// ex == ey, no shift needed
1230		z.mant = z.mant.add(x.mant, y.mant)
1231	case ex > ey:
1232		if al {
1233			t := nat(nil).shl(x.mant, uint(ex-ey))
1234			z.mant = z.mant.add(t, y.mant)
1235		} else {
1236			z.mant = z.mant.shl(x.mant, uint(ex-ey))
1237			z.mant = z.mant.add(z.mant, y.mant)
1238		}
1239		ex = ey
1240	}
1241	// len(z.mant) > 0
1242
1243	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
1244}
1245
1246// z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction
1247// but using the sign of z for rounding the result.
1248// x and y must have a non-empty mantissa and valid exponent.
1249func (z *Float) usub(x, y *Float) {
1250	// This code is symmetric to uadd.
1251	// We have not factored the common code out because
1252	// eventually uadd (and usub) should be optimized
1253	// by special-casing, and the code will diverge.
1254
1255	if debugFloat {
1256		validateBinaryOperands(x, y)
1257	}
1258
1259	ex := int64(x.exp) - int64(len(x.mant))*_W
1260	ey := int64(y.exp) - int64(len(y.mant))*_W
1261
1262	al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
1263
1264	switch {
1265	case ex < ey:
1266		if al {
1267			t := nat(nil).shl(y.mant, uint(ey-ex))
1268			z.mant = t.sub(x.mant, t)
1269		} else {
1270			z.mant = z.mant.shl(y.mant, uint(ey-ex))
1271			z.mant = z.mant.sub(x.mant, z.mant)
1272		}
1273	default:
1274		// ex == ey, no shift needed
1275		z.mant = z.mant.sub(x.mant, y.mant)
1276	case ex > ey:
1277		if al {
1278			t := nat(nil).shl(x.mant, uint(ex-ey))
1279			z.mant = t.sub(t, y.mant)
1280		} else {
1281			z.mant = z.mant.shl(x.mant, uint(ex-ey))
1282			z.mant = z.mant.sub(z.mant, y.mant)
1283		}
1284		ex = ey
1285	}
1286
1287	// operands may have canceled each other out
1288	if len(z.mant) == 0 {
1289		z.acc = Exact
1290		z.form = zero
1291		z.neg = false
1292		return
1293	}
1294	// len(z.mant) > 0
1295
1296	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
1297}
1298
1299// z = x * y, ignoring signs of x and y for the multiplication
1300// but using the sign of z for rounding the result.
1301// x and y must have a non-empty mantissa and valid exponent.
1302func (z *Float) umul(x, y *Float) {
1303	if debugFloat {
1304		validateBinaryOperands(x, y)
1305	}
1306
1307	// Note: This is doing too much work if the precision
1308	// of z is less than the sum of the precisions of x
1309	// and y which is often the case (e.g., if all floats
1310	// have the same precision).
1311	// TODO(gri) Optimize this for the common case.
1312
1313	e := int64(x.exp) + int64(y.exp)
1314	if x == y {
1315		z.mant = z.mant.sqr(x.mant)
1316	} else {
1317		z.mant = z.mant.mul(x.mant, y.mant)
1318	}
1319	z.setExpAndRound(e-fnorm(z.mant), 0)
1320}
1321
1322// z = x / y, ignoring signs of x and y for the division
1323// but using the sign of z for rounding the result.
1324// x and y must have a non-empty mantissa and valid exponent.
1325func (z *Float) uquo(x, y *Float) {
1326	if debugFloat {
1327		validateBinaryOperands(x, y)
1328	}
1329
1330	// mantissa length in words for desired result precision + 1
1331	// (at least one extra bit so we get the rounding bit after
1332	// the division)
1333	n := int(z.prec/_W) + 1
1334
1335	// compute adjusted x.mant such that we get enough result precision
1336	xadj := x.mant
1337	if d := n - len(x.mant) + len(y.mant); d > 0 {
1338		// d extra words needed => add d "0 digits" to x
1339		xadj = make(nat, len(x.mant)+d)
1340		copy(xadj[d:], x.mant)
1341	}
1342	// TODO(gri): If we have too many digits (d < 0), we should be able
1343	// to shorten x for faster division. But we must be extra careful
1344	// with rounding in that case.
1345
1346	// Compute d before division since there may be aliasing of x.mant
1347	// (via xadj) or y.mant with z.mant.
1348	d := len(xadj) - len(y.mant)
1349
1350	// divide
1351	var r nat
1352	z.mant, r = z.mant.div(nil, xadj, y.mant)
1353	e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W
1354
1355	// The result is long enough to include (at least) the rounding bit.
1356	// If there's a non-zero remainder, the corresponding fractional part
1357	// (if it were computed), would have a non-zero sticky bit (if it were
1358	// zero, it couldn't have a non-zero remainder).
1359	var sbit uint
1360	if len(r) > 0 {
1361		sbit = 1
1362	}
1363
1364	z.setExpAndRound(e-fnorm(z.mant), sbit)
1365}
1366
1367// ucmp returns -1, 0, or +1, depending on whether
1368// |x| < |y|, |x| == |y|, or |x| > |y|.
1369// x and y must have a non-empty mantissa and valid exponent.
1370func (x *Float) ucmp(y *Float) int {
1371	if debugFloat {
1372		validateBinaryOperands(x, y)
1373	}
1374
1375	switch {
1376	case x.exp < y.exp:
1377		return -1
1378	case x.exp > y.exp:
1379		return +1
1380	}
1381	// x.exp == y.exp
1382
1383	// compare mantissas
1384	i := len(x.mant)
1385	j := len(y.mant)
1386	for i > 0 || j > 0 {
1387		var xm, ym Word
1388		if i > 0 {
1389			i--
1390			xm = x.mant[i]
1391		}
1392		if j > 0 {
1393			j--
1394			ym = y.mant[j]
1395		}
1396		switch {
1397		case xm < ym:
1398			return -1
1399		case xm > ym:
1400			return +1
1401		}
1402	}
1403
1404	return 0
1405}
1406
1407// Handling of sign bit as defined by IEEE 754-2008, section 6.3:
1408//
1409// When neither the inputs nor result are NaN, the sign of a product or
1410// quotient is the exclusive OR of the operands’ signs; the sign of a sum,
1411// or of a difference x−y regarded as a sum x+(−y), differs from at most
1412// one of the addends’ signs; and the sign of the result of conversions,
1413// the quantize operation, the roundToIntegral operations, and the
1414// roundToIntegralExact (see 5.3.1) is the sign of the first or only operand.
1415// These rules shall apply even when operands or results are zero or infinite.
1416//
1417// When the sum of two operands with opposite signs (or the difference of
1418// two operands with like signs) is exactly zero, the sign of that sum (or
1419// difference) shall be +0 in all rounding-direction attributes except
1420// roundTowardNegative; under that attribute, the sign of an exact zero
1421// sum (or difference) shall be −0. However, x+x = x−(−x) retains the same
1422// sign as x even when x is zero.
1423//
1424// See also: https://play.golang.org/p/RtH3UCt5IH
1425
1426// Add sets z to the rounded sum x+y and returns z. If z's precision is 0,
1427// it is changed to the larger of x's or y's precision before the operation.
1428// Rounding is performed according to z's precision and rounding mode; and
1429// z's accuracy reports the result error relative to the exact (not rounded)
1430// result. Add panics with ErrNaN if x and y are infinities with opposite
1431// signs. The value of z is undefined in that case.
1432//
1433// BUG(gri) When rounding ToNegativeInf, the sign of Float values rounded to 0 is incorrect.
1434func (z *Float) Add(x, y *Float) *Float {
1435	if debugFloat {
1436		x.validate()
1437		y.validate()
1438	}
1439
1440	if z.prec == 0 {
1441		z.prec = umax32(x.prec, y.prec)
1442	}
1443
1444	if x.form == finite && y.form == finite {
1445		// x + y (common case)
1446
1447		// Below we set z.neg = x.neg, and when z aliases y this will
1448		// change the y operand's sign. This is fine, because if an
1449		// operand aliases the receiver it'll be overwritten, but we still
1450		// want the original x.neg and y.neg values when we evaluate
1451		// x.neg != y.neg, so we need to save y.neg before setting z.neg.
1452		yneg := y.neg
1453
1454		z.neg = x.neg
1455		if x.neg == yneg {
1456			// x + y == x + y
1457			// (-x) + (-y) == -(x + y)
1458			z.uadd(x, y)
1459		} else {
1460			// x + (-y) == x - y == -(y - x)
1461			// (-x) + y == y - x == -(x - y)
1462			if x.ucmp(y) > 0 {
1463				z.usub(x, y)
1464			} else {
1465				z.neg = !z.neg
1466				z.usub(y, x)
1467			}
1468		}
1469		return z
1470	}
1471
1472	if x.form == inf && y.form == inf && x.neg != y.neg {
1473		// +Inf + -Inf
1474		// -Inf + +Inf
1475		// value of z is undefined but make sure it's valid
1476		z.acc = Exact
1477		z.form = zero
1478		z.neg = false
1479		panic(ErrNaN{"addition of infinities with opposite signs"})
1480	}
1481
1482	if x.form == zero && y.form == zero {
1483		// ±0 + ±0
1484		z.acc = Exact
1485		z.form = zero
1486		z.neg = x.neg && y.neg // -0 + -0 == -0
1487		return z
1488	}
1489
1490	if x.form == inf || y.form == zero {
1491		// ±Inf + y
1492		// x + ±0
1493		return z.Set(x)
1494	}
1495
1496	// ±0 + y
1497	// x + ±Inf
1498	return z.Set(y)
1499}
1500
1501// Sub sets z to the rounded difference x-y and returns z.
1502// Precision, rounding, and accuracy reporting are as for Add.
1503// Sub panics with ErrNaN if x and y are infinities with equal
1504// signs. The value of z is undefined in that case.
1505func (z *Float) Sub(x, y *Float) *Float {
1506	if debugFloat {
1507		x.validate()
1508		y.validate()
1509	}
1510
1511	if z.prec == 0 {
1512		z.prec = umax32(x.prec, y.prec)
1513	}
1514
1515	if x.form == finite && y.form == finite {
1516		// x - y (common case)
1517		yneg := y.neg
1518		z.neg = x.neg
1519		if x.neg != yneg {
1520			// x - (-y) == x + y
1521			// (-x) - y == -(x + y)
1522			z.uadd(x, y)
1523		} else {
1524			// x - y == x - y == -(y - x)
1525			// (-x) - (-y) == y - x == -(x - y)
1526			if x.ucmp(y) > 0 {
1527				z.usub(x, y)
1528			} else {
1529				z.neg = !z.neg
1530				z.usub(y, x)
1531			}
1532		}
1533		return z
1534	}
1535
1536	if x.form == inf && y.form == inf && x.neg == y.neg {
1537		// +Inf - +Inf
1538		// -Inf - -Inf
1539		// value of z is undefined but make sure it's valid
1540		z.acc = Exact
1541		z.form = zero
1542		z.neg = false
1543		panic(ErrNaN{"subtraction of infinities with equal signs"})
1544	}
1545
1546	if x.form == zero && y.form == zero {
1547		// ±0 - ±0
1548		z.acc = Exact
1549		z.form = zero
1550		z.neg = x.neg && !y.neg // -0 - +0 == -0
1551		return z
1552	}
1553
1554	if x.form == inf || y.form == zero {
1555		// ±Inf - y
1556		// x - ±0
1557		return z.Set(x)
1558	}
1559
1560	// ±0 - y
1561	// x - ±Inf
1562	return z.Neg(y)
1563}
1564
1565// Mul sets z to the rounded product x*y and returns z.
1566// Precision, rounding, and accuracy reporting are as for Add.
1567// Mul panics with ErrNaN if one operand is zero and the other
1568// operand an infinity. The value of z is undefined in that case.
1569func (z *Float) Mul(x, y *Float) *Float {
1570	if debugFloat {
1571		x.validate()
1572		y.validate()
1573	}
1574
1575	if z.prec == 0 {
1576		z.prec = umax32(x.prec, y.prec)
1577	}
1578
1579	z.neg = x.neg != y.neg
1580
1581	if x.form == finite && y.form == finite {
1582		// x * y (common case)
1583		z.umul(x, y)
1584		return z
1585	}
1586
1587	z.acc = Exact
1588	if x.form == zero && y.form == inf || x.form == inf && y.form == zero {
1589		// ±0 * ±Inf
1590		// ±Inf * ±0
1591		// value of z is undefined but make sure it's valid
1592		z.form = zero
1593		z.neg = false
1594		panic(ErrNaN{"multiplication of zero with infinity"})
1595	}
1596
1597	if x.form == inf || y.form == inf {
1598		// ±Inf * y
1599		// x * ±Inf
1600		z.form = inf
1601		return z
1602	}
1603
1604	// ±0 * y
1605	// x * ±0
1606	z.form = zero
1607	return z
1608}
1609
1610// Quo sets z to the rounded quotient x/y and returns z.
1611// Precision, rounding, and accuracy reporting are as for Add.
1612// Quo panics with ErrNaN if both operands are zero or infinities.
1613// The value of z is undefined in that case.
1614func (z *Float) Quo(x, y *Float) *Float {
1615	if debugFloat {
1616		x.validate()
1617		y.validate()
1618	}
1619
1620	if z.prec == 0 {
1621		z.prec = umax32(x.prec, y.prec)
1622	}
1623
1624	z.neg = x.neg != y.neg
1625
1626	if x.form == finite && y.form == finite {
1627		// x / y (common case)
1628		z.uquo(x, y)
1629		return z
1630	}
1631
1632	z.acc = Exact
1633	if x.form == zero && y.form == zero || x.form == inf && y.form == inf {
1634		// ±0 / ±0
1635		// ±Inf / ±Inf
1636		// value of z is undefined but make sure it's valid
1637		z.form = zero
1638		z.neg = false
1639		panic(ErrNaN{"division of zero by zero or infinity by infinity"})
1640	}
1641
1642	if x.form == zero || y.form == inf {
1643		// ±0 / y
1644		// x / ±Inf
1645		z.form = zero
1646		return z
1647	}
1648
1649	// x / ±0
1650	// ±Inf / y
1651	z.form = inf
1652	return z
1653}
1654
1655// Cmp compares x and y and returns:
1656//
1657//   -1 if x <  y
1658//    0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf)
1659//   +1 if x >  y
1660//
1661func (x *Float) Cmp(y *Float) int {
1662	if debugFloat {
1663		x.validate()
1664		y.validate()
1665	}
1666
1667	mx := x.ord()
1668	my := y.ord()
1669	switch {
1670	case mx < my:
1671		return -1
1672	case mx > my:
1673		return +1
1674	}
1675	// mx == my
1676
1677	// only if |mx| == 1 we have to compare the mantissae
1678	switch mx {
1679	case -1:
1680		return y.ucmp(x)
1681	case +1:
1682		return x.ucmp(y)
1683	}
1684
1685	return 0
1686}
1687
1688// ord classifies x and returns:
1689//
1690//	-2 if -Inf == x
1691//	-1 if -Inf < x < 0
1692//	 0 if x == 0 (signed or unsigned)
1693//	+1 if 0 < x < +Inf
1694//	+2 if x == +Inf
1695//
1696func (x *Float) ord() int {
1697	var m int
1698	switch x.form {
1699	case finite:
1700		m = 1
1701	case zero:
1702		return 0
1703	case inf:
1704		m = 2
1705	}
1706	if x.neg {
1707		m = -m
1708	}
1709	return m
1710}
1711
1712func umax32(x, y uint32) uint32 {
1713	if x > y {
1714		return x
1715	}
1716	return y
1717}
1718