1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// This file implements multi-precision floating-point numbers.
6// Like in the GNU MPFR library (https://www.mpfr.org/), operands
7// can be of mixed precision. Unlike MPFR, the rounding mode is
8// not specified with each operation, but with each operand. The
9// rounding mode of the result operand determines the rounding
10// mode of an operation. This is a from-scratch implementation.
11
12package big
13
14import (
15	"fmt"
16	"math"
17	"math/bits"
18)
19
20const debugFloat = false // enable for debugging
21
22// A nonzero finite Float represents a multi-precision floating point number
23//
24//   sign × mantissa × 2**exponent
25//
26// with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp.
27// A Float may also be zero (+0, -0) or infinite (+Inf, -Inf).
28// All Floats are ordered, and the ordering of two Floats x and y
29// is defined by x.Cmp(y).
30//
31// Each Float value also has a precision, rounding mode, and accuracy.
32// The precision is the maximum number of mantissa bits available to
33// represent the value. The rounding mode specifies how a result should
34// be rounded to fit into the mantissa bits, and accuracy describes the
35// rounding error with respect to the exact result.
36//
37// Unless specified otherwise, all operations (including setters) that
38// specify a *Float variable for the result (usually via the receiver
39// with the exception of MantExp), round the numeric result according
40// to the precision and rounding mode of the result variable.
41//
42// If the provided result precision is 0 (see below), it is set to the
43// precision of the argument with the largest precision value before any
44// rounding takes place, and the rounding mode remains unchanged. Thus,
45// uninitialized Floats provided as result arguments will have their
46// precision set to a reasonable value determined by the operands, and
47// their mode is the zero value for RoundingMode (ToNearestEven).
48//
49// By setting the desired precision to 24 or 53 and using matching rounding
50// mode (typically ToNearestEven), Float operations produce the same results
51// as the corresponding float32 or float64 IEEE-754 arithmetic for operands
52// that correspond to normal (i.e., not denormal) float32 or float64 numbers.
53// Exponent underflow and overflow lead to a 0 or an Infinity for different
54// values than IEEE-754 because Float exponents have a much larger range.
55//
56// The zero (uninitialized) value for a Float is ready to use and represents
57// the number +0.0 exactly, with precision 0 and rounding mode ToNearestEven.
58//
59// Operations always take pointer arguments (*Float) rather
60// than Float values, and each unique Float value requires
61// its own unique *Float pointer. To "copy" a Float value,
62// an existing (or newly allocated) Float must be set to
63// a new value using the Float.Set method; shallow copies
64// of Floats are not supported and may lead to errors.
65type Float struct {
66	prec uint32
67	mode RoundingMode
68	acc  Accuracy
69	form form
70	neg  bool
71	mant nat
72	exp  int32
73}
74
75// An ErrNaN panic is raised by a Float operation that would lead to
76// a NaN under IEEE-754 rules. An ErrNaN implements the error interface.
77type ErrNaN struct {
78	msg string
79}
80
81func (err ErrNaN) Error() string {
82	return err.msg
83}
84
85// NewFloat allocates and returns a new Float set to x,
86// with precision 53 and rounding mode ToNearestEven.
87// NewFloat panics with ErrNaN if x is a NaN.
88func NewFloat(x float64) *Float {
89	if math.IsNaN(x) {
90		panic(ErrNaN{"NewFloat(NaN)"})
91	}
92	return new(Float).SetFloat64(x)
93}
94
95// Exponent and precision limits.
96const (
97	MaxExp  = math.MaxInt32  // largest supported exponent
98	MinExp  = math.MinInt32  // smallest supported exponent
99	MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited
100)
101
102// Internal representation: The mantissa bits x.mant of a nonzero finite
103// Float x are stored in a nat slice long enough to hold up to x.prec bits;
104// the slice may (but doesn't have to) be shorter if the mantissa contains
105// trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e.,
106// the msb is shifted all the way "to the left"). Thus, if the mantissa has
107// trailing 0 bits or x.prec is not a multiple of the Word size _W,
108// x.mant[0] has trailing zero bits. The msb of the mantissa corresponds
109// to the value 0.5; the exponent x.exp shifts the binary point as needed.
110//
111// A zero or non-finite Float x ignores x.mant and x.exp.
112//
113// x                 form      neg      mant         exp
114// ----------------------------------------------------------
115// ±0                zero      sign     -            -
116// 0 < |x| < +Inf    finite    sign     mantissa     exponent
117// ±Inf              inf       sign     -            -
118
119// A form value describes the internal representation.
120type form byte
121
122// The form value order is relevant - do not change!
123const (
124	zero form = iota
125	finite
126	inf
127)
128
129// RoundingMode determines how a Float value is rounded to the
130// desired precision. Rounding may change the Float value; the
131// rounding error is described by the Float's Accuracy.
132type RoundingMode byte
133
134// These constants define supported rounding modes.
135const (
136	ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven
137	ToNearestAway                     // == IEEE 754-2008 roundTiesToAway
138	ToZero                            // == IEEE 754-2008 roundTowardZero
139	AwayFromZero                      // no IEEE 754-2008 equivalent
140	ToNegativeInf                     // == IEEE 754-2008 roundTowardNegative
141	ToPositiveInf                     // == IEEE 754-2008 roundTowardPositive
142)
143
144//go:generate stringer -type=RoundingMode
145
146// Accuracy describes the rounding error produced by the most recent
147// operation that generated a Float value, relative to the exact value.
148type Accuracy int8
149
150// Constants describing the Accuracy of a Float.
151const (
152	Below Accuracy = -1
153	Exact Accuracy = 0
154	Above Accuracy = +1
155)
156
157//go:generate stringer -type=Accuracy
158
159// SetPrec sets z's precision to prec and returns the (possibly) rounded
160// value of z. Rounding occurs according to z's rounding mode if the mantissa
161// cannot be represented in prec bits without loss of precision.
162// SetPrec(0) maps all finite values to ±0; infinite values remain unchanged.
163// If prec > MaxPrec, it is set to MaxPrec.
164func (z *Float) SetPrec(prec uint) *Float {
165	z.acc = Exact // optimistically assume no rounding is needed
166
167	// special case
168	if prec == 0 {
169		z.prec = 0
170		if z.form == finite {
171			// truncate z to 0
172			z.acc = makeAcc(z.neg)
173			z.form = zero
174		}
175		return z
176	}
177
178	// general case
179	if prec > MaxPrec {
180		prec = MaxPrec
181	}
182	old := z.prec
183	z.prec = uint32(prec)
184	if z.prec < old {
185		z.round(0)
186	}
187	return z
188}
189
190func makeAcc(above bool) Accuracy {
191	if above {
192		return Above
193	}
194	return Below
195}
196
197// SetMode sets z's rounding mode to mode and returns an exact z.
198// z remains unchanged otherwise.
199// z.SetMode(z.Mode()) is a cheap way to set z's accuracy to Exact.
200func (z *Float) SetMode(mode RoundingMode) *Float {
201	z.mode = mode
202	z.acc = Exact
203	return z
204}
205
206// Prec returns the mantissa precision of x in bits.
207// The result may be 0 for |x| == 0 and |x| == Inf.
208func (x *Float) Prec() uint {
209	return uint(x.prec)
210}
211
212// MinPrec returns the minimum precision required to represent x exactly
213// (i.e., the smallest prec before x.SetPrec(prec) would start rounding x).
214// The result is 0 for |x| == 0 and |x| == Inf.
215func (x *Float) MinPrec() uint {
216	if x.form != finite {
217		return 0
218	}
219	return uint(len(x.mant))*_W - x.mant.trailingZeroBits()
220}
221
222// Mode returns the rounding mode of x.
223func (x *Float) Mode() RoundingMode {
224	return x.mode
225}
226
227// Acc returns the accuracy of x produced by the most recent
228// operation, unless explicitly documented otherwise by that
229// operation.
230func (x *Float) Acc() Accuracy {
231	return x.acc
232}
233
234// Sign returns:
235//
236//	-1 if x <   0
237//	 0 if x is ±0
238//	+1 if x >   0
239//
240func (x *Float) Sign() int {
241	if debugFloat {
242		x.validate()
243	}
244	if x.form == zero {
245		return 0
246	}
247	if x.neg {
248		return -1
249	}
250	return 1
251}
252
253// MantExp breaks x into its mantissa and exponent components
254// and returns the exponent. If a non-nil mant argument is
255// provided its value is set to the mantissa of x, with the
256// same precision and rounding mode as x. The components
257// satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0.
258// Calling MantExp with a nil argument is an efficient way to
259// get the exponent of the receiver.
260//
261// Special cases are:
262//
263//	(  ±0).MantExp(mant) = 0, with mant set to   ±0
264//	(±Inf).MantExp(mant) = 0, with mant set to ±Inf
265//
266// x and mant may be the same in which case x is set to its
267// mantissa value.
268func (x *Float) MantExp(mant *Float) (exp int) {
269	if debugFloat {
270		x.validate()
271	}
272	if x.form == finite {
273		exp = int(x.exp)
274	}
275	if mant != nil {
276		mant.Copy(x)
277		if mant.form == finite {
278			mant.exp = 0
279		}
280	}
281	return
282}
283
284func (z *Float) setExpAndRound(exp int64, sbit uint) {
285	if exp < MinExp {
286		// underflow
287		z.acc = makeAcc(z.neg)
288		z.form = zero
289		return
290	}
291
292	if exp > MaxExp {
293		// overflow
294		z.acc = makeAcc(!z.neg)
295		z.form = inf
296		return
297	}
298
299	z.form = finite
300	z.exp = int32(exp)
301	z.round(sbit)
302}
303
304// SetMantExp sets z to mant × 2**exp and returns z.
305// The result z has the same precision and rounding mode
306// as mant. SetMantExp is an inverse of MantExp but does
307// not require 0.5 <= |mant| < 1.0. Specifically, for a
308// given x of type *Float, SetMantExp relates to MantExp
309// as follows:
310//
311//	mant := new(Float)
312//	new(Float).SetMantExp(mant, x.MantExp(mant)).Cmp(x) == 0
313//
314// Special cases are:
315//
316//	z.SetMantExp(  ±0, exp) =   ±0
317//	z.SetMantExp(±Inf, exp) = ±Inf
318//
319// z and mant may be the same in which case z's exponent
320// is set to exp.
321func (z *Float) SetMantExp(mant *Float, exp int) *Float {
322	if debugFloat {
323		z.validate()
324		mant.validate()
325	}
326	z.Copy(mant)
327
328	if z.form == finite {
329		// 0 < |mant| < +Inf
330		z.setExpAndRound(int64(z.exp)+int64(exp), 0)
331	}
332	return z
333}
334
335// Signbit reports whether x is negative or negative zero.
336func (x *Float) Signbit() bool {
337	return x.neg
338}
339
340// IsInf reports whether x is +Inf or -Inf.
341func (x *Float) IsInf() bool {
342	return x.form == inf
343}
344
345// IsInt reports whether x is an integer.
346// ±Inf values are not integers.
347func (x *Float) IsInt() bool {
348	if debugFloat {
349		x.validate()
350	}
351	// special cases
352	if x.form != finite {
353		return x.form == zero
354	}
355	// x.form == finite
356	if x.exp <= 0 {
357		return false
358	}
359	// x.exp > 0
360	return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa
361}
362
363// debugging support
364func (x *Float) validate() {
365	if !debugFloat {
366		// avoid performance bugs
367		panic("validate called but debugFloat is not set")
368	}
369	if x.form != finite {
370		return
371	}
372	m := len(x.mant)
373	if m == 0 {
374		panic("nonzero finite number with empty mantissa")
375	}
376	const msb = 1 << (_W - 1)
377	if x.mant[m-1]&msb == 0 {
378		panic(fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Text('p', 0)))
379	}
380	if x.prec == 0 {
381		panic("zero precision finite number")
382	}
383}
384
385// round rounds z according to z.mode to z.prec bits and sets z.acc accordingly.
386// sbit must be 0 or 1 and summarizes any "sticky bit" information one might
387// have before calling round. z's mantissa must be normalized (with the msb set)
388// or empty.
389//
390// CAUTION: The rounding modes ToNegativeInf, ToPositiveInf are affected by the
391// sign of z. For correct rounding, the sign of z must be set correctly before
392// calling round.
393func (z *Float) round(sbit uint) {
394	if debugFloat {
395		z.validate()
396	}
397
398	z.acc = Exact
399	if z.form != finite {
400		// ±0 or ±Inf => nothing left to do
401		return
402	}
403	// z.form == finite && len(z.mant) > 0
404	// m > 0 implies z.prec > 0 (checked by validate)
405
406	m := uint32(len(z.mant)) // present mantissa length in words
407	bits := m * _W           // present mantissa bits; bits > 0
408	if bits <= z.prec {
409		// mantissa fits => nothing to do
410		return
411	}
412	// bits > z.prec
413
414	// Rounding is based on two bits: the rounding bit (rbit) and the
415	// sticky bit (sbit). The rbit is the bit immediately before the
416	// z.prec leading mantissa bits (the "0.5"). The sbit is set if any
417	// of the bits before the rbit are set (the "0.25", "0.125", etc.):
418	//
419	//   rbit  sbit  => "fractional part"
420	//
421	//   0     0        == 0
422	//   0     1        >  0  , < 0.5
423	//   1     0        == 0.5
424	//   1     1        >  0.5, < 1.0
425
426	// bits > z.prec: mantissa too large => round
427	r := uint(bits - z.prec - 1) // rounding bit position; r >= 0
428	rbit := z.mant.bit(r) & 1    // rounding bit; be safe and ensure it's a single bit
429	// The sticky bit is only needed for rounding ToNearestEven
430	// or when the rounding bit is zero. Avoid computation otherwise.
431	if sbit == 0 && (rbit == 0 || z.mode == ToNearestEven) {
432		sbit = z.mant.sticky(r)
433	}
434	sbit &= 1 // be safe and ensure it's a single bit
435
436	// cut off extra words
437	n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision
438	if m > n {
439		copy(z.mant, z.mant[m-n:]) // move n last words to front
440		z.mant = z.mant[:n]
441	}
442
443	// determine number of trailing zero bits (ntz) and compute lsb mask of mantissa's least-significant word
444	ntz := n*_W - z.prec // 0 <= ntz < _W
445	lsb := Word(1) << ntz
446
447	// round if result is inexact
448	if rbit|sbit != 0 {
449		// Make rounding decision: The result mantissa is truncated ("rounded down")
450		// by default. Decide if we need to increment, or "round up", the (unsigned)
451		// mantissa.
452		inc := false
453		switch z.mode {
454		case ToNegativeInf:
455			inc = z.neg
456		case ToZero:
457			// nothing to do
458		case ToNearestEven:
459			inc = rbit != 0 && (sbit != 0 || z.mant[0]&lsb != 0)
460		case ToNearestAway:
461			inc = rbit != 0
462		case AwayFromZero:
463			inc = true
464		case ToPositiveInf:
465			inc = !z.neg
466		default:
467			panic("unreachable")
468		}
469
470		// A positive result (!z.neg) is Above the exact result if we increment,
471		// and it's Below if we truncate (Exact results require no rounding).
472		// For a negative result (z.neg) it is exactly the opposite.
473		z.acc = makeAcc(inc != z.neg)
474
475		if inc {
476			// add 1 to mantissa
477			if addVW(z.mant, z.mant, lsb) != 0 {
478				// mantissa overflow => adjust exponent
479				if z.exp >= MaxExp {
480					// exponent overflow
481					z.form = inf
482					return
483				}
484				z.exp++
485				// adjust mantissa: divide by 2 to compensate for exponent adjustment
486				shrVU(z.mant, z.mant, 1)
487				// set msb == carry == 1 from the mantissa overflow above
488				const msb = 1 << (_W - 1)
489				z.mant[n-1] |= msb
490			}
491		}
492	}
493
494	// zero out trailing bits in least-significant word
495	z.mant[0] &^= lsb - 1
496
497	if debugFloat {
498		z.validate()
499	}
500}
501
502func (z *Float) setBits64(neg bool, x uint64) *Float {
503	if z.prec == 0 {
504		z.prec = 64
505	}
506	z.acc = Exact
507	z.neg = neg
508	if x == 0 {
509		z.form = zero
510		return z
511	}
512	// x != 0
513	z.form = finite
514	s := bits.LeadingZeros64(x)
515	z.mant = z.mant.setUint64(x << uint(s))
516	z.exp = int32(64 - s) // always fits
517	if z.prec < 64 {
518		z.round(0)
519	}
520	return z
521}
522
523// SetUint64 sets z to the (possibly rounded) value of x and returns z.
524// If z's precision is 0, it is changed to 64 (and rounding will have
525// no effect).
526func (z *Float) SetUint64(x uint64) *Float {
527	return z.setBits64(false, x)
528}
529
530// SetInt64 sets z to the (possibly rounded) value of x and returns z.
531// If z's precision is 0, it is changed to 64 (and rounding will have
532// no effect).
533func (z *Float) SetInt64(x int64) *Float {
534	u := x
535	if u < 0 {
536		u = -u
537	}
538	// We cannot simply call z.SetUint64(uint64(u)) and change
539	// the sign afterwards because the sign affects rounding.
540	return z.setBits64(x < 0, uint64(u))
541}
542
543// SetFloat64 sets z to the (possibly rounded) value of x and returns z.
544// If z's precision is 0, it is changed to 53 (and rounding will have
545// no effect). SetFloat64 panics with ErrNaN if x is a NaN.
546func (z *Float) SetFloat64(x float64) *Float {
547	if z.prec == 0 {
548		z.prec = 53
549	}
550	if math.IsNaN(x) {
551		panic(ErrNaN{"Float.SetFloat64(NaN)"})
552	}
553	z.acc = Exact
554	z.neg = math.Signbit(x) // handle -0, -Inf correctly
555	if x == 0 {
556		z.form = zero
557		return z
558	}
559	if math.IsInf(x, 0) {
560		z.form = inf
561		return z
562	}
563	// normalized x != 0
564	z.form = finite
565	fmant, exp := math.Frexp(x) // get normalized mantissa
566	z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11)
567	z.exp = int32(exp) // always fits
568	if z.prec < 53 {
569		z.round(0)
570	}
571	return z
572}
573
574// fnorm normalizes mantissa m by shifting it to the left
575// such that the msb of the most-significant word (msw) is 1.
576// It returns the shift amount. It assumes that len(m) != 0.
577func fnorm(m nat) int64 {
578	if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) {
579		panic("msw of mantissa is 0")
580	}
581	s := nlz(m[len(m)-1])
582	if s > 0 {
583		c := shlVU(m, m, s)
584		if debugFloat && c != 0 {
585			panic("nlz or shlVU incorrect")
586		}
587	}
588	return int64(s)
589}
590
591// SetInt sets z to the (possibly rounded) value of x and returns z.
592// If z's precision is 0, it is changed to the larger of x.BitLen()
593// or 64 (and rounding will have no effect).
594func (z *Float) SetInt(x *Int) *Float {
595	// TODO(gri) can be more efficient if z.prec > 0
596	// but small compared to the size of x, or if there
597	// are many trailing 0's.
598	bits := uint32(x.BitLen())
599	if z.prec == 0 {
600		z.prec = umax32(bits, 64)
601	}
602	z.acc = Exact
603	z.neg = x.neg
604	if len(x.abs) == 0 {
605		z.form = zero
606		return z
607	}
608	// x != 0
609	z.mant = z.mant.set(x.abs)
610	fnorm(z.mant)
611	z.setExpAndRound(int64(bits), 0)
612	return z
613}
614
615// SetRat sets z to the (possibly rounded) value of x and returns z.
616// If z's precision is 0, it is changed to the largest of a.BitLen(),
617// b.BitLen(), or 64; with x = a/b.
618func (z *Float) SetRat(x *Rat) *Float {
619	if x.IsInt() {
620		return z.SetInt(x.Num())
621	}
622	var a, b Float
623	a.SetInt(x.Num())
624	b.SetInt(x.Denom())
625	if z.prec == 0 {
626		z.prec = umax32(a.prec, b.prec)
627	}
628	return z.Quo(&a, &b)
629}
630
631// SetInf sets z to the infinite Float -Inf if signbit is
632// set, or +Inf if signbit is not set, and returns z. The
633// precision of z is unchanged and the result is always
634// Exact.
635func (z *Float) SetInf(signbit bool) *Float {
636	z.acc = Exact
637	z.form = inf
638	z.neg = signbit
639	return z
640}
641
642// Set sets z to the (possibly rounded) value of x and returns z.
643// If z's precision is 0, it is changed to the precision of x
644// before setting z (and rounding will have no effect).
645// Rounding is performed according to z's precision and rounding
646// mode; and z's accuracy reports the result error relative to the
647// exact (not rounded) result.
648func (z *Float) Set(x *Float) *Float {
649	if debugFloat {
650		x.validate()
651	}
652	z.acc = Exact
653	if z != x {
654		z.form = x.form
655		z.neg = x.neg
656		if x.form == finite {
657			z.exp = x.exp
658			z.mant = z.mant.set(x.mant)
659		}
660		if z.prec == 0 {
661			z.prec = x.prec
662		} else if z.prec < x.prec {
663			z.round(0)
664		}
665	}
666	return z
667}
668
669// Copy sets z to x, with the same precision, rounding mode, and
670// accuracy as x, and returns z. x is not changed even if z and
671// x are the same.
672func (z *Float) Copy(x *Float) *Float {
673	if debugFloat {
674		x.validate()
675	}
676	if z != x {
677		z.prec = x.prec
678		z.mode = x.mode
679		z.acc = x.acc
680		z.form = x.form
681		z.neg = x.neg
682		if z.form == finite {
683			z.mant = z.mant.set(x.mant)
684			z.exp = x.exp
685		}
686	}
687	return z
688}
689
690// msb32 returns the 32 most significant bits of x.
691func msb32(x nat) uint32 {
692	i := len(x) - 1
693	if i < 0 {
694		return 0
695	}
696	if debugFloat && x[i]&(1<<(_W-1)) == 0 {
697		panic("x not normalized")
698	}
699	switch _W {
700	case 32:
701		return uint32(x[i])
702	case 64:
703		return uint32(x[i] >> 32)
704	}
705	panic("unreachable")
706}
707
708// msb64 returns the 64 most significant bits of x.
709func msb64(x nat) uint64 {
710	i := len(x) - 1
711	if i < 0 {
712		return 0
713	}
714	if debugFloat && x[i]&(1<<(_W-1)) == 0 {
715		panic("x not normalized")
716	}
717	switch _W {
718	case 32:
719		v := uint64(x[i]) << 32
720		if i > 0 {
721			v |= uint64(x[i-1])
722		}
723		return v
724	case 64:
725		return uint64(x[i])
726	}
727	panic("unreachable")
728}
729
730// Uint64 returns the unsigned integer resulting from truncating x
731// towards zero. If 0 <= x <= math.MaxUint64, the result is Exact
732// if x is an integer and Below otherwise.
733// The result is (0, Above) for x < 0, and (math.MaxUint64, Below)
734// for x > math.MaxUint64.
735func (x *Float) Uint64() (uint64, Accuracy) {
736	if debugFloat {
737		x.validate()
738	}
739
740	switch x.form {
741	case finite:
742		if x.neg {
743			return 0, Above
744		}
745		// 0 < x < +Inf
746		if x.exp <= 0 {
747			// 0 < x < 1
748			return 0, Below
749		}
750		// 1 <= x < Inf
751		if x.exp <= 64 {
752			// u = trunc(x) fits into a uint64
753			u := msb64(x.mant) >> (64 - uint32(x.exp))
754			if x.MinPrec() <= 64 {
755				return u, Exact
756			}
757			return u, Below // x truncated
758		}
759		// x too large
760		return math.MaxUint64, Below
761
762	case zero:
763		return 0, Exact
764
765	case inf:
766		if x.neg {
767			return 0, Above
768		}
769		return math.MaxUint64, Below
770	}
771
772	panic("unreachable")
773}
774
775// Int64 returns the integer resulting from truncating x towards zero.
776// If math.MinInt64 <= x <= math.MaxInt64, the result is Exact if x is
777// an integer, and Above (x < 0) or Below (x > 0) otherwise.
778// The result is (math.MinInt64, Above) for x < math.MinInt64,
779// and (math.MaxInt64, Below) for x > math.MaxInt64.
780func (x *Float) Int64() (int64, Accuracy) {
781	if debugFloat {
782		x.validate()
783	}
784
785	switch x.form {
786	case finite:
787		// 0 < |x| < +Inf
788		acc := makeAcc(x.neg)
789		if x.exp <= 0 {
790			// 0 < |x| < 1
791			return 0, acc
792		}
793		// x.exp > 0
794
795		// 1 <= |x| < +Inf
796		if x.exp <= 63 {
797			// i = trunc(x) fits into an int64 (excluding math.MinInt64)
798			i := int64(msb64(x.mant) >> (64 - uint32(x.exp)))
799			if x.neg {
800				i = -i
801			}
802			if x.MinPrec() <= uint(x.exp) {
803				return i, Exact
804			}
805			return i, acc // x truncated
806		}
807		if x.neg {
808			// check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64))
809			if x.exp == 64 && x.MinPrec() == 1 {
810				acc = Exact
811			}
812			return math.MinInt64, acc
813		}
814		// x too large
815		return math.MaxInt64, Below
816
817	case zero:
818		return 0, Exact
819
820	case inf:
821		if x.neg {
822			return math.MinInt64, Above
823		}
824		return math.MaxInt64, Below
825	}
826
827	panic("unreachable")
828}
829
830// Float32 returns the float32 value nearest to x. If x is too small to be
831// represented by a float32 (|x| < math.SmallestNonzeroFloat32), the result
832// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
833// If x is too large to be represented by a float32 (|x| > math.MaxFloat32),
834// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
835func (x *Float) Float32() (float32, Accuracy) {
836	if debugFloat {
837		x.validate()
838	}
839
840	switch x.form {
841	case finite:
842		// 0 < |x| < +Inf
843
844		const (
845			fbits = 32                //        float size
846			mbits = 23                //        mantissa size (excluding implicit msb)
847			ebits = fbits - mbits - 1 //     8  exponent size
848			bias  = 1<<(ebits-1) - 1  //   127  exponent bias
849			dmin  = 1 - bias - mbits  //  -149  smallest unbiased exponent (denormal)
850			emin  = 1 - bias          //  -126  smallest unbiased exponent (normal)
851			emax  = bias              //   127  largest unbiased exponent (normal)
852		)
853
854		// Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa.
855		e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
856
857		// Compute precision p for float32 mantissa.
858		// If the exponent is too small, we have a denormal number before
859		// rounding and fewer than p mantissa bits of precision available
860		// (the exponent remains fixed but the mantissa gets shifted right).
861		p := mbits + 1 // precision of normal float
862		if e < emin {
863			// recompute precision
864			p = mbits + 1 - emin + int(e)
865			// If p == 0, the mantissa of x is shifted so much to the right
866			// that its msb falls immediately to the right of the float32
867			// mantissa space. In other words, if the smallest denormal is
868			// considered "1.0", for p == 0, the mantissa value m is >= 0.5.
869			// If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
870			// If m == 0.5, it is rounded down to even, i.e., 0.0.
871			// If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
872			if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
873				// underflow to ±0
874				if x.neg {
875					var z float32
876					return -z, Above
877				}
878				return 0.0, Below
879			}
880			// otherwise, round up
881			// We handle p == 0 explicitly because it's easy and because
882			// Float.round doesn't support rounding to 0 bits of precision.
883			if p == 0 {
884				if x.neg {
885					return -math.SmallestNonzeroFloat32, Below
886				}
887				return math.SmallestNonzeroFloat32, Above
888			}
889		}
890		// p > 0
891
892		// round
893		var r Float
894		r.prec = uint32(p)
895		r.Set(x)
896		e = r.exp - 1
897
898		// Rounding may have caused r to overflow to ±Inf
899		// (rounding never causes underflows to 0).
900		// If the exponent is too large, also overflow to ±Inf.
901		if r.form == inf || e > emax {
902			// overflow
903			if x.neg {
904				return float32(math.Inf(-1)), Below
905			}
906			return float32(math.Inf(+1)), Above
907		}
908		// e <= emax
909
910		// Determine sign, biased exponent, and mantissa.
911		var sign, bexp, mant uint32
912		if x.neg {
913			sign = 1 << (fbits - 1)
914		}
915
916		// Rounding may have caused a denormal number to
917		// become normal. Check again.
918		if e < emin {
919			// denormal number: recompute precision
920			// Since rounding may have at best increased precision
921			// and we have eliminated p <= 0 early, we know p > 0.
922			// bexp == 0 for denormals
923			p = mbits + 1 - emin + int(e)
924			mant = msb32(r.mant) >> uint(fbits-p)
925		} else {
926			// normal number: emin <= e <= emax
927			bexp = uint32(e+bias) << mbits
928			mant = msb32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
929		}
930
931		return math.Float32frombits(sign | bexp | mant), r.acc
932
933	case zero:
934		if x.neg {
935			var z float32
936			return -z, Exact
937		}
938		return 0.0, Exact
939
940	case inf:
941		if x.neg {
942			return float32(math.Inf(-1)), Exact
943		}
944		return float32(math.Inf(+1)), Exact
945	}
946
947	panic("unreachable")
948}
949
950// Float64 returns the float64 value nearest to x. If x is too small to be
951// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
952// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
953// If x is too large to be represented by a float64 (|x| > math.MaxFloat64),
954// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
955func (x *Float) Float64() (float64, Accuracy) {
956	if debugFloat {
957		x.validate()
958	}
959
960	switch x.form {
961	case finite:
962		// 0 < |x| < +Inf
963
964		const (
965			fbits = 64                //        float size
966			mbits = 52                //        mantissa size (excluding implicit msb)
967			ebits = fbits - mbits - 1 //    11  exponent size
968			bias  = 1<<(ebits-1) - 1  //  1023  exponent bias
969			dmin  = 1 - bias - mbits  // -1074  smallest unbiased exponent (denormal)
970			emin  = 1 - bias          // -1022  smallest unbiased exponent (normal)
971			emax  = bias              //  1023  largest unbiased exponent (normal)
972		)
973
974		// Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa.
975		e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
976
977		// Compute precision p for float64 mantissa.
978		// If the exponent is too small, we have a denormal number before
979		// rounding and fewer than p mantissa bits of precision available
980		// (the exponent remains fixed but the mantissa gets shifted right).
981		p := mbits + 1 // precision of normal float
982		if e < emin {
983			// recompute precision
984			p = mbits + 1 - emin + int(e)
985			// If p == 0, the mantissa of x is shifted so much to the right
986			// that its msb falls immediately to the right of the float64
987			// mantissa space. In other words, if the smallest denormal is
988			// considered "1.0", for p == 0, the mantissa value m is >= 0.5.
989			// If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
990			// If m == 0.5, it is rounded down to even, i.e., 0.0.
991			// If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
992			if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
993				// underflow to ±0
994				if x.neg {
995					var z float64
996					return -z, Above
997				}
998				return 0.0, Below
999			}
1000			// otherwise, round up
1001			// We handle p == 0 explicitly because it's easy and because
1002			// Float.round doesn't support rounding to 0 bits of precision.
1003			if p == 0 {
1004				if x.neg {
1005					return -math.SmallestNonzeroFloat64, Below
1006				}
1007				return math.SmallestNonzeroFloat64, Above
1008			}
1009		}
1010		// p > 0
1011
1012		// round
1013		var r Float
1014		r.prec = uint32(p)
1015		r.Set(x)
1016		e = r.exp - 1
1017
1018		// Rounding may have caused r to overflow to ±Inf
1019		// (rounding never causes underflows to 0).
1020		// If the exponent is too large, also overflow to ±Inf.
1021		if r.form == inf || e > emax {
1022			// overflow
1023			if x.neg {
1024				return math.Inf(-1), Below
1025			}
1026			return math.Inf(+1), Above
1027		}
1028		// e <= emax
1029
1030		// Determine sign, biased exponent, and mantissa.
1031		var sign, bexp, mant uint64
1032		if x.neg {
1033			sign = 1 << (fbits - 1)
1034		}
1035
1036		// Rounding may have caused a denormal number to
1037		// become normal. Check again.
1038		if e < emin {
1039			// denormal number: recompute precision
1040			// Since rounding may have at best increased precision
1041			// and we have eliminated p <= 0 early, we know p > 0.
1042			// bexp == 0 for denormals
1043			p = mbits + 1 - emin + int(e)
1044			mant = msb64(r.mant) >> uint(fbits-p)
1045		} else {
1046			// normal number: emin <= e <= emax
1047			bexp = uint64(e+bias) << mbits
1048			mant = msb64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
1049		}
1050
1051		return math.Float64frombits(sign | bexp | mant), r.acc
1052
1053	case zero:
1054		if x.neg {
1055			var z float64
1056			return -z, Exact
1057		}
1058		return 0.0, Exact
1059
1060	case inf:
1061		if x.neg {
1062			return math.Inf(-1), Exact
1063		}
1064		return math.Inf(+1), Exact
1065	}
1066
1067	panic("unreachable")
1068}
1069
1070// Int returns the result of truncating x towards zero;
1071// or nil if x is an infinity.
1072// The result is Exact if x.IsInt(); otherwise it is Below
1073// for x > 0, and Above for x < 0.
1074// If a non-nil *Int argument z is provided, Int stores
1075// the result in z instead of allocating a new Int.
1076func (x *Float) Int(z *Int) (*Int, Accuracy) {
1077	if debugFloat {
1078		x.validate()
1079	}
1080
1081	if z == nil && x.form <= finite {
1082		z = new(Int)
1083	}
1084
1085	switch x.form {
1086	case finite:
1087		// 0 < |x| < +Inf
1088		acc := makeAcc(x.neg)
1089		if x.exp <= 0 {
1090			// 0 < |x| < 1
1091			return z.SetInt64(0), acc
1092		}
1093		// x.exp > 0
1094
1095		// 1 <= |x| < +Inf
1096		// determine minimum required precision for x
1097		allBits := uint(len(x.mant)) * _W
1098		exp := uint(x.exp)
1099		if x.MinPrec() <= exp {
1100			acc = Exact
1101		}
1102		// shift mantissa as needed
1103		if z == nil {
1104			z = new(Int)
1105		}
1106		z.neg = x.neg
1107		switch {
1108		case exp > allBits:
1109			z.abs = z.abs.shl(x.mant, exp-allBits)
1110		default:
1111			z.abs = z.abs.set(x.mant)
1112		case exp < allBits:
1113			z.abs = z.abs.shr(x.mant, allBits-exp)
1114		}
1115		return z, acc
1116
1117	case zero:
1118		return z.SetInt64(0), Exact
1119
1120	case inf:
1121		return nil, makeAcc(x.neg)
1122	}
1123
1124	panic("unreachable")
1125}
1126
1127// Rat returns the rational number corresponding to x;
1128// or nil if x is an infinity.
1129// The result is Exact if x is not an Inf.
1130// If a non-nil *Rat argument z is provided, Rat stores
1131// the result in z instead of allocating a new Rat.
1132func (x *Float) Rat(z *Rat) (*Rat, Accuracy) {
1133	if debugFloat {
1134		x.validate()
1135	}
1136
1137	if z == nil && x.form <= finite {
1138		z = new(Rat)
1139	}
1140
1141	switch x.form {
1142	case finite:
1143		// 0 < |x| < +Inf
1144		allBits := int32(len(x.mant)) * _W
1145		// build up numerator and denominator
1146		z.a.neg = x.neg
1147		switch {
1148		case x.exp > allBits:
1149			z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits))
1150			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
1151			// z already in normal form
1152		default:
1153			z.a.abs = z.a.abs.set(x.mant)
1154			z.b.abs = z.b.abs[:0] // == 1 (see Rat)
1155			// z already in normal form
1156		case x.exp < allBits:
1157			z.a.abs = z.a.abs.set(x.mant)
1158			t := z.b.abs.setUint64(1)
1159			z.b.abs = t.shl(t, uint(allBits-x.exp))
1160			z.norm()
1161		}
1162		return z, Exact
1163
1164	case zero:
1165		return z.SetInt64(0), Exact
1166
1167	case inf:
1168		return nil, makeAcc(x.neg)
1169	}
1170
1171	panic("unreachable")
1172}
1173
1174// Abs sets z to the (possibly rounded) value |x| (the absolute value of x)
1175// and returns z.
1176func (z *Float) Abs(x *Float) *Float {
1177	z.Set(x)
1178	z.neg = false
1179	return z
1180}
1181
1182// Neg sets z to the (possibly rounded) value of x with its sign negated,
1183// and returns z.
1184func (z *Float) Neg(x *Float) *Float {
1185	z.Set(x)
1186	z.neg = !z.neg
1187	return z
1188}
1189
1190func validateBinaryOperands(x, y *Float) {
1191	if !debugFloat {
1192		// avoid performance bugs
1193		panic("validateBinaryOperands called but debugFloat is not set")
1194	}
1195	if len(x.mant) == 0 {
1196		panic("empty mantissa for x")
1197	}
1198	if len(y.mant) == 0 {
1199		panic("empty mantissa for y")
1200	}
1201}
1202
1203// z = x + y, ignoring signs of x and y for the addition
1204// but using the sign of z for rounding the result.
1205// x and y must have a non-empty mantissa and valid exponent.
1206func (z *Float) uadd(x, y *Float) {
1207	// Note: This implementation requires 2 shifts most of the
1208	// time. It is also inefficient if exponents or precisions
1209	// differ by wide margins. The following article describes
1210	// an efficient (but much more complicated) implementation
1211	// compatible with the internal representation used here:
1212	//
1213	// Vincent Lefèvre: "The Generic Multiple-Precision Floating-
1214	// Point Addition With Exact Rounding (as in the MPFR Library)"
1215	// http://www.vinc17.net/research/papers/rnc6.pdf
1216
1217	if debugFloat {
1218		validateBinaryOperands(x, y)
1219	}
1220
1221	// compute exponents ex, ey for mantissa with "binary point"
1222	// on the right (mantissa.0) - use int64 to avoid overflow
1223	ex := int64(x.exp) - int64(len(x.mant))*_W
1224	ey := int64(y.exp) - int64(len(y.mant))*_W
1225
1226	al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
1227
1228	// TODO(gri) having a combined add-and-shift primitive
1229	//           could make this code significantly faster
1230	switch {
1231	case ex < ey:
1232		if al {
1233			t := nat(nil).shl(y.mant, uint(ey-ex))
1234			z.mant = z.mant.add(x.mant, t)
1235		} else {
1236			z.mant = z.mant.shl(y.mant, uint(ey-ex))
1237			z.mant = z.mant.add(x.mant, z.mant)
1238		}
1239	default:
1240		// ex == ey, no shift needed
1241		z.mant = z.mant.add(x.mant, y.mant)
1242	case ex > ey:
1243		if al {
1244			t := nat(nil).shl(x.mant, uint(ex-ey))
1245			z.mant = z.mant.add(t, y.mant)
1246		} else {
1247			z.mant = z.mant.shl(x.mant, uint(ex-ey))
1248			z.mant = z.mant.add(z.mant, y.mant)
1249		}
1250		ex = ey
1251	}
1252	// len(z.mant) > 0
1253
1254	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
1255}
1256
1257// z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction
1258// but using the sign of z for rounding the result.
1259// x and y must have a non-empty mantissa and valid exponent.
1260func (z *Float) usub(x, y *Float) {
1261	// This code is symmetric to uadd.
1262	// We have not factored the common code out because
1263	// eventually uadd (and usub) should be optimized
1264	// by special-casing, and the code will diverge.
1265
1266	if debugFloat {
1267		validateBinaryOperands(x, y)
1268	}
1269
1270	ex := int64(x.exp) - int64(len(x.mant))*_W
1271	ey := int64(y.exp) - int64(len(y.mant))*_W
1272
1273	al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
1274
1275	switch {
1276	case ex < ey:
1277		if al {
1278			t := nat(nil).shl(y.mant, uint(ey-ex))
1279			z.mant = t.sub(x.mant, t)
1280		} else {
1281			z.mant = z.mant.shl(y.mant, uint(ey-ex))
1282			z.mant = z.mant.sub(x.mant, z.mant)
1283		}
1284	default:
1285		// ex == ey, no shift needed
1286		z.mant = z.mant.sub(x.mant, y.mant)
1287	case ex > ey:
1288		if al {
1289			t := nat(nil).shl(x.mant, uint(ex-ey))
1290			z.mant = t.sub(t, y.mant)
1291		} else {
1292			z.mant = z.mant.shl(x.mant, uint(ex-ey))
1293			z.mant = z.mant.sub(z.mant, y.mant)
1294		}
1295		ex = ey
1296	}
1297
1298	// operands may have canceled each other out
1299	if len(z.mant) == 0 {
1300		z.acc = Exact
1301		z.form = zero
1302		z.neg = false
1303		return
1304	}
1305	// len(z.mant) > 0
1306
1307	z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
1308}
1309
1310// z = x * y, ignoring signs of x and y for the multiplication
1311// but using the sign of z for rounding the result.
1312// x and y must have a non-empty mantissa and valid exponent.
1313func (z *Float) umul(x, y *Float) {
1314	if debugFloat {
1315		validateBinaryOperands(x, y)
1316	}
1317
1318	// Note: This is doing too much work if the precision
1319	// of z is less than the sum of the precisions of x
1320	// and y which is often the case (e.g., if all floats
1321	// have the same precision).
1322	// TODO(gri) Optimize this for the common case.
1323
1324	e := int64(x.exp) + int64(y.exp)
1325	if x == y {
1326		z.mant = z.mant.sqr(x.mant)
1327	} else {
1328		z.mant = z.mant.mul(x.mant, y.mant)
1329	}
1330	z.setExpAndRound(e-fnorm(z.mant), 0)
1331}
1332
1333// z = x / y, ignoring signs of x and y for the division
1334// but using the sign of z for rounding the result.
1335// x and y must have a non-empty mantissa and valid exponent.
1336func (z *Float) uquo(x, y *Float) {
1337	if debugFloat {
1338		validateBinaryOperands(x, y)
1339	}
1340
1341	// mantissa length in words for desired result precision + 1
1342	// (at least one extra bit so we get the rounding bit after
1343	// the division)
1344	n := int(z.prec/_W) + 1
1345
1346	// compute adjusted x.mant such that we get enough result precision
1347	xadj := x.mant
1348	if d := n - len(x.mant) + len(y.mant); d > 0 {
1349		// d extra words needed => add d "0 digits" to x
1350		xadj = make(nat, len(x.mant)+d)
1351		copy(xadj[d:], x.mant)
1352	}
1353	// TODO(gri): If we have too many digits (d < 0), we should be able
1354	// to shorten x for faster division. But we must be extra careful
1355	// with rounding in that case.
1356
1357	// Compute d before division since there may be aliasing of x.mant
1358	// (via xadj) or y.mant with z.mant.
1359	d := len(xadj) - len(y.mant)
1360
1361	// divide
1362	var r nat
1363	z.mant, r = z.mant.div(nil, xadj, y.mant)
1364	e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W
1365
1366	// The result is long enough to include (at least) the rounding bit.
1367	// If there's a non-zero remainder, the corresponding fractional part
1368	// (if it were computed), would have a non-zero sticky bit (if it were
1369	// zero, it couldn't have a non-zero remainder).
1370	var sbit uint
1371	if len(r) > 0 {
1372		sbit = 1
1373	}
1374
1375	z.setExpAndRound(e-fnorm(z.mant), sbit)
1376}
1377
1378// ucmp returns -1, 0, or +1, depending on whether
1379// |x| < |y|, |x| == |y|, or |x| > |y|.
1380// x and y must have a non-empty mantissa and valid exponent.
1381func (x *Float) ucmp(y *Float) int {
1382	if debugFloat {
1383		validateBinaryOperands(x, y)
1384	}
1385
1386	switch {
1387	case x.exp < y.exp:
1388		return -1
1389	case x.exp > y.exp:
1390		return +1
1391	}
1392	// x.exp == y.exp
1393
1394	// compare mantissas
1395	i := len(x.mant)
1396	j := len(y.mant)
1397	for i > 0 || j > 0 {
1398		var xm, ym Word
1399		if i > 0 {
1400			i--
1401			xm = x.mant[i]
1402		}
1403		if j > 0 {
1404			j--
1405			ym = y.mant[j]
1406		}
1407		switch {
1408		case xm < ym:
1409			return -1
1410		case xm > ym:
1411			return +1
1412		}
1413	}
1414
1415	return 0
1416}
1417
1418// Handling of sign bit as defined by IEEE 754-2008, section 6.3:
1419//
1420// When neither the inputs nor result are NaN, the sign of a product or
1421// quotient is the exclusive OR of the operands’ signs; the sign of a sum,
1422// or of a difference x−y regarded as a sum x+(−y), differs from at most
1423// one of the addends’ signs; and the sign of the result of conversions,
1424// the quantize operation, the roundToIntegral operations, and the
1425// roundToIntegralExact (see 5.3.1) is the sign of the first or only operand.
1426// These rules shall apply even when operands or results are zero or infinite.
1427//
1428// When the sum of two operands with opposite signs (or the difference of
1429// two operands with like signs) is exactly zero, the sign of that sum (or
1430// difference) shall be +0 in all rounding-direction attributes except
1431// roundTowardNegative; under that attribute, the sign of an exact zero
1432// sum (or difference) shall be −0. However, x+x = x−(−x) retains the same
1433// sign as x even when x is zero.
1434//
1435// See also: https://play.golang.org/p/RtH3UCt5IH
1436
1437// Add sets z to the rounded sum x+y and returns z. If z's precision is 0,
1438// it is changed to the larger of x's or y's precision before the operation.
1439// Rounding is performed according to z's precision and rounding mode; and
1440// z's accuracy reports the result error relative to the exact (not rounded)
1441// result. Add panics with ErrNaN if x and y are infinities with opposite
1442// signs. The value of z is undefined in that case.
1443func (z *Float) Add(x, y *Float) *Float {
1444	if debugFloat {
1445		x.validate()
1446		y.validate()
1447	}
1448
1449	if z.prec == 0 {
1450		z.prec = umax32(x.prec, y.prec)
1451	}
1452
1453	if x.form == finite && y.form == finite {
1454		// x + y (common case)
1455
1456		// Below we set z.neg = x.neg, and when z aliases y this will
1457		// change the y operand's sign. This is fine, because if an
1458		// operand aliases the receiver it'll be overwritten, but we still
1459		// want the original x.neg and y.neg values when we evaluate
1460		// x.neg != y.neg, so we need to save y.neg before setting z.neg.
1461		yneg := y.neg
1462
1463		z.neg = x.neg
1464		if x.neg == yneg {
1465			// x + y == x + y
1466			// (-x) + (-y) == -(x + y)
1467			z.uadd(x, y)
1468		} else {
1469			// x + (-y) == x - y == -(y - x)
1470			// (-x) + y == y - x == -(x - y)
1471			if x.ucmp(y) > 0 {
1472				z.usub(x, y)
1473			} else {
1474				z.neg = !z.neg
1475				z.usub(y, x)
1476			}
1477		}
1478		if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact {
1479			z.neg = true
1480		}
1481		return z
1482	}
1483
1484	if x.form == inf && y.form == inf && x.neg != y.neg {
1485		// +Inf + -Inf
1486		// -Inf + +Inf
1487		// value of z is undefined but make sure it's valid
1488		z.acc = Exact
1489		z.form = zero
1490		z.neg = false
1491		panic(ErrNaN{"addition of infinities with opposite signs"})
1492	}
1493
1494	if x.form == zero && y.form == zero {
1495		// ±0 + ±0
1496		z.acc = Exact
1497		z.form = zero
1498		z.neg = x.neg && y.neg // -0 + -0 == -0
1499		return z
1500	}
1501
1502	if x.form == inf || y.form == zero {
1503		// ±Inf + y
1504		// x + ±0
1505		return z.Set(x)
1506	}
1507
1508	// ±0 + y
1509	// x + ±Inf
1510	return z.Set(y)
1511}
1512
1513// Sub sets z to the rounded difference x-y and returns z.
1514// Precision, rounding, and accuracy reporting are as for Add.
1515// Sub panics with ErrNaN if x and y are infinities with equal
1516// signs. The value of z is undefined in that case.
1517func (z *Float) Sub(x, y *Float) *Float {
1518	if debugFloat {
1519		x.validate()
1520		y.validate()
1521	}
1522
1523	if z.prec == 0 {
1524		z.prec = umax32(x.prec, y.prec)
1525	}
1526
1527	if x.form == finite && y.form == finite {
1528		// x - y (common case)
1529		yneg := y.neg
1530		z.neg = x.neg
1531		if x.neg != yneg {
1532			// x - (-y) == x + y
1533			// (-x) - y == -(x + y)
1534			z.uadd(x, y)
1535		} else {
1536			// x - y == x - y == -(y - x)
1537			// (-x) - (-y) == y - x == -(x - y)
1538			if x.ucmp(y) > 0 {
1539				z.usub(x, y)
1540			} else {
1541				z.neg = !z.neg
1542				z.usub(y, x)
1543			}
1544		}
1545		if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact {
1546			z.neg = true
1547		}
1548		return z
1549	}
1550
1551	if x.form == inf && y.form == inf && x.neg == y.neg {
1552		// +Inf - +Inf
1553		// -Inf - -Inf
1554		// value of z is undefined but make sure it's valid
1555		z.acc = Exact
1556		z.form = zero
1557		z.neg = false
1558		panic(ErrNaN{"subtraction of infinities with equal signs"})
1559	}
1560
1561	if x.form == zero && y.form == zero {
1562		// ±0 - ±0
1563		z.acc = Exact
1564		z.form = zero
1565		z.neg = x.neg && !y.neg // -0 - +0 == -0
1566		return z
1567	}
1568
1569	if x.form == inf || y.form == zero {
1570		// ±Inf - y
1571		// x - ±0
1572		return z.Set(x)
1573	}
1574
1575	// ±0 - y
1576	// x - ±Inf
1577	return z.Neg(y)
1578}
1579
1580// Mul sets z to the rounded product x*y and returns z.
1581// Precision, rounding, and accuracy reporting are as for Add.
1582// Mul panics with ErrNaN if one operand is zero and the other
1583// operand an infinity. The value of z is undefined in that case.
1584func (z *Float) Mul(x, y *Float) *Float {
1585	if debugFloat {
1586		x.validate()
1587		y.validate()
1588	}
1589
1590	if z.prec == 0 {
1591		z.prec = umax32(x.prec, y.prec)
1592	}
1593
1594	z.neg = x.neg != y.neg
1595
1596	if x.form == finite && y.form == finite {
1597		// x * y (common case)
1598		z.umul(x, y)
1599		return z
1600	}
1601
1602	z.acc = Exact
1603	if x.form == zero && y.form == inf || x.form == inf && y.form == zero {
1604		// ±0 * ±Inf
1605		// ±Inf * ±0
1606		// value of z is undefined but make sure it's valid
1607		z.form = zero
1608		z.neg = false
1609		panic(ErrNaN{"multiplication of zero with infinity"})
1610	}
1611
1612	if x.form == inf || y.form == inf {
1613		// ±Inf * y
1614		// x * ±Inf
1615		z.form = inf
1616		return z
1617	}
1618
1619	// ±0 * y
1620	// x * ±0
1621	z.form = zero
1622	return z
1623}
1624
1625// Quo sets z to the rounded quotient x/y and returns z.
1626// Precision, rounding, and accuracy reporting are as for Add.
1627// Quo panics with ErrNaN if both operands are zero or infinities.
1628// The value of z is undefined in that case.
1629func (z *Float) Quo(x, y *Float) *Float {
1630	if debugFloat {
1631		x.validate()
1632		y.validate()
1633	}
1634
1635	if z.prec == 0 {
1636		z.prec = umax32(x.prec, y.prec)
1637	}
1638
1639	z.neg = x.neg != y.neg
1640
1641	if x.form == finite && y.form == finite {
1642		// x / y (common case)
1643		z.uquo(x, y)
1644		return z
1645	}
1646
1647	z.acc = Exact
1648	if x.form == zero && y.form == zero || x.form == inf && y.form == inf {
1649		// ±0 / ±0
1650		// ±Inf / ±Inf
1651		// value of z is undefined but make sure it's valid
1652		z.form = zero
1653		z.neg = false
1654		panic(ErrNaN{"division of zero by zero or infinity by infinity"})
1655	}
1656
1657	if x.form == zero || y.form == inf {
1658		// ±0 / y
1659		// x / ±Inf
1660		z.form = zero
1661		return z
1662	}
1663
1664	// x / ±0
1665	// ±Inf / y
1666	z.form = inf
1667	return z
1668}
1669
1670// Cmp compares x and y and returns:
1671//
1672//   -1 if x <  y
1673//    0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf)
1674//   +1 if x >  y
1675//
1676func (x *Float) Cmp(y *Float) int {
1677	if debugFloat {
1678		x.validate()
1679		y.validate()
1680	}
1681
1682	mx := x.ord()
1683	my := y.ord()
1684	switch {
1685	case mx < my:
1686		return -1
1687	case mx > my:
1688		return +1
1689	}
1690	// mx == my
1691
1692	// only if |mx| == 1 we have to compare the mantissae
1693	switch mx {
1694	case -1:
1695		return y.ucmp(x)
1696	case +1:
1697		return x.ucmp(y)
1698	}
1699
1700	return 0
1701}
1702
1703// ord classifies x and returns:
1704//
1705//	-2 if -Inf == x
1706//	-1 if -Inf < x < 0
1707//	 0 if x == 0 (signed or unsigned)
1708//	+1 if 0 < x < +Inf
1709//	+2 if x == +Inf
1710//
1711func (x *Float) ord() int {
1712	var m int
1713	switch x.form {
1714	case finite:
1715		m = 1
1716	case zero:
1717		return 0
1718	case inf:
1719		m = 2
1720	}
1721	if x.neg {
1722		m = -m
1723	}
1724	return m
1725}
1726
1727func umax32(x, y uint32) uint32 {
1728	if x > y {
1729		return x
1730	}
1731	return y
1732}
1733