1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package big implements multi-precision arithmetic (big numbers).
6// The following numeric types are supported:
7//
8//	- Int	signed integers
9//	- Rat	rational numbers
10//
11// Methods are typically of the form:
12//
13//	func (z *Int) Op(x, y *Int) *Int	(similar for *Rat)
14//
15// and implement operations z = x Op y with the result as receiver; if it
16// is one of the operands it may be overwritten (and its memory reused).
17// To enable chaining of operations, the result is also returned. Methods
18// returning a result other than *Int or *Rat take one of the operands as
19// the receiver.
20//
21package big
22
23// This file contains operations on unsigned multi-precision integers.
24// These are the building blocks for the operations on signed integers
25// and rationals.
26
27import (
28	"errors"
29	"io"
30	"math"
31	"math/rand"
32	"sync"
33)
34
35// An unsigned integer x of the form
36//
37//   x = x[n-1]*_B^(n-1) + x[n-2]*_B^(n-2) + ... + x[1]*_B + x[0]
38//
39// with 0 <= x[i] < _B and 0 <= i < n is stored in a slice of length n,
40// with the digits x[i] as the slice elements.
41//
42// A number is normalized if the slice contains no leading 0 digits.
43// During arithmetic operations, denormalized values may occur but are
44// always normalized before returning the final result. The normalized
45// representation of 0 is the empty or nil slice (length = 0).
46//
47type nat []Word
48
49var (
50	natOne = nat{1}
51	natTwo = nat{2}
52	natTen = nat{10}
53)
54
55func (z nat) clear() {
56	for i := range z {
57		z[i] = 0
58	}
59}
60
61func (z nat) norm() nat {
62	i := len(z)
63	for i > 0 && z[i-1] == 0 {
64		i--
65	}
66	return z[0:i]
67}
68
69func (z nat) make(n int) nat {
70	if n <= cap(z) {
71		return z[0:n] // reuse z
72	}
73	// Choosing a good value for e has significant performance impact
74	// because it increases the chance that a value can be reused.
75	const e = 4 // extra capacity
76	return make(nat, n, n+e)
77}
78
79func (z nat) setWord(x Word) nat {
80	if x == 0 {
81		return z.make(0)
82	}
83	z = z.make(1)
84	z[0] = x
85	return z
86}
87
88func (z nat) setUint64(x uint64) nat {
89	// single-digit values
90	if w := Word(x); uint64(w) == x {
91		return z.setWord(w)
92	}
93
94	// compute number of words n required to represent x
95	n := 0
96	for t := x; t > 0; t >>= _W {
97		n++
98	}
99
100	// split x into n words
101	z = z.make(n)
102	for i := range z {
103		z[i] = Word(x & _M)
104		x >>= _W
105	}
106
107	return z
108}
109
110func (z nat) set(x nat) nat {
111	z = z.make(len(x))
112	copy(z, x)
113	return z
114}
115
116func (z nat) add(x, y nat) nat {
117	m := len(x)
118	n := len(y)
119
120	switch {
121	case m < n:
122		return z.add(y, x)
123	case m == 0:
124		// n == 0 because m >= n; result is 0
125		return z.make(0)
126	case n == 0:
127		// result is x
128		return z.set(x)
129	}
130	// m > 0
131
132	z = z.make(m + 1)
133	c := addVV(z[0:n], x, y)
134	if m > n {
135		c = addVW(z[n:m], x[n:], c)
136	}
137	z[m] = c
138
139	return z.norm()
140}
141
142func (z nat) sub(x, y nat) nat {
143	m := len(x)
144	n := len(y)
145
146	switch {
147	case m < n:
148		panic("underflow")
149	case m == 0:
150		// n == 0 because m >= n; result is 0
151		return z.make(0)
152	case n == 0:
153		// result is x
154		return z.set(x)
155	}
156	// m > 0
157
158	z = z.make(m)
159	c := subVV(z[0:n], x, y)
160	if m > n {
161		c = subVW(z[n:], x[n:], c)
162	}
163	if c != 0 {
164		panic("underflow")
165	}
166
167	return z.norm()
168}
169
170func (x nat) cmp(y nat) (r int) {
171	m := len(x)
172	n := len(y)
173	if m != n || m == 0 {
174		switch {
175		case m < n:
176			r = -1
177		case m > n:
178			r = 1
179		}
180		return
181	}
182
183	i := m - 1
184	for i > 0 && x[i] == y[i] {
185		i--
186	}
187
188	switch {
189	case x[i] < y[i]:
190		r = -1
191	case x[i] > y[i]:
192		r = 1
193	}
194	return
195}
196
197func (z nat) mulAddWW(x nat, y, r Word) nat {
198	m := len(x)
199	if m == 0 || y == 0 {
200		return z.setWord(r) // result is r
201	}
202	// m > 0
203
204	z = z.make(m + 1)
205	z[m] = mulAddVWW(z[0:m], x, y, r)
206
207	return z.norm()
208}
209
210// basicMul multiplies x and y and leaves the result in z.
211// The (non-normalized) result is placed in z[0 : len(x) + len(y)].
212func basicMul(z, x, y nat) {
213	z[0 : len(x)+len(y)].clear() // initialize z
214	for i, d := range y {
215		if d != 0 {
216			z[len(x)+i] = addMulVVW(z[i:i+len(x)], x, d)
217		}
218	}
219}
220
221// Fast version of z[0:n+n>>1].add(z[0:n+n>>1], x[0:n]) w/o bounds checks.
222// Factored out for readability - do not use outside karatsuba.
223func karatsubaAdd(z, x nat, n int) {
224	if c := addVV(z[0:n], z, x); c != 0 {
225		addVW(z[n:n+n>>1], z[n:], c)
226	}
227}
228
229// Like karatsubaAdd, but does subtract.
230func karatsubaSub(z, x nat, n int) {
231	if c := subVV(z[0:n], z, x); c != 0 {
232		subVW(z[n:n+n>>1], z[n:], c)
233	}
234}
235
236// Operands that are shorter than karatsubaThreshold are multiplied using
237// "grade school" multiplication; for longer operands the Karatsuba algorithm
238// is used.
239var karatsubaThreshold int = 40 // computed by calibrate.go
240
241// karatsuba multiplies x and y and leaves the result in z.
242// Both x and y must have the same length n and n must be a
243// power of 2. The result vector z must have len(z) >= 6*n.
244// The (non-normalized) result is placed in z[0 : 2*n].
245func karatsuba(z, x, y nat) {
246	n := len(y)
247
248	// Switch to basic multiplication if numbers are odd or small.
249	// (n is always even if karatsubaThreshold is even, but be
250	// conservative)
251	if n&1 != 0 || n < karatsubaThreshold || n < 2 {
252		basicMul(z, x, y)
253		return
254	}
255	// n&1 == 0 && n >= karatsubaThreshold && n >= 2
256
257	// Karatsuba multiplication is based on the observation that
258	// for two numbers x and y with:
259	//
260	//   x = x1*b + x0
261	//   y = y1*b + y0
262	//
263	// the product x*y can be obtained with 3 products z2, z1, z0
264	// instead of 4:
265	//
266	//   x*y = x1*y1*b*b + (x1*y0 + x0*y1)*b + x0*y0
267	//       =    z2*b*b +              z1*b +    z0
268	//
269	// with:
270	//
271	//   xd = x1 - x0
272	//   yd = y0 - y1
273	//
274	//   z1 =      xd*yd                    + z2 + z0
275	//      = (x1-x0)*(y0 - y1)             + z2 + z0
276	//      = x1*y0 - x1*y1 - x0*y0 + x0*y1 + z2 + z0
277	//      = x1*y0 -    z2 -    z0 + x0*y1 + z2 + z0
278	//      = x1*y0                 + x0*y1
279
280	// split x, y into "digits"
281	n2 := n >> 1              // n2 >= 1
282	x1, x0 := x[n2:], x[0:n2] // x = x1*b + y0
283	y1, y0 := y[n2:], y[0:n2] // y = y1*b + y0
284
285	// z is used for the result and temporary storage:
286	//
287	//   6*n     5*n     4*n     3*n     2*n     1*n     0*n
288	// z = [z2 copy|z0 copy| xd*yd | yd:xd | x1*y1 | x0*y0 ]
289	//
290	// For each recursive call of karatsuba, an unused slice of
291	// z is passed in that has (at least) half the length of the
292	// caller's z.
293
294	// compute z0 and z2 with the result "in place" in z
295	karatsuba(z, x0, y0)     // z0 = x0*y0
296	karatsuba(z[n:], x1, y1) // z2 = x1*y1
297
298	// compute xd (or the negative value if underflow occurs)
299	s := 1 // sign of product xd*yd
300	xd := z[2*n : 2*n+n2]
301	if subVV(xd, x1, x0) != 0 { // x1-x0
302		s = -s
303		subVV(xd, x0, x1) // x0-x1
304	}
305
306	// compute yd (or the negative value if underflow occurs)
307	yd := z[2*n+n2 : 3*n]
308	if subVV(yd, y0, y1) != 0 { // y0-y1
309		s = -s
310		subVV(yd, y1, y0) // y1-y0
311	}
312
313	// p = (x1-x0)*(y0-y1) == x1*y0 - x1*y1 - x0*y0 + x0*y1 for s > 0
314	// p = (x0-x1)*(y0-y1) == x0*y0 - x0*y1 - x1*y0 + x1*y1 for s < 0
315	p := z[n*3:]
316	karatsuba(p, xd, yd)
317
318	// save original z2:z0
319	// (ok to use upper half of z since we're done recursing)
320	r := z[n*4:]
321	copy(r, z[:n*2])
322
323	// add up all partial products
324	//
325	//   2*n     n     0
326	// z = [ z2  | z0  ]
327	//   +    [ z0  ]
328	//   +    [ z2  ]
329	//   +    [  p  ]
330	//
331	karatsubaAdd(z[n2:], r, n)
332	karatsubaAdd(z[n2:], r[n:], n)
333	if s > 0 {
334		karatsubaAdd(z[n2:], p, n)
335	} else {
336		karatsubaSub(z[n2:], p, n)
337	}
338}
339
340// alias returns true if x and y share the same base array.
341func alias(x, y nat) bool {
342	return cap(x) > 0 && cap(y) > 0 && &x[0:cap(x)][cap(x)-1] == &y[0:cap(y)][cap(y)-1]
343}
344
345// addAt implements z += x<<(_W*i); z must be long enough.
346// (we don't use nat.add because we need z to stay the same
347// slice, and we don't need to normalize z after each addition)
348func addAt(z, x nat, i int) {
349	if n := len(x); n > 0 {
350		if c := addVV(z[i:i+n], z[i:], x); c != 0 {
351			j := i + n
352			if j < len(z) {
353				addVW(z[j:], z[j:], c)
354			}
355		}
356	}
357}
358
359func max(x, y int) int {
360	if x > y {
361		return x
362	}
363	return y
364}
365
366// karatsubaLen computes an approximation to the maximum k <= n such that
367// k = p<<i for a number p <= karatsubaThreshold and an i >= 0. Thus, the
368// result is the largest number that can be divided repeatedly by 2 before
369// becoming about the value of karatsubaThreshold.
370func karatsubaLen(n int) int {
371	i := uint(0)
372	for n > karatsubaThreshold {
373		n >>= 1
374		i++
375	}
376	return n << i
377}
378
379func (z nat) mul(x, y nat) nat {
380	m := len(x)
381	n := len(y)
382
383	switch {
384	case m < n:
385		return z.mul(y, x)
386	case m == 0 || n == 0:
387		return z.make(0)
388	case n == 1:
389		return z.mulAddWW(x, y[0], 0)
390	}
391	// m >= n > 1
392
393	// determine if z can be reused
394	if alias(z, x) || alias(z, y) {
395		z = nil // z is an alias for x or y - cannot reuse
396	}
397
398	// use basic multiplication if the numbers are small
399	if n < karatsubaThreshold {
400		z = z.make(m + n)
401		basicMul(z, x, y)
402		return z.norm()
403	}
404	// m >= n && n >= karatsubaThreshold && n >= 2
405
406	// determine Karatsuba length k such that
407	//
408	//   x = xh*b + x0  (0 <= x0 < b)
409	//   y = yh*b + y0  (0 <= y0 < b)
410	//   b = 1<<(_W*k)  ("base" of digits xi, yi)
411	//
412	k := karatsubaLen(n)
413	// k <= n
414
415	// multiply x0 and y0 via Karatsuba
416	x0 := x[0:k]              // x0 is not normalized
417	y0 := y[0:k]              // y0 is not normalized
418	z = z.make(max(6*k, m+n)) // enough space for karatsuba of x0*y0 and full result of x*y
419	karatsuba(z, x0, y0)
420	z = z[0 : m+n]  // z has final length but may be incomplete
421	z[2*k:].clear() // upper portion of z is garbage (and 2*k <= m+n since k <= n <= m)
422
423	// If xh != 0 or yh != 0, add the missing terms to z. For
424	//
425	//   xh = xi*b^i + ... + x2*b^2 + x1*b (0 <= xi < b)
426	//   yh =                         y1*b (0 <= y1 < b)
427	//
428	// the missing terms are
429	//
430	//   x0*y1*b and xi*y0*b^i, xi*y1*b^(i+1) for i > 0
431	//
432	// since all the yi for i > 1 are 0 by choice of k: If any of them
433	// were > 0, then yh >= b^2 and thus y >= b^2. Then k' = k*2 would
434	// be a larger valid threshold contradicting the assumption about k.
435	//
436	if k < n || m != n {
437		var t nat
438
439		// add x0*y1*b
440		x0 := x0.norm()
441		y1 := y[k:]       // y1 is normalized because y is
442		t = t.mul(x0, y1) // update t so we don't lose t's underlying array
443		addAt(z, t, k)
444
445		// add xi*y0<<i, xi*y1*b<<(i+k)
446		y0 := y0.norm()
447		for i := k; i < len(x); i += k {
448			xi := x[i:]
449			if len(xi) > k {
450				xi = xi[:k]
451			}
452			xi = xi.norm()
453			t = t.mul(xi, y0)
454			addAt(z, t, i)
455			t = t.mul(xi, y1)
456			addAt(z, t, i+k)
457		}
458	}
459
460	return z.norm()
461}
462
463// mulRange computes the product of all the unsigned integers in the
464// range [a, b] inclusively. If a > b (empty range), the result is 1.
465func (z nat) mulRange(a, b uint64) nat {
466	switch {
467	case a == 0:
468		// cut long ranges short (optimization)
469		return z.setUint64(0)
470	case a > b:
471		return z.setUint64(1)
472	case a == b:
473		return z.setUint64(a)
474	case a+1 == b:
475		return z.mul(nat(nil).setUint64(a), nat(nil).setUint64(b))
476	}
477	m := (a + b) / 2
478	return z.mul(nat(nil).mulRange(a, m), nat(nil).mulRange(m+1, b))
479}
480
481// q = (x-r)/y, with 0 <= r < y
482func (z nat) divW(x nat, y Word) (q nat, r Word) {
483	m := len(x)
484	switch {
485	case y == 0:
486		panic("division by zero")
487	case y == 1:
488		q = z.set(x) // result is x
489		return
490	case m == 0:
491		q = z.make(0) // result is 0
492		return
493	}
494	// m > 0
495	z = z.make(m)
496	r = divWVW(z, 0, x, y)
497	q = z.norm()
498	return
499}
500
501func (z nat) div(z2, u, v nat) (q, r nat) {
502	if len(v) == 0 {
503		panic("division by zero")
504	}
505
506	if u.cmp(v) < 0 {
507		q = z.make(0)
508		r = z2.set(u)
509		return
510	}
511
512	if len(v) == 1 {
513		var r2 Word
514		q, r2 = z.divW(u, v[0])
515		r = z2.setWord(r2)
516		return
517	}
518
519	q, r = z.divLarge(z2, u, v)
520	return
521}
522
523// q = (uIn-r)/v, with 0 <= r < y
524// Uses z as storage for q, and u as storage for r if possible.
525// See Knuth, Volume 2, section 4.3.1, Algorithm D.
526// Preconditions:
527//    len(v) >= 2
528//    len(uIn) >= len(v)
529func (z nat) divLarge(u, uIn, v nat) (q, r nat) {
530	n := len(v)
531	m := len(uIn) - n
532
533	// determine if z can be reused
534	// TODO(gri) should find a better solution - this if statement
535	//           is very costly (see e.g. time pidigits -s -n 10000)
536	if alias(z, uIn) || alias(z, v) {
537		z = nil // z is an alias for uIn or v - cannot reuse
538	}
539	q = z.make(m + 1)
540
541	qhatv := make(nat, n+1)
542	if alias(u, uIn) || alias(u, v) {
543		u = nil // u is an alias for uIn or v - cannot reuse
544	}
545	u = u.make(len(uIn) + 1)
546	u.clear()
547
548	// D1.
549	shift := leadingZeros(v[n-1])
550	if shift > 0 {
551		// do not modify v, it may be used by another goroutine simultaneously
552		v1 := make(nat, n)
553		shlVU(v1, v, shift)
554		v = v1
555	}
556	u[len(uIn)] = shlVU(u[0:len(uIn)], uIn, shift)
557
558	// D2.
559	for j := m; j >= 0; j-- {
560		// D3.
561		qhat := Word(_M)
562		if u[j+n] != v[n-1] {
563			var rhat Word
564			qhat, rhat = divWW(u[j+n], u[j+n-1], v[n-1])
565
566			// x1 | x2 = q̂v_{n-2}
567			x1, x2 := mulWW(qhat, v[n-2])
568			// test if q̂v_{n-2} > br̂ + u_{j+n-2}
569			for greaterThan(x1, x2, rhat, u[j+n-2]) {
570				qhat--
571				prevRhat := rhat
572				rhat += v[n-1]
573				// v[n-1] >= 0, so this tests for overflow.
574				if rhat < prevRhat {
575					break
576				}
577				x1, x2 = mulWW(qhat, v[n-2])
578			}
579		}
580
581		// D4.
582		qhatv[n] = mulAddVWW(qhatv[0:n], v, qhat, 0)
583
584		c := subVV(u[j:j+len(qhatv)], u[j:], qhatv)
585		if c != 0 {
586			c := addVV(u[j:j+n], u[j:], v)
587			u[j+n] += c
588			qhat--
589		}
590
591		q[j] = qhat
592	}
593
594	q = q.norm()
595	shrVU(u, u, shift)
596	r = u.norm()
597
598	return q, r
599}
600
601// Length of x in bits. x must be normalized.
602func (x nat) bitLen() int {
603	if i := len(x) - 1; i >= 0 {
604		return i*_W + bitLen(x[i])
605	}
606	return 0
607}
608
609// MaxBase is the largest number base accepted for string conversions.
610const MaxBase = 'z' - 'a' + 10 + 1 // = hexValue('z') + 1
611
612func hexValue(ch rune) Word {
613	d := int(MaxBase + 1) // illegal base
614	switch {
615	case '0' <= ch && ch <= '9':
616		d = int(ch - '0')
617	case 'a' <= ch && ch <= 'z':
618		d = int(ch - 'a' + 10)
619	case 'A' <= ch && ch <= 'Z':
620		d = int(ch - 'A' + 10)
621	}
622	return Word(d)
623}
624
625// scan sets z to the natural number corresponding to the longest possible prefix
626// read from r representing an unsigned integer in a given conversion base.
627// It returns z, the actual conversion base used, and an error, if any. In the
628// error case, the value of z is undefined. The syntax follows the syntax of
629// unsigned integer literals in Go.
630//
631// The base argument must be 0 or a value from 2 through MaxBase. If the base
632// is 0, the string prefix determines the actual conversion base. A prefix of
633// ``0x'' or ``0X'' selects base 16; the ``0'' prefix selects base 8, and a
634// ``0b'' or ``0B'' prefix selects base 2. Otherwise the selected base is 10.
635//
636func (z nat) scan(r io.RuneScanner, base int) (nat, int, error) {
637	// reject illegal bases
638	if base < 0 || base == 1 || MaxBase < base {
639		return z, 0, errors.New("illegal number base")
640	}
641
642	// one char look-ahead
643	ch, _, err := r.ReadRune()
644	if err != nil {
645		return z, 0, err
646	}
647
648	// determine base if necessary
649	b := Word(base)
650	if base == 0 {
651		b = 10
652		if ch == '0' {
653			switch ch, _, err = r.ReadRune(); err {
654			case nil:
655				b = 8
656				switch ch {
657				case 'x', 'X':
658					b = 16
659				case 'b', 'B':
660					b = 2
661				}
662				if b == 2 || b == 16 {
663					if ch, _, err = r.ReadRune(); err != nil {
664						return z, 0, err
665					}
666				}
667			case io.EOF:
668				return z.make(0), 10, nil
669			default:
670				return z, 10, err
671			}
672		}
673	}
674
675	// convert string
676	// - group as many digits d as possible together into a "super-digit" dd with "super-base" bb
677	// - only when bb does not fit into a word anymore, do a full number mulAddWW using bb and dd
678	z = z.make(0)
679	bb := Word(1)
680	dd := Word(0)
681	for max := _M / b; ; {
682		d := hexValue(ch)
683		if d >= b {
684			r.UnreadRune() // ch does not belong to number anymore
685			break
686		}
687
688		if bb <= max {
689			bb *= b
690			dd = dd*b + d
691		} else {
692			// bb * b would overflow
693			z = z.mulAddWW(z, bb, dd)
694			bb = b
695			dd = d
696		}
697
698		if ch, _, err = r.ReadRune(); err != nil {
699			if err != io.EOF {
700				return z, int(b), err
701			}
702			break
703		}
704	}
705
706	switch {
707	case bb > 1:
708		// there was at least one mantissa digit
709		z = z.mulAddWW(z, bb, dd)
710	case base == 0 && b == 8:
711		// there was only the octal prefix 0 (possibly followed by digits > 7);
712		// return base 10, not 8
713		return z, 10, nil
714	case base != 0 || b != 8:
715		// there was neither a mantissa digit nor the octal prefix 0
716		return z, int(b), errors.New("syntax error scanning number")
717	}
718
719	return z.norm(), int(b), nil
720}
721
722// Character sets for string conversion.
723const (
724	lowercaseDigits = "0123456789abcdefghijklmnopqrstuvwxyz"
725	uppercaseDigits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
726)
727
728// decimalString returns a decimal representation of x.
729// It calls x.string with the charset "0123456789".
730func (x nat) decimalString() string {
731	return x.string(lowercaseDigits[0:10])
732}
733
734// string converts x to a string using digits from a charset; a digit with
735// value d is represented by charset[d]. The conversion base is determined
736// by len(charset), which must be >= 2 and <= 256.
737func (x nat) string(charset string) string {
738	b := Word(len(charset))
739
740	// special cases
741	switch {
742	case b < 2 || MaxBase > 256:
743		panic("illegal base")
744	case len(x) == 0:
745		return string(charset[0])
746	}
747
748	// allocate buffer for conversion
749	i := int(float64(x.bitLen())/math.Log2(float64(b))) + 1 // off by one at most
750	s := make([]byte, i)
751
752	// convert power of two and non power of two bases separately
753	if b == b&-b {
754		// shift is base-b digit size in bits
755		shift := trailingZeroBits(b) // shift > 0 because b >= 2
756		mask := Word(1)<<shift - 1
757		w := x[0]
758		nbits := uint(_W) // number of unprocessed bits in w
759
760		// convert less-significant words
761		for k := 1; k < len(x); k++ {
762			// convert full digits
763			for nbits >= shift {
764				i--
765				s[i] = charset[w&mask]
766				w >>= shift
767				nbits -= shift
768			}
769
770			// convert any partial leading digit and advance to next word
771			if nbits == 0 {
772				// no partial digit remaining, just advance
773				w = x[k]
774				nbits = _W
775			} else {
776				// partial digit in current (k-1) and next (k) word
777				w |= x[k] << nbits
778				i--
779				s[i] = charset[w&mask]
780
781				// advance
782				w = x[k] >> (shift - nbits)
783				nbits = _W - (shift - nbits)
784			}
785		}
786
787		// convert digits of most-significant word (omit leading zeros)
788		for nbits >= 0 && w != 0 {
789			i--
790			s[i] = charset[w&mask]
791			w >>= shift
792			nbits -= shift
793		}
794
795	} else {
796		// determine "big base"; i.e., the largest possible value bb
797		// that is a power of base b and still fits into a Word
798		// (as in 10^19 for 19 decimal digits in a 64bit Word)
799		bb := b      // big base is b**ndigits
800		ndigits := 1 // number of base b digits
801		for max := Word(_M / b); bb <= max; bb *= b {
802			ndigits++ // maximize ndigits where bb = b**ndigits, bb <= _M
803		}
804
805		// construct table of successive squares of bb*leafSize to use in subdivisions
806		// result (table != nil) <=> (len(x) > leafSize > 0)
807		table := divisors(len(x), b, ndigits, bb)
808
809		// preserve x, create local copy for use by convertWords
810		q := nat(nil).set(x)
811
812		// convert q to string s in base b
813		q.convertWords(s, charset, b, ndigits, bb, table)
814
815		// strip leading zeros
816		// (x != 0; thus s must contain at least one non-zero digit
817		// and the loop will terminate)
818		i = 0
819		for zero := charset[0]; s[i] == zero; {
820			i++
821		}
822	}
823
824	return string(s[i:])
825}
826
827// Convert words of q to base b digits in s. If q is large, it is recursively "split in half"
828// by nat/nat division using tabulated divisors. Otherwise, it is converted iteratively using
829// repeated nat/Word division.
830//
831// The iterative method processes n Words by n divW() calls, each of which visits every Word in the
832// incrementally shortened q for a total of n + (n-1) + (n-2) ... + 2 + 1, or n(n+1)/2 divW()'s.
833// Recursive conversion divides q by its approximate square root, yielding two parts, each half
834// the size of q. Using the iterative method on both halves means 2 * (n/2)(n/2 + 1)/2 divW()'s
835// plus the expensive long div(). Asymptotically, the ratio is favorable at 1/2 the divW()'s, and
836// is made better by splitting the subblocks recursively. Best is to split blocks until one more
837// split would take longer (because of the nat/nat div()) than the twice as many divW()'s of the
838// iterative approach. This threshold is represented by leafSize. Benchmarking of leafSize in the
839// range 2..64 shows that values of 8 and 16 work well, with a 4x speedup at medium lengths and
840// ~30x for 20000 digits. Use nat_test.go's BenchmarkLeafSize tests to optimize leafSize for
841// specific hardware.
842//
843func (q nat) convertWords(s []byte, charset string, b Word, ndigits int, bb Word, table []divisor) {
844	// split larger blocks recursively
845	if table != nil {
846		// len(q) > leafSize > 0
847		var r nat
848		index := len(table) - 1
849		for len(q) > leafSize {
850			// find divisor close to sqrt(q) if possible, but in any case < q
851			maxLength := q.bitLen()     // ~= log2 q, or at of least largest possible q of this bit length
852			minLength := maxLength >> 1 // ~= log2 sqrt(q)
853			for index > 0 && table[index-1].nbits > minLength {
854				index-- // desired
855			}
856			if table[index].nbits >= maxLength && table[index].bbb.cmp(q) >= 0 {
857				index--
858				if index < 0 {
859					panic("internal inconsistency")
860				}
861			}
862
863			// split q into the two digit number (q'*bbb + r) to form independent subblocks
864			q, r = q.div(r, q, table[index].bbb)
865
866			// convert subblocks and collect results in s[:h] and s[h:]
867			h := len(s) - table[index].ndigits
868			r.convertWords(s[h:], charset, b, ndigits, bb, table[0:index])
869			s = s[:h] // == q.convertWords(s, charset, b, ndigits, bb, table[0:index+1])
870		}
871	}
872
873	// having split any large blocks now process the remaining (small) block iteratively
874	i := len(s)
875	var r Word
876	if b == 10 {
877		// hard-coding for 10 here speeds this up by 1.25x (allows for / and % by constants)
878		for len(q) > 0 {
879			// extract least significant, base bb "digit"
880			q, r = q.divW(q, bb)
881			for j := 0; j < ndigits && i > 0; j++ {
882				i--
883				// avoid % computation since r%10 == r - int(r/10)*10;
884				// this appears to be faster for BenchmarkString10000Base10
885				// and smaller strings (but a bit slower for larger ones)
886				t := r / 10
887				s[i] = charset[r-t<<3-t-t] // TODO(gri) replace w/ t*10 once compiler produces better code
888				r = t
889			}
890		}
891	} else {
892		for len(q) > 0 {
893			// extract least significant, base bb "digit"
894			q, r = q.divW(q, bb)
895			for j := 0; j < ndigits && i > 0; j++ {
896				i--
897				s[i] = charset[r%b]
898				r /= b
899			}
900		}
901	}
902
903	// prepend high-order zeroes
904	zero := charset[0]
905	for i > 0 { // while need more leading zeroes
906		i--
907		s[i] = zero
908	}
909}
910
911// Split blocks greater than leafSize Words (or set to 0 to disable recursive conversion)
912// Benchmark and configure leafSize using: go test -bench="Leaf"
913//   8 and 16 effective on 3.0 GHz Xeon "Clovertown" CPU (128 byte cache lines)
914//   8 and 16 effective on 2.66 GHz Core 2 Duo "Penryn" CPU
915var leafSize int = 8 // number of Word-size binary values treat as a monolithic block
916
917type divisor struct {
918	bbb     nat // divisor
919	nbits   int // bit length of divisor (discounting leading zeroes) ~= log2(bbb)
920	ndigits int // digit length of divisor in terms of output base digits
921}
922
923var cacheBase10 struct {
924	sync.Mutex
925	table [64]divisor // cached divisors for base 10
926}
927
928// expWW computes x**y
929func (z nat) expWW(x, y Word) nat {
930	return z.expNN(nat(nil).setWord(x), nat(nil).setWord(y), nil)
931}
932
933// construct table of powers of bb*leafSize to use in subdivisions
934func divisors(m int, b Word, ndigits int, bb Word) []divisor {
935	// only compute table when recursive conversion is enabled and x is large
936	if leafSize == 0 || m <= leafSize {
937		return nil
938	}
939
940	// determine k where (bb**leafSize)**(2**k) >= sqrt(x)
941	k := 1
942	for words := leafSize; words < m>>1 && k < len(cacheBase10.table); words <<= 1 {
943		k++
944	}
945
946	// reuse and extend existing table of divisors or create new table as appropriate
947	var table []divisor // for b == 10, table overlaps with cacheBase10.table
948	if b == 10 {
949		cacheBase10.Lock()
950		table = cacheBase10.table[0:k] // reuse old table for this conversion
951	} else {
952		table = make([]divisor, k) // create new table for this conversion
953	}
954
955	// extend table
956	if table[k-1].ndigits == 0 {
957		// add new entries as needed
958		var larger nat
959		for i := 0; i < k; i++ {
960			if table[i].ndigits == 0 {
961				if i == 0 {
962					table[0].bbb = nat(nil).expWW(bb, Word(leafSize))
963					table[0].ndigits = ndigits * leafSize
964				} else {
965					table[i].bbb = nat(nil).mul(table[i-1].bbb, table[i-1].bbb)
966					table[i].ndigits = 2 * table[i-1].ndigits
967				}
968
969				// optimization: exploit aggregated extra bits in macro blocks
970				larger = nat(nil).set(table[i].bbb)
971				for mulAddVWW(larger, larger, b, 0) == 0 {
972					table[i].bbb = table[i].bbb.set(larger)
973					table[i].ndigits++
974				}
975
976				table[i].nbits = table[i].bbb.bitLen()
977			}
978		}
979	}
980
981	if b == 10 {
982		cacheBase10.Unlock()
983	}
984
985	return table
986}
987
988const deBruijn32 = 0x077CB531
989
990var deBruijn32Lookup = []byte{
991	0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
992	31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9,
993}
994
995const deBruijn64 = 0x03f79d71b4ca8b09
996
997var deBruijn64Lookup = []byte{
998	0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4,
999	62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5,
1000	63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11,
1001	54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6,
1002}
1003
1004// trailingZeroBits returns the number of consecutive least significant zero
1005// bits of x.
1006func trailingZeroBits(x Word) uint {
1007	// x & -x leaves only the right-most bit set in the word. Let k be the
1008	// index of that bit. Since only a single bit is set, the value is two
1009	// to the power of k. Multiplying by a power of two is equivalent to
1010	// left shifting, in this case by k bits.  The de Bruijn constant is
1011	// such that all six bit, consecutive substrings are distinct.
1012	// Therefore, if we have a left shifted version of this constant we can
1013	// find by how many bits it was shifted by looking at which six bit
1014	// substring ended up at the top of the word.
1015	// (Knuth, volume 4, section 7.3.1)
1016	switch _W {
1017	case 32:
1018		return uint(deBruijn32Lookup[((x&-x)*deBruijn32)>>27])
1019	case 64:
1020		return uint(deBruijn64Lookup[((x&-x)*(deBruijn64&_M))>>58])
1021	default:
1022		panic("unknown word size")
1023	}
1024
1025	return 0
1026}
1027
1028// trailingZeroBits returns the number of consecutive least significant zero
1029// bits of x.
1030func (x nat) trailingZeroBits() uint {
1031	if len(x) == 0 {
1032		return 0
1033	}
1034	var i uint
1035	for x[i] == 0 {
1036		i++
1037	}
1038	// x[i] != 0
1039	return i*_W + trailingZeroBits(x[i])
1040}
1041
1042// z = x << s
1043func (z nat) shl(x nat, s uint) nat {
1044	m := len(x)
1045	if m == 0 {
1046		return z.make(0)
1047	}
1048	// m > 0
1049
1050	n := m + int(s/_W)
1051	z = z.make(n + 1)
1052	z[n] = shlVU(z[n-m:n], x, s%_W)
1053	z[0 : n-m].clear()
1054
1055	return z.norm()
1056}
1057
1058// z = x >> s
1059func (z nat) shr(x nat, s uint) nat {
1060	m := len(x)
1061	n := m - int(s/_W)
1062	if n <= 0 {
1063		return z.make(0)
1064	}
1065	// n > 0
1066
1067	z = z.make(n)
1068	shrVU(z, x[m-n:], s%_W)
1069
1070	return z.norm()
1071}
1072
1073func (z nat) setBit(x nat, i uint, b uint) nat {
1074	j := int(i / _W)
1075	m := Word(1) << (i % _W)
1076	n := len(x)
1077	switch b {
1078	case 0:
1079		z = z.make(n)
1080		copy(z, x)
1081		if j >= n {
1082			// no need to grow
1083			return z
1084		}
1085		z[j] &^= m
1086		return z.norm()
1087	case 1:
1088		if j >= n {
1089			z = z.make(j + 1)
1090			z[n:].clear()
1091		} else {
1092			z = z.make(n)
1093		}
1094		copy(z, x)
1095		z[j] |= m
1096		// no need to normalize
1097		return z
1098	}
1099	panic("set bit is not 0 or 1")
1100}
1101
1102func (z nat) bit(i uint) uint {
1103	j := int(i / _W)
1104	if j >= len(z) {
1105		return 0
1106	}
1107	return uint(z[j] >> (i % _W) & 1)
1108}
1109
1110func (z nat) and(x, y nat) nat {
1111	m := len(x)
1112	n := len(y)
1113	if m > n {
1114		m = n
1115	}
1116	// m <= n
1117
1118	z = z.make(m)
1119	for i := 0; i < m; i++ {
1120		z[i] = x[i] & y[i]
1121	}
1122
1123	return z.norm()
1124}
1125
1126func (z nat) andNot(x, y nat) nat {
1127	m := len(x)
1128	n := len(y)
1129	if n > m {
1130		n = m
1131	}
1132	// m >= n
1133
1134	z = z.make(m)
1135	for i := 0; i < n; i++ {
1136		z[i] = x[i] &^ y[i]
1137	}
1138	copy(z[n:m], x[n:m])
1139
1140	return z.norm()
1141}
1142
1143func (z nat) or(x, y nat) nat {
1144	m := len(x)
1145	n := len(y)
1146	s := x
1147	if m < n {
1148		n, m = m, n
1149		s = y
1150	}
1151	// m >= n
1152
1153	z = z.make(m)
1154	for i := 0; i < n; i++ {
1155		z[i] = x[i] | y[i]
1156	}
1157	copy(z[n:m], s[n:m])
1158
1159	return z.norm()
1160}
1161
1162func (z nat) xor(x, y nat) nat {
1163	m := len(x)
1164	n := len(y)
1165	s := x
1166	if m < n {
1167		n, m = m, n
1168		s = y
1169	}
1170	// m >= n
1171
1172	z = z.make(m)
1173	for i := 0; i < n; i++ {
1174		z[i] = x[i] ^ y[i]
1175	}
1176	copy(z[n:m], s[n:m])
1177
1178	return z.norm()
1179}
1180
1181// greaterThan returns true iff (x1<<_W + x2) > (y1<<_W + y2)
1182func greaterThan(x1, x2, y1, y2 Word) bool {
1183	return x1 > y1 || x1 == y1 && x2 > y2
1184}
1185
1186// modW returns x % d.
1187func (x nat) modW(d Word) (r Word) {
1188	// TODO(agl): we don't actually need to store the q value.
1189	var q nat
1190	q = q.make(len(x))
1191	return divWVW(q, 0, x, d)
1192}
1193
1194// random creates a random integer in [0..limit), using the space in z if
1195// possible. n is the bit length of limit.
1196func (z nat) random(rand *rand.Rand, limit nat, n int) nat {
1197	if alias(z, limit) {
1198		z = nil // z is an alias for limit - cannot reuse
1199	}
1200	z = z.make(len(limit))
1201
1202	bitLengthOfMSW := uint(n % _W)
1203	if bitLengthOfMSW == 0 {
1204		bitLengthOfMSW = _W
1205	}
1206	mask := Word((1 << bitLengthOfMSW) - 1)
1207
1208	for {
1209		switch _W {
1210		case 32:
1211			for i := range z {
1212				z[i] = Word(rand.Uint32())
1213			}
1214		case 64:
1215			for i := range z {
1216				z[i] = Word(rand.Uint32()) | Word(rand.Uint32())<<32
1217			}
1218		default:
1219			panic("unknown word size")
1220		}
1221		z[len(limit)-1] &= mask
1222		if z.cmp(limit) < 0 {
1223			break
1224		}
1225	}
1226
1227	return z.norm()
1228}
1229
1230// If m != 0 (i.e., len(m) != 0), expNN sets z to x**y mod m;
1231// otherwise it sets z to x**y. The result is the value of z.
1232func (z nat) expNN(x, y, m nat) nat {
1233	if alias(z, x) || alias(z, y) {
1234		// We cannot allow in-place modification of x or y.
1235		z = nil
1236	}
1237
1238	if len(y) == 0 {
1239		z = z.make(1)
1240		z[0] = 1
1241		return z
1242	}
1243	// y > 0
1244
1245	if len(m) != 0 {
1246		// We likely end up being as long as the modulus.
1247		z = z.make(len(m))
1248	}
1249	z = z.set(x)
1250
1251	// If the base is non-trivial and the exponent is large, we use
1252	// 4-bit, windowed exponentiation. This involves precomputing 14 values
1253	// (x^2...x^15) but then reduces the number of multiply-reduces by a
1254	// third. Even for a 32-bit exponent, this reduces the number of
1255	// operations.
1256	if len(x) > 1 && len(y) > 1 && len(m) > 0 {
1257		return z.expNNWindowed(x, y, m)
1258	}
1259
1260	v := y[len(y)-1] // v > 0 because y is normalized and y > 0
1261	shift := leadingZeros(v) + 1
1262	v <<= shift
1263	var q nat
1264
1265	const mask = 1 << (_W - 1)
1266
1267	// We walk through the bits of the exponent one by one. Each time we
1268	// see a bit, we square, thus doubling the power. If the bit is a one,
1269	// we also multiply by x, thus adding one to the power.
1270
1271	w := _W - int(shift)
1272	// zz and r are used to avoid allocating in mul and div as
1273	// otherwise the arguments would alias.
1274	var zz, r nat
1275	for j := 0; j < w; j++ {
1276		zz = zz.mul(z, z)
1277		zz, z = z, zz
1278
1279		if v&mask != 0 {
1280			zz = zz.mul(z, x)
1281			zz, z = z, zz
1282		}
1283
1284		if len(m) != 0 {
1285			zz, r = zz.div(r, z, m)
1286			zz, r, q, z = q, z, zz, r
1287		}
1288
1289		v <<= 1
1290	}
1291
1292	for i := len(y) - 2; i >= 0; i-- {
1293		v = y[i]
1294
1295		for j := 0; j < _W; j++ {
1296			zz = zz.mul(z, z)
1297			zz, z = z, zz
1298
1299			if v&mask != 0 {
1300				zz = zz.mul(z, x)
1301				zz, z = z, zz
1302			}
1303
1304			if len(m) != 0 {
1305				zz, r = zz.div(r, z, m)
1306				zz, r, q, z = q, z, zz, r
1307			}
1308
1309			v <<= 1
1310		}
1311	}
1312
1313	return z.norm()
1314}
1315
1316// expNNWindowed calculates x**y mod m using a fixed, 4-bit window.
1317func (z nat) expNNWindowed(x, y, m nat) nat {
1318	// zz and r are used to avoid allocating in mul and div as otherwise
1319	// the arguments would alias.
1320	var zz, r nat
1321
1322	const n = 4
1323	// powers[i] contains x^i.
1324	var powers [1 << n]nat
1325	powers[0] = natOne
1326	powers[1] = x
1327	for i := 2; i < 1<<n; i += 2 {
1328		p2, p, p1 := &powers[i/2], &powers[i], &powers[i+1]
1329		*p = p.mul(*p2, *p2)
1330		zz, r = zz.div(r, *p, m)
1331		*p, r = r, *p
1332		*p1 = p1.mul(*p, x)
1333		zz, r = zz.div(r, *p1, m)
1334		*p1, r = r, *p1
1335	}
1336
1337	z = z.setWord(1)
1338
1339	for i := len(y) - 1; i >= 0; i-- {
1340		yi := y[i]
1341		for j := 0; j < _W; j += n {
1342			if i != len(y)-1 || j != 0 {
1343				// Unrolled loop for significant performance
1344				// gain.  Use go test -bench=".*" in crypto/rsa
1345				// to check performance before making changes.
1346				zz = zz.mul(z, z)
1347				zz, z = z, zz
1348				zz, r = zz.div(r, z, m)
1349				z, r = r, z
1350
1351				zz = zz.mul(z, z)
1352				zz, z = z, zz
1353				zz, r = zz.div(r, z, m)
1354				z, r = r, z
1355
1356				zz = zz.mul(z, z)
1357				zz, z = z, zz
1358				zz, r = zz.div(r, z, m)
1359				z, r = r, z
1360
1361				zz = zz.mul(z, z)
1362				zz, z = z, zz
1363				zz, r = zz.div(r, z, m)
1364				z, r = r, z
1365			}
1366
1367			zz = zz.mul(z, powers[yi>>(_W-n)])
1368			zz, z = z, zz
1369			zz, r = zz.div(r, z, m)
1370			z, r = r, z
1371
1372			yi <<= n
1373		}
1374	}
1375
1376	return z.norm()
1377}
1378
1379// probablyPrime performs reps Miller-Rabin tests to check whether n is prime.
1380// If it returns true, n is prime with probability 1 - 1/4^reps.
1381// If it returns false, n is not prime.
1382func (n nat) probablyPrime(reps int) bool {
1383	if len(n) == 0 {
1384		return false
1385	}
1386
1387	if len(n) == 1 {
1388		if n[0] < 2 {
1389			return false
1390		}
1391
1392		if n[0]%2 == 0 {
1393			return n[0] == 2
1394		}
1395
1396		// We have to exclude these cases because we reject all
1397		// multiples of these numbers below.
1398		switch n[0] {
1399		case 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53:
1400			return true
1401		}
1402	}
1403
1404	const primesProduct32 = 0xC0CFD797         // Π {p ∈ primes, 2 < p <= 29}
1405	const primesProduct64 = 0xE221F97C30E94E1D // Π {p ∈ primes, 2 < p <= 53}
1406
1407	var r Word
1408	switch _W {
1409	case 32:
1410		r = n.modW(primesProduct32)
1411	case 64:
1412		r = n.modW(primesProduct64 & _M)
1413	default:
1414		panic("Unknown word size")
1415	}
1416
1417	if r%3 == 0 || r%5 == 0 || r%7 == 0 || r%11 == 0 ||
1418		r%13 == 0 || r%17 == 0 || r%19 == 0 || r%23 == 0 || r%29 == 0 {
1419		return false
1420	}
1421
1422	if _W == 64 && (r%31 == 0 || r%37 == 0 || r%41 == 0 ||
1423		r%43 == 0 || r%47 == 0 || r%53 == 0) {
1424		return false
1425	}
1426
1427	nm1 := nat(nil).sub(n, natOne)
1428	// determine q, k such that nm1 = q << k
1429	k := nm1.trailingZeroBits()
1430	q := nat(nil).shr(nm1, k)
1431
1432	nm3 := nat(nil).sub(nm1, natTwo)
1433	rand := rand.New(rand.NewSource(int64(n[0])))
1434
1435	var x, y, quotient nat
1436	nm3Len := nm3.bitLen()
1437
1438NextRandom:
1439	for i := 0; i < reps; i++ {
1440		x = x.random(rand, nm3, nm3Len)
1441		x = x.add(x, natTwo)
1442		y = y.expNN(x, q, n)
1443		if y.cmp(natOne) == 0 || y.cmp(nm1) == 0 {
1444			continue
1445		}
1446		for j := uint(1); j < k; j++ {
1447			y = y.mul(y, y)
1448			quotient, y = quotient.div(y, y, n)
1449			if y.cmp(nm1) == 0 {
1450				continue NextRandom
1451			}
1452			if y.cmp(natOne) == 0 {
1453				return false
1454			}
1455		}
1456		return false
1457	}
1458
1459	return true
1460}
1461
1462// bytes writes the value of z into buf using big-endian encoding.
1463// len(buf) must be >= len(z)*_S. The value of z is encoded in the
1464// slice buf[i:]. The number i of unused bytes at the beginning of
1465// buf is returned as result.
1466func (z nat) bytes(buf []byte) (i int) {
1467	i = len(buf)
1468	for _, d := range z {
1469		for j := 0; j < _S; j++ {
1470			i--
1471			buf[i] = byte(d)
1472			d >>= 8
1473		}
1474	}
1475
1476	for i < len(buf) && buf[i] == 0 {
1477		i++
1478	}
1479
1480	return
1481}
1482
1483// setBytes interprets buf as the bytes of a big-endian unsigned
1484// integer, sets z to that value, and returns z.
1485func (z nat) setBytes(buf []byte) nat {
1486	z = z.make((len(buf) + _S - 1) / _S)
1487
1488	k := 0
1489	s := uint(0)
1490	var d Word
1491	for i := len(buf); i > 0; i-- {
1492		d |= Word(buf[i-1]) << s
1493		if s += 8; s == _S*8 {
1494			z[k] = d
1495			k++
1496			s = 0
1497			d = 0
1498		}
1499	}
1500	if k < len(z) {
1501		z[k] = d
1502	}
1503
1504	return z.norm()
1505}
1506