1// Copyright 2009 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Package big implements multi-precision arithmetic (big numbers). 6// The following numeric types are supported: 7// 8// - Int signed integers 9// - Rat rational numbers 10// 11// Methods are typically of the form: 12// 13// func (z *Int) Op(x, y *Int) *Int (similar for *Rat) 14// 15// and implement operations z = x Op y with the result as receiver; if it 16// is one of the operands it may be overwritten (and its memory reused). 17// To enable chaining of operations, the result is also returned. Methods 18// returning a result other than *Int or *Rat take one of the operands as 19// the receiver. 20// 21package big 22 23// This file contains operations on unsigned multi-precision integers. 24// These are the building blocks for the operations on signed integers 25// and rationals. 26 27import ( 28 "errors" 29 "io" 30 "math" 31 "math/rand" 32 "sync" 33) 34 35// An unsigned integer x of the form 36// 37// x = x[n-1]*_B^(n-1) + x[n-2]*_B^(n-2) + ... + x[1]*_B + x[0] 38// 39// with 0 <= x[i] < _B and 0 <= i < n is stored in a slice of length n, 40// with the digits x[i] as the slice elements. 41// 42// A number is normalized if the slice contains no leading 0 digits. 43// During arithmetic operations, denormalized values may occur but are 44// always normalized before returning the final result. The normalized 45// representation of 0 is the empty or nil slice (length = 0). 46// 47type nat []Word 48 49var ( 50 natOne = nat{1} 51 natTwo = nat{2} 52 natTen = nat{10} 53) 54 55func (z nat) clear() { 56 for i := range z { 57 z[i] = 0 58 } 59} 60 61func (z nat) norm() nat { 62 i := len(z) 63 for i > 0 && z[i-1] == 0 { 64 i-- 65 } 66 return z[0:i] 67} 68 69func (z nat) make(n int) nat { 70 if n <= cap(z) { 71 return z[0:n] // reuse z 72 } 73 // Choosing a good value for e has significant performance impact 74 // because it increases the chance that a value can be reused. 75 const e = 4 // extra capacity 76 return make(nat, n, n+e) 77} 78 79func (z nat) setWord(x Word) nat { 80 if x == 0 { 81 return z.make(0) 82 } 83 z = z.make(1) 84 z[0] = x 85 return z 86} 87 88func (z nat) setUint64(x uint64) nat { 89 // single-digit values 90 if w := Word(x); uint64(w) == x { 91 return z.setWord(w) 92 } 93 94 // compute number of words n required to represent x 95 n := 0 96 for t := x; t > 0; t >>= _W { 97 n++ 98 } 99 100 // split x into n words 101 z = z.make(n) 102 for i := range z { 103 z[i] = Word(x & _M) 104 x >>= _W 105 } 106 107 return z 108} 109 110func (z nat) set(x nat) nat { 111 z = z.make(len(x)) 112 copy(z, x) 113 return z 114} 115 116func (z nat) add(x, y nat) nat { 117 m := len(x) 118 n := len(y) 119 120 switch { 121 case m < n: 122 return z.add(y, x) 123 case m == 0: 124 // n == 0 because m >= n; result is 0 125 return z.make(0) 126 case n == 0: 127 // result is x 128 return z.set(x) 129 } 130 // m > 0 131 132 z = z.make(m + 1) 133 c := addVV(z[0:n], x, y) 134 if m > n { 135 c = addVW(z[n:m], x[n:], c) 136 } 137 z[m] = c 138 139 return z.norm() 140} 141 142func (z nat) sub(x, y nat) nat { 143 m := len(x) 144 n := len(y) 145 146 switch { 147 case m < n: 148 panic("underflow") 149 case m == 0: 150 // n == 0 because m >= n; result is 0 151 return z.make(0) 152 case n == 0: 153 // result is x 154 return z.set(x) 155 } 156 // m > 0 157 158 z = z.make(m) 159 c := subVV(z[0:n], x, y) 160 if m > n { 161 c = subVW(z[n:], x[n:], c) 162 } 163 if c != 0 { 164 panic("underflow") 165 } 166 167 return z.norm() 168} 169 170func (x nat) cmp(y nat) (r int) { 171 m := len(x) 172 n := len(y) 173 if m != n || m == 0 { 174 switch { 175 case m < n: 176 r = -1 177 case m > n: 178 r = 1 179 } 180 return 181 } 182 183 i := m - 1 184 for i > 0 && x[i] == y[i] { 185 i-- 186 } 187 188 switch { 189 case x[i] < y[i]: 190 r = -1 191 case x[i] > y[i]: 192 r = 1 193 } 194 return 195} 196 197func (z nat) mulAddWW(x nat, y, r Word) nat { 198 m := len(x) 199 if m == 0 || y == 0 { 200 return z.setWord(r) // result is r 201 } 202 // m > 0 203 204 z = z.make(m + 1) 205 z[m] = mulAddVWW(z[0:m], x, y, r) 206 207 return z.norm() 208} 209 210// basicMul multiplies x and y and leaves the result in z. 211// The (non-normalized) result is placed in z[0 : len(x) + len(y)]. 212func basicMul(z, x, y nat) { 213 z[0 : len(x)+len(y)].clear() // initialize z 214 for i, d := range y { 215 if d != 0 { 216 z[len(x)+i] = addMulVVW(z[i:i+len(x)], x, d) 217 } 218 } 219} 220 221// Fast version of z[0:n+n>>1].add(z[0:n+n>>1], x[0:n]) w/o bounds checks. 222// Factored out for readability - do not use outside karatsuba. 223func karatsubaAdd(z, x nat, n int) { 224 if c := addVV(z[0:n], z, x); c != 0 { 225 addVW(z[n:n+n>>1], z[n:], c) 226 } 227} 228 229// Like karatsubaAdd, but does subtract. 230func karatsubaSub(z, x nat, n int) { 231 if c := subVV(z[0:n], z, x); c != 0 { 232 subVW(z[n:n+n>>1], z[n:], c) 233 } 234} 235 236// Operands that are shorter than karatsubaThreshold are multiplied using 237// "grade school" multiplication; for longer operands the Karatsuba algorithm 238// is used. 239var karatsubaThreshold int = 40 // computed by calibrate.go 240 241// karatsuba multiplies x and y and leaves the result in z. 242// Both x and y must have the same length n and n must be a 243// power of 2. The result vector z must have len(z) >= 6*n. 244// The (non-normalized) result is placed in z[0 : 2*n]. 245func karatsuba(z, x, y nat) { 246 n := len(y) 247 248 // Switch to basic multiplication if numbers are odd or small. 249 // (n is always even if karatsubaThreshold is even, but be 250 // conservative) 251 if n&1 != 0 || n < karatsubaThreshold || n < 2 { 252 basicMul(z, x, y) 253 return 254 } 255 // n&1 == 0 && n >= karatsubaThreshold && n >= 2 256 257 // Karatsuba multiplication is based on the observation that 258 // for two numbers x and y with: 259 // 260 // x = x1*b + x0 261 // y = y1*b + y0 262 // 263 // the product x*y can be obtained with 3 products z2, z1, z0 264 // instead of 4: 265 // 266 // x*y = x1*y1*b*b + (x1*y0 + x0*y1)*b + x0*y0 267 // = z2*b*b + z1*b + z0 268 // 269 // with: 270 // 271 // xd = x1 - x0 272 // yd = y0 - y1 273 // 274 // z1 = xd*yd + z2 + z0 275 // = (x1-x0)*(y0 - y1) + z2 + z0 276 // = x1*y0 - x1*y1 - x0*y0 + x0*y1 + z2 + z0 277 // = x1*y0 - z2 - z0 + x0*y1 + z2 + z0 278 // = x1*y0 + x0*y1 279 280 // split x, y into "digits" 281 n2 := n >> 1 // n2 >= 1 282 x1, x0 := x[n2:], x[0:n2] // x = x1*b + y0 283 y1, y0 := y[n2:], y[0:n2] // y = y1*b + y0 284 285 // z is used for the result and temporary storage: 286 // 287 // 6*n 5*n 4*n 3*n 2*n 1*n 0*n 288 // z = [z2 copy|z0 copy| xd*yd | yd:xd | x1*y1 | x0*y0 ] 289 // 290 // For each recursive call of karatsuba, an unused slice of 291 // z is passed in that has (at least) half the length of the 292 // caller's z. 293 294 // compute z0 and z2 with the result "in place" in z 295 karatsuba(z, x0, y0) // z0 = x0*y0 296 karatsuba(z[n:], x1, y1) // z2 = x1*y1 297 298 // compute xd (or the negative value if underflow occurs) 299 s := 1 // sign of product xd*yd 300 xd := z[2*n : 2*n+n2] 301 if subVV(xd, x1, x0) != 0 { // x1-x0 302 s = -s 303 subVV(xd, x0, x1) // x0-x1 304 } 305 306 // compute yd (or the negative value if underflow occurs) 307 yd := z[2*n+n2 : 3*n] 308 if subVV(yd, y0, y1) != 0 { // y0-y1 309 s = -s 310 subVV(yd, y1, y0) // y1-y0 311 } 312 313 // p = (x1-x0)*(y0-y1) == x1*y0 - x1*y1 - x0*y0 + x0*y1 for s > 0 314 // p = (x0-x1)*(y0-y1) == x0*y0 - x0*y1 - x1*y0 + x1*y1 for s < 0 315 p := z[n*3:] 316 karatsuba(p, xd, yd) 317 318 // save original z2:z0 319 // (ok to use upper half of z since we're done recursing) 320 r := z[n*4:] 321 copy(r, z[:n*2]) 322 323 // add up all partial products 324 // 325 // 2*n n 0 326 // z = [ z2 | z0 ] 327 // + [ z0 ] 328 // + [ z2 ] 329 // + [ p ] 330 // 331 karatsubaAdd(z[n2:], r, n) 332 karatsubaAdd(z[n2:], r[n:], n) 333 if s > 0 { 334 karatsubaAdd(z[n2:], p, n) 335 } else { 336 karatsubaSub(z[n2:], p, n) 337 } 338} 339 340// alias returns true if x and y share the same base array. 341func alias(x, y nat) bool { 342 return cap(x) > 0 && cap(y) > 0 && &x[0:cap(x)][cap(x)-1] == &y[0:cap(y)][cap(y)-1] 343} 344 345// addAt implements z += x<<(_W*i); z must be long enough. 346// (we don't use nat.add because we need z to stay the same 347// slice, and we don't need to normalize z after each addition) 348func addAt(z, x nat, i int) { 349 if n := len(x); n > 0 { 350 if c := addVV(z[i:i+n], z[i:], x); c != 0 { 351 j := i + n 352 if j < len(z) { 353 addVW(z[j:], z[j:], c) 354 } 355 } 356 } 357} 358 359func max(x, y int) int { 360 if x > y { 361 return x 362 } 363 return y 364} 365 366// karatsubaLen computes an approximation to the maximum k <= n such that 367// k = p<<i for a number p <= karatsubaThreshold and an i >= 0. Thus, the 368// result is the largest number that can be divided repeatedly by 2 before 369// becoming about the value of karatsubaThreshold. 370func karatsubaLen(n int) int { 371 i := uint(0) 372 for n > karatsubaThreshold { 373 n >>= 1 374 i++ 375 } 376 return n << i 377} 378 379func (z nat) mul(x, y nat) nat { 380 m := len(x) 381 n := len(y) 382 383 switch { 384 case m < n: 385 return z.mul(y, x) 386 case m == 0 || n == 0: 387 return z.make(0) 388 case n == 1: 389 return z.mulAddWW(x, y[0], 0) 390 } 391 // m >= n > 1 392 393 // determine if z can be reused 394 if alias(z, x) || alias(z, y) { 395 z = nil // z is an alias for x or y - cannot reuse 396 } 397 398 // use basic multiplication if the numbers are small 399 if n < karatsubaThreshold { 400 z = z.make(m + n) 401 basicMul(z, x, y) 402 return z.norm() 403 } 404 // m >= n && n >= karatsubaThreshold && n >= 2 405 406 // determine Karatsuba length k such that 407 // 408 // x = xh*b + x0 (0 <= x0 < b) 409 // y = yh*b + y0 (0 <= y0 < b) 410 // b = 1<<(_W*k) ("base" of digits xi, yi) 411 // 412 k := karatsubaLen(n) 413 // k <= n 414 415 // multiply x0 and y0 via Karatsuba 416 x0 := x[0:k] // x0 is not normalized 417 y0 := y[0:k] // y0 is not normalized 418 z = z.make(max(6*k, m+n)) // enough space for karatsuba of x0*y0 and full result of x*y 419 karatsuba(z, x0, y0) 420 z = z[0 : m+n] // z has final length but may be incomplete 421 z[2*k:].clear() // upper portion of z is garbage (and 2*k <= m+n since k <= n <= m) 422 423 // If xh != 0 or yh != 0, add the missing terms to z. For 424 // 425 // xh = xi*b^i + ... + x2*b^2 + x1*b (0 <= xi < b) 426 // yh = y1*b (0 <= y1 < b) 427 // 428 // the missing terms are 429 // 430 // x0*y1*b and xi*y0*b^i, xi*y1*b^(i+1) for i > 0 431 // 432 // since all the yi for i > 1 are 0 by choice of k: If any of them 433 // were > 0, then yh >= b^2 and thus y >= b^2. Then k' = k*2 would 434 // be a larger valid threshold contradicting the assumption about k. 435 // 436 if k < n || m != n { 437 var t nat 438 439 // add x0*y1*b 440 x0 := x0.norm() 441 y1 := y[k:] // y1 is normalized because y is 442 t = t.mul(x0, y1) // update t so we don't lose t's underlying array 443 addAt(z, t, k) 444 445 // add xi*y0<<i, xi*y1*b<<(i+k) 446 y0 := y0.norm() 447 for i := k; i < len(x); i += k { 448 xi := x[i:] 449 if len(xi) > k { 450 xi = xi[:k] 451 } 452 xi = xi.norm() 453 t = t.mul(xi, y0) 454 addAt(z, t, i) 455 t = t.mul(xi, y1) 456 addAt(z, t, i+k) 457 } 458 } 459 460 return z.norm() 461} 462 463// mulRange computes the product of all the unsigned integers in the 464// range [a, b] inclusively. If a > b (empty range), the result is 1. 465func (z nat) mulRange(a, b uint64) nat { 466 switch { 467 case a == 0: 468 // cut long ranges short (optimization) 469 return z.setUint64(0) 470 case a > b: 471 return z.setUint64(1) 472 case a == b: 473 return z.setUint64(a) 474 case a+1 == b: 475 return z.mul(nat(nil).setUint64(a), nat(nil).setUint64(b)) 476 } 477 m := (a + b) / 2 478 return z.mul(nat(nil).mulRange(a, m), nat(nil).mulRange(m+1, b)) 479} 480 481// q = (x-r)/y, with 0 <= r < y 482func (z nat) divW(x nat, y Word) (q nat, r Word) { 483 m := len(x) 484 switch { 485 case y == 0: 486 panic("division by zero") 487 case y == 1: 488 q = z.set(x) // result is x 489 return 490 case m == 0: 491 q = z.make(0) // result is 0 492 return 493 } 494 // m > 0 495 z = z.make(m) 496 r = divWVW(z, 0, x, y) 497 q = z.norm() 498 return 499} 500 501func (z nat) div(z2, u, v nat) (q, r nat) { 502 if len(v) == 0 { 503 panic("division by zero") 504 } 505 506 if u.cmp(v) < 0 { 507 q = z.make(0) 508 r = z2.set(u) 509 return 510 } 511 512 if len(v) == 1 { 513 var r2 Word 514 q, r2 = z.divW(u, v[0]) 515 r = z2.setWord(r2) 516 return 517 } 518 519 q, r = z.divLarge(z2, u, v) 520 return 521} 522 523// q = (uIn-r)/v, with 0 <= r < y 524// Uses z as storage for q, and u as storage for r if possible. 525// See Knuth, Volume 2, section 4.3.1, Algorithm D. 526// Preconditions: 527// len(v) >= 2 528// len(uIn) >= len(v) 529func (z nat) divLarge(u, uIn, v nat) (q, r nat) { 530 n := len(v) 531 m := len(uIn) - n 532 533 // determine if z can be reused 534 // TODO(gri) should find a better solution - this if statement 535 // is very costly (see e.g. time pidigits -s -n 10000) 536 if alias(z, uIn) || alias(z, v) { 537 z = nil // z is an alias for uIn or v - cannot reuse 538 } 539 q = z.make(m + 1) 540 541 qhatv := make(nat, n+1) 542 if alias(u, uIn) || alias(u, v) { 543 u = nil // u is an alias for uIn or v - cannot reuse 544 } 545 u = u.make(len(uIn) + 1) 546 u.clear() 547 548 // D1. 549 shift := leadingZeros(v[n-1]) 550 if shift > 0 { 551 // do not modify v, it may be used by another goroutine simultaneously 552 v1 := make(nat, n) 553 shlVU(v1, v, shift) 554 v = v1 555 } 556 u[len(uIn)] = shlVU(u[0:len(uIn)], uIn, shift) 557 558 // D2. 559 for j := m; j >= 0; j-- { 560 // D3. 561 qhat := Word(_M) 562 if u[j+n] != v[n-1] { 563 var rhat Word 564 qhat, rhat = divWW(u[j+n], u[j+n-1], v[n-1]) 565 566 // x1 | x2 = q̂v_{n-2} 567 x1, x2 := mulWW(qhat, v[n-2]) 568 // test if q̂v_{n-2} > br̂ + u_{j+n-2} 569 for greaterThan(x1, x2, rhat, u[j+n-2]) { 570 qhat-- 571 prevRhat := rhat 572 rhat += v[n-1] 573 // v[n-1] >= 0, so this tests for overflow. 574 if rhat < prevRhat { 575 break 576 } 577 x1, x2 = mulWW(qhat, v[n-2]) 578 } 579 } 580 581 // D4. 582 qhatv[n] = mulAddVWW(qhatv[0:n], v, qhat, 0) 583 584 c := subVV(u[j:j+len(qhatv)], u[j:], qhatv) 585 if c != 0 { 586 c := addVV(u[j:j+n], u[j:], v) 587 u[j+n] += c 588 qhat-- 589 } 590 591 q[j] = qhat 592 } 593 594 q = q.norm() 595 shrVU(u, u, shift) 596 r = u.norm() 597 598 return q, r 599} 600 601// Length of x in bits. x must be normalized. 602func (x nat) bitLen() int { 603 if i := len(x) - 1; i >= 0 { 604 return i*_W + bitLen(x[i]) 605 } 606 return 0 607} 608 609// MaxBase is the largest number base accepted for string conversions. 610const MaxBase = 'z' - 'a' + 10 + 1 // = hexValue('z') + 1 611 612func hexValue(ch rune) Word { 613 d := int(MaxBase + 1) // illegal base 614 switch { 615 case '0' <= ch && ch <= '9': 616 d = int(ch - '0') 617 case 'a' <= ch && ch <= 'z': 618 d = int(ch - 'a' + 10) 619 case 'A' <= ch && ch <= 'Z': 620 d = int(ch - 'A' + 10) 621 } 622 return Word(d) 623} 624 625// scan sets z to the natural number corresponding to the longest possible prefix 626// read from r representing an unsigned integer in a given conversion base. 627// It returns z, the actual conversion base used, and an error, if any. In the 628// error case, the value of z is undefined. The syntax follows the syntax of 629// unsigned integer literals in Go. 630// 631// The base argument must be 0 or a value from 2 through MaxBase. If the base 632// is 0, the string prefix determines the actual conversion base. A prefix of 633// ``0x'' or ``0X'' selects base 16; the ``0'' prefix selects base 8, and a 634// ``0b'' or ``0B'' prefix selects base 2. Otherwise the selected base is 10. 635// 636func (z nat) scan(r io.RuneScanner, base int) (nat, int, error) { 637 // reject illegal bases 638 if base < 0 || base == 1 || MaxBase < base { 639 return z, 0, errors.New("illegal number base") 640 } 641 642 // one char look-ahead 643 ch, _, err := r.ReadRune() 644 if err != nil { 645 return z, 0, err 646 } 647 648 // determine base if necessary 649 b := Word(base) 650 if base == 0 { 651 b = 10 652 if ch == '0' { 653 switch ch, _, err = r.ReadRune(); err { 654 case nil: 655 b = 8 656 switch ch { 657 case 'x', 'X': 658 b = 16 659 case 'b', 'B': 660 b = 2 661 } 662 if b == 2 || b == 16 { 663 if ch, _, err = r.ReadRune(); err != nil { 664 return z, 0, err 665 } 666 } 667 case io.EOF: 668 return z.make(0), 10, nil 669 default: 670 return z, 10, err 671 } 672 } 673 } 674 675 // convert string 676 // - group as many digits d as possible together into a "super-digit" dd with "super-base" bb 677 // - only when bb does not fit into a word anymore, do a full number mulAddWW using bb and dd 678 z = z.make(0) 679 bb := Word(1) 680 dd := Word(0) 681 for max := _M / b; ; { 682 d := hexValue(ch) 683 if d >= b { 684 r.UnreadRune() // ch does not belong to number anymore 685 break 686 } 687 688 if bb <= max { 689 bb *= b 690 dd = dd*b + d 691 } else { 692 // bb * b would overflow 693 z = z.mulAddWW(z, bb, dd) 694 bb = b 695 dd = d 696 } 697 698 if ch, _, err = r.ReadRune(); err != nil { 699 if err != io.EOF { 700 return z, int(b), err 701 } 702 break 703 } 704 } 705 706 switch { 707 case bb > 1: 708 // there was at least one mantissa digit 709 z = z.mulAddWW(z, bb, dd) 710 case base == 0 && b == 8: 711 // there was only the octal prefix 0 (possibly followed by digits > 7); 712 // return base 10, not 8 713 return z, 10, nil 714 case base != 0 || b != 8: 715 // there was neither a mantissa digit nor the octal prefix 0 716 return z, int(b), errors.New("syntax error scanning number") 717 } 718 719 return z.norm(), int(b), nil 720} 721 722// Character sets for string conversion. 723const ( 724 lowercaseDigits = "0123456789abcdefghijklmnopqrstuvwxyz" 725 uppercaseDigits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 726) 727 728// decimalString returns a decimal representation of x. 729// It calls x.string with the charset "0123456789". 730func (x nat) decimalString() string { 731 return x.string(lowercaseDigits[0:10]) 732} 733 734// string converts x to a string using digits from a charset; a digit with 735// value d is represented by charset[d]. The conversion base is determined 736// by len(charset), which must be >= 2 and <= 256. 737func (x nat) string(charset string) string { 738 b := Word(len(charset)) 739 740 // special cases 741 switch { 742 case b < 2 || MaxBase > 256: 743 panic("illegal base") 744 case len(x) == 0: 745 return string(charset[0]) 746 } 747 748 // allocate buffer for conversion 749 i := int(float64(x.bitLen())/math.Log2(float64(b))) + 1 // off by one at most 750 s := make([]byte, i) 751 752 // convert power of two and non power of two bases separately 753 if b == b&-b { 754 // shift is base-b digit size in bits 755 shift := trailingZeroBits(b) // shift > 0 because b >= 2 756 mask := Word(1)<<shift - 1 757 w := x[0] 758 nbits := uint(_W) // number of unprocessed bits in w 759 760 // convert less-significant words 761 for k := 1; k < len(x); k++ { 762 // convert full digits 763 for nbits >= shift { 764 i-- 765 s[i] = charset[w&mask] 766 w >>= shift 767 nbits -= shift 768 } 769 770 // convert any partial leading digit and advance to next word 771 if nbits == 0 { 772 // no partial digit remaining, just advance 773 w = x[k] 774 nbits = _W 775 } else { 776 // partial digit in current (k-1) and next (k) word 777 w |= x[k] << nbits 778 i-- 779 s[i] = charset[w&mask] 780 781 // advance 782 w = x[k] >> (shift - nbits) 783 nbits = _W - (shift - nbits) 784 } 785 } 786 787 // convert digits of most-significant word (omit leading zeros) 788 for nbits >= 0 && w != 0 { 789 i-- 790 s[i] = charset[w&mask] 791 w >>= shift 792 nbits -= shift 793 } 794 795 } else { 796 // determine "big base"; i.e., the largest possible value bb 797 // that is a power of base b and still fits into a Word 798 // (as in 10^19 for 19 decimal digits in a 64bit Word) 799 bb := b // big base is b**ndigits 800 ndigits := 1 // number of base b digits 801 for max := Word(_M / b); bb <= max; bb *= b { 802 ndigits++ // maximize ndigits where bb = b**ndigits, bb <= _M 803 } 804 805 // construct table of successive squares of bb*leafSize to use in subdivisions 806 // result (table != nil) <=> (len(x) > leafSize > 0) 807 table := divisors(len(x), b, ndigits, bb) 808 809 // preserve x, create local copy for use by convertWords 810 q := nat(nil).set(x) 811 812 // convert q to string s in base b 813 q.convertWords(s, charset, b, ndigits, bb, table) 814 815 // strip leading zeros 816 // (x != 0; thus s must contain at least one non-zero digit 817 // and the loop will terminate) 818 i = 0 819 for zero := charset[0]; s[i] == zero; { 820 i++ 821 } 822 } 823 824 return string(s[i:]) 825} 826 827// Convert words of q to base b digits in s. If q is large, it is recursively "split in half" 828// by nat/nat division using tabulated divisors. Otherwise, it is converted iteratively using 829// repeated nat/Word division. 830// 831// The iterative method processes n Words by n divW() calls, each of which visits every Word in the 832// incrementally shortened q for a total of n + (n-1) + (n-2) ... + 2 + 1, or n(n+1)/2 divW()'s. 833// Recursive conversion divides q by its approximate square root, yielding two parts, each half 834// the size of q. Using the iterative method on both halves means 2 * (n/2)(n/2 + 1)/2 divW()'s 835// plus the expensive long div(). Asymptotically, the ratio is favorable at 1/2 the divW()'s, and 836// is made better by splitting the subblocks recursively. Best is to split blocks until one more 837// split would take longer (because of the nat/nat div()) than the twice as many divW()'s of the 838// iterative approach. This threshold is represented by leafSize. Benchmarking of leafSize in the 839// range 2..64 shows that values of 8 and 16 work well, with a 4x speedup at medium lengths and 840// ~30x for 20000 digits. Use nat_test.go's BenchmarkLeafSize tests to optimize leafSize for 841// specific hardware. 842// 843func (q nat) convertWords(s []byte, charset string, b Word, ndigits int, bb Word, table []divisor) { 844 // split larger blocks recursively 845 if table != nil { 846 // len(q) > leafSize > 0 847 var r nat 848 index := len(table) - 1 849 for len(q) > leafSize { 850 // find divisor close to sqrt(q) if possible, but in any case < q 851 maxLength := q.bitLen() // ~= log2 q, or at of least largest possible q of this bit length 852 minLength := maxLength >> 1 // ~= log2 sqrt(q) 853 for index > 0 && table[index-1].nbits > minLength { 854 index-- // desired 855 } 856 if table[index].nbits >= maxLength && table[index].bbb.cmp(q) >= 0 { 857 index-- 858 if index < 0 { 859 panic("internal inconsistency") 860 } 861 } 862 863 // split q into the two digit number (q'*bbb + r) to form independent subblocks 864 q, r = q.div(r, q, table[index].bbb) 865 866 // convert subblocks and collect results in s[:h] and s[h:] 867 h := len(s) - table[index].ndigits 868 r.convertWords(s[h:], charset, b, ndigits, bb, table[0:index]) 869 s = s[:h] // == q.convertWords(s, charset, b, ndigits, bb, table[0:index+1]) 870 } 871 } 872 873 // having split any large blocks now process the remaining (small) block iteratively 874 i := len(s) 875 var r Word 876 if b == 10 { 877 // hard-coding for 10 here speeds this up by 1.25x (allows for / and % by constants) 878 for len(q) > 0 { 879 // extract least significant, base bb "digit" 880 q, r = q.divW(q, bb) 881 for j := 0; j < ndigits && i > 0; j++ { 882 i-- 883 // avoid % computation since r%10 == r - int(r/10)*10; 884 // this appears to be faster for BenchmarkString10000Base10 885 // and smaller strings (but a bit slower for larger ones) 886 t := r / 10 887 s[i] = charset[r-t<<3-t-t] // TODO(gri) replace w/ t*10 once compiler produces better code 888 r = t 889 } 890 } 891 } else { 892 for len(q) > 0 { 893 // extract least significant, base bb "digit" 894 q, r = q.divW(q, bb) 895 for j := 0; j < ndigits && i > 0; j++ { 896 i-- 897 s[i] = charset[r%b] 898 r /= b 899 } 900 } 901 } 902 903 // prepend high-order zeroes 904 zero := charset[0] 905 for i > 0 { // while need more leading zeroes 906 i-- 907 s[i] = zero 908 } 909} 910 911// Split blocks greater than leafSize Words (or set to 0 to disable recursive conversion) 912// Benchmark and configure leafSize using: go test -bench="Leaf" 913// 8 and 16 effective on 3.0 GHz Xeon "Clovertown" CPU (128 byte cache lines) 914// 8 and 16 effective on 2.66 GHz Core 2 Duo "Penryn" CPU 915var leafSize int = 8 // number of Word-size binary values treat as a monolithic block 916 917type divisor struct { 918 bbb nat // divisor 919 nbits int // bit length of divisor (discounting leading zeroes) ~= log2(bbb) 920 ndigits int // digit length of divisor in terms of output base digits 921} 922 923var cacheBase10 struct { 924 sync.Mutex 925 table [64]divisor // cached divisors for base 10 926} 927 928// expWW computes x**y 929func (z nat) expWW(x, y Word) nat { 930 return z.expNN(nat(nil).setWord(x), nat(nil).setWord(y), nil) 931} 932 933// construct table of powers of bb*leafSize to use in subdivisions 934func divisors(m int, b Word, ndigits int, bb Word) []divisor { 935 // only compute table when recursive conversion is enabled and x is large 936 if leafSize == 0 || m <= leafSize { 937 return nil 938 } 939 940 // determine k where (bb**leafSize)**(2**k) >= sqrt(x) 941 k := 1 942 for words := leafSize; words < m>>1 && k < len(cacheBase10.table); words <<= 1 { 943 k++ 944 } 945 946 // reuse and extend existing table of divisors or create new table as appropriate 947 var table []divisor // for b == 10, table overlaps with cacheBase10.table 948 if b == 10 { 949 cacheBase10.Lock() 950 table = cacheBase10.table[0:k] // reuse old table for this conversion 951 } else { 952 table = make([]divisor, k) // create new table for this conversion 953 } 954 955 // extend table 956 if table[k-1].ndigits == 0 { 957 // add new entries as needed 958 var larger nat 959 for i := 0; i < k; i++ { 960 if table[i].ndigits == 0 { 961 if i == 0 { 962 table[0].bbb = nat(nil).expWW(bb, Word(leafSize)) 963 table[0].ndigits = ndigits * leafSize 964 } else { 965 table[i].bbb = nat(nil).mul(table[i-1].bbb, table[i-1].bbb) 966 table[i].ndigits = 2 * table[i-1].ndigits 967 } 968 969 // optimization: exploit aggregated extra bits in macro blocks 970 larger = nat(nil).set(table[i].bbb) 971 for mulAddVWW(larger, larger, b, 0) == 0 { 972 table[i].bbb = table[i].bbb.set(larger) 973 table[i].ndigits++ 974 } 975 976 table[i].nbits = table[i].bbb.bitLen() 977 } 978 } 979 } 980 981 if b == 10 { 982 cacheBase10.Unlock() 983 } 984 985 return table 986} 987 988const deBruijn32 = 0x077CB531 989 990var deBruijn32Lookup = []byte{ 991 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 992 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9, 993} 994 995const deBruijn64 = 0x03f79d71b4ca8b09 996 997var deBruijn64Lookup = []byte{ 998 0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4, 999 62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5, 1000 63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11, 1001 54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6, 1002} 1003 1004// trailingZeroBits returns the number of consecutive least significant zero 1005// bits of x. 1006func trailingZeroBits(x Word) uint { 1007 // x & -x leaves only the right-most bit set in the word. Let k be the 1008 // index of that bit. Since only a single bit is set, the value is two 1009 // to the power of k. Multiplying by a power of two is equivalent to 1010 // left shifting, in this case by k bits. The de Bruijn constant is 1011 // such that all six bit, consecutive substrings are distinct. 1012 // Therefore, if we have a left shifted version of this constant we can 1013 // find by how many bits it was shifted by looking at which six bit 1014 // substring ended up at the top of the word. 1015 // (Knuth, volume 4, section 7.3.1) 1016 switch _W { 1017 case 32: 1018 return uint(deBruijn32Lookup[((x&-x)*deBruijn32)>>27]) 1019 case 64: 1020 return uint(deBruijn64Lookup[((x&-x)*(deBruijn64&_M))>>58]) 1021 default: 1022 panic("unknown word size") 1023 } 1024 1025 return 0 1026} 1027 1028// trailingZeroBits returns the number of consecutive least significant zero 1029// bits of x. 1030func (x nat) trailingZeroBits() uint { 1031 if len(x) == 0 { 1032 return 0 1033 } 1034 var i uint 1035 for x[i] == 0 { 1036 i++ 1037 } 1038 // x[i] != 0 1039 return i*_W + trailingZeroBits(x[i]) 1040} 1041 1042// z = x << s 1043func (z nat) shl(x nat, s uint) nat { 1044 m := len(x) 1045 if m == 0 { 1046 return z.make(0) 1047 } 1048 // m > 0 1049 1050 n := m + int(s/_W) 1051 z = z.make(n + 1) 1052 z[n] = shlVU(z[n-m:n], x, s%_W) 1053 z[0 : n-m].clear() 1054 1055 return z.norm() 1056} 1057 1058// z = x >> s 1059func (z nat) shr(x nat, s uint) nat { 1060 m := len(x) 1061 n := m - int(s/_W) 1062 if n <= 0 { 1063 return z.make(0) 1064 } 1065 // n > 0 1066 1067 z = z.make(n) 1068 shrVU(z, x[m-n:], s%_W) 1069 1070 return z.norm() 1071} 1072 1073func (z nat) setBit(x nat, i uint, b uint) nat { 1074 j := int(i / _W) 1075 m := Word(1) << (i % _W) 1076 n := len(x) 1077 switch b { 1078 case 0: 1079 z = z.make(n) 1080 copy(z, x) 1081 if j >= n { 1082 // no need to grow 1083 return z 1084 } 1085 z[j] &^= m 1086 return z.norm() 1087 case 1: 1088 if j >= n { 1089 z = z.make(j + 1) 1090 z[n:].clear() 1091 } else { 1092 z = z.make(n) 1093 } 1094 copy(z, x) 1095 z[j] |= m 1096 // no need to normalize 1097 return z 1098 } 1099 panic("set bit is not 0 or 1") 1100} 1101 1102func (z nat) bit(i uint) uint { 1103 j := int(i / _W) 1104 if j >= len(z) { 1105 return 0 1106 } 1107 return uint(z[j] >> (i % _W) & 1) 1108} 1109 1110func (z nat) and(x, y nat) nat { 1111 m := len(x) 1112 n := len(y) 1113 if m > n { 1114 m = n 1115 } 1116 // m <= n 1117 1118 z = z.make(m) 1119 for i := 0; i < m; i++ { 1120 z[i] = x[i] & y[i] 1121 } 1122 1123 return z.norm() 1124} 1125 1126func (z nat) andNot(x, y nat) nat { 1127 m := len(x) 1128 n := len(y) 1129 if n > m { 1130 n = m 1131 } 1132 // m >= n 1133 1134 z = z.make(m) 1135 for i := 0; i < n; i++ { 1136 z[i] = x[i] &^ y[i] 1137 } 1138 copy(z[n:m], x[n:m]) 1139 1140 return z.norm() 1141} 1142 1143func (z nat) or(x, y nat) nat { 1144 m := len(x) 1145 n := len(y) 1146 s := x 1147 if m < n { 1148 n, m = m, n 1149 s = y 1150 } 1151 // m >= n 1152 1153 z = z.make(m) 1154 for i := 0; i < n; i++ { 1155 z[i] = x[i] | y[i] 1156 } 1157 copy(z[n:m], s[n:m]) 1158 1159 return z.norm() 1160} 1161 1162func (z nat) xor(x, y nat) nat { 1163 m := len(x) 1164 n := len(y) 1165 s := x 1166 if m < n { 1167 n, m = m, n 1168 s = y 1169 } 1170 // m >= n 1171 1172 z = z.make(m) 1173 for i := 0; i < n; i++ { 1174 z[i] = x[i] ^ y[i] 1175 } 1176 copy(z[n:m], s[n:m]) 1177 1178 return z.norm() 1179} 1180 1181// greaterThan returns true iff (x1<<_W + x2) > (y1<<_W + y2) 1182func greaterThan(x1, x2, y1, y2 Word) bool { 1183 return x1 > y1 || x1 == y1 && x2 > y2 1184} 1185 1186// modW returns x % d. 1187func (x nat) modW(d Word) (r Word) { 1188 // TODO(agl): we don't actually need to store the q value. 1189 var q nat 1190 q = q.make(len(x)) 1191 return divWVW(q, 0, x, d) 1192} 1193 1194// random creates a random integer in [0..limit), using the space in z if 1195// possible. n is the bit length of limit. 1196func (z nat) random(rand *rand.Rand, limit nat, n int) nat { 1197 if alias(z, limit) { 1198 z = nil // z is an alias for limit - cannot reuse 1199 } 1200 z = z.make(len(limit)) 1201 1202 bitLengthOfMSW := uint(n % _W) 1203 if bitLengthOfMSW == 0 { 1204 bitLengthOfMSW = _W 1205 } 1206 mask := Word((1 << bitLengthOfMSW) - 1) 1207 1208 for { 1209 switch _W { 1210 case 32: 1211 for i := range z { 1212 z[i] = Word(rand.Uint32()) 1213 } 1214 case 64: 1215 for i := range z { 1216 z[i] = Word(rand.Uint32()) | Word(rand.Uint32())<<32 1217 } 1218 default: 1219 panic("unknown word size") 1220 } 1221 z[len(limit)-1] &= mask 1222 if z.cmp(limit) < 0 { 1223 break 1224 } 1225 } 1226 1227 return z.norm() 1228} 1229 1230// If m != 0 (i.e., len(m) != 0), expNN sets z to x**y mod m; 1231// otherwise it sets z to x**y. The result is the value of z. 1232func (z nat) expNN(x, y, m nat) nat { 1233 if alias(z, x) || alias(z, y) { 1234 // We cannot allow in-place modification of x or y. 1235 z = nil 1236 } 1237 1238 if len(y) == 0 { 1239 z = z.make(1) 1240 z[0] = 1 1241 return z 1242 } 1243 // y > 0 1244 1245 if len(m) != 0 { 1246 // We likely end up being as long as the modulus. 1247 z = z.make(len(m)) 1248 } 1249 z = z.set(x) 1250 1251 // If the base is non-trivial and the exponent is large, we use 1252 // 4-bit, windowed exponentiation. This involves precomputing 14 values 1253 // (x^2...x^15) but then reduces the number of multiply-reduces by a 1254 // third. Even for a 32-bit exponent, this reduces the number of 1255 // operations. 1256 if len(x) > 1 && len(y) > 1 && len(m) > 0 { 1257 return z.expNNWindowed(x, y, m) 1258 } 1259 1260 v := y[len(y)-1] // v > 0 because y is normalized and y > 0 1261 shift := leadingZeros(v) + 1 1262 v <<= shift 1263 var q nat 1264 1265 const mask = 1 << (_W - 1) 1266 1267 // We walk through the bits of the exponent one by one. Each time we 1268 // see a bit, we square, thus doubling the power. If the bit is a one, 1269 // we also multiply by x, thus adding one to the power. 1270 1271 w := _W - int(shift) 1272 // zz and r are used to avoid allocating in mul and div as 1273 // otherwise the arguments would alias. 1274 var zz, r nat 1275 for j := 0; j < w; j++ { 1276 zz = zz.mul(z, z) 1277 zz, z = z, zz 1278 1279 if v&mask != 0 { 1280 zz = zz.mul(z, x) 1281 zz, z = z, zz 1282 } 1283 1284 if len(m) != 0 { 1285 zz, r = zz.div(r, z, m) 1286 zz, r, q, z = q, z, zz, r 1287 } 1288 1289 v <<= 1 1290 } 1291 1292 for i := len(y) - 2; i >= 0; i-- { 1293 v = y[i] 1294 1295 for j := 0; j < _W; j++ { 1296 zz = zz.mul(z, z) 1297 zz, z = z, zz 1298 1299 if v&mask != 0 { 1300 zz = zz.mul(z, x) 1301 zz, z = z, zz 1302 } 1303 1304 if len(m) != 0 { 1305 zz, r = zz.div(r, z, m) 1306 zz, r, q, z = q, z, zz, r 1307 } 1308 1309 v <<= 1 1310 } 1311 } 1312 1313 return z.norm() 1314} 1315 1316// expNNWindowed calculates x**y mod m using a fixed, 4-bit window. 1317func (z nat) expNNWindowed(x, y, m nat) nat { 1318 // zz and r are used to avoid allocating in mul and div as otherwise 1319 // the arguments would alias. 1320 var zz, r nat 1321 1322 const n = 4 1323 // powers[i] contains x^i. 1324 var powers [1 << n]nat 1325 powers[0] = natOne 1326 powers[1] = x 1327 for i := 2; i < 1<<n; i += 2 { 1328 p2, p, p1 := &powers[i/2], &powers[i], &powers[i+1] 1329 *p = p.mul(*p2, *p2) 1330 zz, r = zz.div(r, *p, m) 1331 *p, r = r, *p 1332 *p1 = p1.mul(*p, x) 1333 zz, r = zz.div(r, *p1, m) 1334 *p1, r = r, *p1 1335 } 1336 1337 z = z.setWord(1) 1338 1339 for i := len(y) - 1; i >= 0; i-- { 1340 yi := y[i] 1341 for j := 0; j < _W; j += n { 1342 if i != len(y)-1 || j != 0 { 1343 // Unrolled loop for significant performance 1344 // gain. Use go test -bench=".*" in crypto/rsa 1345 // to check performance before making changes. 1346 zz = zz.mul(z, z) 1347 zz, z = z, zz 1348 zz, r = zz.div(r, z, m) 1349 z, r = r, z 1350 1351 zz = zz.mul(z, z) 1352 zz, z = z, zz 1353 zz, r = zz.div(r, z, m) 1354 z, r = r, z 1355 1356 zz = zz.mul(z, z) 1357 zz, z = z, zz 1358 zz, r = zz.div(r, z, m) 1359 z, r = r, z 1360 1361 zz = zz.mul(z, z) 1362 zz, z = z, zz 1363 zz, r = zz.div(r, z, m) 1364 z, r = r, z 1365 } 1366 1367 zz = zz.mul(z, powers[yi>>(_W-n)]) 1368 zz, z = z, zz 1369 zz, r = zz.div(r, z, m) 1370 z, r = r, z 1371 1372 yi <<= n 1373 } 1374 } 1375 1376 return z.norm() 1377} 1378 1379// probablyPrime performs reps Miller-Rabin tests to check whether n is prime. 1380// If it returns true, n is prime with probability 1 - 1/4^reps. 1381// If it returns false, n is not prime. 1382func (n nat) probablyPrime(reps int) bool { 1383 if len(n) == 0 { 1384 return false 1385 } 1386 1387 if len(n) == 1 { 1388 if n[0] < 2 { 1389 return false 1390 } 1391 1392 if n[0]%2 == 0 { 1393 return n[0] == 2 1394 } 1395 1396 // We have to exclude these cases because we reject all 1397 // multiples of these numbers below. 1398 switch n[0] { 1399 case 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53: 1400 return true 1401 } 1402 } 1403 1404 const primesProduct32 = 0xC0CFD797 // Π {p ∈ primes, 2 < p <= 29} 1405 const primesProduct64 = 0xE221F97C30E94E1D // Π {p ∈ primes, 2 < p <= 53} 1406 1407 var r Word 1408 switch _W { 1409 case 32: 1410 r = n.modW(primesProduct32) 1411 case 64: 1412 r = n.modW(primesProduct64 & _M) 1413 default: 1414 panic("Unknown word size") 1415 } 1416 1417 if r%3 == 0 || r%5 == 0 || r%7 == 0 || r%11 == 0 || 1418 r%13 == 0 || r%17 == 0 || r%19 == 0 || r%23 == 0 || r%29 == 0 { 1419 return false 1420 } 1421 1422 if _W == 64 && (r%31 == 0 || r%37 == 0 || r%41 == 0 || 1423 r%43 == 0 || r%47 == 0 || r%53 == 0) { 1424 return false 1425 } 1426 1427 nm1 := nat(nil).sub(n, natOne) 1428 // determine q, k such that nm1 = q << k 1429 k := nm1.trailingZeroBits() 1430 q := nat(nil).shr(nm1, k) 1431 1432 nm3 := nat(nil).sub(nm1, natTwo) 1433 rand := rand.New(rand.NewSource(int64(n[0]))) 1434 1435 var x, y, quotient nat 1436 nm3Len := nm3.bitLen() 1437 1438NextRandom: 1439 for i := 0; i < reps; i++ { 1440 x = x.random(rand, nm3, nm3Len) 1441 x = x.add(x, natTwo) 1442 y = y.expNN(x, q, n) 1443 if y.cmp(natOne) == 0 || y.cmp(nm1) == 0 { 1444 continue 1445 } 1446 for j := uint(1); j < k; j++ { 1447 y = y.mul(y, y) 1448 quotient, y = quotient.div(y, y, n) 1449 if y.cmp(nm1) == 0 { 1450 continue NextRandom 1451 } 1452 if y.cmp(natOne) == 0 { 1453 return false 1454 } 1455 } 1456 return false 1457 } 1458 1459 return true 1460} 1461 1462// bytes writes the value of z into buf using big-endian encoding. 1463// len(buf) must be >= len(z)*_S. The value of z is encoded in the 1464// slice buf[i:]. The number i of unused bytes at the beginning of 1465// buf is returned as result. 1466func (z nat) bytes(buf []byte) (i int) { 1467 i = len(buf) 1468 for _, d := range z { 1469 for j := 0; j < _S; j++ { 1470 i-- 1471 buf[i] = byte(d) 1472 d >>= 8 1473 } 1474 } 1475 1476 for i < len(buf) && buf[i] == 0 { 1477 i++ 1478 } 1479 1480 return 1481} 1482 1483// setBytes interprets buf as the bytes of a big-endian unsigned 1484// integer, sets z to that value, and returns z. 1485func (z nat) setBytes(buf []byte) nat { 1486 z = z.make((len(buf) + _S - 1) / _S) 1487 1488 k := 0 1489 s := uint(0) 1490 var d Word 1491 for i := len(buf); i > 0; i-- { 1492 d |= Word(buf[i-1]) << s 1493 if s += 8; s == _S*8 { 1494 z[k] = d 1495 k++ 1496 s = 0 1497 d = 0 1498 } 1499 } 1500 if k < len(z) { 1501 z[k] = d 1502 } 1503 1504 return z.norm() 1505} 1506