1 /* hgcd.c. 2 3 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY 4 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 5 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 6 7 Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc. 8 9 This file is part of the GNU MP Library. 10 11 The GNU MP Library is free software; you can redistribute it and/or modify 12 it under the terms of the GNU Lesser General Public License as published by 13 the Free Software Foundation; either version 3 of the License, or (at your 14 option) any later version. 15 16 The GNU MP Library is distributed in the hope that it will be useful, but 17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 18 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 19 License for more details. 20 21 You should have received a copy of the GNU Lesser General Public License 22 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 23 24 #include "gmp.h" 25 #include "gmp-impl.h" 26 #include "longlong.h" 27 28 /* For input of size n, matrix elements are of size at most ceil(n/2) 29 - 1, but we need two limbs extra. */ 30 void 31 mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p) 32 { 33 mp_size_t s = (n+1)/2 + 1; 34 M->alloc = s; 35 M->n = 1; 36 MPN_ZERO (p, 4 * s); 37 M->p[0][0] = p; 38 M->p[0][1] = p + s; 39 M->p[1][0] = p + 2 * s; 40 M->p[1][1] = p + 3 * s; 41 42 M->p[0][0][0] = M->p[1][1][0] = 1; 43 } 44 45 /* Updated column COL, adding in column (1-COL). 
 */
static void
hgcd_matrix_update_1 (struct hgcd_matrix *M, unsigned col)
{
  mp_limb_t c0, c1;
  ASSERT (col < 2);

  /* Row-wise: M[i][col] = M[i][0] + M[i][1], over the common size M->n. */
  c0 = mpn_add_n (M->p[0][col], M->p[0][0], M->p[0][1], M->n);
  c1 = mpn_add_n (M->p[1][col], M->p[1][0], M->p[1][1], M->n);

  /* Store the (possibly zero) carry limbs just above the current size;
     the areas were zero-initialized, so this is always safe to write. */
  M->p[0][col][M->n] = c0;
  M->p[1][col][M->n] = c1;

  /* Grow the common element size by one limb iff either row carried. */
  M->n += (c0 | c1) != 0;
  ASSERT (M->n < M->alloc);
}

/* Update column COL, adding in column Q * (1-COL). Temporary
 * storage: qn + n <= M->alloc, where n is the size of the largest
 * element in column 1 - COL. */
static void
hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn,
		      unsigned col, mp_ptr tp)
{
  ASSERT (col < 2);

  if (qn == 1)
    {
      /* Single-limb quotient: M[i][col] += q * M[i][1-col], no
	 temporary storage needed. */
      mp_limb_t q = qp[0];
      mp_limb_t c0, c1;

      c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q);
      c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q);

      M->p[0][col][M->n] = c0;
      M->p[1][col][M->n] = c1;

      M->n += (c0 | c1) != 0;
    }
  else
    {
      unsigned row;

      /* Carries for the unlikely case that we get both high words
	 from the multiplication and carries from the addition. */
      mp_limb_t c[2];
      mp_size_t n;

      /* The matrix will not necessarily grow in size by qn, so we
	 need normalization in order not to overflow M.
      */

      /* Find the normalized size n of the source column 1-COL: scan
	 down from M->n until a nonzero high limb is seen in either row. */
      for (n = M->n; n + qn > M->n; n--)
	{
	  ASSERT (n > 0);
	  if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0)
	    break;
	}

      ASSERT (qn + n <= M->alloc);

      for (row = 0; row < 2; row++)
	{
	  /* tp <- q * M[row][1-col]; mpn_mul wants the larger operand
	     first, hence the branch on qn <= n. */
	  if (qn <= n)
	    mpn_mul (tp, M->p[row][1-col], n, qp, qn);
	  else
	    mpn_mul (tp, qp, qn, M->p[row][1-col], n);

	  ASSERT (n + qn >= M->n);
	  /* M[row][col] += tp, with the product being the longer operand. */
	  c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n);
	}
      if (c[0] | c[1])
	{
	  /* A carry out of the addition: one extra limb beyond n + qn. */
	  M->n = n + qn + 1;
	  M->p[0][col][M->n - 1] = c[0];
	  M->p[1][col][M->n - 1] = c[1];
	}
      else
	{
	  /* No carry: normalize away a possible zero high limb, and
	     never let M->n shrink below the other column's size. */
	  n += qn;
	  n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0;
	  if (n > M->n)
	    M->n = n;
	}
    }

  ASSERT (M->n < M->alloc);
}

/* Multiply M by M1 from the right. Since the M1 elements fit in
   GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs
   temporary space M->n */
static void
hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1,
		   mp_ptr tp)
{
  mp_size_t n0, n1;

  /* Could avoid copy by some swapping of pointers.  Each call
     replaces one row of M by (row) * M1, using tp to hold the
     original first element of the row. */
  MPN_COPY (tp, M->p[0][0], M->n);
  n0 = mpn_hgcd_mul_matrix1_vector (M1, M->p[0][0], tp, M->p[0][1], M->n);
  MPN_COPY (tp, M->p[1][0], M->n);
  n1 = mpn_hgcd_mul_matrix1_vector (M1, M->p[1][0], tp, M->p[1][1], M->n);

  /* Depends on zero initialization */
  M->n = MAX(n0, n1);
  ASSERT (M->n < M->alloc);
}

/* Perform a few steps, using some of mpn_hgcd2, subtraction and
   division. Reduces the size by almost one limb or more, but never
   below the given size s. Return new size for a and b, or 0 if no
   more steps are possible.

   If hgcd2 succeeds, needs temporary space for hgcd_matrix_mul_1, M->n
   limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2
   fails, needs space for the quotient, qn <= n - s + 1 limbs, and for
   hgcd_matrix_update_q, qn + (size of the appropriate column of M) <=
   resulting size of M.
   If N is the input size to the calling hgcd, then s = floor(N/2) +
   1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1
   < N, so N is sufficient.
*/

static mp_size_t
hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
	   struct hgcd_matrix *M, mp_ptr tp)
{
  struct hgcd_matrix1 M1;
  mp_limb_t mask;
  mp_limb_t ah, al, bh, bl;
  mp_size_t an, bn, qn;
  int col;

  ASSERT (n > s);

  mask = ap[n-1] | bp[n-1];
  ASSERT (mask > 0);

  /* Extract the two most significant limbs of a and b into (ah, al)
     and (bh, bl), shifted left so that the high bit of the larger
     operand is set; these feed mpn_hgcd2. */
  if (n == s + 1)
    {
      /* Can't shift in limbs from below index s, so give up early if
	 the top limbs are too small for hgcd2 to make progress. */
      if (mask < 4)
	goto subtract;

      ah = ap[n-1]; al = ap[n-2];
      bh = bp[n-1]; bl = bp[n-2];
    }
  else if (mask & GMP_NUMB_HIGHBIT)
    {
      /* Already normalized. */
      ah = ap[n-1]; al = ap[n-2];
      bh = bp[n-1]; bl = bp[n-2];
    }
  else
    {
      int shift;

      count_leading_zeros (shift, mask);
      ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
      al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
      bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
      bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
    }

  /* Try an mpn_hgcd2 step */
  if (mpn_hgcd2 (ah, al, bh, bl, &M1))
    {
      /* Multiply M <- M * M1 */
      hgcd_matrix_mul_1 (M, &M1, tp);

      /* Can't swap inputs, so we need to copy. */
      MPN_COPY (tp, ap, n);
      /* Multiply M1^{-1} (a;b) */
      return mpn_hgcd_mul_matrix1_inverse_vector (&M1, ap, tp, bp, n);
    }

 subtract:
  /* There are two ways in which mpn_hgcd2 can fail. Either one of ah and
     bh was too small, or ah, bh were (almost) equal. Perform one
     subtraction step (for possible cancellation of high limbs),
     followed by one division. */

  /* Since we must ensure that #(a-b) > s, we handle cancellation of
     high limbs explicitly up front. (FIXME: Or is it better to just
     subtract, normalize, and use an addition to undo if it turns out
     the difference is too small?)
   */
  /* an = index just above the highest limb where a and b differ. */
  for (an = n; an > s; an--)
    if (ap[an-1] != bp[an-1])
      break;

  if (an == s)
    /* a and b agree on all limbs above s; no reduction possible. */
    return 0;

  /* Maintain a > b. When needed, swap a and b, and let col keep track
     of how to update M. */
  if (ap[an-1] > bp[an-1])
    {
      /* a is largest. In the subtraction step, we need to update
	 column 1 of M */
      col = 1;
    }
  else
    {
      MP_PTR_SWAP (ap, bp);
      col = 0;
    }

  bn = n;
  MPN_NORMALIZE (bp, bn);
  if (bn <= s)
    return 0;

  /* We have #a, #b > s. When is it possible that #(a-b) < s? For
     cancellation to happen, the numbers must be of the form

       a = x + 1, 0,            ..., 0,            al
       b = x    , GMP_NUMB_MAX, ..., GMP_NUMB_MAX, bl

     where al, bl denotes the least significant k limbs. If al < bl,
     then #(a-b) < k, and if also high(al) != 0, high(bl) != GMP_NUMB_MAX,
     then #(a-b) = k. If al >= bl, then #(a-b) = k + 1. */

  if (ap[an-1] == bp[an-1] + 1)
    {
      mp_size_t k;
      int c;
      /* Find k = first index (from the top) where the pattern
	 a = ...,0,...  b = ...,GMP_NUMB_MAX,... is broken. */
      for (k = an-1; k > s; k--)
	if (ap[k-1] != 0 || bp[k-1] != GMP_NUMB_MAX)
	  break;

      MPN_CMP (c, ap, bp, k);
      if (c < 0)
	{
	  mp_limb_t cy;

	  /* The limbs from k and up are cancelled. */
	  if (k == s)
	    return 0;
	  cy = mpn_sub_n (ap, ap, bp, k);
	  ASSERT (cy == 1);
	  an = k;
	}
      else
	{
	  /* No cancellation below k; the difference is 1 at limb k. */
	  ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, k));
	  ap[k] = 1;
	  an = k + 1;
	}
    }
  else
    ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, an));

  ASSERT (an > s);
  ASSERT (ap[an-1] > 0);
  ASSERT (bn > s);
  ASSERT (bp[bn-1] > 0);

  /* Record the subtraction a <- a - b in M. */
  hgcd_matrix_update_1 (M, col);

  /* Re-establish a >= b before dividing, tracking swaps in col. */
  if (an < bn)
    {
      MPN_PTR_SWAP (ap, an, bp, bn);
      col ^= 1;
    }
  else if (an == bn)
    {
      int c;
      MPN_CMP (c, ap, bp, an);
      if (c < 0)
	{
	  MP_PTR_SWAP (ap, bp);
	  col ^= 1;
	}
    }

  /* Divide a / b.
   */
  qn = an + 1 - bn;

  /* FIXME: We could use an approximate division, that may return a
     too small quotient, and only guarantee that the size of r is
     almost the size of b. FIXME: Let ap and remainder overlap. */
  mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn);
  qn -= (tp[qn -1] == 0);

  /* Normalize remainder */
  an = bn;
  for ( ; an > s; an--)
    if (ap[an-1] > 0)
      break;

  if (an <= s)
    {
      /* Quotient is too large: the remainder dropped below the
	 allowed size s, so back off by one: a <- a + b, q <- q - 1. */
      mp_limb_t cy;

      cy = mpn_add (ap, bp, bn, ap, an);

      if (cy > 0)
	{
	  ASSERT (bn < n);
	  ap[bn] = cy;
	  bp[bn] = 0;
	  bn++;
	}

      MPN_DECR_U (tp, qn, 1);
      qn -= (tp[qn-1] == 0);
    }

  /* Record the division step a <- a - q b in M (q may have become
     zero after the back-off above). */
  if (qn > 0)
    hgcd_matrix_update_q (M, tp, qn, col, tp + qn);

  return bn;
}

/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
   with elements of size at most (n+1)/2 - 1. Returns new size of a,
   b, or zero if no reduction is possible. */
mp_size_t
mpn_hgcd_lehmer (mp_ptr ap, mp_ptr bp, mp_size_t n,
		 struct hgcd_matrix *M, mp_ptr tp)
{
  mp_size_t s = n/2 + 1;
  mp_size_t nn;

  ASSERT (n > s);
  ASSERT (ap[n-1] > 0 || bp[n-1] > 0);

  /* First step is special: a failure here means no reduction at all. */
  nn = hgcd_step (n, ap, bp, s, M, tp);
  if (!nn)
    return 0;

  /* Iterate until no further step is possible; return the last size. */
  for (;;)
    {
      n = nn;
      ASSERT (n > s);
      nn = hgcd_step (n, ap, bp, s, M, tp);
      if (!nn )
	return n;
    }
}

/* Multiply M by M1 from the right. Needs 3*(M->n + M1->n) + 5 limbs
   of temporary storage (see mpn_matrix22_mul_itch). */
void
mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1,
		     mp_ptr tp)
{
  mp_size_t n;

  /* About the new size of M:s elements. Since M1's diagonal elements
     are > 0, no element can decrease. The new elements are of size
     M->n + M1->n, one limb more or less. The computation of the
     matrix product produces elements of size M->n + M1->n + 1.
But
     the true size, after normalization, may be three limbs smaller.

     The reason that the product has normalized size >= M->n + M1->n -
     2 is subtle. It depends on the fact that M and M1 can be factored
     as products of (1,1; 0,1) and (1,0; 1,1), and that we can't have
     M ending with a large power and M1 starting with a large power of
     the same matrix. */

  /* FIXME: Strassen multiplication gives only a small speedup. In FFT
     multiplication range, this function could be sped up quite a lot
     using invariance. */
  ASSERT (M->n + M1->n < M->alloc);

  ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1]
	   | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0);

  ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1]
	   | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0);

  mpn_matrix22_mul (M->p[0][0], M->p[0][1],
		    M->p[1][0], M->p[1][1], M->n,
		    M1->p[0][0], M1->p[0][1],
		    M1->p[1][0], M1->p[1][1], M1->n, tp);

  /* Index of last potentially non-zero limb, size is one greater. */
  n = M->n + M1->n;

  /* Normalize: the product may be up to three limbs smaller (see the
     comment above), so strip at most three all-zero top limbs. */
  n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
  n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
  n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);

  ASSERT ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) > 0);

  M->n = n + 1;
}

/* Multiplies the least significant p limbs of (a;b) by M^-1.
   Temporary space needed: 2 * (p + M->n) */
mp_size_t
mpn_hgcd_matrix_adjust (struct hgcd_matrix *M,
			mp_size_t n, mp_ptr ap, mp_ptr bp,
			mp_size_t p, mp_ptr tp)
{
  /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a ; b)
     = (r11 a - r01 b; - r10 a + r00 b) */

  mp_ptr t0 = tp;
  mp_ptr t1 = tp + p + M->n;
  mp_limb_t ah, bh;
  mp_limb_t cy;

  ASSERT (p + M->n < n);

  /* First compute the two values depending on a, before overwriting a */

  /* mpn_mul wants the larger operand first, hence the branch. */
  if (M->n >= p)
    {
      mpn_mul (t0, M->p[1][1], M->n, ap, p);
      mpn_mul (t1, M->p[1][0], M->n, ap, p);
    }
  else
    {
      mpn_mul (t0, ap, p, M->p[1][1], M->n);
      mpn_mul (t1, ap, p, M->p[1][0], M->n);
    }

  /* Update a: low p limbs replaced by r11 * a_low, high part gets the
     product's upper limbs added in; ah accumulates the carry out. */
  MPN_COPY (ap, t0, p);
  ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n);

  if (M->n >= p)
    mpn_mul (t0, M->p[0][1], M->n, bp, p);
  else
    mpn_mul (t0, bp, p, M->p[0][1], M->n);

  /* a -= r01 * b_low; the borrow must be covered by the earlier carry. */
  cy = mpn_sub (ap, ap, n, t0, p + M->n);
  ASSERT (cy <= ah);
  ah -= cy;

  /* Update b: symmetric, b <- r00 * b_low + b_high - r10 * a_low
     (t1 still holds r10 * a_low from above). */
  if (M->n >= p)
    mpn_mul (t0, M->p[0][0], M->n, bp, p);
  else
    mpn_mul (t0, bp, p, M->p[0][0], M->n);

  MPN_COPY (bp, t0, p);
  bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n);
  cy = mpn_sub (bp, bp, n, t1, p + M->n);
  ASSERT (cy <= bh);
  bh -= cy;

  if (ah > 0 || bh > 0)
    {
      /* A net carry survived; the result grew by one limb. */
      ap[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* The subtraction can reduce the size by at most one limb. */
      if (ap[n-1] == 0 && bp[n-1] == 0)
	n--;
    }
  ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
  return n;
}

/* Size analysis for hgcd:

   For the recursive calls, we have n1 <= ceil(n / 2). Then the
   storage need is determined by the storage for the recursive call
   computing M1, and hgcd_matrix_adjust and hgcd_matrix_mul calls that use M1
   (after this, the storage needed for M1 can be recycled).

   Let S(r) denote the required storage.
For M1 we need 4 * (ceil(n1/2) + 1)
   = 4 * (ceil(n/4) + 1), for the hgcd_matrix_adjust call, we need n + 2,
   and for the hgcd_matrix_mul, we may need 3 ceil(n/2) + 8. In total,
   4 * ceil(n/4) + 3 ceil(n/2) + 12 <= 10 ceil(n/4) + 12.

   For the recursive call, we need S(n1) = S(ceil(n/2)).

   S(n) <= 10*ceil(n/4) + 12 + S(ceil(n/2))
	<= 10*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 12k + S(ceil(n/2^k))
	<= 10*(2 ceil(n/4) + k) + 12k + S(ceil(n/2^k))
	<= 20 ceil(n/4) + 22k + S(ceil(n/2^k))
*/

/* Scratch space (in limbs) needed by mpn_hgcd for an input of size n,
   following the size analysis above. */
mp_size_t
mpn_hgcd_itch (mp_size_t n)
{
  unsigned k;
  int count;
  mp_size_t nscaled;

  if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
    return MPN_HGCD_LEHMER_ITCH (n);

  /* Get the recursion depth. */
  nscaled = (n - 1) / (HGCD_THRESHOLD - 1);
  count_leading_zeros (count, nscaled);
  k = GMP_LIMB_BITS - count;

  return 20 * ((n+3) / 4) + 22 * k
    + MPN_HGCD_LEHMER_ITCH (HGCD_THRESHOLD);
}

/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
   with elements of size at most (n+1)/2 - 1. Returns new size of a,
   b, or zero if no reduction is possible. */

mp_size_t
mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n,
	  struct hgcd_matrix *M, mp_ptr tp)
{
  mp_size_t s = n/2 + 1;
  mp_size_t n2 = (3*n)/4 + 1;

  mp_size_t p, nn;
  int success = 0;

  if (n <= s)
    /* Happens when n <= 2, a fairly uninteresting case but exercised
       by the random inputs of the testsuite. */
    return 0;

  ASSERT ((ap[n-1] | bp[n-1]) > 0);

  ASSERT ((n+1)/2 - 1 < M->alloc);

  if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
    return mpn_hgcd_lehmer (ap, bp, n, M, tp);

  /* First recursive call: reduce the high n - p limbs, then apply the
     resulting matrix inverse to the low p limbs. */
  p = n/2;
  nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
  if (nn > 0)
    {
      /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
	 = 2 (n - 1) */
      n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
      success = 1;
    }
  /* Single steps until the size drops to about 3n/4, which makes the
     second recursive call worthwhile. */
  while (n > n2)
    {
      /* Needs n + 1 storage */
      nn = hgcd_step (n, ap, bp, s, M, tp);
      if (!nn)
	return success ? n : 0;
      n = nn;
      success = 1;
    }

  if (n > s + 2)
    {
      struct hgcd_matrix M1;
      mp_size_t scratch;

      /* Second recursive call, on the high n - p limbs, with its own
	 matrix M1 that is afterwards folded into M. */
      p = 2*s - n + 1;
      scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);

      mpn_hgcd_matrix_init (&M1, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch);
      if (nn > 0)
	{
	  /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
	  ASSERT (M->n + 2 >= M1.n);

	  /* Furthermore, assume M ends with a quotient (1, q; 0, 1),
	     then either q or q + 1 is a correct quotient, and M1 will
	     start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
	     rules out the case that the size of M * M1 is much
	     smaller than the expected M->n + M1->n. */

	  ASSERT (M->n + M1.n < M->alloc);

	  /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
	     = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
	  n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);

	  /* We need a bound for M->n + M1.n. Let n be the original
	     input size. Then

	       ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2

	     and it follows that

	       M.n + M1.n <= ceil(n/2) + 1

	     Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the
	     amount of needed scratch space. */
	  mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
	  success = 1;
	}
    }

  /* This really is the base case */
  for (;;)
    {
      /* Needs s+3 < n */
      nn = hgcd_step (n, ap, bp, s, M, tp);
      if (!nn)
	return success ? n : 0;

      n = nn;
      success = 1;
    }
}