/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static mp_size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
	n--;
    }

  return n;
}
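/* Illustrative note, added for clarity (not in the original source):
   writing M = (u00, u01; u10, u11), the update above is the
   row-vector product

     (r, b') = (a, b) M = (u00 a + u10 b, u01 a + u11 b).

   For example, M = (1, 1; 0, 1) gives r = a and b' = a + b.  Each
   mpn_mul product occupies n + M->n limbs, and each mpn_add_n can
   carry into one limb more, which is where the n + M->n +/- 1 bound
   on the result size comes from. */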
#define COMPUTE_V_ITCH(n) (2*(n) + 1)

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. a, b are of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);

  an = n;
  MPN_NORMALIZE (ap, an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  ASSERT (gn <= size);

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* usize < 0 */
      /* |v| = v = (g - u a) / b = (g + |u| a) / b */
      mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
      if (cy)
	tp[size++] = cy;
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  /* FIXME: Use divexact. Or do the entire calculation mod 2^{n *
     GMP_NUMB_BITS}. */
  mpn_tdiv_qr (vp, tp, 0, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  /* Remainder must be zero */
#if WANT_ASSERT
  {
    mp_size_t i;
    for (i = 0; i < bn; i++)
      {
	ASSERT (tp[i] == 0);
      }
  }
#endif
  return vn;
}
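/* Illustrative note, added for clarity (not in the original source):
   the division above is exact because of the Bezout relation
   g = u a + v b, which makes g - u a = v b a multiple of b.  A small
   numeric sketch: a = 12, b = 18, g = 6 = (-1)*12 + 1*18, so with
   u = -1 (usize < 0) the code computes |v| = (g + |u| a) / b
   = (6 + 12) / 18 = 1, and v = 1 has the sign opposite to u. */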
/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and
   2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
   + 1 for v and 2T + 1 scratch space. In all, 7T + 3 is sufficient.
 */

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the cofactors u0 and u1. It may be desirable to use
 * a different choice of p in each iteration. Also the input size
 * seems to matter; choosing p = n / 3 in the first iteration seems to
 * improve performance slightly for input sizes just above the
 * threshold, but degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)
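/* A minimal usage sketch, added for illustration only (not part of
   the original source).  It assumes the mpn calling conventions from
   the GMP manual: an >= n > 0, bp[n-1] != 0, both source operands are
   destroyed, {gp,n} receives the gcd, and {up,n+1} receives a
   cofactor whose signed limb count is stored in *usizep. */
#if 0
static void
gcdext_usage_sketch (void)
{
  mp_limb_t a[1] = { 30 };	/* least significant limb first */
  mp_limb_t b[1] = { 18 };
  mp_limb_t g[1];
  mp_limb_t u[2];
  mp_size_t usize;
  mp_size_t gn;

  gn = mpn_gcdext (g, u, &usize, a, 1, b, 1);
  /* {g,gn} is now gcd(30,18) = 6, and {u,|usize|} a cofactor s with
     30 s == 6 (mod 18); s is negative when usize < 0.  One valid
     answer is s = -1, since 6 = (-1)*30 + 2*18. */
}
#endif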
mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ( (u0[un-1] | u1[un-1] ) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed. Then either one of a or b is very
	   small, or the difference is very small. Perform one
	   subtraction followed by one division. */
	mp_size_t gn;
	mp_size_t updated_un = 1;

	u1[0] = 1;

	/* Temporary storage 2n + 1 */
	n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				    u0, u1, &updated_un, tp, tp + n);
	if (n == 0)
	  {
	    TMP_FREE;
	    return gn;
	  }

	un = updated_un;
	ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers. May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  mp_size_t gn;
	  mp_size_t updated_un = un;

	  /* Temporary storage 2n + 1 */
	  n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				      u0, u1, &updated_un, tp, tp + n);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return gn;
	    }

	  un = updated_un;
	  ASSERT (un < ualloc);
	}
    }

  if (mpn_zero_p (ap, n))
    {
      MPN_COPY (gp, bp, n);
      MPN_NORMALIZE_NOT_ZERO (u0, un);
      MPN_COPY (up, u0, un);
      *usizep = -un;

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (bp, n))
    {
      MPN_COPY (gp, ap, n);
      MPN_NORMALIZE_NOT_ZERO (u1, un);
      MPN_COPY (up, u1, un);
      *usizep = un;

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (u0, un))
    {
      mp_size_t gn;
      ASSERT (un == 1);
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* We have A = ... a + ... b
		 B =  u0 a +  u1 b

	 a =  u1 A + ... B
	 b = -u0 A + ... B

	 with bounds

	 |u0|, |u1| <= B / min(a, b)

	 Compute g = u a + v b = (u u1 - v u0) A + (...) B
	 Here, u, v are bounded by

	 |u| <= b,
	 |v| <= a
      */

      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      if (lehmer_un == 0)
	{
	  /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);

      /* It's possible that u0 = 1, u1 = 0 */
      if (u1n == 0)
	{
	  ASSERT (un == 1);
	  ASSERT (u0[0] == 1);

	  /* u1 == 0 ==> u u1 + v u0 = v */
	  MPN_COPY (up, lehmer_vp, lehmer_vn);
	  *usizep = negate ? lehmer_vn : - lehmer_vn;

	  TMP_FREE;
	  return gn;
	}

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* Now u0, u1, u are non-zero. We may still have v == 0 */

      /* Compute u u1 */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Overwrites old u1 value */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
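/* Illustrative note, added for clarity (not in the original source):
   in the final branch above, the cofactor of A is assembled from

     g = u a + v b = (u u1 - v u0) A + (...) B.

   Since u and v have opposite signs, u u1 and -v u0 agree in sign,
   so the code can work with magnitudes: it computes |u| u1 + |v| u0
   with two mpn_mul calls and one mpn_add, and applies the sign of u
   via the negate flag.  For example, u1 = 3, u0 = 2, u = -1, v = 2
   gives u u1 - v u0 = -7, computed as |u| u1 + |v| u0 = 7 and then
   negated. */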