/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static mp_size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
                        mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
        n--;
    }

  return n;
}
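/* For illustration, a minimal single-limb instance of the update
   above: with M = (u00, u01; u10, u11) = (1, 1; 1, 2), a = 3 and
   b = 4 (so n = 1, M->n = 1), the two products give

     r = u00 a + u10 b = 1*3 + 1*4 = 7
     b = u01 a + u11 b = 1*3 + 2*4 = 11

   i.e. (r;b) = (a; b) M = (7; 11). Here ah = bh = 0 and both results
   fit in one limb, so the normalization loop brings the returned size
   from n + M->n = 2 back down to 1. The hgcd loop in mpn_gcdext below
   uses this routine to apply the matrix returned by mpn_hgcd to the
   cofactor pair (u0, u1). */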
#define COMPUTE_V_ITCH(n) (2*(n) + 1)

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. a, b are of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
static mp_size_t
compute_v (mp_ptr vp,
           mp_srcptr ap, mp_srcptr bp, mp_size_t n,
           mp_srcptr gp, mp_size_t gn,
           mp_srcptr up, mp_size_t usize,
           mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);

  an = n;
  MPN_NORMALIZE (ap, an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;
  size -= tp[size - 1] == 0;

  ASSERT (gn <= size);

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
        return 0;
    }
  else
    { /* usize < 0 */
      /* |v| = v = (g - u a) / b = (g + |u| a) / b */
      mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
      if (cy)
        tp[size++] = cy;
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  return vn;
}
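/* A small instance of the usize > 0 branch, assuming single-limb
   operands: with a = 3, b = 2, g = 1 and u = 1, the code computes
   |v| = (u a - g) / b = (1*3 - 1) / 2 = 1, i.e. v = -1, and indeed
   g = u a + v b = 1*3 - 1*2. The mpn_divexact call is safe because v
   is an integer by construction: g = u a + v b, so b divides
   u a - g (and likewise g + |u| a) exactly. */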
/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and
   2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
   for u, T+1 for v and 2T + 1 scratch space. In all, 7T + 3 is
   sufficient for both operations. */

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the u. It may be desirable to use a different
 * choice of p in each iteration. Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input size just above the threshold, but
 * degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)
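/* A minimal usage sketch, with illustrative names and sizes. The
   operands {ap, an} and {bp, n} must satisfy an >= n > 0 (asserted
   below), bp[n-1] should be nonzero (the initial mpn_tdiv_qr divides
   by {bp, n}), and both operand areas are clobbered. gp needs n limbs
   and up should have room for n + 1 limbs, matching ualloc = n + 1
   below:

     mp_size_t usize;
     mp_size_t gn = mpn_gcdext (gp, up, &usize, ap, an, bp, n);

   On return, {gp, gn} is the gcd g of the original inputs A and B,
   and the first cofactor is {up, |usize|} with the sign of usize, so
   that g = u A + v B for some v that is not computed here.
   Applications normally use the mpz-level mpz_gcdext rather than
   calling this function directly. */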
mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
            mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
        talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
         copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
        + 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
        talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
        {
          MPN_COPY (gp, bp, n);
          *usizep = 0;
          TMP_FREE;
          return n;
        }
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
        ASSERT (M.n <= (n - p - 1)/2);
        ASSERT (M.n + p <= (p + n - 1) / 2);

        /* Temporary storage 2 (p + M->n) <= p + n - 1 */
        n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

        MPN_COPY (u0, M.p[1][0], M.n);
        MPN_COPY (u1, M.p[1][1], M.n);
        un = M.n;
        while ( (u0[un-1] | u1[un-1]) == 0)
          un--;
      }
    else
      {
        /* mpn_hgcd has failed. Then either one of a or b is very
           small, or the difference is very small. Perform one
           subtraction followed by one division. */
        mp_size_t gn;
        mp_size_t updated_un = 1;

        u1[0] = 1;

        /* Temporary storage 2n + 1 */
        n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
                                    u0, u1, &updated_un, tp, tp + n);
        if (n == 0)
          {
            TMP_FREE;
            return gn;
          }

        un = updated_un;
        ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
        {
          mp_ptr t0;

          t0 = tp + matrix_scratch;
          ASSERT (M.n <= (n - p - 1)/2);
          ASSERT (M.n + p <= (p + n - 1) / 2);

          /* Temporary storage 2 (p + M->n) <= p + n - 1 */
          n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

          /* By the same analysis as for mpn_hgcd_matrix_mul */
          ASSERT (M.n + un <= ualloc);

          /* FIXME: This copying could be avoided by some swapping of
           * pointers. May need more temporary storage, though. */
          MPN_COPY (t0, u0, un);

          /* Temporary storage ualloc */
          un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

          ASSERT (un < ualloc);
          ASSERT ( (u0[un-1] | u1[un-1]) > 0);
        }
      else
        {
          /* mpn_hgcd has failed. Then either one of a or b is very
             small, or the difference is very small. Perform one
             subtraction followed by one division. */
          mp_size_t gn;
          mp_size_t updated_un = un;

          /* Temporary storage 2n + 1 */
          n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
                                      u0, u1, &updated_un, tp, tp + n);
          if (n == 0)
            {
              TMP_FREE;
              return gn;
            }

          un = updated_un;
          ASSERT (un < ualloc);
        }
    }

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      ASSERT (c != 0);
      if (c < 0)
        {
          MPN_NORMALIZE (u0, un);
          MPN_COPY (up, u0, un);
          *usizep = -un;
        }
      else
        {
          MPN_NORMALIZE_NOT_ZERO (u1, un);
          MPN_COPY (up, u1, un);
          *usizep = un;
        }

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (u0, un))
    {
      mp_size_t gn;
      ASSERT (un == 1);
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* We have A = ... a + ... b
                 B =  u0 a +  u1 b

                 a =  u1 A + ... B
                 b = -u0 A + ... B

         with bounds

           |u0|, |u1| <= B / min(a, b)

         Compute g = u a + v b = (u u1 - v u0) A + (...) B
         Here, u, v are bounded by

           |u| <= b,
           |v| <= a
      */

      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      if (lehmer_un == 0)
        {
          /* u == 0  ==>  v = g / b == 1  ==>  g = - u0 A + (...) B */
          MPN_COPY (up, u0, u0n);
          *usizep = -u0n;

          TMP_FREE;
          return gn;
        }

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
                             ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
        negate = 0;
      else
        {
          lehmer_un = -lehmer_un;
          negate = 1;
        }

      u1n = un;
      MPN_NORMALIZE (u1, u1n);

      /* It's possible that u0 = 1, u1 = 0 */
      if (u1n == 0)
        {
          ASSERT (un == 1);
          ASSERT (u0[0] == 1);

          /* u1 == 0  ==>  u u1 + v u0 = v */
          MPN_COPY (up, lehmer_vp, lehmer_vn);
          *usizep = negate ? lehmer_vn : - lehmer_vn;

          TMP_FREE;
          return gn;
        }

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* Now u0, u1, u are non-zero. We may still have v == 0 */

      /* Compute u u1 */
      if (lehmer_un <= u1n)
        /* Should be the common case */
        mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
        mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
        {
          mp_limb_t cy;

          /* Overwrites old u1 value */
          if (lehmer_vn <= u0n)
            /* Should be the common case */
            mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
          else
            mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

          u1n = u0n + lehmer_vn;
          u1n -= (u1[u1n - 1] == 0);

          if (u1n <= un)
            {
              cy = mpn_add (up, up, un, u1, u1n);
            }
          else
            {
              cy = mpn_add (up, u1, u1n, up, un);
              un = u1n;
            }
          up[un] = cy;
          un += (cy != 0);

          ASSERT (un < ualloc);
        }
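      /* {up, un} now holds |u| u1 + |v| u0 = |u u1 - v u0|, the
         absolute value of A's cofactor in g = (u u1 - v u0) A + (...) B
         above. Since u and v have opposite signs, that cofactor's sign
         is the sign of u, which is what the negate flag records. */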
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}