/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009
Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static mp_size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
                        mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
        n--;
    }

  return n;
}

#define COMPUTE_V_ITCH(n) (2*(n) + 1)

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. a, b are of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
static mp_size_t
compute_v (mp_ptr vp,
           mp_srcptr ap, mp_srcptr bp, mp_size_t n,
           mp_srcptr gp, mp_size_t gn,
           mp_srcptr up, mp_size_t usize,
           mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);

  an = n;
  MPN_NORMALIZE (ap, an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;
  size -= tp[size - 1] == 0;

  ASSERT (gn <= size);

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
        return 0;
    }
  else
    { /* usize < 0 */
      /* |v| = v = (g - u a) / b = (g + |u| a) / b */
      mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
      if (cy)
        tp[size++] = cy;
    }

  /* Now divide t / b. There must be no remainder. */
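  /* The quotient below is exact by construction: g is a linear
     combination g = u a + v b of a and b, so g - u a = v b is a
     multiple of b. Illustrative numbers (for exposition only, not
     library output): a = 6, b = 4, g = 2, u = 1 gives u a - g = 4
     and |v| = 4 / 4 = 1, matching 2 = 1*6 + (-1)*4. */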
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  return vn;
}

/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and
   2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
   for u, T+1 for v and 2T + 1 scratch space. In all, 7T + 3 is
   sufficient for both operations.
*/

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the u. It may be desirable to use a different
 * choice of p in each iteration. Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input size just above the threshold, but
 * degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)

mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
            mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
        talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
         copies of a and b. */
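      /* With T = GCDEXT_DC_THRESHOLD, this realizes the 7T + 3 bound
         from the storage analysis above: 4T + 3 of gcdext_lehmer
         scratch plus T each for u and the copies of a and b. */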
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
        + 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
        talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
        {
          MPN_COPY (gp, bp, n);
          *usizep = 0;
          TMP_FREE;
          return n;
        }
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
        ASSERT (M.n <= (n - p - 1)/2);
        ASSERT (M.n + p <= (p + n - 1) / 2);

        /* Temporary storage 2 (p + M->n) <= p + n - 1 */
        n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

        MPN_COPY (u0, M.p[1][0], M.n);
        MPN_COPY (u1, M.p[1][1], M.n);
        un = M.n;
        while ( (u0[un-1] | u1[un-1] ) == 0)
          un--;
      }
    else
      {
        /* mpn_hgcd has failed. Then either one of a or b is very
           small, or the difference is very small. Perform one
           subtraction followed by one division. */
        mp_size_t gn;
        mp_size_t updated_un = 1;

        u1[0] = 1;

        /* Temporary storage 2n + 1 */
        n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
                                    u0, u1, &updated_un, tp, tp + n);
        if (n == 0)
          {
            TMP_FREE;
            return gn;
          }

        un = updated_un;
        ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
        {
          mp_ptr t0;

          t0 = tp + matrix_scratch;
          ASSERT (M.n <= (n - p - 1)/2);
          ASSERT (M.n + p <= (p + n - 1) / 2);

          /* Temporary storage 2 (p + M->n) <= p + n - 1 */
          n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

          /* By the same analysis as for mpn_hgcd_matrix_mul */
          ASSERT (M.n + un <= ualloc);

          /* FIXME: This copying could be avoided by some swapping of
           * pointers. May need more temporary storage, though. */
          MPN_COPY (t0, u0, un);

          /* Temporary storage ualloc */
          un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

          ASSERT (un < ualloc);
          ASSERT ( (u0[un-1] | u1[un-1]) > 0);
        }
      else
        {
          /* mpn_hgcd has failed. Then either one of a or b is very
             small, or the difference is very small. Perform one
             subtraction followed by one division. */
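          /* As in the first iteration above: mpn_gcdext_subdiv_step
             reduces (a, b) while keeping the cofactor pair (u0, u1)
             consistent; a return value of 0 means the gcd has been
             found and stored at gp, with its size in gn. */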
          mp_size_t gn;
          mp_size_t updated_un = un;

          /* Temporary storage 2n + 1 */
          n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
                                      u0, u1, &updated_un, tp, tp + n);
          if (n == 0)
            {
              TMP_FREE;
              return gn;
            }

          un = updated_un;
          ASSERT (un < ualloc);
        }
    }

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
         this case we choose the cofactor + 1, corresponding to G = A
         - k B, rather than -1, corresponding to G = - A + (k+1) B. */
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
        {
          MPN_NORMALIZE (u0, un);
          MPN_COPY (up, u0, un);
          *usizep = -un;
        }
      else
        {
          MPN_NORMALIZE_NOT_ZERO (u1, un);
          MPN_COPY (up, u1, un);
          *usizep = un;
        }

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (u0, un))
    {
      mp_size_t gn;
      ASSERT (un == 1);
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* We have  A = ...  a + ... b
                  B =  u0 a + u1 b

         a =  u1 A + ... B
         b = -u0 A + ... B

         with bounds

           |u0|, |u1| <= B / min(a, b)

         Compute g = u a + v b = (u u1 - v u0) A + (...) B
         Here, u, v are bounded by

           |u| <= b,
           |v| <= a
      */

      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      if (lehmer_un == 0)
        {
          /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
          MPN_COPY (up, u0, u0n);
          *usizep = -u0n;

          TMP_FREE;
          return gn;
        }

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
                             ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
        negate = 0;
      else
        {
          lehmer_un = -lehmer_un;
          negate = 1;
        }

      u1n = un;
      MPN_NORMALIZE (u1, u1n);

      /* It's possible that u0 = 1, u1 = 0 */
      if (u1n == 0)
        {
          ASSERT (un == 1);
          ASSERT (u0[0] == 1);

          /* u1 == 0 ==> u u1 + v u0 = v */
          MPN_COPY (up, lehmer_vp, lehmer_vn);
          *usizep = negate ? lehmer_vn : - lehmer_vn;

          TMP_FREE;
          return gn;
        }

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* Now u0, u1, u are non-zero. We may still have v == 0 */
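      /* Since u and v have opposite signs (see compute_v), u u1 - v u0
         cannot cancel: its magnitude is |u| u1 + |v| u0. The two
         products below are therefore formed and added as non-negative
         numbers, with the overall sign tracked separately in negate. */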
      /* Compute u u1 */
      if (lehmer_un <= u1n)
        /* Should be the common case */
        mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
        mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
        {
          mp_limb_t cy;

          /* Overwrites old u1 value */
          if (lehmer_vn <= u0n)
            /* Should be the common case */
            mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
          else
            mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

          u1n = u0n + lehmer_vn;
          u1n -= (u1[u1n - 1] == 0);

          if (u1n <= un)
            {
              cy = mpn_add (up, up, un, u1, u1n);
            }
          else
            {
              cy = mpn_add (up, u1, u1n, up, un);
              un = u1n;
            }
          up[un] = cy;
          un += (cy != 0);

          ASSERT (un < ualloc);
        }
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
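/* Note on the return convention, with illustrative numbers (for
   exposition only, not computed library output): the return value is
   the limb size of g at gp, and *usizep is the signed limb size of
   the cofactor u at up, so that g = u A + v B for some v. For A = 10,
   B = 6, one valid result is g = 2 with u = -1, since
   2 = (-1)*10 + 2*6; v can then be recovered exactly as
   (g - u A) / B, which is the computation compute_v performs. */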