/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
20
21 #include "gmp.h"
22 #include "gmp-impl.h"
23 #include "longlong.h"
24
25 /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
26 the size is returned (if inputs are non-normalized, result may be
27 non-normalized too). Temporary space needed is M->n + n.
28 */
29 static size_t
hgcd_mul_matrix_vector(struct hgcd_matrix * M,mp_ptr rp,mp_srcptr ap,mp_ptr bp,mp_size_t n,mp_ptr tp)30 hgcd_mul_matrix_vector (struct hgcd_matrix *M,
31 mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
32 {
33 mp_limb_t ah, bh;
34
35 /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
36
37 t = u00 * a
38 r = u10 * b
39 r += t;
40
41 t = u11 * b
42 b = u01 * a
43 b += t;
44 */
45
46 if (M->n >= n)
47 {
48 mpn_mul (tp, M->p[0][0], M->n, ap, n);
49 mpn_mul (rp, M->p[1][0], M->n, bp, n);
50 }
51 else
52 {
53 mpn_mul (tp, ap, n, M->p[0][0], M->n);
54 mpn_mul (rp, bp, n, M->p[1][0], M->n);
55 }
56
57 ah = mpn_add_n (rp, rp, tp, n + M->n);
58
59 if (M->n >= n)
60 {
61 mpn_mul (tp, M->p[1][1], M->n, bp, n);
62 mpn_mul (bp, M->p[0][1], M->n, ap, n);
63 }
64 else
65 {
66 mpn_mul (tp, bp, n, M->p[1][1], M->n);
67 mpn_mul (bp, ap, n, M->p[0][1], M->n);
68 }
69 bh = mpn_add_n (bp, bp, tp, n + M->n);
70
71 n += M->n;
72 if ( (ah | bh) > 0)
73 {
74 rp[n] = ah;
75 bp[n] = bh;
76 n++;
77 }
78 else
79 {
80 /* Normalize */
81 while ( (rp[n-1] | bp[n-1]) == 0)
82 n--;
83 }
84
85 return n;
86 }
87
/* Scratch space (in limbs) needed by compute_v for inputs of size n.  */
#define COMPUTE_V_ITCH(n) (2*(n) + 1)

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign.  a, b are of size n, u and
   v at most size n, and v must have space for n+1 limbs.  The second
   cofactor v satisfies g = u a + v b, so the division by b is exact.
   tp needs COMPUTE_V_ITCH (n) = 2n + 1 limbs of scratch.  */
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  /* usize carries the sign of u; |u| occupies ABS (usize) limbs.  */
  size = ABS (usize);
  ASSERT (size <= n);

  an = n;
  MPN_NORMALIZE (ap, an);

  /* t = |u| * a; larger operand must come first for mpn_mul.  */
  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;
  /* The product may be one limb shorter than an + size.  */
  size -= tp[size - 1] == 0;

  ASSERT (gn <= size);

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      /* u a >= g here, so the subtraction cannot borrow.  */
      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* usize < 0 */
      /* |v| = v = (g - u a) / b = (g + |u| a) / b */
      mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
      if (cy)
	tp[size++] = cy;
    }

  /* Now divide t / b.  There must be no remainder, since
     g = u a + v b implies b divides g - u a exactly.  */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  /* The quotient may be one limb shorter than size + 1 - bn.  */
  vn -= (vp[vn-1] == 0);

  return vn;
}
155
/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
   for u, T+1 for v and 2T + 1 scratch space. In all, 7T + 3 is
   sufficient for both operations.

*/
177
/* Optimal choice of p seems difficult.  In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the u.  It may be desirable to use a different
 * choice of p in each iteration.  Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input size just above the threshold, but
 * degrade performance for larger inputs.  */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)

/* Computes g = gcd ({ap,an}, {bp,n}), storing g at gp and a cofactor
   u at up, with the sign and limb count of u returned through
   *usizep (negative *usizep means u is negative).  The gcd's limb
   count is the return value.  Requires an >= n > 0.  The operands
   {ap,an} and {bp,n} are clobbered (used as working storage).
   NOTE(review): callers presumably guarantee {bp,n} is normalized
   (b's high limb nonzero) -- confirm against mpn_gcdext's public
   contract.  */
mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  mp_size_t un;		/* Current limb count of the cofactors u0, u1.  */
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;		/* Scratch area, carved up as we go.  */

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop.  Size the scratch for the worst case over both
	 p choices used below (CHOOSE_P_1 first, CHOOSE_P_2 in the
	 loop).  */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      /* Initial reduction: a <- a mod b (quotient goes to scratch).  */
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  /* b divides a, so gcd = b with cofactor u = 0.  */
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* Small operands: plain Lehmer algorithm.  */
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  /* Carve the cofactor areas u0, u1 off the front of the scratch.  */
  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    /* Run hgcd on the high n - p limbs only.  */
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	/* Seed the cofactors from the matrix's bottom row.  */
	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ( (u0[un-1] | u1[un-1] ) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed. Then either one of a or b is very
	   small, or the difference is very small. Perform one
	   subtraction followed by one division. */
	mp_size_t gn;
	mp_size_t updated_un = 1;

	/* Initial cofactors: u0 = 0 (from MPN_ZERO above), u1 = 1.  */
	u1[0] = 1;

	/* Temporary storage 2n + 1 */
	n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				    u0, u1, &updated_un, tp, tp + n);
	if (n == 0)
	  {
	    /* The gcd was found during the subdivision step.  */
	    TMP_FREE;
	    return gn;
	  }

	un = updated_un;
	ASSERT (un < ualloc);
      }
  }

  /* Main reduction loop: each iteration shrinks a and b while
     accumulating the cofactor updates into u0, u1.  */
  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers. May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  mp_size_t gn;
	  mp_size_t updated_un = un;

	  /* Temporary storage 2n + 1 */
	  n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				      u0, u1, &updated_un, tp, tp + n);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return gn;
	    }

	  un = updated_un;
	  ASSERT (un < ualloc);
	}
    }

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
	 this case we choose the cofactor + 1, corresponding to G = A
	 - k B, rather than -1, corresponding to G = - A + (k+1) B. */
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
	{
	  MPN_NORMALIZE (u0, un);
	  MPN_COPY (up, u0, un);
	  *usizep = -un;
	}
      else
	{
	  MPN_NORMALIZE_NOT_ZERO (u1, un);
	  MPN_COPY (up, u1, un);
	  *usizep = un;
	}

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (u0, un))
    {
      mp_size_t gn;
      ASSERT (un == 1);
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* We have A = ... a + ... b
		 B =  u0 a +  u1 b

		 a = u1  A + ... B
		 b = -u0 A + ... B

	 with bounds

	 |u0|, |u1| <= B / min(a, b)

	 Compute g = u a + v b = (u u1 - v u0) A + (...) B
	 Here, u, v are bounded by

	 |u| <= b,
	 |v| <= a
      */

      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      if (lehmer_un == 0)
	{
	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);

      /* It's possible that u0 = 1, u1 = 0 */
      if (u1n == 0)
	{
	  ASSERT (un == 1);
	  ASSERT (u0[0] == 1);

	  /* u1 == 0 ==> u u1 + v u0 = v */
	  MPN_COPY (up, lehmer_vp, lehmer_vn);
	  /* v has the opposite sign of u, hence the inverted negate.  */
	  *usizep = negate ? lehmer_vn : - lehmer_vn;

	  TMP_FREE;
	  return gn;
	}

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* Now u0, u1, u are non-zero. We may still have v == 0 */

      /* Compute u u1 (larger operand first for mpn_mul).  */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Compute v u0; overwrites old u1 value.  */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  /* u u1 + v u0; mpn_add wants the larger operand first.  */
	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
548