1 /* mpn_gcdext -- Extended Greatest Common Divisor.
2
3 Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
4 Inc.
5
6 This file is part of the GNU MP Library.
7
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
10
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
14
15 or
16
17 * the GNU General Public License as published by the Free Software
18 Foundation; either version 2 of the License, or (at your option) any
19 later version.
20
21 or both in parallel, as here.
22
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 for more details.
27
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library. If not,
30 see https://www.gnu.org/licenses/. */
31
32 #include "gmp.h"
33 #include "gmp-impl.h"
34 #include "longlong.h"
35
36 /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
37 the size is returned (if inputs are non-normalized, result may be
38 non-normalized too). Temporary space needed is M->n + n.
39 */
40 static size_t
hgcd_mul_matrix_vector(struct hgcd_matrix * M,mp_ptr rp,mp_srcptr ap,mp_ptr bp,mp_size_t n,mp_ptr tp)41 hgcd_mul_matrix_vector (struct hgcd_matrix *M,
42 mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
43 {
44 mp_limb_t ah, bh;
45
46 /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
47
48 t = u00 * a
49 r = u10 * b
50 r += t;
51
52 t = u11 * b
53 b = u01 * a
54 b += t;
55 */
56
57 if (M->n >= n)
58 {
59 mpn_mul (tp, M->p[0][0], M->n, ap, n);
60 mpn_mul (rp, M->p[1][0], M->n, bp, n);
61 }
62 else
63 {
64 mpn_mul (tp, ap, n, M->p[0][0], M->n);
65 mpn_mul (rp, bp, n, M->p[1][0], M->n);
66 }
67
68 ah = mpn_add_n (rp, rp, tp, n + M->n);
69
70 if (M->n >= n)
71 {
72 mpn_mul (tp, M->p[1][1], M->n, bp, n);
73 mpn_mul (bp, M->p[0][1], M->n, ap, n);
74 }
75 else
76 {
77 mpn_mul (tp, bp, n, M->p[1][1], M->n);
78 mpn_mul (bp, ap, n, M->p[0][1], M->n);
79 }
80 bh = mpn_add_n (bp, bp, tp, n + M->n);
81
82 n += M->n;
83 if ( (ah | bh) > 0)
84 {
85 rp[n] = ah;
86 bp[n] = bh;
87 n++;
88 }
89 else
90 {
91 /* Normalize */
92 while ( (rp[n-1] | bp[n-1]) == 0)
93 n--;
94 }
95
96 return n;
97 }
98
99 #define COMPUTE_V_ITCH(n) (2*(n))
100
101 /* Computes |v| = |(g - u a)| / b, where u may be positive or
102 negative, and v is of the opposite sign. max(a, b) is of size n, u and
103 v at most size n, and v must have space for n+1 limbs. */
/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign.  max(a, b) is of size n, u and
   v at most size n, and v must have space for n+1 limbs.  The sign of
   u is carried in the sign of usize; |usize| limbs of up are read.
   Scratch tp needs COMPUTE_V_ITCH(n) = 2n limbs.  Returns the limb
   size of v (0 if v == 0).  */
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  /* |u| must be normalized (non-zero high limb).  */
  size = ABS (usize);
  ASSERT (size <= n);
  ASSERT (up[size-1] > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  ASSERT (gn <= an);

  /* t = |u| * a; mpn_mul wants the larger operand first.  */
  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a,
	 (g + |u| a) always fits in (|usize| + an) limbs. */

      ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn));
      /* Here the sum is non-zero, so at most one high limb to strip.  */
      size -= (tp[size - 1] == 0);
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  /* Strip a possible high zero limb of the quotient.  */
  vn -= (vp[vn-1] == 0);

  return vn;
}
165
166 /* Temporary storage:
167
168 Initial division: Quotient of at most an - n + 1 <= an limbs.
169
170 Storage for u0 and u1: 2(n+1).
171
172 Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
173
174 Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
175
176 When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
177
178 When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
179
180 For the lehmer call after the loop, Let T denote
181 GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
182 u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
183 for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
184 sufficient for both operations.
185
186 */
187
188 /* Optimal choice of p seems difficult. In each iteration the division
189 * of work between hgcd and the updates of u0 and u1 depends on the
190 * current size of the u. It may be desirable to use a different
191 * choice of p in each iteration. Also the input size seems to matter;
192 * choosing p = n / 3 in the first iteration seems to improve
193 * performance slightly for input size just above the threshold, but
194 * degrade performance for larger inputs. */
195 #define CHOOSE_P_1(n) ((n) / 2)
196 #define CHOOSE_P_2(n) ((n) / 3)
197
/* Extended gcd: computes g = gcd(A, B) for A = {ap,an}, B = {bp,n}
   into gp, and a cofactor u (with g = u A + v B for some v that is
   not returned) into up.  Returns the limb size of g; *usizep gets
   the signed limb size of u (negative when u < 0, zero when u == 0).
   Requires an >= n > 0 and bp[n-1] > 0 (B normalized); ap and bp are
   clobbered.  up needs room for n + 1 limbs.  */
mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  struct gcdext_ctx ctx;
  mp_size_t un;			/* current limb size of the u0/u1 cofactors */
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);
  ASSERT (bp[n-1] > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  /* talloc is computed as the max over all phases that share tp (see
     the temporary-storage analysis in the comment block above).  */
  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      /* Initial reduction: a <- A mod B (quotient written to tp is
	 discarded).  */
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  /* B divides A: gcd = B, and u = 0.  */
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  /* Carve the cofactor areas u0, u1 off the front of tp; both are
     zero-initialized here and kept (at most) ualloc limbs.  */
  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  ctx.gp = gp;
  ctx.up = up;
  ctx.usize = usizep;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	/* Seed the cofactors from the second row of M (no prior u
	   values to update on the first iteration).  */
	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ( (u0[un-1] | u1[un-1] ) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed. Then either one of a or b is very
	   small, or the difference is very small. Perform one
	   subtraction followed by one division. */
	u1[0] = 1;

	ctx.u0 = u0;
	ctx.u1 = u1;
	ctx.tp = tp + n; /* ualloc */
	ctx.un = 1;

	/* Temporary storage n */
	n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
	if (n == 0)
	  {
	    /* The subdiv step found the gcd; the hook already stored
	       the results via ctx.  */
	    TMP_FREE;
	    return ctx.gn;
	  }

	un = ctx.un;
	ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers. May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc.  Updates (u0,u1) <- (u0,u1) M,
	     reading the saved copy of u0 from t0.  */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  ctx.u0 = u0;
	  ctx.u1 = u1;
	  ctx.tp = tp + n; /* ualloc */
	  ctx.un = un;

	  /* Temporary storage n */
	  n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return ctx.gn;
	    }

	  un = ctx.un;
	  ASSERT (un < ualloc);
	}
    }
  /* We have A = ... a + ... b
	     B =  u0 a +  u1 b

	     a =  u1  A + ... B
	     b = -u0  A + ... B

     with bounds

       |u0|, |u1| <= B / min(a, b)

     We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
     in which case the only reduction done so far is a = A - k B for
     some k.

     Compute g = u a + v b = (u u1 - v u0) A + (...) B
     Here, u, v are bounded by

       |u| <= b,
       |v| <= a
  */

  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* a == b, so gcd = a; must return the smallest cofactor, +u1 or
	 -u0.  */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
	 this case we choose the cofactor + 1, corresponding to G = A
	 - k B, rather than -1, corresponding to G = - A + (k+1) B. */
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
	{
	  MPN_NORMALIZE (u0, un);
	  MPN_COPY (up, u0, un);
	  *usizep = -un;
	}
      else
	{
	  MPN_NORMALIZE_NOT_ZERO (u1, un);
	  MPN_COPY (up, u1, un);
	  *usizep = un;
	}

      TMP_FREE;
      return n;
    }
  else if (UNLIKELY (u0[0] == 0) && un == 1)
    {
      mp_size_t gn;
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* General case: finish with Lehmer on (a, b), then combine the
	 small cofactors (u, v) with the accumulated (u0, u1) to get
	 the cofactor of A.  */
      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b, since we
	 still need a and b below to compute v.  */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      ASSERT (u0n > 0);

      if (lehmer_un == 0)
	{
	  /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      /* Record u's sign; from here on lehmer_un is |u|.  */
      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);
      ASSERT (u1n > 0);

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* We may still have v == 0 */

      /* Compute u u0 */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Accumulate |v| u0 into up.  Overwrites old u1 value, which
	     is no longer needed.  */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  /* mpn_add wants the longer operand first.  */
	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
559