/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static mp_size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */

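  /* mpn_mul requires its first operand to be at least as long as the
     second, hence the branches on M->n vs n below.  */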
  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
	n--;
    }

  return n;
}

#define COMPUTE_V_ITCH(n) (2*(n))
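/* The 2n scratch limbs above cover compute_v: the product u * a takes
   at most |usize| + an <= 2n limbs, and adding or subtracting g does not
   increase that (see the comments in the function).  */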

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. max(a, b) is of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);
  ASSERT (up[size-1] > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  ASSERT (gn <= an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a,
	 (g + |u| a) always fits in (|usize| + an) limbs. */

      ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn));
      size -= (tp[size - 1] == 0);
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  return vn;
}

/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
   for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
   sufficient for both operations.

*/

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the u. It may be desirable to use a different
 * choice of p in each iteration. Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input size just above the threshold, but
 * degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)

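/* Usage sketch (illustrative only, not part of the library; names and
   sizes are placeholders).  Going by the ASSERTs below, the inputs must
   satisfy an >= n > 0 and bp[n-1] != 0, and both source operands are
   clobbered.  gp needs room for n limbs, up for n + 1 limbs:

     mp_limb_t g[N], u[N + 1];	   // N = limb count of B = {b, N}
     mp_size_t usize;
     mp_size_t gn = mpn_gcdext (g, u, &usize, a, an, b, N);

   Afterwards {g, gn} = G = gcd (A, B), and U = {u, |usize|}, negative
   iff usize < 0, is a cofactor with U A + V B = G for some V
   (compute_v above recovers such a V).  */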
mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  struct gcdext_ctx ctx;
  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);
  ASSERT (bp[n-1] > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
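      /* The hgcd matrix is kept at the start of the scratch area; hgcd
	 itself and the adjust/update steps reuse the space after it
	 (tp + matrix_scratch below). */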
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
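      /* Reduce A modulo B; the quotient (at most an - n + 1 limbs) goes
	 to scratch and the n-limb remainder overwrites {ap, n}. */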
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;
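  /* u0 and u1 accumulate the bottom row of the product of the hgcd
     matrices, so that B = u0 a + u1 b holds for the current a and b
     (cf. the comment before the final reductions below). */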

  ctx.gp = gp;
  ctx.up = up;
  ctx.usize = usizep;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ( (u0[un-1] | u1[un-1]) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed. Then either one of a or b is very
	   small, or the difference is very small. Perform one
	   subtraction followed by one division. */
	u1[0] = 1;

	ctx.u0 = u0;
	ctx.u1 = u1;
	ctx.tp = tp + n; /* ualloc */
	ctx.un = 1;

	/* Temporary storage n */
	n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
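	/* n == 0 means the subdiv step found the gcd; the hook has
	   already stored g and the cofactor through ctx. */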
	if (n == 0)
	  {
	    TMP_FREE;
	    return ctx.gn;
	  }

	un = ctx.un;
	ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers. May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  ctx.u0 = u0;
	  ctx.u1 = u1;
	  ctx.tp = tp + n; /* ualloc */
	  ctx.un = un;

	  /* Temporary storage n */
	  n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return ctx.gn;
	    }

	  un = ctx.un;
	  ASSERT (un < ualloc);
	}
    }
  /* We have A = ... a + ... b
	     B =  u0 a +  u1 b

	     a = u1  A + ... B
	     b = -u0 A + ... B

     with bounds

       |u0|, |u1| <= B / min(a, b)

     We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
     in which case the only reduction done so far is a = A - k B for
     some k.

     Compute g = u a + v b = (u u1 - v u0) A + (...) B
     Here, u, v are bounded by

       |u| <= b,
       |v| <= a
  */

  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
	 this case we choose the cofactor + 1, corresponding to G = A
	 - k B, rather than -1, corresponding to G = - A + (k+1) B. */
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
	{
	  MPN_NORMALIZE (u0, un);
	  MPN_COPY (up, u0, un);
	  *usizep = -un;
	}
      else
	{
	  MPN_NORMALIZE_NOT_ZERO (u1, un);
	  MPN_COPY (up, u1, un);
	  *usizep = un;
	}

      TMP_FREE;
      return n;
    }
  else if (UNLIKELY (u0[0] == 0) && un == 1)
    {
      mp_size_t gn;
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);
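      /* lehmer_u is the cofactor of the current a; it is combined with
	 u0 and u1 below to obtain the cofactor of the original A. */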

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      ASSERT (u0n > 0);

      if (lehmer_un == 0)
	{
	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);
      ASSERT (u1n > 0);

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* We may still have v == 0 */

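      /* The cofactor of A is u u1 - v u0; since u and v have opposite
	 signs this is +-(|u| u1 + |v| u0), carrying the sign of u (the
	 negate flag). */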
      /* Compute u u1 */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Overwrites old u1 value */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}