/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static mp_size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */
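  /* Note the ordering: in the second half, u11 * b is formed in t before
     bp is overwritten with u01 * a, and ap is only read, so the whole
     update can be done in place on bp.  */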

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
	n--;
    }

  return n;
}

#define COMPUTE_V_ITCH(n) (2*(n))
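/* The 2*n scratch is used by compute_v below to hold the product u * a,
   which takes at most |usize| + an <= 2n limbs.  */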

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. max(a, b) is of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
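/* Since g = gcd (a, b) satisfies g = u a + v b for the cofactor pair
   (u, v), the quantity g - u a is an exact multiple of b; this is why
   the quotient below can be formed with mpn_divexact and must leave no
   remainder.  */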
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);
  ASSERT (up[size-1] > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  ASSERT (gn <= an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a,
	 (g + |u| a) always fits in (|usize| + an) limbs. */

      ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn));
      size -= (tp[size - 1] == 0);
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  return vn;
}

/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
   for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
   sufficient for both operations.

*/
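/* These bounds correspond to the talloc computation in mpn_gcdext below:
   talloc is the maximum of the Lehmer itch, the initial-division quotient,
   and (above GCDEXT_DC_THRESHOLD) the hgcd matrix plus hgcd/update scratch
   and the final Lehmer call, with 2(n+1) limbs added for the cofactors u0
   and u1.  */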

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the u. It may be desirable to use a different
 * choice of p in each iteration. Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input size just above the threshold, but
 * degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)
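/* For illustration: with n = 3000 limbs, the first hgcd call below uses
   p = CHOOSE_P_1 (n) = 1500 and works on the top n - p = 1500 limbs,
   while a loop iteration at size n uses p = CHOOSE_P_2 (n) = n/3 and
   works on the top n - n/3 limbs (2000 when n = 3000).  */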

mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  struct gcdext_ctx ctx;
  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);
  ASSERT (bp[n-1] > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  ctx.gp = gp;
  ctx.up = up;
  ctx.usize = usizep;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ( (u0[un-1] | u1[un-1] ) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed. Then either one of a or b is very
	   small, or the difference is very small. Perform one
	   subtraction followed by one division. */
	u1[0] = 1;

	ctx.u0 = u0;
	ctx.u1 = u1;
	ctx.tp = tp + n; /* ualloc */
	ctx.un = 1;

	/* Temporary storage n */
	n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
	if (n == 0)
	  {
	    TMP_FREE;
	    return ctx.gn;
	  }

	un = ctx.un;
	ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers. May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  ctx.u0 = u0;
	  ctx.u1 = u1;
	  ctx.tp = tp + n; /* ualloc */
	  ctx.un = un;

	  /* Temporary storage n */
	  n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return ctx.gn;
	    }

	  un = ctx.un;
	  ASSERT (un < ualloc);
	}
    }
  /* We have A = ... a + ... b
	     B =  u0 a +  u1 b

	     a = u1  A + ... B
	     b = -u0 A + ... B

     with bounds

       |u0|, |u1| <= B / min(a, b)

     We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
     in which case the only reduction done so far is a = A - k B for
     some k.

     Compute g = u a + v b = (u u1 - v u0) A + (...) B
     Here, u, v are bounded by

       |u| <= b,
       |v| <= a
  */
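  /* Since v has the opposite sign of u (see compute_v), the terms u u1 and
     -v u0 have the same sign, so |u u1 - v u0| = |u| u1 + |v| u0 and the
     cofactor of A carries the sign of u. The general case below therefore
     multiplies magnitudes, adds the two products, and negates the result
     only when u is negative.  */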

  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
	 this case we choose the cofactor + 1, corresponding to G = A
	 - k B, rather than -1, corresponding to G = - A + (k+1) B. */
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
	{
	  MPN_NORMALIZE (u0, un);
	  MPN_COPY (up, u0, un);
	  *usizep = -un;
	}
      else
	{
	  MPN_NORMALIZE_NOT_ZERO (u1, un);
	  MPN_COPY (up, u1, un);
	  *usizep = un;
	}

      TMP_FREE;
      return n;
    }
  else if (UNLIKELY (u0[0] == 0) && un == 1)
    {
      mp_size_t gn;
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      ASSERT (u0n > 0);

      if (lehmer_un == 0)
	{
	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);
      ASSERT (u1n > 0);

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* We may still have v == 0 */

      /* Compute u u1 */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Overwrites old u1 value */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}