/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r;b) = (a;b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n limbs.
   (An illustrative mpz-level cross-check follows the function.)
 */
static mp_size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
	n--;
    }

  return n;
}
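
/* A minimal cross-check sketch (not part of GMP, and not compiled)
   that spells out the identity implemented above using mpz
   arithmetic.  The function name and the unpacked matrix entries
   u00, u01, u10, u11 are hypothetical; r, b1 must be initialized by
   the caller.  */
#if 0
static void
ref_mul_matrix_vector (mpz_t r, mpz_t b1,
		       const mpz_t a, const mpz_t b,
		       const mpz_t u00, const mpz_t u01,
		       const mpz_t u10, const mpz_t u11)
{
  mpz_t t;
  mpz_init (t);
  mpz_mul (t, u00, a);		/* t  = u00 * a */
  mpz_mul (r, u10, b);		/* r  = u10 * b */
  mpz_add (r, r, t);		/* r  = u00 a + u10 b */
  mpz_mul (t, u11, b);		/* t  = u11 * b */
  mpz_mul (b1, u01, a);		/* b' = u01 * a */
  mpz_add (b1, b1, t);		/* b' = u01 a + u11 b */
  mpz_clear (t);
}
#endif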

#define COMPUTE_V_ITCH(n) (2*(n) + 1)

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign to u. a, b are of size n,
   u and v at most size n, and v must have space for n+1 limbs. (A
   worked example follows the function.) */
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);

  an = n;
  MPN_NORMALIZE (ap, an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  ASSERT (gn <= size);

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* usize < 0 */
      /* |v| = v = (g - u a) / b = (g + |u| a) / b */
      mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
      if (cy)
	tp[size++] = cy;
    }

  /* Now divide t / b. There must be no remainder. */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  /* FIXME: Use divexact. Or do the entire calculation mod 2^{n *
     GMP_NUMB_BITS}. */
  mpn_tdiv_qr (vp, tp, 0, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  /* Remainder must be zero */
#if WANT_ASSERT
  {
    mp_size_t i;
    for (i = 0; i < bn; i++)
      {
	ASSERT (tp[i] == 0);
      }
  }
#endif
  return vn;
}
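
/* Worked example (toy numbers assumed for illustration): with
   a = 240, b = 46 we have g = 2 = (-9)*240 + 47*46, so u = -9 and
   usize < 0.  compute_v then forms t = g + |u| a = 2 + 9*240 = 2162
   and divides by b, giving v = 2162 / 46 = 47 with zero remainder,
   as required.  */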

/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4).

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and
   2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the Lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. The mpn_gcdext_lehmer_n call needs T limbs
   each for u, a and b, plus 4T + 3 limbs of scratch. Next, compute_v
   needs T + 1 limbs for v and 2T + 1 limbs of scratch. In all,
   7T + 3 limbs are sufficient.
*/
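
/* A worked instantiation of the bounds above (the sizes n = 1000,
   an = 1500 are assumed for illustration): the initial quotient takes
   at most an - n + 1 = 501 limbs, u0 and u1 take 2*(n+1) = 2002
   limbs, and the hgcd matrix for input ceil(n/2) = 500 takes
   5 * ceil(n/4) = 1250 limbs.  */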

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of u0 and u1. It may be desirable to use a different
 * choice of p in each iteration. Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input sizes just above the threshold, but
 * to degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)
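
/* For example, with n = 1200 the first iteration uses
   p = CHOOSE_P_1 (1200) = 600, so mpn_hgcd works on the high
   n - p = 600 limbs, while later iterations use p = CHOOSE_P_2 (n)
   and thus operate on the high 2n/3 limbs.  */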

mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ( (u0[un-1] | u1[un-1] ) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed. Then either one of a or b is very
	   small, or the difference is very small. Perform one
	   subtraction followed by one division. */
	mp_size_t gn;
	mp_size_t updated_un = 1;

	u1[0] = 1;

	/* Temporary storage 2n + 1 */
	n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				    u0, u1, &updated_un, tp, tp + n);
	if (n == 0)
	  {
	    TMP_FREE;
	    return gn;
	  }

	un = updated_un;
	ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers. May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  mp_size_t gn;
	  mp_size_t updated_un = un;

	  /* Temporary storage 2n + 1 */
	  n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				      u0, u1, &updated_un, tp, tp + n);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return gn;
	    }

	  un = updated_un;
	  ASSERT (un < ualloc);
	}
    }

  if (mpn_zero_p (ap, n))
    {
      MPN_COPY (gp, bp, n);
      MPN_NORMALIZE_NOT_ZERO (u0, un);
      MPN_COPY (up, u0, un);
      *usizep = -un;

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (bp, n))
    {
      MPN_COPY (gp, ap, n);
      MPN_NORMALIZE_NOT_ZERO (u1, un);
      MPN_COPY (up, u1, un);
      *usizep = un;

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (u0, un))
    {
      mp_size_t gn;
      ASSERT (un == 1);
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* We have A = ... a + ... b
		 B =  u0 a +  u1 b

		 a = u1  A + ... B
		 b = -u0 A + ... B

	 with bounds

	   |u0|, |u1| <= B / min(a, b)

	 Compute g = u a + v b = (u u1 - v u0) A + (...) B
	 Here, u, v are bounded by

	 |u| <= b,
	 |v| <= a
      */

      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      if (lehmer_un == 0)
	{
	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);

      /* It's possible that u0 = 1, u1 = 0 */
      if (u1n == 0)
	{
	  ASSERT (un == 1);
	  ASSERT (u0[0] == 1);

	  /* u1 == 0 ==> u u1 + v u0 = v */
	  MPN_COPY (up, lehmer_vp, lehmer_vn);
	  *usizep = negate ? lehmer_vn : - lehmer_vn;

	  TMP_FREE;
	  return gn;
	}

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* Now u0, u1, u are non-zero. We may still have v == 0 */

      /* Compute u u1 */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Overwrites old u1 value */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
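
/* A minimal usage sketch (illustrative only, not part of this file
   and not compiled): the extended-gcd contract implemented above is
   easiest to exercise through the mpz-level wrapper mpz_gcdext,
   which computes g, s, t with g = s*a + t*b.  Build with -lgmp.  */
#if 0
#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mpz_t a, b, g, s, t;

  mpz_init_set_ui (a, 240);
  mpz_init_set_ui (b, 46);
  mpz_init (g);
  mpz_init (s);
  mpz_init (t);

  mpz_gcdext (g, s, t, a, b);	/* expect g = 2, e.g. 2 = (-9)*240 + 47*46 */
  gmp_printf ("g = %Zd, s = %Zd, t = %Zd\n", g, s, t);

  mpz_clear (a); mpz_clear (b);
  mpz_clear (g); mpz_clear (s); mpz_clear (t);
  return 0;
}
#endif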