/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static mp_size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */

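  /* mpn_mul requires its first source operand to be at least as long
     as its second, which is why each pair of products below branches
     on M->n vs n to order the operands. */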
  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
	n--;
    }

  return n;
}
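
/* A small worked instance of the mapping above, for orientation
   (illustrative numbers only): with single-limb inputs a = 3, b = 5
   and M = (u00, u01; u10, u11) = (2, 1; 1, 1), the function computes

     r = u00 a + u10 b = 2*3 + 1*5 = 11
     b = u01 a + u11 b = 1*3 + 1*5 = 8

   and returns the normalized size of the resulting pair (r, b). */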

#define COMPUTE_V_ITCH(n) (2*(n) + 1)

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. a, b are of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);

  an = n;
  MPN_NORMALIZE (ap, an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  ASSERT (gn <= size);

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* usize < 0 */
      /* |v| = v = (g - u a) / b = (g + |u| a) / b */
      mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
      if (cy)
	tp[size++] = cy;
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  /* FIXME: Use divexact. Or do the entire calculation mod 2^{n *
     GMP_NUMB_BITS}. */
  mpn_tdiv_qr (vp, tp, 0, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  /* Remainder must be zero */
#if WANT_ASSERT
  {
    mp_size_t i;
    for (i = 0; i < bn; i++)
      {
	ASSERT (tp[i] == 0);
      }
  }
#endif
  return vn;
}
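
/* An illustrative instance of compute_v (hypothetical values): take
   n = 1, a = 6, b = 4, g = 2 and u = -1 (usize = -1). Then
   v = (g + |u| a) / b = (2 + 6) / 4 = 2, matching g = u a + v b,
   and the function returns vn = 1 with vp[0] = 2. The division is
   exact by construction, as the WANT_ASSERT block above verifies. */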

/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and
   2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
   + 1 for v and 2T + 1 scratch space. In all, 7T + 3 is sufficient.
 */
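
/* Spelling out the final bound (same arithmetic as above, with
   T = GCDEXT_DC_THRESHOLD): u, a and b account for 3T limbs and the
   gcdext_lehmer scratch for 4T + 3, giving 7T + 3 in total; the
   (T + 1) + (2T + 1) = 3T + 2 limbs needed by compute_v fit within
   the same allocation. */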

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the u. It may be desirable to use a different
 * choice of p in each iteration. Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input size just above the threshold, but
 * degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)
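
/* A minimal usage sketch of mpn_gcdext, assuming the documented mpn
   calling convention (both source operands are clobbered, an >= n >= 1
   as asserted below, and gp/up sized generously at n + 1 limbs each;
   link with -lgmp):

     #include <stdio.h>
     #include <gmp.h>

     int
     main (void)
     {
       mp_limb_t a[1] = { 15 };
       mp_limb_t b[1] = { 10 };
       mp_limb_t g[2], u[2];
       mp_size_t usize;
       mp_size_t gn = mpn_gcdext (g, u, &usize, a, 1, b, 1);

       // gcd (15, 10) = 5, so gn == 1 and g[0] == 5. The cofactor
       // {u, |usize|} satisfies g = u*a + v*b for some v that is not
       // returned; usize < 0 means u is negative.
       printf ("g = %lu, |u| = %lu, usize = %d\n",
	       (unsigned long) g[0], (unsigned long) u[0], (int) usize);
       return 0;
     }
*/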

mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ( (u0[un-1] | u1[un-1]) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed. Then either one of a or b is very
	   small, or the difference is very small. Perform one
	   subtraction followed by one division. */
	mp_size_t gn;
	mp_size_t updated_un = 1;

	u1[0] = 1;

	/* Temporary storage 2n + 1 */
	n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				    u0, u1, &updated_un, tp, tp + n);
	if (n == 0)
	  {
	    TMP_FREE;
	    return gn;
	  }

	un = updated_un;
	ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers. May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  mp_size_t gn;
	  mp_size_t updated_un = un;

	  /* Temporary storage 2n + 1 */
	  n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				      u0, u1, &updated_un, tp, tp + n);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return gn;
	    }

	  un = updated_un;
	  ASSERT (un < ualloc);
	}
    }

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      ASSERT (c != 0);
      if (c < 0)
	{
	  MPN_NORMALIZE (u0, un);
	  MPN_COPY (up, u0, un);
	  *usizep = -un;
	}
      else
	{
	  MPN_NORMALIZE_NOT_ZERO (u1, un);
	  MPN_COPY (up, u1, un);
	  *usizep = un;
	}

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (u0, un))
    {
      mp_size_t gn;
      ASSERT (un == 1);
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* We have A = ... a + ... b
		 B =  u0 a +  u1 b

		 a = u1  A + ... B
		 b = -u0 A + ... B

	 with bounds

	   |u0|, |u1| <= B / min(a, b)

	 Compute g = u a + v b = (u u1 - v u0) A + (...) B
	 Here, u, v are bounded by

	 |u| <= b,
	 |v| <= a
      */
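
      /* Note that u and v from the lehmer call have opposite signs
	 (see compute_v above), so |u u1 - v u0| = |u| u1 + |v| u0;
	 the code below therefore combines magnitudes and fixes the
	 sign at the end via the negate flag. */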

      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      if (lehmer_un == 0)
	{
	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);

      /* It's possible that u0 = 1, u1 = 0 */
      if (u1n == 0)
	{
	  ASSERT (un == 1);
	  ASSERT (u0[0] == 1);

	  /* u1 == 0 ==> u u1 + v u0 = v */
	  MPN_COPY (up, lehmer_vp, lehmer_vn);
	  *usizep = negate ? lehmer_vn : - lehmer_vn;

	  TMP_FREE;
	  return gn;
	}

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* Now u0, u1, u are non-zero. We may still have v == 0 */

      /* Compute u u1 */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Overwrites old u1 value */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}