/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009
Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
	n--;
    }

  return n;
}
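
/* Editor's sketch (not part of the build): the same 2x2 update written
   with mpz arithmetic, to make the limb-level code above easier to
   follow. demo_matrix_vector_update and the example matrix entries
   are hypothetical; only standard mpz_* calls are assumed. */
#if 0
static void
demo_matrix_vector_update (void)
{
  /* With M = (u00, u01; u10, u11), compute the row-vector product
     (r; b') = (a; b) M, i.e. r = u00 a + u10 b and b' = u01 a + u11 b,
     in the same order as hgcd_mul_matrix_vector. */
  mpz_t a, b, r, t;
  mpz_init_set_ui (a, 1000);
  mpz_init_set_ui (b, 777);
  mpz_init (r);
  mpz_init (t);

  mpz_mul_ui (t, a, 3);		/* t = u00 * a, with u00 = 3 */
  mpz_mul_ui (r, b, 4);		/* r = u10 * b, with u10 = 4 */
  mpz_add (r, r, t);		/* r = u00 a + u10 b */

  mpz_mul_ui (t, b, 3);		/* t = u11 * b, with u11 = 3 */
  mpz_mul_ui (b, a, 2);		/* b = u01 * a, with u01 = 2 */
  mpz_add (b, b, t);		/* b = u01 a + u11 b */

  gmp_printf ("r = %Zd, b = %Zd\n", r, b); /* r = 6108, b = 4331 */

  mpz_clear (a);
  mpz_clear (b);
  mpz_clear (r);
  mpz_clear (t);
}
#endif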

#define COMPUTE_V_ITCH(n) (2*(n) + 1)

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. a, b are of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);

  an = n;
  MPN_NORMALIZE (ap, an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;
  size -= tp[size - 1] == 0;

  ASSERT (gn <= size);

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* usize < 0 */
      /* |v| = v = (g - u a) / b = (g + |u| a) / b */
      mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
      if (cy)
	tp[size++] = cy;
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  return vn;
}
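
/* Editor's sketch (not part of the build): the identity behind
   compute_v, at the mpz level. Given g = gcd(a,b) and a cofactor u
   with g = u a + v b, the second cofactor is the exact quotient
   v = (g - u a) / b. demo_compute_v is a hypothetical name. */
#if 0
static void
demo_compute_v (mpz_t v, const mpz_t a, const mpz_t b,
		const mpz_t g, const mpz_t u)
{
  mpz_t t;
  mpz_init (t);
  mpz_mul (t, u, a);		/* t = u a */
  mpz_sub (t, g, t);		/* t = g - u a */
  mpz_divexact (v, t, b);	/* exact, since b | (g - u a) */
  mpz_clear (t);
}
#endif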

/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and
   2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the Lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
   for u, T+1 for v, and 2T+1 scratch space. In all, 7T+3 is
   sufficient for both operations. */
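
/* Editor's note, spelling out the 7T+3 bound claimed above:
   gcdext_lehmer needs 3T (copies of u, a and b) plus 4T+3 scratch,
   i.e. 7T+3 in all, while compute_v needs T (u) + (T+1) (v)
   + COMPUTE_V_ITCH(T) = 2T+1 scratch, i.e. 4T+2 in all, so a single
   7T+3 block covers both calls. */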

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the u. It may be desirable to use a different
 * choice of p in each iteration. Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input size just above the threshold, but
 * degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)
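
/* Editor's note: e.g. for n = 3000 limbs, the first hgcd call below
   splits at p = CHOOSE_P_1(3000) = 1500, and each later loop
   iteration at p = CHOOSE_P_2(n) = n/3 of the then-current n. */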

mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
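      /* Reduce a modulo b; the quotient (an - n + 1 limbs) goes to
	 scratch, and the remainder replaces a in place. */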
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ( (u0[un-1] | u1[un-1] ) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed. Then either one of a or b is very
	   small, or the difference is very small. Perform one
	   subtraction followed by one division. */
	mp_size_t gn;
	mp_size_t updated_un = 1;

	u1[0] = 1;

	/* Temporary storage 2n + 1 */
	n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				    u0, u1, &updated_un, tp, tp + n);
	if (n == 0)
	  {
	    TMP_FREE;
	    return gn;
	  }

	un = updated_un;
	ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers. May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  mp_size_t gn;
	  mp_size_t updated_un = un;

	  /* Temporary storage 2n + 1 */
	  n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				      u0, u1, &updated_un, tp, tp + n);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return gn;
	    }

	  un = updated_un;
	  ASSERT (un < ualloc);
	}
    }

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
	 this case we choose the cofactor + 1, corresponding to G = A
	 - k B, rather than -1, corresponding to G = - A + (k+1) B. */
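      /* Editor's example of the c == 0 case: A = 5, B = 2 (so G = 1,
	 k = 2). Both cofactor choices are valid, 1 = 5 - 2*2 and
	 1 = -5 + 3*2; the code below picks the +1 cofactor. */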
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
	{
	  MPN_NORMALIZE (u0, un);
	  MPN_COPY (up, u0, un);
	  *usizep = -un;
	}
      else
	{
	  MPN_NORMALIZE_NOT_ZERO (u1, un);
	  MPN_COPY (up, u1, un);
	  *usizep = un;
	}

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (u0, un))
    {
      mp_size_t gn;
      ASSERT (un == 1);
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* We have A = ... a + ... b
		 B =  u0 a +  u1 b

		 a = u1  A + ... B
		 b = -u0 A + ... B

	 with bounds

	   |u0|, |u1| <= B / min(a, b)

	 Compute g = u a + v b = (u u1 - v u0) A + (...) B
	 Here, u, v are bounded by

	 |u| <= b,
	 |v| <= a
      */

      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      if (lehmer_un == 0)
	{
	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);

      /* It's possible that u0 = 1, u1 = 0 */
      if (u1n == 0)
	{
	  ASSERT (un == 1);
	  ASSERT (u0[0] == 1);

	  /* u1 == 0 ==> u u1 + v u0 = v */
	  MPN_COPY (up, lehmer_vp, lehmer_vn);
	  *usizep = negate ? lehmer_vn : - lehmer_vn;

	  TMP_FREE;
	  return gn;
	}

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* Now u0, u1, u are non-zero. We may still have v == 0 */

      /* Compute u u1 (the final cofactor combines u u1 + v u0) */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Overwrites old u1 value */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
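
/* Editor's sketch (not part of the build): calling mpn_gcdext through
   its public interface. The operands are destroyed, must satisfy
   an >= n > 0 with the high limb of {bp, n} nonzero, and the
   cofactor's sign comes back through *usizep. demo_gcdext is a
   hypothetical name; the expected gcd for these inputs is 4. */
#if 0
static void
demo_gcdext (void)
{
  mp_limb_t a[1] = { 12 };
  mp_limb_t b[1] = { 8 };
  mp_limb_t g[1];
  mp_limb_t u[2];		/* the cofactor may need n + 1 limbs */
  mp_size_t usize;
  mp_size_t gn;

  gn = mpn_gcdext (g, u, &usize, a, 1, b, 1);

  /* gcd(12, 8) = 4 = u*12 + v*8, where v has the opposite sign of u
     and can be recovered as in compute_v above. */
  gmp_printf ("gn = %ld, g = %Mu, |u| = %Mu, sign %c\n",
	      (long) gn, g[0], u[0], usize < 0 ? '-' : '+');
}
#endif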