/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Computes (r;b) = (a;b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static mp_size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }
  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
	n--;
    }

  return n;
}
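
/* Toy illustration (single-limb values, purely for exposition): with
   M = (u00 u01; u10 u11) = (2 1; 1 1), a = 5 and b = 3, the product
   (r;b) = (a;b) M gives

     r = u00 a + u10 b = 2*5 + 1*3 = 13
     b = u01 a + u11 b = 1*5 + 1*3 = 8

   following exactly the t/r/b schedule in the comment above.  Note
   det M = 2*1 - 1*1 = 1, as for any hgcd matrix. */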

#define COMPUTE_V_ITCH(n) (2*(n) + 1)

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. a, b are of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);

  an = n;
  MPN_NORMALIZE (ap, an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;
  size -= tp[size - 1] == 0;

  ASSERT (gn <= size);

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* usize < 0 */
      /* |v| = v = (g - u a) / b = (g + |u| a) / b */
      mp_limb_t cy = mpn_add (tp, tp, size, gp, gn);
      if (cy)
	tp[size++] = cy;
    }

  /* Now divide t / b. There must be no remainder. */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  return vn;
}
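
/* Worked instance (illustrative only): a = 10, b = 6, g = 2, u = -1,
   so usize < 0.  Then |v| = (g + |u| a) / b = (2 + 10)/6 = 2, and
   indeed u a + v b = -10 + 2*6 = 2 = g, with v of the sign opposite
   to u, and with the division by b exact, as the code requires. */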

/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4).

   Storage for hgcd, input (n + 1)/2: 9n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and
   2(n+1) for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space, i.e. 7T+3 in total. Next, for
   compute_v, we need T for u, T+1 for v and 2T+1 scratch space, i.e.
   4T+2 in total. In all, 7T+3 is sufficient for both operations. */

/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current sizes of u0 and u1. It may be desirable to use a different
 * choice of p in each iteration. Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input size just above the threshold, but
 * degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)
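
/* For instance, at n = 120 limbs the first call below runs hgcd on the
   high n - p = 60 limbs (p = n/2), while each later iteration uses
   p = n/3 = 40, i.e. hgcd inputs of 80 limbs, trading work inside hgcd
   against the cost of applying the resulting matrix to u0 and u1. */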

mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ( (u0[un-1] | u1[un-1] ) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed. Then either one of a or b is very
	   small, or the difference is very small. Perform one
	   subtraction followed by one division. */
	mp_size_t gn;
	mp_size_t updated_un = 1;

	u1[0] = 1;

	/* Temporary storage 2n + 1 */
	n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				    u0, u1, &updated_un, tp, tp + n);
	if (n == 0)
	  {
	    TMP_FREE;
	    return gn;
	  }

	un = updated_un;
	ASSERT (un < ualloc);
      }
  }

  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers. May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  mp_size_t gn;
	  mp_size_t updated_un = un;

	  /* Temporary storage 2n + 1 */
	  n = mpn_gcdext_subdiv_step (gp, &gn, up, usizep, ap, bp, n,
				      u0, u1, &updated_un, tp, tp + n);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return gn;
	    }

	  un = updated_un;
	  ASSERT (un < ualloc);
	}
    }

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      ASSERT (c != 0);
      if (c < 0)
	{
	  MPN_NORMALIZE (u0, un);
	  MPN_COPY (up, u0, un);
	  *usizep = -un;
	}
      else
	{
	  MPN_NORMALIZE_NOT_ZERO (u1, un);
	  MPN_COPY (up, u1, un);
	  *usizep = un;
	}

      TMP_FREE;
      return n;
    }
  else if (mpn_zero_p (u0, un))
    {
      mp_size_t gn;
      ASSERT (un == 1);
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* We have A = ... a + ... b
		 B =  u0 a +  u1 b

		 a = u1  A + ... B
		 b = -u0 A + ... B

	 with bounds

	   |u0|, |u1| <= B / min(a, b)

	 Compute g = u a + v b = (u u1 - v u0) A + (...) B
	 Here, u, v are bounded by

	 |u| <= b,
	 |v| <= a
      */

      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      if (lehmer_un == 0)
	{
	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);

      /* It's possible that u0 = 1, u1 = 0 */
      if (u1n == 0)
	{
	  ASSERT (un == 1);
	  ASSERT (u0[0] == 1);

	  /* u1 == 0 ==> u u1 + v u0 = v */
	  MPN_COPY (up, lehmer_vp, lehmer_vn);
	  *usizep = negate ? lehmer_vn : - lehmer_vn;

	  TMP_FREE;
	  return gn;
	}

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* Now u0, u1, u are non-zero. We may still have v == 0 */
      /* Compute u u1 */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Overwrites old u1 value */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
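
/* Usage sketch: the documented mpz-level entry point mpz_gcdext, which
   is implemented on top of mpn_gcdext, computes g and cofactors s, t
   with a*s + b*t = g.  A minimal self-contained caller (assumes
   GMP >= 5.0 for mpz_inits/mpz_clears), checking the Bezout identity
   gcd(240, 46) = 2 = 240*(-9) + 46*47:

     #include <assert.h>
     #include <gmp.h>

     int
     main (void)
     {
       mpz_t g, s, t, a, b, check;
       mpz_inits (g, s, t, check, NULL);
       mpz_init_set_ui (a, 240);
       mpz_init_set_ui (b, 46);
       mpz_gcdext (g, s, t, a, b);
       mpz_mul (check, a, s);
       mpz_addmul (check, b, t);
       assert (mpz_cmp (check, g) == 0);
       mpz_clears (g, s, t, a, b, check, NULL);
       return 0;
     }
*/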