1 /* hgcd.c.
2 
3    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
4    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
5    GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6 
7 Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
8 
9 This file is part of the GNU MP Library.
10 
11 The GNU MP Library is free software; you can redistribute it and/or modify
12 it under the terms of the GNU Lesser General Public License as published by
13 the Free Software Foundation; either version 3 of the License, or (at your
14 option) any later version.
15 
16 The GNU MP Library is distributed in the hope that it will be useful, but
17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
19 License for more details.
20 
21 You should have received a copy of the GNU Lesser General Public License
22 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
23 
24 #include "gmp.h"
25 #include "gmp-impl.h"
26 #include "longlong.h"
27 
28 /* For input of size n, matrix elements are of size at most ceil(n/2)
29    - 1, but we need two limbs extra. */
30 void
31 mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p)
32 {
33   mp_size_t s = (n+1)/2 + 1;
34   M->alloc = s;
35   M->n = 1;
36   MPN_ZERO (p, 4 * s);
37   M->p[0][0] = p;
38   M->p[0][1] = p + s;
39   M->p[1][0] = p + 2 * s;
40   M->p[1][1] = p + 3 * s;
41 
42   M->p[0][0][0] = M->p[1][1][0] = 1;
43 }
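
/* Illustrative sketch (not library code): how a caller provides the 4 * s
   limbs that mpn_hgcd_matrix_init splits into the four matrix elements.
   Callers inside the library obtain this count via MPN_HGCD_MATRIX_INIT_ITCH
   (see its use in mpn_hgcd below); the helper here is hypothetical and kept
   under #if 0 so it never enters the build. */
#if 0
static void
example_hgcd_matrix_storage (void)
{
  /* For n = 6 we get s = (6+1)/2 + 1 = 4, so 4 * s = 16 limbs suffice. */
  mp_limb_t buf[16];
  struct hgcd_matrix M;

  mpn_hgcd_matrix_init (&M, 6, buf);

  /* M is now the 2x2 identity matrix, one limb per element. */
  ASSERT (M.n == 1 && M.alloc == 4);
  ASSERT (M.p[0][0][0] == 1 && M.p[1][1][0] == 1);
  ASSERT (M.p[0][1][0] == 0 && M.p[1][0][0] == 0);
}
#endif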
44 
45 /* Update column COL, adding in column (1-COL). */
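/* In matrix terms this is a right multiplication by an elementary matrix:
   col = 1 gives M <- M * (1,1; 0,1) and col = 0 gives M <- M * (1,0; 1,1).
   hgcd_matrix_update_q below is the same update with a general quotient Q
   in place of 1. */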
46 static void
47 hgcd_matrix_update_1 (struct hgcd_matrix *M, unsigned col)
48 {
49   mp_limb_t c0, c1;
50   ASSERT (col < 2);
51 
52   c0 = mpn_add_n (M->p[0][col], M->p[0][0], M->p[0][1], M->n);
53   c1 = mpn_add_n (M->p[1][col], M->p[1][0], M->p[1][1], M->n);
54 
55   M->p[0][col][M->n] = c0;
56   M->p[1][col][M->n] = c1;
57 
58   M->n += (c0 | c1) != 0;
59   ASSERT (M->n < M->alloc);
60 }
61 
62 /* Update column COL, adding in column Q * (1-COL). Temporary
63  * storage: qn + n <= M->alloc, where n is the size of the largest
64  * element in column 1 - COL. */
65 static void
66 hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn,
67 		      unsigned col, mp_ptr tp)
68 {
69   ASSERT (col < 2);
70 
71   if (qn == 1)
72     {
73       mp_limb_t q = qp[0];
74       mp_limb_t c0, c1;
75 
76       c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q);
77       c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q);
78 
79       M->p[0][col][M->n] = c0;
80       M->p[1][col][M->n] = c1;
81 
82       M->n += (c0 | c1) != 0;
83     }
84   else
85     {
86       unsigned row;
87 
88       /* Carries for the unlikely case that we get both high words
89 	 from the multiplication and carries from the addition. */
90       mp_limb_t c[2];
91       mp_size_t n;
92 
93       /* The matrix will not necessarily grow in size by qn, so we
94 	 need normalization in order not to overflow M. */
95 
96       for (n = M->n; n + qn > M->n; n--)
97 	{
98 	  ASSERT (n > 0);
99 	  if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0)
100 	    break;
101 	}
102 
103       ASSERT (qn + n <= M->alloc);
104 
105       for (row = 0; row < 2; row++)
106 	{
107 	  if (qn <= n)
108 	    mpn_mul (tp, M->p[row][1-col], n, qp, qn);
109 	  else
110 	    mpn_mul (tp, qp, qn, M->p[row][1-col], n);
111 
112 	  ASSERT (n + qn >= M->n);
113 	  c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n);
114 	}
115       if (c[0] | c[1])
116 	{
117 	  M->n = n + qn + 1;
118 	  M->p[0][col][M->n - 1] = c[0];
119 	  M->p[1][col][M->n - 1] = c[1];
120 	}
121       else
122 	{
123 	  n += qn;
124 	  n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0;
125 	  if (n > M->n)
126 	    M->n = n;
127 	}
128     }
129 
130   ASSERT (M->n < M->alloc);
131 }
132 
133 /* Multiply M by M1 from the right. Since the M1 elements fit in
134    GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs
135    temporary space of M->n limbs. */
136 static void
137 hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1,
138 		   mp_ptr tp)
139 {
140   mp_size_t n0, n1;
141 
142   /* Could avoid copy by some swapping of pointers. */
143   MPN_COPY (tp, M->p[0][0], M->n);
144   n0 = mpn_hgcd_mul_matrix1_vector (M1, M->p[0][0], tp, M->p[0][1], M->n);
145   MPN_COPY (tp, M->p[1][0], M->n);
146   n1 = mpn_hgcd_mul_matrix1_vector (M1, M->p[1][0], tp, M->p[1][1], M->n);
147 
148   /* Depends on zero initialization */
149   M->n = MAX(n0, n1);
150   ASSERT (M->n < M->alloc);
151 }
152 
153 /* Perform a few steps, using a combination of mpn_hgcd2, subtraction and
154    division. Reduces the size by almost one limb or more, but never
155    below the given size s. Return new size for a and b, or 0 if no
156    more steps are possible.
157 
158    If hgcd2 succeeds, needs temporary space for hgcd_matrix_mul_1, M->n
159    limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2
160    fails, needs space for the quotient, qn <= n - s + 1 limbs, and for
161    hgcd_matrix_update_q, qn + (size of the appropriate column of M) <=
162    resulting size of M.
163 
164    If N is the input size to the calling hgcd, then s = floor(N/2) +
165    1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1
166    < N, so N is sufficient.
167 */
168 
169 static mp_size_t
170 hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
171 	   struct hgcd_matrix *M, mp_ptr tp)
172 {
173   struct hgcd_matrix1 M1;
174   mp_limb_t mask;
175   mp_limb_t ah, al, bh, bl;
176   mp_size_t an, bn, qn;
177   int col;
178 
179   ASSERT (n > s);
180 
181   mask = ap[n-1] | bp[n-1];
182   ASSERT (mask > 0);
183 
184   if (n == s + 1)
185     {
186       if (mask < 4)
187 	goto subtract;
188 
189       ah = ap[n-1]; al = ap[n-2];
190       bh = bp[n-1]; bl = bp[n-2];
191     }
192   else if (mask & GMP_NUMB_HIGHBIT)
193     {
194       ah = ap[n-1]; al = ap[n-2];
195       bh = bp[n-1]; bl = bp[n-2];
196     }
197   else
198     {
199       int shift;
200 
201       count_leading_zeros (shift, mask);
202       ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
203       al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
204       bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
205       bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
206     }
207 
208   /* Try an mpn_hgcd2 step */
209   if (mpn_hgcd2 (ah, al, bh, bl, &M1))
210     {
211       /* Multiply M <- M * M1 */
212       hgcd_matrix_mul_1 (M, &M1, tp);
213 
214       /* Can't swap inputs, so we need to copy. */
215       MPN_COPY (tp, ap, n);
216       /* Multiply M1^{-1} (a;b) */
217       return mpn_hgcd_mul_matrix1_inverse_vector (&M1, ap, tp, bp, n);
218     }
219 
220  subtract:
221   /* There are two ways in which mpn_hgcd2 can fail. Either one of ah and
222      bh was too small, or ah, bh were (almost) equal. Perform one
223      subtraction step (for possible cancellation of high limbs),
224      followed by one division. */
225 
226   /* Since we must ensure that #(a-b) > s, we handle cancellation of
227      high limbs explicitly up front. (FIXME: Or is it better to just
228      subtract, normalize, and use an addition to undo if it turns out
229      that the difference is too small?) */
230   for (an = n; an > s; an--)
231     if (ap[an-1] != bp[an-1])
232       break;
233 
234   if (an == s)
235     return 0;
236 
237   /* Maintain a > b. When needed, swap a and b, and let col keep track
238      of how to update M. */
239   if (ap[an-1] > bp[an-1])
240     {
241       /* a is largest. In the subtraction step, we need to update
242 	 column 1 of M */
243       col = 1;
244     }
245   else
246     {
247       MP_PTR_SWAP (ap, bp);
248       col = 0;
249     }
250 
251   bn = n;
252   MPN_NORMALIZE (bp, bn);
253   if (bn <= s)
254     return 0;
255 
256   /* We have #a, #b > s. When is it possible that #(a-b) < s? For
257      cancellation to happen, the numbers must be of the form
258 
259        a = x + 1, 0,            ..., 0,            al
260        b = x    , GMP_NUMB_MAX, ..., GMP_NUMB_MAX, bl
261 
262      where al, bl denote the least significant k limbs. If al < bl,
263      then #(a-b) < k, and if also high(al) != 0, high(bl) != GMP_NUMB_MAX,
264      then #(a-b) = k. If al >= bl, then #(a-b) = k + 1. */
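  /* A decimal analogue for intuition (one-digit "limbs", k = 1):
     a = 5 0 0 3, b = 4 9 9 7 gives a - b = 6, so al = 3 < bl = 7 and
     #(a-b) = k, while a = 5 0 0 7, b = 4 9 9 3 gives a - b = 14, so
     al >= bl and #(a-b) = k + 1. */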
265 
266   if (ap[an-1] == bp[an-1] + 1)
267     {
268       mp_size_t k;
269       int c;
270       for (k = an-1; k > s; k--)
271 	if (ap[k-1] != 0 || bp[k-1] != GMP_NUMB_MAX)
272 	  break;
273 
274       MPN_CMP (c, ap, bp, k);
275       if (c < 0)
276 	{
277 	  mp_limb_t cy;
278 
279 	  /* The limbs from k and up are cancelled. */
280 	  if (k == s)
281 	    return 0;
282 	  cy = mpn_sub_n (ap, ap, bp, k);
283 	  ASSERT (cy == 1);
284 	  an = k;
285 	}
286       else
287 	{
288 	  ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, k));
289 	  ap[k] = 1;
290 	  an = k + 1;
291 	}
292     }
293   else
294     ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, an));
295 
296   ASSERT (an > s);
297   ASSERT (ap[an-1] > 0);
298   ASSERT (bn > s);
299   ASSERT (bp[bn-1] > 0);
300 
301   hgcd_matrix_update_1 (M, col);
302 
303   if (an < bn)
304     {
305       MPN_PTR_SWAP (ap, an, bp, bn);
306       col ^= 1;
307     }
308   else if (an == bn)
309     {
310       int c;
311       MPN_CMP (c, ap, bp, an);
312       if (c < 0)
313 	{
314 	  MP_PTR_SWAP (ap, bp);
315 	  col ^= 1;
316 	}
317     }
318 
319   /* Divide a / b. */
320   qn = an + 1 - bn;
321 
322   /* FIXME: We could use an approximate division that may return a
323      quotient that is too small, and only guarantee that the size of r
324      is almost the size of b. FIXME: Let ap and the remainder overlap. */
325   mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn);
326   qn -= (tp[qn - 1] == 0);
327 
328   /* Normalize remainder */
329   an = bn;
330   for ( ; an > s; an--)
331     if (ap[an-1] > 0)
332       break;
333 
334   if (an <= s)
335     {
336       /* Quotient is too large */
337       mp_limb_t cy;
338 
339       cy = mpn_add (ap, bp, bn, ap, an);
340 
341       if (cy > 0)
342 	{
343 	  ASSERT (bn < n);
344 	  ap[bn] = cy;
345 	  bp[bn] = 0;
346 	  bn++;
347 	}
348 
349       MPN_DECR_U (tp, qn, 1);
350       qn -= (tp[qn-1] == 0);
351     }
352 
353   if (qn > 0)
354     hgcd_matrix_update_q (M, tp, qn, col, tp + qn);
355 
356   return bn;
357 }
358 
359 /* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
360    with elements of size at most (n+1)/2 - 1. Returns new size of a,
361    b, or zero if no reduction is possible. */
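/* Needs temporary space for hgcd_step; MPN_HGCD_LEHMER_ITCH (n) limbs
   suffice (this is what mpn_hgcd_itch returns below the threshold). */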
362 mp_size_t
363 mpn_hgcd_lehmer (mp_ptr ap, mp_ptr bp, mp_size_t n,
364 		 struct hgcd_matrix *M, mp_ptr tp)
365 {
366   mp_size_t s = n/2 + 1;
367   mp_size_t nn;
368 
369   ASSERT (n > s);
370   ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
371 
372   nn = hgcd_step (n, ap, bp, s, M, tp);
373   if (!nn)
374     return 0;
375 
376   for (;;)
377     {
378       n = nn;
379       ASSERT (n > s);
380       nn = hgcd_step (n, ap, bp, s, M, tp);
381       if (!nn)
382 	return n;
383     }
384 }
385 
386 /* Multiply M by M1 from the right. Needs 3*(M->n + M1->n) + 5 limbs
387    of temporary storage (see mpn_matrix22_mul_itch). */
388 void
389 mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1,
390 		     mp_ptr tp)
391 {
392   mp_size_t n;
393 
394   /* About the new size of M's elements. Since M1's diagonal elements
395      are > 0, no element can decrease. The new elements are of size
396      M->n + M1->n, one limb more or less. The computation of the
397      matrix product produces elements of size M->n + M1->n + 1. But
398      the true size, after normalization, may be three limbs smaller.
399 
400      The reason that the product has normalized size >= M->n + M1->n -
401      2 is subtle. It depends on the fact that M and M1 can be factored
402      as products of (1,1; 0,1) and (1,0; 1,1), and that we can't have
403      M ending with a large power and M1 starting with a large power of
404      the same matrix. */
405 
406   /* FIXME: Strassen multiplication gives only a small speedup. In FFT
407      multiplication range, this function could be sped up quite a lot
408      using invariance. */
409   ASSERT (M->n + M1->n < M->alloc);
410 
411   ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1]
412 	   | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0);
413 
414   ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1]
415 	   | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0);
416 
417   mpn_matrix22_mul (M->p[0][0], M->p[0][1],
418 		    M->p[1][0], M->p[1][1], M->n,
419 		    M1->p[0][0], M1->p[0][1],
420 		    M1->p[1][0], M1->p[1][1], M1->n, tp);
421 
422   /* Index of last potentially non-zero limb, size is one greater. */
423   n = M->n + M1->n;
424 
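  /* By the normalization argument above the product has size at least
     M->n + M1->n - 2, so at most three of the decrements below can apply. */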
425   n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
426   n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
427   n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
428 
429   ASSERT ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) > 0);
430 
431   M->n = n + 1;
432 }
433 
434 /* Multiplies the least significant p limbs of (a;b) by M^-1.
435    Temporary space needed: 2 * (p + M->n) limbs. */
436 mp_size_t
437 mpn_hgcd_matrix_adjust (struct hgcd_matrix *M,
438 			mp_size_t n, mp_ptr ap, mp_ptr bp,
439 			mp_size_t p, mp_ptr tp)
440 {
441   /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a ; b)
442      = (r11 a - r01 b; - r10 a + r00 b) */
443 
444   mp_ptr t0 = tp;
445   mp_ptr t1 = tp + p + M->n;
446   mp_limb_t ah, bh;
447   mp_limb_t cy;
448 
449   ASSERT (p + M->n  < n);
450 
451   /* First compute the two values depending on a, before overwriting a */
452 
453   if (M->n >= p)
454     {
455       mpn_mul (t0, M->p[1][1], M->n, ap, p);
456       mpn_mul (t1, M->p[1][0], M->n, ap, p);
457     }
458   else
459     {
460       mpn_mul (t0, ap, p, M->p[1][1], M->n);
461       mpn_mul (t1, ap, p, M->p[1][0], M->n);
462     }
463 
464   /* Update a */
465   MPN_COPY (ap, t0, p);
466   ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n);
467 
468   if (M->n >= p)
469     mpn_mul (t0, M->p[0][1], M->n, bp, p);
470   else
471     mpn_mul (t0, bp, p, M->p[0][1], M->n);
472 
473   cy = mpn_sub (ap, ap, n, t0, p + M->n);
474   ASSERT (cy <= ah);
475   ah -= cy;
476 
477   /* Update b */
478   if (M->n >= p)
479     mpn_mul (t0, M->p[0][0], M->n, bp, p);
480   else
481     mpn_mul (t0, bp, p, M->p[0][0], M->n);
482 
483   MPN_COPY (bp, t0, p);
484   bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n);
485   cy = mpn_sub (bp, bp, n, t1, p + M->n);
486   ASSERT (cy <= bh);
487   bh -= cy;
488 
489   if (ah > 0 || bh > 0)
490     {
491       ap[n] = ah;
492       bp[n] = bh;
493       n++;
494     }
495   else
496     {
497       /* The subtraction can reduce the size by at most one limb. */
498       if (ap[n-1] == 0 && bp[n-1] == 0)
499 	n--;
500     }
501   ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
502   return n;
503 }
504 
505 /* Size analysis for hgcd:
506 
507    For the recursive calls, we have n1 <= ceil(n / 2). Then the
508    storage need is determined by the storage for the recursive call
509    computing M1, and hgcd_matrix_adjust and hgcd_matrix_mul calls that use M1
510    (after this, the storage needed for M1 can be recycled).
511 
512    Let S(r) denote the required storage for an input of size r. For M1 we need 4 * (ceil(n1/2) + 1)
513    = 4 * (ceil(n/4) + 1), for the hgcd_matrix_adjust call, we need n + 2,
514    and for the hgcd_matrix_mul, we may need 3 ceil(n/2) + 8. In total,
515    4 * ceil(n/4) + 3 ceil(n/2) + 12 <= 10 ceil(n/4) + 12.
516 
517    For the recursive call, we need S(n1) = S(ceil(n/2)).
518 
519    S(n) <= 10*ceil(n/4) + 12 + S(ceil(n/2))
520 	<= 10*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 12k + S(ceil(n/2^k))
521 	<= 10*(2 ceil(n/4) + k) + 12k + S(ceil(n/2^k))
522 	<= 20 ceil(n/4) + 22k + S(ceil(n/2^k))
523 */
524 
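/* The return value below is 20 * ceil(n/4) + 22 * k
   + MPN_HGCD_LEHMER_ITCH (HGCD_THRESHOLD), matching the bound above, with k
   the recursion depth down to HGCD_THRESHOLD; note that (n+3)/4 computes
   ceil(n/4). */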
525 mp_size_t
526 mpn_hgcd_itch (mp_size_t n)
527 {
528   unsigned k;
529   int count;
530   mp_size_t nscaled;
531 
532   if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
533     return MPN_HGCD_LEHMER_ITCH (n);
534 
535   /* Get the recursion depth. */
536   nscaled = (n - 1) / (HGCD_THRESHOLD - 1);
537   count_leading_zeros (count, nscaled);
538   k = GMP_LIMB_BITS - count;
539 
540   return 20 * ((n+3) / 4) + 22 * k
541     + MPN_HGCD_LEHMER_ITCH (HGCD_THRESHOLD);
542 }
543 
544 /* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
545    with elements of size at most (n+1)/2 - 1. Returns new size of a,
546    b, or zero if no reduction is possible. */
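/* Outline: a recursive call on the most significant n - floor(n/2) limbs,
   whose matrix is applied to the full operands by mpn_hgcd_matrix_adjust;
   hgcd_step iterations until the size drops to 3n/4 + 1; a second recursive
   call on a high part, folded into M by mpn_hgcd_matrix_mul; and finally
   hgcd_step iterations until no further reduction is possible. */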
547 
548 mp_size_t
549 mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n,
550 	  struct hgcd_matrix *M, mp_ptr tp)
551 {
552   mp_size_t s = n/2 + 1;
553   mp_size_t n2 = (3*n)/4 + 1;
554 
555   mp_size_t p, nn;
556   int success = 0;
557 
558   if (n <= s)
559     /* Happens when n <= 2, a fairly uninteresting case but exercised
560        by the random inputs of the testsuite. */
561     return 0;
562 
563   ASSERT ((ap[n-1] | bp[n-1]) > 0);
564 
565   ASSERT ((n+1)/2 - 1 < M->alloc);
566 
567   if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
568     return mpn_hgcd_lehmer (ap, bp, n, M, tp);
569 
570   p = n/2;
571   nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
572   if (nn > 0)
573     {
574       /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
575 	 = 2 (n - 1) */
576       n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
577       success = 1;
578     }
579   while (n > n2)
580     {
581       /* Needs n + 1 storage */
582       nn = hgcd_step (n, ap, bp, s, M, tp);
583       if (!nn)
584 	return success ? n : 0;
585       n = nn;
586       success = 1;
587     }
588 
589   if (n > s + 2)
590     {
591       struct hgcd_matrix M1;
592       mp_size_t scratch;
593 
594       p = 2*s - n + 1;
595       scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);
596 
597       mpn_hgcd_matrix_init(&M1, n - p, tp);
598       nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch);
599       if (nn > 0)
600 	{
601 	  /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
602 	  ASSERT (M->n + 2 >= M1.n);
603 
604 	  /* Furthermore, assume M ends with a quotient (1, q; 0, 1),
605 	     then either q or q + 1 is a correct quotient, and M1 will
606 	     start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
607 	     rules out the case that the size of M * M1 is much
608 	     smaller than the expected M->n + M1->n. */
609 
610 	  ASSERT (M->n + M1.n < M->alloc);
611 
612 	  /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
613 	     = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
614 	  n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);
615 
616 	  /* We need a bound for M->n + M1.n. Let n be the original
617 	     input size. Then
618 
619 	       ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2
620 
621 	     and it follows that
622 
623 	       M.n + M1.n <= ceil(n/2) + 1
624 
625 	     Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the
626 	     amount of needed scratch space. */
627 	  mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
628 	  success = 1;
629 	}
630     }
631 
632   /* This really is the base case */
633   for (;;)
634     {
635       /* Needs s+3 < n */
636       nn = hgcd_step (n, ap, bp, s, M, tp);
637       if (!nn)
638 	return success ? n : 0;
639 
640       n = nn;
641       success = 1;
642     }
643 }
644
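
/* Illustrative sketch (not library code): how mpn_hgcd is typically driven,
   with matrix storage and scratch sized by MPN_HGCD_MATRIX_INIT_ITCH and
   mpn_hgcd_itch.  The TMP_* macros are the usual gmp-impl.h allocators; the
   helper name is hypothetical, and the block is kept under #if 0 so it does
   not affect the build. */
#if 0
static mp_size_t
example_hgcd_call (mp_ptr ap, mp_ptr bp, mp_size_t n)
{
  struct hgcd_matrix M;
  mp_ptr matrix_storage, scratch;
  mp_size_t nn;
  TMP_DECL;

  TMP_MARK;
  matrix_storage = TMP_ALLOC_LIMBS (MPN_HGCD_MATRIX_INIT_ITCH (n));
  scratch = TMP_ALLOC_LIMBS (mpn_hgcd_itch (n));

  mpn_hgcd_matrix_init (&M, n, matrix_storage);

  /* On success nn > 0: ap and bp hold the reduced numbers of size nn and M
     holds the corresponding transformation; nn == 0 means no reduction was
     possible. */
  nn = mpn_hgcd (ap, bp, n, &M, scratch);

  TMP_FREE;
  return nn;
}
#endif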