xref: /dragonfly/contrib/gmp/mpn/generic/hgcd.c (revision ad9f8794)
1 /* hgcd.c.
2 
3    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
4    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
5    GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6 
7 Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
8 
9 This file is part of the GNU MP Library.
10 
11 The GNU MP Library is free software; you can redistribute it and/or modify
12 it under the terms of the GNU Lesser General Public License as published by
13 the Free Software Foundation; either version 3 of the License, or (at your
14 option) any later version.
15 
16 The GNU MP Library is distributed in the hope that it will be useful, but
17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
19 License for more details.
20 
21 You should have received a copy of the GNU Lesser General Public License
22 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
23 
24 #include "gmp.h"
25 #include "gmp-impl.h"
26 #include "longlong.h"
27 
28 /* For input of size n, matrix elements are of size at most ceil(n/2)
29    - 1, but we need two limbs extra. */
30 void
31 mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p)
32 {
33   mp_size_t s = (n+1)/2 + 1;
34   M->alloc = s;
35   M->n = 1;
36   MPN_ZERO (p, 4 * s);
37   M->p[0][0] = p;
38   M->p[0][1] = p + s;
39   M->p[1][0] = p + 2 * s;
40   M->p[1][1] = p + 3 * s;
41 
42   M->p[0][0][0] = M->p[1][1][0] = 1;
43 }
44 
45 /* Updated column COL, adding in column (1-COL). */
46 static void
47 hgcd_matrix_update_1 (struct hgcd_matrix *M, unsigned col)
48 {
49   mp_limb_t c0, c1;
50   ASSERT (col < 2);
51 
52   c0 = mpn_add_n (M->p[0][col], M->p[0][0], M->p[0][1], M->n);
53   c1 = mpn_add_n (M->p[1][col], M->p[1][0], M->p[1][1], M->n);
54 
55   M->p[0][col][M->n] = c0;
56   M->p[1][col][M->n] = c1;
57 
58   M->n += (c0 | c1) != 0;
59   ASSERT (M->n < M->alloc);
60 }
61 
62 /* Updated column COL, adding in column Q * (1-COL). Temporary
63  * storage: qn + n <= M->alloc, where n is the size of the largest
64  * element in column 1 - COL. */
65 static void
66 hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn,
67 		      unsigned col, mp_ptr tp)
68 {
69   ASSERT (col < 2);
70 
71   if (qn == 1)
72     {
73       mp_limb_t q = qp[0];
74       mp_limb_t c0, c1;
75 
76       c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q);
77       c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q);
78 
79       M->p[0][col][M->n] = c0;
80       M->p[1][col][M->n] = c1;
81 
82       M->n += (c0 | c1) != 0;
83     }
84   else
85     {
86       unsigned row;
87 
88       /* Carries for the unlikely case that we get both high words
89 	 from the multiplication and carries from the addition. */
90       mp_limb_t c[2];
91       mp_size_t n;
92 
93       /* The matrix will not necessarily grow in size by qn, so we
94 	 need normalization in order not to overflow M. */
95 
96       for (n = M->n; n + qn > M->n; n--)
97 	{
98 	  ASSERT (n > 0);
99 	  if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0)
100 	    break;
101 	}
102 
103       ASSERT (qn + n <= M->alloc);
104 
105       for (row = 0; row < 2; row++)
106 	{
107 	  if (qn <= n)
108 	    mpn_mul (tp, M->p[row][1-col], n, qp, qn);
109 	  else
110 	    mpn_mul (tp, qp, qn, M->p[row][1-col], n);
111 
112 	  ASSERT (n + qn >= M->n);
113 	  c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n);
114 	}
115       if (c[0] | c[1])
116 	{
117 	  M->n = n + qn + 1;
118 	  M->p[0][col][n-1] = c[0];
119 	  M->p[1][col][n-1] = c[1];
120 	}
121       else
122 	{
123 	  n += qn;
124 	  n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0;
125 	  if (n > M->n)
126 	    M->n = n;
127 	}
128     }
129 
130   ASSERT (M->n < M->alloc);
131 }
132 
133 /* Multiply M by M1 from the right. Since the M1 elements fit in
134    GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs
135    temporary space M->n */
136 static void
137 hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1,
138 		   mp_ptr tp)
139 {
140   mp_size_t n0, n1;
141 
142   /* Could avoid copy by some swapping of pointers. */
143   MPN_COPY (tp, M->p[0][0], M->n);
144   n0 = mpn_hgcd_mul_matrix1_vector (M1, M->p[0][0], tp, M->p[0][1], M->n);
145   MPN_COPY (tp, M->p[1][0], M->n);
146   n1 = mpn_hgcd_mul_matrix1_vector (M1, M->p[1][0], tp, M->p[1][1], M->n);
147 
148   /* Depends on zero initialization */
149   M->n = MAX(n0, n1);
150   ASSERT (M->n < M->alloc);
151 }
152 
153 /* Perform a few steps, using some of mpn_hgcd2, subtraction and
154    division. Reduces the size by almost one limb or more, but never
155    below the given size s. Return new size for a and b, or 0 if no
156    more steps are possible.
157 
158    If hgcd2 succeds, needs temporary space for hgcd_matrix_mul_1, M->n
159    limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2
160    fails, needs space for the quotient, qn <= n - s + 1 limbs, for and
161    hgcd_matrix_update_q, qn + (size of the appropriate column of M) <=
162    resulting size of $.
163 
164    If N is the input size to the calling hgcd, then s = floor(N/2) +
165    1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1
166    < N, so N is sufficient.
167 */
168 
169 static mp_size_t
170 hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
171 	   struct hgcd_matrix *M, mp_ptr tp)
172 {
173   struct hgcd_matrix1 M1;
174   mp_limb_t mask;
175   mp_limb_t ah, al, bh, bl;
176   mp_size_t an, bn, qn;
177   int col;
178 
179   ASSERT (n > s);
180 
181   mask = ap[n-1] | bp[n-1];
182   ASSERT (mask > 0);
183 
184   if (n == s + 1)
185     {
186       if (mask < 4)
187 	goto subtract;
188 
189       ah = ap[n-1]; al = ap[n-2];
190       bh = bp[n-1]; bl = bp[n-2];
191     }
192   else if (mask & GMP_NUMB_HIGHBIT)
193     {
194       ah = ap[n-1]; al = ap[n-2];
195       bh = bp[n-1]; bl = bp[n-2];
196     }
197   else
198     {
199       int shift;
200 
201       count_leading_zeros (shift, mask);
202       ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
203       al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
204       bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
205       bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
206     }
207 
208   /* Try an mpn_hgcd2 step */
209   if (mpn_hgcd2 (ah, al, bh, bl, &M1))
210     {
211       /* Multiply M <- M * M1 */
212       hgcd_matrix_mul_1 (M, &M1, tp);
213 
214       /* Can't swap inputs, so we need to copy. */
215       MPN_COPY (tp, ap, n);
216       /* Multiply M1^{-1} (a;b) */
217       return mpn_hgcd_mul_matrix1_inverse_vector (&M1, ap, tp, bp, n);
218     }
219 
220  subtract:
221   /* There are two ways in which mpn_hgcd2 can fail. Either one of ah and
222      bh was too small, or ah, bh were (almost) equal. Perform one
223      subtraction step (for possible cancellation of high limbs),
224      followed by one division. */
225 
226   /* Since we must ensure that #(a-b) > s, we handle cancellation of
227      high limbs explicitly up front. (FIXME: Or is it better to just
228      subtract, normalize, and use an addition to undo if it turns out
229      the the difference is too small?) */
230   for (an = n; an > s; an--)
231     if (ap[an-1] != bp[an-1])
232       break;
233 
234   if (an == s)
235     return 0;
236 
237   /* Maintain a > b. When needed, swap a and b, and let col keep track
238      of how to update M. */
239   if (ap[an-1] > bp[an-1])
240     {
241       /* a is largest. In the subtraction step, we need to update
242 	 column 1 of M */
243       col = 1;
244     }
245   else
246     {
247       MP_PTR_SWAP (ap, bp);
248       col = 0;
249     }
250 
251   bn = n;
252   MPN_NORMALIZE (bp, bn);
253   if (bn <= s)
254     return 0;
255 
256   /* We have #a, #b > s. When is it possible that #(a-b) < s? For
257      cancellation to happen, the numbers must be of the form
258 
259        a = x + 1, 0,            ..., 0,            al
260        b = x    , GMP_NUMB_MAX, ..., GMP_NUMB_MAX, bl
261 
262      where al, bl denotes the least significant k limbs. If al < bl,
263      then #(a-b) < k, and if also high(al) != 0, high(bl) != GMP_NUMB_MAX,
264      then #(a-b) = k. If al >= bl, then #(a-b) = k + 1. */
265 
266   if (ap[an-1] == bp[an-1] + 1)
267     {
268       mp_size_t k;
269       int c;
270       for (k = an-1; k > s; k--)
271 	if (ap[k-1] != 0 || bp[k-1] != GMP_NUMB_MAX)
272 	  break;
273 
274       MPN_CMP (c, ap, bp, k);
275       if (c < 0)
276 	{
277 	  mp_limb_t cy;
278 
279 	  /* The limbs from k and up are cancelled. */
280 	  if (k == s)
281 	    return 0;
282 	  cy = mpn_sub_n (ap, ap, bp, k);
283 	  ASSERT (cy == 1);
284 	  an = k;
285 	}
286       else
287 	{
288 	  ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, k));
289 	  ap[k] = 1;
290 	  an = k + 1;
291 	}
292     }
293   else
294     ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, an));
295 
296   ASSERT (an > s);
297   ASSERT (ap[an-1] > 0);
298   ASSERT (bn > s);
299   ASSERT (bp[bn-1] > 0);
300 
301   hgcd_matrix_update_1 (M, col);
302 
303   if (an < bn)
304     {
305       MPN_PTR_SWAP (ap, an, bp, bn);
306       col ^= 1;
307     }
308   else if (an == bn)
309     {
310       int c;
311       MPN_CMP (c, ap, bp, an);
312       if (c < 0)
313 	{
314 	  MP_PTR_SWAP (ap, bp);
315 	  col ^= 1;
316 	}
317     }
318 
319   /* Divide a / b. */
320   qn = an + 1 - bn;
321 
322   /* FIXME: We could use an approximate division, that may return a
323      too small quotient, and only guarantee that the size of r is
324      almost the size of b. FIXME: Let ap and remainder overlap. */
325   mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn);
326   qn -= (tp[qn -1] == 0);
327 
328   /* Normalize remainder */
329   an = bn;
330   for ( ; an > s; an--)
331     if (ap[an-1] > 0)
332       break;
333 
334   if (an <= s)
335     {
336       /* Quotient is too large */
337       mp_limb_t cy;
338 
339       cy = mpn_add (ap, bp, bn, ap, an);
340 
341       if (cy > 0)
342 	{
343 	  ASSERT (bn < n);
344 	  ap[bn] = cy;
345 	  bp[bn] = 0;
346 	  bn++;
347 	}
348 
349       MPN_DECR_U (tp, qn, 1);
350       qn -= (tp[qn-1] == 0);
351     }
352 
353   if (qn > 0)
354     hgcd_matrix_update_q (M, tp, qn, col, tp + qn);
355 
356   return bn;
357 }
358 
359 /* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
360    with elements of size at most (n+1)/2 - 1. Returns new size of a,
361    b, or zero if no reduction is possible. */
362 mp_size_t
363 mpn_hgcd_lehmer (mp_ptr ap, mp_ptr bp, mp_size_t n,
364 		 struct hgcd_matrix *M, mp_ptr tp)
365 {
366   mp_size_t s = n/2 + 1;
367   mp_size_t nn;
368 
369   ASSERT (n > s);
370   ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
371 
372   nn = hgcd_step (n, ap, bp, s, M, tp);
373   if (!nn)
374     return 0;
375 
376   for (;;)
377     {
378       n = nn;
379       ASSERT (n > s);
380       nn = hgcd_step (n, ap, bp, s, M, tp);
381       if (!nn )
382 	return n;
383     }
384 }
385 
386 /* Multiply M by M1 from the right. Needs 4*(M->n + M1->n) + 5 limbs
387    of temporary storage (see mpn_matrix22_mul_itch). */
388 void
389 mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1,
390 		     mp_ptr tp)
391 {
392   mp_size_t n;
393 
394   /* About the new size of M:s elements. Since M1's diagonal elements
395      are > 0, no element can decrease. The new elements are of size
396      M->n + M1->n, one limb more or less. The computation of the
397      matrix product produces elements of size M->n + M1->n + 1. But
398      the true size, after normalization, may be three limbs smaller. */
399 
400   /* FIXME: Strassen multiplication gives only a small speedup. In FFT
401      multiplication range, this function could be sped up quite a lot
402      using invariance. */
403   ASSERT (M->n + M1->n < M->alloc);
404 
405   ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1]
406 	   | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0);
407 
408   ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1]
409 	   | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0);
410 
411   mpn_matrix22_mul (M->p[0][0], M->p[0][1],
412 		    M->p[1][0], M->p[1][1], M->n,
413 		    M1->p[0][0], M1->p[0][1],
414 		    M1->p[1][0], M1->p[1][1], M1->n, tp);
415 
416   /* Index of last potentially non-zero limb, size is one greater. */
417   n = M->n + M1->n;
418 
419   n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
420   n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
421   n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
422 
423   ASSERT ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) > 0);
424 
425   M->n = n + 1;
426 }
427 
428 /* Multiplies the least significant p limbs of (a;b) by M^-1.
429    Temporary space needed: 2 * (p + M->n)*/
430 mp_size_t
431 mpn_hgcd_matrix_adjust (struct hgcd_matrix *M,
432 			mp_size_t n, mp_ptr ap, mp_ptr bp,
433 			mp_size_t p, mp_ptr tp)
434 {
435   /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a ; b)
436      = (r11 a - r01 b; - r10 a + r00 b */
437 
438   mp_ptr t0 = tp;
439   mp_ptr t1 = tp + p + M->n;
440   mp_limb_t ah, bh;
441   mp_limb_t cy;
442 
443   ASSERT (p + M->n  < n);
444 
445   /* First compute the two values depending on a, before overwriting a */
446 
447   if (M->n >= p)
448     {
449       mpn_mul (t0, M->p[1][1], M->n, ap, p);
450       mpn_mul (t1, M->p[1][0], M->n, ap, p);
451     }
452   else
453     {
454       mpn_mul (t0, ap, p, M->p[1][1], M->n);
455       mpn_mul (t1, ap, p, M->p[1][0], M->n);
456     }
457 
458   /* Update a */
459   MPN_COPY (ap, t0, p);
460   ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n);
461 
462   if (M->n >= p)
463     mpn_mul (t0, M->p[0][1], M->n, bp, p);
464   else
465     mpn_mul (t0, bp, p, M->p[0][1], M->n);
466 
467   cy = mpn_sub (ap, ap, n, t0, p + M->n);
468   ASSERT (cy <= ah);
469   ah -= cy;
470 
471   /* Update b */
472   if (M->n >= p)
473     mpn_mul (t0, M->p[0][0], M->n, bp, p);
474   else
475     mpn_mul (t0, bp, p, M->p[0][0], M->n);
476 
477   MPN_COPY (bp, t0, p);
478   bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n);
479   cy = mpn_sub (bp, bp, n, t1, p + M->n);
480   ASSERT (cy <= bh);
481   bh -= cy;
482 
483   if (ah > 0 || bh > 0)
484     {
485       ap[n] = ah;
486       bp[n] = bh;
487       n++;
488     }
489   else
490     {
491       /* The subtraction can reduce the size by at most one limb. */
492       if (ap[n-1] == 0 && bp[n-1] == 0)
493 	n--;
494     }
495   ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
496   return n;
497 }
498 
499 /* Size analysis for hgcd:
500 
501    For the recursive calls, we have n1 <= ceil(n / 2). Then the
502    storage need is determined by the storage for the recursive call
503    computing M1, and hgcd_matrix_adjust and hgcd_matrix_mul calls that use M1
504    (after this, the storage needed for M1 can be recycled).
505 
506    Let S(r) denote the required storage. For M1 we need 4 * (ceil(n1/2) + 1)
507    = 4 * (ceil(n/4) + 1), for the hgcd_matrix_adjust call, we need n + 2,
508    and for the hgcd_matrix_mul, we may need 4 ceil(n/2) + 1. In total,
509    4 * ceil(n/4) + 4 ceil(n/2) + 5 <= 12 ceil(n/4) + 5.
510 
511    For the recursive call, we need S(n1) = S(ceil(n/2)).
512 
513    S(n) <= 12*ceil(n/4) + 5 + S(ceil(n/2))
514 	<= 12*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 5k + S(ceil(n/2^k))
515 	<= 12*(2 ceil(n/4) + k) + 5k + S(n/2^k)
516 	<= 24 ceil(n/4) + 17k + S(n/2^k)
517 
518 */
519 
520 mp_size_t
521 mpn_hgcd_itch (mp_size_t n)
522 {
523   unsigned k;
524   int count;
525   mp_size_t nscaled;
526 
527   if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
528     return MPN_HGCD_LEHMER_ITCH (n);
529 
530   /* Get the recursion depth. */
531   nscaled = (n - 1) / (HGCD_THRESHOLD - 1);
532   count_leading_zeros (count, nscaled);
533   k = GMP_LIMB_BITS - count;
534 
535   return 24 * ((n+3) / 4) + 17 * k
536     + MPN_HGCD_LEHMER_ITCH (HGCD_THRESHOLD);
537 }
538 
539 /* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
540    with elements of size at most (n+1)/2 - 1. Returns new size of a,
541    b, or zero if no reduction is possible. */
542 
543 mp_size_t
544 mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n,
545 	  struct hgcd_matrix *M, mp_ptr tp)
546 {
547   mp_size_t s = n/2 + 1;
548   mp_size_t n2 = (3*n)/4 + 1;
549 
550   mp_size_t p, nn;
551   int success = 0;
552 
553   if (n <= s)
554     /* Happens when n <= 2, a fairly uninteresting case but exercised
555        by the random inputs of the testsuite. */
556     return 0;
557 
558   ASSERT ((ap[n-1] | bp[n-1]) > 0);
559 
560   ASSERT ((n+1)/2 - 1 < M->alloc);
561 
562   if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
563     return mpn_hgcd_lehmer (ap, bp, n, M, tp);
564 
565   p = n/2;
566   nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
567   if (nn > 0)
568     {
569       /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
570 	 = 2 (n - 1) */
571       n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
572       success = 1;
573     }
574   while (n > n2)
575     {
576       /* Needs n + 1 storage */
577       nn = hgcd_step (n, ap, bp, s, M, tp);
578       if (!nn)
579 	return success ? n : 0;
580       n = nn;
581       success = 1;
582     }
583 
584   if (n > s + 2)
585     {
586       struct hgcd_matrix M1;
587       mp_size_t scratch;
588 
589       p = 2*s - n + 1;
590       scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);
591 
592       mpn_hgcd_matrix_init(&M1, n - p, tp);
593       nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch);
594       if (nn > 0)
595 	{
596 	  /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
597 	  ASSERT (M->n + 2 >= M1.n);
598 
599 	  /* Furthermore, assume M ends with a quotient (1, q; 0, 1),
600 	     then either q or q + 1 is a correct quotient, and M1 will
601 	     start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
602 	     rules out the case that the size of M * M1 is much
603 	     smaller than the expected M->n + M1->n. */
604 
605 	  ASSERT (M->n + M1.n < M->alloc);
606 
607 	  /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
608 	     = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
609 	  n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);
610 	  /* Needs 4 ceil(n/2) + 1 */
611 	  mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
612 	  success = 1;
613 	}
614     }
615 
616   /* This really is the base case */
617   for (;;)
618     {
619       /* Needs s+3 < n */
620       nn = hgcd_step (n, ap, bp, s, M, tp);
621       if (!nn)
622 	return success ? n : 0;
623 
624       n = nn;
625       success = 1;
626     }
627 }
628