xref: /dragonfly/contrib/gmp/mpn/generic/hgcd2.c (revision d4ef6694)
1 /* hgcd2.c
2 
3    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
4    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
5    GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6 
7 Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2008 Free Software
8 Foundation, Inc.
9 
10 This file is part of the GNU MP Library.
11 
12 The GNU MP Library is free software; you can redistribute it and/or modify
13 it under the terms of the GNU Lesser General Public License as published by
14 the Free Software Foundation; either version 3 of the License, or (at your
15 option) any later version.
16 
17 The GNU MP Library is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
19 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
20 License for more details.
21 
22 You should have received a copy of the GNU Lesser General Public License
23 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
24 
25 #include "gmp.h"
26 #include "gmp-impl.h"
27 #include "longlong.h"
28 
29 #if GMP_NAIL_BITS == 0
30 
31 /* Copied from the old mpn/generic/gcdext.c, and modified slightly to return
32    the remainder. */
33 
34 /* Single-limb division optimized for small quotients. */
35 static inline mp_limb_t
36 div1 (mp_ptr rp,
37       mp_limb_t n0,
38       mp_limb_t d0)
39 {
40   mp_limb_t q = 0;
41 
42   if ((mp_limb_signed_t) n0 < 0)
43     {
44       int cnt;
45       for (cnt = 1; (mp_limb_signed_t) d0 >= 0; cnt++)
46 	{
47 	  d0 = d0 << 1;
48 	}
49 
50       q = 0;
51       while (cnt)
52 	{
53 	  q <<= 1;
54 	  if (n0 >= d0)
55 	    {
56 	      n0 = n0 - d0;
57 	      q |= 1;
58 	    }
59 	  d0 = d0 >> 1;
60 	  cnt--;
61 	}
62     }
63   else
64     {
65       int cnt;
66       for (cnt = 0; n0 >= d0; cnt++)
67 	{
68 	  d0 = d0 << 1;
69 	}
70 
71       q = 0;
72       while (cnt)
73 	{
74 	  d0 = d0 >> 1;
75 	  q <<= 1;
76 	  if (n0 >= d0)
77 	    {
78 	      n0 = n0 - d0;
79 	      q |= 1;
80 	    }
81 	  cnt--;
82 	}
83     }
84   *rp = n0;
85   return q;
86 }
87 
88 /* Two-limb division optimized for small quotients.  */
89 static inline mp_limb_t
90 div2 (mp_ptr rp,
91       mp_limb_t nh, mp_limb_t nl,
92       mp_limb_t dh, mp_limb_t dl)
93 {
94   mp_limb_t q = 0;
95 
96   if ((mp_limb_signed_t) nh < 0)
97     {
98       int cnt;
99       for (cnt = 1; (mp_limb_signed_t) dh >= 0; cnt++)
100 	{
101 	  dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
102 	  dl = dl << 1;
103 	}
104 
105       while (cnt)
106 	{
107 	  q <<= 1;
108 	  if (nh > dh || (nh == dh && nl >= dl))
109 	    {
110 	      sub_ddmmss (nh, nl, nh, nl, dh, dl);
111 	      q |= 1;
112 	    }
113 	  dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
114 	  dh = dh >> 1;
115 	  cnt--;
116 	}
117     }
118   else
119     {
120       int cnt;
121       for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++)
122 	{
123 	  dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
124 	  dl = dl << 1;
125 	}
126 
127       while (cnt)
128 	{
129 	  dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
130 	  dh = dh >> 1;
131 	  q <<= 1;
132 	  if (nh > dh || (nh == dh && nl >= dl))
133 	    {
134 	      sub_ddmmss (nh, nl, nh, nl, dh, dl);
135 	      q |= 1;
136 	    }
137 	  cnt--;
138 	}
139     }
140 
141   rp[0] = nl;
142   rp[1] = nh;
143 
144   return q;
145 }
146 
#if 0
/* This div2 uses less branches, but it seems to nevertheless be
   slightly slower than the above code. */
/* NOTE(review): this disabled variant evaluates
   dl >> (GMP_LIMB_BITS - dcnt) even when dcnt == 0 (the -(dcnt > 0)
   mask zeroes the *value* but not the *evaluation*), and a shift count
   equal to the type width is undefined behavior in C.  Fix before
   re-enabling. */
static inline mp_limb_t
div2 (mp_ptr rp,
      mp_limb_t nh, mp_limb_t nl,
      mp_limb_t dh, mp_limb_t dl)
{
  mp_limb_t q = 0;
  int ncnt;
  int dcnt;

  /* Align the divisor's top bit with the numerator's top bit; dcnt is
     the number of quotient bits to produce (plus one).  */
  count_leading_zeros (ncnt, nh);
  count_leading_zeros (dcnt, dh);
  dcnt -= ncnt;

  dh = (dh << dcnt) + (-(dcnt > 0) & (dl >> (GMP_LIMB_BITS - dcnt)));
  dl <<= dcnt;

  /* Branch-reduced shift-and-subtract: compute the quotient bit, then
     conditionally subtract via masking rather than branching.  */
  do
    {
      mp_limb_t bit;
      q <<= 1;
      if (UNLIKELY (nh == dh))
	bit = (nl >= dl);
      else
	bit = (nh > dh);

      q |= bit;

      sub_ddmmss (nh, nl, nh, nl, (-bit) & dh, (-bit) & dl);

      dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
      dh = dh >> 1;
    }
  while (dcnt--);

  /* Remainder, low limb first.  */
  rp[0] = nl;
  rp[1] = nh;

  return q;
}
#endif
190 
191 #else /* GMP_NAIL_BITS != 0 */
192 /* Check all functions for nail support. */
193 /* hgcd2 should be defined to take inputs including nail bits, and
194    produce a matrix with elements also including nail bits. This is
195    necessary, for the matrix elements to be useful with mpn_mul_1,
196    mpn_addmul_1 and friends. */
197 #error Not implemented
198 #endif /* GMP_NAIL_BITS != 0 */
199 
/* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs
   the matrix M. Returns 1 if we make progress, i.e. can perform at least
   one subtraction. Otherwise returns zero. */
203 
/* FIXME: Possible optimizations:

   The div2 function starts by checking the most significant bit of
   the numerator. We could maintain normalized operands here, and call
   hgcd with normalized operands only, which should make the code
   simpler and possibly faster.

   Experiment with table lookups on the most significant bits.

   This function is also a candidate for assembler implementation.
*/
int
mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
	   struct hgcd_matrix1 *M)
{
  /* Entries of the 2x2 reduction matrix, written into M->u on any
     successful exit (the "done" label). */
  mp_limb_t u00, u01, u10, u11;

  /* Both high limbs must be >= 2, otherwise no reduction step that
     keeps the operands large enough is possible. */
  if (ah < 2 || bh < 2)
    return 0;

  /* First step: subtract the smaller operand from the larger, and
     initialize M to the corresponding elementary matrix.  If the
     result's high limb drops below 2 we made no usable progress. */
  if (ah > bh || (ah == bh && al > bl))
    {
      sub_ddmmss (ah, al, ah, al, bh, bl);
      if (ah < 2)
	return 0;

      u00 = u01 = u11 = 1;
      u10 = 0;
    }
  else
    {
      sub_ddmmss (bh, bl, bh, bl, ah, al);
      if (bh < 2)
	return 0;

      u00 = u10 = u11 = 1;
      u01 = 0;
    }

  /* The loop below reduces a first; if b is currently the larger
     operand, enter at the b-reducing half instead. */
  if (ah < bh)
    goto subtract_a;

  /* Double precision loop: operands still need (most of) two limbs.
     Each full iteration reduces a by a multiple of b, then b by a
     multiple of a, updating the matching column of M. */
  for (;;)
    {
      ASSERT (ah >= bh);
      if (ah == bh)
	goto done;

      /* Once the high limbs fit in half a limb, pack the top
	 GMP_LIMB_BITS*3/2 bits into single limbs and switch to the
	 single precision loop below (the low half limb is dropped). */
      if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
	{
	  ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
	  bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));

	  break;
	}

      /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
	 1), affecting the second column of M. */
      ASSERT (ah > bh);
      sub_ddmmss (ah, al, ah, al, bh, bl);

      if (ah < 2)
	goto done;

      if (ah <= bh)
	{
	  /* Use q = 1 */
	  u01 += u00;
	  u11 += u10;
	}
      else
	{
	  /* a is still larger after one subtraction; divide to get the
	     remaining quotient in one step. */
	  mp_limb_t r[2];
	  mp_limb_t q = div2 (r, ah, al, bh, bl);
	  al = r[0]; ah = r[1];
	  if (ah < 2)
	    {
	      /* A is too small, but q is correct. */
	      u01 += q * u00;
	      u11 += q * u10;
	      goto done;
	    }
	  /* One subtraction already happened above, so the total
	     quotient applied is q + 1. */
	  q++;
	  u01 += q * u00;
	  u11 += q * u10;
	}
    subtract_a:
      ASSERT (bh >= ah);
      if (ah == bh)
	goto done;

      if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
	{
	  ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
	  bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));

	  /* Enter the single precision loop at its b-reducing half. */
	  goto subtract_a1;
	}

      /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
	 1), affecting the first column of M. */
      sub_ddmmss (bh, bl, bh, bl, ah, al);

      if (bh < 2)
	goto done;

      if (bh <= ah)
	{
	  /* Use q = 1 */
	  u00 += u01;
	  u10 += u11;
	}
      else
	{
	  mp_limb_t r[2];
	  mp_limb_t q = div2 (r, bh, bl, ah, al);
	  bl = r[0]; bh = r[1];
	  if (bh < 2)
	    {
	      /* B is too small, but q is correct. */
	      u00 += q * u01;
	      u10 += q * u11;
	      goto done;
	    }
	  q++;
	  u00 += q * u01;
	  u10 += q * u11;
	}
    }

  /* NOTE: Since we discard the least significant half limb, we don't
     get a truly maximal M (corresponding to |a - b| <
     2^{GMP_LIMB_BITS +1}). */
  /* Single precision loop */
  for (;;)
    {
      ASSERT (ah >= bh);
      if (ah == bh)
	break;

      ah -= bh;
      /* Stop once a drops below 2^(GMP_LIMB_BITS/2 + 1): further
	 reduction would not be justified by the discarded low bits. */
      if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
	break;

      if (ah <= bh)
	{
	  /* Use q = 1 */
	  u01 += u00;
	  u11 += u10;
	}
      else
	{
	  mp_limb_t r;
	  mp_limb_t q = div1 (&r, ah, bh);
	  ah = r;
	  if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
	    {
	      /* A is too small, but q is correct. */
	      u01 += q * u00;
	      u11 += q * u10;
	      break;
	    }
	  /* Total quotient is q + 1, counting the subtraction above. */
	  q++;
	  u01 += q * u00;
	  u11 += q * u10;
	}
    subtract_a1:
      ASSERT (bh >= ah);
      if (ah == bh)
	break;

      bh -= ah;
      if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
	break;

      if (bh <= ah)
	{
	  /* Use q = 1 */
	  u00 += u01;
	  u10 += u11;
	}
      else
	{
	  mp_limb_t r;
	  mp_limb_t q = div1 (&r, bh, ah);
	  bh = r;
	  if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
	    {
	      /* B is too small, but q is correct. */
	      u00 += q * u01;
	      u10 += q * u11;
	      break;
	    }
	  q++;
	  u00 += q * u01;
	  u10 += q * u11;
	}
    }

 done:
  M->u[0][0] = u00; M->u[0][1] = u01;
  M->u[1][0] = u10; M->u[1][1] = u11;

  return 1;
}
409 
410 /* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must
411  * have space for n + 1 limbs. Uses three buffers to avoid a copy*/
412 mp_size_t
413 mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
414 			     mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
415 {
416   mp_limb_t ah, bh;
417 
418   /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
419 
420      r  = u00 * a
421      r += u10 * b
422      b *= u11
423      b += u01 * a
424   */
425 
426 #if HAVE_NATIVE_mpn_addaddmul_1msb0
427   ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);
428   bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);
429 #else
430   ah =     mpn_mul_1 (rp, ap, n, M->u[0][0]);
431   ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);
432 
433   bh =     mpn_mul_1 (bp, bp, n, M->u[1][1]);
434   bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);
435 #endif
436   rp[n] = ah;
437   bp[n] = bh;
438 
439   n += (ah | bh) > 0;
440   return n;
441 }
442 
443 /* Sets (r;b) = M^{-1}(a;b), with M^{-1} = (u11, -u01; -u10, u00) from
444    the left. Uses three buffers, to avoid a copy. */
445 mp_size_t
446 mpn_hgcd_mul_matrix1_inverse_vector (const struct hgcd_matrix1 *M,
447 				     mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
448 {
449   mp_limb_t h0, h1;
450 
451   /* Compute (r;b) <-- (u11 a - u01 b; -u10 a + u00 b) as
452 
453      r  = u11 * a
454      r -= u01 * b
455      b *= u00
456      b -= u10 * a
457   */
458 
459   h0 =    mpn_mul_1 (rp, ap, n, M->u[1][1]);
460   h1 = mpn_submul_1 (rp, bp, n, M->u[0][1]);
461   ASSERT (h0 == h1);
462 
463   h0 =    mpn_mul_1 (bp, bp, n, M->u[0][0]);
464   h1 = mpn_submul_1 (bp, ap, n, M->u[1][0]);
465   ASSERT (h0 == h1);
466 
467   n -= (rp[n-1] | bp[n-1]) == 0;
468   return n;
469 }
470