/* hgcd2.c

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 1996, 1998, 2000-2004, 2008, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#if GMP_NAIL_BITS == 0

/* Copied from the old mpn/generic/gcdext.c, and modified slightly to return
   the remainder. */

/* Single-limb division optimized for small quotients. */
static inline mp_limb_t
div1 (mp_ptr rp,
      mp_limb_t n0,
      mp_limb_t d0)
{
  mp_limb_t q = 0;

  if ((mp_limb_signed_t) n0 < 0)
    {
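      /* n0 has its most significant bit set.  Shifting d0 until it
         exceeds n0 could overflow, so instead normalize d0 so that
         its own high bit is set, counting the shifts. */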
      int cnt;
      for (cnt = 1; (mp_limb_signed_t) d0 >= 0; cnt++)
        {
          d0 = d0 << 1;
        }

      q = 0;
      while (cnt)
        {
          q <<= 1;
          if (n0 >= d0)
            {
              n0 = n0 - d0;
              q |= 1;
            }
          d0 = d0 >> 1;
          cnt--;
        }
    }
  else
    {
      int cnt;
      for (cnt = 0; n0 >= d0; cnt++)
        {
          d0 = d0 << 1;
        }

      q = 0;
      while (cnt)
        {
          d0 = d0 >> 1;
          q <<= 1;
          if (n0 >= d0)
            {
              n0 = n0 - d0;
              q |= 1;
            }
          cnt--;
        }
    }
  *rp = n0;
  return q;
}
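
/* Illustrative sketch, not part of GMP: div1 returns the quotient of
   n0 / d0 and stores the remainder at rp, so its results can be
   checked against ordinary limb division.  The helper name div1_check
   is hypothetical. */
#if 0
static void
div1_check (mp_limb_t n0, mp_limb_t d0)
{
  mp_limb_t r;
  mp_limb_t q = div1 (&r, n0, d0);

  /* The quotient/remainder pair must match ordinary division. */
  ASSERT (d0 > 0);
  ASSERT (q == n0 / d0);
  ASSERT (r == n0 % d0);
}
#endif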

/* Two-limb division optimized for small quotients.  */
static inline mp_limb_t
div2 (mp_ptr rp,
      mp_limb_t nh, mp_limb_t nl,
      mp_limb_t dh, mp_limb_t dl)
{
  mp_limb_t q = 0;

  if ((mp_limb_signed_t) nh < 0)
    {
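      /* The high bit of nh is set; as in div1, normalize the divisor
         so that dh gets its high bit set (counting the shifts), rather
         than shifting it past the numerator, which could overflow. */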
      int cnt;
      for (cnt = 1; (mp_limb_signed_t) dh >= 0; cnt++)
        {
          dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
          dl = dl << 1;
        }

      while (cnt)
        {
          q <<= 1;
          if (nh > dh || (nh == dh && nl >= dl))
            {
              sub_ddmmss (nh, nl, nh, nl, dh, dl);
              q |= 1;
            }
          dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
          dh = dh >> 1;
          cnt--;
        }
    }
  else
    {
      int cnt;
      for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++)
        {
          dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
          dl = dl << 1;
        }

      while (cnt)
        {
          dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
          dh = dh >> 1;
          q <<= 1;
          if (nh > dh || (nh == dh && nl >= dl))
            {
              sub_ddmmss (nh, nl, nh, nl, dh, dl);
              q |= 1;
            }
          cnt--;
        }
    }

  rp[0] = nl;
  rp[1] = nh;

  return q;
}
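
/* Illustrative sketch, not part of GMP: div2 divides the two-limb
   number {nl, nh} by {dl, dh}, returning the quotient and leaving the
   remainder in rp[0..1].  The identity n = q*d + r can be verified
   with longlong.h primitives; the helper name div2_check is
   hypothetical, and it assumes the quotient fits in a single limb (as
   it does at the call sites below, where both high limbs are >= 2). */
#if 0
static void
div2_check (mp_limb_t nh, mp_limb_t nl, mp_limb_t dh, mp_limb_t dl)
{
  mp_limb_t r[2];
  mp_limb_t q = div2 (r, nh, nl, dh, dl);
  mp_limb_t ph, pl;

  /* Reconstruct q * {dl, dh} + {r[0], r[1]}; since q*d + r equals the
     two-limb numerator, no intermediate term here needs a third limb. */
  umul_ppmm (ph, pl, q, dl);
  ph += q * dh;
  add_ssaaaa (ph, pl, ph, pl, r[1], r[0]);

  ASSERT (ph == nh && pl == nl);
}
#endif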

#if 0
/* This div2 uses fewer branches, but it nevertheless seems to be
   slightly slower than the code above. */
static inline mp_limb_t
div2 (mp_ptr rp,
      mp_limb_t nh, mp_limb_t nl,
      mp_limb_t dh, mp_limb_t dl)
{
  mp_limb_t q = 0;
  int ncnt;
  int dcnt;

  count_leading_zeros (ncnt, nh);
  count_leading_zeros (dcnt, dh);
  dcnt -= ncnt;

  dh = (dh << dcnt) + (-(dcnt > 0) & (dl >> (GMP_LIMB_BITS - dcnt)));
  dl <<= dcnt;

  do
    {
      mp_limb_t bit;
      q <<= 1;
      if (UNLIKELY (nh == dh))
        bit = (nl >= dl);
      else
        bit = (nh > dh);

      q |= bit;

      sub_ddmmss (nh, nl, nh, nl, (-bit) & dh, (-bit) & dl);

      dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
      dh = dh >> 1;
    }
  while (dcnt--);

  rp[0] = nl;
  rp[1] = nh;

  return q;
}
#endif

#else /* GMP_NAIL_BITS != 0 */
/* Check all functions for nail support. */
/* hgcd2 should be defined to take inputs including nail bits, and
   produce a matrix with elements also including nail bits. This is
   necessary for the matrix elements to be useful with mpn_mul_1,
   mpn_addmul_1 and friends. */
#error Not implemented
#endif /* GMP_NAIL_BITS != 0 */

/* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs
   matrix M. Returns 1 if we make progress, i.e. can perform at least
   one subtraction. Otherwise returns zero. */

/* FIXME: Possible optimizations:

   The div2 function starts with checking the most significant bit of
   the numerator. We can maintain normalized operands here, and call
   hgcd with normalized operands only, which should make the code
   simpler and possibly faster.

   Experiment with table lookups on the most significant bits.

   This function is also a candidate for assembler implementation.
*/
int
mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
           struct hgcd_matrix1 *M)
{
  mp_limb_t u00, u01, u10, u11;

  if (ah < 2 || bh < 2)
    return 0;

  if (ah > bh || (ah == bh && al > bl))
    {
      sub_ddmmss (ah, al, ah, al, bh, bl);
      if (ah < 2)
        return 0;

      u00 = u01 = u11 = 1;
      u10 = 0;
    }
  else
    {
      sub_ddmmss (bh, bl, bh, bl, ah, al);
      if (bh < 2)
        return 0;

      u00 = u10 = u11 = 1;
      u01 = 0;
    }

  if (ah < bh)
    goto subtract_a;

  for (;;)
    {
      ASSERT (ah >= bh);
      if (ah == bh)
        goto done;

      if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
        {
          ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
          bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));

          break;
        }

      /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
         1), affecting the second column of M. */
      ASSERT (ah > bh);
      sub_ddmmss (ah, al, ah, al, bh, bl);

      if (ah < 2)
        goto done;

      if (ah <= bh)
        {
          /* Use q = 1 */
          u01 += u00;
          u11 += u10;
        }
      else
        {
          mp_limb_t r[2];
          mp_limb_t q = div2 (r, ah, al, bh, bl);
          al = r[0]; ah = r[1];
          if (ah < 2)
            {
              /* A is too small, but q is correct. */
              u01 += q * u00;
              u11 += q * u10;
              goto done;
            }
          q++;
          u01 += q * u00;
          u11 += q * u10;
        }
    subtract_a:
      ASSERT (bh >= ah);
      if (ah == bh)
        goto done;

      if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
        {
          ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
          bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));

          goto subtract_a1;
        }

      /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
         1), affecting the first column of M. */
      sub_ddmmss (bh, bl, bh, bl, ah, al);

      if (bh < 2)
        goto done;

      if (bh <= ah)
        {
          /* Use q = 1 */
          u00 += u01;
          u10 += u11;
        }
      else
        {
          mp_limb_t r[2];
          mp_limb_t q = div2 (r, bh, bl, ah, al);
          bl = r[0]; bh = r[1];
          if (bh < 2)
            {
              /* B is too small, but q is correct. */
              u00 += q * u01;
              u10 += q * u11;
              goto done;
            }
          q++;
          u00 += q * u01;
          u10 += q * u11;
        }
    }

  /* NOTE: Since we discard the least significant half limb, we don't
     get a truly maximal M (corresponding to |a - b| <
     2^{GMP_LIMB_BITS + 1}). */
  /* Single precision loop */
  for (;;)
    {
      ASSERT (ah >= bh);

      ah -= bh;
      if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
        break;

      if (ah <= bh)
        {
          /* Use q = 1 */
          u01 += u00;
          u11 += u10;
        }
      else
        {
          mp_limb_t r;
          mp_limb_t q = div1 (&r, ah, bh);
          ah = r;
          if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
            {
              /* A is too small, but q is correct. */
              u01 += q * u00;
              u11 += q * u10;
              break;
            }
          q++;
          u01 += q * u00;
          u11 += q * u10;
        }
    subtract_a1:
      ASSERT (bh >= ah);

      bh -= ah;
      if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
        break;

      if (bh <= ah)
        {
          /* Use q = 1 */
          u00 += u01;
          u10 += u11;
        }
      else
        {
          mp_limb_t r;
          mp_limb_t q = div1 (&r, bh, ah);
          bh = r;
          if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
            {
              /* B is too small, but q is correct. */
              u00 += q * u01;
              u10 += q * u11;
              break;
            }
          q++;
          u00 += q * u01;
          u10 += q * u11;
        }
    }

 done:
  M->u[0][0] = u00; M->u[0][1] = u01;
  M->u[1][0] = u10; M->u[1][1] = u11;

  return 1;
}
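
/* Illustrative sketch, not part of GMP: mpn_hgcd2 works on the two
   most significant limbs of each operand and reports the reduction
   only through the matrix M; a caller that wants to apply M to a pair
   of full n-limb vectors can then use mpn_hgcd_mul_matrix1_vector
   below.  The function name and parameter layout of this sketch are
   hypothetical. */
#if 0
static mp_size_t
hgcd2_apply_sketch (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
                    mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
{
  struct hgcd_matrix1 M;

  /* mpn_hgcd2 returns 0 (no progress) when either high limb is below 2. */
  if (!mpn_hgcd2 (ah, al, bh, bl, &M))
    return 0;

  /* (rp; bp) <-- (ap; bp) M, as documented for the function below;
     rp and bp must have room for n + 1 limbs. */
  return mpn_hgcd_mul_matrix1_vector (&M, rp, ap, bp, n);
}
#endif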

/* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vectors must
 * have space for n + 1 limbs. Uses three buffers to avoid a copy. */
mp_size_t
mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
                             mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     r  = u00 * a
     r += u10 * b
     b *= u11
     b += u01 * a
  */

#if HAVE_NATIVE_mpn_addaddmul_1msb0
  ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);
  bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);
#else
  ah =     mpn_mul_1 (rp, ap, n, M->u[0][0]);
  ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);

  bh =     mpn_mul_1 (bp, bp, n, M->u[1][1]);
  bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);
#endif
  rp[n] = ah;
  bp[n] = bh;

  n += (ah | bh) > 0;
  return n;
}
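
/* Illustrative sketch, not part of GMP: the function above overwrites
   bp, so a self-check needs copies of the inputs taken before the
   call.  A hedged test of the r = u00*a + u10*b half of the identity,
   with hypothetical names (a and b are saved copies of ap and bp, tp
   is scratch of n + 1 limbs, rn is the returned size), could be: */
#if 0
static void
hgcd_mul_matrix1_vector_check (const struct hgcd_matrix1 *M,
                               mp_srcptr rp, mp_size_t rn,
                               mp_srcptr a, mp_srcptr b,
                               mp_ptr tp, mp_size_t n)
{
  /* Recompute r = u00 * a + u10 * b and compare with rp. */
  tp[n] = mpn_mul_1 (tp, a, n, M->u[0][0]);
  tp[n] += mpn_addmul_1 (tp, b, n, M->u[1][0]);

  ASSERT (rn == n || rn == n + 1);
  ASSERT (mpn_cmp (tp, rp, rn) == 0);
}
#endif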