/* Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA. */

#include "../schpriv.h"
#include "gmp.h"
#include "gmp-impl.h"
#include "gmplonglong.h"

#if defined(__GNUC__)
# define USED_ONLY_SOMETIMES __attribute__((unused))
#else
# define USED_ONLY_SOMETIMES /* empty */
#endif

extern void *scheme_malloc_gmp(uintptr_t, void **mem_pool);
extern void scheme_free_gmp(void *, void **mem_pool);
THREAD_LOCAL_DECL(static void *gmp_mem_pool);
#define MALLOC(amt) scheme_malloc_gmp(amt, &gmp_mem_pool)
#define FREE(p, s) scheme_free_gmp(p, &gmp_mem_pool)

#define GMP_NAIL_BITS 0
#define GMP_LIMB_BITS BITS_PER_MP_LIMB
#define GMP_NUMB_BITS BITS_PER_MP_LIMB
#define GMP_LIMB_HIGHBIT ((mp_limb_t)1 << (BITS_PER_MP_LIMB - 1))
#define GMP_NUMB_HIGHBIT GMP_LIMB_HIGHBIT

#ifndef NULL
# define NULL 0L
#endif

#if GMP_NUMB_BITS == 32
# define MP_BASES_CHARS_PER_LIMB_10      9
# define MP_BASES_BIG_BASE_10            CNST_LIMB(0x3b9aca00)
# define MP_BASES_BIG_BASE_INVERTED_10   CNST_LIMB(0x12e0be82)
# define MP_BASES_NORMALIZATION_STEPS_10 2
# define GMP_NUMB_MASK 0xFFFFFFFF
#else
# define MP_BASES_CHARS_PER_LIMB_10      19
# define MP_BASES_BIG_BASE_10            CNST_LIMB(0x8ac7230489e80000)
# define MP_BASES_BIG_BASE_INVERTED_10   CNST_LIMB(0xd83c94fb6d2ac34a)
# define MP_BASES_NORMALIZATION_STEPS_10 0
# define GMP_NUMB_MASK (~(mp_limb_t)0)
#endif

#define MPN_DIVREM_OR_PREINV_DIVREM_1(qp,xsize,ap,size,d,dinv,shift)    \
  mpn_divrem_1 (qp, xsize, ap, size, d)

#define ABOVE_THRESHOLD(size,thresh)    \
  ((thresh) == 0                        \
   || ((thresh) != MP_SIZE_T_MAX        \
       && (size) >= (thresh)))
#define BELOW_THRESHOLD(size,thresh)  (! ABOVE_THRESHOLD (size, thresh))
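
/* Illustration (not in the original source): a threshold of 0 means
   "always above" and a threshold of MP_SIZE_T_MAX means "never above",
   so a tuned threshold value can force a code path on or off entirely. */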

/* n-1 inverts any low zeros and the lowest one bit.  If n&(n-1) leaves zero
   then that lowest one bit must have been the only bit set.  n==0 will
   return true though, so avoid that.  */
#define POW2_P(n)  (((n) & ((n) - 1)) == 0)
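
/* Worked example (illustration only): for n = 0b1000, n-1 = 0b0111 and
   n & (n-1) == 0, so POW2_P reports a power of two; for n = 0b1100,
   n-1 = 0b1011 and n & (n-1) = 0b1000 != 0.  As the comment above warns,
   POW2_P (0) also yields "true".  */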

# define __GMP_ALLOCATE_FUNC_LIMBS(n) TMP_ALLOC(n * sizeof(mp_limb_t))
# define __GMP_FREE_FUNC_LIMBS(p, n) /* */

/* static const int mp_bits_per_limb = BITS_PER_MP_LIMB; */
/* static const int __gmp_0 = 0; */
/* static int __gmp_junk; */
/* static int gmp_errno = 0; */

#define SCHEME_BIGNUM_USE_FUEL(n) scheme_bignum_use_fuel(n)
extern void scheme_bignum_use_fuel(intptr_t n);

/* Compare OP1_PTR/OP1_SIZE with OP2_PTR/OP2_SIZE.
   There are no restrictions on the relative sizes of
   the two arguments.
   Return 1 if OP1 > OP2, 0 if they are equal, and -1 if OP1 < OP2.  */

int
#if __STDC__
mpn_cmp (mp_srcptr op1_ptr, mp_srcptr op2_ptr, mp_size_t size)
#else
mpn_cmp (op1_ptr, op2_ptr, size)
     mp_srcptr op1_ptr;
     mp_srcptr op2_ptr;
     mp_size_t size;
#endif
{
  mp_size_t i;
  mp_limb_t op1_word, op2_word;

  for (i = size - 1; i >= 0; i--)
    {
      op1_word = op1_ptr[i];
      op2_word = op2_ptr[i];
      if (op1_word != op2_word)
	goto diff;
    }
  return 0;
 diff:
  /* This can *not* be simplified to
	op1_word - op2_word
     since that expression might give signed overflow.  */
  return (op1_word > op2_word) ? 1 : -1;
}


/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length. */

mp_limb_t
#if __STDC__
mpn_sub_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
#else
mpn_sub_n (res_ptr, s1_ptr, s2_ptr, size)
     register mp_ptr res_ptr;
     register mp_srcptr s1_ptr;
     register mp_srcptr s2_ptr;
     mp_size_t size;
#endif
{
  register mp_limb_t x, y, cy;
  register mp_size_t j;

  /* The loop counter and index J goes from -SIZE to -1.  This way
     the loop becomes faster.  */
  j = -size;

  /* Offset the base pointers to compensate for the negative indices.  */
  s1_ptr -= j;
  s2_ptr -= j;
  res_ptr -= j;

  cy = 0;
  do
    {
      y = s2_ptr[j];
      x = s1_ptr[j];
      y += cy;			/* add previous carry to subtrahend */
      cy = (y < cy);		/* get out carry from that addition */
      y = x - y;		/* main subtract */
      cy = (y > x) + cy;	/* get out carry from the subtract, combine */
      res_ptr[j] = y;
    }
  while (++j != 0);

  return cy;
}

/* mpn_add_n -- Add two limb vectors of equal, non-zero length. */

mp_limb_t
#if __STDC__
mpn_add_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
#else
mpn_add_n (res_ptr, s1_ptr, s2_ptr, size)
     register mp_ptr res_ptr;
     register mp_srcptr s1_ptr;
     register mp_srcptr s2_ptr;
     mp_size_t size;
#endif
{
  register mp_limb_t x, y, cy;
  register mp_size_t j;

  /* The loop counter and index J goes from -SIZE to -1.  This way
     the loop becomes faster.  */
  j = -size;

  /* Offset the base pointers to compensate for the negative indices.  */
  s1_ptr -= j;
  s2_ptr -= j;
  res_ptr -= j;

  cy = 0;
  do
    {
      y = s2_ptr[j];
      x = s1_ptr[j];
      y += cy;			/* add previous carry to one addend */
      cy = (y < cy);		/* get out carry from that addition */
      y = x + y;		/* add other addend */
      cy = (y < x) + cy;	/* get out carry from that add, combine */
      res_ptr[j] = y;
    }
  while (++j != 0);

  return cy;
}


/* mpn_mul_n and helper function -- Multiply/square natural numbers. */

/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
   0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
#define INVERSE_3      ((MP_LIMB_T_MAX / 3) * 2 + 1)
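
/* Sanity check of the constant (illustration, not from the original
   source): with 32-bit limbs, INVERSE_3 = (0xFFFFFFFF / 3) * 2 + 1
   = 0xAAAAAAAB, and 3 * 0xAAAAAAAB = 0x200000001 == 1 (mod 2^32), so
   multiplying by INVERSE_3 divides an exact multiple of 3 modulo the
   limb size. */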

#if !defined (__alpha) && !defined (__mips)
/* For all other machines, we want to call mpn functions for the compound
   operations instead of open-coding them.  */
#define USE_MORE_MPN
#endif

/*== Function declarations =================================================*/

static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr,
                               mp_ptr, mp_ptr, mp_ptr,
                               mp_srcptr, mp_srcptr, mp_srcptr,
                               mp_size_t, mp_size_t));
static void interpolate3 _PROTO ((mp_srcptr,
                                  mp_ptr, mp_ptr, mp_ptr,
                                  mp_srcptr,
                                  mp_ptr, mp_ptr, mp_ptr,
                                  mp_size_t, mp_size_t));
static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));


/*-- mpn_kara_mul_n ---------------------------------------------------------------*/

/* Multiplies using 3 half-sized mults and so on recursively.
 * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
 * No overlap of p[...] with a[...] or b[...].
 * ws is workspace.
 */
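
/* Sketch of the identity used below (an illustration, with
   B = 2^BITS_PER_MP_LIMB, a = a1*B^n2 + a0 and b = b1*B^n2 + b0):
     a*b = a1*b1*B^(2*n2)
         + (a1*b1 + a0*b0 - (a0 - a1)*(b0 - b1))*B^n2
         + a0*b0
   so one multiply of the (signed) half-sized differences replaces the
   two cross products; the code below tracks the sign of the differences
   separately and adds or subtracts the middle product accordingly. */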

void
#if __STDC__
mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
#else
mpn_kara_mul_n(p, a, b, n, ws)
     mp_ptr    p;
     mp_srcptr a;
     mp_srcptr b;
     mp_size_t n;
     mp_ptr    ws;
#endif
{
  mp_limb_t i, sign, w, w0, w1;
  mp_size_t n2;
  mp_srcptr x, y;

  n2 = n >> 1;
  ASSERT (n2 > 0);

  SCHEME_BIGNUM_USE_FUEL(n);

  if (n & 1)
    {
      /* Odd length. */
      mp_size_t n1, n3, nm1;

      n3 = n - n2;

      sign = 0;
      w = a[n2];
      if (w != 0)
	w -= mpn_sub_n (p, a, a + n3, n2);
      else
	{
	  i = n2;
	  do
	    {
	      --i;
	      w0 = a[i];
	      w1 = a[n3+i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = a + n3;
	      y = a;
	      sign = 1;
	    }
	  else
	    {
	      x = a;
	      y = a + n3;
	    }
	  mpn_sub_n (p, x, y, n2);
	}
      p[n2] = w;

      w = b[n2];
      if (w != 0)
	w -= mpn_sub_n (p + n3, b, b + n3, n2);
      else
	{
	  i = n2;
	  do
	    {
	      --i;
	      w0 = b[i];
	      w1 = b[n3+i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = b + n3;
	      y = b;
	      sign ^= 1;
	    }
	  else
	    {
	      x = b;
	      y = b + n3;
	    }
	  mpn_sub_n (p + n3, x, y, n2);
	}
      p[n] = w;

      n1 = n + 1;
      if (n2 < KARATSUBA_MUL_THRESHOLD)
	{
	  if (n3 < KARATSUBA_MUL_THRESHOLD)
	    {
	      mpn_mul_basecase (ws, p, n3, p + n3, n3);
	      mpn_mul_basecase (p, a, n3, b, n3);
	    }
	  else
	    {
	      mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
	      mpn_kara_mul_n (p, a, b, n3, ws + n1);
	    }
	  mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
	}
      else
	{
	  mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
	  mpn_kara_mul_n (p, a, b, n3, ws + n1);
	  mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
	}

      if (sign)
	mpn_add_n (ws, p, ws, n1);
      else
	mpn_sub_n (ws, p, ws, n1);

      nm1 = n - 1;
      if (mpn_add_n (ws, p + n1, ws, nm1))
	{
	  mp_limb_t x = ws[nm1] + 1;
	  ws[nm1] = x;
	  if (x == 0)
	    ++ws[n];
	}
      if (mpn_add_n (p + n3, p + n3, ws, n1))
	{
	  mp_limb_t x;
	  i = n1 + n3;
	  do
	    {
	      x = p[i] + 1;
	      p[i] = x;
	      ++i;
	    } while (x == 0);
	}
    }
  else
    {
      /* Even length. */
      mp_limb_t t;

      i = n2;
      do
	{
	  --i;
	  w0 = a[i];
	  w1 = a[n2+i];
	}
      while (w0 == w1 && i != 0);
      sign = 0;
      if (w0 < w1)
	{
	  x = a + n2;
	  y = a;
	  sign = 1;
	}
      else
	{
	  x = a;
	  y = a + n2;
	}
      mpn_sub_n (p, x, y, n2);

      i = n2;
      do
	{
	  --i;
	  w0 = b[i];
	  w1 = b[n2+i];
	}
      while (w0 == w1 && i != 0);
      if (w0 < w1)
	{
	  x = b + n2;
	  y = b;
	  sign ^= 1;
	}
      else
	{
	  x = b;
	  y = b + n2;
	}
      mpn_sub_n (p + n2, x, y, n2);

      /* Pointwise products. */
      if (n2 < KARATSUBA_MUL_THRESHOLD)
	{
	  mpn_mul_basecase (ws, p, n2, p + n2, n2);
	  mpn_mul_basecase (p, a, n2, b, n2);
	  mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
	}
      else
	{
	  mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
	  mpn_kara_mul_n (p, a, b, n2, ws + n);
	  mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
	}

      /* Interpolate. */
      if (sign)
	w = mpn_add_n (ws, p, ws, n);
      else
	w = -mpn_sub_n (ws, p, ws, n);
      w += mpn_add_n (ws, p + n, ws, n);
      w += mpn_add_n (p + n2, p + n2, ws, n);
      /* TO DO: could put "if (w) { ... }" here.
       * Less work but badly predicted branch.
       * No measurable difference in speed on Alpha.
       */
      i = n + n2;
      t = p[i] + w;
      p[i] = t;
      if (t < w)
	{
	  do
	    {
	      ++i;
	      w = p[i] + 1;
	      p[i] = w;
	    }
	  while (w == 0);
	}
    }
}

void
#if __STDC__
mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
#else
mpn_kara_sqr_n (p, a, n, ws)
     mp_ptr    p;
     mp_srcptr a;
     mp_size_t n;
     mp_ptr    ws;
#endif
{
  mp_limb_t i, sign, w, w0, w1;
  mp_size_t n2;
  mp_srcptr x, y;

  n2 = n >> 1;
  ASSERT (n2 > 0);

  SCHEME_BIGNUM_USE_FUEL(n);

  if (n & 1)
    {
      /* Odd length. */
      mp_size_t n1, n3, nm1;

      n3 = n - n2;

      sign = 0;
      w = a[n2];
      if (w != 0)
	w -= mpn_sub_n (p, a, a + n3, n2);
      else
	{
	  i = n2;
	  do
	    {
	      --i;
	      w0 = a[i];
	      w1 = a[n3+i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = a + n3;
	      y = a;
	      sign = 1;
	    }
	  else
	    {
	      x = a;
	      y = a + n3;
	    }
	  mpn_sub_n (p, x, y, n2);
	}
      p[n2] = w;

      w = a[n2];
      if (w != 0)
	w -= mpn_sub_n (p + n3, a, a + n3, n2);
      else
	{
	  i = n2;
	  do
	    {
	      --i;
	      w0 = a[i];
	      w1 = a[n3+i];
	    }
	  while (w0 == w1 && i != 0);
	  if (w0 < w1)
	    {
	      x = a + n3;
	      y = a;
	      sign ^= 1;
	    }
	  else
	    {
	      x = a;
	      y = a + n3;
	    }
	  mpn_sub_n (p + n3, x, y, n2);
	}
      p[n] = w;

      n1 = n + 1;
      if (n2 < KARATSUBA_SQR_THRESHOLD)
	{
	  if (n3 < KARATSUBA_SQR_THRESHOLD)
	    {
	      mpn_sqr_basecase (ws, p, n3);
	      mpn_sqr_basecase (p, a, n3);
	    }
	  else
	    {
	      mpn_kara_sqr_n (ws, p, n3, ws + n1);
	      mpn_kara_sqr_n (p, a, n3, ws + n1);
	    }
	  mpn_sqr_basecase (p + n1, a + n3, n2);
	}
      else
	{
	  mpn_kara_sqr_n (ws, p, n3, ws + n1);
	  mpn_kara_sqr_n (p, a, n3, ws + n1);
	  mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1);
	}

      if (sign)
	mpn_add_n (ws, p, ws, n1);
      else
	mpn_sub_n (ws, p, ws, n1);

      nm1 = n - 1;
      if (mpn_add_n (ws, p + n1, ws, nm1))
	{
	  mp_limb_t x = ws[nm1] + 1;
	  ws[nm1] = x;
	  if (x == 0)
	    ++ws[n];
	}
      if (mpn_add_n (p + n3, p + n3, ws, n1))
	{
	  mp_limb_t x;
	  i = n1 + n3;
	  do
	    {
	      x = p[i] + 1;
	      p[i] = x;
	      ++i;
	    } while (x == 0);
	}
    }
  else
    {
      /* Even length. */
      mp_limb_t t;

      i = n2;
      do
	{
	  --i;
	  w0 = a[i];
	  w1 = a[n2+i];
	}
      while (w0 == w1 && i != 0);
      sign = 0;
      if (w0 < w1)
	{
	  x = a + n2;
	  y = a;
	  sign = 1;
	}
      else
	{
	  x = a;
	  y = a + n2;
	}
      mpn_sub_n (p, x, y, n2);

      i = n2;
      do
	{
	  --i;
	  w0 = a[i];
	  w1 = a[n2+i];
	}
      while (w0 == w1 && i != 0);
      if (w0 < w1)
	{
	  x = a + n2;
	  y = a;
	  sign ^= 1;
	}
      else
	{
	  x = a;
	  y = a + n2;
	}
      mpn_sub_n (p + n2, x, y, n2);

      /* Pointwise products. */
      if (n2 < KARATSUBA_SQR_THRESHOLD)
	{
	  mpn_sqr_basecase (ws, p, n2);
	  mpn_sqr_basecase (p, a, n2);
	  mpn_sqr_basecase (p + n, a + n2, n2);
	}
      else
	{
	  mpn_kara_sqr_n (ws, p, n2, ws + n);
	  mpn_kara_sqr_n (p, a, n2, ws + n);
	  mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
	}

      /* Interpolate. */
      if (sign)
	w = mpn_add_n (ws, p, ws, n);
      else
	w = -mpn_sub_n (ws, p, ws, n);
      w += mpn_add_n (ws, p + n, ws, n);
      w += mpn_add_n (p + n2, p + n2, ws, n);
      /* TO DO: could put "if (w) { ... }" here.
       * Less work but badly predicted branch.
       * No measurable difference in speed on Alpha.
       */
      i = n + n2;
      t = p[i] + w;
      p[i] = t;
      if (t < w)
	{
	  do
	    {
	      ++i;
	      w = p[i] + 1;
	      p[i] = w;
	    }
	  while (w == 0);
	}
    }
}

/*-- add2Times -------------------------------------------------------------*/

/* z[] = x[] + 2 * y[]
   Note that z and x might point to the same vectors. */
#ifdef USE_MORE_MPN
static inline mp_limb_t
#if __STDC__
add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
#else
add2Times (z, x, y, n)
     mp_ptr    z;
     mp_srcptr x;
     mp_srcptr y;
     mp_size_t n;
#endif
{
  mp_ptr t;
  mp_limb_t c;
  TMP_DECL (marker);
  TMP_MARK (marker);
  t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
  c = mpn_lshift (t, y, n, 1);
  c += mpn_add_n (z, x, t, n);
  TMP_FREE (marker);
  return c;
}
#else

static mp_limb_t
#if __STDC__
add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
#else
add2Times (z, x, y, n)
     mp_ptr    z;
     mp_srcptr x;
     mp_srcptr y;
     mp_size_t n;
#endif
{
  mp_limb_t c, v, w;

  ASSERT (n > 0);
  v = *x; w = *y;
  c = w >> (BITS_PER_MP_LIMB - 1);
  w <<= 1;
  v += w;
  c += v < w;
  *z = v;
  ++x; ++y; ++z;
  while (--n)
    {
      v = *x;
      w = *y;
      v += c;
      c = v < c;
      c += w >> (BITS_PER_MP_LIMB - 1);
      w <<= 1;
      v += w;
      c += v < w;
      *z = v;
      ++x; ++y; ++z;
    }

  return c;
}
#endif

/*-- evaluate3 -------------------------------------------------------------*/

/* Evaluates:
 *   ph := 4*A+2*B+C
 *   p1 := A+B+C
 *   p2 := A+2*B+4*C
 * where:
 *   ph[], p1[], p2[], A[] and B[] all have length len,
 *   C[] has length len2 with len-len2 = 0, 1 or 2.
 * Returns top words (overflow) at pth, pt1 and pt2 respectively.
 */
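
/* In polynomial terms (an illustration, with x(t) = A*t^2 + B*t + C):
   ph = 4*A + 2*B + C = x(2), p1 = A + B + C = x(1), and
   p2 = A + 2*B + 4*C = 4*x(1/2), scaled by 4 so everything stays
   integral.  Together with x(0) = C and the leading coefficient A,
   these give the five Toom-3 evaluation points 0, 1/2, 1, 2, oo. */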
#ifdef USE_MORE_MPN
static void
#if __STDC__
evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
	   mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2)
#else
evaluate3 (ph, p1, p2, pth, pt1, pt2,
           A, B, C, len, len2)
     mp_ptr    ph;
     mp_ptr    p1;
     mp_ptr    p2;
     mp_ptr    pth;
     mp_ptr    pt1;
     mp_ptr    pt2;
     mp_srcptr A;
     mp_srcptr B;
     mp_srcptr C;
     mp_size_t len;
     mp_size_t len2;
#endif
{
  mp_limb_t c, d, e;

  ASSERT (len - len2 <= 2);

  e = mpn_lshift (p1, B, len, 1);

  c = mpn_lshift (ph, A, len, 2);
  c += e + mpn_add_n (ph, ph, p1, len);
  d = mpn_add_n (ph, ph, C, len2);
  if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d);
  ASSERT (c < 7);
  *pth = c;

  c = mpn_lshift (p2, C, len2, 2);
#if 1
  if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; }
  c += e + mpn_add_n (p2, p2, p1, len);
#else
  d = mpn_add_n (p2, p2, p1, len2);
  c += d;
  if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c);
  c += e;
#endif
  c += mpn_add_n (p2, p2, A, len);
  ASSERT (c < 7);
  *pt2 = c;

  c = mpn_add_n (p1, A, B, len);
  d = mpn_add_n (p1, p1, C, len2);
  if (len2 == len) c += d;
  else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d);
  ASSERT (c < 3);
  *pt1 = c;

}

#else

static void
#if __STDC__
evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
	   mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls)
#else
evaluate3 (ph, p1, p2, pth, pt1, pt2,
           A, B, C, l, ls)
     mp_ptr    ph;
     mp_ptr    p1;
     mp_ptr    p2;
     mp_ptr    pth;
     mp_ptr    pt1;
     mp_ptr    pt2;
     mp_srcptr A;
     mp_srcptr B;
     mp_srcptr C;
     mp_size_t l;
     mp_size_t ls;
#endif
{
  mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2;

  ASSERT (l - ls <= 2);

  th = t1 = t2 = 0;
  for (i = 0; i < l; ++i)
    {
      a = *A;
      b = *B;
      c = i < ls ? *C : 0;

      /* TO DO: choose one of the following alternatives. */
#if 0
      t = a << 2;
      vh = th + t;
      th = vh < t;
      th += a >> (BITS_PER_MP_LIMB - 2);
      t = b << 1;
      vh += t;
      th += vh < t;
      th += b >> (BITS_PER_MP_LIMB - 1);
      vh += c;
      th += vh < c;
#else
      vh = th + c;
      th = vh < c;
      t = b << 1;
      vh += t;
      th += vh < t;
      th += b >> (BITS_PER_MP_LIMB - 1);
      t = a << 2;
      vh += t;
      th += vh < t;
      th += a >> (BITS_PER_MP_LIMB - 2);
#endif

      v1 = t1 + a;
      t1 = v1 < a;
      v1 += b;
      t1 += v1 < b;
      v1 += c;
      t1 += v1 < c;

      v2 = t2 + a;
      t2 = v2 < a;
      t = b << 1;
      v2 += t;
      t2 += v2 < t;
      t2 += b >> (BITS_PER_MP_LIMB - 1);
      t = c << 2;
      v2 += t;
      t2 += v2 < t;
      t2 += c >> (BITS_PER_MP_LIMB - 2);

      *ph = vh;
      *p1 = v1;
      *p2 = v2;

      ++A; ++B; ++C;
      ++ph; ++p1; ++p2;
    }

  ASSERT (th < 7);
  ASSERT (t1 < 3);
  ASSERT (t2 < 7);

  *pth = th;
  *pt1 = t1;
  *pt2 = t2;
}
#endif


/*-- interpolate3 ----------------------------------------------------------*/

/* Interpolates B, C, D (in-place) from:
 *   16*A+8*B+4*C+2*D+E
 *   A+B+C+D+E
 *   A+2*B+4*C+8*D+16*E
 * where:
 *   A[], B[], C[] and D[] all have length l,
 *   E[] has length ls with l-ls = 0, 2 or 4.
 *
 * Reads top words (from earlier overflow) from ptb, ptc and ptd,
 * and returns new top words there.
 */
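
/* Derivation sketch (an illustration, not from the original source):
   after the initial subtractions the three values are
     b = 8*x1 + 4*x2 + 2*x3
     c =   x1 +   x2 +   x3
     d = 2*x1 + 4*x2 + 8*x3
   and the linear steps below solve this system:
     b,d := b+d, b-d   ->  b = 10*x1 + 8*x2 + 10*x3,  d = 6*x1 - 6*x3
     b   := b - 8*c    ->  b = 2*x1 + 2*x3
     c   := 2*c - b    ->  c = 2*x2
     d   := d/3        ->  d = 2*x1 - 2*x3
     b,d := b+d, b-d   ->  b = 4*x1,  d = 4*x3
   which matches the "b = 4*x1, c = 2*x2, d = 4*x3" comment in the code. */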

#ifdef USE_MORE_MPN
static void
#if __STDC__
interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
              mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2)
#else
interpolate3 (A, B, C, D, E,
              ptb, ptc, ptd, len, len2)
     mp_srcptr A;
     mp_ptr    B;
     mp_ptr    C;
     mp_ptr    D;
     mp_srcptr E;
     mp_ptr    ptb;
     mp_ptr    ptc;
     mp_ptr    ptd;
     mp_size_t len;
     mp_size_t len2;
#endif
{
  mp_ptr ws;
  mp_limb_t t, tb,tc,td;
  TMP_DECL (marker);
  TMP_MARK (marker);

  ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4);

  /* Let x1, x2, x3 be the values to interpolate.  We have:
   *         b = 16*a + 8*x1 + 4*x2 + 2*x3 +    e
   *         c =    a +   x1 +   x2 +   x3 +    e
   *         d =    a + 2*x1 + 4*x2 + 8*x3 + 16*e
   */

  ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB);

  tb = *ptb; tc = *ptc; td = *ptd;


  /* b := b - 16*a -    e
   * c := c -    a -    e
   * d := d -    a - 16*e
   */

  t = mpn_lshift (ws, A, len, 4);
  tb -= t + mpn_sub_n (B, B, ws, len);
  t = mpn_sub_n (B, B, E, len2);
  if (len2 == len) tb -= t;
  else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t);

  tc -= mpn_sub_n (C, C, A, len);
  t = mpn_sub_n (C, C, E, len2);
  if (len2 == len) tc -= t;
  else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t);

  t = mpn_lshift (ws, E, len2, 4);
  t += mpn_add_n (ws, ws, A, len2);
#if 1
  if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t);
  td -= t + mpn_sub_n (D, D, ws, len);
#else
  t += mpn_sub_n (D, D, ws, len2);
  if (len2 != len) {
    t = mpn_sub_1 (D+len2, D+len2, len-len2, t);
    t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2);
  } /* end if/else */
  td -= t;
#endif


  /* b, d := b + d, b - d */

#ifdef HAVE_MPN_ADD_SUB_N
  /* #error TO DO ... */
#else
  t = tb + td + mpn_add_n (ws, B, D, len);
  td = tb - td - mpn_sub_n (D, B, D, len);
  tb = t;
  MPN_COPY (B, ws, len);
#endif

  /* b := b-8*c */
  t = 8 * tc + mpn_lshift (ws, C, len, 3);
  tb -= t + mpn_sub_n (B, B, ws, len);

  /* c := 2*c - b */
  tc = 2 * tc + mpn_lshift (C, C, len, 1);
  tc -= tb + mpn_sub_n (C, C, B, len);

  /* d := d/3 */
  td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3;

  /* b, d := b + d, b - d */
#ifdef HAVE_MPN_ADD_SUB_N
  /* #error TO DO ... */
#else
  t = tb + td + mpn_add_n (ws, B, D, len);
  td = tb - td - mpn_sub_n (D, B, D, len);
  tb = t;
  MPN_COPY (B, ws, len);
#endif

      /* Now:
       *	 b = 4*x1
       *	 c = 2*x2
       *	 d = 4*x3
       */

  ASSERT(!(*B & 3));
  mpn_rshift (B, B, len, 2);
  B[len-1] |= tb<<(BITS_PER_MP_LIMB-2);
  ASSERT((long)tb >= 0);
  tb >>= 2;

  ASSERT(!(*C & 1));
  mpn_rshift (C, C, len, 1);
  C[len-1] |= tc<<(BITS_PER_MP_LIMB-1);
  ASSERT((long)tc >= 0);
  tc >>= 1;

  ASSERT(!(*D & 3));
  mpn_rshift (D, D, len, 2);
  D[len-1] |= td<<(BITS_PER_MP_LIMB-2);
  ASSERT((long)td >= 0);
  td >>= 2;

#if WANT_ASSERT
  ASSERT (tb < 2);
  if (len == len2)
    {
      ASSERT (tc < 3);
      ASSERT (td < 2);
    }
  else
    {
      ASSERT (tc < 2);
      ASSERT (!td);
    }
#endif

  *ptb = tb;
  *ptc = tc;
  *ptd = td;

  TMP_FREE (marker);
}

#else

static void
#if __STDC__
interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
	      mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls)
#else
interpolate3 (A, B, C, D, E,
              ptb, ptc, ptd, l, ls)
     mp_srcptr A;
     mp_ptr    B;
     mp_ptr    C;
     mp_ptr    D;
     mp_srcptr E;
     mp_ptr    ptb;
     mp_ptr    ptc;
     mp_ptr    ptd;
     mp_size_t l;
     mp_size_t ls;
#endif
{
  mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od;
  const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);

#if WANT_ASSERT
  t = l - ls;
  ASSERT (t == 0 || t == 2 || t == 4);
#endif

  sb = sc = sd = 0;
  for (i = 0; i < l; ++i)
    {
      mp_limb_t tb, tc, td, tt;

      a = *A;
      b = *B;
      c = *C;
      d = *D;
      e = i < ls ? *E : 0;

      /* Let x1, x2, x3 be the values to interpolate.  We have:
       *	 b = 16*a + 8*x1 + 4*x2 + 2*x3 +    e
       *	 c =	a +   x1 +   x2 +   x3 +    e
       *	 d =	a + 2*x1 + 4*x2 + 8*x3 + 16*e
       */

      /* b := b - 16*a -    e
       * c := c -    a -    e
       * d := d -    a - 16*e
       */
      t = a << 4;
      tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t);
      b -= t;
      tb -= b < e;
      b -= e;
      tc = -(c < a);
      c -= a;
      tc -= c < e;
      c -= e;
      td = -(d < a);
      d -= a;
      t = e << 4;
      td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t);
      d -= t;

      /* b, d := b + d, b - d */
      t = b + d;
      tt = tb + td + (t < b);
      td = tb - td - (b < d);
      d = b - d;
      b = t;
      tb = tt;

      /* b := b-8*c */
      t = c << 3;
      tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t);
      b -= t;

      /* c := 2*c - b */
      t = c << 1;
      tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b);
      c = t - b;

      /* d := d/3 */
      d *= INVERSE_3;
      td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d);
      td *= INVERSE_3;

      /* b, d := b + d, b - d */
      t = b + d;
      tt = tb + td + (t < b);
      td = tb - td - (b < d);
      d = b - d;
      b = t;
      tb = tt;

      /* Now:
       *	 b = 4*x1
       *	 c = 2*x2
       *	 d = 4*x3
       */

      /* sb has period 2. */
      b += sb;
      tb += b < sb;
      sb &= maskOffHalf;
      sb |= sb >> (BITS_PER_MP_LIMB >> 1);
      sb += tb;

      /* sc has period 1. */
      c += sc;
      tc += c < sc;
      /* TO DO: choose one of the following alternatives. */
#if 1
      sc = (mp_limb_t)((mp_limb_signed_t)sc >> (BITS_PER_MP_LIMB - 1));
      sc += tc;
#else
      sc = tc - ((mp_limb_signed_t)sc < 0L);
#endif

      /* sd has period 2. */
      d += sd;
      td += d < sd;
      sd &= maskOffHalf;
      sd |= sd >> (BITS_PER_MP_LIMB >> 1);
      sd += td;

      if (i != 0)
	{
	  B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
	  C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
	  D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
	}
      ob = b >> 2;
      oc = c >> 1;
      od = d >> 2;

      ++A; ++B; ++C; ++D; ++E;
    }

  /* Handle top words. */
  b = *ptb;
  c = *ptc;
  d = *ptd;

  t = b + d;
  d = b - d;
  b = t;
  b -= c << 3;
  c = (c << 1) - b;
  d *= INVERSE_3;
  t = b + d;
  d = b - d;
  b = t;

  b += sb;
  c += sc;
  d += sd;

  B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
  C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
  D[-1] = od | d << (BITS_PER_MP_LIMB - 2);

  b >>= 2;
  c >>= 1;
  d >>= 2;

#if WANT_ASSERT
  ASSERT (b < 2);
  if (l == ls)
    {
      ASSERT (c < 3);
      ASSERT (d < 2);
    }
  else
    {
      ASSERT (c < 2);
      ASSERT (!d);
    }
#endif

  *ptb = b;
  *ptc = c;
  *ptd = d;
}
#endif


/*-- mpn_toom3_mul_n --------------------------------------------------------------*/

/* Multiplies using 5 mults of one third size and so on recursively.
 * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
 * No overlap of p[...] with a[...] or b[...].
 * ws is workspace.
 */
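
/* Sketch of the scheme (an illustration): write a = a2*x^2 + a1*x + a0
   and b = b2*x^2 + b1*x + b0 where x = 2^(l*BITS_PER_MP_LIMB).  The
   product c(x) = a(x)*b(x) has five coefficients, so five pointwise
   products -- at the points 0, 1/2, 1, 2 and infinity produced by
   evaluate3() -- determine it; interpolate3() then recovers the middle
   three coefficients, and the final stage adds everything up at the
   right limb offsets. */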

/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the
 *        recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n()
 *        because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false.
 */

#define TOOM3_MUL_REC(p, a, b, n, ws) \
  do {								\
    if (n < KARATSUBA_MUL_THRESHOLD)				\
      mpn_mul_basecase (p, a, n, b, n);				\
    else if (n < TOOM3_MUL_THRESHOLD)				\
      mpn_kara_mul_n (p, a, b, n, ws);				\
    else							\
      mpn_toom3_mul_n (p, a, b, n, ws);				\
  } while (0)

void
#if __STDC__
mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
#else
mpn_toom3_mul_n (p, a, b, n, ws)
     mp_ptr    p;
     mp_srcptr a;
     mp_srcptr b;
     mp_size_t n;
     mp_ptr    ws;
#endif
{
  mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD;
  mp_limb_t *A,*B,*C,*D,*E, *W;
  mp_size_t l,l2,l3,l4,l5,ls;

  SCHEME_BIGNUM_USE_FUEL(n);

  /* Break n words into chunks of size l, l and ls.
   * n = 3*k   => l = k,   ls = k
   * n = 3*k+1 => l = k+1, ls = k-1
   * n = 3*k+2 => l = k+1, ls = k
   */
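  /* Example (illustration): n = 10 = 3*3+1 gives l = 4, ls = 2, so the
     chunks a[0..3], a[4..7], a[8..9] cover all n limbs (and likewise
     for b). */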
  {
    mp_limb_t m;

    ASSERT (n >= TOOM3_MUL_THRESHOLD);
    l = ls = n / 3;
    m = n - l * 3;
    if (m != 0)
      ++l;
    if (m == 1)
      --ls;

    l2 = l * 2;
    l3 = l * 3;
    l4 = l * 4;
    l5 = l * 5;
    A = p;
    B = ws;
    C = p + l2;
    D = ws + l2;
    E = p + l4;
    W = ws + l4;
  }

  /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
  evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
  evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls);

  /** Second stage: pointwise multiplies. **/
  TOOM3_MUL_REC(D, C, C + l, l, W);
  tD = cD*dD;
  if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD);
  if (dD) tD += mpn_addmul_1 (D + l, C, l, dD);
  ASSERT (tD < 49);
  TOOM3_MUL_REC(C, B, B + l, l, W);
  tC = cC*dC;
  /* TO DO: choose one of the following alternatives. */
#if 0
  if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC);
  if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);
#else
  if (cC)
    {
      if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l);
      else tC += add2Times (C + l, C + l, B + l, l);
    }
  if (dC)
    {
      if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l);
      else tC += add2Times (C + l, C + l, B, l);
    }
#endif
  ASSERT (tC < 9);
  TOOM3_MUL_REC(B, A, A + l, l, W);
  tB = cB*dB;
  if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB);
  if (dB) tB += mpn_addmul_1 (B + l, A, l, dB);
  ASSERT (tB < 49);
  TOOM3_MUL_REC(A, a, b, l, W);
  TOOM3_MUL_REC(E, a + l2, b + l2, ls, W);

  /** Third stage: interpolation. **/
  interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);

  /** Final stage: add up the coefficients. **/
  {
    /* mp_limb_t i, x, y; */
    tB += mpn_add_n (p + l, p + l, B, l2);
    tD += mpn_add_n (p + l3, p + l3, D, l2);
    mpn_incr_u (p + l3, tB);
    mpn_incr_u (p + l4, tC);
    mpn_incr_u (p + l5, tD);
  }
}

/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/

/* Like previous function but for squaring */

#define TOOM3_SQR_REC(p, a, n, ws) \
  do {								\
    if (n < KARATSUBA_SQR_THRESHOLD)				\
      mpn_sqr_basecase (p, a, n);				\
    else if (n < TOOM3_SQR_THRESHOLD)				\
      mpn_kara_sqr_n (p, a, n, ws);				\
    else							\
      mpn_toom3_sqr_n (p, a, n, ws);				\
  } while (0)

void
#if __STDC__
mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
#else
mpn_toom3_sqr_n (p, a, n, ws)
     mp_ptr    p;
     mp_srcptr a;
     mp_size_t n;
     mp_ptr    ws;
#endif
{
  mp_limb_t cB,cC,cD, tB,tC,tD;
  mp_limb_t *A,*B,*C,*D,*E, *W;
  mp_size_t l,l2,l3,l4,l5,ls;

  SCHEME_BIGNUM_USE_FUEL(n);

  /* Break n words into chunks of size l, l and ls.
   * n = 3*k   => l = k,   ls = k
   * n = 3*k+1 => l = k+1, ls = k-1
   * n = 3*k+2 => l = k+1, ls = k
   */
  {
    mp_limb_t m;

    ASSERT (n >= TOOM3_MUL_THRESHOLD);
    l = ls = n / 3;
    m = n - l * 3;
    if (m != 0)
      ++l;
    if (m == 1)
      --ls;

    l2 = l * 2;
    l3 = l * 3;
    l4 = l * 4;
    l5 = l * 5;
    A = p;
    B = ws;
    C = p + l2;
    D = ws + l2;
    E = p + l4;
    W = ws + l4;
  }

  /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
  evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);

  /** Second stage: pointwise multiplies. **/
  TOOM3_SQR_REC(D, C, l, W);
  tD = cD*cD;
  if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD);
  ASSERT (tD < 49);
  TOOM3_SQR_REC(C, B, l, W);
  tC = cC*cC;
  /* TO DO: choose one of the following alternatives. */
#if 0
  if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);
#else
  if (cC >= 1)
    {
      tC += add2Times (C + l, C + l, B, l);
      if (cC == 2)
        tC += add2Times (C + l, C + l, B, l);
    }
#endif
  ASSERT (tC < 9);
  TOOM3_SQR_REC(B, A, l, W);
  tB = cB*cB;
  if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB);
  ASSERT (tB < 49);
  TOOM3_SQR_REC(A, a, l, W);
  TOOM3_SQR_REC(E, a + l2, ls, W);

  /** Third stage: interpolation. **/
  interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);

  /** Final stage: add up the coefficients. **/
  {
    /* mp_limb_t i, x, y; */
    tB += mpn_add_n (p + l, p + l, B, l2);
    tD += mpn_add_n (p + l3, p + l3, D, l2);
    mpn_incr_u (p + l3, tB);
    mpn_incr_u (p + l4, tC);
    mpn_incr_u (p + l5, tD);
  }
}

void
#if __STDC__
mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
#else
mpn_mul_n (p, a, b, n)
     mp_ptr    p;
     mp_srcptr a;
     mp_srcptr b;
     mp_size_t n;
#endif
{
  if (n < KARATSUBA_MUL_THRESHOLD)
    mpn_mul_basecase (p, a, n, b, n);
  else if (n < TOOM3_MUL_THRESHOLD)
    {
      /* Allocate workspace of fixed size on stack: fast! */
#if TUNE_PROGRAM_BUILD
      mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB];
#else
      mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB];
#endif
      mpn_kara_mul_n (p, a, b, n, ws);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else if (n < FFT_MUL_THRESHOLD)
#else
  else
#endif
    {
      /* Use workspace of unknown size in heap, as stack space may
       * be limited.  Since n is at least TOOM3_MUL_THRESHOLD, the
       * multiplication will take much longer than malloc()/free().  */
      mp_limb_t wsLen, *ws;
      TMP_DECL (marker);
      TMP_MARK (marker);

      wsLen = 2 * n + 3 * BITS_PER_MP_LIMB;
      ws = (mp_ptr) TMP_ALLOC ((size_t) wsLen * sizeof (mp_limb_t));
      mpn_toom3_mul_n (p, a, b, n, ws);
      TMP_FREE (marker);
    }
#if WANT_FFT || TUNE_PROGRAM_BUILD
  else
    {
      mpn_mul_fft_full (p, a, n, b, n);
    }
#endif
}


/* mpn_rshift -- Shift right a low-level natural-number integer. */

/* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right
   and store the USIZE least significant limbs of the result at WP.
   The bits shifted out to the right are returned.

   Argument constraints:
   1. 0 < CNT < BITS_PER_MP_LIMB
   2. If the result is to be written over the input, WP must be <= UP.
*/
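
/* Worked example (illustration only): with 32-bit limbs, usize = 2,
   up = {0x00000001, 0x80000000} (the value 0x8000000000000001) and
   cnt = 1, the result is wp = {0x00000000, 0x40000000} and the return
   value is 0x80000000 -- the shifted-out low bit parked at the top of
   a limb. */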

mp_limb_t
#if __STDC__
mpn_rshift (register mp_ptr wp,
	    register mp_srcptr up, mp_size_t usize,
	    register unsigned int cnt)
#else
mpn_rshift (wp, up, usize, cnt)
     register mp_ptr wp;
     register mp_srcptr up;
     mp_size_t usize;
     register unsigned int cnt;
#endif
{
  register mp_limb_t high_limb, low_limb;
  register unsigned sh_1, sh_2;
  register mp_size_t i;
  mp_limb_t retval;

#ifdef DEBUG
  if (usize == 0 || cnt == 0)
    abort ();
#endif

  sh_1 = cnt;

#if 0
  if (sh_1 == 0)
    {
      if (wp != up)
	{
	  /* Copy from low end to high end, to allow specified input/output
	     overlapping.  */
	  for (i = 0; i < usize; i++)
	    wp[i] = up[i];
	}
      return usize;
    }
#endif

  wp -= 1;
  sh_2 = BITS_PER_MP_LIMB - sh_1;
  high_limb = up[0];
  retval = high_limb << sh_2;
  low_limb = high_limb;

  for (i = 1; i < usize; i++)
    {
      high_limb = up[i];
      wp[i] = (low_limb >> sh_1) | (high_limb << sh_2);
      low_limb = high_limb;
    }
  wp[i] = low_limb >> sh_1;

  return retval;
}

/* mpn_lshift -- Shift left low level. */

/* Shift U (pointed to by UP and USIZE digits long) CNT bits to the left
   and store the USIZE least significant digits of the result at WP.
   Return the bits shifted out from the most significant digit.

   Argument constraints:
   1. 0 < CNT < BITS_PER_MP_LIMB
   2. If the result is to be written over the input, WP must be >= UP.
*/

mp_limb_t
#if __STDC__
mpn_lshift (register mp_ptr wp,
	    register mp_srcptr up, mp_size_t usize,
	    register unsigned int cnt)
#else
mpn_lshift (wp, up, usize, cnt)
     register mp_ptr wp;
     register mp_srcptr up;
     mp_size_t usize;
     register unsigned int cnt;
#endif
{
  register mp_limb_t high_limb, low_limb;
  register unsigned sh_1, sh_2;
  register mp_size_t i;
  mp_limb_t retval;

#ifdef DEBUG
  if (usize == 0 || cnt == 0)
    abort ();
#endif

  sh_1 = cnt;
#if 0
  if (sh_1 == 0)
    {
      if (wp != up)
	{
	  /* Copy from high end to low end, to allow specified input/output
	     overlapping.  */
	  for (i = usize - 1; i >= 0; i--)
	    wp[i] = up[i];
	}
      return 0;
    }
#endif

  wp += 1;
  sh_2 = BITS_PER_MP_LIMB - sh_1;
  i = usize - 1;
  low_limb = up[i];
  retval = low_limb >> sh_2;
  high_limb = low_limb;
  while (--i >= 0)
    {
      low_limb = up[i];
      wp[i] = (high_limb << sh_1) | (low_limb >> sh_2);
      high_limb = low_limb;
    }
  wp[i] = high_limb << sh_1;

  return retval;
}



/* Conversion of U {up,un} to a string in base b.  Internally, we convert to
     base B = b^m, the largest power of b that fits a limb.  Basic algorithms:

  A) Divide U repeatedly by B, generating a quotient and remainder, until the
     quotient becomes zero.  The remainders hold the converted digits.  Digits
     come out from right to left.  (Used in mpn_sb_get_str.)

  B) Divide U by b^g, for g such that 1/b <= U/b^g < 1, generating a fraction.
     Then develop digits by multiplying the fraction repeatedly by b.  Digits
     come out from left to right.  (Currently not used herein, except for in
     code for converting single limbs to individual digits.)

  C) Compute B^1, B^2, B^4, ..., B^(2^s), for s such that B^(2^s) > sqrt(U).
     Then divide U by B^(2^k), generating an integer quotient and remainder.
     Recursively convert the quotient, then the remainder, using the
     precomputed powers.  Digits come out from left to right.  (Used in
     mpn_dc_get_str.)

  When using algorithm C, algorithm B might be suitable for basecase code,
  since the required b^g power will be readily accessible.

  Optimization ideas:
  1. The recursive function of (C) could avoid TMP allocation:
     a) Overwrite dividend with quotient and remainder, just as permitted by
        mpn_sb_divrem_mn.
     b) If TMP memory is anyway needed, pass it as a parameter, similarly to
        how we do it in Karatsuba multiplication.
  2. Store the powers of (C) in normalized form, with the normalization count.
     Quotients will usually need to be left-shifted before each divide, and
     remainders will either need to be left-shifted or right-shifted.
  3. When b is even, the powers will end up with lots of low zero limbs.  Could
     save significant time in the mpn_tdiv_qr call by stripping these zeros.
  4. In the code for developing digits from a single limb, we could avoid using
     a full umul_ppmm except for the first (or first few) digits, provided base
     is even.  Subsequent digits can be developed using plain multiplication.
     (This saves on register-starved machines (read x86) and on all machines
     that generate the upper product half using a separate instruction (alpha,
     powerpc, IA-64) or lack such support altogether (sparc64, hppa64).)
  5. Separate mpn_dc_get_str basecase code from code for small conversions. The
     former code will have the exact right power readily available in the
     powtab parameter for dividing the current number into a fraction.  Convert
     that using algorithm B.
  6. Completely avoid division.  Compute the inverses of the powers now in
     powtab instead of the actual powers.

  Basic structure of (C):
    mpn_get_str:
      if POW2_P (n)
	...
      else
	if (un < GET_STR_PRECOMPUTE_THRESHOLD)
	  mpn_sb_get_str (str, base, up, un);
	else
	  precompute_power_tables
	  mpn_dc_get_str

    mpn_dc_get_str:
	mpn_tdiv_qr
	if (qn < GET_STR_DC_THRESHOLD)
	  mpn_sb_get_str
	else
	  mpn_dc_get_str
	if (rn < GET_STR_DC_THRESHOLD)
	  mpn_sb_get_str
	else
	  mpn_dc_get_str

  The reason for the two threshold values is the cost of
  precompute_power_tables.  GET_STR_PRECOMPUTE_THRESHOLD will be considerably
  larger than GET_STR_DC_THRESHOLD.  Do you think I should change
  mpn_dc_get_str to instead look like the following?  */
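
/* A small worked instance of algorithm C (illustration, not from the
   original source): with 32-bit limbs and base 10, B = 10^9 and
   digits_in_base records how many digits each stored power represents.
   To convert U = 123456789012345678 we divide by B, giving
   q = 123456789 and r = 12345678; q is converted first, then r is
   converted into exactly 9 characters, zero-padded on the left
   ("012345678"), which is why mpn_dc_get_str passes
   powtab->digits_in_base as the length for the remainder branch. */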
1735 
1736 
1737 /* The x86s and m68020 have a quotient and remainder "div" instruction and
1738    gcc recognises an adjacent "/" and "%" can be combined using that.
1739    Elsewhere "/" and "%" are either separate instructions, or separate
1740    libgcc calls (which unfortunately gcc as of version 3.0 doesn't combine).
1741    A multiply and subtract should be faster than a "%" in those cases.  */
1742 #if HAVE_HOST_CPU_FAMILY_x86            \
1743   || HAVE_HOST_CPU_m68020               \
1744   || HAVE_HOST_CPU_m68030               \
1745   || HAVE_HOST_CPU_m68040               \
1746   || HAVE_HOST_CPU_m68060               \
1747   || HAVE_HOST_CPU_m68360 /* CPU32 */
1748 #define udiv_qrnd_unnorm(q,r,n,d)       \
1749   do {                                  \
1750     mp_limb_t  __q = (n) / (d);         \
1751     mp_limb_t  __r = (n) % (d);         \
1752     (q) = __q;                          \
1753     (r) = __r;                          \
1754   } while (0)
1755 #else
1756 #define udiv_qrnd_unnorm(q,r,n,d)       \
1757   do {                                  \
1758     mp_limb_t  __q = (n) / (d);         \
1759     mp_limb_t  __r = (n) - __q*(d);     \
1760     (q) = __q;                          \
1761     (r) = __r;                          \
1762   } while (0)
1763 #endif
1764 
1765 /* When to stop divide-and-conquer and call the basecase mpn_get_str.  */
1766 #ifndef GET_STR_DC_THRESHOLD
1767 #define GET_STR_DC_THRESHOLD 15
1768 #endif
1769 /* Whether to bother at all with precomputing powers of the base, or go
1770    to the basecase mpn_get_str directly.  */
1771 #ifndef GET_STR_PRECOMPUTE_THRESHOLD
1772 #define GET_STR_PRECOMPUTE_THRESHOLD 30
1773 #endif
1774 
1775 struct powers
1776 {
1777   size_t digits_in_base;
1778   mp_ptr p;
1779   mp_size_t n;		/* mpz_struct uses int for sizes, but not mpn! */
1780   int base;
1781 };
1782 typedef struct powers powers_t;
1783 
1784 /* Convert {UP,UN} to a string with a base as represented in POWTAB, and put
1785    the string in STR.  Generate LEN characters, possibly padding with zeros to
1786    the left.  If LEN is zero, generate as many characters as required.
1787    Return a pointer immediately after the last digit of the result string.
1788    Complexity is O(UN^2); intended for small conversions.  */
1789 static unsigned char *
mpn_sb_get_str(unsigned char * str,size_t len,mp_ptr up,mp_size_t un,const powers_t * powtab)1790 mpn_sb_get_str (unsigned char *str, size_t len,
1791 		mp_ptr up, mp_size_t un,
1792 		const powers_t *powtab)
1793 {
1794   mp_limb_t rl, ul;
1795   unsigned char *s;
1796   int base;
1797   size_t l;
1798   /* Allocate memory for largest possible string, given that we only get here
1799      for operands with un < GET_STR_PRECOMPUTE_THRESHOLD and that the smallest
1800      base is 3.  7/11 is an approximation to 1/log2(3).  */
1801 #if TUNE_PROGRAM_BUILD
1802 #define BUF_ALLOC (GET_STR_THRESHOLD_LIMIT * BITS_PER_MP_LIMB * 7 / 11)
1803 #else
1804 #define BUF_ALLOC (GET_STR_PRECOMPUTE_THRESHOLD * BITS_PER_MP_LIMB * 7 / 11)
1805 #endif
1806   unsigned char buf[BUF_ALLOC];
1807 #if TUNE_PROGRAM_BUILD
1808   mp_limb_t rp[GET_STR_THRESHOLD_LIMIT];
1809 #else
1810   mp_limb_t rp[GET_STR_PRECOMPUTE_THRESHOLD];
1811 #endif
1812 
1813   base = powtab->base;
1814   if (base == 10)
1815     {
1816       /* Special case code for base==10 so that the compiler has a chance to
1817 	 optimize things.  */
1818 
1819       MPN_COPY (rp + 1, up, un);
1820 
1821       s = buf + BUF_ALLOC;
1822       while (un > 1)
1823 	{
1824 	  int i;
1825 	  mp_limb_t frac, digit;
1826 	  MPN_DIVREM_OR_PREINV_DIVREM_1 (rp, (mp_size_t) 1, rp + 1, un,
1827 					 MP_BASES_BIG_BASE_10,
1828 					 MP_BASES_BIG_BASE_INVERTED_10,
1829 					 MP_BASES_NORMALIZATION_STEPS_10);
1830 	  un -= rp[un] == 0;
1831 	  frac = (rp[0] + 1) << GMP_NAIL_BITS;
1832 	  s -= MP_BASES_CHARS_PER_LIMB_10;
1833 #if HAVE_HOST_CPU_FAMILY_x86
1834 	  /* The code below turns out to be a bit slower for x86 using gcc.
1835 	     Use plain code.  */
1836 	  i = MP_BASES_CHARS_PER_LIMB_10;
1837 	  do
1838 	    {
1839 	      umul_ppmm (digit, frac, frac, 10);
1840 	      *s++ = digit;
1841 	    }
1842 	  while (--i);
1843 #else
1844 	  /* Use the fact that 10 in binary is 1010, with the lowest bit 0.
1845 	     After a few umul_ppmm, we will have accumulated enough low zeros
1846 	     to use a plain multiply.  */
1847 	  if (MP_BASES_NORMALIZATION_STEPS_10 == 0)
1848 	    {
1849 	      umul_ppmm (digit, frac, frac, 10);
1850 	      *s++ = digit;
1851 	    }
1852 	  if (MP_BASES_NORMALIZATION_STEPS_10 <= 1)
1853 	    {
1854 	      umul_ppmm (digit, frac, frac, 10);
1855 	      *s++ = digit;
1856 	    }
1857 	  if (MP_BASES_NORMALIZATION_STEPS_10 <= 2)
1858 	    {
1859 	      umul_ppmm (digit, frac, frac, 10);
1860 	      *s++ = digit;
1861 	    }
1862 	  if (MP_BASES_NORMALIZATION_STEPS_10 <= 3)
1863 	    {
1864 	      umul_ppmm (digit, frac, frac, 10);
1865 	      *s++ = digit;
1866 	    }
1867 	  i = MP_BASES_CHARS_PER_LIMB_10 - (4-MP_BASES_NORMALIZATION_STEPS_10);
1868 	  frac = (frac + 0xf) >> 4;
1869 	  do
1870 	    {
1871 	      frac *= 10;
1872 	      digit = frac >> (BITS_PER_MP_LIMB - 4);
1873 	      *s++ = digit;
1874 	      frac &= (~(mp_limb_t) 0) >> 4;
1875 	    }
1876 	  while (--i);
1877 #endif
1878 	  s -= MP_BASES_CHARS_PER_LIMB_10;
1879 	}
1880 
1881       ul = rp[1];
1882       while (ul != 0)
1883 	{
1884 	  udiv_qrnd_unnorm (ul, rl, ul, 10);
1885 	  *--s = rl;
1886 	}
1887     }
1888   else /* not base 10 */
1889     {
1890       unsigned chars_per_limb;
1891       mp_limb_t big_base;
1892       mp_limb_t big_base_inverted USED_ONLY_SOMETIMES;
1893       unsigned normalization_steps USED_ONLY_SOMETIMES;
1894 
1895       chars_per_limb = __mp_bases[base].chars_per_limb;
1896       big_base = __mp_bases[base].big_base;
1897       big_base_inverted = __mp_bases[base].big_base_inverted;
1898       count_leading_zeros (normalization_steps, big_base);
1899 
1900       MPN_COPY (rp + 1, up, un);
1901 
1902       s = buf + BUF_ALLOC;
1903       while (un > 1)
1904 	{
1905 	  int i;
1906 	  mp_limb_t frac;
1907 	  MPN_DIVREM_OR_PREINV_DIVREM_1 (rp, (mp_size_t) 1, rp + 1, un,
1908 					 big_base, big_base_inverted,
1909 					 normalization_steps);
1910 	  un -= rp[un] == 0;
1911 	  frac = (rp[0] + 1) << GMP_NAIL_BITS;
1912 	  s -= chars_per_limb;
1913 	  i = chars_per_limb;
1914 	  do
1915 	    {
1916 	      mp_limb_t digit;
1917 	      umul_ppmm (digit, frac, frac, base);
1918 	      *s++ = digit;
1919 	    }
1920 	  while (--i);
1921 	  s -= chars_per_limb;
1922 	}
1923 
1924       ul = rp[1];
1925       while (ul != 0)
1926 	{
1927 	  udiv_qrnd_unnorm (ul, rl, ul, base);
1928 	  *--s = rl;
1929 	}
1930     }
1931 
1932   l = buf + BUF_ALLOC - s;
1933   while (l < len)
1934     {
1935       *str++ = 0;
1936       len--;
1937     }
1938   while (l != 0)
1939     {
1940       *str++ = *s++;
1941       l--;
1942     }
1943   return str;
1944 }
1945 
1946 /* Convert {UP,UN} to a string with a base as represented in POWTAB, and put
1947    the string in STR.  Generate LEN characters, possibly padding with zeros to
1948    the left.  If LEN is zero, generate as many characters as required.
1949    Return a pointer immediately after the last digit of the result string.
1950    This uses divide-and-conquer and is intended for large conversions.  */
1951 static unsigned char *
mpn_dc_get_str(unsigned char * str,size_t len,mp_ptr up,mp_size_t un,const powers_t * powtab)1952 mpn_dc_get_str (unsigned char *str, size_t len,
1953 		mp_ptr up, mp_size_t un,
1954 		const powers_t *powtab)
1955 {
1956   if (un < GET_STR_DC_THRESHOLD)
1957     {
1958       if (un != 0)
1959 	str = mpn_sb_get_str (str, len, up, un, powtab);
1960       else
1961 	{
1962 	  while (len != 0)
1963 	    {
1964 	      *str++ = 0;
1965 	      len--;
1966 	    }
1967 	}
1968     }
1969   else
1970     {
1971       mp_ptr pwp, qp, rp;
1972       mp_size_t pwn, qn;
1973 
1974       pwp = powtab->p;
1975       pwn = powtab->n;
1976 
1977       if (un < pwn || (un == pwn && mpn_cmp (up, pwp, un) < 0))
1978 	{
1979 	  str = mpn_dc_get_str (str, len, up, un, powtab - 1);
1980 	}
1981       else
1982 	{
1983 	  TMP_DECL (marker);
1984 	  TMP_MARK (marker);
1985 	  qp = TMP_ALLOC_LIMBS (un - pwn + 1);
1986 	  rp = TMP_ALLOC_LIMBS (pwn);
1987 
1988 	  mpn_tdiv_qr (qp, rp, 0L, up, un, pwp, pwn);
1989 	  qn = un - pwn; qn += qp[qn] != 0;		/* quotient size */
1990 	  if (len != 0)
1991 	    len = len - powtab->digits_in_base;
1992 	  str = mpn_dc_get_str (str, len, qp, qn, powtab - 1);
1993 	  str = mpn_dc_get_str (str, powtab->digits_in_base, rp, pwn, powtab - 1);
1994 	  TMP_FREE (marker);
1995 	}
1996     }
1997   return str;
1998 }
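/* Decimal analogy of the recursion above (illustrative only): to convert
   N = 123456789012 against a top power P = 10^6, split N into
   q = N div P = 123456 and r = N mod P = 789012.  The high part q is
   converted with no fixed length, while r is converted with
   len = powtab->digits_in_base = 6, so that a small r such as 9012 would
   still come out as "009012"; that zero padding is why the second
   recursive call passes a fixed len.  */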
1999 
2000 /* There are no leading zeros on the digits generated at str, but that's not
2001    currently a documented feature.  */
2002 
2003 size_t
2004 mpn_get_str (unsigned char *str, int base, mp_ptr up, mp_size_t un)
2005 {
2006   mp_ptr powtab_mem, powtab_mem_ptr;
2007   mp_limb_t big_base;
2008   size_t digits_in_base;
2009   powers_t powtab[30];
2010   int pi;
2011   mp_size_t n;
2012   mp_ptr p, t;
2013   size_t out_len;
2014 
2015   /* Special case zero, as the code below doesn't handle it.  */
2016   if (un == 0)
2017     {
2018       str[0] = 0;
2019       return 1;
2020     }
2021 
2022   if (POW2_P (base))
2023     {
2024       /* The base is a power of 2.  Convert from most significant end.  */
2025       mp_limb_t n1, n0;
2026       int bits_per_digit = __mp_bases[base].big_base;
2027       int cnt;
2028       int bit_pos;
2029       mp_size_t i;
2030       unsigned char *s = str;
2031 
2032       n1 = up[un - 1];
2033       count_leading_zeros (cnt, n1);
2034 
2035       /* BIT_POS should be R when input ends in least significant nibble,
2036 	 R + bits_per_digit * n when input ends in nth least significant
2037 	 nibble. */
2038 
2039       {
2040 	unsigned long bits;
2041 
2042 	bits = GMP_NUMB_BITS * un - cnt + GMP_NAIL_BITS;
2043 	cnt = bits % bits_per_digit;
2044 	if (cnt != 0)
2045 	  bits += bits_per_digit - cnt;
2046 	bit_pos = bits - (un - 1) * GMP_NUMB_BITS;
2047       }
2048 
2049       /* Fast loop for bit output.  */
2050       i = un - 1;
2051       for (;;)
2052 	{
2053 	  bit_pos -= bits_per_digit;
2054 	  while (bit_pos >= 0)
2055 	    {
2056 	      *s++ = (n1 >> bit_pos) & ((1 << bits_per_digit) - 1);
2057 	      bit_pos -= bits_per_digit;
2058 	    }
2059 	  i--;
2060 	  if (i < 0)
2061 	    break;
2062 	  n0 = (n1 << -bit_pos) & ((1 << bits_per_digit) - 1);
2063 	  n1 = up[i];
2064 	  bit_pos += GMP_NUMB_BITS;
2065 	  *s++ = n0 | (n1 >> bit_pos);
2066 
2067 	  if (!(i & 0xFF)) SCHEME_BIGNUM_USE_FUEL(1);
2068 	}
2069 
2070       *s = 0;
2071 
2072       return s - str;
2073     }
2074 
2075   /* General case.  The base is not a power of 2.  */
2076 
2077   if (un < GET_STR_PRECOMPUTE_THRESHOLD)
2078     {
2079       struct powers ptab[1];
2080       ptab[0].base = base;
2081       return mpn_sb_get_str (str, (size_t) 0, up, un, ptab) - str;
2082     }
2083 
2084   /* Allocate one large block for the powers of big_base.  With the current
2085      scheme, we need to allocate twice as much as would be needed if a
2086      minimal set of powers were generated.  */
2087 #define ALLOC_SIZE (2 * un + 30)
2088  {
2089    TMP_DECL (marker);
2090    TMP_MARK (marker);
2091 
2092    powtab_mem = __GMP_ALLOCATE_FUNC_LIMBS (ALLOC_SIZE);
2093    powtab_mem_ptr = powtab_mem;
2094 
2095   /* Compute a table of powers: big_base^1, big_base^2, big_base^4, ...,
2096      big_base^(2^k), for k such that the biggest power is between U and
2097      sqrt(U).  */
2098 
2099   big_base = __mp_bases[base].big_base;
2100   digits_in_base = __mp_bases[base].chars_per_limb;
2101 
2102   powtab[0].base = base; /* FIXME: hack for getting base to mpn_sb_get_str */
2103   powtab[1].p = &big_base;
2104   powtab[1].n = 1;
2105   powtab[1].digits_in_base = digits_in_base;
2106   powtab[1].base = base;
2107   powtab[2].p = &big_base;
2108   powtab[2].n = 1;
2109   powtab[2].digits_in_base = digits_in_base;
2110   powtab[2].base = base;
2111   n = 1;
2112   pi = 2;
2113   p = &big_base;
2114   for (;;)
2115     {
2116       ++pi;
2117       t = powtab_mem_ptr;
2118       powtab_mem_ptr += 2 * n;
2119       mpn_sqr_n (t, p, n);
2120       n *= 2; n -= t[n - 1] == 0;
2121       digits_in_base *= 2;
2122       p = t;
2123       powtab[pi].p = p;
2124       powtab[pi].n = n;
2125       powtab[pi].digits_in_base = digits_in_base;
2126       powtab[pi].base = base;
2127 
2128       if (2 * n > un)
2129 	break;
2130     }
2131   ASSERT_ALWAYS (ALLOC_SIZE > powtab_mem_ptr - powtab_mem);
2132 
2133   /* Using our precomputed powers, now in powtab[], convert our number.  */
2134   out_len = mpn_dc_get_str (str, 0, up, un, powtab + pi) - str;
2135 
2136   __GMP_FREE_FUNC_LIMBS (powtab_mem, ALLOC_SIZE);
2137 
2138   TMP_FREE(marker);
2139  }
2140 
2141   return out_len;
2142 }
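/* A minimal usage sketch for mpn_get_str (disabled; illustrative only,
   with a hypothetical caller name).  The output is raw digit values in
   0..base-1, most significant first, not ASCII characters; callers
   should treat {up,un} as scratch.  */
#if 0
static void example_mpn_get_str (void)
{
  mp_limb_t u[1] = { 12345 };
  unsigned char digits[64];		/* ample for one limb in base 10 */
  size_t n = mpn_get_str (digits, 10, u, 1);
  /* now n == 5 and digits[0..4] hold 1,2,3,4,5 */
}
#endif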
2143 
2144 
2145 /* When to switch to sub-quadratic code.  This counts characters/digits in
2146    the input string, not limbs as most other *_THRESHOLD.  */
2147 #ifndef SET_STR_THRESHOLD
2148 #define SET_STR_THRESHOLD 4000
2149 #endif
2150 
2151 /* Don't define this to anything but 1 for now.  In order to make other values
2152    work well, either the convert_blocks function should be generalized to handle
2153    larger blocks than chars_per_limb, or the basecase code should be broken out
2154    of the main function.  Also note that this must be a power of 2.  */
2155 #ifndef SET_STR_BLOCK_SIZE
2156 #define SET_STR_BLOCK_SIZE 1	/* Must be a power of 2. */
2157 #endif
2158 
2159 
2160 /* This check interferes with expression based values of SET_STR_THRESHOLD
2161    used for tuning and measuring.
2162 #if SET_STR_BLOCK_SIZE >= SET_STR_THRESHOLD
2163 These values are silly.
2164 The sub-quadratic code would recurse to itself.
2165 #endif
2166 */
2167 
2168 static mp_size_t
2169 convert_blocks (mp_ptr dp, const unsigned char *str, size_t str_len, int base);
2170 
2171 mp_size_t
2172 mpn_set_str (mp_ptr rp, const unsigned char *str, size_t str_len, int base)
2173 {
2174   mp_size_t size;
2175   mp_limb_t big_base;
2176   int chars_per_limb;
2177   mp_limb_t res_digit;
2178 
2179   ASSERT (base >= 2);
2180   ASSERT (base < numberof (__mp_bases));
2181   ASSERT (str_len >= 1);
2182 
2183   big_base = __mp_bases[base].big_base;
2184   chars_per_limb = __mp_bases[base].chars_per_limb;
2185 
2186   size = 0;
2187 
2188   if (POW2_P (base))
2189     {
2190       /* The base is a power of 2.  Read the input string from least to most
2191 	 significant character/digit.  */
2192 
2193       const unsigned char *s;
2194       int next_bitpos;
2195       int bits_per_indigit = big_base;
2196 
2197       res_digit = 0;
2198       next_bitpos = 0;
2199 
2200       for (s = str + str_len - 1; s >= str; s--)
2201 	{
2202 	  int inp_digit = *s;
2203 
2204 	  res_digit |= ((mp_limb_t) inp_digit << next_bitpos) & GMP_NUMB_MASK;
2205 	  next_bitpos += bits_per_indigit;
2206 	  if (next_bitpos >= GMP_NUMB_BITS)
2207 	    {
2208 	      rp[size++] = res_digit;
2209 	      next_bitpos -= GMP_NUMB_BITS;
2210 	      res_digit = inp_digit >> (bits_per_indigit - next_bitpos);
2211 	    }
2212 
2213 	  if (!((intptr_t)s & 0xFF)) SCHEME_BIGNUM_USE_FUEL(1);
2214 	}
2215 
2216       if (res_digit != 0)
2217 	rp[size++] = res_digit;
2218       return size;
2219     }
2220   else
2221     {
2222       /* General case.  The base is not a power of 2.  */
2223 
2224       if (str_len < SET_STR_THRESHOLD)
2225 	{
2226 	  size_t i;
2227 	  int j;
2228 	  mp_limb_t cy_limb;
2229 
2230 	  for (i = chars_per_limb; i < str_len; i += chars_per_limb)
2231 	    {
2232 	      res_digit = *str++;
2233 	      if (base == 10)
2234 		{ /* This is a common case.
2235 		     Help the compiler to avoid multiplication.  */
2236 		  for (j = MP_BASES_CHARS_PER_LIMB_10 - 1; j != 0; j--)
2237 		    res_digit = res_digit * 10 + *str++;
2238 		}
2239 	      else
2240 		{
2241 		  for (j = chars_per_limb - 1; j != 0; j--)
2242 		    res_digit = res_digit * base + *str++;
2243 		}
2244 
2245 	      if (size == 0)
2246 		{
2247 		  if (res_digit != 0)
2248 		    {
2249 		      rp[0] = res_digit;
2250 		      size = 1;
2251 		    }
2252 		}
2253 	      else
2254 		{
2255 #if HAVE_NATIVE_mpn_mul_1c
2256 		  cy_limb = mpn_mul_1c (rp, rp, size, big_base, res_digit);
2257 #else
2258 		  cy_limb = mpn_mul_1 (rp, rp, size, big_base);
2259 		  cy_limb += mpn_add_1 (rp, rp, size, res_digit);
2260 #endif
2261 		  if (cy_limb != 0)
2262 		    rp[size++] = cy_limb;
2263 		}
2264 	    }
2265 
2266 	  big_base = base;
2267 	  res_digit = *str++;
2268 	  if (base == 10)
2269 	    { /* This is a common case.
2270 		 Help the compiler to avoid multiplication.  */
2271 	      for (j = str_len - (i - MP_BASES_CHARS_PER_LIMB_10) - 1; j > 0; j--)
2272 		{
2273 		  res_digit = res_digit * 10 + *str++;
2274 		  big_base *= 10;
2275 		}
2276 	    }
2277 	  else
2278 	    {
2279 	      for (j = str_len - (i - chars_per_limb) - 1; j > 0; j--)
2280 		{
2281 		  res_digit = res_digit * base + *str++;
2282 		  big_base *= base;
2283 		}
2284 	    }
2285 
2286 	  if (size == 0)
2287 	    {
2288 	      if (res_digit != 0)
2289 		{
2290 		  rp[0] = res_digit;
2291 		  size = 1;
2292 		}
2293 	    }
2294 	  else
2295 	    {
2296 #if HAVE_NATIVE_mpn_mul_1c
2297 	      cy_limb = mpn_mul_1c (rp, rp, size, big_base, res_digit);
2298 #else
2299 	      cy_limb = mpn_mul_1 (rp, rp, size, big_base);
2300 	      cy_limb += mpn_add_1 (rp, rp, size, res_digit);
2301 #endif
2302 	      if (cy_limb != 0)
2303 		rp[size++] = cy_limb;
2304 	    }
2305 	  return size;
2306 	}
2307       else
2308 	{
2309 	  /* Sub-quadratic code.  */
2310 
2311 	  mp_ptr dp;
2312 	  mp_size_t dsize;
2313 	  mp_ptr xp, tp;
2314 	  mp_size_t step;
2315 	  mp_size_t i;
2316 	  size_t alloc;
2317 	  mp_size_t n;
2318 	  mp_ptr pow_mem;
2319 	  TMP_DECL (marker);
2320 	  TMP_MARK (marker);
2321 
2322 	  alloc = (str_len + chars_per_limb - 1) / chars_per_limb;
2323 	  alloc = 2 * alloc;
2324 	  dp = __GMP_ALLOCATE_FUNC_LIMBS (alloc);
2325 
2326 #if SET_STR_BLOCK_SIZE == 1
2327 	  dsize = convert_blocks (dp, str, str_len, base);
2328 #else
2329 	  {
2330 	    const unsigned char *s;
2331 	    mp_ptr ddp = dp;
2332 
2333 	    s = str + str_len;
2334 	    while (s - str >  SET_STR_BLOCK_SIZE * chars_per_limb)
2335 	      {
2336 		s -= SET_STR_BLOCK_SIZE * chars_per_limb;
2337 		mpn_set_str (ddp, s, SET_STR_BLOCK_SIZE * chars_per_limb, base);
2338 		ddp += SET_STR_BLOCK_SIZE;
2339 	      }
2340 	    ddp += mpn_set_str (ddp, str, s - str, base);
2341 	    dsize = ddp - dp;
2342 	  }
2343 #endif
2344 
2345 	  /* Allocate space for powers of big_base.  Could trim this in two
2346 	     ways:
2347 	     1. Only really need 2^ceil(log2(dsize)) bits for the largest
2348 		power.
2349 	     2. Only the variable to get the largest power need that much
2350 		memory.  The other variable needs half as much.  Need just
2351 		figure out which of xp and tp will hold the last one.
2352 	     Net space savings would be in the range 1/4 to 5/8 of current
2353 	     allocation, depending on how close to the next power of 2 that
2354 	     dsize is.  */
2355 	  pow_mem = __GMP_ALLOCATE_FUNC_LIMBS (2 * alloc);
2356 	  xp = pow_mem;
2357 	  tp = pow_mem + alloc;
2358 
2359 	  xp[0] = big_base;
2360 	  n = 1;
2361 	  step = 1;
2362 #if SET_STR_BLOCK_SIZE != 1
2363 	  for (i = SET_STR_BLOCK_SIZE; i > 1; i >>= 1)
2364 	    {
2365 	      mpn_sqr_n (tp, xp, n);
2366 	      n = 2 * n;
2367 	      n -= tp[n - 1] == 0;
2368 
2369 	      step = 2 * step;
2370 	      MP_PTR_SWAP (tp, xp);
2371 	    }
2372 #endif
2373 
2374 	  /* Multiply every second limb block, each `step' limbs large by the
2375 	     base power currently in xp[], then add this to the adjacent block.
2376 	     We thereby convert from dsize blocks in base big_base, to dsize/2
2377 	     blocks in base big_base^2, then to dsize/4 blocks in base
2378 	     big_base^4, etc, etc.  */
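	  /* Decimal analogy (illustrative): with two digits per block, the
	     blocks 12|34|56|78 combine pairwise into 1234|5678 (base
	     big_base^2) and then into 12345678 (base big_base^4); each pass
	     halves the number of blocks and squares the block base.  */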
2379 
2380 	  if (step < dsize)
2381 	    {
2382 	      for (;;)
2383 		{
2384 		  for (i = 0; i < dsize - step; i += 2 * step)
2385 		    {
2386 		      mp_ptr bp = dp + i;
2387 		      mp_size_t m = dsize - i - step;
2388 		      mp_size_t hi;
2389 		      if (n >= m)
2390 			{
2391 			  mpn_mul (tp, xp, n, bp + step, m);
2392 			  mpn_add (bp, tp, n + m, bp, n);
2393 			  hi = i + n + m;
2394 			  dsize = hi;
2395 			  dsize -= dp[dsize - 1] == 0;
2396 			}
2397 		      else
2398 			{
2399 			  mpn_mul_n (tp, xp, bp + step, n);
2400 			  mpn_add (bp, tp, n + n, bp, n);
2401 			}
2402 		    }
2403 
2404 		  step = 2 * step;
2405 		  if (! (step < dsize))
2406 		    break;
2407 
2408 		  mpn_sqr_n (tp, xp, n);
2409 		  n = 2 * n;
2410 		  n -= tp[n - 1] == 0;
2411 		  MP_PTR_SWAP (tp, xp);
2412 		}
2413 	    }
2414 
2415 	  MPN_NORMALIZE (dp, dsize);
2416 	  MPN_COPY (rp, dp, dsize);
2417 	  __GMP_FREE_FUNC_LIMBS (pow_mem, 2 * alloc);
2418 	  __GMP_FREE_FUNC_LIMBS (dp, alloc);
2419 	  TMP_FREE (marker);
2420 	  return dsize;
2421 	}
2422     }
2423 }
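/* A minimal usage sketch for mpn_set_str (disabled; illustrative only,
   with a hypothetical caller name).  The input is raw digit values,
   most significant first, not ASCII; signs and whitespace must have
   been stripped by the caller.  */
#if 0
static void example_mpn_set_str (void)
{
  unsigned char digits[5] = { 1, 2, 3, 4, 5 };	/* the number 12345 */
  mp_limb_t r[2];			/* ample room for 5 decimal digits */
  mp_size_t n = mpn_set_str (r, digits, 5, 10);
  /* now n == 1 and r[0] == 12345 */
}
#endif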
2424 
2425 static mp_size_t
2426 convert_blocks (mp_ptr dp, const unsigned char *str, size_t str_len, int base)
2427 {
2428   int chars_per_limb;
2429   mp_size_t i;
2430   int j;
2431   int ds;
2432   mp_size_t dsize;
2433   mp_limb_t res_digit;
2434 
2435   chars_per_limb = __mp_bases[base].chars_per_limb;
2436 
2437   dsize = str_len / chars_per_limb;
2438   ds = str_len % chars_per_limb;
2439 
2440   if (ds != 0)
2441     {
2442       res_digit = *str++;
2443       for (j = ds - 1; j != 0; j--)
2444 	res_digit = res_digit * base + *str++;
2445       dp[dsize] = res_digit;
2446     }
2447 
2448   if (base == 10)
2449     {
2450       for (i = dsize - 1; i >= 0; i--)
2451 	{
2452 	  res_digit = *str++;
2453 	  for (j = MP_BASES_CHARS_PER_LIMB_10 - 1; j != 0; j--)
2454 	    res_digit = res_digit * 10 + *str++;
2455 	  dp[i] = res_digit;
2456 	}
2457     }
2458   else
2459     {
2460       for (i = dsize - 1; i >= 0; i--)
2461 	{
2462 	  res_digit = *str++;
2463 	  for (j = chars_per_limb - 1; j != 0; j--)
2464 	    res_digit = res_digit * base + *str++;
2465 	  dp[i] = res_digit;
2466 	}
2467     }
2468 
2469   return dsize + (ds != 0);
2470 }
2471 
2472 
2473 /* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and
2474    write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp.  If
2475    qxn is non-zero, generate that many fraction limbs and append them after the
2476    other quotient limbs, and update the remainder accordingly.  The input
2477    operands are unaffected.
2478 
2479    Preconditions:
2480    1. The most significant limb of the divisor must be non-zero.
2481    2. No argument overlap is permitted.  (??? relax this ???)
2482    3. nn >= dn, even if qxn is non-zero.  (??? relax this ???)
2483 
2484    The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time
2485    complexity of multiplication. */
2486 
2487 #ifndef BZ_THRESHOLD
2488 #define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD)
2489 #endif
2490 
2491 /* Extract the middle limb from ((h,,l) << cnt) */
2492 #define SHL(h,l,cnt) \
2493   ((h << cnt) | ((l >> 1) >> ((~cnt) & (BITS_PER_MP_LIMB - 1))))
2494 
2495 void
2496 #if __STDC__
2497 mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
2498 	     mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
2499 #else
2500 mpn_tdiv_qr (qp, rp, qxn, np, nn, dp, dn)
2501      mp_ptr qp;
2502      mp_ptr rp;
2503      mp_size_t qxn;
2504      mp_srcptr np;
2505      mp_size_t nn;
2506      mp_srcptr dp;
2507      mp_size_t dn;
2508 #endif
2509 {
2510   /* FIXME:
2511      1. qxn
2512      2. pass allocated storage in additional parameter?
2513   */
2514 #ifdef DEBUG
2515   if (qxn != 0)
2516     abort ();
2517 #endif
2518 
2519   switch (dn)
2520     {
2521     case 0:
2522 #if 0
2523       DIVIDE_BY_ZERO;
2524 #endif
2525       return;
2526 
2527     case 1:
2528       {
2529 	rp[0] = mpn_divmod_1 (qp, np, nn, dp[0]);
2530 	return;
2531       }
2532 
2533     case 2:
2534       {
2535 	int cnt;
2536 	mp_ptr n2p, d2p;
2537 	mp_limb_t qhl, cy;
2538 	TMP_DECL (marker);
2539 	TMP_MARK (marker);
2540 	count_leading_zeros (cnt, dp[dn - 1]);
2541 	if (cnt != 0)
2542 	  {
2543 	    d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
2544 	    mpn_lshift (d2p, dp, dn, cnt);
2545 	    n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
2546 	    cy = mpn_lshift (n2p, np, nn, cnt);
2547 	    n2p[nn] = cy;
2548 	    qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p);
2549 	    if (cy == 0)
2550 	      qp[nn - 2] = qhl;	/* always store nn-dn+1 quotient limbs */
2551 	  }
2552 	else
2553 	  {
2554 	    d2p = (mp_ptr) dp;
2555 	    n2p = (mp_ptr) TMP_ALLOC (nn * BYTES_PER_MP_LIMB);
2556 	    MPN_COPY (n2p, np, nn);
2557 	    qhl = mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
2558 	    qp[nn - 2] = qhl;	/* always store nn-dn+1 quotient limbs */
2559 	  }
2560 
2561 	if (cnt != 0)
2562 	  mpn_rshift (rp, n2p, dn, cnt);
2563 	else
2564 	  MPN_COPY (rp, n2p, dn);
2565 	TMP_FREE (marker);
2566 	return;
2567       }
2568 
2569     default:
2570       {
2571 	int adjust;
2572 	TMP_DECL (marker);
2573 	TMP_MARK (marker);
2574 	adjust = np[nn - 1] >= dp[dn - 1];	/* conservative tests for quotient size */
2575 	if (nn + adjust >= 2 * dn)
2576 	  {
2577 	    mp_ptr n2p, d2p;
2578 	    mp_limb_t cy;
2579 	    int cnt;
2580 	    count_leading_zeros (cnt, dp[dn - 1]);
2581 
2582 	    qp[nn - dn] = 0;			/* zero high quotient limb */
2583 	    if (cnt != 0)			/* normalize divisor if needed */
2584 	      {
2585 		d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
2586 		mpn_lshift (d2p, dp, dn, cnt);
2587 		n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
2588 		cy = mpn_lshift (n2p, np, nn, cnt);
2589 		n2p[nn] = cy;
2590 		nn += adjust;
2591 	      }
2592 	    else
2593 	      {
2594 		d2p = (mp_ptr) dp;
2595 		n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
2596 		MPN_COPY (n2p, np, nn);
2597 		n2p[nn] = 0;
2598 		nn += adjust;
2599 	      }
2600 
2601 	    if (dn == 2)
2602 	      mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
2603 	    else if (dn < BZ_THRESHOLD)
2604 	      mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn);
2605 	    else
2606 	      {
2607 		/* Perform 2*dn / dn limb divisions as long as the limbs
2608 		   in np last.  */
2609 		mp_ptr q2p = qp + nn - 2 * dn;
2610 		n2p += nn - 2 * dn;
2611 		mpn_bz_divrem_n (q2p, n2p, d2p, dn);
2612 		nn -= dn;
2613 		while (nn >= 2 * dn)
2614 		  {
2615 		    mp_limb_t c;
2616 		    q2p -= dn;  n2p -= dn;
2617 		    c = mpn_bz_divrem_n (q2p, n2p, d2p, dn);
2618 		    ASSERT_ALWAYS (c == 0);
2619 		    nn -= dn;
2620 		  }
2621 
2622 		if (nn != dn)
2623 		  {
2624 		    n2p -= nn - dn;
2625 		    /* In theory, we could fall out to the cute code below
2626 		       since we now have exactly the situation that code
2627 		       is designed to handle.  We botch this badly and call
2628 		       the basic mpn_sb_divrem_mn!  */
2629 		    if (dn == 2)
2630 		      mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
2631 		    else
2632 		      mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn);
2633 		  }
2634 	      }
2635 
2636 
2637 	    if (cnt != 0)
2638 	      mpn_rshift (rp, n2p, dn, cnt);
2639 	    else
2640 	      MPN_COPY (rp, n2p, dn);
2641 	    TMP_FREE (marker);
2642 	    return;
2643 	  }
2644 
2645 	/* When we come here, the numerator/partial remainder is less
2646 	   than twice the size of the denominator.  */
2647 
2648 	  {
2649 	    /* Problem:
2650 
2651 	       Divide a numerator N with nn limbs by a denominator D with dn
2652 	       limbs forming a quotient of nn-dn+1 limbs.  When qn is small
2653 	       compared to dn, conventional division algorithms perform poorly.
2654 	       We want an algorithm that has an expected running time that is
2655 	       dependent only on qn.  It is assumed that the most significant
2656 	       limb of the numerator is smaller than the most significant limb
2657 	       of the denominator.
2658 
2659 	       Algorithm (very informally stated):
2660 
2661 	       1) Divide the 2 x qn most significant limbs from the numerator
2662 		  by the qn most significant limbs from the denominator.  Call
2663 		  the result qest.  This is either the correct quotient, or it
2664 		  is 1 or 2 too large.  Compute the remainder from the
2665 		  division.  (This step is implemented by an mpn_divrem call.)
2666 
2667 	       2) Is the most significant limb from the remainder < p, where p
2668 		  is the product of the most significant limb from the quotient
2669 		  and the next(d).  (Next(d) denotes the next ignored limb from
2670 		  the denominator.)  If it is, decrement qest, and adjust the
2671 		  remainder accordingly.
2672 
2673 	       3) Is the remainder >= qest?  If it is, qest is the desired
2674 		  quotient.  The algorithm terminates.
2675 
2676 	       4) Subtract qest x next(d) from the remainder.  If there is
2677 		  borrow out, decrement qest, and adjust the remainder
2678 		  accordingly.
2679 
2680 	       5) Skip one word from the denominator (i.e., let next(d) denote
2681 		  the next less significant limb).  */
2682 
2683 	    mp_size_t qn;
2684 	    mp_ptr n2p, d2p;
2685 	    mp_ptr tp;
2686 	    mp_limb_t cy;
2687 	    mp_size_t in, rn;
2688 	    mp_limb_t quotient_too_large;
2689 	    int cnt;
2690 
2691 	    qn = nn - dn;
2692 	    qp[qn] = 0;				/* zero high quotient limb */
2693 	    qn += adjust;			/* qn cannot become bigger */
2694 
2695 	    if (qn == 0)
2696 	      {
2697 		MPN_COPY (rp, np, dn);
2698 		TMP_FREE (marker);
2699 		return;
2700 	      }
2701 
2702 	    in = dn - qn;		/* (at least partially) ignored # of limbs in ops */
2703 	    /* Normalize denominator by shifting it to the left such that its
2704 	       most significant bit is set.  Then shift the numerator the same
2705 	       amount, to mathematically preserve quotient.  */
2706 	    count_leading_zeros (cnt, dp[dn - 1]);
2707 	    if (cnt != 0)
2708 	      {
2709 		d2p = (mp_ptr) TMP_ALLOC (qn * BYTES_PER_MP_LIMB);
2710 
2711 		mpn_lshift (d2p, dp + in, qn, cnt);
2712 		d2p[0] |= dp[in - 1] >> (BITS_PER_MP_LIMB - cnt);
2713 
2714 		n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
2715 		cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt);
2716 		if (adjust)
2717 		  {
2718 		    n2p[2 * qn] = cy;
2719 		    n2p++;
2720 		  }
2721 		else
2722 		  {
2723 		    n2p[0] |= np[nn - 2 * qn - 1] >> (BITS_PER_MP_LIMB - cnt);
2724 		  }
2725 	      }
2726 	    else
2727 	      {
2728 		d2p = (mp_ptr) dp + in;
2729 
2730 		n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
2731 		MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn);
2732 		if (adjust)
2733 		  {
2734 		    n2p[2 * qn] = 0;
2735 		    n2p++;
2736 		  }
2737 	      }
2738 
2739 	    /* Get an approximate quotient using the extracted operands.  */
2740 	    if (qn == 1)
2741 	      {
2742 		mp_limb_t q0, r0;
2743 		mp_limb_t gcc272bug_n1, gcc272bug_n0, gcc272bug_d0;
2744 		/* Due to a gcc 2.7.2.3 reload pass bug, we have to use some
2745 		   temps here.  This doesn't hurt code quality on any machines
2746 		   so we do it unconditionally.  */
2747 		gcc272bug_n1 = n2p[1];
2748 		gcc272bug_n0 = n2p[0];
2749 		gcc272bug_d0 = d2p[0];
2750 		udiv_qrnnd (q0, r0, gcc272bug_n1, gcc272bug_n0, gcc272bug_d0);
2751 		n2p[0] = r0;
2752 		qp[0] = q0;
2753 	      }
2754 	    else if (qn == 2)
2755 	      mpn_divrem_2 (qp, 0L, n2p, 4L, d2p);
2756 	    else if (qn < BZ_THRESHOLD)
2757 	      mpn_sb_divrem_mn (qp, n2p, qn * 2, d2p, qn);
2758 	    else
2759 	      mpn_bz_divrem_n (qp, n2p, d2p, qn);
2760 
2761 	    rn = qn;
2762 	    /* Multiply the first ignored divisor limb by the most significant
2763 	       quotient limb.  If that product is > the partial remainder's
2764 	       most significant limb, we know the quotient is too large.  This
2765 	       test quickly catches most cases where the quotient is too large;
2766 	       it catches all cases where the quotient is 2 too large.  */
2767 	    {
2768 	      mp_limb_t dl, x;
2769 	      mp_limb_t h;
2770 	      mp_limb_t l USED_ONLY_SOMETIMES;
2771 
2772 	      if (in - 2 < 0)
2773 		dl = 0;
2774 	      else
2775 		dl = dp[in - 2];
2776 
2777 	      x = SHL (dp[in - 1], dl, cnt);
2778 	      umul_ppmm (h, l, x, qp[qn - 1]);
2779 
2780 	      if (n2p[qn - 1] < h)
2781 		{
2782 		  mp_limb_t cy;
2783 
2784 		  mpn_decr_u (qp, (mp_limb_t) 1);
2785 		  cy = mpn_add_n (n2p, n2p, d2p, qn);
2786 		  if (cy)
2787 		    {
2788 		      /* The partial remainder is safely large.  */
2789 		      n2p[qn] = cy;
2790 		      ++rn;
2791 		    }
2792 		}
2793 	    }
2794 
2795 	    quotient_too_large = 0;
2796 	    if (cnt != 0)
2797 	      {
2798 		mp_limb_t cy1, cy2;
2799 
2800 		/* Append partially used numerator limb to partial remainder.  */
2801 		cy1 = mpn_lshift (n2p, n2p, rn, BITS_PER_MP_LIMB - cnt);
2802 		n2p[0] |= np[in - 1] & (~(mp_limb_t) 0 >> cnt);
2803 
2804 		/* Update partial remainder with partially used divisor limb.  */
2805 		cy2 = mpn_submul_1 (n2p, qp, qn, dp[in - 1] & (~(mp_limb_t) 0 >> cnt));
2806 		if (qn != rn)
2807 		  {
2808 #ifdef DEBUG
2809 		    if (n2p[qn] < cy2)
2810 		      abort ();
2811 #endif
2812 		    n2p[qn] -= cy2;
2813 		  }
2814 		else
2815 		  {
2816 		    n2p[qn] = cy1 - cy2;
2817 
2818 		    quotient_too_large = (cy1 < cy2);
2819 		    ++rn;
2820 		  }
2821 		--in;
2822 	      }
2823 	    /* True: partial remainder now is neutral, i.e., it is not shifted up.  */
2824 
2825 	    tp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
2826 
2827 	    if (in < qn)
2828 	      {
2829 		if (in == 0)
2830 		  {
2831 		    MPN_COPY (rp, n2p, rn);
2832 #ifdef DEBUG
2833 		    if (rn != dn)
2834 		      abort ();
2835 #endif
2836 		    goto foo;
2837 		  }
2838 		mpn_mul (tp, qp, qn, dp, in);
2839 	      }
2840 	    else
2841 	      mpn_mul (tp, dp, in, qp, qn);
2842 
2843 	    cy = mpn_sub (n2p, n2p, rn, tp + in, qn);
2844 	    MPN_COPY (rp + in, n2p, dn - in);
2845 	    quotient_too_large |= cy;
2846 	    cy = mpn_sub_n (rp, np, tp, in);
2847 	    cy = mpn_sub_1 (rp + in, rp + in, rn, cy);
2848 	    quotient_too_large |= cy;
2849 	  foo:
2850 	    if (quotient_too_large)
2851 	      {
2852 		mpn_decr_u (qp, (mp_limb_t) 1);
2853 		mpn_add_n (rp, rp, dp, dn);
2854 	      }
2855 	  }
2856 	TMP_FREE (marker);
2857 	return;
2858       }
2859     }
2860 }
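/* A minimal usage sketch for mpn_tdiv_qr (disabled; illustrative only,
   with a hypothetical caller name).  With nn = 4 and dn = 2 the
   quotient occupies nn-dn+1 = 3 limbs and the remainder dn = 2 limbs;
   qxn must be 0 (see the FIXME above).  */
#if 0
static void example_mpn_tdiv_qr (void)
{
  mp_limb_t np[4] = { 1, 2, 3, 4 };	/* least significant limb first */
  mp_limb_t dp[2] = { 5, 6 };		/* dp[1] != 0, as required */
  mp_limb_t qp[3], rp[2];
  mpn_tdiv_qr (qp, rp, 0L, np, 4, dp, 2);
  /* {np,4} == {qp,3} * {dp,2} + {rp,2}, with {rp,2} < {dp,2} */
}
#endif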
2861 
2862 
2863 /* Square roots table.  Generated by the following program:
2864 #include "gmp.h"
2865 main(){mpz_t x;int i;mpz_init(x);for(i=64;i<256;i++){mpz_set_ui(x,256*i);
2866 mpz_sqrt(x,x);mpz_out_str(0,10,x);printf(",");if(i%16==15)printf("\n");}}
2867 */
2868 static const unsigned char approx_tab[192] =
2869   {
2870     128,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,
2871     143,144,144,145,146,147,148,149,150,150,151,152,153,154,155,155,
2872     156,157,158,159,160,160,161,162,163,163,164,165,166,167,167,168,
2873     169,170,170,171,172,173,173,174,175,176,176,177,178,178,179,180,
2874     181,181,182,183,183,184,185,185,186,187,187,188,189,189,190,191,
2875     192,192,193,193,194,195,195,196,197,197,198,199,199,200,201,201,
2876     202,203,203,204,204,205,206,206,207,208,208,209,209,210,211,211,
2877     212,212,213,214,214,215,215,216,217,217,218,218,219,219,220,221,
2878     221,222,222,223,224,224,225,225,226,226,227,227,228,229,229,230,
2879     230,231,231,232,232,233,234,234,235,235,236,236,237,237,238,238,
2880     239,240,240,241,241,242,242,243,243,244,244,245,245,246,246,247,
2881     247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255
2882   };
2883 
2884 #define HALF_NAIL (GMP_NAIL_BITS / 2)
2885 
2886 /* same as mpn_sqrtrem, but for size=1 and {np, 1} normalized */
2887 static mp_size_t
2888 mpn_sqrtrem1 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
2889 {
2890   mp_limb_t np0, s, r, q, u;
2891   int prec;
2892 
2893   ASSERT (np[0] >= GMP_NUMB_HIGHBIT / 2);
2894   ASSERT (GMP_LIMB_BITS >= 16);
2895 
2896   /* first compute an 8-bit approximation from the high 8 bits of np[0] */
2897   np0 = np[0] << GMP_NAIL_BITS;
2898   q = np0 >> (GMP_LIMB_BITS - 8);
2899   /* 2^6 = 64 <= q < 256 = 2^8 */
2900   s = approx_tab[q - 64];				/* 128 <= s < 255 */
2901   r = (np0 >> (GMP_LIMB_BITS - 16)) - s * s;		/* r <= 2*s */
2902   if (r > 2 * s)
2903     {
2904       r -= 2 * s + 1;
2905       s++;
2906     }
2907 
2908   prec = 8;
2909   np0 <<= 2 * prec;
2910   while (2 * prec < GMP_LIMB_BITS)
2911     {
2912       /* invariant: s has prec bits, and r <= 2*s */
2913       r = (r << prec) + (np0 >> (GMP_LIMB_BITS - prec));
2914       np0 <<= prec;
2915       u = 2 * s;
2916       q = r / u;
2917       u = r - q * u;
2918       s = (s << prec) + q;
2919       u = (u << prec) + (np0 >> (GMP_LIMB_BITS - prec));
2920       q = q * q;
2921       r = u - q;
2922       if (u < q)
2923 	{
2924 	  r += 2 * s - 1;
2925 	  s --;
2926 	}
2927       np0 <<= prec;
2928       prec = 2 * prec;
2929     }
2930 
2931   ASSERT (2 * prec == GMP_LIMB_BITS); /* GMP_LIMB_BITS must be a power of 2 */
2932 
2933   /* normalize back, assuming GMP_NAIL_BITS is even */
2934   ASSERT (GMP_NAIL_BITS % 2 == 0);
2935   sp[0] = s >> HALF_NAIL;
2936   u = s - (sp[0] << HALF_NAIL); /* s mod 2^HALF_NAIL */
2937   r += u * ((sp[0] << (HALF_NAIL + 1)) + u);
2938   r = r >> GMP_NAIL_BITS;
2939 
2940   if (rp != NULL)
2941     rp[0] = r;
2942   return r != 0 ? 1 : 0;
2943 }
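/* Worked examples (assuming 32-bit limbs and no nail bits): for
   np[0] = 0x40000000 = 2^30 the root is sp[0] = 2^15 = 32768 with
   remainder 0, so the return value is 0; for np[0] = 0x7fffffff the
   root is sp[0] = 46340 with rp[0] = 0x7fffffff - 46340^2 = 88047,
   and the return value is 1.  */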
2944 
2945 
2946 #define Prec (GMP_NUMB_BITS >> 1)
2947 
2948 /* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
2949    return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
2950 static mp_limb_t
2951 mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
2952 {
2953   mp_limb_t qhl, q, u, np0;
2954   int cc;
2955 
2956   ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);
2957 
2958   np0 = np[0];
2959   mpn_sqrtrem1 (sp, rp, np + 1);
2960   qhl = 0;
2961   while (rp[0] >= sp[0])
2962     {
2963       qhl++;
2964       rp[0] -= sp[0];
2965     }
2966   /* now rp[0] < sp[0] < 2^Prec */
2967   rp[0] = (rp[0] << Prec) + (np0 >> Prec);
2968   u = 2 * sp[0];
2969   q = rp[0] / u;
2970   u = rp[0] - q * u;
2971   q += (qhl & 1) << (Prec - 1);
2972   qhl >>= 1; /* if qhl=1, necessarily q=0 as qhl*2^Prec + q <= 2^Prec */
2973   /* now we have (initial rp[0])<<Prec + np0>>Prec = (qhl<<Prec + q) * (2sp[0]) + u */
2974   sp[0] = ((sp[0] + qhl) << Prec) + q;
2975   cc = u >> Prec;
2976   rp[0] = ((u << Prec) & GMP_NUMB_MASK) + (np0 & (((mp_limb_t) 1 << Prec) - 1));
2977   /* subtract q * q or qhl*2^(2*Prec) from rp */
2978   cc -= mpn_sub_1 (rp, rp, 1, q * q) + qhl;
2979   /* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */
2980   if (cc < 0)
2981     {
2982       cc += sp[0] != 0 ? mpn_add_1 (rp, rp, 1, sp[0]) : 1;
2983       cc += mpn_add_1 (rp, rp, 1, --sp[0]);
2984     }
2985 
2986   return cc;
2987 }
2988 
2989 /* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
2990    and in {np, n} the low n limbs of the remainder, returns the high
2991    limb of the remainder (which is 0 or 1).
2992    Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
2993    where B=2^GMP_NUMB_BITS.  */
2994 static mp_limb_t
2995 mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n)
2996 {
2997   mp_limb_t q;			/* carry out of {sp, n} */
2998   int c, b;			/* carry out of remainder */
2999   mp_size_t l, h;
3000 
3001   ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);
3002 
3003   if (n == 1)
3004     c = mpn_sqrtrem2 (sp, np, np);
3005   else
3006     {
3007       l = n / 2;
3008       h = n - l;
3009       q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h);
3010       if (q != 0)
3011         mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h);
3012       q += mpn_divrem (sp, 0, np + l, n, sp + l, h);
3013       c = sp[0] & 1;
3014       mpn_rshift (sp, sp, l, 1);
3015       sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
3016       q >>= 1;
3017       if (c != 0)
3018         c = mpn_add_n (np + l, np + l, sp + l, h);
3019       mpn_sqr_n (np + n, sp, l);
3020       b = q + mpn_sub_n (np, np, np + n, 2 * l);
3021       c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, b);
3022       q = mpn_add_1 (sp + l, sp + l, h, q);
3023 
3024       if (c < 0)
3025         {
3026           c += mpn_addmul_1 (np, sp, n, 2) + 2 * q;
3027           c -= mpn_sub_1 (np, np, n, 1);
3028           q -= mpn_sub_1 (sp, sp, n, 1);
3029         }
3030     }
3031 
3032   return c;
3033 }
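/* For reference, the recursion above is Zimmermann's "Karatsuba square
   root": with B = 2^(l*GMP_NUMB_BITS), write the input as
   a3*B^3 + a2*B^2 + a1*B + a0.  The recursive call yields
   (s',r') = sqrtrem(a3*B + a2); the mpn_divrem plus right shift compute
   (q,u) = divrem(r'*B + a1, 2*s'), dividing by s' and then halving; the
   root is s = s'*B + q and the remainder is u*B + a0 - q^2, fixed up by
   the c < 0 branch when that quantity goes negative.  */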
3034 
3035 
3036 mp_size_t
3037 mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
3038 {
3039   mp_limb_t *tp, s0[1], cc, high, rl;
3040   int c;
3041   mp_size_t rn, tn;
3042   TMP_DECL (marker);
3043 
3044   ASSERT (nn >= 0);
3045 
3046   /* If OP is zero, both results are zero.  */
3047   if (nn == 0)
3048     return 0;
3049 
3050   ASSERT (np[nn - 1] != 0);
3051   ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
3052   ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
3053   ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
3054 
3055   high = np[nn - 1];
3056   if (nn == 1 && (high & GMP_NUMB_HIGHBIT))
3057     return mpn_sqrtrem1 (sp, rp, np);
3058   count_leading_zeros (c, high);
3059   c -= GMP_NAIL_BITS;
3060 
3061   c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
3062   tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
3063 
3064   TMP_MARK (marker);
3065   if (nn % 2 != 0 || c > 0)
3066     {
3067       tp = TMP_ALLOC_LIMBS (2 * tn);
3068       tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
3069       if (c != 0)
3070 	mpn_lshift (tp + 2 * tn - nn, np, nn, 2 * c);
3071       else
3072 	MPN_COPY (tp + 2 * tn - nn, np, nn);
3073       rl = mpn_dc_sqrtrem (sp, tp, tn);
3074       /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
3075 	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
3076       c += (nn % 2) * GMP_NUMB_BITS / 2;		/* c now represents k */
3077       s0[0] = sp[0] & (((mp_limb_t) 1 << c) - 1);	/* S mod 2^k */
3078       rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
3079       cc = mpn_submul_1 (tp, s0, 1, s0[0]);
3080       rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
3081       mpn_rshift (sp, sp, tn, c);
3082       tp[tn] = rl;
3083       if (rp == NULL)
3084 	rp = tp;
3085       c = c << 1;
3086       if (c < GMP_NUMB_BITS)
3087 	tn++;
3088       else
3089 	{
3090 	  tp++;
3091 	  c -= GMP_NUMB_BITS;
3092 	}
3093       if (c != 0)
3094 	mpn_rshift (rp, tp, tn, c);
3095       else
3096 	MPN_COPY_INCR (rp, tp, tn);
3097       rn = tn;
3098     }
3099   else
3100     {
3101       if (rp == NULL)
3102 	rp = TMP_ALLOC_LIMBS (nn);
3103       if (rp != np)
3104 	MPN_COPY (rp, np, nn);
3105       rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn));
3106     }
3107 
3108   MPN_NORMALIZE (rp, rn);
3109 
3110   TMP_FREE (marker);
3111   return rn;
3112 }
3113 
3114 /* mpn_bz_divrem_n and auxiliary routines. */
3115 
3116 /*
3117 [1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler,
3118     Technical report MPI-I-98-1-022, October 1998.
3119     http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz
3120 */
3121 
3122 static mp_limb_t mpn_bz_div_3_halves_by_2
3123   _PROTO ((mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n));
3124 
3125 
3126 /* mpn_bz_divrem_n(n) calls 2*mul(n/2)+2*div(n/2), thus to be faster than
3127    div(n) = 4*div(n/2), we need mul(n/2) to be faster than the classic way,
3128    i.e. n/2 >= KARATSUBA_MUL_THRESHOLD */
3129 #ifndef BZ_THRESHOLD
3130 #define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD)
3131 #endif
3132 
3133 /* mpn_bz_divrem_n - Implements algorithm of page 8 in [1]: divides (np,2n)
3134    by (dp,n) and puts the quotient in (qp,n), the remainder in (np,n).
3135    Returns most significant limb of the quotient, which is 0 or 1.
3136    Requires that the most significant bit of the divisor is set.  */
3137 
3138 mp_limb_t
3139 #if __STDC__
3140 mpn_bz_divrem_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
3141 #else
3142 mpn_bz_divrem_n (qp, np, dp, n)
3143      mp_ptr qp;
3144      mp_ptr np;
3145      mp_srcptr dp;
3146      mp_size_t n;
3147 #endif
3148 {
3149   mp_limb_t qhl, cc;
3150 
3151   if (n % 2 != 0)
3152     {
3153       qhl = mpn_bz_divrem_n (qp + 1, np + 2, dp + 1, n - 1);
3154       cc = mpn_submul_1 (np + 1, qp + 1, n - 1, dp[0]);
3155       cc = mpn_sub_1 (np + n, np + n, 1, cc);
3156       if (qhl) cc += mpn_sub_1 (np + n, np + n, 1, dp[0]);
3157       while (cc)
3158         {
3159           qhl -= mpn_sub_1 (qp + 1, qp + 1, n - 1, (mp_limb_t) 1);
3160           cc -= mpn_add_n (np + 1, np + 1, dp, n);
3161         }
3162       qhl += mpn_add_1 (qp + 1, qp + 1, n - 1,
3163                         mpn_sb_divrem_mn (qp, np, n + 1, dp, n));
3164     }
3165   else
3166     {
3167       mp_size_t n2 = n/2;
3168       qhl = mpn_bz_div_3_halves_by_2 (qp + n2, np + n2, dp, n2);
3169       qhl += mpn_add_1 (qp + n2, qp + n2, n2,
3170                         mpn_bz_div_3_halves_by_2 (qp, np, dp, n2));
3171     }
3172   return qhl;
3173 }
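/* Note that the recursion bottoms out in mpn_sb_divrem_mn: the odd-n
   branch above always finishes with one schoolbook step, and
   mpn_bz_div_3_halves_by_2 switches to the schoolbook routine as soon
   as the half size drops below BZ_THRESHOLD.  */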
3174 
3175 
3176 /* divides (np, 3n) by (dp, 2n) and puts the quotient in (qp, n),
3177    the remainder in (np, 2n) */
3178 
3179 static mp_limb_t
3180 #if __STDC__
3181 mpn_bz_div_3_halves_by_2 (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
3182 #else
3183 mpn_bz_div_3_halves_by_2 (qp, np, dp, n)
3184      mp_ptr qp;
3185      mp_ptr np;
3186      mp_srcptr dp;
3187      mp_size_t n;
3188 #endif
3189 {
3190   mp_size_t twon = n + n;
3191   mp_limb_t qhl, cc;
3192   mp_ptr tmp;
3193   TMP_DECL (marker);
3194 
3195   TMP_MARK (marker);
3196   if (n < BZ_THRESHOLD)
3197     qhl = mpn_sb_divrem_mn (qp, np + n, twon, dp + n, n);
3198   else
3199     qhl = mpn_bz_divrem_n (qp, np + n, dp + n, n);
3200   tmp = (mp_ptr) TMP_ALLOC (twon * BYTES_PER_MP_LIMB);
3201   mpn_mul_n (tmp, qp, dp, n);
3202   cc = mpn_sub_n (np, np, tmp, twon);
3203   TMP_FREE (marker);
3204   if (qhl) cc += mpn_sub_n (np + n, np + n, dp, n);
3205   while (cc)
3206     {
3207       qhl -= mpn_sub_1 (qp, qp, n, (mp_limb_t) 1);
3208       cc -= mpn_add_n (np, np, dp, twon);
3209     }
3210   return qhl;
3211 }
3212 
3213 /* mpn_sb_divrem_mn -- Divide natural numbers, producing both remainder and
3214    quotient. */
3215 
3216 /* Divide num (NP/NSIZE) by den (DP/DSIZE) and write
3217    the NSIZE-DSIZE least significant quotient limbs at QP
3218    and the DSIZE long remainder at NP.  If QEXTRA_LIMBS is
3219    non-zero, generate that many fraction bits and append them after the
3220    other quotient limbs.
3221    Return the most significant limb of the quotient, this is always 0 or 1.
3222 
3223    Preconditions:
3224    0. NSIZE >= DSIZE.
3225    1. The most significant bit of the divisor must be set.
3226    2. QP must either not overlap with the input operands at all, or
3227       QP + DSIZE >= NP must hold true.  (This means that it's
3228       possible to put the quotient in the high part of NUM, right after the
3229       remainder in NUM.)
3230    3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero.
3231    4. DSIZE > 2.  (DSIZE == 2 is handled by mpn_divrem_2; see the
3232       ASSERT_ALWAYS below.)  */
3232 
3233 
3234 #define PREINVERT_VIABLE \
3235   (UDIV_TIME > 2 * UMUL_TIME + 6 /* && ! TARGET_REGISTER_STARVED */)
3236 
3237 mp_limb_t
3238 #if __STDC__
3239 mpn_sb_divrem_mn (mp_ptr qp,
3240 	       mp_ptr np, mp_size_t nsize,
3241 	       mp_srcptr dp, mp_size_t dsize)
3242 #else
3243 mpn_sb_divrem_mn (qp, np, nsize, dp, dsize)
3244      mp_ptr qp;
3245      mp_ptr np;
3246      mp_size_t nsize;
3247      mp_srcptr dp;
3248      mp_size_t dsize;
3249 #endif
3250 {
3251   mp_limb_t most_significant_q_limb = 0;
3252   mp_size_t i;
3253   mp_limb_t dx, d1, n0;
3254   mp_limb_t dxinv = 0;
3255   int have_preinv;
3256 
3257   ASSERT_ALWAYS (dsize > 2);
3258 
3259   np += nsize - dsize;
3260   dx = dp[dsize - 1];
3261   d1 = dp[dsize - 2];
3262   n0 = np[dsize - 1];
3263 
3264   if (n0 >= dx)
3265     {
3266       if (n0 > dx || mpn_cmp (np, dp, dsize - 1) >= 0)
3267 	{
3268 	  mpn_sub_n (np, np, dp, dsize);
3269 	  most_significant_q_limb = 1;
3270 	}
3271     }
3272 
3273   /* If multiplication is much faster than division, preinvert the
3274      most significant divisor limb before entering the loop.  */
3275   if (PREINVERT_VIABLE)
3276     {
3277       have_preinv = 0;
3278       if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - dsize) > UDIV_TIME)
3279 	{
3280 	  invert_limb (dxinv, dx);
3281 	  have_preinv = 1;
3282 	}
3283     }
3284 
3285   for (i = nsize - dsize - 1; i >= 0; i--)
3286     {
3287       mp_limb_t q;
3288       mp_limb_t nx;
3289       mp_limb_t cy_limb;
3290 
3291       nx = np[dsize - 1];
3292       np--;
3293 
3294       if (nx == dx)
3295 	{
3296 	  /* This might over-estimate q, but it's probably not worth
3297 	     the extra code here to find out.  */
3298 	  q = ~(mp_limb_t) 0;
3299 
3300 #if 1
3301 	  cy_limb = mpn_submul_1 (np, dp, dsize, q);
3302 #else
3303 	  /* This should be faster on many machines */
3304 	  cy_limb = mpn_sub_n (np + 1, np + 1, dp, dsize);
3305 	  cy = mpn_add_n (np, np, dp, dsize);
3306 	  np[dsize] += cy;
3307 #endif
3308 
3309 	  if (nx != cy_limb)
3310 	    {
3311 	      mpn_add_n (np, np, dp, dsize);
3312 	      q--;
3313 	    }
3314 
3315 	  qp[i] = q;
3316 	}
3317       else
3318 	{
3319 	  mp_limb_t rx, r1, r0, p1, p0;
3320 
3321           /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register
3322              usage when np[dsize-1] is used in an asm statement like
3323              umul_ppmm in udiv_qrnnd_preinv.  The symptom is seg faults due
3324              to registers being clobbered.  gcc 2.95 i386 doesn't have the
3325              problem. */
3326           {
3327             mp_limb_t  workaround = np[dsize - 1];
3328             if (PREINVERT_VIABLE && have_preinv)
3329               udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv);
3330             else
3331               udiv_qrnnd (q, r1, nx, workaround, dx);
3332           }
3333 	  umul_ppmm (p1, p0, d1, q);
3334 
3335 	  r0 = np[dsize - 2];
3336 	  rx = 0;
3337 	  if (r1 < p1 || (r1 == p1 && r0 < p0))
3338 	    {
3339 	      p1 -= p0 < d1;
3340 	      p0 -= d1;
3341 	      q--;
3342 	      r1 += dx;
3343 	      rx = r1 < dx;
3344 	    }
3345 
3346 	  p1 += r0 < p0;	/* cannot carry! */
3347 	  rx -= r1 < p1;	/* may become 11..1 if q is still too large */
3348 	  r1 -= p1;
3349 	  r0 -= p0;
3350 
3351 	  cy_limb = mpn_submul_1 (np, dp, dsize - 2, q);
3352 
3353 	  {
3354 	    mp_limb_t cy1, cy2;
3355 	    cy1 = r0 < cy_limb;
3356 	    r0 -= cy_limb;
3357 	    cy2 = r1 < cy1;
3358 	    r1 -= cy1;
3359 	    np[dsize - 1] = r1;
3360 	    np[dsize - 2] = r0;
3361 	    if (cy2 != rx)
3362 	      {
3363 		mpn_add_n (np, np, dp, dsize);
3364 		q--;
3365 	      }
3366 	  }
3367 	  qp[i] = q;
3368 	}
3369     }
3370 
3371   /* ______ ______ ______
3372     |__rx__|__r1__|__r0__|		partial remainder
3373 	    ______ ______
3374 	 - |__p1__|__p0__|		partial product to subtract
3375 	    ______ ______
3376 	 - |______|cylimb|
3377 
3378      rx is -1, 0 or 1.  If rx=1, then q is correct (it should match
3379      carry out).  If rx=-1 then q is too large.  If rx=0, then q might
3380      be too large, but it is most likely correct.
3381   */
3382 
3383   return most_significant_q_limb;
3384 }
3385 
3386 /* mpn_divrem_1(quot_ptr, qsize, dividend_ptr, dividend_size, divisor_limb) --
3387    Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
3388    Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
3389    Return the single-limb remainder.
3390    There are no constraints on the value of the divisor.
3391 
3392    QUOT_PTR and DIVIDEND_PTR might point to the same limb.
3393  */
3394 
3395 /* __gmpn_divmod_1_internal(quot_ptr,dividend_ptr,dividend_size,divisor_limb)
3396    Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
3397    Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
3398    Return the single-limb remainder.
3399    There are no constraints on the value of the divisor.
3400 
3401    QUOT_PTR and DIVIDEND_PTR might point to the same limb. */
3402 
3403 #ifndef UMUL_TIME
3404 #define UMUL_TIME 1
3405 #endif
3406 
3407 #ifndef UDIV_TIME
3408 #define UDIV_TIME UMUL_TIME
3409 #endif
3410 
3411 static mp_limb_t
3412 #if __STDC__
3413 __gmpn_divmod_1_internal (mp_ptr quot_ptr,
3414 	      mp_srcptr dividend_ptr, mp_size_t dividend_size,
3415 	      mp_limb_t divisor_limb)
3416 #else
3417 __gmpn_divmod_1_internal (quot_ptr, dividend_ptr, dividend_size, divisor_limb)
3418      mp_ptr quot_ptr;
3419      mp_srcptr dividend_ptr;
3420      mp_size_t dividend_size;
3421      mp_limb_t divisor_limb;
3422 #endif
3423 {
3424   mp_size_t i;
3425   mp_limb_t n1, n0, r;
3426 
3427   /* ??? Should this be handled at all?  Rely on callers?  */
3428   if (dividend_size == 0)
3429     return 0;
3430 
3431   /* If multiplication is much faster than division, and the
3432      dividend is large, pre-invert the divisor, and use
3433      only multiplications in the inner loop.  */
3434 
3435   /* This test should be read:
3436        Does it ever help to use udiv_qrnnd_preinv?
3437 	 && Does what we save compensate for the inversion overhead?  */
3438   if (UDIV_TIME > (2 * UMUL_TIME + 6)
3439       && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME)
3440     {
3441       int normalization_steps;
3442 
3443       count_leading_zeros (normalization_steps, divisor_limb);
3444       if (normalization_steps != 0)
3445 	{
3446 	  mp_limb_t divisor_limb_inverted;
3447 
3448 	  divisor_limb <<= normalization_steps;
3449 	  invert_limb (divisor_limb_inverted, divisor_limb);
3450 
3451 	  n1 = dividend_ptr[dividend_size - 1];
3452 	  r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
3453 
3454 	  /* Possible optimization:
3455 	     if (r == 0
3456 	     && divisor_limb > ((n1 << normalization_steps)
3457 			     | (dividend_ptr[dividend_size - 2] >> ...)))
3458 	     ...one division less... */
3459 
3460 	  for (i = dividend_size - 2; i >= 0; i--)
3461 	    {
3462 	      n0 = dividend_ptr[i];
3463 	      udiv_qrnnd_preinv (quot_ptr[i + 1], r, r,
3464 				 ((n1 << normalization_steps)
3465 				  | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
3466 				 divisor_limb, divisor_limb_inverted);
3467 	      n1 = n0;
3468 	    }
3469 	  udiv_qrnnd_preinv (quot_ptr[0], r, r,
3470 			     n1 << normalization_steps,
3471 			     divisor_limb, divisor_limb_inverted);
3472 	  return r >> normalization_steps;
3473 	}
3474       else
3475 	{
3476 	  mp_limb_t divisor_limb_inverted;
3477 
3478 	  invert_limb (divisor_limb_inverted, divisor_limb);
3479 
3480 	  i = dividend_size - 1;
3481 	  r = dividend_ptr[i];
3482 
3483 	  if (r >= divisor_limb)
3484 	    r = 0;
3485 	  else
3486 	    {
3487 	      quot_ptr[i] = 0;
3488 	      i--;
3489 	    }
3490 
3491 	  for (; i >= 0; i--)
3492 	    {
3493 	      n0 = dividend_ptr[i];
3494 	      udiv_qrnnd_preinv (quot_ptr[i], r, r,
3495 				 n0, divisor_limb, divisor_limb_inverted);
3496 	    }
3497 	  return r;
3498 	}
3499     }
3500   else
3501     {
3502       if (UDIV_NEEDS_NORMALIZATION)
3503 	{
3504 	  int normalization_steps;
3505 
3506 	  count_leading_zeros (normalization_steps, divisor_limb);
3507 	  if (normalization_steps != 0)
3508 	    {
3509 	      divisor_limb <<= normalization_steps;
3510 
3511 	      n1 = dividend_ptr[dividend_size - 1];
3512 	      r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
3513 
3514 	      /* Possible optimization:
3515 		 if (r == 0
3516 		 && divisor_limb > ((n1 << normalization_steps)
3517 				 | (dividend_ptr[dividend_size - 2] >> ...)))
3518 		 ...one division less... */
3519 
3520 	      for (i = dividend_size - 2; i >= 0; i--)
3521 		{
3522 		  n0 = dividend_ptr[i];
3523 		  udiv_qrnnd (quot_ptr[i + 1], r, r,
3524 			      ((n1 << normalization_steps)
3525 			       | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
3526 			      divisor_limb);
3527 		  n1 = n0;
3528 		}
3529 	      udiv_qrnnd (quot_ptr[0], r, r,
3530 			  n1 << normalization_steps,
3531 			  divisor_limb);
3532 	      return r >> normalization_steps;
3533 	    }
3534 	}
3535       /* No normalization needed, either because udiv_qrnnd doesn't require
3536 	 it, or because DIVISOR_LIMB is already normalized.  */
3537 
3538       i = dividend_size - 1;
3539       r = dividend_ptr[i];
3540 
3541       if (r >= divisor_limb)
3542 	r = 0;
3543       else
3544 	{
3545 	  quot_ptr[i] = 0;
3546 	  i--;
3547 	}
3548 
3549       for (; i >= 0; i--)
3550 	{
3551 	  n0 = dividend_ptr[i];
3552 	  udiv_qrnnd (quot_ptr[i], r, r, n0, divisor_limb);
3553 	}
3554       return r;
3555     }
3556 }
3557 
3558 
3559 
3560 mp_limb_t
3561 #if __STDC__
3562 mpn_divrem_1 (mp_ptr qp, mp_size_t qxn,
3563 	      mp_srcptr np, mp_size_t nn,
3564 	      mp_limb_t d)
3565 #else
3566 mpn_divrem_1 (qp, qxn, np, nn, d)
3567      mp_ptr qp;
3568      mp_size_t qxn;
3569      mp_srcptr np;
3570      mp_size_t nn;
3571      mp_limb_t d;
3572 #endif
3573 {
3574   mp_limb_t rlimb;
3575   mp_size_t i;
3576 
3577   /* Develop integer part of quotient.  */
3578   rlimb = __gmpn_divmod_1_internal (qp + qxn, np, nn, d);
3579 
3580   /* Develop fraction part of quotient.  This is not as fast as it should
3581      be; the preinvert stuff from __gmpn_divmod_1_internal ought to be used
3582      here too.  */
3583   if (UDIV_NEEDS_NORMALIZATION)
3584     {
3585       int normalization_steps;
3586 
3587       count_leading_zeros (normalization_steps, d);
3588       if (normalization_steps != 0)
3589 	{
3590 	  d <<= normalization_steps;
3591 	  rlimb <<= normalization_steps;
3592 
3593 	  for (i = qxn - 1; i >= 0; i--)
3594 	    udiv_qrnnd (qp[i], rlimb, rlimb, 0, d);
3595 
3596 	  return rlimb >> normalization_steps;
3597 	}
3598       else
3599 	/* fall out */
3600 	;
3601     }
3602 
3603   for (i = qxn - 1; i >= 0; i--)
3604     udiv_qrnnd (qp[i], rlimb, rlimb, 0, d);
3605 
3606   return rlimb;
3607 }
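/* A minimal usage sketch for mpn_divrem_1 (disabled; illustrative only,
   with a hypothetical caller name): divide 100 by 7 and develop one
   fraction limb, so the two quotient limbs together represent
   floor(100*B/7), where B = 2^BITS_PER_MP_LIMB.  */
#if 0
static void example_mpn_divrem_1 (void)
{
  mp_limb_t np[1] = { 100 };
  mp_limb_t qp[2];		/* nn + qxn = 2 quotient limbs */
  mp_limb_t r = mpn_divrem_1 (qp, 1, np, 1, 7);
  /* qp[1] == 14 (integer part), qp[0] == floor(2*B/7) (fraction limb),
     and 100*B == (14*B + qp[0]) * 7 + r */
}
#endif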
3608 
3609 /* mpn_divrem_2 -- Divide natural numbers, producing both remainder and
3610    quotient.  The divisor is two limbs. */
3611 
3612 /* Divide num (NP/NSIZE) by den (DP/2) and write
3613    the NSIZE-2 least significant quotient limbs at QP
3614    and the 2 long remainder at NP.  If QEXTRA_LIMBS is
3615    non-zero, generate that many fraction bits and append them after the
3616    other quotient limbs.
3617    Return the most significant limb of the quotient, this is always 0 or 1.
3618 
3619    Preconditions:
3620    0. NSIZE >= 2.
3621    1. The most significant bit of the divisor must be set.
3622    2. QP must either not overlap with the input operands at all, or
3623       QP + 2 >= NP must hold true.  (This means that it's
3624       possible to put the quotient in the high part of NUM, right after the
3625       remainder in NUM.)
3626    3. NSIZE >= 2, even if QEXTRA_LIMBS is non-zero.  */
3627 
3628 mp_limb_t
3629 #if __STDC__
3630 mpn_divrem_2 (mp_ptr qp, mp_size_t qxn,
3631 	      mp_ptr np, mp_size_t nsize,
3632 	      mp_srcptr dp)
3633 #else
3634 mpn_divrem_2 (qp, qxn, np, nsize, dp)
3635      mp_ptr qp;
3636      mp_size_t qxn;
3637      mp_ptr np;
3638      mp_size_t nsize;
3639      mp_srcptr dp;
3640 #endif
3641 {
3642   mp_limb_t most_significant_q_limb = 0;
3643   mp_size_t i;
3644   mp_limb_t n1, n0, n2;
3645   mp_limb_t d1, d0;
3646   mp_limb_t d1inv = 0;
3647   int have_preinv;
3648 
3649   np += nsize - 2;
3650   d1 = dp[1];
3651   d0 = dp[0];
3652   n1 = np[1];
3653   n0 = np[0];
3654 
3655   if (n1 >= d1 && (n1 > d1 || n0 >= d0))
3656     {
3657       sub_ddmmss (n1, n0, n1, n0, d1, d0);
3658       most_significant_q_limb = 1;
3659     }
3660 
3661   /* If multiplication is much faster than division, preinvert the most
3662      significant divisor limb before entering the loop.  */
3663   if (UDIV_TIME > 2 * UMUL_TIME + 6)
3664     {
3665       have_preinv = 0;
3666       if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - 2) > UDIV_TIME)
3667 	{
3668 	  invert_limb (d1inv, d1);
3669 	  have_preinv = 1;
3670 	}
3671     }
3672 
3673   for (i = qxn + nsize - 2 - 1; i >= 0; i--)
3674     {
3675       mp_limb_t q;
3676       mp_limb_t r;
3677 
3678       if (i >= qxn)
3679 	np--;
3680       else
3681 	np[0] = 0;
3682 
3683       if (n1 == d1)
3684 	{
3685 	  /* Q should be either 111..111 or 111..110.  Need special treatment
3686 	     of this rare case as normal division would give overflow.  */
3687 	  q = ~(mp_limb_t) 0;
3688 
3689 	  r = n0 + d1;
3690 	  if (r < d1)	/* Carry in the addition? */
3691 	    {
3692 	      add_ssaaaa (n1, n0, r - d0, np[0], 0, d0);
3693 	      qp[i] = q;
3694 	      continue;
3695 	    }
3696 	  n1 = d0 - (d0 != 0);
3697 	  n0 = -d0;
3698 	}
3699       else
3700 	{
3701 	  if (UDIV_TIME > 2 * UMUL_TIME + 6 && have_preinv)
3702 	    udiv_qrnnd_preinv (q, r, n1, n0, d1, d1inv);
3703 	  else
3704 	    udiv_qrnnd (q, r, n1, n0, d1);
3705 	  umul_ppmm (n1, n0, d0, q);
3706 	}
3707 
3708       n2 = np[0];
3709 
3710     q_test:
3711       if (n1 > r || (n1 == r && n0 > n2))
3712 	{
3713 	  /* The estimated Q was too large.  */
3714 	  q--;
3715 
3716 	  sub_ddmmss (n1, n0, n1, n0, 0, d0);
3717 	  r += d1;
3718 	  if (r >= d1)	/* If not carry, test Q again.  */
3719 	    goto q_test;
3720 	}
3721 
3722       qp[i] = q;
3723       sub_ddmmss (n1, n0, r, n2, n1, n0);
3724     }
3725   np[1] = n1;
3726   np[0] = n0;
3727 
3728   return most_significant_q_limb;
3729 }
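/* A minimal usage sketch for mpn_divrem_2 (disabled; illustrative only,
   with a hypothetical caller name).  The two-limb divisor must be
   normalized, i.e. dp[1] must have its most significant bit set.  */
#if 0
static void example_mpn_divrem_2 (void)
{
  mp_limb_t np[3] = { 5, 6, 7 };		/* numerator, low limb first */
  mp_limb_t dp[2] = { 1, GMP_LIMB_HIGHBIT };	/* normalized divisor */
  mp_limb_t qp[1];
  mp_limb_t qh = mpn_divrem_2 (qp, 0, np, 3, dp);
  /* the quotient is qh*B + qp[0]; the remainder is left in {np,2} */
}
#endif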
3730 
3731 /* mpn_mul_basecase -- Internal routine to multiply two natural numbers
3732    of length m and n. */
3733 
3734 /* Handle simple cases with traditional multiplication.
3735 
3736    This is the most critical code of multiplication.  All multiplies rely on
3737    this, both small and huge.  Small ones arrive here immediately, huge ones
3738    arrive here as this is the base case for Karatsuba's recursive algorithm. */
3739 
3740 void
3741 #if __STDC__
3742 mpn_mul_basecase (mp_ptr prodp,
3743 		     mp_srcptr up, mp_size_t usize,
3744 		     mp_srcptr vp, mp_size_t vsize)
3745 #else
3746 mpn_mul_basecase (prodp, up, usize, vp, vsize)
3747      mp_ptr prodp;
3748      mp_srcptr up;
3749      mp_size_t usize;
3750      mp_srcptr vp;
3751      mp_size_t vsize;
3752 #endif
3753 {
3754   /* We first multiply by the low order one or two limbs, as the result can
3755      be stored, not added, to PROD.  We also avoid a loop for zeroing this
3756      way.  */
3757 #if HAVE_NATIVE_mpn_mul_2
3758   if (vsize >= 2)
3759     {
3760       prodp[usize + 1] = mpn_mul_2 (prodp, up, usize, vp[0], vp[1]);
3761       prodp += 2, vp += 2, vsize -= 2;
3762     }
3763   else
3764     {
3765       prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]);
3766       return;
3767     }
3768 #else
3769   prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]);
3770   prodp += 1, vp += 1, vsize -= 1;
3771 #endif
3772 
3773 #if HAVE_NATIVE_mpn_addmul_2
3774   while (vsize >= 2)
3775     {
3776       prodp[usize + 1] = mpn_addmul_2 (prodp, up, usize, vp[0], vp[1]);
3777       prodp += 2, vp += 2, vsize -= 2;
3778     }
3779   if (vsize != 0)
3780     prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]);
3781 #else
3782   /* For each iteration in the loop, multiply U with one limb from V, and
3783      add the result to PROD.  */
3784   while (vsize != 0)
3785     {
3786       prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]);
3787       prodp += 1, vp += 1, vsize -= 1;
3788     }
3789 #endif
3790 }
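/* A minimal usage sketch for mpn_mul_basecase (disabled; illustrative
   only, with a hypothetical caller name).  The product area must hold
   usize + vsize limbs and must not overlap the inputs; callers are
   expected to pass usize >= vsize >= 1.  */
#if 0
static void example_mpn_mul_basecase (void)
{
  mp_limb_t u[2] = { 4, 3 };	/* the value 3*B + 4 */
  mp_limb_t v[1] = { 5 };
  mp_limb_t p[3];
  mpn_mul_basecase (p, u, 2, v, 1);
  /* {p,3} == 15*B + 20: p[0] == 20, p[1] == 15, p[2] == 0 */
}
#endif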
3791 
3792 
3793 /* mpn_sqr_basecase -- Internal routine to square a natural number
3794    of length n. */
3795 
3796 
3797 #define SQR_KARATSUBA_THRESHOLD KARATSUBA_SQR_THRESHOLD
3798 
3799 void
3800 mpn_sqr_basecase (mp_ptr prodp, mp_srcptr up, mp_size_t n)
3801 {
3802   mp_size_t i;
3803 
3804   ASSERT (n >= 1);
3805   ASSERT (! MPN_OVERLAP_P (prodp, 2*n, up, n));
3806 
3807   {
3808     /* N.B.!  We need the superfluous indirection through argh to work around
3809        a reloader bug in GCC 2.7.*.  */
3810 #if GMP_NAIL_BITS == 0
3811     mp_limb_t ul, argh;
3812     ul = up[0];
3813     umul_ppmm (argh, prodp[0], ul, ul);
3814     prodp[1] = argh;
3815 #else
3816     mp_limb_t ul, lpl;
3817     ul = up[0];
3818     umul_ppmm (prodp[1], lpl, ul, ul << GMP_NAIL_BITS);
3819     prodp[0] = lpl >> GMP_NAIL_BITS;
3820 #endif
3821   }
3822   if (n > 1)
3823     {
3824       mp_limb_t tarr[2 * SQR_KARATSUBA_THRESHOLD];
3825       mp_ptr tp = tarr;
3826       mp_limb_t cy;
3827 
3828       /* must fit 2*n limbs in tarr */
3829       ASSERT (n <= SQR_KARATSUBA_THRESHOLD);
3830 
3831       cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
3832       tp[n - 1] = cy;
3833       for (i = 2; i < n; i++)
3834 	{
3835 	  mp_limb_t cy;
3836 	  cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
3837 	  tp[n + i - 2] = cy;
3838 	}
3839 #if HAVE_NATIVE_mpn_sqr_diagonal
3840       mpn_sqr_diagonal (prodp + 2, up + 1, n - 1);
3841 #else
3842       for (i = 1; i < n; i++)
3843 	{
3844 #if GMP_NAIL_BITS == 0
3845 	  mp_limb_t ul;
3846 	  ul = up[i];
3847 	  umul_ppmm (prodp[2 * i + 1], prodp[2 * i], ul, ul);
3848 #else
3849 	  mp_limb_t ul, lpl;
3850 	  ul = up[i];
3851 	  umul_ppmm (prodp[2 * i + 1], lpl, ul, ul << GMP_NAIL_BITS);
3852 	  prodp[2 * i] = lpl >> GMP_NAIL_BITS;
3853 #endif
3854 	}
3855 #endif
3856       {
3857 	mp_limb_t cy;
3858 	cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
3859 	cy += mpn_add_n (prodp + 1, prodp + 1, tp, 2 * n - 2);
3860 	prodp[2 * n - 1] += cy;
3861       }
3862     }
3863 }
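
/* Worked instance of the scheme above for n == 2, writing B for
   2^BITS_PER_MP_LIMB and U = u1*B + u0:

       U^2 = u1^2*B^2 + 2*u1*u0*B + u0^2

   The diagonal squares u0^2 and u1^2 go straight into prodp[0..3], the
   single cross product u1*u0 is formed in tp by the mpn_mul_1 call,
   doubled by the mpn_lshift, and folded in one limb up by mpn_add_n. */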
3864 
3865 #if 0
3866 
3867 void
3868 #if __STDC__
3869 mpn_sqr_basecase (mp_ptr prodp, mp_srcptr up, mp_size_t n)
3870 #else
3871 mpn_sqr_basecase (prodp, up, n)
3872      mp_ptr prodp;
3873      mp_srcptr up;
3874      mp_size_t n;
3875 #endif
3876 {
3877   mp_size_t i;
3878 
3879   {
3880     /* N.B.!  We need the superfluous indirection through argh to work around
3881        a reloader bug in GCC 2.7.*.  */
3882     mp_limb_t x;
3883     mp_limb_t argh;
3884     x = up[0];
3885     umul_ppmm (argh, prodp[0], x, x);
3886     prodp[1] = argh;
3887   }
3888   if (n > 1)
3889     {
3890       mp_limb_t tarr[2 * KARATSUBA_SQR_THRESHOLD];
3891       mp_ptr tp = tarr;
3892       mp_limb_t cy;
3893 
3894       /* must fit 2*n limbs in tarr */
3895       ASSERT (n <= KARATSUBA_SQR_THRESHOLD);
3896 
3897       cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
3898       tp[n - 1] = cy;
3899       for (i = 2; i < n; i++)
3900 	{
3901 	  mp_limb_t cy;
3902 	  cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
3903 	  tp[n + i - 2] = cy;
3904 	}
3905       for (i = 1; i < n; i++)
3906 	{
3907 	  mp_limb_t x;
3908 	  x = up[i];
3909 	  umul_ppmm (prodp[2 * i + 1], prodp[2 * i], x, x);
3910 	}
3911       {
3912 	mp_limb_t cy;
3913 	cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
3914 	cy += mpn_add_n (prodp + 1, prodp + 1, tp, 2 * n - 2);
3915 	prodp[2 * n - 1] += cy;
3916       }
3917     }
3918 }
3919 
3920 #endif
3921 
3922 /* mpn_mul_1 -- Multiply a limb vector with a single limb and
3923    store the product in a second limb vector. */
3924 
3925 mp_limb_t
3926 mpn_mul_1 (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_size_t s1_size, mp_limb_t s2_limb)
3927 {
3928   register mp_limb_t cy_limb;
3929   register mp_size_t j;
3930   register mp_limb_t prod_high, prod_low;
3931 
3932   SCHEME_BIGNUM_USE_FUEL(s1_size);
3933 
3934   /* The loop counter and index J go from -S1_SIZE to -1; counting
3935      up towards zero makes the loop faster.  */
3936   j = -s1_size;
3937 
3938   /* Offset the base pointers to compensate for the negative indices.  */
3939   s1_ptr -= j;
3940   res_ptr -= j;
3941 
3942   cy_limb = 0;
3943   do
3944     {
3945       umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
3946 
3947       prod_low += cy_limb;
3948       cy_limb = (prod_low < cy_limb) + prod_high;
3949 
3950       res_ptr[j] = prod_low;
3951     }
3952   while (++j != 0);
3953 
3954   return cy_limb;
3955 }
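
/* Minimal usage sketch with hypothetical values, kept out of the build.
   The returned carry is the high limb of the full product, so storing it
   widens the result by one limb. */
#if 0
static void
mul_1_demo (void)
{
  mp_limb_t a[2] = { 7, 3 };           /* A = 3*B + 7 */
  mp_limb_t r[3];

  r[2] = mpn_mul_1 (r, a, 2, 10);      /* r = {70, 30, 0} = 10*A */
}
#endif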
3956 
3957 /* mpn_addmul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
3958    by S2_LIMB, add the S1_SIZE least significant limbs of the product to the
3959    limb vector pointed to by RES_PTR.  Return the most significant limb of
3960    the product, adjusted for carry-out from the addition. */
3961 
3962 mp_limb_t
3963 mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
3964 {
3965   mp_limb_t ul, cl, hpl, lpl, rl;
3966 
3967   SCHEME_BIGNUM_USE_FUEL(n);
3968 
3969   cl = 0;
3970   do
3971     {
3972       ul = *up++;
3973       umul_ppmm (hpl, lpl, ul, vl);
3974 
3975       lpl += cl;
3976       cl = (lpl < cl) + hpl;
3977 
3978       rl = *rp;
3979       lpl = rl + lpl;
3980       cl += lpl < rl;
3981       *rp++ = lpl;
3982     }
3983   while (--n != 0);
3984 
3985   return cl;
3986 }
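
/* Companion sketch with hypothetical values: mpn_addmul_1 is the fused
   multiply-accumulate that mpn_mul_basecase above applies once per limb
   of V. */
#if 0
static void
addmul_1_demo (void)
{
  mp_limb_t r[2] = { 5, 0 };
  mp_limb_t u[2] = { 2, 1 };
  mp_limb_t cy;

  cy = mpn_addmul_1 (r, u, 2, 10);     /* r += 10*(1*B + 2): r = {25, 10}, cy = 0 */
}
#endif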
3987 
3988 /* mpn_submul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
3989    by S2_LIMB, subtract the S1_SIZE least significant limbs of the product
3990    from the limb vector pointed to by RES_PTR.  Return the most significant
3991    limb of the product, adjusted for carry-out from the subtraction.
3992  */
3993 
3994 mp_limb_t
3995 mpn_submul_1 (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_size_t s1_size, mp_limb_t s2_limb)
3996 {
3997   register mp_limb_t cy_limb;
3998   register mp_size_t j;
3999   register mp_limb_t prod_high, prod_low;
4000   register mp_limb_t x;
4001 
4002   SCHEME_BIGNUM_USE_FUEL(s1_size);
4003 
4004   /* The loop counter and index J go from -S1_SIZE to -1; counting
4005      up towards zero makes the loop faster.  */
4006   j = -s1_size;
4007 
4008   /* Offset the base pointers to compensate for the negative indices.  */
4009   res_ptr -= j;
4010   s1_ptr -= j;
4011 
4012   cy_limb = 0;
4013   do
4014     {
4015       umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
4016 
4017       prod_low += cy_limb;
4018       cy_limb = (prod_low < cy_limb) + prod_high;
4019 
4020       x = res_ptr[j];
4021       prod_low = x - prod_low;
4022       cy_limb += (prod_low > x);
4023       res_ptr[j] = prod_low;
4024     }
4025   while (++j != 0);
4026 
4027   return cy_limb;
4028 }
4029 
4030 /* mpn_divexact_by3 -- mpn division by 3, expecting no remainder. */
4031 
4032 
4033 /* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
4034    0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
4035 #define INVERSE_3      ((MP_LIMB_T_MAX / 3) * 2 + 1)
4036 
4037 
4038 /* The "c += ..."s are adding the high limb of 3*l to c.  That high limb
4039    will be 0, 1 or 2.  Doing two separate "+="s seems to turn out better
4040    code on gcc (as of 2.95.2 at least).
4041 
4042    When a subtraction of a 0,1,2 carry value causes a borrow, that leaves a
4043    limb value of either 0xFF...FF or 0xFF...FE and the multiply by INVERSE_3
4044    gives 0x55...55 or 0xAA...AA respectively, producing a further borrow of
4045    only 0 or 1 respectively.  Hence the carry out of each stage and for the
4046    return value is always only 0, 1 or 2.  */
4047 
4048 mp_limb_t
4049 #if __STDC__
4050 mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t c)
4051 #else
4052 mpn_divexact_by3c (dst, src, size, c)
4053      mp_ptr    dst;
4054      mp_srcptr src;
4055      mp_size_t size;
4056      mp_limb_t c;
4057 #endif
4058 {
4059   mp_size_t  i;
4060 
4061   SCHEME_BIGNUM_USE_FUEL(size);
4062 
4063   ASSERT (size >= 1);
4064 
4065   i = 0;
4066   do
4067     {
4068       mp_limb_t  l, s;
4069 
4070       s = src[i];
4071       l = s - c;
4072       c = (l > s);
4073 
4074       l *= INVERSE_3;
4075       dst[i] = l;
4076 
4077       c += (l > MP_LIMB_T_MAX/3);
4078       c += (l > (MP_LIMB_T_MAX/3)*2);
4079     }
4080   while (++i < size);
4081 
4082   return c;
4083 }
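
/* Illustrative check, kept out of the build.  On 32 bits INVERSE_3 is
   0xAAAAAAAB and 3 * 0xAAAAAAAB == 0x200000001 == 1 mod 2^32, which is
   why multiplying by INVERSE_3 divides exact multiples of 3.  The
   operand below is hypothetical; c == 0 means no carry-in. */
#if 0
static void
divexact_by3_demo (void)
{
  mp_limb_t src[1] = { 102 };
  mp_limb_t dst[1];
  mp_limb_t c;

  c = mpn_divexact_by3c (dst, src, 1, 0);  /* dst[0] = 34, c = 0: exact */
}
#endif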
4084 
4085 /* mpn_mul -- Multiply two natural numbers. */
4086 
4087 /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
4088    (pointed to by VP, with VN limbs), and store the result at PRODP.  The
4089    result is UN + VN limbs.  Return the most significant limb of the result.
4090 
4091    NOTE: The space pointed to by PRODP is overwritten before finished with U
4092    and V, so overlap is an error.
4093 
4094    Argument constraints:
4095    1. UN >= VN.
4096    2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
4097       the multiplier and the multiplicand.  */
4098 
4099 void
4100 #if __STDC__
4101 mpn_sqr_n (mp_ptr prodp,
4102          mp_srcptr up, mp_size_t un)
4103 #else
4104 mpn_sqr_n (prodp, up, un)
4105      mp_ptr prodp;
4106      mp_srcptr up;
4107      mp_size_t un;
4108 #endif
4109 {
4110   if (un < KARATSUBA_SQR_THRESHOLD)
4111     { /* plain schoolbook squaring */
4112       if (un == 0)
4113 	return;
4114       mpn_sqr_basecase (prodp, up, un);
4115     }
4116   else if (un < TOOM3_SQR_THRESHOLD)
4117     { /* karatsuba squaring */
4118       mp_ptr tspace;
4119       TMP_DECL (marker);
4120       TMP_MARK (marker);
4121       tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB);
4122       mpn_kara_sqr_n (prodp, up, un, tspace);
4123       TMP_FREE (marker);
4124     }
4125 #if WANT_FFT || TUNE_PROGRAM_BUILD
4126   else if (un < FFT_SQR_THRESHOLD)
4127 #else
4128   else
4129 #endif
4130     { /* toom3 squaring */
4131       mp_ptr tspace;
4132       TMP_DECL (marker);
4133       TMP_MARK (marker);
4134       tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB);
4135       mpn_toom3_sqr_n (prodp, up, un, tspace);
4136       TMP_FREE (marker);
4137     }
4138 #if WANT_FFT || TUNE_PROGRAM_BUILD
4139   else
4140     {
4141       /* schoenhage fft squaring */
4142       mpn_mul_fft_full (prodp, up, un, up, un);
4143     }
4144 #endif
4145 }
4146 
4147 mp_limb_t
4148 #if __STDC__
4149 mpn_mul (mp_ptr prodp,
4150 	 mp_srcptr up, mp_size_t un,
4151 	 mp_srcptr vp, mp_size_t vn)
4152 #else
4153 mpn_mul (prodp, up, un, vp, vn)
4154      mp_ptr prodp;
4155      mp_srcptr up;
4156      mp_size_t un;
4157      mp_srcptr vp;
4158      mp_size_t vn;
4159 #endif
4160 {
4161   mp_size_t l;
4162   mp_limb_t c;
4163 
4164   if (up == vp && un == vn)
4165     {
4166       mpn_sqr_n (prodp, up, un);
4167       return prodp[2 * un - 1];
4168     }
4169 
4170   if (vn < KARATSUBA_MUL_THRESHOLD)
4171     { /* long multiplication */
4172       mpn_mul_basecase (prodp, up, un, vp, vn);
4173       return prodp[un + vn - 1];
4174     }
4175 
4176   mpn_mul_n (prodp, up, vp, vn);
4177   if (un != vn)
4178     { mp_limb_t t;
4179       mp_ptr ws;
4180       TMP_DECL (marker);
4181       TMP_MARK (marker);
4182 
4183       prodp += vn;
4184       l = vn;
4185       up += vn;
4186       un -= vn;
4187 
4188       if (un < vn)
4189 	{
4190 	  /* Swap u's and v's. */
4191           MPN_SRCPTR_SWAP (up,un, vp,vn);
4192 	}
4193 
4194       ws = (mp_ptr) TMP_ALLOC (((vn >= KARATSUBA_MUL_THRESHOLD ? vn : un) + vn)
4195 			       * BYTES_PER_MP_LIMB);
4196 
4197       t = 0;
4198       while (vn >= KARATSUBA_MUL_THRESHOLD)
4199 	{
4200 	  mpn_mul_n (ws, up, vp, vn);
4201 	  if (l <= 2*vn)
4202 	    {
4203 	      t += mpn_add_n (prodp, prodp, ws, l);
4204 	      if (l != 2*vn)
4205 		{
4206 		  t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t);
4207 		  l = 2*vn;
4208 		}
4209 	    }
4210 	  else
4211 	    {
4212 	      c = mpn_add_n (prodp, prodp, ws, 2*vn);
4213 	      t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c);
4214 	    }
4215 	  prodp += vn;
4216 	  l -= vn;
4217 	  up += vn;
4218 	  un -= vn;
4219 	  if (un < vn)
4220 	    {
4221 	      /* Swap u's and v's. */
4222               MPN_SRCPTR_SWAP (up,un, vp,vn);
4223 	    }
4224 	}
4225 
4226       if (vn)
4227 	{
4228 	  mpn_mul_basecase (ws, up, un, vp, vn);
4229 	  if (l <= un + vn)
4230 	    {
4231 	      t += mpn_add_n (prodp, prodp, ws, l);
4232 	      if (l != un + vn)
4233 		t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t);
4234 	    }
4235 	  else
4236 	    {
4237 	      c = mpn_add_n (prodp, prodp, ws, un + vn);
4238 	      t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c);
4239 	    }
4240 	}
4241 
4242     TMP_FREE (marker);
4243   }
4244   return prodp[un + vn - 1];
4245 }
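
/* Minimal usage sketch with hypothetical operands.  mpn_mul needs
   un >= vn and a destination distinct from both sources; the product
   fills un + vn limbs and its most significant limb is also returned. */
#if 0
static void
mul_demo (void)
{
  mp_limb_t u[1] = { 6 };
  mp_limb_t v[1] = { 7 };
  mp_limb_t prod[2];
  mp_limb_t msl;

  msl = mpn_mul (prod, u, 1, v, 1);    /* prod = {42, 0}, msl = prod[1] = 0 */
}
#endif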
4246 
4247 /* mpn_divrem -- Divide natural numbers, producing both remainder and
4248    quotient.  This is now just a middle layer for calling the new
4249    internal mpn_tdiv_qr. */
4250 
4251 mp_limb_t
4252 #if __STDC__
4253 mpn_divrem (mp_ptr qp, mp_size_t qxn,
4254 	    mp_ptr np, mp_size_t nn,
4255 	    mp_srcptr dp, mp_size_t dn)
4256 #else
4257 mpn_divrem (qp, qxn, np, nn, dp, dn)
4258      mp_ptr qp;
4259      mp_size_t qxn;
4260      mp_ptr np;
4261      mp_size_t nn;
4262      mp_srcptr dp;
4263      mp_size_t dn;
4264 #endif
4265 {
4266   SCHEME_BIGNUM_USE_FUEL(dn + nn);
4267 
4268   if (dn == 1)
4269     {
4270       mp_limb_t ret;
4271       mp_ptr q2p;
4272       mp_size_t qn;
4273       TMP_DECL (marker);
4274 
4275       TMP_MARK (marker);
4276       q2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);
4277 
4278       np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]);
4279       qn = nn + qxn - 1;
4280       MPN_COPY (qp, q2p, qn);
4281       ret = q2p[qn];
4282 
4283       TMP_FREE (marker);
4284       return ret;
4285     }
4286   else if (dn == 2)
4287     {
4288       return mpn_divrem_2 (qp, qxn, np, nn, dp);
4289     }
4290   else
4291     {
4292       mp_ptr rp, q2p;
4293       mp_limb_t qhl;
4294       mp_size_t qn;
4295       TMP_DECL (marker);
4296 
4297       TMP_MARK (marker);
4298       if (qxn != 0)
4299 	{
4300 	  mp_ptr n2p;
4301 	  n2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);
4302 	  MPN_ZERO (n2p, qxn);
4303 	  MPN_COPY (n2p + qxn, np, nn);
4304 	  q2p = (mp_ptr) TMP_ALLOC ((nn - dn + qxn + 1) * BYTES_PER_MP_LIMB);
4305 	  rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
4306 	  mpn_tdiv_qr (q2p, rp, 0L, n2p, nn + qxn, dp, dn);
4307 	  MPN_COPY (np, rp, dn);
4308 	  qn = nn - dn + qxn;
4309 	  MPN_COPY (qp, q2p, qn);
4310 	  qhl = q2p[qn];
4311 	}
4312       else
4313 	{
4314 	  q2p = (mp_ptr) TMP_ALLOC ((nn - dn + 1) * BYTES_PER_MP_LIMB);
4315 	  rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
4316 	  mpn_tdiv_qr (q2p, rp, 0L, np, nn, dp, dn);
4317 	  MPN_COPY (np, rp, dn);	/* overwrite np area with remainder */
4318 	  qn = nn - dn;
4319 	  MPN_COPY (qp, q2p, qn);
4320 	  qhl = q2p[qn];
4321 	}
4322       TMP_FREE (marker);
4323       return qhl;
4324     }
4325 }
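
/* Usage sketch with hypothetical operands, kept out of the build.  The
   low nn - dn + qxn quotient limbs land at qp, the high quotient limb is
   the return value, and the remainder replaces the low dn limbs of np. */
#if 0
static void
divrem_demo (void)
{
  mp_limb_t n[2] = { 6, 14 };          /* N = 14*B + 6 */
  mp_limb_t d[1] = { 7 };
  mp_limb_t q[1];
  mp_limb_t qh;

  qh = mpn_divrem (q, 0, n, 2, d, 1);  /* Q = 2*B: q[0] = 0, qh = 2; n[0] = 6 */
}
#endif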
4326 
4327 /************************************************************************/
4328 /* GCD: */
4329 
4330 #define MPN_MOD_OR_MODEXACT_1_ODD(src,size,divisor)     \
4331   mpn_mod_1 (src, size, divisor)
4332 
4333 
4334 
4335 /* The size above which udiv_qrnnd_preinv should be used rather than
4336    udiv_qrnnd, i.e. the quotient size at which to switch, the quotient
4337    size being how many udiv divisions will be done.
4338 
4339    The default is to use preinv always; CPUs where this doesn't suit have
4340    tuned thresholds.  Note in particular that preinv should certainly be
4341    used if that's the only division available (USE_PREINV_ALWAYS).  */
4342 
4343 #ifndef MOD_1_NORM_THRESHOLD
4344 #define MOD_1_NORM_THRESHOLD  0
4345 #endif
4346 #ifndef MOD_1_UNNORM_THRESHOLD
4347 #define MOD_1_UNNORM_THRESHOLD  0
4348 #endif
4349 
4350 
4351 /* The comments in mpn/generic/divrem_1.c apply here too.
4352 
4353    As noted in the algorithms section of the manual, the shifts in the loop
4354    for the unnorm case can be avoided by calculating r = a%(d*2^n), followed
4355    by a final (r*2^n)%(d*2^n).  In fact if it happens that a%(d*2^n) can
4356    skip a division where (a*2^n)%(d*2^n) can't then there's the same number
4357    of divide steps, though how often that happens depends on the assumed
4358    distributions of dividend and divisor.  In any case this idea is left to
4359    CPU specific implementations to consider.  */
4360 
4361 mp_limb_t
4362 mpn_mod_1 (mp_srcptr up, mp_size_t un, mp_limb_t d)
4363 {
4364   mp_size_t  i;
4365   mp_limb_t  n1, n0, r;
4366   mp_limb_t  dummy USED_ONLY_SOMETIMES;
4367 
4368   ASSERT (un >= 0);
4369   ASSERT (d != 0);
4370 
4371   /* Botch: Should this be handled at all?  Rely on callers?
4372      But note un==0 is currently required by mpz/fdiv_r_ui.c and possibly
4373      other places.  */
4374   if (un == 0)
4375     return 0;
4376 
4377   d <<= GMP_NAIL_BITS;
4378 
4379   if ((d & GMP_LIMB_HIGHBIT) != 0)
4380     {
4381       /* High limb is initial remainder, possibly with one subtract of
4382 	 d to get r<d.  */
4383       r = up[un - 1] << GMP_NAIL_BITS;
4384       if (r >= d)
4385 	r -= d;
4386       r >>= GMP_NAIL_BITS;
4387       un--;
4388       if (un == 0)
4389 	return r;
4390 
4391       if (BELOW_THRESHOLD (un, MOD_1_NORM_THRESHOLD))
4392 	{
4393 	plain:
4394 	  for (i = un - 1; i >= 0; i--)
4395 	    {
4396 	      n0 = up[i] << GMP_NAIL_BITS;
4397 	      udiv_qrnnd (dummy, r, r, n0, d);
4398 	      r >>= GMP_NAIL_BITS;
4399 	    }
4400 	  return r;
4401 	}
4402       else
4403 	{
4404 	  mp_limb_t  inv;
4405 	  invert_limb (inv, d);
4406 	  for (i = un - 1; i >= 0; i--)
4407 	    {
4408 	      n0 = up[i] << GMP_NAIL_BITS;
4409 	      udiv_qrnnd_preinv (dummy, r, r, n0, d, inv);
4410 	      r >>= GMP_NAIL_BITS;
4411 	    }
4412 	  return r;
4413 	}
4414     }
4415   else
4416     {
4417       int norm;
4418 
4419       /* Skip a division if high < divisor.  Having the test here before
4420 	 normalizing will still skip as often as possible.  */
4421       r = up[un - 1] << GMP_NAIL_BITS;
4422       if (r < d)
4423 	{
4424 	  r >>= GMP_NAIL_BITS;
4425 	  un--;
4426 	  if (un == 0)
4427 	    return r;
4428 	}
4429       else
4430 	r = 0;
4431 
4432       /* If udiv_qrnnd doesn't need a normalized divisor, can use the simple
4433 	 code above. */
4434       if (! UDIV_NEEDS_NORMALIZATION
4435 	  && BELOW_THRESHOLD (un, MOD_1_UNNORM_THRESHOLD))
4436 	goto plain;
4437 
4438       count_leading_zeros (norm, d);
4439       d <<= norm;
4440 
4441       n1 = up[un - 1] << GMP_NAIL_BITS;
4442       r = (r << norm) | (n1 >> (GMP_LIMB_BITS - norm));
4443 
4444       if (UDIV_NEEDS_NORMALIZATION
4445 	  && BELOW_THRESHOLD (un, MOD_1_UNNORM_THRESHOLD))
4446 	{
4447 	  for (i = un - 2; i >= 0; i--)
4448 	    {
4449 	      n0 = up[i] << GMP_NAIL_BITS;
4450 	      udiv_qrnnd (dummy, r, r,
4451 			  (n1 << norm) | (n0 >> (GMP_NUMB_BITS - norm)),
4452 			  d);
4453 	      r >>= GMP_NAIL_BITS;
4454 	      n1 = n0;
4455 	    }
4456 	  udiv_qrnnd (dummy, r, r, n1 << norm, d);
4457 	  r >>= GMP_NAIL_BITS;
4458 	  return r >> norm;
4459 	}
4460       else
4461 	{
4462 	  mp_limb_t inv;
4463 	  invert_limb (inv, d);
4464 
4465 	  for (i = un - 2; i >= 0; i--)
4466 	    {
4467 	      n0 = up[i] << GMP_NAIL_BITS;
4468 	      udiv_qrnnd_preinv (dummy, r, r,
4469 				 (n1 << norm) | (n0 >> (GMP_NUMB_BITS - norm)),
4470 				 d, inv);
4471 	      r >>= GMP_NAIL_BITS;
4472 	      n1 = n0;
4473 	    }
4474 	  udiv_qrnnd_preinv (dummy, r, r, n1 << norm, d, inv);
4475 	  r >>= GMP_NAIL_BITS;
4476 	  return r >> norm;
4477 	}
4478     }
4479 }
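
/* Usage sketch with hypothetical operands.  Since 14 is a multiple of 7,
   the high limb contributes nothing mod 7 whatever the limb size, and the
   remainder is just the low limb's residue. */
#if 0
static void
mod_1_demo (void)
{
  mp_limb_t a[2] = { 6, 14 };          /* A = 14*B + 6 */
  mp_limb_t r;

  r = mpn_mod_1 (a, 2, 7);             /* r = A mod 7 = 6 */
}
#endif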
4480 
4481 /* Does not work for U == 0 or V == 0.  It would be tough to make it work for
4482    V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t.
4483 
4484    The threshold for doing u%v when size==1 will vary by CPU according to
4485    the speed of a division and the code generated for the main loop.  Any
4486    tuning for this is left to a CPU specific implementation.  */
4487 
4488 mp_limb_t
4489 mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb)
4490 {
4491   mp_limb_t      ulimb;
4492   unsigned long  zero_bits, u_low_zero_bits;
4493 
4494   ASSERT (size >= 1);
4495   ASSERT (vlimb != 0);
4496   /* ASSERT_MPN_NONZERO_P (up, size); */
4497 
4498   ulimb = up[0];
4499 
4500   /* Need vlimb odd for modexact, want it odd to get common zeros. */
4501   count_trailing_zeros (zero_bits, vlimb);
4502   vlimb >>= zero_bits;
4503 
4504   if (size > 1)
4505     {
4506       /* Must get common zeros before the mod reduction.  If ulimb==0 then
4507 	 vlimb already gives the common zeros.  */
4508       if (ulimb != 0)
4509 	{
4510 	  count_trailing_zeros (u_low_zero_bits, ulimb);
4511 	  zero_bits = MIN (zero_bits, u_low_zero_bits);
4512 	}
4513 
4514       ulimb = MPN_MOD_OR_MODEXACT_1_ODD (up, size, vlimb);
4515       if (ulimb == 0)
4516 	goto done;
4517 
4518       goto strip_u_maybe;
4519     }
4520 
4521   /* size==1, so up[0]!=0 */
4522   count_trailing_zeros (u_low_zero_bits, ulimb);
4523   ulimb >>= u_low_zero_bits;
4524   zero_bits = MIN (zero_bits, u_low_zero_bits);
4525 
4526   /* make u bigger */
4527   if (vlimb > ulimb)
4528     MP_LIMB_T_SWAP (ulimb, vlimb);
4529 
4530   /* if u is much bigger than v, reduce using a division rather than
4531      chipping away at it bit-by-bit */
4532   if ((ulimb >> 16) > vlimb)
4533     {
4534       ulimb %= vlimb;
4535       if (ulimb == 0)
4536 	goto done;
4537       goto strip_u_maybe;
4538     }
4539 
4540   while (ulimb != vlimb)
4541     {
4542       ASSERT (ulimb & 1);
4543       ASSERT (vlimb & 1);
4544 
4545       if (ulimb > vlimb)
4546 	{
4547 	  ulimb -= vlimb;
4548 	  do
4549 	    {
4550 	      ulimb >>= 1;
4551 	      ASSERT (ulimb != 0);
4552 	    strip_u_maybe:
4553 	      ;
4554 	    }
4555 	  while ((ulimb & 1) == 0);
4556 	}
4557       else /*  vlimb > ulimb.  */
4558 	{
4559 	  vlimb -= ulimb;
4560 	  do
4561 	    {
4562 	      vlimb >>= 1;
4563 	      ASSERT (vlimb != 0);
4564 	    }
4565 	  while ((vlimb & 1) == 0);
4566 	}
4567     }
4568 
4569  done:
4570   return vlimb << zero_bits;
4571 }
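
/* Usage sketch with hypothetical operands.  U may be any size >= 1, V
   must be a single nonzero limb, and neither operand may be zero. */
#if 0
static void
gcd_1_demo (void)
{
  mp_limb_t u[1] = { 48 };
  mp_limb_t g;

  g = mpn_gcd_1 (u, 1, 18);            /* g = gcd(48, 18) = 6 */
}
#endif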
4572 
4573 
4574 /* Integer greatest common divisor of two unsigned integers, using
4575    the accelerated algorithm (see reference below).
4576 
4577    mp_size_t mpn_gcd (up, usize, vp, vsize).
4578 
4579    Preconditions [U = (up, usize) and V = (vp, vsize)]:
4580 
4581    1.  V is odd.
4582    2.  numbits(U) >= numbits(V).
4583 
4584    Both U and V are destroyed by the operation.  The result is left at vp,
4585    and its size is returned.
4586 
4587    Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
4588 
4589    Funding for this work has been partially provided by Conselho Nacional
4590    de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
4591    301314194-2, and was done while I was a visiting researcher in the Instituto
4592    de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
4593 
4594    Refer to
4595 	K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
4596 	Mathematical Software, v. 21 (March), 1995, pp. 111-122.  */
4597 
4598 /* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated
4599    algorithm is used, otherwise the binary algorithm is used.  This may be
4600    adjusted for different architectures.  */
4601 #ifndef GCD_ACCEL_THRESHOLD
4602 #define GCD_ACCEL_THRESHOLD 5
4603 #endif
4604 
4605 /* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated
4606    algorithm reduces using the bmod operation.  Otherwise, the k-ary reduction
4607    is used.  0 <= BMOD_THRESHOLD < GMP_NUMB_BITS.  */
4608 enum
4609   {
4610     BMOD_THRESHOLD = GMP_NUMB_BITS/2
4611   };
4612 
4613 
4614 /* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2.
4615    Both U and V must be odd.  */
4616 static inline mp_size_t
4617 gcd_2 (mp_ptr vp, mp_srcptr up)
4618 {
4619   mp_limb_t u0, u1, v0, v1;
4620   mp_size_t vsize;
4621 
4622   u0 = up[0];
4623   u1 = up[1];
4624   v0 = vp[0];
4625   v1 = vp[1];
4626 
4627   while (u1 != v1 && u0 != v0)
4628     {
4629       unsigned long int r;
4630       if (u1 > v1)
4631 	{
4632 	  u1 -= v1 + (u0 < v0);
4633 	  u0 = (u0 - v0) & GMP_NUMB_MASK;
4634 	  count_trailing_zeros (r, u0);
4635 	  u0 = ((u1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (u0 >> r);
4636 	  u1 >>= r;
4637 	}
4638       else  /* u1 < v1.  */
4639 	{
4640 	  v1 -= u1 + (v0 < u0);
4641 	  v0 = (v0 - u0) & GMP_NUMB_MASK;
4642 	  count_trailing_zeros (r, v0);
4643 	  v0 = ((v1 << (GMP_NUMB_BITS - r)) & GMP_NUMB_MASK) | (v0 >> r);
4644 	  v1 >>= r;
4645 	}
4646     }
4647 
4648   vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0);
4649 
4650   /* If U == V == GCD, done.  Otherwise, compute GCD (V, |U - V|).  */
4651   if (u1 == v1 && u0 == v0)
4652     return vsize;
4653 
4654   v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0;
4655   vp[0] = mpn_gcd_1 (vp, vsize, v0);
4656 
4657   return 1;
4658 }
4659 
4660 /* The function find_a finds 0 < N < 2^GMP_NUMB_BITS such that there exists
4661    0 < |D| < 2^GMP_NUMB_BITS, and N == D * C mod 2^(2*GMP_NUMB_BITS).
4662    In the reference article, D was computed along with N, but it is better to
4663    compute D separately as D <-- N / C mod 2^(GMP_NUMB_BITS + 1), treating
4664    the result as a two's complement signed integer.
4665 
4666    Initialize N1 to C mod 2^(2*GMP_NUMB_BITS).  According to the reference
4667    article, N2 should be initialized to 2^(2*GMP_NUMB_BITS), but we use
4668    2^(2*GMP_NUMB_BITS) - N1 to start the calculations within double
4669    precision.  If N2 > N1 initially, the first iteration of the while loop
4670    will swap them.  In all other situations, N1 >= N2 is maintained.  */
4671 
4672 #if HAVE_NATIVE_mpn_gcd_finda
4673 #define find_a(cp)  mpn_gcd_finda (cp)
4674 
4675 #else
4676 static
4677 #if ! defined (__i386__)
4678 inline				/* don't inline this for the x86 */
4679 #endif
4680 mp_limb_t
4681 find_a (mp_srcptr cp)
4682 {
4683   unsigned long int leading_zero_bits = 0;
4684 
4685   mp_limb_t n1_l = cp[0];	/* N1 == n1_h * 2^GMP_NUMB_BITS + n1_l.  */
4686   mp_limb_t n1_h = cp[1];
4687 
4688   mp_limb_t n2_l = (-n1_l & GMP_NUMB_MASK);	/* N2 == n2_h * 2^GMP_NUMB_BITS + n2_l.  */
4689   mp_limb_t n2_h = (~n1_h & GMP_NUMB_MASK);
4690 
4691   /* Main loop.  */
4692   while (n2_h != 0)		/* While N2 >= 2^GMP_NUMB_BITS.  */
4693     {
4694       /* N1 <-- N1 % N2.  */
4695       if (((GMP_NUMB_HIGHBIT >> leading_zero_bits) & n2_h) == 0)
4696 	{
4697 	  unsigned long int i;
4698 	  count_leading_zeros (i, n2_h);
4699 	  i -= GMP_NAIL_BITS;
4700 	  i -= leading_zero_bits;
4701 	  leading_zero_bits += i;
4702 	  n2_h = ((n2_h << i) & GMP_NUMB_MASK) | (n2_l >> (GMP_NUMB_BITS - i));
4703 	  n2_l = (n2_l << i) & GMP_NUMB_MASK;
4704 	  do
4705 	    {
4706 	      if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
4707 		{
4708 		  n1_h -= n2_h + (n1_l < n2_l);
4709 		  n1_l = (n1_l - n2_l) & GMP_NUMB_MASK;
4710 		}
4711 	      n2_l = (n2_l >> 1) | ((n2_h << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK);
4712 	      n2_h >>= 1;
4713 	      i -= 1;
4714 	    }
4715 	  while (i != 0);
4716 	}
4717       if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
4718 	{
4719 	  n1_h -= n2_h + (n1_l < n2_l);
4720 	  n1_l = (n1_l - n2_l) & GMP_NUMB_MASK;
4721 	}
4722 
4723       MP_LIMB_T_SWAP (n1_h, n2_h);
4724       MP_LIMB_T_SWAP (n1_l, n2_l);
4725     }
4726 
4727   return n2_l;
4728 }
4729 #endif
4730 
4731 
4732 mp_size_t
4733 mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
4734 {
4735   mp_ptr orig_vp = vp;
4736   mp_size_t orig_vsize = vsize;
4737   int binary_gcd_ctr;		/* Number of times binary gcd will execute.  */
4738   TMP_DECL (marker);
4739 
4740   ASSERT (usize >= 1);
4741   ASSERT (vsize >= 1);
4742   ASSERT (usize >= vsize);
4743   ASSERT (vp[0] & 1);
4744   ASSERT (up[usize - 1] != 0);
4745   ASSERT (vp[vsize - 1] != 0);
4746 #if WANT_ASSERT
4747   if (usize == vsize)
4748     {
4749       int  uzeros, vzeros;
4750       count_leading_zeros (uzeros, up[usize - 1]);
4751       count_leading_zeros (vzeros, vp[vsize - 1]);
4752       ASSERT (uzeros <= vzeros);
4753     }
4754 #endif
4755   ASSERT (! MPN_OVERLAP_P (up, usize, vp, vsize));
4756   ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, up, usize));
4757   ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, vsize, vp, vsize));
4758 
4759   TMP_MARK (marker);
4760 
4761   /* Use the accelerated algorithm if vsize is at least GCD_ACCEL_THRESHOLD.
4762      Two EXTRA limbs for U and V are required for k-ary reduction.  */
4763   if (vsize >= GCD_ACCEL_THRESHOLD)
4764     {
4765       unsigned long int vbitsize, d;
4766       mp_ptr orig_up = up;
4767       mp_size_t orig_usize = usize;
4768       mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB);
4769 
4770       MPN_COPY (anchor_up, orig_up, usize);
4771       up = anchor_up;
4772 
4773       count_leading_zeros (d, up[usize - 1]);
4774       d -= GMP_NAIL_BITS;
4775       d = usize * GMP_NUMB_BITS - d;
4776       count_leading_zeros (vbitsize, vp[vsize - 1]);
4777       vbitsize -= GMP_NAIL_BITS;
4778       vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
4779       ASSERT (d >= vbitsize);
4780       d = d - vbitsize + 1;
4781 
4782       /* Use bmod reduction to quickly discover whether V divides U.  */
4783       up[usize++] = 0;				/* Insert leading zero.  */
4784       mpn_bdivmod (up, up, usize, vp, vsize, d);
4785 
4786       /* Now skip U/V mod 2^d and any low zero limbs.  */
4787       d /= GMP_NUMB_BITS, up += d, usize -= d;
4788       while (usize != 0 && up[0] == 0)
4789 	up++, usize--;
4790 
4791       if (usize == 0)				/* GCD == ORIG_V.  */
4792 	goto done;
4793 
4794       vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB);
4795       MPN_COPY (vp, orig_vp, vsize);
4796 
4797       do					/* Main loop.  */
4798 	{
4799 	  /* mpn_com_n can't be used here because anchor_up and up may
4800 	     partially overlap */
4801 	  if ((up[usize - 1] & GMP_NUMB_HIGHBIT) != 0)  /* U < 0; take two's compl. */
4802 	    {
4803 	      mp_size_t i;
4804 	      anchor_up[0] = -up[0] & GMP_NUMB_MASK;
4805 	      for (i = 1; i < usize; i++)
4806 		anchor_up[i] = (~up[i] & GMP_NUMB_MASK);
4807 	      up = anchor_up;
4808 	    }
4809 
4810 	  MPN_NORMALIZE_NOT_ZERO (up, usize);
4811 
4812 	  if ((up[0] & 1) == 0)			/* Result even; remove twos. */
4813 	    {
4814 	      unsigned int r;
4815 	      count_trailing_zeros (r, up[0]);
4816 	      mpn_rshift (anchor_up, up, usize, r);
4817 	      usize -= (anchor_up[usize - 1] == 0);
4818 	    }
4819 	  else if (anchor_up != up)
4820 	    MPN_COPY_INCR (anchor_up, up, usize);
4821 
4822 	  MPN_PTR_SWAP (anchor_up,usize, vp,vsize);
4823 	  up = anchor_up;
4824 
4825 	  if (vsize <= 2)		/* Kary can't handle < 2 limbs and  */
4826 	    break;			/* isn't efficient for == 2 limbs.  */
4827 
4828 	  d = vbitsize;
4829 	  count_leading_zeros (vbitsize, vp[vsize - 1]);
4830 	  vbitsize -= GMP_NAIL_BITS;
4831 	  vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
4832 	  d = d - vbitsize + 1;
4833 
4834 	  if (d > BMOD_THRESHOLD)	/* Bmod reduction.  */
4835 	    {
4836 	      up[usize++] = 0;
4837 	      mpn_bdivmod (up, up, usize, vp, vsize, d);
4838 	      d /= GMP_NUMB_BITS, up += d, usize -= d;
4839 	    }
4840 	  else				/* Kary reduction.  */
4841 	    {
4842 	      mp_limb_t bp[2], cp[2];
4843 
4844 	      /* C <-- V/U mod 2^(2*GMP_NUMB_BITS).  */
4845 	      {
4846 		mp_limb_t u_inv, hi, lo;
4847 		modlimb_invert (u_inv, up[0]);
4848 		cp[0] = (vp[0] * u_inv) & GMP_NUMB_MASK;
4849 		umul_ppmm (hi, lo, cp[0], up[0] << GMP_NAIL_BITS);
4850 		lo >>= GMP_NAIL_BITS;
4851 		cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv & GMP_NUMB_MASK;
4852 	      }
4853 
4854 	      /* U <-- find_a (C)  *  U.  */
4855 	      up[usize] = mpn_mul_1 (up, up, usize, find_a (cp));
4856 	      usize++;
4857 
4858 	      /* B <-- A/C == U/V mod 2^(GMP_NUMB_BITS + 1).
4859 		  bp[0] <-- U/V mod 2^GMP_NUMB_BITS and
4860 		  bp[1] <-- ( (U - bp[0] * V)/2^GMP_NUMB_BITS ) / V mod 2
4861 
4862 		  Like V/U above, but simplified because only the low bit of
4863 		  bp[1] is wanted. */
4864 	      {
4865 		mp_limb_t  v_inv, hi, lo;
4866 		modlimb_invert (v_inv, vp[0]);
4867 		bp[0] = (up[0] * v_inv) & GMP_NUMB_MASK;
4868 		umul_ppmm (hi, lo, bp[0], vp[0] << GMP_NAIL_BITS);
4869 		lo >>= GMP_NAIL_BITS;
4870 		bp[1] = (up[1] + hi + (bp[0] & vp[1])) & 1;
4871 	      }
4872 
4873 	      up[usize++] = 0;
4874 	      if (bp[1] != 0)	/* B < 0: U <-- U + (-B)  * V.  */
4875 		{
4876 		   mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0] & GMP_NUMB_MASK);
4877 		   mpn_add_1 (up + vsize, up + vsize, usize - vsize, c);
4878 		}
4879 	      else		/* B >= 0:  U <-- U - B * V.  */
4880 		{
4881 		  mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]);
4882 		  mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
4883 		}
4884 
4885 	      up += 2, usize -= 2;  /* At least two low limbs are zero.  */
4886 	    }
4887 
4888 	  /* Must remove low zero limbs before complementing.  */
4889 	  while (usize != 0 && up[0] == 0)
4890 	    up++, usize--;
4891 	}
4892       while (usize != 0);
4893 
4894       /* Compute GCD (ORIG_V, GCD (ORIG_U, V)).  Binary will execute twice.  */
4895       up = orig_up, usize = orig_usize;
4896       binary_gcd_ctr = 2;
4897     }
4898   else
4899     binary_gcd_ctr = 1;
4900 
4901   /* Finish up with the binary algorithm.  Executes once or twice.  */
4902   for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize)
4903     {
4904       if (usize > 2)		/* First make U close to V in size.  */
4905 	{
4906 	  unsigned long int vbitsize, d;
4907 	  count_leading_zeros (d, up[usize - 1]);
4908 	  d -= GMP_NAIL_BITS;
4909 	  d = usize * GMP_NUMB_BITS - d;
4910 	  count_leading_zeros (vbitsize, vp[vsize - 1]);
4911 	  vbitsize -= GMP_NAIL_BITS;
4912 	  vbitsize = vsize * GMP_NUMB_BITS - vbitsize;
4913 	  d = d - vbitsize - 1;
4914 	  if (d != -(unsigned long int)1 && d > 2)
4915 	    {
4916 	      mpn_bdivmod (up, up, usize, vp, vsize, d);  /* Result > 0.  */
4917 	      d /= (unsigned long int)GMP_NUMB_BITS, up += d, usize -= d;
4918 	    }
4919 	}
4920 
4921       /* Start binary GCD.  */
4922       do
4923 	{
4924 	  mp_size_t zeros;
4925 
4926 	  /* Make sure U is odd.  */
4927 	  MPN_NORMALIZE (up, usize);
4928 	  while (up[0] == 0)
4929 	    up += 1, usize -= 1;
4930 	  if ((up[0] & 1) == 0)
4931 	    {
4932 	      unsigned int r;
4933 	      count_trailing_zeros (r, up[0]);
4934 	      mpn_rshift (up, up, usize, r);
4935 	      usize -= (up[usize - 1] == 0);
4936 	    }
4937 
4938 	  /* Keep usize >= vsize.  */
4939 	  if (usize < vsize)
4940 	    MPN_PTR_SWAP (up, usize, vp, vsize);
4941 
4942 	  if (usize <= 2)				/* Double precision. */
4943 	    {
4944 	      if (vsize == 1)
4945 		vp[0] = mpn_gcd_1 (up, usize, vp[0]);
4946 	      else
4947 		vsize = gcd_2 (vp, up);
4948 	      break;					/* Binary GCD done.  */
4949 	    }
4950 
4951 	  /* Count number of low zero limbs of U - V.  */
4952 	  for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; )
4953 	    continue;
4954 
4955 	  /* If U < V, swap U and V; in any case, subtract V from U.  */
4956 	  if (zeros == vsize)				/* Subtract done.  */
4957 	    up += zeros, usize -= zeros;
4958 	  else if (usize == vsize)
4959 	    {
4960 	      mp_size_t size = vsize;
4961 	      do
4962 		size--;
4963 	      while (up[size] == vp[size]);
4964 	      if (up[size] < vp[size])			/* usize == vsize.  */
4965 		MP_PTR_SWAP (up, vp);
4966 	      up += zeros, usize = size + 1 - zeros;
4967 	      mpn_sub_n (up, up, vp + zeros, usize);
4968 	    }
4969 	  else
4970 	    {
4971 	      mp_size_t size = vsize - zeros;
4972 	      up += zeros, usize -= zeros;
4973 	      if (mpn_sub_n (up, up, vp + zeros, size))
4974 		{
4975 		  while (up[size] == 0)			/* Propagate borrow. */
4976 		    up[size++] = -(mp_limb_t)1;
4977 		  up[size] -= 1;
4978 		}
4979 	    }
4980 	}
4981       while (usize);					/* End binary GCD.  */
4982     }
4983 
4984 done:
4985   if (vp != gp)
4986     MPN_COPY_INCR (gp, vp, vsize);
4987   TMP_FREE (marker);
4988   return vsize;
4989 }
4990 
4991 
4992 /* q_high = mpn_bdivmod (qp, up, usize, vp, vsize, d).
4993 
4994    Puts the low d/BITS_PER_MP_LIMB limbs of Q = U / V mod 2^d at qp, and
4995    returns the high d%BITS_PER_MP_LIMB bits of Q as the result.
4996 
4997    Also, U - Q * V mod 2^(usize*BITS_PER_MP_LIMB) is placed at up.  Since the
4998    low d/BITS_PER_MP_LIMB limbs of this difference are zero, the code allows
4999    the limb vectors at qp to overwrite the low limbs at up, provided qp <= up.
5000 
5001    Preconditions:
5002    1.  V is odd.
5003    2.  usize * BITS_PER_MP_LIMB >= d.
5004    3.  If Q and U overlap, qp <= up.
5005 
5006    Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
5007 
5008    Funding for this work has been partially provided by Conselho Nacional
5009    de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
5010    301314194-2, and was done while I was a visiting researcher in the Instituto
5011    de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
5012 
5013    References:
5014        T. Jebelean, An algorithm for exact division, Journal of Symbolic
5015        Computation, v. 15, 1993, pp. 169-180.
5016 
5017        K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
5018        Mathematical Software, v. 21 (March), 1995, pp. 111-122.  */
5019 
5020 mp_limb_t
5021 mpn_bdivmod (mp_ptr qp, mp_ptr up, mp_size_t usize,
5022 	     mp_srcptr vp, mp_size_t vsize, unsigned long int d)
5023 {
5024   mp_limb_t v_inv;
5025 
5026   ASSERT (usize >= 1);
5027   ASSERT (vsize >= 1);
5028   ASSERT (usize * GMP_NUMB_BITS >= d);
5029   ASSERT (! MPN_OVERLAP_P (up, usize, vp, vsize));
5030   ASSERT (! MPN_OVERLAP_P (qp, d/GMP_NUMB_BITS, vp, vsize));
5031   ASSERT (MPN_SAME_OR_INCR2_P (qp, d/GMP_NUMB_BITS, up, usize));
5032   /* ASSERT_MPN (up, usize); */
5033   /* ASSERT_MPN (vp, vsize); */
5034 
5035   /* 1/V mod 2^GMP_NUMB_BITS. */
5036   modlimb_invert (v_inv, vp[0]);
5037 
5038   /* Fast code for two cases previously used by the accel part of mpn_gcd.
5039      (Could probably remove this now it's inlined there.) */
5040   if (usize == 2 && vsize == 2 &&
5041       (d == GMP_NUMB_BITS || d == 2*GMP_NUMB_BITS))
5042     {
5043       mp_limb_t hi;
5044       mp_limb_t lo USED_ONLY_SOMETIMES;
5045       mp_limb_t q = (up[0] * v_inv) & GMP_NUMB_MASK;
5046       umul_ppmm (hi, lo, q, vp[0] << GMP_NAIL_BITS);
5047       up[0] = 0;
5048       up[1] -= hi + q*vp[1];
5049       qp[0] = q;
5050       if (d == 2*GMP_NUMB_BITS)
5051         {
5052           q = (up[1] * v_inv) & GMP_NUMB_MASK;
5053           up[1] = 0;
5054           qp[1] = q;
5055         }
5056       return 0;
5057     }
5058 
5059   /* Main loop.  */
5060   while (d >= GMP_NUMB_BITS)
5061     {
5062       mp_limb_t q = (up[0] * v_inv) & GMP_NUMB_MASK;
5063       mp_limb_t b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);
5064       if (usize > vsize)
5065 	mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
5066       d -= GMP_NUMB_BITS;
5067       up += 1, usize -= 1;
5068       *qp++ = q;
5069     }
5070 
5071   if (d)
5072     {
5073       mp_limb_t b;
5074       mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t)1<<d) - 1);
5075       if (q <= 1)
5076 	{
5077 	  if (q == 0)
5078 	    return 0;
5079 	  else
5080 	    b = mpn_sub_n (up, up, vp, MIN (usize, vsize));
5081 	}
5082       else
5083 	b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);
5084 
5085       if (usize > vsize)
5086 	mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
5087       return q;
5088     }
5089 
5090   return 0;
5091 }
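
/* Usage sketch with hypothetical operands, kept out of the build.  When
   V divides U exactly and the quotient fits in d bits, Q*V == U mod 2^d
   forces Q to be the true quotient, so mpn_bdivmod performs the exact
   division described in the references above. */
#if 0
static void
bdivmod_demo (void)
{
  mp_limb_t u[2] = { 21, 0 };          /* U = 21 */
  mp_limb_t v[1] = { 3 };              /* V = 3, odd as required */
  mp_limb_t q[1];

  mpn_bdivmod (q, u, 2, v, 1, GMP_NUMB_BITS);  /* q[0] = 7; u[0] becomes 0 */
}
#endif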
5092 
5093 
5094 /* modlimb_invert_table[i] is the multiplicative inverse of 2*i+1 mod 256,
5095    ie. (modlimb_invert_table[i] * (2*i+1)) % 256 == 1 */
5096 
5097 const unsigned char  modlimb_invert_table[128] = {
5098   0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
5099   0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
5100   0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
5101   0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
5102   0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
5103   0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
5104   0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
5105   0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
5106   0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
5107   0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
5108   0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
5109   0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
5110   0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
5111   0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
5112   0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
5113   0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
5114 };
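
/* Sanity-check sketch for the table above, kept out of the build: every
   odd byte value times its table entry is congruent to 1 mod 256.  The
   modlimb_invert macro (in gmp-impl.h) conventionally seeds a Newton
   iteration x <- x*(2 - n*x) from this 8-bit inverse, doubling the
   number of correct low bits at each step. */
#if 0
static void
check_modlimb_invert_table (void)
{
  int i;
  for (i = 0; i < 128; i++)
    ASSERT (((modlimb_invert_table[i] * (2*i + 1)) & 0xFF) == 1);
}
#endif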
5115 
5116 
5117 /************************************************************************/
5118 
5119 /* __mp_bases -- Structure for conversion between internal binary
5120    format and strings in base 2..255.  The fields are explained in
5121    gmp-impl.h. */
5122 
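/* Illustrative reading of one entry from the table below, using the
   field names from gmp-impl.h.  For base 10 with 32-bit limbs,
   chars_per_limb is 9 and big_base is 0x3b9aca00 = 10^9, the largest
   power of 10 fitting in a limb, so binary-to-decimal conversion can
   peel off 9 digits at a time with a single division by big_base;
   big_base_inverted is the precomputed reciprocal for that division. */
#if 0
static void
bases_demo (void)
{
  const struct bases *b = &__mp_bases[10];
  /* On 32-bit limbs: b->chars_per_limb == 9, b->big_base == 0x3b9aca00,
     b->big_base_inverted == 0x12e0be82. */
}
#endif
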
5123 #if BITS_PER_MP_LIMB == 32
5124 const struct bases __mp_bases[256] =
5125 {
5126   /*  0 */ {0, 0.0, 0, 0},
5127   /*  1 */ {0, 1e38, 0, 0},
5128   /*  2 */ {32, 1.0000000000000000, 0x1, 0x0},
5129   /*  3 */ {20, 0.6309297535714575, 0xcfd41b91, 0x3b563c24},
5130   /*  4 */ {16, 0.5000000000000000, 0x2, 0x0},
5131   /*  5 */ {13, 0.4306765580733931, 0x48c27395, 0xc25c2684},
5132   /*  6 */ {12, 0.3868528072345416, 0x81bf1000, 0xf91bd1b6},
5133   /*  7 */ {11, 0.3562071871080222, 0x75db9c97, 0x1607a2cb},
5134   /*  8 */ {10, 0.3333333333333334, 0x3, 0x0},
5135   /*  9 */ {10, 0.3154648767857287, 0xcfd41b91, 0x3b563c24},
5136   /* 10 */ {9, 0.3010299956639811, 0x3b9aca00, 0x12e0be82},
5137   /* 11 */ {9, 0.2890648263178878, 0x8c8b6d2b, 0xd24cde04},
5138   /* 12 */ {8, 0.2789429456511298, 0x19a10000, 0x3fa39ab5},
5139   /* 13 */ {8, 0.2702381544273197, 0x309f1021, 0x50f8ac5f},
5140   /* 14 */ {8, 0.2626495350371936, 0x57f6c100, 0x74843b1e},
5141   /* 15 */ {8, 0.2559580248098155, 0x98c29b81, 0xad0326c2},
5142   /* 16 */ {8, 0.2500000000000000, 0x4, 0x0},
5143   /* 17 */ {7, 0.2446505421182260, 0x18754571, 0x4ef0b6bd},
5144   /* 18 */ {7, 0.2398124665681315, 0x247dbc80, 0xc0fc48a1},
5145   /* 19 */ {7, 0.2354089133666382, 0x3547667b, 0x33838942},
5146   /* 20 */ {7, 0.2313782131597592, 0x4c4b4000, 0xad7f29ab},
5147   /* 21 */ {7, 0.2276702486969530, 0x6b5a6e1d, 0x313c3d15},
5148   /* 22 */ {7, 0.2242438242175754, 0x94ace180, 0xb8cca9e0},
5149   /* 23 */ {7, 0.2210647294575037, 0xcaf18367, 0x42ed6de9},
5150   /* 24 */ {6, 0.2181042919855316, 0xb640000, 0x67980e0b},
5151   /* 25 */ {6, 0.2153382790366965, 0xe8d4a51, 0x19799812},
5152   /* 26 */ {6, 0.2127460535533632, 0x1269ae40, 0xbce85396},
5153   /* 27 */ {6, 0.2103099178571525, 0x17179149, 0x62c103a9},
5154   /* 28 */ {6, 0.2080145976765095, 0x1cb91000, 0x1d353d43},
5155   /* 29 */ {6, 0.2058468324604344, 0x23744899, 0xce1decea},
5156   /* 30 */ {6, 0.2037950470905062, 0x2b73a840, 0x790fc511},
5157   /* 31 */ {6, 0.2018490865820999, 0x34e63b41, 0x35b865a0},
5158   /* 32 */ {6, 0.2000000000000000, 0x5, 0x0},
5159   /* 33 */ {6, 0.1982398631705605, 0x4cfa3cc1, 0xa9aed1b3},
5160   /* 34 */ {6, 0.1965616322328226, 0x5c13d840, 0x63dfc229},
5161   /* 35 */ {6, 0.1949590218937863, 0x6d91b519, 0x2b0fee30},
5162   /* 36 */ {6, 0.1934264036172708, 0x81bf1000, 0xf91bd1b6},
5163   /* 37 */ {6, 0.1919587200065601, 0x98ede0c9, 0xac89c3a9},
5164   /* 38 */ {6, 0.1905514124267734, 0xb3773e40, 0x6d2c32fe},
5165   /* 39 */ {6, 0.1892003595168700, 0xd1bbc4d1, 0x387907c9},
5166   /* 40 */ {6, 0.1879018247091076, 0xf4240000, 0xc6f7a0b},
5167   /* 41 */ {5, 0.1866524112389434, 0x6e7d349, 0x28928154},
5168   /* 42 */ {5, 0.1854490234153689, 0x7ca30a0, 0x6e8629d},
5169   /* 43 */ {5, 0.1842888331487062, 0x8c32bbb, 0xd373dca0},
5170   /* 44 */ {5, 0.1831692509136336, 0x9d46c00, 0xa0b17895},
5171   /* 45 */ {5, 0.1820879004699383, 0xaffacfd, 0x746811a5},
5172   /* 46 */ {5, 0.1810425967800402, 0xc46bee0, 0x4da6500f},
5173   /* 47 */ {5, 0.1800313266566926, 0xdab86ef, 0x2ba23582},
5174   /* 48 */ {5, 0.1790522317510414, 0xf300000, 0xdb20a88},
5175   /* 49 */ {5, 0.1781035935540111, 0x10d63af1, 0xe68d5ce4},
5176   /* 50 */ {5, 0.1771838201355579, 0x12a05f20, 0xb7cdfd9d},
5177   /* 51 */ {5, 0.1762914343888821, 0x1490aae3, 0x8e583933},
5178   /* 52 */ {5, 0.1754250635819545, 0x16a97400, 0x697cc3ea},
5179   /* 53 */ {5, 0.1745834300480449, 0x18ed2825, 0x48a5ca6c},
5180   /* 54 */ {5, 0.1737653428714400, 0x1b5e4d60, 0x2b52db16},
5181   /* 55 */ {5, 0.1729696904450771, 0x1dff8297, 0x111586a6},
5182   /* 56 */ {5, 0.1721954337940981, 0x20d38000, 0xf31d2b36},
5183   /* 57 */ {5, 0.1714416005739134, 0x23dd1799, 0xc8d76d19},
5184   /* 58 */ {5, 0.1707072796637201, 0x271f35a0, 0xa2cb1eb4},
5185   /* 59 */ {5, 0.1699916162869140, 0x2a9ce10b, 0x807c3ec3},
5186   /* 60 */ {5, 0.1692938075987814, 0x2e593c00, 0x617ec8bf},
5187   /* 61 */ {5, 0.1686130986895011, 0x3257844d, 0x45746cbe},
5188   /* 62 */ {5, 0.1679487789570419, 0x369b13e0, 0x2c0aa273},
5189   /* 63 */ {5, 0.1673001788101741, 0x3b27613f, 0x14f90805},
5190   /* 64 */ {5, 0.1666666666666667, 0x6, 0x0},
5191   /* 65 */ {5, 0.1660476462159378, 0x4528a141, 0xd9cf0829},
5192   /* 66 */ {5, 0.1654425539190583, 0x4aa51420, 0xb6fc4841},
5193   /* 67 */ {5, 0.1648508567221604, 0x50794633, 0x973054cb},
5194   /* 68 */ {5, 0.1642720499620502, 0x56a94400, 0x7a1dbe4b},
5195   /* 69 */ {5, 0.1637056554452156, 0x5d393975, 0x5f7fcd7f},
5196   /* 70 */ {5, 0.1631512196835108, 0x642d7260, 0x47196c84},
5197   /* 71 */ {5, 0.1626083122716341, 0x6b8a5ae7, 0x30b43635},
5198   /* 72 */ {5, 0.1620765243931223, 0x73548000, 0x1c1fa5f6},
5199   /* 73 */ {5, 0.1615554674429964, 0x7b908fe9, 0x930634a},
5200   /* 74 */ {5, 0.1610447717564445, 0x84435aa0, 0xef7f4a3c},
5201   /* 75 */ {5, 0.1605440854340214, 0x8d71d25b, 0xcf5552d2},
5202   /* 76 */ {5, 0.1600530732548213, 0x97210c00, 0xb1a47c8e},
5203   /* 77 */ {5, 0.1595714156699382, 0xa1563f9d, 0x9634b43e},
5204   /* 78 */ {5, 0.1590988078692941, 0xac16c8e0, 0x7cd3817d},
5205   /* 79 */ {5, 0.1586349589155960, 0xb768278f, 0x65536761},
5206   /* 80 */ {5, 0.1581795909397823, 0xc3500000, 0x4f8b588e},
5207   /* 81 */ {5, 0.1577324383928644, 0xcfd41b91, 0x3b563c24},
5208   /* 82 */ {5, 0.1572932473495469, 0xdcfa6920, 0x28928154},
5209   /* 83 */ {5, 0.1568617748594410, 0xeac8fd83, 0x1721bfb0},
5210   /* 84 */ {5, 0.1564377883420716, 0xf9461400, 0x6e8629d},
5211   /* 85 */ {4, 0.1560210650222250, 0x31c84b1, 0x491cc17c},
5212   /* 86 */ {4, 0.1556113914024940, 0x342ab10, 0x3a11d83b},
5213   /* 87 */ {4, 0.1552085627701551, 0x36a2c21, 0x2be074cd},
5214   /* 88 */ {4, 0.1548123827357682, 0x3931000, 0x1e7a02e7},
5215   /* 89 */ {4, 0.1544226628011101, 0x3bd5ee1, 0x11d10edd},
5216   /* 90 */ {4, 0.1540392219542636, 0x3e92110, 0x5d92c68},
5217   /* 91 */ {4, 0.1536618862898642, 0x4165ef1, 0xf50dbfb2},
5218   /* 92 */ {4, 0.1532904886526781, 0x4452100, 0xdf9f1316},
5219   /* 93 */ {4, 0.1529248683028321, 0x4756fd1, 0xcb52a684},
5220   /* 94 */ {4, 0.1525648706011593, 0x4a75410, 0xb8163e97},
5221   /* 95 */ {4, 0.1522103467132434, 0x4dad681, 0xa5d8f269},
5222   /* 96 */ {4, 0.1518611533308632, 0x5100000, 0x948b0fcd},
5223   /* 97 */ {4, 0.1515171524096389, 0x546d981, 0x841e0215},
5224   /* 98 */ {4, 0.1511782109217764, 0x57f6c10, 0x74843b1e},
5225   /* 99 */ {4, 0.1508442006228941, 0x5b9c0d1, 0x65b11e6e},
5226   /* 100 */ {4, 0.1505149978319906, 0x5f5e100, 0x5798ee23},
5227   /* 101 */ {4, 0.1501904832236879, 0x633d5f1, 0x4a30b99b},
5228   /* 102 */ {4, 0.1498705416319474, 0x673a910, 0x3d6e4d94},
5229   /* 103 */ {4, 0.1495550618645152, 0x6b563e1, 0x314825b0},
5230   /* 104 */ {4, 0.1492439365274121, 0x6f91000, 0x25b55f2e},
5231   /* 105 */ {4, 0.1489370618588283, 0x73eb721, 0x1aadaccb},
5232   /* 106 */ {4, 0.1486343375718350, 0x7866310, 0x10294ba2},
5233   /* 107 */ {4, 0.1483356667053617, 0x7d01db1, 0x620f8f6},
5234   /* 108 */ {4, 0.1480409554829326, 0x81bf100, 0xf91bd1b6},
5235   /* 109 */ {4, 0.1477501131786861, 0x869e711, 0xe6d37b2a},
5236   /* 110 */ {4, 0.1474630519902391, 0x8ba0a10, 0xd55cff6e},
5237   /* 111 */ {4, 0.1471796869179852, 0x90c6441, 0xc4ad2db2},
5238   /* 112 */ {4, 0.1468999356504447, 0x9610000, 0xb4b985cf},
5239   /* 113 */ {4, 0.1466237184553111, 0x9b7e7c1, 0xa5782bef},
5240   /* 114 */ {4, 0.1463509580758620, 0xa112610, 0x96dfdd2a},
5241   /* 115 */ {4, 0.1460815796324244, 0xa6cc591, 0x88e7e509},
5242   /* 116 */ {4, 0.1458155105286054, 0xacad100, 0x7b8813d3},
5243   /* 117 */ {4, 0.1455526803620167, 0xb2b5331, 0x6eb8b595},
5244   /* 118 */ {4, 0.1452930208392428, 0xb8e5710, 0x627289db},
5245   /* 119 */ {4, 0.1450364656948130, 0xbf3e7a1, 0x56aebc07},
5246   /* 120 */ {4, 0.1447829506139581, 0xc5c1000, 0x4b66dc33},
5247   /* 121 */ {4, 0.1445324131589439, 0xcc6db61, 0x4094d8a3},
5248   /* 122 */ {4, 0.1442847926987864, 0xd345510, 0x3632f7a5},
5249   /* 123 */ {4, 0.1440400303421672, 0xda48871, 0x2c3bd1f0},
5250   /* 124 */ {4, 0.1437980688733775, 0xe178100, 0x22aa4d5f},
5251   /* 125 */ {4, 0.1435588526911310, 0xe8d4a51, 0x19799812},
5252   /* 126 */ {4, 0.1433223277500932, 0xf05f010, 0x10a523e5},
5253   /* 127 */ {4, 0.1430884415049874, 0xf817e01, 0x828a237},
5254   /* 128 */ {4, 0.1428571428571428, 0x7, 0x0},
5255   /* 129 */ {4, 0.1426283821033600, 0x10818201, 0xf04ec452},
5256   /* 130 */ {4, 0.1424021108869747, 0x11061010, 0xe136444a},
5257   /* 131 */ {4, 0.1421782821510107, 0x118db651, 0xd2af9589},
5258   /* 132 */ {4, 0.1419568500933153, 0x12188100, 0xc4b42a83},
5259   /* 133 */ {4, 0.1417377701235801, 0x12a67c71, 0xb73dccf5},
5260   /* 134 */ {4, 0.1415209988221527, 0x1337b510, 0xaa4698c5},
5261   /* 135 */ {4, 0.1413064939005528, 0x13cc3761, 0x9dc8f729},
5262   /* 136 */ {4, 0.1410942141636095, 0x14641000, 0x91bf9a30},
5263   /* 137 */ {4, 0.1408841194731412, 0x14ff4ba1, 0x86257887},
5264   /* 138 */ {4, 0.1406761707131039, 0x159df710, 0x7af5c98c},
5265   /* 139 */ {4, 0.1404703297561400, 0x16401f31, 0x702c01a0},
5266   /* 140 */ {4, 0.1402665594314587, 0x16e5d100, 0x65c3ceb1},
5267   /* 141 */ {4, 0.1400648234939879, 0x178f1991, 0x5bb91502},
5268   /* 142 */ {4, 0.1398650865947379, 0x183c0610, 0x5207ec23},
5269   /* 143 */ {4, 0.1396673142523192, 0x18eca3c1, 0x48ac9c19},
5270   /* 144 */ {4, 0.1394714728255649, 0x19a10000, 0x3fa39ab5},
5271   /* 145 */ {4, 0.1392775294872041, 0x1a592841, 0x36e98912},
5272   /* 146 */ {4, 0.1390854521985406, 0x1b152a10, 0x2e7b3140},
5273   /* 147 */ {4, 0.1388952096850913, 0x1bd51311, 0x2655840b},
5274   /* 148 */ {4, 0.1387067714131417, 0x1c98f100, 0x1e7596ea},
5275   /* 149 */ {4, 0.1385201075671774, 0x1d60d1b1, 0x16d8a20d},
5276   /* 150 */ {4, 0.1383351890281539, 0x1e2cc310, 0xf7bfe87},
5277   /* 151 */ {4, 0.1381519873525671, 0x1efcd321, 0x85d2492},
5278   /* 152 */ {4, 0.1379704747522905, 0x1fd11000, 0x179a9f4},
5279   /* 153 */ {4, 0.1377906240751463, 0x20a987e1, 0xf59e80eb},
5280   /* 154 */ {4, 0.1376124087861776, 0x21864910, 0xe8b768db},
5281   /* 155 */ {4, 0.1374358029495937, 0x226761f1, 0xdc39d6d5},
5282   /* 156 */ {4, 0.1372607812113589, 0x234ce100, 0xd021c5d1},
5283   /* 157 */ {4, 0.1370873187823978, 0x2436d4d1, 0xc46b5e37},
5284   /* 158 */ {4, 0.1369153914223921, 0x25254c10, 0xb912f39c},
5285   /* 159 */ {4, 0.1367449754241439, 0x26185581, 0xae150294},
5286   /* 160 */ {4, 0.1365760475984821, 0x27100000, 0xa36e2eb1},
5287   /* 161 */ {4, 0.1364085852596902, 0x280c5a81, 0x991b4094},
5288   /* 162 */ {4, 0.1362425662114337, 0x290d7410, 0x8f19241e},
5289   /* 163 */ {4, 0.1360779687331669, 0x2a135bd1, 0x8564e6b7},
5290   /* 164 */ {4, 0.1359147715670014, 0x2b1e2100, 0x7bfbb5b4},
5291   /* 165 */ {4, 0.1357529539050150, 0x2c2dd2f1, 0x72dadcc8},
5292   /* 166 */ {4, 0.1355924953769863, 0x2d428110, 0x69ffc498},
5293   /* 167 */ {4, 0.1354333760385373, 0x2e5c3ae1, 0x6167f154},
5294   /* 168 */ {4, 0.1352755763596663, 0x2f7b1000, 0x5911016e},
5295   /* 169 */ {4, 0.1351190772136599, 0x309f1021, 0x50f8ac5f},
5296   /* 170 */ {4, 0.1349638598663645, 0x31c84b10, 0x491cc17c},
5297   /* 171 */ {4, 0.1348099059658079, 0x32f6d0b1, 0x417b26d8},
5298   /* 172 */ {4, 0.1346571975321549, 0x342ab100, 0x3a11d83b},
5299   /* 173 */ {4, 0.1345057169479844, 0x3563fc11, 0x32dee622},
5300   /* 174 */ {4, 0.1343554469488779, 0x36a2c210, 0x2be074cd},
5301   /* 175 */ {4, 0.1342063706143054, 0x37e71341, 0x2514bb58},
5302   /* 176 */ {4, 0.1340584713587980, 0x39310000, 0x1e7a02e7},
5303   /* 177 */ {4, 0.1339117329233981, 0x3a8098c1, 0x180ea5d0},
5304   /* 178 */ {4, 0.1337661393673756, 0x3bd5ee10, 0x11d10edd},
5305   /* 179 */ {4, 0.1336216750601996, 0x3d311091, 0xbbfb88e},
5306   /* 180 */ {4, 0.1334783246737591, 0x3e921100, 0x5d92c68},
5307   /* 181 */ {4, 0.1333360731748201, 0x3ff90031, 0x1c024c},
5308   /* 182 */ {4, 0.1331949058177136, 0x4165ef10, 0xf50dbfb2},
5309   /* 183 */ {4, 0.1330548081372441, 0x42d8eea1, 0xea30efa3},
5310   /* 184 */ {4, 0.1329157659418126, 0x44521000, 0xdf9f1316},
5311   /* 185 */ {4, 0.1327777653067443, 0x45d16461, 0xd555c0c9},
5312   /* 186 */ {4, 0.1326407925678156, 0x4756fd10, 0xcb52a684},
5313   /* 187 */ {4, 0.1325048343149731, 0x48e2eb71, 0xc193881f},
5314   /* 188 */ {4, 0.1323698773862368, 0x4a754100, 0xb8163e97},
  /* 189 */ {4, 0.1322359088617821, 0x4c0e0f51, 0xaed8b724},
  /* 190 */ {4, 0.1321029160581950, 0x4dad6810, 0xa5d8f269},
  /* 191 */ {4, 0.1319708865228925, 0x4f535d01, 0x9d15039d},
  /* 192 */ {4, 0.1318398080287045, 0x51000000, 0x948b0fcd},
  /* 193 */ {4, 0.1317096685686114, 0x52b36301, 0x8c394d1d},
  /* 194 */ {4, 0.1315804563506306, 0x546d9810, 0x841e0215},
  /* 195 */ {4, 0.1314521597928493, 0x562eb151, 0x7c3784f8},
  /* 196 */ {4, 0.1313247675185968, 0x57f6c100, 0x74843b1e},
  /* 197 */ {4, 0.1311982683517524, 0x59c5d971, 0x6d02985d},
  /* 198 */ {4, 0.1310726513121843, 0x5b9c0d10, 0x65b11e6e},
  /* 199 */ {4, 0.1309479056113158, 0x5d796e61, 0x5e8e5c64},
  /* 200 */ {4, 0.1308240206478128, 0x5f5e1000, 0x5798ee23},
  /* 201 */ {4, 0.1307009860033912, 0x614a04a1, 0x50cf7bde},
  /* 202 */ {4, 0.1305787914387386, 0x633d5f10, 0x4a30b99b},
  /* 203 */ {4, 0.1304574268895465, 0x65383231, 0x43bb66bd},
  /* 204 */ {4, 0.1303368824626505, 0x673a9100, 0x3d6e4d94},
  /* 205 */ {4, 0.1302171484322746, 0x69448e91, 0x374842ee},
  /* 206 */ {4, 0.1300982152363760, 0x6b563e10, 0x314825b0},
  /* 207 */ {4, 0.1299800734730872, 0x6d6fb2c1, 0x2b6cde75},
  /* 208 */ {4, 0.1298627138972530, 0x6f910000, 0x25b55f2e},
  /* 209 */ {4, 0.1297461274170591, 0x71ba3941, 0x2020a2c5},
  /* 210 */ {4, 0.1296303050907487, 0x73eb7210, 0x1aadaccb},
  /* 211 */ {4, 0.1295152381234257, 0x7624be11, 0x155b891f},
  /* 212 */ {4, 0.1294009178639407, 0x78663100, 0x10294ba2},
  /* 213 */ {4, 0.1292873358018581, 0x7aafdeb1, 0xb160fe9},
  /* 214 */ {4, 0.1291744835645007, 0x7d01db10, 0x620f8f6},
  /* 215 */ {4, 0.1290623529140715, 0x7f5c3a21, 0x14930ef},
  /* 216 */ {4, 0.1289509357448472, 0x81bf1000, 0xf91bd1b6},
  /* 217 */ {4, 0.1288402240804449, 0x842a70e1, 0xefdcb0c7},
  /* 218 */ {4, 0.1287302100711567, 0x869e7110, 0xe6d37b2a},
  /* 219 */ {4, 0.1286208859913518, 0x891b24f1, 0xddfeb94a},
  /* 220 */ {4, 0.1285122442369443, 0x8ba0a100, 0xd55cff6e},
  /* 221 */ {4, 0.1284042773229231, 0x8e2ef9d1, 0xcceced50},
  /* 222 */ {4, 0.1282969778809442, 0x90c64410, 0xc4ad2db2},
  /* 223 */ {4, 0.1281903386569819, 0x93669481, 0xbc9c75f9},
  /* 224 */ {4, 0.1280843525090381, 0x96100000, 0xb4b985cf},
  /* 225 */ {4, 0.1279790124049077, 0x98c29b81, 0xad0326c2},
  /* 226 */ {4, 0.1278743114199984, 0x9b7e7c10, 0xa5782bef},
  /* 227 */ {4, 0.1277702427352035, 0x9e43b6d1, 0x9e1771a9},
  /* 228 */ {4, 0.1276667996348261, 0xa1126100, 0x96dfdd2a},
  /* 229 */ {4, 0.1275639755045533, 0xa3ea8ff1, 0x8fd05c41},
  /* 230 */ {4, 0.1274617638294791, 0xa6cc5910, 0x88e7e509},
  /* 231 */ {4, 0.1273601581921741, 0xa9b7d1e1, 0x8225759d},
  /* 232 */ {4, 0.1272591522708010, 0xacad1000, 0x7b8813d3},
  /* 233 */ {4, 0.1271587398372755, 0xafac2921, 0x750eccf9},
  /* 234 */ {4, 0.1270589147554692, 0xb2b53310, 0x6eb8b595},
  /* 235 */ {4, 0.1269596709794558, 0xb5c843b1, 0x6884e923},
  /* 236 */ {4, 0.1268610025517973, 0xb8e57100, 0x627289db},
  /* 237 */ {4, 0.1267629036018709, 0xbc0cd111, 0x5c80c07b},
  /* 238 */ {4, 0.1266653683442337, 0xbf3e7a10, 0x56aebc07},
  /* 239 */ {4, 0.1265683910770258, 0xc27a8241, 0x50fbb19b},
  /* 240 */ {4, 0.1264719661804097, 0xc5c10000, 0x4b66dc33},
  /* 241 */ {4, 0.1263760881150453, 0xc91209c1, 0x45ef7c7c},
  /* 242 */ {4, 0.1262807514205999, 0xcc6db610, 0x4094d8a3},
  /* 243 */ {4, 0.1261859507142915, 0xcfd41b91, 0x3b563c24},
  /* 244 */ {4, 0.1260916806894653, 0xd3455100, 0x3632f7a5},
  /* 245 */ {4, 0.1259979361142023, 0xd6c16d31, 0x312a60c3},
  /* 246 */ {4, 0.1259047118299582, 0xda488710, 0x2c3bd1f0},
  /* 247 */ {4, 0.1258120027502338, 0xdddab5a1, 0x2766aa45},
  /* 248 */ {4, 0.1257198038592741, 0xe1781000, 0x22aa4d5f},
  /* 249 */ {4, 0.1256281102107963, 0xe520ad61, 0x1e06233c},
  /* 250 */ {4, 0.1255369169267456, 0xe8d4a510, 0x19799812},
  /* 251 */ {4, 0.1254462191960791, 0xec940e71, 0x15041c33},
  /* 252 */ {4, 0.1253560122735751, 0xf05f0100, 0x10a523e5},
  /* 253 */ {4, 0.1252662914786691, 0xf4359451, 0xc5c2749},
  /* 254 */ {4, 0.1251770521943144, 0xf817e010, 0x828a237},
  /* 255 */ {4, 0.1250882898658681, 0xfc05fc01, 0x40a1423},
};
#endif
#if BITS_PER_MP_LIMB == 64
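/* Each row mirrors the fields of `struct bases' (declared in gmp-impl.h),
   indexed by the conversion base.  As a reading aid (field descriptions
   hedged from the values themselves, not authoritative):
     chars_per_limb        -- digits in this base that always fit in one limb
     chars_per_bit_exactly -- log(2)/log(base)
     big_base              -- base**chars_per_limb; for power-of-2 bases,
                              log2(base) instead
     big_base_inverted     -- a fixed-point inverse of big_base, used for
                              division-by-multiplication
   E.g. base 10 below: 19 digits fit per 64-bit limb, and big_base is
   10**19 = 0x8ac7230489e80000.  */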
const struct bases __mp_bases[256] =
{
  /*  0 */ {0, 0.0, 0, 0},
  /*  1 */ {0, 1e38, 0, 0},
  /*  2 */ {64, 1.0000000000000000, CNST_LIMB(0x1), CNST_LIMB(0x0)},
  /*  3 */ {40, 0.6309297535714574, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
  /*  4 */ {32, 0.5000000000000000, CNST_LIMB(0x2), CNST_LIMB(0x0)},
  /*  5 */ {27, 0.4306765580733931, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)},
  /*  6 */ {24, 0.3868528072345416, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)},
  /*  7 */ {22, 0.3562071871080222, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)},
  /*  8 */ {21, 0.3333333333333334, CNST_LIMB(0x3), CNST_LIMB(0x0)},
  /*  9 */ {20, 0.3154648767857287, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
  /* 10 */ {19, 0.3010299956639811, CNST_LIMB(0x8ac7230489e80000), CNST_LIMB(0xd83c94fb6d2ac34a)},
  /* 11 */ {18, 0.2890648263178878, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)},
  /* 12 */ {17, 0.2789429456511298, CNST_LIMB(0x1eca170c00000000), CNST_LIMB(0xa10c2bec5da8f8f)},
  /* 13 */ {17, 0.2702381544273197, CNST_LIMB(0x780c7372621bd74d), CNST_LIMB(0x10f4becafe412ec3)},
  /* 14 */ {16, 0.2626495350371936, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)},
  /* 15 */ {16, 0.2559580248098155, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)},
  /* 16 */ {16, 0.2500000000000000, CNST_LIMB(0x4), CNST_LIMB(0x0)},
  /* 17 */ {15, 0.2446505421182260, CNST_LIMB(0x27b95e997e21d9f1), CNST_LIMB(0x9c71e11bab279323)},
  /* 18 */ {15, 0.2398124665681315, CNST_LIMB(0x5da0e1e53c5c8000), CNST_LIMB(0x5dfaa697ec6f6a1c)},
  /* 19 */ {15, 0.2354089133666382, CNST_LIMB(0xd2ae3299c1c4aedb), CNST_LIMB(0x3711783f6be7e9ec)},
  /* 20 */ {14, 0.2313782131597592, CNST_LIMB(0x16bcc41e90000000), CNST_LIMB(0x6849b86a12b9b01e)},
  /* 21 */ {14, 0.2276702486969530, CNST_LIMB(0x2d04b7fdd9c0ef49), CNST_LIMB(0x6bf097ba5ca5e239)},
  /* 22 */ {14, 0.2242438242175754, CNST_LIMB(0x5658597bcaa24000), CNST_LIMB(0x7b8015c8d7af8f08)},
  /* 23 */ {14, 0.2210647294575037, CNST_LIMB(0xa0e2073737609371), CNST_LIMB(0x975a24b3a3151b38)},
  /* 24 */ {13, 0.2181042919855316, CNST_LIMB(0xc29e98000000000), CNST_LIMB(0x50bd367972689db1)},
  /* 25 */ {13, 0.2153382790366965, CNST_LIMB(0x14adf4b7320334b9), CNST_LIMB(0x8c240c4aecb13bb5)},
  /* 26 */ {13, 0.2127460535533632, CNST_LIMB(0x226ed36478bfa000), CNST_LIMB(0xdbd2e56854e118c9)},
  /* 27 */ {13, 0.2103099178571525, CNST_LIMB(0x383d9170b85ff80b), CNST_LIMB(0x2351ffcaa9c7c4ae)},
  /* 28 */ {13, 0.2080145976765095, CNST_LIMB(0x5a3c23e39c000000), CNST_LIMB(0x6b24188ca33b0636)},
  /* 29 */ {13, 0.2058468324604344, CNST_LIMB(0x8e65137388122bcd), CNST_LIMB(0xcc3dceaf2b8ba99d)},
  /* 30 */ {13, 0.2037950470905062, CNST_LIMB(0xdd41bb36d259e000), CNST_LIMB(0x2832e835c6c7d6b6)},
  /* 31 */ {12, 0.2018490865820999, CNST_LIMB(0xaee5720ee830681), CNST_LIMB(0x76b6aa272e1873c5)},
  /* 32 */ {12, 0.2000000000000000, CNST_LIMB(0x5), CNST_LIMB(0x0)},
  /* 33 */ {12, 0.1982398631705605, CNST_LIMB(0x172588ad4f5f0981), CNST_LIMB(0x61eaf5d402c7bf4f)},
  /* 34 */ {12, 0.1965616322328226, CNST_LIMB(0x211e44f7d02c1000), CNST_LIMB(0xeeb658123ffb27ec)},
  /* 35 */ {12, 0.1949590218937863, CNST_LIMB(0x2ee56725f06e5c71), CNST_LIMB(0x5d5e3762e6fdf509)},
  /* 36 */ {12, 0.1934264036172708, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)},
  /* 37 */ {12, 0.1919587200065601, CNST_LIMB(0x5b5b57f8a98a5dd1), CNST_LIMB(0x66ae7831762efb6f)},
  /* 38 */ {12, 0.1905514124267734, CNST_LIMB(0x7dcff8986ea31000), CNST_LIMB(0x47388865a00f544)},
  /* 39 */ {12, 0.1892003595168700, CNST_LIMB(0xabd4211662a6b2a1), CNST_LIMB(0x7d673c33a123b54c)},
  /* 40 */ {12, 0.1879018247091076, CNST_LIMB(0xe8d4a51000000000), CNST_LIMB(0x19799812dea11197)},
  /* 41 */ {11, 0.1866524112389434, CNST_LIMB(0x7a32956ad081b79), CNST_LIMB(0xc27e62e0686feae)},
  /* 42 */ {11, 0.1854490234153689, CNST_LIMB(0x9f49aaff0e86800), CNST_LIMB(0x9b6e7507064ce7c7)},
  /* 43 */ {11, 0.1842888331487062, CNST_LIMB(0xce583bb812d37b3), CNST_LIMB(0x3d9ac2bf66cfed94)},
  /* 44 */ {11, 0.1831692509136336, CNST_LIMB(0x109b79a654c00000), CNST_LIMB(0xed46bc50ce59712a)},
  /* 45 */ {11, 0.1820879004699383, CNST_LIMB(0x1543beff214c8b95), CNST_LIMB(0x813d97e2c89b8d46)},
  /* 46 */ {11, 0.1810425967800402, CNST_LIMB(0x1b149a79459a3800), CNST_LIMB(0x2e81751956af8083)},
  /* 47 */ {11, 0.1800313266566926, CNST_LIMB(0x224edfb5434a830f), CNST_LIMB(0xdd8e0a95e30c0988)},
  /* 48 */ {11, 0.1790522317510413, CNST_LIMB(0x2b3fb00000000000), CNST_LIMB(0x7ad4dd48a0b5b167)},
  /* 49 */ {11, 0.1781035935540111, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)},
  /* 50 */ {11, 0.1771838201355579, CNST_LIMB(0x43c33c1937564800), CNST_LIMB(0xe392010175ee5962)},
  /* 51 */ {11, 0.1762914343888821, CNST_LIMB(0x54411b2441c3cd8b), CNST_LIMB(0x84eaf11b2fe7738e)},
  /* 52 */ {11, 0.1754250635819545, CNST_LIMB(0x6851455acd400000), CNST_LIMB(0x3a1e3971e008995d)},
  /* 53 */ {11, 0.1745834300480449, CNST_LIMB(0x80a23b117c8feb6d), CNST_LIMB(0xfd7a462344ffce25)},
  /* 54 */ {11, 0.1737653428714400, CNST_LIMB(0x9dff7d32d5dc1800), CNST_LIMB(0x9eca40b40ebcef8a)},
  /* 55 */ {11, 0.1729696904450771, CNST_LIMB(0xc155af6faeffe6a7), CNST_LIMB(0x52fa161a4a48e43d)},
  /* 56 */ {11, 0.1721954337940981, CNST_LIMB(0xebb7392e00000000), CNST_LIMB(0x1607a2cbacf930c1)},
  /* 57 */ {10, 0.1714416005739134, CNST_LIMB(0x50633659656d971), CNST_LIMB(0x97a014f8e3be55f1)},
  /* 58 */ {10, 0.1707072796637201, CNST_LIMB(0x5fa8624c7fba400), CNST_LIMB(0x568df8b76cbf212c)},
  /* 59 */ {10, 0.1699916162869140, CNST_LIMB(0x717d9faa73c5679), CNST_LIMB(0x20ba7c4b4e6ef492)},
  /* 60 */ {10, 0.1692938075987814, CNST_LIMB(0x86430aac6100000), CNST_LIMB(0xe81ee46b9ef492f5)},
  /* 61 */ {10, 0.1686130986895011, CNST_LIMB(0x9e64d9944b57f29), CNST_LIMB(0x9dc0d10d51940416)},
  /* 62 */ {10, 0.1679487789570419, CNST_LIMB(0xba5ca5392cb0400), CNST_LIMB(0x5fa8ed2f450272a5)},
  /* 63 */ {10, 0.1673001788101741, CNST_LIMB(0xdab2ce1d022cd81), CNST_LIMB(0x2ba9eb8c5e04e641)},
  /* 64 */ {10, 0.1666666666666667, CNST_LIMB(0x6), CNST_LIMB(0x0)},
  /* 65 */ {10, 0.1660476462159378, CNST_LIMB(0x12aeed5fd3e2d281), CNST_LIMB(0xb67759cc00287bf1)},
  /* 66 */ {10, 0.1654425539190583, CNST_LIMB(0x15c3da1572d50400), CNST_LIMB(0x78621feeb7f4ed33)},
  /* 67 */ {10, 0.1648508567221604, CNST_LIMB(0x194c05534f75ee29), CNST_LIMB(0x43d55b5f72943bc0)},
  /* 68 */ {10, 0.1642720499620502, CNST_LIMB(0x1d56299ada100000), CNST_LIMB(0x173decb64d1d4409)},
  /* 69 */ {10, 0.1637056554452156, CNST_LIMB(0x21f2a089a4ff4f79), CNST_LIMB(0xe29fb54fd6b6074f)},
  /* 70 */ {10, 0.1631512196835108, CNST_LIMB(0x2733896c68d9a400), CNST_LIMB(0xa1f1f5c210d54e62)},
  /* 71 */ {10, 0.1626083122716341, CNST_LIMB(0x2d2cf2c33b533c71), CNST_LIMB(0x6aac7f9bfafd57b2)},
  /* 72 */ {10, 0.1620765243931223, CNST_LIMB(0x33f506e440000000), CNST_LIMB(0x3b563c2478b72ee2)},
  /* 73 */ {10, 0.1615554674429964, CNST_LIMB(0x3ba43bec1d062211), CNST_LIMB(0x12b536b574e92d1b)},
  /* 74 */ {10, 0.1610447717564444, CNST_LIMB(0x4455872d8fd4e400), CNST_LIMB(0xdf86c03020404fa5)},
  /* 75 */ {10, 0.1605440854340214, CNST_LIMB(0x4e2694539f2f6c59), CNST_LIMB(0xa34adf02234eea8e)},
  /* 76 */ {10, 0.1600530732548213, CNST_LIMB(0x5938006c18900000), CNST_LIMB(0x6f46eb8574eb59dd)},
  /* 77 */ {10, 0.1595714156699382, CNST_LIMB(0x65ad9912474aa649), CNST_LIMB(0x42459b481df47cec)},
  /* 78 */ {10, 0.1590988078692941, CNST_LIMB(0x73ae9ff4241ec400), CNST_LIMB(0x1b424b95d80ca505)},
  /* 79 */ {10, 0.1586349589155960, CNST_LIMB(0x836612ee9c4ce1e1), CNST_LIMB(0xf2c1b982203a0dac)},
  /* 80 */ {10, 0.1581795909397823, CNST_LIMB(0x9502f90000000000), CNST_LIMB(0xb7cdfd9d7bdbab7d)},
  /* 81 */ {10, 0.1577324383928644, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
  /* 82 */ {10, 0.1572932473495469, CNST_LIMB(0xbebf59a07dab4400), CNST_LIMB(0x57931eeaf85cf64f)},
  /* 83 */ {10, 0.1568617748594410, CNST_LIMB(0xd7540d4093bc3109), CNST_LIMB(0x305a944507c82f47)},
  /* 84 */ {10, 0.1564377883420716, CNST_LIMB(0xf2b96616f1900000), CNST_LIMB(0xe007ccc9c22781a)},
  /* 85 */ {9, 0.1560210650222250, CNST_LIMB(0x336de62af2bca35), CNST_LIMB(0x3e92c42e000eeed4)},
  /* 86 */ {9, 0.1556113914024940, CNST_LIMB(0x39235ec33d49600), CNST_LIMB(0x1ebe59130db2795e)},
  /* 87 */ {9, 0.1552085627701551, CNST_LIMB(0x3f674e539585a17), CNST_LIMB(0x268859e90f51b89)},
  /* 88 */ {9, 0.1548123827357682, CNST_LIMB(0x4645b6958000000), CNST_LIMB(0xd24cde0463108cfa)},
  /* 89 */ {9, 0.1544226628011101, CNST_LIMB(0x4dcb74afbc49c19), CNST_LIMB(0xa536009f37adc383)},
  /* 90 */ {9, 0.1540392219542636, CNST_LIMB(0x56064e1d18d9a00), CNST_LIMB(0x7cea06ce1c9ace10)},
  /* 91 */ {9, 0.1536618862898642, CNST_LIMB(0x5f04fe2cd8a39fb), CNST_LIMB(0x58db032e72e8ba43)},
  /* 92 */ {9, 0.1532904886526781, CNST_LIMB(0x68d74421f5c0000), CNST_LIMB(0x388cc17cae105447)},
  /* 93 */ {9, 0.1529248683028321, CNST_LIMB(0x738df1f6ab4827d), CNST_LIMB(0x1b92672857620ce0)},
  /* 94 */ {9, 0.1525648706011593, CNST_LIMB(0x7f3afbc9cfb5e00), CNST_LIMB(0x18c6a9575c2ade4)},
  /* 95 */ {9, 0.1522103467132434, CNST_LIMB(0x8bf187fba88f35f), CNST_LIMB(0xd44da7da8e44b24f)},
  /* 96 */ {9, 0.1518611533308632, CNST_LIMB(0x99c600000000000), CNST_LIMB(0xaa2f78f1b4cc6794)},
  /* 97 */ {9, 0.1515171524096389, CNST_LIMB(0xa8ce21eb6531361), CNST_LIMB(0x843c067d091ee4cc)},
  /* 98 */ {9, 0.1511782109217764, CNST_LIMB(0xb92112c1a0b6200), CNST_LIMB(0x62005e1e913356e3)},
  /* 99 */ {9, 0.1508442006228941, CNST_LIMB(0xcad7718b8747c43), CNST_LIMB(0x4316eed01dedd518)},
  /* 100 */ {9, 0.1505149978319906, CNST_LIMB(0xde0b6b3a7640000), CNST_LIMB(0x2725dd1d243aba0e)},
  /* 101 */ {9, 0.1501904832236879, CNST_LIMB(0xf2d8cf5fe6d74c5), CNST_LIMB(0xddd9057c24cb54f)},
  /* 102 */ {9, 0.1498705416319474, CNST_LIMB(0x1095d25bfa712600), CNST_LIMB(0xedeee175a736d2a1)},
  /* 103 */ {9, 0.1495550618645152, CNST_LIMB(0x121b7c4c3698faa7), CNST_LIMB(0xc4699f3df8b6b328)},
  /* 104 */ {9, 0.1492439365274121, CNST_LIMB(0x13c09e8d68000000), CNST_LIMB(0x9ebbe7d859cb5a7c)},
  /* 105 */ {9, 0.1489370618588283, CNST_LIMB(0x15876ccb0b709ca9), CNST_LIMB(0x7c828b9887eb2179)},
  /* 106 */ {9, 0.1486343375718350, CNST_LIMB(0x17723c2976da2a00), CNST_LIMB(0x5d652ab99001adcf)},
  /* 107 */ {9, 0.1483356667053617, CNST_LIMB(0x198384e9c259048b), CNST_LIMB(0x4114f1754e5d7b32)},
  /* 108 */ {9, 0.1480409554829326, CNST_LIMB(0x1bbde41dfeec0000), CNST_LIMB(0x274b7c902f7e0188)},
  /* 109 */ {9, 0.1477501131786861, CNST_LIMB(0x1e241d6e3337910d), CNST_LIMB(0xfc9e0fbb32e210c)},
  /* 110 */ {9, 0.1474630519902391, CNST_LIMB(0x20b91cee9901ee00), CNST_LIMB(0xf4afa3e594f8ea1f)},
  /* 111 */ {9, 0.1471796869179852, CNST_LIMB(0x237ff9079863dfef), CNST_LIMB(0xcd85c32e9e4437b0)},
  /* 112 */ {9, 0.1468999356504447, CNST_LIMB(0x267bf47000000000), CNST_LIMB(0xa9bbb147e0dd92a8)},
  /* 113 */ {9, 0.1466237184553111, CNST_LIMB(0x29b08039fbeda7f1), CNST_LIMB(0x8900447b70e8eb82)},
  /* 114 */ {9, 0.1463509580758620, CNST_LIMB(0x2d213df34f65f200), CNST_LIMB(0x6b0a92adaad5848a)},
  /* 115 */ {9, 0.1460815796324244, CNST_LIMB(0x30d201d957a7c2d3), CNST_LIMB(0x4f990ad8740f0ee5)},
  /* 116 */ {9, 0.1458155105286054, CNST_LIMB(0x34c6d52160f40000), CNST_LIMB(0x3670a9663a8d3610)},
  /* 117 */ {9, 0.1455526803620167, CNST_LIMB(0x3903f855d8f4c755), CNST_LIMB(0x1f5c44188057be3c)},
  /* 118 */ {9, 0.1452930208392428, CNST_LIMB(0x3d8de5c8ec59b600), CNST_LIMB(0xa2bea956c4e4977)},
  /* 119 */ {9, 0.1450364656948130, CNST_LIMB(0x4269541d1ff01337), CNST_LIMB(0xed68b23033c3637e)},
  /* 120 */ {9, 0.1447829506139581, CNST_LIMB(0x479b38e478000000), CNST_LIMB(0xc99cf624e50549c5)},
  /* 121 */ {9, 0.1445324131589439, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)},
  /* 122 */ {9, 0.1442847926987864, CNST_LIMB(0x5317871fa13aba00), CNST_LIMB(0x8a5bc740b1c113e5)},
  /* 123 */ {9, 0.1440400303421672, CNST_LIMB(0x596d2f44de9fa71b), CNST_LIMB(0x6e6c7efb81cfbb9b)},
  /* 124 */ {9, 0.1437980688733775, CNST_LIMB(0x602fd125c47c0000), CNST_LIMB(0x54aba5c5cada5f10)},
  /* 125 */ {9, 0.1435588526911310, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)},
  /* 126 */ {9, 0.1433223277500932, CNST_LIMB(0x6f15be069b847e00), CNST_LIMB(0x26fb43de2c8cd2a8)},
  /* 127 */ {9, 0.1430884415049874, CNST_LIMB(0x7746b3e82a77047f), CNST_LIMB(0x12b94793db8486a1)},
  /* 128 */ {9, 0.1428571428571428, CNST_LIMB(0x7), CNST_LIMB(0x0)},
  /* 129 */ {9, 0.1426283821033600, CNST_LIMB(0x894953f7ea890481), CNST_LIMB(0xdd5deca404c0156d)},
  /* 130 */ {9, 0.1424021108869747, CNST_LIMB(0x932abffea4848200), CNST_LIMB(0xbd51373330291de0)},
  /* 131 */ {9, 0.1421782821510107, CNST_LIMB(0x9dacb687d3d6a163), CNST_LIMB(0x9fa4025d66f23085)},
  /* 132 */ {9, 0.1419568500933153, CNST_LIMB(0xa8d8102a44840000), CNST_LIMB(0x842530ee2db4949d)},
  /* 133 */ {9, 0.1417377701235801, CNST_LIMB(0xb4b60f9d140541e5), CNST_LIMB(0x6aa7f2766b03dc25)},
  /* 134 */ {9, 0.1415209988221527, CNST_LIMB(0xc15065d4856e4600), CNST_LIMB(0x53035ba7ebf32e8d)},
  /* 135 */ {9, 0.1413064939005528, CNST_LIMB(0xceb1363f396d23c7), CNST_LIMB(0x3d12091fc9fb4914)},
  /* 136 */ {9, 0.1410942141636095, CNST_LIMB(0xdce31b2488000000), CNST_LIMB(0x28b1cb81b1ef1849)},
  /* 137 */ {9, 0.1408841194731412, CNST_LIMB(0xebf12a24bca135c9), CNST_LIMB(0x15c35be67ae3e2c9)},
  /* 138 */ {9, 0.1406761707131039, CNST_LIMB(0xfbe6f8dbf88f4a00), CNST_LIMB(0x42a17bd09be1ff0)},
  /* 139 */ {8, 0.1404703297561400, CNST_LIMB(0x1ef156c084ce761), CNST_LIMB(0x8bf461f03cf0bbf)},
  /* 140 */ {8, 0.1402665594314587, CNST_LIMB(0x20c4e3b94a10000), CNST_LIMB(0xf3fbb43f68a32d05)},
  /* 141 */ {8, 0.1400648234939879, CNST_LIMB(0x22b0695a08ba421), CNST_LIMB(0xd84f44c48564dc19)},
  /* 142 */ {8, 0.1398650865947379, CNST_LIMB(0x24b4f35d7a4c100), CNST_LIMB(0xbe58ebcce7956abe)},
  /* 143 */ {8, 0.1396673142523192, CNST_LIMB(0x26d397284975781), CNST_LIMB(0xa5fac463c7c134b7)},
  /* 144 */ {8, 0.1394714728255649, CNST_LIMB(0x290d74100000000), CNST_LIMB(0x8f19241e28c7d757)},
  /* 145 */ {8, 0.1392775294872041, CNST_LIMB(0x2b63b3a37866081), CNST_LIMB(0x799a6d046c0ae1ae)},
  /* 146 */ {8, 0.1390854521985406, CNST_LIMB(0x2dd789f4d894100), CNST_LIMB(0x6566e37d746a9e40)},
  /* 147 */ {8, 0.1388952096850913, CNST_LIMB(0x306a35e51b58721), CNST_LIMB(0x526887dbfb5f788f)},
  /* 148 */ {8, 0.1387067714131417, CNST_LIMB(0x331d01712e10000), CNST_LIMB(0x408af3382b8efd3d)},
  /* 149 */ {8, 0.1385201075671774, CNST_LIMB(0x35f14200a827c61), CNST_LIMB(0x2fbb374806ec05f1)},
  /* 150 */ {8, 0.1383351890281539, CNST_LIMB(0x38e858b62216100), CNST_LIMB(0x1fe7c0f0afce87fe)},
  /* 151 */ {8, 0.1381519873525671, CNST_LIMB(0x3c03b2c13176a41), CNST_LIMB(0x11003d517540d32e)},
  /* 152 */ {8, 0.1379704747522905, CNST_LIMB(0x3f44c9b21000000), CNST_LIMB(0x2f5810f98eff0dc)},
  /* 153 */ {8, 0.1377906240751463, CNST_LIMB(0x42ad23cef3113c1), CNST_LIMB(0xeb72e35e7840d910)},
  /* 154 */ {8, 0.1376124087861776, CNST_LIMB(0x463e546b19a2100), CNST_LIMB(0xd27de19593dc3614)},
  /* 155 */ {8, 0.1374358029495937, CNST_LIMB(0x49f9fc3f96684e1), CNST_LIMB(0xbaf391fd3e5e6fc2)},
  /* 156 */ {8, 0.1372607812113589, CNST_LIMB(0x4de1c9c5dc10000), CNST_LIMB(0xa4bd38c55228c81d)},
  /* 157 */ {8, 0.1370873187823978, CNST_LIMB(0x51f77994116d2a1), CNST_LIMB(0x8fc5a8de8e1de782)},
  /* 158 */ {8, 0.1369153914223921, CNST_LIMB(0x563cd6bb3398100), CNST_LIMB(0x7bf9265bea9d3a3b)},
  /* 159 */ {8, 0.1367449754241439, CNST_LIMB(0x5ab3bb270beeb01), CNST_LIMB(0x69454b325983dccd)},
  /* 160 */ {8, 0.1365760475984821, CNST_LIMB(0x5f5e10000000000), CNST_LIMB(0x5798ee2308c39df9)},
  /* 161 */ {8, 0.1364085852596902, CNST_LIMB(0x643dce0ec16f501), CNST_LIMB(0x46e40ba0fa66a753)},
  /* 162 */ {8, 0.1362425662114337, CNST_LIMB(0x6954fe21e3e8100), CNST_LIMB(0x3717b0870b0db3a7)},
  /* 163 */ {8, 0.1360779687331669, CNST_LIMB(0x6ea5b9755f440a1), CNST_LIMB(0x2825e6775d11cdeb)},
  /* 164 */ {8, 0.1359147715670014, CNST_LIMB(0x74322a1c0410000), CNST_LIMB(0x1a01a1c09d1b4dac)},
  /* 165 */ {8, 0.1357529539050150, CNST_LIMB(0x79fc8b6ae8a46e1), CNST_LIMB(0xc9eb0a8bebc8f3e)},
  /* 166 */ {8, 0.1355924953769863, CNST_LIMB(0x80072a66d512100), CNST_LIMB(0xffe357ff59e6a004)},
  /* 167 */ {8, 0.1354333760385373, CNST_LIMB(0x86546633b42b9c1), CNST_LIMB(0xe7dfd1be05fa61a8)},
  /* 168 */ {8, 0.1352755763596663, CNST_LIMB(0x8ce6b0861000000), CNST_LIMB(0xd11ed6fc78f760e5)},
  /* 169 */ {8, 0.1351190772136599, CNST_LIMB(0x93c08e16a022441), CNST_LIMB(0xbb8db609dd29ebfe)},
  /* 170 */ {8, 0.1349638598663645, CNST_LIMB(0x9ae49717f026100), CNST_LIMB(0xa71aec8d1813d532)},
  /* 171 */ {8, 0.1348099059658079, CNST_LIMB(0xa25577ae24c1a61), CNST_LIMB(0x93b612a9f20fbc02)},
  /* 172 */ {8, 0.1346571975321549, CNST_LIMB(0xaa15f068e610000), CNST_LIMB(0x814fc7b19a67d317)},
  /* 173 */ {8, 0.1345057169479844, CNST_LIMB(0xb228d6bf7577921), CNST_LIMB(0x6fd9a03f2e0a4b7c)},
  /* 174 */ {8, 0.1343554469488779, CNST_LIMB(0xba91158ef5c4100), CNST_LIMB(0x5f4615a38d0d316e)},
  /* 175 */ {8, 0.1342063706143054, CNST_LIMB(0xc351ad9aec0b681), CNST_LIMB(0x4f8876863479a286)},
  /* 176 */ {8, 0.1340584713587980, CNST_LIMB(0xcc6db6100000000), CNST_LIMB(0x4094d8a3041b60eb)},
  /* 177 */ {8, 0.1339117329233981, CNST_LIMB(0xd5e85d09025c181), CNST_LIMB(0x32600b8ed883a09b)},
  /* 178 */ {8, 0.1337661393673756, CNST_LIMB(0xdfc4e816401c100), CNST_LIMB(0x24df8c6eb4b6d1f1)},
  /* 179 */ {8, 0.1336216750601996, CNST_LIMB(0xea06b4c72947221), CNST_LIMB(0x18097a8ee151acef)},
  /* 180 */ {8, 0.1334783246737591, CNST_LIMB(0xf4b139365210000), CNST_LIMB(0xbd48cc8ec1cd8e3)},
  /* 181 */ {8, 0.1333360731748201, CNST_LIMB(0xffc80497d520961), CNST_LIMB(0x3807a8d67485fb)},
  /* 182 */ {8, 0.1331949058177136, CNST_LIMB(0x10b4ebfca1dee100), CNST_LIMB(0xea5768860b62e8d8)},
  /* 183 */ {8, 0.1330548081372441, CNST_LIMB(0x117492de921fc141), CNST_LIMB(0xd54faf5b635c5005)},
  /* 184 */ {8, 0.1329157659418126, CNST_LIMB(0x123bb2ce41000000), CNST_LIMB(0xc14a56233a377926)},
  /* 185 */ {8, 0.1327777653067443, CNST_LIMB(0x130a8b6157bdecc1), CNST_LIMB(0xae39a88db7cd329f)},
  /* 186 */ {8, 0.1326407925678156, CNST_LIMB(0x13e15dede0e8a100), CNST_LIMB(0x9c10bde69efa7ab6)},
  /* 187 */ {8, 0.1325048343149731, CNST_LIMB(0x14c06d941c0ca7e1), CNST_LIMB(0x8ac36c42a2836497)},
  /* 188 */ {8, 0.1323698773862368, CNST_LIMB(0x15a7ff487a810000), CNST_LIMB(0x7a463c8b84f5ef67)},
  /* 189 */ {8, 0.1322359088617821, CNST_LIMB(0x169859ddc5c697a1), CNST_LIMB(0x6a8e5f5ad090fd4b)},
  /* 190 */ {8, 0.1321029160581950, CNST_LIMB(0x1791c60f6fed0100), CNST_LIMB(0x5b91a2943596fc56)},
  /* 191 */ {8, 0.1319708865228925, CNST_LIMB(0x18948e8c0e6fba01), CNST_LIMB(0x4d4667b1c468e8f0)},
  /* 192 */ {8, 0.1318398080287045, CNST_LIMB(0x19a1000000000000), CNST_LIMB(0x3fa39ab547994daf)},
  /* 193 */ {8, 0.1317096685686114, CNST_LIMB(0x1ab769203dafc601), CNST_LIMB(0x32a0a9b2faee1e2a)},
  /* 194 */ {8, 0.1315804563506306, CNST_LIMB(0x1bd81ab557f30100), CNST_LIMB(0x26357ceac0e96962)},
  /* 195 */ {8, 0.1314521597928493, CNST_LIMB(0x1d0367a69fed1ba1), CNST_LIMB(0x1a5a6f65caa5859e)},
  /* 196 */ {8, 0.1313247675185968, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)},
  /* 197 */ {8, 0.1311982683517524, CNST_LIMB(0x1f7b2a18f29ac3e1), CNST_LIMB(0x4383340615612ca)},
  /* 198 */ {8, 0.1310726513121843, CNST_LIMB(0x20c850694c2aa100), CNST_LIMB(0xf3c77969ee4be5a2)},
  /* 199 */ {8, 0.1309479056113158, CNST_LIMB(0x222173cc014980c1), CNST_LIMB(0xe00993cc187c5ec9)},
  /* 200 */ {8, 0.1308240206478128, CNST_LIMB(0x2386f26fc1000000), CNST_LIMB(0xcd2b297d889bc2b6)},
  /* 201 */ {8, 0.1307009860033912, CNST_LIMB(0x24f92ce8af296d41), CNST_LIMB(0xbb214d5064862b22)},
  /* 202 */ {8, 0.1305787914387386, CNST_LIMB(0x2678863cd0ece100), CNST_LIMB(0xa9e1a7ca7ea10e20)},
  /* 203 */ {8, 0.1304574268895465, CNST_LIMB(0x280563f0a9472d61), CNST_LIMB(0x99626e72b39ea0cf)},
  /* 204 */ {8, 0.1303368824626505, CNST_LIMB(0x29a02e1406210000), CNST_LIMB(0x899a5ba9c13fafd9)},
  /* 205 */ {8, 0.1302171484322746, CNST_LIMB(0x2b494f4efe6d2e21), CNST_LIMB(0x7a80a705391e96ff)},
  /* 206 */ {8, 0.1300982152363760, CNST_LIMB(0x2d0134ef21cbc100), CNST_LIMB(0x6c0cfe23de23042a)},
  /* 207 */ {8, 0.1299800734730872, CNST_LIMB(0x2ec84ef4da2ef581), CNST_LIMB(0x5e377df359c944dd)},
  /* 208 */ {8, 0.1298627138972530, CNST_LIMB(0x309f102100000000), CNST_LIMB(0x50f8ac5fc8f53985)},
  /* 209 */ {8, 0.1297461274170591, CNST_LIMB(0x3285ee02a1420281), CNST_LIMB(0x44497266278e35b7)},
  /* 210 */ {8, 0.1296303050907487, CNST_LIMB(0x347d6104fc324100), CNST_LIMB(0x382316831f7ee175)},
  /* 211 */ {8, 0.1295152381234257, CNST_LIMB(0x3685e47dade53d21), CNST_LIMB(0x2c7f377833b8946e)},
  /* 212 */ {8, 0.1294009178639407, CNST_LIMB(0x389ff6bb15610000), CNST_LIMB(0x2157c761ab4163ef)},
  /* 213 */ {8, 0.1292873358018581, CNST_LIMB(0x3acc1912ebb57661), CNST_LIMB(0x16a7071803cc49a9)},
  /* 214 */ {8, 0.1291744835645007, CNST_LIMB(0x3d0acff111946100), CNST_LIMB(0xc6781d80f8224fc)},
  /* 215 */ {8, 0.1290623529140715, CNST_LIMB(0x3f5ca2e692eaf841), CNST_LIMB(0x294092d370a900b)},
  /* 216 */ {8, 0.1289509357448472, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)},
  /* 217 */ {8, 0.1288402240804449, CNST_LIMB(0x443bcb714399a5c1), CNST_LIMB(0xe03b98f103fad6d2)},
  /* 218 */ {8, 0.1287302100711567, CNST_LIMB(0x46ca406c81af2100), CNST_LIMB(0xcee3d32cad2a9049)},
  /* 219 */ {8, 0.1286208859913518, CNST_LIMB(0x496e106ac22aaae1), CNST_LIMB(0xbe3f9df9277fdada)},
  /* 220 */ {8, 0.1285122442369443, CNST_LIMB(0x4c27d39fa5410000), CNST_LIMB(0xae46f0d94c05e933)},
  /* 221 */ {8, 0.1284042773229231, CNST_LIMB(0x4ef825c296e43ca1), CNST_LIMB(0x9ef2280fb437a33d)},
  /* 222 */ {8, 0.1282969778809442, CNST_LIMB(0x51dfa61f5ad88100), CNST_LIMB(0x9039ff426d3f284b)},
  /* 223 */ {8, 0.1281903386569819, CNST_LIMB(0x54def7a6d2f16901), CNST_LIMB(0x82178c6d6b51f8f4)},
  /* 224 */ {8, 0.1280843525090381, CNST_LIMB(0x57f6c10000000000), CNST_LIMB(0x74843b1ee4c1e053)},
  /* 225 */ {8, 0.1279790124049077, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)},
  /* 226 */ {8, 0.1278743114199984, CNST_LIMB(0x5e7268b9bbdf8100), CNST_LIMB(0x5af23c74f9ad9fe9)},
  /* 227 */ {8, 0.1277702427352035, CNST_LIMB(0x61d7a7932ff3d6a1), CNST_LIMB(0x4ee7eae2acdc617e)},
  /* 228 */ {8, 0.1276667996348261, CNST_LIMB(0x65581f53c8c10000), CNST_LIMB(0x43556aa2ac262a0b)},
  /* 229 */ {8, 0.1275639755045533, CNST_LIMB(0x68f48a385b8320e1), CNST_LIMB(0x3835949593b8ddd1)},
  /* 230 */ {8, 0.1274617638294791, CNST_LIMB(0x6cada69ed07c2100), CNST_LIMB(0x2d837fbe78458762)},
  /* 231 */ {8, 0.1273601581921741, CNST_LIMB(0x70843718cdbf27c1), CNST_LIMB(0x233a7e150a54a555)},
  /* 232 */ {8, 0.1272591522708010, CNST_LIMB(0x7479027ea1000000), CNST_LIMB(0x19561984a50ff8fe)},
  /* 233 */ {8, 0.1271587398372755, CNST_LIMB(0x788cd40268f39641), CNST_LIMB(0xfd211159fe3490f)},
  /* 234 */ {8, 0.1270589147554692, CNST_LIMB(0x7cc07b437ecf6100), CNST_LIMB(0x6aa563e655033e3)},
  /* 235 */ {8, 0.1269596709794558, CNST_LIMB(0x8114cc6220762061), CNST_LIMB(0xfbb614b3f2d3b14c)},
  /* 236 */ {8, 0.1268610025517973, CNST_LIMB(0x858aa0135be10000), CNST_LIMB(0xeac0f8837fb05773)},
  /* 237 */ {8, 0.1267629036018709, CNST_LIMB(0x8a22d3b53c54c321), CNST_LIMB(0xda6e4c10e8615ca5)},
  /* 238 */ {8, 0.1266653683442337, CNST_LIMB(0x8ede496339f34100), CNST_LIMB(0xcab755a8d01fa67f)},
  /* 239 */ {8, 0.1265683910770258, CNST_LIMB(0x93bde80aec3a1481), CNST_LIMB(0xbb95a9ae71aa3e0c)},
  /* 240 */ {8, 0.1264719661804097, CNST_LIMB(0x98c29b8100000000), CNST_LIMB(0xad0326c296b4f529)},
  /* 241 */ {8, 0.1263760881150453, CNST_LIMB(0x9ded549671832381), CNST_LIMB(0x9ef9f21eed31b7c1)},
  /* 242 */ {8, 0.1262807514205999, CNST_LIMB(0xa33f092e0b1ac100), CNST_LIMB(0x91747422be14b0b2)},
  /* 243 */ {8, 0.1261859507142915, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
  /* 244 */ {8, 0.1260916806894653, CNST_LIMB(0xae5b564ac3a10000), CNST_LIMB(0x77df79e9a96c06f6)},
  /* 245 */ {8, 0.1259979361142023, CNST_LIMB(0xb427f4b3be74c361), CNST_LIMB(0x6bc6019636c7d0c2)},
  /* 246 */ {8, 0.1259047118299582, CNST_LIMB(0xba1f9a938041e100), CNST_LIMB(0x601c4205aebd9e47)},
  /* 247 */ {8, 0.1258120027502338, CNST_LIMB(0xc0435871d1110f41), CNST_LIMB(0x54ddc59756f05016)},
  /* 248 */ {8, 0.1257198038592741, CNST_LIMB(0xc694446f01000000), CNST_LIMB(0x4a0648979c838c18)},
  /* 249 */ {8, 0.1256281102107963, CNST_LIMB(0xcd137a5b57ac3ec1), CNST_LIMB(0x3f91b6e0bb3a053d)},
  /* 250 */ {8, 0.1255369169267456, CNST_LIMB(0xd3c21bcecceda100), CNST_LIMB(0x357c299a88ea76a5)},
  /* 251 */ {8, 0.1254462191960791, CNST_LIMB(0xdaa150410b788de1), CNST_LIMB(0x2bc1e517aecc56e3)},
  /* 252 */ {8, 0.1253560122735751, CNST_LIMB(0xe1b24521be010000), CNST_LIMB(0x225f56ceb3da9f5d)},
  /* 253 */ {8, 0.1252662914786691, CNST_LIMB(0xe8f62df12777c1a1), CNST_LIMB(0x1951136d53ad63ac)},
  /* 254 */ {8, 0.1251770521943144, CNST_LIMB(0xf06e445906fc0100), CNST_LIMB(0x1093d504b3cd7d93)},
  /* 255 */ {8, 0.1250882898658681, CNST_LIMB(0xf81bc845c81bf801), CNST_LIMB(0x824794d1ec1814f)},
};
#endif

static int
#if __STDC__
__gmp_assert_fail (const char *filename, int linenum,
                   const char *expr)
#else
__gmp_assert_fail (filename, linenum, expr)
char *filename;
int  linenum;
char *expr;
#endif
{
#if 0
  if (filename != NULL && filename[0] != '\0')
    {
      fprintf (stderr, "%s:", filename);
      if (linenum != -1)
        fprintf (stderr, "%d: ", linenum);
    }

  fprintf (stderr, "GNU MP assertion failed: %s\n", expr);
  abort();
#endif

  /*NOTREACHED*/
  return 0;
}

/* __clz_tab -- support for longlong.h */
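/* __clz_tab[i] is the number of bits needed to represent i, i.e.
   floor(log2(i)) + 1 for 1 <= i <= 255 (index 0 yields 0).  A minimal
   sketch of how a generic count_leading_zeros fallback can use it
   (illustrative only, for a nonzero 32-bit word w; not the exact
   longlong.h macro):

     int shift = (w >= (1 << 16)) ? ((w >= (1 << 24)) ? 24 : 16)
                                  : ((w >= (1 << 8))  ? 8  : 0);
     count = 32 - (__clz_tab[w >> shift] + shift);
*/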

const
unsigned char __clz_tab[] =
{
  0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
  8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
};


/****************************************/

#ifndef HAVE_ALLOCA

typedef struct gmp_tmp_stack tmp_stack;

THREAD_LOCAL_DECL(static uintptr_t max_total_allocation);
THREAD_LOCAL_DECL(static uintptr_t current_total_allocation);

THREAD_LOCAL_DECL(static tmp_stack gmp_tmp_xxx);
THREAD_LOCAL_DECL(static tmp_stack *gmp_tmp_current = 0);

/* The rounded size of the header of each allocation block.  */
#define HSIZ ((sizeof (tmp_stack) + __TMP_ALIGN - 1) & -__TMP_ALIGN)
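/* HSIZ rounds sizeof (tmp_stack) up to a multiple of __TMP_ALIGN: in
   two's complement, -__TMP_ALIGN == ~(__TMP_ALIGN - 1), so the AND
   clears the low bits after adding __TMP_ALIGN - 1.  This assumes
   __TMP_ALIGN is a power of 2.  For example (hypothetical sizes), with
   sizeof (tmp_stack) == 24 and __TMP_ALIGN == 16:
   (24 + 15) & ~15 == 39 & ~15 == 32.  */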

/* Allocate a block of exactly <size> bytes.  This should only be called
   through the TMP_ALLOC macro, which takes care of rounding/alignment.  */
void *
#if __STDC__
__gmp_tmp_alloc (unsigned long size)
#else
__gmp_tmp_alloc (size)
     unsigned long size;
#endif
{
  void *that;

  if (size > (char *) gmp_tmp_current->end - (char *) gmp_tmp_current->alloc_point)
    {
      void *chunk;
      tmp_stack *header;
      unsigned long chunk_size;
      unsigned long now;

      /* Allocate a chunk that makes the total current allocation somewhat
	 larger than the maximum allocation ever.  If size is very large, we
	 allocate that much.  */

      now = current_total_allocation + size;
      if (now > max_total_allocation)
	{
	  /* We need more temporary memory than ever before.  Increase
	     for future needs.  */
	  now = now * 3 / 2;
	  chunk_size = now - current_total_allocation + HSIZ;
	  current_total_allocation = now;
	  max_total_allocation = current_total_allocation;
	}
      else
	{
	  chunk_size = max_total_allocation - current_total_allocation + HSIZ;
	  current_total_allocation = max_total_allocation;
	}

      chunk = MALLOC (chunk_size);
      header = (tmp_stack *) chunk;
      header->end = (char *) chunk + chunk_size;
      header->alloc_point = (char *) chunk + HSIZ;
      header->prev = gmp_tmp_current;
      gmp_tmp_current = header;
    }

  that = gmp_tmp_current->alloc_point;
  gmp_tmp_current->alloc_point = (char *) that + size;
  return that;
}
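/* A worked example of the growth policy above (hypothetical numbers):
   with current_total_allocation == 1000, max_total_allocation == 1000
   and a request of size == 200, now == 1200 exceeds the old maximum,
   so now becomes 1800 and a chunk of 800 + HSIZ bytes is allocated;
   both totals end up at 1800.  If the current total later shrinks back
   to 400 (the maximum staying at 1800), the same request stays under
   the maximum, and a single chunk of 1400 + HSIZ bytes brings the
   total back to the high-water mark without growing it.  */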

/* Typically called at function entry.  <mark> is assigned so that
   __gmp_tmp_free can later be used to reclaim all subsequently allocated
   storage.  */
void
#if __STDC__
__gmp_tmp_mark (tmp_marker *mark)
#else
__gmp_tmp_mark (mark)
     tmp_marker *mark;
#endif
{
  mark->which_chunk = gmp_tmp_current;
  mark->alloc_point = gmp_tmp_current->alloc_point;
}

/* Free everything allocated since <mark> was assigned by __gmp_tmp_mark */
void
#if __STDC__
__gmp_tmp_free (tmp_marker *mark)
#else
__gmp_tmp_free (mark)
     tmp_marker *mark;
#endif
{
  while (mark->which_chunk != gmp_tmp_current)
    {
      tmp_stack *tmp;

      tmp = gmp_tmp_current;
      gmp_tmp_current = tmp->prev;
      current_total_allocation -= (((char *) (tmp->end) - (char *) tmp) - HSIZ);
      FREE (tmp, (char *) tmp->end - (char *) tmp);
    }
  gmp_tmp_current->alloc_point = mark->alloc_point;
}
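/* A minimal usage sketch of the three entry points above, roughly the
   pattern the TMP_* macros in gmp-impl.h expand to when alloca is not
   available (names and the rounding shown are illustrative):

     tmp_marker marker;
     void *p;
     __gmp_tmp_mark (&marker);
     p = __gmp_tmp_alloc ((size + __TMP_ALIGN - 1) & -__TMP_ALIGN);
     ...use p...
     __gmp_tmp_free (&marker);
*/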

void scheme_init_gmp_places() {
  gmp_tmp_xxx.end = &gmp_tmp_xxx;
  gmp_tmp_xxx.alloc_point = &gmp_tmp_xxx;
  gmp_tmp_xxx.prev = 0;
  gmp_tmp_current = &gmp_tmp_xxx;
  REGISTER_SO(gmp_mem_pool);
}

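/* The scheme_gmp_tls_* functions below save and restore this
   allocator's thread-local state through a caller-provided intptr_t
   array <s>, laid out as follows (read off from the loads and stores
   below):
     s[0]    -- current_total_allocation
     s[1]    -- max_total_allocation
     s[2]    -- gmp_tmp_current (the active chunk)
     s[3..4] -- a tmp_marker, used by the snapshot functions  */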
void scheme_gmp_tls_init(intptr_t *s)
{
  s[0] = 0;
  s[1] = 0;
  s[2] = (intptr_t)&gmp_tmp_xxx;
  ((tmp_marker *)(s + 3))->which_chunk = &gmp_tmp_xxx;
  ((tmp_marker *)(s + 3))->alloc_point = &gmp_tmp_xxx;
}

void *scheme_gmp_tls_load(intptr_t *s)
{
  s[0] = (intptr_t)current_total_allocation;
  s[1] = (intptr_t)max_total_allocation;
  s[2] = (intptr_t)gmp_tmp_current;
  return gmp_mem_pool;
}

void scheme_gmp_tls_unload(intptr_t *s, void *data)
{
  current_total_allocation = (uintptr_t)s[0];
  max_total_allocation = (uintptr_t)s[1];
  gmp_tmp_current = (tmp_stack *)s[2];
  s[0] = 0;
  gmp_mem_pool = data;
}

void scheme_gmp_tls_snapshot(intptr_t *s, intptr_t *save)
{
  save[0] = s[3];
  save[1] = s[4];
  __gmp_tmp_mark((tmp_marker *)(s + 3));
}

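/* Restore the marker saved by scheme_gmp_tls_snapshot.  do_free == 1
   frees back to the marker in <s>, which must be the currently loaded
   state; do_free == 2 handles the case where <s> is *not* loaded: the
   live state is parked in <other>, <s> is installed and freed, and the
   live state is then reinstated.  */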
void scheme_gmp_tls_restore_snapshot(intptr_t *s, void *data, intptr_t *save, int do_free)
{
  /* silence gcc "may be used uninitialized in this function" warnings */
  intptr_t other[6] = {0,0,0,0,0,0};
  void *other_data;

  if (do_free == 2) {
    other_data = scheme_gmp_tls_load(other);
    scheme_gmp_tls_unload(s, data);
  } else
    other_data = NULL;

  if (do_free)
    __gmp_tmp_free((tmp_marker *)(s + 3));

  if (save) {
    s[3] = save[0];
    s[4] = save[1];
  }

  if (do_free == 2) {
    data = scheme_gmp_tls_load(s);
    scheme_gmp_tls_unload(other, other_data);
  }
}

#else

/* With alloca available there is no thread-local allocator state to
   carry around, so these are stubs; the signatures match the
   HAVE_ALLOCA-less versions above so callers link either way.  */

void scheme_gmp_tls_init(intptr_t *s)
{
}

void *scheme_gmp_tls_load(intptr_t *s)
{
  return NULL;
}

void scheme_gmp_tls_unload(intptr_t *s, void *data)
{
}

#endif
