1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <unistd.h>
5 #include <float.h>
6 
7 /* Use long double to get a little more precision when we're calculating the
8  * math functions -- especially those calculated with a series.  Long double
9  * is defined in C89 (ISO C).  Note that 'long double' on many platforms is
10  * identical to 'double so it may buy us nothing.  But it's worth trying.
11  *
12  * While the type was in C89, math functions using it are in C99.  Some
13  * systems didn't really get it right (e.g. NetBSD which left out some
14  * functions for 13 years).
15  */
16 #include <math.h>
17 #if _MSC_VER || defined(__IBMC__) | defined(__IBMCPP__) || (defined(__STDC_VERSION__) && __STDC_VERSION >= 199901L)
18   /* math.h should give us these as functions or macros.
19    *
20    *  extern long double fabsl(long double);
21    *  extern long double floorl(long double);
22    *  extern long double ceill(long double);
23    *  extern long double sqrtl(long double);
24    *  extern long double powl(long double, long double);
25    *  extern long double expl(long double);
26    *  extern long double logl(long double);
27    */
28 #else
29   #define fabsl(x)    (long double) fabs( (double) (x) )
30   #define floorl(x)   (long double) floor( (double) (x) )
31   #define ceill(x)    (long double) ceil( (double) (x) )
32   #define sqrtl(x)    (long double) sqrt( (double) (x) )
33   #define powl(x, y)  (long double) pow( (double) (x), (double) (y) )
34   #define expl(x)     (long double) exp( (double) (x) )
35   #define logl(x)     (long double) log( (double) (x) )
36 #endif
37 
38 #ifdef LDBL_INFINITY
39   #undef INFINITY
40   #define INFINITY LDBL_INFINITY
41 #elif !defined(INFINITY)
42   #define INFINITY (DBL_MAX + DBL_MAX)
43 #endif
44 #ifndef LDBL_EPSILON
45   #define LDBL_EPSILON 1e-16
46 #endif
47 #ifndef LDBL_MAX
48   #define LDBL_MAX DBL_MAX
49 #endif
50 
51 #include "ptypes.h"
52 #define FUNC_isqrt 1
53 #define FUNC_icbrt 1
54 #define FUNC_lcm_ui 1
55 #define FUNC_ctz 1
56 #define FUNC_log2floor 1
57 #define FUNC_is_perfect_square
58 #define FUNC_is_perfect_cube
59 #define FUNC_is_perfect_fifth
60 #define FUNC_is_perfect_seventh
61 #define FUNC_next_prime_in_sieve 1
62 #define FUNC_prev_prime_in_sieve 1
63 #define FUNC_ipow 1
64 #include "util.h"
65 #include "sieve.h"
66 #include "primality.h"
67 #include "cache.h"
68 #include "lmo.h"
69 #include "factor.h"
70 #include "mulmod.h"
71 #include "constants.h"
72 #include "montmath.h"
73 #include "csprng.h"
74 #include "keyval.h"
75 
76 #define KAHAN_INIT(s) \
77   LNV s ## _y, s ## _t; \
78   LNV s ## _c = 0.0; \
79   LNV s = 0.0;
80 
81 #define KAHAN_SUM(s, term) \
82   do { \
83     s ## _y = (term) - s ## _c; \
84     s ## _t = s + s ## _y; \
85     s ## _c = (s ## _t - s) - s ## _y; \
86     s = s ## _t; \
87   } while (0)
88 
_numcmp(const void * a,const void * b)89 int _numcmp(const void *a, const void *b) {
90   const UV *x = a, *y = b;
91   return (*x > *y) ? 1 : (*x < *y) ? -1 : 0;
92 }
93 
94 static int _verbose = 0;
_XS_set_verbose(int v)95 void _XS_set_verbose(int v) { _verbose = v; }
_XS_get_verbose(void)96 int _XS_get_verbose(void) { return _verbose; }
97 
98 static int _call_gmp = 0;
_XS_set_callgmp(int v)99 void _XS_set_callgmp(int v) { _call_gmp = v; }
_XS_get_callgmp(void)100 int  _XS_get_callgmp(void) { return _call_gmp; }
101 
102 static int _secure = 0;
_XS_set_secure(void)103 void _XS_set_secure(void) { _secure = 1; }
_XS_get_secure(void)104 int  _XS_get_secure(void) { return _secure; }
105 
106 /* We'll use this little static sieve to quickly answer small values of
107  *   is_prime, next_prime, prev_prime, prime_count
108  * for non-threaded Perl it's basically the same as getting the primary
109  * cache.  It guarantees we'll have an answer with no waiting on any version.
110  */
111 static const unsigned char prime_sieve30[] =
112   {0x01,0x20,0x10,0x81,0x49,0x24,0xc2,0x06,0x2a,0xb0,0xe1,0x0c,0x15,0x59,0x12,
113    0x61,0x19,0xf3,0x2c,0x2c,0xc4,0x22,0xa6,0x5a,0x95,0x98,0x6d,0x42,0x87,0xe1,
114    0x59,0xa9,0xa9,0x1c,0x52,0xd2,0x21,0xd5,0xb3,0xaa,0x26,0x5c,0x0f,0x60,0xfc,
115    0xab,0x5e,0x07,0xd1,0x02,0xbb,0x16,0x99,0x09,0xec,0xc5,0x47,0xb3,0xd4,0xc5,
116    0xba,0xee,0x40,0xab,0x73,0x3e,0x85,0x4c,0x37,0x43,0x73,0xb0,0xde,0xa7,0x8e,
117    0x8e,0x64,0x3e,0xe8,0x10,0xab,0x69,0xe5,0xf7,0x1a,0x7c,0x73,0xb9,0x8d,0x04,
118    0x51,0x9a,0x6d,0x70,0xa7,0x78,0x2d,0x6d,0x27,0x7e,0x9a,0xd9,0x1c,0x5f,0xee,
119    0xc7,0x38,0xd9,0xc3,0x7e,0x14,0x66,0x72,0xae,0x77,0xc1,0xdb,0x0c,0xcc,0xb2,
120    0xa5,0x74,0xe3,0x58,0xd5,0x4b,0xa7,0xb3,0xb1,0xd9,0x09,0xe6,0x7d,0x23,0x7c,
121    0x3c,0xd3,0x0e,0xc7,0xfd,0x4a,0x32,0x32,0xfd,0x4d,0xb5,0x6b,0xf3,0xa8,0xb3,
122    0x85,0xcf,0xbc,0xf4,0x0e,0x34,0xbb,0x93,0xdb,0x07,0xe6,0xfe,0x6a,0x57,0xa3,
123    0x8c,0x15,0x72,0xdb,0x69,0xd4,0xaf,0x59,0xdd,0xe1,0x3b,0x2e,0xb7,0xf9,0x2b,
124    0xc5,0xd0,0x8b,0x63,0xf8,0x95,0xfa,0x77,0x40,0x97,0xea,0xd1,0x9f,0xaa,0x1c,
125    0x48,0xae,0x67,0xf7,0xeb,0x79,0xa5,0x55,0xba,0xb2,0xb6,0x8f,0xd8,0x2d,0x6c,
126    0x2a,0x35,0x54,0xfd,0x7c,0x9e,0xfa,0xdb,0x31,0x78,0xdd,0x3d,0x56,0x52,0xe7,
127    0x73,0xb2,0x87,0x2e,0x76,0xe9,0x4f,0xa8,0x38,0x9d,0x5d,0x3f,0xcb,0xdb,0xad,
128    0x51,0xa5,0xbf,0xcd,0x72,0xde,0xf7,0xbc,0xcb,0x49,0x2d,0x49,0x26,0xe6,0x1e,
129    0x9f,0x98,0xe5,0xc6,0x9f,0x2f,0xbb,0x85,0x6b,0x65,0xf6,0x77,0x7c,0x57,0x8b,
130    0xaa,0xef,0xd8,0x5e,0xa2,0x97,0xe1,0xdc,0x37,0xcd,0x1f,0xe6,0xfc,0xbb,0x8c,
131    0xb7,0x4e,0xc7,0x3c,0x19,0xd5,0xa8,0x9e,0x67,0x4a,0xe3,0xf5,0x97,0x3a,0x7e,
132    0x70,0x53,0xfd,0xd6,0xe5,0xb8,0x1c,0x6b,0xee,0xb1,0x9b,0xd1,0xeb,0x34,0xc2,
133    0x23,0xeb,0x3a,0xf9,0xef,0x16,0xd6,0x4e,0x7d,0x16,0xcf,0xb8,0x1c,0xcb,0xe6,
134    0x3c,0xda,0xf5,0xcf};
135 #define NPRIME_SIEVE30 (sizeof(prime_sieve30)/sizeof(prime_sieve30[0]))
136 
137 static const unsigned short primes_tiny[] =
138   {0,2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,
139    101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,
140    193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,281,283,
141    293,307,311,313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,
142    409,419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503};
143 #define NPRIMES_TINY (sizeof(primes_tiny)/sizeof(primes_tiny[0]))
144 
145 /* Return of 2 if n is prime, 0 if not.  Do it fast. */
is_prime(UV n)146 int is_prime(UV n)
147 {
148   if (n <= 10)
149     return (n == 2 || n == 3 || n == 5 || n == 7) ? 2 : 0;
150 
151   if (n < UVCONST(200000000)) {
152     UV d = n/30;
153     UV m = n - d*30;
154     unsigned char mtab = masktab30[m];  /* Bitmask in mod30 wheel */
155     const unsigned char* sieve;
156 
157     /* Return 0 if a multiple of 2, 3, or 5 */
158     if (mtab == 0)
159       return 0;
160 
161     /* Check static tiny sieve */
162     if (d < NPRIME_SIEVE30)
163       return (prime_sieve30[d] & mtab) ? 0 : 2;
164 
165     if (!(n%7) || !(n%11) || !(n%13)) return 0;
166 
167     /* Check primary cache */
168     if (n <= get_prime_cache(0,0)) {
169       int isprime = -1;
170       if (n <= get_prime_cache(0, &sieve))
171         isprime = 2*((sieve[d] & mtab) == 0);
172       release_prime_cache(sieve);
173       if (isprime >= 0)
174         return isprime;
175     }
176   }
177   return is_prob_prime(n);
178 }
179 
180 
next_prime(UV n)181 UV next_prime(UV n)
182 {
183   UV m, next;
184 
185   if (n < 30*NPRIME_SIEVE30) {
186     next = next_prime_in_sieve(prime_sieve30, n, 30*NPRIME_SIEVE30);
187     if (next != 0) return next;
188   }
189 
190   if (n >= MPU_MAX_PRIME) return 0; /* Overflow */
191 
192   if (n < get_prime_cache(0,0)) {
193     const unsigned char* sieve;
194     UV sieve_size = get_prime_cache(0, &sieve);
195     next = (n < sieve_size)  ?  next_prime_in_sieve(sieve, n, sieve_size)  :  0;
196     release_prime_cache(sieve);
197     if (next != 0) return next;
198   }
199 
200   m = n % 30;
201   do { /* Move forward one. */
202     n += wheeladvance30[m];
203     m = nextwheel30[m];
204   } while (!is_prob_prime(n));
205   return n;
206 }
207 
208 
prev_prime(UV n)209 UV prev_prime(UV n)
210 {
211   UV m, prev;
212 
213   if (n < 30*NPRIME_SIEVE30)
214     return prev_prime_in_sieve(prime_sieve30, n);
215 
216   if (n < get_prime_cache(0,0)) {
217     const unsigned char* sieve;
218     UV sieve_size = get_prime_cache(0, &sieve);
219     prev = (n < sieve_size)  ?  prev_prime_in_sieve(sieve, n)  :  0;
220     release_prime_cache(sieve);
221     if (prev != 0) return prev;
222   }
223 
224   m = n % 30;
225   do { /* Move back one. */
226     n -= wheelretreat30[m];
227     m = prevwheel30[m];
228   } while (!is_prob_prime(n));
229   return n;
230 }
231 
232 /******************************************************************************/
233 /*                                 PRINTING                                   */
234 /******************************************************************************/
235 
my_sprint(char * ptr,UV val)236 static int my_sprint(char* ptr, UV val) {
237   int nchars;
238   UV t;
239   char *s = ptr;
240   do {
241     t = val / 10; *s++ = (char) ('0' + val - 10 * t);
242   } while ((val = t));
243   nchars = s - ptr + 1;  *s = '\n';
244   while (--s > ptr) { char c = *s; *s = *ptr; *ptr++ = c; }
245   return nchars;
246 }
write_buf(int fd,char * buf,char * bend)247 static char* write_buf(int fd, char* buf, char* bend) {
248   int res = (int) write(fd, buf, bend-buf);
249   if (res == -1) croak("print_primes write error");
250   return buf;
251 }
print_primes(UV low,UV high,int fd)252 void print_primes(UV low, UV high, int fd) {
253   char buf[8000+25];
254   char* bend = buf;
255   if ((low <= 2) && (high >= 2)) bend += my_sprint(bend,2);
256   if ((low <= 3) && (high >= 3)) bend += my_sprint(bend,3);
257   if ((low <= 5) && (high >= 5)) bend += my_sprint(bend,5);
258   if (low < 7) low = 7;
259 
260   if (low <= high) {
261     unsigned char* segment;
262     UV seg_base, seg_low, seg_high;
263     void* ctx = start_segment_primes(low, high, &segment);
264     while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
265       START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high )
266         bend += my_sprint(bend,p);
267         if (bend-buf > 8000) { bend = write_buf(fd, buf, bend); }
268       END_DO_FOR_EACH_SIEVE_PRIME
269     }
270     end_segment_primes(ctx);
271   }
272   if (bend > buf) { bend = write_buf(fd, buf, bend); }
273 }
274 
275 /******************************************************************************/
276 /*                     TOTIENT, MOEBIUS, MERTENS                              */
277 /******************************************************************************/
278 
279 /* Return a char array with lo-hi+1 elements. mu[k-lo] = µ(k) for k = lo .. hi.
280  * It is the callers responsibility to call Safefree on the result. */
281 #define PGTLO(ip,p,lo)  ((ip)>=(lo)) ? (ip) : ((p)*((lo)/(p)) + (((lo)%(p))?(p):0))
range_moebius(UV lo,UV hi)282 signed char* range_moebius(UV lo, UV hi)
283 {
284   signed char* mu;
285   UV i, sqrtn = isqrt(hi), count = hi-lo+1;
286 
287   /* Kuznetsov indicates that the Deléglise & Rivat (1996) method can be
288    * modified to work on logs, which allows us to operate with no
289    * intermediate memory at all.  Same time as the D&R method, less memory. */
290   unsigned char logp;
291   UV nextlog, nextlogi;
292 
293   Newz(0, mu, count, signed char);
294   if (sqrtn*sqrtn != hi && sqrtn < (UVCONST(1)<<(BITS_PER_WORD/2))-1) sqrtn++;
295 
296   /* For small ranges, do it by hand */
297   if (hi < 100 || count <= 10 || (hi > (1UL<<25) && count < icbrt(hi)/4)) {
298     for (i = 0; i < count; i++)
299       mu[i] = moebius(lo+i);
300     return mu;
301   }
302 
303   logp = 1; nextlog = 3; /* 2+1 */
304   START_DO_FOR_EACH_PRIME(2, sqrtn) {
305     UV p2 = p*p;
306     if (p > nextlog) {
307       logp += 2;   /* logp is 1 | ceil(log(p)/log(2)) */
308       nextlog = ((nextlog-1)*4)+1;
309     }
310     for (i = PGTLO(p, p, lo); i >= lo && i <= hi; i += p)
311       mu[i-lo] += logp;
312     for (i = PGTLO(p2, p2, lo); i >= lo && i <= hi; i += p2)
313       mu[i-lo] = 0x80;
314   } END_DO_FOR_EACH_PRIME
315 
316   logp = log2floor(lo);
317   nextlogi = (UVCONST(2) << logp) - lo;
318   for (i = 0; i < count; i++) {
319     unsigned char a = mu[i];
320     if (i >= nextlogi) nextlogi = (UVCONST(2) << ++logp) - lo;
321     if (a & 0x80)       { a = 0; }
322     else if (a >= logp) { a =  1 - 2*(a&1); }
323     else                { a = -1 + 2*(a&1); }
324     mu[i] = a;
325   }
326   if (lo == 0)  mu[0] = 0;
327 
328   return mu;
329 }
330 
range_totient(UV lo,UV hi)331 UV* range_totient(UV lo, UV hi) {
332   UV* totients;
333   UV i, seg_base, seg_low, seg_high, count = hi-lo+1;
334   unsigned char* segment;
335   void* ctx;
336 
337   if (hi < lo) croak("range_totient error hi %"UVuf" < lo %"UVuf"\n", hi, lo);
338   New(0, totients, count, UV);
339 
340   /* Do via factoring if very small or if we have a small range */
341   if (hi < 100 || count <= 10 || hi/count > 1000) {
342     for (i = 0; i < count; i++)
343       totients[i] = totient(lo+i);
344     return totients;
345   }
346 
347   if (hi == UV_MAX) {
348     totients[--count] = totient(UV_MAX);
349     hi--;
350   }
351 
352   /* If doing a full sieve, do it monolithic.  Faster. */
353   if (lo == 0) {
354     UV* prime;
355     double loghi = log(hi);
356     UV max_index = (hi < 67)     ? 18
357                  : (hi < 355991) ? 15+(hi/(loghi-1.09))
358                  : (hi/loghi) * (1.0+1.0/loghi+2.51/(loghi*loghi));
359     UV j, index, nprimes = 0;
360 
361     New(0, prime, max_index, UV);  /* could use prime_count_upper(hi) */
362     memset(totients, 0, count * sizeof(UV));
363     for (i = 2; i <= hi/2; i++) {
364       index = 2*i;
365       if ( !(i&1) ) {
366         if (i == 2) { totients[2] = 1; prime[nprimes++] = 2; }
367         totients[index] = totients[i]*2;
368       } else {
369         if (totients[i] == 0) {
370           totients[i] = i-1;
371           prime[nprimes++] = i;
372         }
373         for (j=0; j < nprimes && index <= hi; index = i*prime[++j]) {
374           if (i % prime[j] == 0) {
375             totients[index] = totients[i]*prime[j];
376             break;
377           } else {
378             totients[index] = totients[i]*(prime[j]-1);
379           }
380         }
381       }
382     }
383     Safefree(prime);
384     /* All totient values have been filled in except the primes.  Mark them. */
385     for (i = ((hi/2) + 1) | 1; i <= hi; i += 2)
386       if (totients[i] == 0)
387         totients[i] = i-1;
388     totients[1] = 1;
389     totients[0] = 0;
390     return totients;
391   }
392 
393   for (i = 0; i < count; i++) {
394     UV v = lo+i, nv = v;
395     if (v % 2 == 0)  nv -= nv/2;
396     if (v % 3 == 0)  nv -= nv/3;
397     if (v % 5 == 0)  nv -= nv/5;
398     totients[i] = nv;
399   }
400 
401   ctx = start_segment_primes(7, hi/2, &segment);
402   while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
403     START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high ) {
404       for (i = PGTLO(2*p,p,lo); i >= lo && i <= hi; i += p)
405         totients[i-lo] -= totients[i-lo]/p;
406     } END_DO_FOR_EACH_SIEVE_PRIME
407   }
408   end_segment_primes(ctx);
409 
410   /* Fill in all primes */
411   for (i = (lo | 1) - lo; i < count; i += 2)
412     if (totients[i] == i+lo)
413       totients[i]--;
414   if (lo <= 1) totients[1-lo] = 1;
415 
416   return totients;
417 }
418 
mertens(UV n)419 IV mertens(UV n) {
420   /* See Deléglise and Rivat (1996) for O(n^2/3 log(log(n))^1/3) algorithm.
421    * This implementation uses their lemma 2.1 directly, so is ~ O(n).
422    * In serial it is quite a bit faster than segmented summation of mu
423    * ranges, though the latter seems to be a favored method for GPUs.
424    */
425   UV u, j, m, nmk, maxmu;
426   signed char* mu;
427   short* M;   /* 16 bits is enough range for all 32-bit M => 64-bit n */
428   IV sum;
429 
430   if (n <= 1)  return n;
431   u = isqrt(n);
432   maxmu = (n/(u+1));              /* maxmu lets us handle u < sqrt(n) */
433   if (maxmu < u) maxmu = u;
434   mu = range_moebius(0, maxmu);
435   New(0, M, maxmu+1, short);      /* Works up to maxmu < 7613644886 */
436   M[0] = 0;
437   for (j = 1; j <= maxmu; j++)
438     M[j] = M[j-1] + mu[j];
439   sum = M[u];
440   for (m = 1; m <= u; m++) {
441     if (mu[m] != 0) {
442       IV inner_sum = 0;
443       UV lower = (u/m) + 1;
444       UV last_nmk = n/(m*lower);
445       UV this_k = 0;
446       UV next_k = n/(m*1);
447       UV nmkm = m * 2;
448       for (nmk = 1; nmk <= last_nmk; nmk++, nmkm += m) {
449         this_k = next_k;
450         next_k = n/nmkm;
451         inner_sum += M[nmk] * (this_k - next_k);
452       }
453       sum += (mu[m] > 0) ? -inner_sum : inner_sum;
454     }
455   }
456   Safefree(M);
457   Safefree(mu);
458   return sum;
459 }
460 
461 /******************************************************************************/
462 /*                             POWERS and ROOTS                               */
463 /******************************************************************************/
464 
465 /* There are at least 4 ways to do this, plus hybrids.
466  * 1) use a table.  Great for 32-bit, too big for 64-bit.
467  * 2) Use pow() to check.  Relatively slow and FP is always dangerous.
468  * 3) factor or trial factor.  Slow for 64-bit.
469  * 4) Dietzfelbinger algorithm 2.3.5.  Quite slow.
470  * This currently uses a hybrid of 1 and 2.
471  */
powerof(UV n)472 int powerof(UV n) {
473   UV t;
474   if ((n <= 3) || (n == UV_MAX)) return 1;
475   if ((n & (n-1)) == 0)          return ctz(n);  /* powers of 2    */
476   if (is_perfect_square(n))      return 2 * powerof(isqrt(n));
477   if (is_perfect_cube(n))        return 3 * powerof(icbrt(n));
478 
479   /* Simple rejection filter for non-powers of 5-37.  Rejects 47.85%. */
480   t = n & 511; if ((t*77855451) & (t*4598053) & 862)  return 1;
481 
482   if (is_perfect_fifth(n))       return 5 * powerof(rootof(n,5));
483   if (is_perfect_seventh(n))     return 7 * powerof(rootof(n,7));
484 
485   if (n > 177146 && n <= UVCONST(1977326743)) {
486     switch (n) { /* Check for powers of 11, 13, 17, 19 within 32 bits */
487       case 177147: case 48828125: case 362797056: case 1977326743: return 11;
488       case 1594323: case 1220703125: return 13;
489       case 129140163: return 17;
490       case 1162261467: return 19;
491       default:  break;
492     }
493   }
494 #if BITS_PER_WORD == 64
495   if (n >= UVCONST(8589934592)) {
496     /* The Bloom filters reject about 90% of inputs each, about 99% for two.
497      * Bach/Sorenson type sieves do about as well, but are much slower due
498      * to using a powmod. */
499     if ( (t = n %121, !((t*19706187) & (t*61524433) & 876897796)) &&
500          (t = n % 89, !((t*28913398) & (t*69888189) & 2705511937U)) ) {
501       /* (t = n % 67, !((t*117621317) & (t*48719734) & 537242019)) ) { */
502       UV root = rootof(n,11);
503       if (n == ipow(root,11)) return 11;
504     }
505     if ( (t = n %131, !((t*1545928325) & (t*1355660813) & 2771533888U)) &&
506          (t = n % 79, !((t*48902028) & (t*48589927) & 404082779)) ) {
507       /* (t = n % 53, !((t*79918293) & (t*236846524) & 694943819)) ) { */
508       UV root = rootof(n,13);
509       if (n == ipow(root,13)) return 13;
510     }
511     switch (n) {
512       case UVCONST(762939453125):
513       case UVCONST(16926659444736):
514       case UVCONST(232630513987207):
515       case UVCONST(100000000000000000):
516       case UVCONST(505447028499293771):
517       case UVCONST(2218611106740436992):
518       case UVCONST(8650415919381337933):  return 17;
519       case UVCONST(19073486328125):
520       case UVCONST(609359740010496):
521       case UVCONST(11398895185373143):
522       case UVCONST(10000000000000000000): return 19;
523       case UVCONST(94143178827):
524       case UVCONST(11920928955078125):
525       case UVCONST(789730223053602816):   return 23;
526       case UVCONST(68630377364883):       return 29;
527       case UVCONST(617673396283947):      return 31;
528       case UVCONST(450283905890997363):   return 37;
529       default:  break;
530     }
531   }
532 #endif
533   return 1;
534 }
is_power(UV n,UV a)535 int is_power(UV n, UV a)
536 {
537   int ret;
538   if (a > 0) {
539     if (a == 1 || n <= 1) return 1;
540     if ((a % 2) == 0)
541       return !is_perfect_square(n) ? 0 : (a == 2) ? 1 : is_power(isqrt(n),a>>1);
542     if ((a % 3) == 0)
543       return !is_perfect_cube(n) ? 0 : (a == 3) ? 1 : is_power(icbrt(n),a/3);
544     if ((a % 5) == 0)
545       return !is_perfect_fifth(n) ? 0 : (a == 5) ? 1 :is_power(rootof(n,5),a/5);
546   }
547   ret = powerof(n);
548   if (a != 0) return !(ret % a);  /* Is the max power divisible by a? */
549   return (ret == 1) ? 0 : ret;
550 }
551 
552 #if BITS_PER_WORD == 64
553 #define ROOT_MAX_3 41
554 static const uint32_t root_max[ROOT_MAX_3] = {0,0,0,2642245,65535,7131,1625,565,255,138,84,56,40,30,23,19,15,13,11,10,9,8,7,6,6,5,5,5,4,4,4,4,3,3,3,3,3,3,3,3,3};
555 #else
556 #define ROOT_MAX_3 21
557 static const uint32_t root_max[ROOT_MAX_3] = {0,0,0,1625,255,84,40,23,15,11,9,7,6,5,4,4,3,3,3,3,3};
558 #endif
559 
rootof(UV n,UV k)560 UV rootof(UV n, UV k) {
561   UV lo, hi, max;
562   if (k == 0) return 0;
563   if (k == 1) return n;
564   if (k == 2) return isqrt(n);
565   if (k == 3) return icbrt(n);
566 
567   /* Bracket between powers of 2, but never exceed max power so ipow works */
568   max = 1 + ((k >= ROOT_MAX_3) ? 2 : root_max[k]);
569   lo = UVCONST(1) << (log2floor(n)/k);
570   hi = ((lo*2) < max) ? lo*2 : max;
571 
572   /* Binary search */
573   while (lo < hi) {
574     UV mid = lo + (hi-lo)/2;
575     if (ipow(mid,k) <= n) lo = mid+1;
576     else                  hi = mid;
577   }
578   return lo-1;
579 }
580 
primepower(UV n,UV * prime)581 int primepower(UV n, UV* prime)
582 {
583   int power = 0;
584   if (n < 2) return 0;
585   /* Check for small divisors */
586   if (!(n&1)) {
587     if (n & (n-1)) return 0;
588     *prime = 2;
589     return ctz(n);
590   }
591   if ((n%3) == 0) {
592     /* if (UVCONST(12157665459056928801) % n) return 0; */
593     do { n /= 3; power++; } while (n > 1 && (n%3) == 0);
594     if (n != 1) return 0;
595     *prime = 3;
596     return power;
597   }
598   if ((n%5) == 0) {
599     do { n /= 5; power++; } while (n > 1 && (n%5) == 0);
600     if (n != 1) return 0;
601     *prime = 5;
602     return power;
603   }
604   if ((n%7) == 0) {
605     do { n /= 7; power++; } while (n > 1 && (n%7) == 0);
606     if (n != 1) return 0;
607     *prime = 7;
608     return power;
609   }
610   if (is_prob_prime(n))
611     { *prime = n; return 1; }
612   /* Composite.  Test for perfect power with prime root. */
613   power = powerof(n);
614   if (power == 1) power = 0;
615   if (power) {
616     UV root = rootof(n, (UV)power);
617     if (is_prob_prime(root))
618       *prime = root;
619     else
620       power = 0;
621   }
622   return power;
623 }
624 
valuation(UV n,UV k)625 UV valuation(UV n, UV k)
626 {
627   UV v = 0;
628   UV kpower = k;
629   if (k < 2 || n < 2) return 0;
630   if (k == 2) return ctz(n);
631   while ( !(n % kpower) ) {
632     kpower *= k;
633     v++;
634   }
635   return v;
636 }
637 
logint(UV n,UV b)638 UV logint(UV n, UV b)
639 {
640   /* UV e;  for (e=0; n; n /= b) e++;  return e-1; */
641   UV v, e = 0;
642   if (b == 2)
643     return log2floor(n);
644   if (n > UV_MAX/b) {
645     n /= b;
646     e = 1;
647   }
648   for (v = b; v <= n; v *= b)
649     e++;
650   return e;
651 }
652 
mpu_popcount_string(const char * ptr,uint32_t len)653 UV mpu_popcount_string(const char* ptr, uint32_t len)
654 {
655   uint32_t count = 0, i, j, d, v, power, slen, *s, *sptr;
656 
657   while (len > 0 && (*ptr == '0' || *ptr == '+' || *ptr == '-'))
658     {  ptr++;  len--;  }
659 
660   /* Create s as array of base 10^8 numbers */
661   slen = (len + 7) / 8;
662   Newz(0, s, slen, uint32_t);
663   for (i = 0; i < slen; i++) {  /* Chunks of 8 digits */
664     for (j = 0, d = 0, power = 1;  j < 8 && len > 0;  j++, power *= 10) {
665       v = ptr[--len] - '0';
666       if (v > 9) croak("Parameter '%s' must be a positive integer",ptr);
667       d += power * v;
668     }
669     s[slen - 1 - i] = d;
670   }
671   /* Repeatedly count and divide by 2 across s */
672   while (slen > 1) {
673     if (s[slen-1] & 1)  count++;
674     sptr = s;
675     if (s[0] == 1) {
676       if (--slen == 0) break;
677       *++sptr += 100000000;
678     }
679     for (i = 0; i < slen; i++) {
680       if ( (i+1) < slen  &&  sptr[i] & 1 ) sptr[i+1] += 100000000;
681       s[i] = sptr[i] >> 1;
682     }
683   }
684   /* For final base 10^8 number just do naive popcnt */
685   for (d = s[0]; d > 0; d >>= 1)
686     if (d & 1)
687       count++;
688   Safefree(s);
689   return count;
690 }
691 
692 
693 /* How many times does 2 divide n? */
694 #define padic2(n)  ctz(n)
695 #define IS_MOD8_3OR5(x)  (((x)&7)==3 || ((x)&7)==5)
696 
kronecker_uu_sign(UV a,UV b,int s)697 static int kronecker_uu_sign(UV a, UV b, int s) {
698   while (a) {
699     int r = padic2(a);
700     if (r) {
701       if ((r&1)  &&  IS_MOD8_3OR5(b))  s = -s;
702       a >>= r;
703     }
704     if (a & b & 2)  s = -s;
705     { UV t = b % a;  b = a;  a = t; }
706   }
707   return (b == 1) ? s : 0;
708 }
709 
kronecker_uu(UV a,UV b)710 int kronecker_uu(UV a, UV b) {
711   int r, s;
712   if (b & 1)   return kronecker_uu_sign(a, b, 1);
713   if (!(a&1))  return 0;
714   s = 1;
715   r = padic2(b);
716   if (r) {
717     if ((r&1) && IS_MOD8_3OR5(a))  s = -s;
718     b >>= r;
719   }
720   return kronecker_uu_sign(a, b, s);
721 }
722 
kronecker_su(IV a,UV b)723 int kronecker_su(IV a, UV b) {
724   int r, s;
725   if (a >= 0)  return kronecker_uu(a, b);
726   if (b == 0)  return (a == 1 || a == -1) ? 1 : 0;
727   s = 1;
728   r = padic2(b);
729   if (r) {
730     if (!(a&1))  return 0;
731     if ((r&1) && IS_MOD8_3OR5(a))  s = -s;
732     b >>= r;
733   }
734   a %= (IV) b;
735   if (a < 0)  a += b;
736   return kronecker_uu_sign(a, b, s);
737 }
738 
kronecker_ss(IV a,IV b)739 int kronecker_ss(IV a, IV b) {
740   if (a >= 0 && b >= 0)
741     return (b & 1)  ?  kronecker_uu_sign(a, b, 1)  :  kronecker_uu(a,b);
742   if (b >= 0)
743     return kronecker_su(a, b);
744   return kronecker_su(a, -b) * ((a < 0) ? -1 : 1);
745 }
746 
factorial(UV n)747 UV factorial(UV n) {
748   UV i, r = 1;
749   if ( (n > 12 && sizeof(UV) <= 4) || (n > 20 && sizeof(UV) <= 8) ) return 0;
750   for (i = 2; i <= n; i++)
751     r *= i;
752   return r;
753 }
754 
binomial(UV n,UV k)755 UV binomial(UV n, UV k) {    /* Thanks to MJD and RosettaCode for ideas */
756   UV d, g, r = 1;
757   if (k == 0) return 1;
758   if (k == 1) return n;
759   if (k >= n) return (k == n);
760   if (k > n/2) k = n-k;
761   for (d = 1; d <= k; d++) {
762     if (r >= UV_MAX/n) {  /* Possible overflow */
763       UV nr, dr;  /* reduced numerator / denominator */
764       g = gcd_ui(n, d);  nr = n/g;  dr = d/g;
765       g = gcd_ui(r, dr);  r = r/g;  dr = dr/g;
766       if (r >= UV_MAX/nr) return 0;  /* Unavoidable overflow */
767       r *= nr;
768       r /= dr;
769       n--;
770     } else {
771       r *= n--;
772       r /= d;
773     }
774   }
775   return r;
776 }
777 
stirling3(UV n,UV m)778 UV stirling3(UV n, UV m) {   /* Lah numbers */
779   UV f1, f2;
780 
781   if (m == n) return 1;
782   if (n == 0 || m == 0 || m > n) return 0;
783   if (m == 1) return factorial(n);
784 
785   f1 = binomial(n, m);
786   if (f1 == 0)  return 0;
787   f2 = binomial(n-1, m-1);
788   if (f2 == 0 || f1 >= UV_MAX/f2)  return 0;
789   f1 *= f2;
790   f2 = factorial(n-m);
791   if (f2 == 0 || f1 >= UV_MAX/f2)  return 0;
792   return f1 * f2;
793 }
794 
stirling2(UV n,UV m)795 IV stirling2(UV n, UV m) {
796   UV f;
797   IV j, k, t, s = 0;
798 
799   if (m == n) return 1;
800   if (n == 0 || m == 0 || m > n) return 0;
801   if (m == 1) return 1;
802 
803   if ((f = factorial(m)) == 0) return 0;
804   for (j = 1; j <= (IV)m; j++) {
805     t = binomial(m, j);
806     for (k = 1; k <= (IV)n; k++) {
807       if (t == 0 || j >= IV_MAX/t) return 0;
808       t *= j;
809     }
810     if ((m-j) & 1) t *= -1;
811     s += t;
812   }
813   return s/f;
814 }
815 
stirling1(UV n,UV m)816 IV stirling1(UV n, UV m) {
817   IV k, t, b1, b2, s2, s = 0;
818 
819   if (m == n) return 1;
820   if (n == 0 || m == 0 || m > n) return 0;
821   if (m == 1) {
822     UV f = factorial(n-1);
823     if (f>(UV)IV_MAX) return 0;
824     return (n&1) ? ((IV)f) : -((IV)f);
825   }
826 
827   for (k = 1; k <= (IV)(n-m); k++) {
828     b1 = binomial(k + n - 1, n - m + k);
829     b2 = binomial(2 * n - m, n - m - k);
830     s2 = stirling2(n - m + k, k);
831     if (b1 == 0 || b2 == 0 || s2 == 0 || b1 > IV_MAX/b2) return 0;
832     t = b1 * b2;
833     if (s2 > IV_MAX/t) return 0;
834     t *= s2;
835     s += (k & 1) ? -t : t;
836   }
837   return s;
838 }
839 
totient(UV n)840 UV totient(UV n) {
841   UV i, nfacs, totient, lastf, facs[MPU_MAX_FACTORS+1];
842   if (n <= 1) return n;
843   totient = 1;
844   /* phi(2m) = 2phi(m) if m even, phi(m) if m odd */
845   while ((n & 0x3) == 0) { n >>= 1; totient <<= 1; }
846   if ((n & 0x1) == 0) { n >>= 1; }
847   /* factor and calculate totient */
848   nfacs = factor(n, facs);
849   lastf = 0;
850   for (i = 0; i < nfacs; i++) {
851     UV f = facs[i];
852     if (f == lastf) { totient *= f;               }
853     else            { totient *= f-1;  lastf = f; }
854   }
855   return totient;
856 }
857 
858 static const UV jordan_overflow[5] =
859 #if BITS_PER_WORD == 64
860   {UVCONST(4294967311), 2642249, 65537, 7133, 1627};
861 #else
862   {UVCONST(     65537),    1627,   257,   85,   41};
863 #endif
jordan_totient(UV k,UV n)864 UV jordan_totient(UV k, UV n) {
865   UV factors[MPU_MAX_FACTORS+1];
866   int nfac, i;
867   UV totient;
868   if (k == 0 || n <= 1) return (n == 1);
869   if (k > 6 || (k > 1 && n >= jordan_overflow[k-2])) return 0;
870 
871   totient = 1;
872   /* Similar to Euler totient, shortcut even inputs */
873   while ((n & 0x3) == 0) { n >>= 1; totient *= (1<<k); }
874   if ((n & 0x1) == 0) { n >>= 1; totient *= ((1<<k)-1); }
875   nfac = factor(n,factors);
876   for (i = 0; i < nfac; i++) {
877     UV p = factors[i];
878     UV pk = ipow(p,k);
879     totient *= (pk-1);
880     while (i+1 < nfac && p == factors[i+1]) {
881       i++;
882       totient *= pk;
883     }
884   }
885   return totient;
886 }
887 
carmichael_lambda(UV n)888 UV carmichael_lambda(UV n) {
889   UV fac[MPU_MAX_FACTORS+1];
890   int i, nfactors;
891   UV lambda = 1;
892 
893   if (n < 8) return totient(n);
894   if ((n & (n-1)) == 0) return n >> 2;
895 
896   i = ctz(n);
897   if (i > 0) {
898     n >>= i;
899     lambda <<= (i>2) ? i-2 : i-1;
900   }
901   nfactors = factor(n, fac);
902   for (i = 0; i < nfactors; i++) {
903     UV p = fac[i], pk = p-1;
904     while (i+1 < nfactors && p == fac[i+1]) {
905       i++;
906       pk *= p;
907     }
908     lambda = lcm_ui(lambda, pk);
909   }
910   return lambda;
911 }
912 
is_carmichael(UV n)913 int is_carmichael(UV n) {
914   UV fac[MPU_MAX_FACTORS+1];
915   UV exp[MPU_MAX_FACTORS+1];
916   int i, nfactors;
917 
918   /* Small or even is not a Carmichael number */
919   if (n < 561 || !(n&1)) return 0;
920 
921   /* Simple pre-test for square free (odds only) */
922   if (!(n% 9) || !(n%25) || !(n%49) || !(n%121) || !(n%169))
923     return 0;
924 
925   /* Check Korselt's criterion for small divisors */
926   if (!(n% 5) && ((n-1) %  4 != 0)) return 0;
927   if (!(n% 7) && ((n-1) %  6 != 0)) return 0;
928   if (!(n%11) && ((n-1) % 10 != 0)) return 0;
929   if (!(n%13) && ((n-1) % 12 != 0)) return 0;
930   if (!(n%17) && ((n-1) % 16 != 0)) return 0;
931   if (!(n%19) && ((n-1) % 18 != 0)) return 0;
932   if (!(n%23) && ((n-1) % 22 != 0)) return 0;
933 
934   /* Fast check without having to factor */
935   if (n > 5000000) {
936     if (!(n%29) && ((n-1) % 28 != 0)) return 0;
937     if (!(n%31) && ((n-1) % 30 != 0)) return 0;
938     if (!(n%37) && ((n-1) % 36 != 0)) return 0;
939     if (!(n%41) && ((n-1) % 40 != 0)) return 0;
940     if (!(n%43) && ((n-1) % 42 != 0)) return 0;
941     if (!is_pseudoprime(n,2)) return 0;
942   }
943 
944   nfactors = factor_exp(n, fac, exp);
945   if (nfactors < 3)
946     return 0;
947   for (i = 0; i < nfactors; i++) {
948     if (exp[i] > 1  ||  ((n-1) % (fac[i]-1)) != 0)
949       return 0;
950   }
951   return 1;
952 }
953 
is_quasi_base(int nfactors,UV * fac,UV p,UV b)954 static int is_quasi_base(int nfactors, UV *fac, UV p, UV b) {
955   int i;
956   for (i = 0; i < nfactors; i++) {
957     UV d = fac[i] - b;
958     if (d == 0 || (p % d) != 0)
959       return 0;
960   }
961   return 1;
962 }
963 
964 /* Returns number of bases that pass */
is_quasi_carmichael(UV n)965 UV is_quasi_carmichael(UV n) {
966   UV nbases, fac[MPU_MAX_FACTORS+1], exp[MPU_MAX_FACTORS+1];
967   UV spf, lpf, ndivisors, *divs;
968   int i, nfactors;
969 
970   if (n < 35) return 0;
971 
972   /* Simple pre-test for square free */
973   if (!(n% 4) || !(n% 9) || !(n%25) || !(n%49) || !(n%121) || !(n%169))
974     return 0;
975 
976   nfactors = factor_exp(n, fac, exp);
977   /* Must be composite */
978   if (nfactors < 2)
979     return 0;
980   /* Must be square free */
981   for (i = 0; i < nfactors; i++)
982     if (exp[i] > 1)
983       return 0;
984 
985   nbases = 0;
986   spf = fac[0];
987   lpf = fac[nfactors-1];
988 
989   /* Algorithm from Hiroaki Yamanouchi, 2015 */
990   if (nfactors == 2) {
991     divs = _divisor_list(n / spf - 1, &ndivisors);
992     for (i = 0; i < (int)ndivisors; i++) {
993       UV d = divs[i];
994       UV k = spf - d;
995       if (d >= spf) break;
996       if (is_quasi_base(nfactors, fac, n-k, k))
997         nbases++;
998     }
999   } else {
1000     divs = _divisor_list(lpf * (n / lpf - 1), &ndivisors);
1001     for (i = 0; i < (int)ndivisors; i++) {
1002       UV d = divs[i];
1003       UV k = lpf - d;
1004       if (lpf > d && k >= spf) continue;
1005       if (k != 0 && is_quasi_base(nfactors, fac, n-k, k))
1006         nbases++;
1007     }
1008   }
1009   Safefree(divs);
1010   return nbases;
1011 }
1012 
is_semiprime(UV n)1013 int is_semiprime(UV n) {
1014   UV sp, p, n3, factors[2];
1015 
1016   if (n < 6) return (n == 4);
1017   if (!(n&1)) return !!is_prob_prime(n>>1);
1018   if (!(n%3)) return !!is_prob_prime(n/3);
1019   if (!(n%5)) return !!is_prob_prime(n/5);
1020   /* 27% of random inputs left */
1021   n3 = icbrt(n);
1022   for (sp = 4; sp < 60; sp++) {
1023     p = primes_tiny[sp];
1024     if (p > n3)
1025       break;
1026     if ((n % p) == 0)
1027       return !!is_prob_prime(n/p);
1028   }
1029   /* 9.8% of random inputs left */
1030   if (is_def_prime(n)) return 0;
1031   if (p > n3) return 1; /* past this, n is a composite and larger than p^3 */
1032   /* 4-8% of random inputs left */
1033   if (factor_one(n, factors, 0, 0) != 2) return 0;
1034   return (is_def_prime(factors[0]) && is_def_prime(factors[1]));
1035 }
1036 
is_fundamental(UV n,int neg)1037 int is_fundamental(UV n, int neg) {
1038   UV r = n & 15;
1039   if (r) {
1040     if (!neg) {
1041       switch (r & 3) {
1042         case 0:  return (r ==  4) ? 0 : is_square_free(n >> 2);
1043         case 1:  return is_square_free(n);
1044         default: break;
1045       }
1046     } else {
1047       switch (r & 3) {
1048         case 0:  return (r == 12) ? 0 : is_square_free(n >> 2);
1049         case 3:  return is_square_free(n);
1050         default: break;
1051       }
1052     }
1053   }
1054   return 0;
1055 }
1056 
_totpred(UV n,UV maxd)1057 static int _totpred(UV n, UV maxd) {
1058   UV i, ndivisors, *divs;
1059   int res;
1060 
1061   if (n & 1) return 0;
1062   n >>= 1;
1063   if (n == 1) return 1;
1064   if (n < maxd && is_prime(2*n+1)) return 1;
1065 
1066   divs = _divisor_list(n, &ndivisors);
1067   for (i = 0, res = 0; i < ndivisors && divs[i] < maxd && res == 0; i++) {
1068     UV r, d = divs[i], p = 2*d+1;
1069     if (!is_prime(p)) continue;
1070     r = n/d;
1071     while (1) {
1072       if (r == p || _totpred(r, d)) { res = 1; break; }
1073       if (r % p) break;
1074       r /= p;
1075     }
1076   }
1077   Safefree(divs);
1078   return res;
1079 }
1080 
is_totient(UV n)1081 int is_totient(UV n) {
1082   return (n == 0 || (n & 1))  ?  (n==1)  :  _totpred(n,n);
1083 }
1084 
1085 
inverse_totient_count(UV n)1086 UV inverse_totient_count(UV n) {
1087   set_t set, sumset;
1088   keyval_t keyval;
1089   UV res, i, ndivisors, *divs;
1090 
1091   if (n == 1) return 2;
1092   if (n < 1 || n & 1) return 0;
1093   if (is_prime(n >> 1)) { /* Coleman Remark 3.3 (Thm 3.1) and Prop 6.2 */
1094     if (!is_prime(n+1)) return 0;
1095     if (n >= 10)        return 2;
1096   }
1097 
1098   divs = _divisor_list(n, &ndivisors);
1099 
1100   init_set(&set, 2*ndivisors);
1101   keyval.key = 1;  keyval.val = 1;
1102   set_addsum(&set, keyval);
1103 
1104   for (i = 0; i < ndivisors; i++) {
1105     UV d = divs[i],  p = d+1;
1106     if (is_prime(p)) {
1107       UV j,  np = d,  v = valuation(n, p);
1108       init_set(&sumset, ndivisors/2);
1109       for (j = 0; j <= v; j++) {
1110         UV k, ndiv = n/np;  /* Loop over divisors of n/np */
1111         if (np == 1) {
1112           keyval_t kv;  kv.key = 1; kv.val = 1;
1113           set_addsum(&sumset, kv);
1114         } else {
1115           for (k = 0; k < ndivisors && divs[k] <= ndiv; k++) {
1116             UV val, d2 = divs[k];
1117             if ((ndiv % d2) != 0) continue;
1118             val = set_getval(set, d2);
1119             if (val > 0) {
1120               keyval_t kv;  kv.key = d2*np; kv.val = val;
1121               set_addsum(&sumset, kv);
1122             }
1123           }
1124         }
1125         /* if (j < v && np > UV_MAX/p) croak("overflow np  d %lu", d); */
1126         np *= p;
1127       }
1128       set_merge(&set, sumset);
1129       free_set(&sumset);
1130     }
1131   }
1132   Safefree(divs);
1133   res = set_getval(set, n);
1134   free_set(&set);
1135   return res;
1136 }
1137 
inverse_totient_list(UV * ntotients,UV n)1138 UV* inverse_totient_list(UV *ntotients, UV n) {
1139   set_list_t setlist, divlist;
1140   UV i, ndivisors, *divs, *tlist;
1141   UV *totlist = 0;
1142 
1143   MPUassert(n <= UV_MAX/7.5, "inverse_totient_list n too large");
1144 
1145   if (n == 1) {
1146     New(0, totlist, 2, UV);
1147     totlist[0] = 1;  totlist[1] = 2;
1148     *ntotients = 2;
1149     return totlist;
1150   }
1151   if (n < 1 || n & 1) {
1152     *ntotients = 0;
1153     return totlist;
1154   }
1155   if (is_prime(n >> 1)) { /* Coleman Remark 3.3 (Thm 3.1) and Prop 6.2 */
1156     if (!is_prime(n+1)) {
1157       *ntotients = 0;
1158       return totlist;
1159     }
1160     if (n >= 10) {
1161       New(0, totlist, 2, UV);
1162       totlist[0] = n+1;  totlist[1] = 2*n+2;
1163       *ntotients = 2;
1164       return totlist;
1165     }
1166   }
1167 
1168   divs = _divisor_list(n, &ndivisors);
1169 
1170   init_setlist(&setlist, 2*ndivisors);
1171   setlist_addval(&setlist, 1, 1);   /* Add 1 => [1] */
1172 
1173   for (i = 0; i < ndivisors; i++) {
1174     UV d = divs[i],  p = d+1;
1175     if (is_prime(p)) {
1176       UV j,  dp = d,  pp = p,  v = valuation(n, p);
1177       init_setlist(&divlist, ndivisors/2);
1178       for (j = 0; j <= v; j++) {
1179         UV k, ndiv = n/dp;  /* Loop over divisors of n/dp */
1180         if (dp == 1) {
1181           setlist_addval(&divlist, 1, 2);   /* Add 1 => [2] */
1182         } else {
1183           for (k = 0; k < ndivisors && divs[k] <= ndiv; k++) {
1184             UV nvals, *vals, d2 = divs[k];
1185             if ((ndiv % d2) != 0) continue;
1186             vals = setlist_getlist(&nvals, setlist, d2);
1187             if (vals != 0)
1188               setlist_addlist(&divlist, d2 * dp, nvals, vals, pp);
1189           }
1190         }
1191         dp *= p;
1192         pp *= p;
1193       }
1194       setlist_merge(&setlist, divlist);
1195       free_setlist(&divlist);
1196     }
1197   }
1198   Safefree(divs);
1199   tlist = setlist_getlist(ntotients, setlist, n);
1200   if (tlist != 0 && *ntotients > 0) {
1201     New(0, totlist, *ntotients, UV);
1202     memcpy(totlist, tlist, *ntotients * sizeof(UV));
1203     qsort(totlist, *ntotients, sizeof(UV), _numcmp);
1204   }
1205   free_setlist(&setlist);
1206   return totlist;
1207 }
1208 
1209 
pillai_v(UV n)1210 UV pillai_v(UV n) {
1211   UV v, fac;
1212   if (n == 0) return 0;
1213   for (v = 8, fac = 5040 % n; v < n-1 && fac != 0; v++) {
1214     fac = (n < HALF_WORD) ? (fac*v) % n : mulmod(fac,v,n);
1215     if (fac == n-1 && (n % v) != 1)
1216       return v;
1217   }
1218   return 0;
1219 }
1220 
1221 
moebius(UV n)1222 int moebius(UV n) {
1223   UV factors[MPU_MAX_FACTORS+1];
1224   int i, nfactors;
1225 
1226   if (n <= 5) return (n == 1) ? 1 : (n % 4) ? -1 : 0;
1227   if (n >=  49 && (!(n %   4) || !(n %   9) || !(n %  25) || !(n %  49)))
1228     return 0;
1229   if (n >= 361 && (!(n % 121) || !(n % 169) || !(n % 289) || !(n % 361)))
1230     return 0;
1231   if (n >= 961 && (!(n % 529) || !(n % 841) || !(n % 961)))
1232     return 0;
1233 
1234   nfactors = factor(n, factors);
1235   for (i = 1; i < nfactors; i++)
1236     if (factors[i] == factors[i-1])
1237       return 0;
1238   return (nfactors % 2) ? -1 : 1;
1239 }
1240 
exp_mangoldt(UV n)1241 UV exp_mangoldt(UV n) {
1242   UV p;
1243   if (!primepower(n,&p)) return 1;     /* Not a prime power */
1244   return p;
1245 }
1246 
1247 
znorder(UV a,UV n)1248 UV znorder(UV a, UV n) {
1249   UV fac[MPU_MAX_FACTORS+1];
1250   UV exp[MPU_MAX_FACTORS+1];
1251   int i, nfactors;
1252   UV k, phi;
1253 
1254   if (n <= 1) return n;   /* znorder(x,0) = 0, znorder(x,1) = 1          */
1255   if (a <= 1) return a;   /* znorder(0,x) = 0, znorder(1,x) = 1  (x > 1) */
1256   if (gcd_ui(a,n) > 1)  return 0;
1257 
1258   /* Cohen 1.4.3 using Carmichael Lambda */
1259   phi = carmichael_lambda(n);
1260   nfactors = factor_exp(phi, fac, exp);
1261   k = phi;
1262 #if USE_MONTMATH
1263   if (n & 1) {
1264     const uint64_t npi = mont_inverse(n),  mont1 = mont_get1(n);
1265     UV ma = mont_geta(a, n);
1266     for (i = 0; i < nfactors; i++) {
1267       UV b, a1, ek, pi = fac[i], ei = exp[i];
1268       b = ipow(pi,ei);
1269       k /= b;
1270       a1 = mont_powmod(ma, k, n);
1271       for (ek = 0; a1 != mont1 && ek++ <= ei; a1 = mont_powmod(a1, pi, n))
1272         k *= pi;
1273       if (ek > ei) return 0;
1274     }
1275   } else
1276 #endif
1277   for (i = 0; i < nfactors; i++) {
1278     UV b, a1, ek, pi = fac[i], ei = exp[i];
1279     b = ipow(pi,ei);
1280     k /= b;
1281     a1 = powmod(a, k, n);
1282     for (ek = 0; a1 != 1 && ek++ <= ei; a1 = powmod(a1, pi, n))
1283       k *= pi;
1284     if (ek > ei) return 0;
1285   }
1286   return k;
1287 }
1288 
znprimroot(UV n)1289 UV znprimroot(UV n) {
1290   UV fac[MPU_MAX_FACTORS+1];
1291   UV phi_div_fac[MPU_MAX_FACTORS+1];
1292   UV a, phi, on, r;
1293   int i, nfactors;
1294 
1295   if (n <= 4) return (n == 0) ? 0 : n-1;
1296   if (n % 4 == 0)  return 0;
1297 
1298   on = (n&1) ? n : (n>>1);
1299   a = powerof(on);
1300   r = rootof(on, a);
1301   if (!is_prob_prime(r)) return 0;        /* c^a or 2c^a */
1302   phi = (r-1) * (on/r);                   /* p^a or 2p^a */
1303 
1304   nfactors = factor_exp(phi, fac, 0);
1305   for (i = 0; i < nfactors; i++)
1306     phi_div_fac[i] = phi / fac[i];
1307 
1308 #if USE_MONTMATH
1309   if (n & 1) {
1310     const uint64_t npi = mont_inverse(n),  mont1 = mont_get1(n);
1311     for (a = 2; a < n; a++) {
1312       if (a == 4 || a == 8 || a == 9) continue;  /* Skip some perfect powers */
1313       /* Skip values we know can't be right: (a|n) = 0 (or 1 for odd primes) */
1314       if (phi == n-1) { if (kronecker_uu(a, n) != -1) continue; }
1315       else            { if (gcd_ui(a,n) != 1) continue; }
1316       r = mont_geta(a, n);
1317       for (i = 0; i < nfactors; i++)
1318         if (mont_powmod(r, phi_div_fac[i], n) == mont1)
1319           break;
1320       if (i == nfactors) return a;
1321     }
1322   } else
1323 #endif
1324   for (a = 2; a < n; a++) {
1325     if (a == 4 || a == 8 || a == 9) continue;  /* Skip some perfect powers */
1326     /* Skip values we know can't be right: (a|n) = 0 (or 1 for odd primes) */
1327     if (phi == n-1) { if (kronecker_uu(a, n) != -1) continue; }
1328     else            { if (gcd_ui(a,n) != 1) continue; }
1329     for (i = 0; i < nfactors; i++)
1330       if (powmod(a, phi_div_fac[i], n) == 1)
1331         break;
1332     if (i == nfactors) return a;
1333   }
1334   return 0;
1335 }
1336 
is_primitive_root(UV a,UV n,int nprime)1337 int is_primitive_root(UV a, UV n, int nprime) {
1338   UV s, fac[MPU_MAX_FACTORS+1];
1339   int i, nfacs;
1340 
1341   if (n <= 1) return n;
1342   if (a >= n) a %= n;
1343   if (n <= 4) return a == n-1;
1344   if (n % 4 == 0)  return 0;
1345 
1346   /* Very simple, but not fast:
1347    *     s = nprime ? n-1 : totient(n);
1348    *     return s == znorder(a, n);
1349    */
1350 
1351   if (gcd_ui(a,n) != 1) return 0;
1352   if (nprime) {
1353     s = n-1;
1354   } else {
1355     UV on = (n&1) ? n : (n>>1);
1356     UV k = powerof(on);
1357     UV r = rootof(on, k);
1358     if (!is_prob_prime(r)) return 0;        /* c^a or 2c^a */
1359     s = (r-1) * (on/r);                     /* p^a or 2p^a */
1360   }
1361   if (s == n-1 && kronecker_uu(a,n) != -1) return 0;
1362 
1363   /* a^x can be a primitive root only if gcd(x,s) = 1 */
1364   i = is_power(a,0);
1365   if (i > 1 && gcd_ui(i, s) != 1) return 0;
1366 
1367 #if USE_MONTMATH
1368   if (n & 1) {
1369     const uint64_t npi = mont_inverse(n),  mont1 = mont_get1(n);
1370     a = mont_geta(a, n);
1371     /* Quick check for small factors before full factor */
1372     if ((s % 2) == 0 && mont_powmod(a, s/2, n) == mont1) return 0;
1373     if ((s % 3) == 0 && mont_powmod(a, s/3, n) == mont1) return 0;
1374     if ((s % 5) == 0 && mont_powmod(a, s/5, n) == mont1) return 0;
1375     nfacs = factor_exp(s, fac, 0);
1376     for (i = 0; i < nfacs; i++)
1377       if (fac[i] > 5 && mont_powmod(a, s/fac[i], n) == mont1)
1378         return 0;
1379   } else
1380 #endif
1381   {
1382     /* Quick check for small factors before full factor */
1383     if ((s % 2) == 0 && powmod(a, s/2, n) == 1) return 0;
1384     if ((s % 3) == 0 && powmod(a, s/3, n) == 1) return 0;
1385     if ((s % 5) == 0 && powmod(a, s/5, n) == 1) return 0;
1386     /* Complete factor and check each one not found above. */
1387     nfacs = factor_exp(s, fac, 0);
1388     for (i = 0; i < nfacs; i++)
1389       if (fac[i] > 5 && powmod(a, s/fac[i], n) == 1)
1390         return 0;
1391   }
1392   return 1;
1393 }
1394 
gcdext(IV a,IV b,IV * u,IV * v,IV * cs,IV * ct)1395 IV gcdext(IV a, IV b, IV* u, IV* v, IV* cs, IV* ct) {
1396   IV s = 0;  IV os = 1;
1397   IV t = 1;  IV ot = 0;
1398   IV r = b;  IV or = a;
1399   if (a == 0 && b == 0) { os = 0; t = 0; }
1400   while (r != 0) {
1401     IV quot = or / r;
1402     { IV tmp = r; r = or - quot * r;  or = tmp; }
1403     { IV tmp = s; s = os - quot * s;  os = tmp; }
1404     { IV tmp = t; t = ot - quot * t;  ot = tmp; }
1405   }
1406   if (or < 0) /* correct sign */
1407     { or = -or; os = -os; ot = -ot; }
1408   if (u  != 0) *u = os;
1409   if (v  != 0) *v = ot;
1410   if (cs != 0) *cs = s;
1411   if (ct != 0) *ct = t;
1412   return or;
1413 }
1414 
1415 /* Calculate 1/a mod n. */
modinverse(UV a,UV n)1416 UV modinverse(UV a, UV n) {
1417   IV t = 0;  UV nt = 1;
1418   UV r = n;  UV nr = a;
1419   while (nr != 0) {
1420     UV quot = r / nr;
1421     { UV tmp = nt;  nt = t - quot*nt;  t = tmp; }
1422     { UV tmp = nr;  nr = r - quot*nr;  r = tmp; }
1423   }
1424   if (r > 1) return 0;  /* No inverse */
1425   if (t < 0) t += n;
1426   return t;
1427 }
1428 
divmod(UV a,UV b,UV n)1429 UV divmod(UV a, UV b, UV n) {   /* a / b  mod n */
1430   UV binv = modinverse(b, n);
1431   if (binv == 0)  return 0;
1432   return mulmod(a, binv, n);
1433 }
1434 
_powfactor(UV p,UV d,UV m)1435 static UV _powfactor(UV p, UV d, UV m) {
1436   UV e = 0;
1437   do { d /= p; e += d; } while (d > 0);
1438   return powmod(p, e, m);
1439 }
1440 
factorialmod(UV n,UV m)1441 UV factorialmod(UV n, UV m) {  /*  n! mod m */
1442   UV i, d = n, res = 1;
1443 
1444   if (n >= m || m == 1) return 0;
1445 
1446   if (n <= 10) { /* Keep things simple for small n */
1447     for (i = 2; i <= n && res != 0; i++)
1448       res = (res * i) % m;
1449     return res;
1450   }
1451 
1452   if (n > m/2 && is_prime(m))    /* Check if we can go backwards */
1453     d = m-n-1;
1454   if (d < 2)
1455     return (d == 0) ? m-1 : 1;   /* Wilson's Theorem: n = m-1 and n = m-2 */
1456 
1457   if (d == n && d > 5000000) {   /* Check for composite m that leads to 0 */
1458     UV fac[MPU_MAX_FACTORS+1], exp[MPU_MAX_FACTORS+1];
1459     int j, k, nfacs = factor_exp(m, fac, exp);
1460     for (j = 0; j < nfacs; j++) {
1461       UV t = fac[j];
1462       for (k = 1; (UV)k < exp[j]; k++)
1463         t *= fac[j];
1464       if (n >= t) return 0;
1465     }
1466   }
1467 
1468 #if USE_MONTMATH
1469   if (m & 1 && d < 40000) {
1470     const uint64_t npi = mont_inverse(m),  mont1 = mont_get1(m);
1471     uint64_t monti = mont1;
1472     res = mont1;
1473     for (i = 2; i <= d && res != 0; i++) {
1474       monti = addmod(monti,mont1,m);
1475       res = mont_mulmod(res,monti,m);
1476     }
1477     res = mont_recover(res, m);
1478   } else
1479 #endif
1480   if (d < 10000) {
1481     for (i = 2; i <= d && res != 0; i++)
1482       res = mulmod(res,i,m);
1483   } else {
1484 #if 0    /* Monolithic prime walk */
1485     START_DO_FOR_EACH_PRIME(2, d) {
1486       UV k = (p > (d>>1))  ?  p  :  _powfactor(p, d, m);
1487       res = mulmod(res, k, m);
1488       if (res == 0) break;
1489     } END_DO_FOR_EACH_PRIME;
1490 #else    /* Segmented prime walk */
1491     unsigned char* segment;
1492     UV seg_base, seg_low, seg_high;
1493     void* ctx = start_segment_primes(7, d, &segment);
1494     for (i = 1; i <= 3; i++)    /* Handle 2,3,5 assume d>10*/
1495       res = mulmod(res, _powfactor(2*i - (i>1), d, m), m);
1496     while (res != 0 && next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
1497       START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high )
1498         UV k = (p > (d>>1))  ?  p  :  _powfactor(p, d, m);
1499         res = mulmod(res, k, m);
1500         if (res == 0) break;
1501       END_DO_FOR_EACH_SIEVE_PRIME
1502     }
1503     end_segment_primes(ctx);
1504 #endif
1505   }
1506 
1507   if (d != n && res != 0) {      /* Handle backwards case */
1508     if (!(d&1)) res = submod(m,res,m);
1509     res = modinverse(res,m);
1510   }
1511 
1512   return res;
1513 }
1514 
verify_sqrtmod(UV s,UV * rs,UV a,UV p)1515 static int verify_sqrtmod(UV s, UV *rs, UV a, UV p) {
1516   if (p-s < s)  s = p-s;
1517   if (mulmod(s, s, p) != a) return 0;
1518   *rs = s;
1519   return 1;
1520 }
1521 #if !USE_MONTMATH
_sqrtmod_prime(UV a,UV p)1522 UV _sqrtmod_prime(UV a, UV p) {
1523   if ((p % 4) == 3) {
1524     return powmod(a, (p+1)>>2, p);
1525   }
1526   if ((p % 8) == 5) { /* Atkin's algorithm.  Faster than Legendre. */
1527     UV a2, alpha, beta, b;
1528     a2 = addmod(a,a,p);
1529     alpha = powmod(a2,(p-5)>>3,p);
1530     beta  = mulmod(a2,sqrmod(alpha,p),p);
1531     b     = mulmod(alpha, mulmod(a, (beta ? beta-1 : p-1), p), p);
1532     return b;
1533   }
1534   if ((p % 16) == 9) { /* Müller's algorithm extending Atkin */
1535     UV a2, alpha, beta, b, d = 1;
1536     a2 = addmod(a,a,p);
1537     alpha = powmod(a2, (p-9)>>4, p);
1538     beta  = mulmod(a2, sqrmod(alpha,p), p);
1539     if (sqrmod(beta,p) != p-1) {
1540       do { d += 2; } while (kronecker_uu(d,p) != -1 && d < p);
1541       alpha = mulmod(alpha, powmod(d,(p-9)>>3,p), p);
1542       beta  = mulmod(a2, mulmod(sqrmod(d,p),sqrmod(alpha,p),p), p);
1543     }
1544     b = mulmod(alpha, mulmod(a, mulmod(d,(beta ? beta-1 : p-1),p),p),p);
1545     return b;
1546   }
1547 
1548   /* Verify Euler condition for odd p */
1549   if ((p & 1) && powmod(a,(p-1)>>1,p) != 1) return 0;
1550 
1551   {
1552     UV x, q, e, t, z, r, m, b;
1553     q = p-1;
1554     e = valuation(q, 2);
1555     q >>= e;
1556     t = 3;
1557     while (kronecker_uu(t, p) != -1) {
1558       t += 2;
1559       if (t == 201) {           /* exit if p looks like a composite */
1560         if ((p % 2) == 0 || powmod(2, p-1, p) != 1 || powmod(3, p-1, p) != 1)
1561           return 0;
1562       } else if (t >= 20000) {  /* should never happen */
1563         return 0;
1564       }
1565     }
1566     z = powmod(t, q, p);
1567     b = powmod(a, q, p);
1568     r = e;
1569     q = (q+1) >> 1;
1570     x = powmod(a, q, p);
1571     while (b != 1) {
1572       t = b;
1573       for (m = 0; m < r && t != 1; m++)
1574         t = sqrmod(t, p);
1575       if (m >= r) break;
1576       t = powmod(z, UVCONST(1) << (r-m-1), p);
1577       x = mulmod(x, t, p);
1578       z = mulmod(t, t, p);
1579       b = mulmod(b, z, p);
1580       r = m;
1581     }
1582     return x;
1583   }
1584   return 0;
1585 }
1586 #else
_sqrtmod_prime(UV a,UV p)1587 UV _sqrtmod_prime(UV a, UV p) {
1588   const uint64_t npi = mont_inverse(p),  mont1 = mont_get1(p);
1589   a = mont_geta(a,p);
1590 
1591   if ((p % 4) == 3) {
1592     UV b = mont_powmod(a, (p+1)>>2, p);
1593     return mont_recover(b, p);
1594   }
1595 
1596   if ((p % 8) == 5) { /* Atkin's algorithm.  Faster than Legendre. */
1597     UV a2, alpha, beta, b;
1598     a2 = addmod(a,a,p);
1599     alpha = mont_powmod(a2,(p-5)>>3,p);
1600     beta  = mont_mulmod(a2,mont_sqrmod(alpha,p),p);
1601     beta  = submod(beta, mont1, p);
1602     b     = mont_mulmod(alpha, mont_mulmod(a, beta, p), p);
1603     return mont_recover(b, p);
1604   }
1605   if ((p % 16) == 9) { /* Müller's algorithm extending Atkin */
1606     UV a2, alpha, beta, b, d = 1;
1607     a2 = addmod(a,a,p);
1608     alpha = mont_powmod(a2, (p-9)>>4, p);
1609     beta  = mont_mulmod(a2, mont_sqrmod(alpha,p), p);
1610     if (mont_sqrmod(beta,p) != submod(0,mont1,p)) {
1611       do { d += 2; } while (kronecker_uu(d,p) != -1 && d < p);
1612       d = mont_geta(d,p);
1613       alpha = mont_mulmod(alpha, mont_powmod(d,(p-9)>>3,p), p);
1614       beta  = mont_mulmod(a2, mont_mulmod(mont_sqrmod(d,p),mont_sqrmod(alpha,p),p), p);
1615       beta  = mont_mulmod(submod(beta,mont1,p), d, p);
1616     } else {
1617       beta  = submod(beta, mont1, p);
1618     }
1619     b = mont_mulmod(alpha, mont_mulmod(a, beta, p), p);
1620     return mont_recover(b, p);
1621   }
1622 
1623   /* Verify Euler condition for odd p */
1624   if ((p & 1) && mont_powmod(a,(p-1)>>1,p) != mont1) return 0;
1625 
1626   {
1627     UV x, q, e, t, z, r, m, b;
1628     q = p-1;
1629     e = valuation(q, 2);
1630     q >>= e;
1631     t = 3;
1632     while (kronecker_uu(t, p) != -1) {
1633       t += 2;
1634       if (t == 201) {           /* exit if p looks like a composite */
1635         if ((p % 2) == 0 || powmod(2, p-1, p) != 1 || powmod(3, p-1, p) != 1)
1636           return 0;
1637       } else if (t >= 20000) {  /* should never happen */
1638         return 0;
1639       }
1640     }
1641     t = mont_geta(t, p);
1642     z = mont_powmod(t, q, p);
1643     b = mont_powmod(a, q, p);
1644     r = e;
1645     q = (q+1) >> 1;
1646     x = mont_powmod(a, q, p);
1647     while (b != mont1) {
1648       t = b;
1649       for (m = 0; m < r && t != mont1; m++)
1650         t = mont_sqrmod(t, p);
1651       if (m >= r) break;
1652       t = mont_powmod(z, UVCONST(1) << (r-m-1), p);
1653       x = mont_mulmod(x, t, p);
1654       z = mont_mulmod(t, t, p);
1655       b = mont_mulmod(b, z, p);
1656       r = m;
1657     }
1658     return mont_recover(x, p);
1659   }
1660   return 0;
1661 }
1662 #endif
1663 
sqrtmod(UV * s,UV a,UV p)1664 int sqrtmod(UV *s, UV a, UV p) {
1665   if (p == 0) return 0;
1666   if (a >= p) a %= p;
1667   if (p <= 2 || a <= 1) return verify_sqrtmod(a, s,a,p);
1668 
1669   return verify_sqrtmod(_sqrtmod_prime(a,p), s,a,p);
1670 }
1671 
sqrtmod_composite(UV * s,UV a,UV n)1672 int sqrtmod_composite(UV *s, UV a, UV n) {
1673   UV fac[MPU_MAX_FACTORS+1];
1674   UV exp[MPU_MAX_FACTORS+1];
1675   UV sqr[MPU_MAX_FACTORS+1];
1676   UV p, j, k, gcdan;
1677   int i, nfactors;
1678 
1679   if (n == 0) return 0;
1680   if (a >= n) a %= n;
1681   if (n <= 2 || a <= 1) return verify_sqrtmod(a, s,a,n);
1682 
1683   /* Simple existence check.  It's still possible no solution exists.*/
1684   if (kronecker_uu(a, ((n%4) == 2) ? n/2 : n) == -1) return 0;
1685 
1686   /* if 8|n 'a' must = 1 mod 8, else if 4|n 'a' must = 1 mod 4 */
1687   if ((n % 4) == 0) {
1688     if ((n % 8) == 0) {
1689       if ((a % 8) != 1) return 0;
1690     } else {
1691       if ((a % 4) != 1) return 0;
1692     }
1693   }
1694 
1695   /* More detailed existence check before factoring.  Still possible. */
1696   gcdan = gcd_ui(a, n);
1697   if (gcdan == 1) {
1698     if ((n % 3) == 0 && kronecker_uu(a, 3) != 1) return 0;
1699     if ((n % 5) == 0 && kronecker_uu(a, 5) != 1) return 0;
1700     if ((n % 7) == 0 && kronecker_uu(a, 7) != 1) return 0;
1701   }
1702 
1703   /* Factor n */
1704   nfactors = factor_exp(n, fac, exp);
1705 
1706   /* If gcd(a,n)==1, this answers comclusively if a solution exists. */
1707   if (gcdan == 1) {
1708     for (i = 0; i < nfactors; i++)
1709       if (fac[i] > 7 && kronecker_uu(a, fac[i]) != 1) return 0;
1710   }
1711 
1712   for (i = 0; i < nfactors; i++) {
1713 
1714     /* Powers of 2 */
1715     if (fac[i] == 2) {
1716       if (exp[i] == 1) {
1717         sqr[i] = a & 1;
1718       } else if (exp[i] == 2) {
1719         sqr[i] = 1;  /* and 3 */
1720       } else {
1721         UV this_roots[256], next_roots[256];
1722         UV nthis = 0, nnext = 0;
1723         this_roots[nthis++] = 1;
1724         this_roots[nthis++] = 3;
1725         for (j = 2; j < exp[i]; j++) {
1726           p = UVCONST(1) << (j+1);
1727           nnext = 0;
1728           for (k = 0; k < nthis && nnext < 254; k++) {
1729             UV r = this_roots[k];
1730             if (sqrmod(r,p) == (a % p))
1731               next_roots[nnext++] = r;
1732             if (sqrmod(p-r,p) == (a % p))
1733               next_roots[nnext++] = p-r;
1734           }
1735           if (nnext == 0) return 0;
1736           /* copy next exponent's found roots to this one */
1737           nthis = nnext;
1738           for (k = 0; k < nnext; k++)
1739             this_roots[k] = next_roots[k];
1740         }
1741         sqr[i] = this_roots[0];
1742       }
1743       continue;
1744     }
1745 
1746     /* p is an odd prime */
1747     p = fac[i];
1748     if (!sqrtmod(&(sqr[i]), a, p))
1749       return 0;
1750 
1751     /* Lift solution of x^2 = a mod p  to  x^2 = a mod p^e */
1752     for (j = 1; j < exp[i]; j++) {
1753       UV xk2, yk, expect, sol;
1754       xk2 = addmod(sqr[i],sqr[i],p);
1755       yk = modinverse(xk2, p);
1756       expect = mulmod(xk2,yk,p);
1757       p *= fac[i];
1758       sol = submod(sqr[i], mulmod(submod(sqrmod(sqr[i],p), a % p, p), yk, p), p);
1759       if (expect != 1 || sqrmod(sol,p) != (a % p)) {
1760         /* printf("a %lu failure to lift to %lu^%d\n", a, fac[i], j+1); */
1761         return 0;
1762       }
1763       sqr[i] = sol;
1764     }
1765   }
1766 
1767   /* raise fac[i] */
1768   for (i = 0; i < nfactors; i++)
1769     fac[i] = ipow(fac[i], exp[i]);
1770 
1771   p = chinese(sqr, fac, nfactors, &i);
1772   return (i == 1) ? verify_sqrtmod(p, s, a, n) : 0;
1773 }
1774 
1775 /* works only for co-prime inputs and also slower than the algorithm below,
1776  * but handles the case where IV_MAX < lcm <= UV_MAX.
1777  */
_simple_chinese(UV * a,UV * n,UV num,int * status)1778 static UV _simple_chinese(UV* a, UV* n, UV num, int* status) {
1779   UV i, lcm = 1, res = 0;
1780   *status = 0;
1781   if (num == 0) return 0;
1782 
1783   for (i = 0; i < num; i++) {
1784     UV ni = n[i];
1785     UV gcd = gcd_ui(lcm, ni);
1786     if (gcd != 1) return 0;           /* not coprime */
1787     ni /= gcd;
1788     if (ni > (UV_MAX/lcm)) return 0;  /* lcm overflow */
1789     lcm *= ni;
1790   }
1791   for (i = 0; i < num; i++) {
1792     UV p, inverse, term;
1793     p = lcm / n[i];
1794     inverse = modinverse(p, n[i]);
1795     if (inverse == 0) return 0;       /* n's coprime so should never happen */
1796     term = mulmod(p, mulmod(a[i], inverse, lcm), lcm);
1797     res = addmod(res, term, lcm);
1798   }
1799   *status = 1;
1800   return res;
1801 }
1802 
1803 /* status: 1 ok, -1 no inverse, 0 overflow */
chinese(UV * a,UV * n,UV num,int * status)1804 UV chinese(UV* a, UV* n, UV num, int* status) {
1805   static unsigned short sgaps[] = {7983,3548,1577,701,301,132,57,23,10,4,1,0};
1806   UV gcd, i, j, lcm, sum, gi, gap;
1807   *status = 1;
1808   if (num == 0) return 0;
1809 
1810   /* Sort modulii, largest first */
1811   for (gi = 0, gap = sgaps[gi]; gap >= 1; gap = sgaps[++gi]) {
1812     for (i = gap; i < num; i++) {
1813       UV tn = n[i], ta = a[i];
1814       for (j = i; j >= gap && n[j-gap] < tn; j -= gap)
1815         {  n[j] = n[j-gap];  a[j] = a[j-gap];  }
1816       n[j] = tn;  a[j] = ta;
1817     }
1818   }
1819 
1820   if (n[0] > IV_MAX) return _simple_chinese(a,n,num,status);
1821   lcm = n[0]; sum = a[0] % n[0];
1822   for (i = 1; i < num; i++) {
1823     IV u, v, t, s;
1824     UV vs, ut;
1825     gcd = gcdext(lcm, n[i], &u, &v, &s, &t);
1826     if (gcd != 1 && ((sum % gcd) != (a[i] % gcd))) { *status = -1; return 0; }
1827     if (s < 0) s = -s;
1828     if (t < 0) t = -t;
1829     if (s > (IV)(IV_MAX/lcm)) return _simple_chinese(a,n,num,status);
1830     lcm *= s;
1831     if (u < 0) u += lcm;
1832     if (v < 0) v += lcm;
1833     vs = mulmod((UV)v, (UV)s, lcm);
1834     ut = mulmod((UV)u, (UV)t, lcm);
1835     sum = addmod(  mulmod(vs, sum, lcm),  mulmod(ut, a[i], lcm),  lcm  );
1836   }
1837   return sum;
1838 }
1839 
chebyshev_psi(UV n)1840 NV chebyshev_psi(UV n)
1841 {
1842   UV k;
1843   KAHAN_INIT(sum);
1844 
1845   for (k = log2floor(n); k > 0; k--) {
1846     KAHAN_SUM(sum, chebyshev_theta(rootof(n,k)));
1847   }
1848   return sum;
1849 }
1850 
1851 #if BITS_PER_WORD == 64
1852 typedef struct {
1853   UV n;
1854   LNV theta;
1855 } cheby_theta_t;
1856 static const cheby_theta_t _cheby_theta[] = { /* >= quad math precision */
1857   { UVCONST(      67108864),LNVCONST(    67100507.6357700963903836828562472350035880) },
1858   { UVCONST(     100000000),LNVCONST(    99987730.0180220043832124342600487053812729) },
1859   { UVCONST(     134217728),LNVCONST(   134204014.5735572091791081610859055728165544) },
1860   { UVCONST(     268435456),LNVCONST(   268419741.6134308193112682817754501071404173) },
1861   { UVCONST(     536870912),LNVCONST(   536842885.8045763840625719515011160692495056) },
1862   { UVCONST(    1000000000),LNVCONST(   999968978.5775661447991262386023331863364793) },
1863   { UVCONST(    1073741824),LNVCONST(  1073716064.8860663337617909073555831842945484) },
1864   { UVCONST(    2147483648),LNVCONST(  2147432200.2475857676814950053003448716360822) },
1865   { UVCONST(    4294967296),LNVCONST(  4294889489.1735446386752045191908417183337361) },
1866   { UVCONST(    8589934592),LNVCONST(  8589863179.5654263491545135406516173629373070) },
1867   { UVCONST(   10000000000),LNVCONST(  9999939830.6577573841592219954033850595228736) },
1868   { UVCONST(   12884901888),LNVCONST( 12884796620.4324254952601520445848183460347362) },
1869   { UVCONST(   17179869184),LNVCONST( 17179757715.9924077567777285147574707468995695) },
1870   { UVCONST(   21474836480),LNVCONST( 21474693322.0998273969188369449626287713082943) },
1871   { UVCONST(   25769803776),LNVCONST( 25769579799.3751535467593954636665656772211515) },
1872   { UVCONST(   30064771072),LNVCONST( 30064545001.2305211029215168703433831598544454) },
1873   { UVCONST(   34359738368),LNVCONST( 34359499180.0126643918259085362039638823175054) },
1874   { UVCONST(   51539607552),LNVCONST( 51539356394.9531019037592855639826469993402730) },
1875   { UVCONST(   68719476736),LNVCONST( 68719165213.6369838785284711480925219076501720) },
1876   { UVCONST(   85899345920),LNVCONST( 85899083852.3471545629838432726841470626910905) },
1877   { UVCONST(  100000000000),LNVCONST( 99999737653.1074446948519125729820679772770146) },
1878   { UVCONST(  103079215104),LNVCONST(103079022007.113299711630969211422868856259124) },
1879   { UVCONST(  120259084288),LNVCONST(120258614516.787336970535750737470005730125261) },
1880   { UVCONST(  137438953472),LNVCONST(137438579206.444595884982301543904849253294539) },
1881   { UVCONST(  171798691840),LNVCONST(171798276885.585945657918751085729734540334501) },
1882   { UVCONST(  206158430208),LNVCONST(206158003808.160276853604927822609009916573462) },
1883   { UVCONST(  240518168576),LNVCONST(240517893445.995868018331936763125264759516048) },
1884   { UVCONST(  274877906944),LNVCONST(274877354651.045354829956619821889825596300686) },
1885   { UVCONST(  309237645312),LNVCONST(309237050379.850690561796126460858271984023198) },
1886   { UVCONST(  343597383680),LNVCONST(343596855806.595496630500062749631211394707114) },
1887   { UVCONST(  377957122048),LNVCONST(377956498560.227794386327526022452943941537993) },
1888   { UVCONST(  412316860416),LNVCONST(412316008796.349553568121442261222464590518293) },
1889   { UVCONST(  446676598784),LNVCONST(446675972485.936512329625489223180824947531484) },
1890   { UVCONST(  481036337152),LNVCONST(481035608287.572961376833237046440177624505864) },
1891   { UVCONST(  515396075520),LNVCONST(515395302740.633513931333424447688399032397200) },
1892   { UVCONST(  549755813888),LNVCONST(549755185085.539613556787409928561107952681488) },
1893   { UVCONST(  584115552256),LNVCONST(584115015741.698143680148976236958207248900725) },
1894   { UVCONST(  618475290624),LNVCONST(618474400071.621528348965919774195984612254220) },
1895   { UVCONST(  652835028992),LNVCONST(652834230470.583317059774197550110194348469358) },
1896   { UVCONST(  687194767360),LNVCONST(687193697328.927006867624832386534836384752774) },
1897   { UVCONST(  721554505728),LNVCONST(721553211683.605313067593521060195071837766347) },
1898   { UVCONST(  755914244096),LNVCONST(755913502349.878525212441903698096011352015192) },
1899   { UVCONST(  790273982464),LNVCONST(790273042590.053075430445971969285969445183076) },
1900   { UVCONST(  824633720832),LNVCONST(824633080997.428352876758261549475609957696369) },
1901   { UVCONST(  858993459200),LNVCONST(858992716288.318498931165663742671579465316192) },
1902   { UVCONST(  893353197568),LNVCONST(893352235882.851072417721659027263613727927680) },
1903   { UVCONST(  927712935936),LNVCONST(927711881043.628817668337317445143018372892386) },
1904   { UVCONST(  962072674304),LNVCONST(962071726126.508938539006575212272731584070786) },
1905   { UVCONST(  996432412672),LNVCONST(996431411588.361462717402562171913706963939018) },
1906   { UVCONST( 1099511627776),LNVCONST(1099510565082.05800550569923209414874779035972) },
1907   { UVCONST( 1168231104512),LNVCONST(1168230478726.83399452743801182220790107593115) },
1908   { UVCONST( 1236950581248),LNVCONST(1236949680081.02610603189530371762093291521116) },
1909   { UVCONST( 1305670057984),LNVCONST(1305668780900.04255251887970870257110498423202) },
1910   { UVCONST( 1374389534720),LNVCONST(1374388383792.63751003694755359184583212193880) },
1911   { UVCONST( 1443109011456),LNVCONST(1443107961091.80955496949174183091839841371227) },
1912   { UVCONST( 1511828488192),LNVCONST(1511827317611.91227277802426032456922797572429) },
1913   { UVCONST( 1580547964928),LNVCONST(1580546753969.30607547506449941085747942395437) },
1914   { UVCONST( 1649267441664),LNVCONST(1649265973878.75361554498682516738256005501353) },
1915   { UVCONST( 1717986918400),LNVCONST(1717985403764.24562741452793071287954107946922) },
1916   { UVCONST( 1786706395136),LNVCONST(1786704769212.04241689416220650800274263053933) },
1917   { UVCONST( 1855425871872),LNVCONST(1855425013030.54920163513184322741954734357404) },
1918   { UVCONST( 1924145348608),LNVCONST(1924143701943.02957992419280264060220278182021) },
1919   { UVCONST( 1992864825344),LNVCONST(1992863373568.84039296068619447120308124302086) },
1920   { UVCONST( 2061584302080),LNVCONST(2061583632335.91985095534685076604018573279204) },
1921   { UVCONST( 2130303778816),LNVCONST(2113122935598.01727180199783433992649406589029) },
1922   { UVCONST( 2199023255552),LNVCONST(2199021399611.18488312543276191461914978761981) },
1923   { UVCONST( 2267742732288),LNVCONST(2267740947106.05038218811506263712808318234921) },
1924   { UVCONST( 2336462209024),LNVCONST(2336460081480.34962633829077377680844065198307) },
1925   { UVCONST( 2405181685760),LNVCONST(2405179969505.38642629423585641169740223940265) },
1926   { UVCONST( 2473901162496),LNVCONST(2473899311193.37872375168104562948639924654178) },
1927   { UVCONST( 2542620639232),LNVCONST(2542619362554.88893589220737167756411653816418) },
1928   { UVCONST( 2611340115968),LNVCONST(2611338370515.94936514022501267847930999670553) },
1929   { UVCONST( 2680059592704),LNVCONST(2680057722824.52981820001574883706268873541107) },
1930   { UVCONST( 2748779069440),LNVCONST(2748777610452.18903407570165081726781627254885) },
1931   { UVCONST( 2817498546176),LNVCONST(2817497017165.31924616507392971415494161401775) },
1932   { UVCONST( 2886218022912),LNVCONST(2886216579432.32232322707222172612181994322081) },
1933   { UVCONST( 2954937499648),LNVCONST(2954936100812.97301730406598982753121204977388) },
1934   { UVCONST( 3023656976384),LNVCONST(3023654789503.82041452274471455184651411931920) },
1935   { UVCONST( 3298534883328),LNVCONST(3298533215621.76606493931157388037915263658637) },
1936   { UVCONST( 3573412790272),LNVCONST(3573411344351.74163523704886736624674718378131) },
1937   { UVCONST( 3848290697216),LNVCONST(3848288415701.82534219216958446478503907262807) },
1938   { UVCONST( 4123168604160),LNVCONST(4123166102085.86116301709394219323327831487542) },
1939   { UVCONST( 4398046511104),LNVCONST(4398044965678.05143041707871320554940671182665) },
1940   { UVCONST( 4672924418048),LNVCONST(4672922414672.04998927945349278916525727295687) },
1941   { UVCONST( 4947802324992),LNVCONST(4947800056419.04384937181159608905993450182729) },
1942   { UVCONST( 5222680231936),LNVCONST(5222678728087.69487334278665824384732845008859) },
1943   { UVCONST( 5497558138880),LNVCONST(5497555766573.55159115560501595606332808978878) },
1944   { UVCONST( 5772436045824),LNVCONST(5772433560746.27053256770924553245647027548204) },
1945   { UVCONST( 6047313952768),LNVCONST(6047310750621.24497633828761530843255989494448) },
1946   { UVCONST( 6322191859712),LNVCONST(6322189275338.39747421237532473168802646234745) },
1947   { UVCONST( 6597069766656),LNVCONST(6579887620000.56226807898107616294821989189226) },
1948   { UVCONST( 6871947673600),LNVCONST(6871945430474.61791600096091374271286154432006) },
1949   { UVCONST( 7146825580544),LNVCONST(7146823258390.34361980709600216319269118247416) },
1950   { UVCONST( 7421703487488),LNVCONST(7421700443390.35536080251964387835425662360121) },
1951   { UVCONST( 7696581394432),LNVCONST(7696578975137.73249441643024336954233783264803) },
1952   { UVCONST( 7971459301376),LNVCONST(7971457197928.90863708984184849978605273042512) },
1953   { UVCONST( 8246337208320),LNVCONST(8246333982863.77146812177727648999195989358960) },
1954   { UVCONST( 8521215115264),LNVCONST(8529802085075.55635100929751669785228592926043) },
1955   { UVCONST( 8796093022208),LNVCONST(8796089836425.34909684634625258535266362465034) },
1956   { UVCONST( 9345848836096),LNVCONST(9345845828116.77456046925508587313) },
1957   { UVCONST( 9895604649984),LNVCONST(9895601077915.26821447819584407150) },
1958   { UVCONST(10000000000000),LNVCONST(9999996988293.03419965318214160284) },
1959   { UVCONST(15000000000000),LNVCONST(14999996482301.7098815115045166858) },
1960   { UVCONST(20000000000000),LNVCONST(19999995126082.2286880312461318496) },
1961   { UVCONST(25000000000000),LNVCONST(24999994219058.4086216020475916538) },
1962   { UVCONST(30000000000000),LNVCONST(29999995531389.8454274046657200568) },
1963   { UVCONST(35000000000000),LNVCONST(34999992921190.8049427456456479005) },
1964   { UVCONST(40000000000000),LNVCONST(39999993533724.3168289589273168844) },
1965   { UVCONST(45000000000000),LNVCONST(44999993567606.9795798378256194424) },
1966   { UVCONST(50000000000000),LNVCONST(49999992543194.2636545758235373677) },
1967   { UVCONST(55000000000000),LNVCONST(54999990847877.2435105757522625171) },
1968   { UVCONST(60000000000000),LNVCONST(59999990297033.6261976055811111726) },
1969   { UVCONST(65000000000000),LNVCONST(64999990861395.5522142429859245014) },
1970   { UVCONST(70000000000000),LNVCONST(69999994316409.8717306521862685981) },
1971   { UVCONST(75000000000000),LNVCONST(74999990126219.8344899338374090165) },
1972   { UVCONST(80000000000000),LNVCONST(79999990160858.3042387288372250950) },
1973   { UVCONST(85000000000000),LNVCONST(84999987096970.5915212896832780715) },
1974   { UVCONST(90000000000000),LNVCONST(89999989501395.0738966599857919767) },
1975   { UVCONST(95000000000000),LNVCONST(94999990785908.6672552042792168144) },
1976   { UVCONST(100000000000000),LNVCONST(99999990573246.9785384070303475639) },
1977 };
1978 #define NCHEBY_VALS (sizeof(_cheby_theta)/sizeof(_cheby_theta[0]))
1979 #endif
1980 
chebyshev_theta(UV n)1981 NV chebyshev_theta(UV n)
1982 {
1983   uint16_t i = 0;
1984   UV tp, startn, seg_base, seg_low, seg_high;
1985   unsigned char* segment;
1986   void* ctx;
1987   LNV initial_sum, prod = LNV_ONE;
1988   KAHAN_INIT(sum);
1989 
1990   if (n < 500) {
1991     for (i = 1;  (tp = primes_tiny[i]) <= n; i++) {
1992       KAHAN_SUM(sum, loglnv(tp));
1993     }
1994     return sum;
1995   }
1996 
1997 #if defined NCHEBY_VALS
1998   if (n >= _cheby_theta[0].n) {
1999     for (i = 1; i < NCHEBY_VALS; i++)
2000       if (n < _cheby_theta[i].n)
2001         break;
2002     startn = _cheby_theta[i-1].n;
2003     initial_sum = _cheby_theta[i-1].theta;
2004   } else
2005 #endif
2006   {
2007     KAHAN_SUM(sum, loglnv(2*3*5*7*11*13));
2008     startn = 17;
2009     initial_sum = 0;
2010   }
2011 
2012   ctx = start_segment_primes(startn, n, &segment);
2013 #if 0
2014   while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
2015     START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high ) {
2016       KAHAN_SUM(sum, loglnv(p));
2017     } END_DO_FOR_EACH_SIEVE_PRIME
2018   }
2019 #else
2020   while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
2021     START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high ) {
2022       prod *= (LNV) p;
2023       if (++i >= (LNV_IS_QUAD ? 64 : 8)) {
2024         KAHAN_SUM(sum, loglnv(prod));
2025         prod = LNV_ONE;
2026         i = 0;
2027       }
2028     } END_DO_FOR_EACH_SIEVE_PRIME
2029   }
2030   if (prod > 1.0) { KAHAN_SUM(sum, loglnv(prod));  prod = LNV_ONE; }
2031 #endif
2032   end_segment_primes(ctx);
2033 
2034   if (initial_sum > 0) KAHAN_SUM(sum, initial_sum);
2035   return sum;
2036 }
2037 
2038 
2039 
2040 /*
2041  * See:
2042  *  "Multiple-Precision Exponential Integral and Related Functions"
2043  *      by David M. Smith
2044  *  "On the Evaluation of the Complex-Valued Exponential Integral"
2045  *      by Vincent Pegoraro and Philipp Slusallek
2046  *  "Numerical Recipes" 3rd edition
2047  *      by William H. Press et al.
2048  *  "Rational Chevyshev Approximations for the Exponential Integral E_1(x)"
2049  *      by W. J. Cody and Henry C. Thacher, Jr.
2050  *
2051  * Any mistakes here are completely my fault.  This code has not been
2052  * verified for anything serious.  For better results, see:
2053  *    http://www.trnicely.net/pi/pix_0000.htm
2054  * which although the author claims are demonstration programs, will
2055  * undoubtedly produce more reliable results than this code does (I don't
2056  * know of any obvious issues with this code, but it just hasn't been used
2057  * by many people).
2058  */
2059 
2060 static LNV const euler_mascheroni = LNVCONST(0.57721566490153286060651209008240243104215933593992);
2061 static LNV const li2 = LNVCONST(1.045163780117492784844588889194613136522615578151);
2062 
Ei(NV x)2063 NV Ei(NV x) {
2064   LNV val, term;
2065   unsigned int n;
2066   KAHAN_INIT(sum);
2067 
2068   if (x == 0) croak("Invalid input to ExponentialIntegral:  x must be != 0");
2069   /* Protect against messed up rounding modes */
2070   if (x >=  12000) return INFINITY;
2071   if (x <= -12000) return 0;
2072 
2073   if (x < -1) {
2074     /* Continued fraction, good for x < -1 */
2075     LNV lc = 0;
2076     LNV ld = LNV_ONE / (LNV_ONE - (LNV)x);
2077     val = ld * (-explnv(x));
2078     for (n = 1; n <= 100000; n++) {
2079       LNV old, t, n2;
2080       t = (LNV)(2*n + 1) - (LNV) x;
2081       n2 = n * n;
2082       lc = LNV_ONE / (t - n2 * lc);
2083       ld = LNV_ONE / (t - n2 * ld);
2084       old = val;
2085       val *= ld/lc;
2086       if ( fabslnv(val-old) <= LNV_EPSILON*fabslnv(val) )
2087         break;
2088     }
2089   } else if (x < 0) {
2090     /* Rational Chebyshev approximation (Cody, Thacher), good for -1 < x < 0 */
2091     static const LNV C6p[7] = { LNVCONST(-148151.02102575750838086),
2092                                 LNVCONST( 150260.59476436982420737),
2093                                 LNVCONST(  89904.972007457256553251),
2094                                 LNVCONST(  15924.175980637303639884),
2095                                 LNVCONST(   2150.0672908092918123209),
2096                                 LNVCONST(    116.69552669734461083368),
2097                                 LNVCONST(      5.0196785185439843791020) };
2098     static const LNV C6q[7] = { LNVCONST( 256664.93484897117319268),
2099                                 LNVCONST( 184340.70063353677359298),
2100                                 LNVCONST(  52440.529172056355429883),
2101                                 LNVCONST(   8125.8035174768735759866),
2102                                 LNVCONST(    750.43163907103936624165),
2103                                 LNVCONST(     40.205465640027706061433),
2104                                 LNVCONST(      1.0000000000000000000000) };
2105     LNV sumn = C6p[0]-x*(C6p[1]-x*(C6p[2]-x*(C6p[3]-x*(C6p[4]-x*(C6p[5]-x*C6p[6])))));
2106     LNV sumd = C6q[0]-x*(C6q[1]-x*(C6q[2]-x*(C6q[3]-x*(C6q[4]-x*(C6q[5]-x*C6q[6])))));
2107     val = loglnv(-x) - sumn/sumd;
2108   } else if (x < (-2 * loglnv(LNV_EPSILON))) {
2109     /* Convergent series.  Accurate but slow especially with large x. */
2110     LNV fact_n = x;
2111     for (n = 2; n <= 200; n++) {
2112       LNV invn = LNV_ONE / n;
2113       fact_n *= (LNV)x * invn;
2114       term = fact_n * invn;
2115       KAHAN_SUM(sum, term);
2116       /* printf("C  after adding %.20Lf, val = %.20Lf\n", term, sum); */
2117       if (term < LNV_EPSILON*sum) break;
2118     }
2119     KAHAN_SUM(sum, euler_mascheroni);
2120     KAHAN_SUM(sum, loglnv(x));
2121     KAHAN_SUM(sum, x);
2122     val = sum;
2123   } else if (x >= 24) {
2124     /* Cody / Thacher rational Chebyshev */
2125     static const LNV P2[10] = {
2126         LNVCONST( 1.75338801265465972390E02),
2127         LNVCONST(-2.23127670777632409550E02),
2128         LNVCONST(-1.81949664929868906455E01),
2129         LNVCONST(-2.79798528624305389340E01),
2130         LNVCONST(-7.63147701620253630855E00),
2131         LNVCONST(-1.52856623636929636839E01),
2132         LNVCONST(-7.06810977895029358836E00),
2133         LNVCONST(-5.00006640413131002475E00),
2134         LNVCONST(-3.00000000320981265753E00),
2135         LNVCONST( 1.00000000000000485503E00) };
2136     static const LNV Q2[9] = {
2137         LNVCONST( 3.97845977167414720840E04),
2138         LNVCONST( 3.97277109100414518365E00),
2139         LNVCONST( 1.37790390235747998793E02),
2140         LNVCONST( 1.17179220502086455287E02),
2141         LNVCONST( 7.04831847180424675988E01),
2142         LNVCONST(-1.20187763547154743238E01),
2143         LNVCONST(-7.99243595776339741065E00),
2144         LNVCONST(-2.99999894040324959612E00),
2145         LNVCONST( 1.99999999999048104167E00) };
2146     LNV invx = LNV_ONE / x, frac = 0.0;
2147     for (n = 0; n <= 8; n++)
2148       frac = Q2[n] / (P2[n] + x + frac);
2149     frac += P2[9];
2150     val = explnv(x) * (invx + invx*invx*frac);
2151   } else {
2152     /* Asymptotic divergent series */
2153     LNV invx = LNV_ONE / x;
2154     term = 1.0;
2155     for (n = 1; n <= 200; n++) {
2156       LNV last_term = term;
2157       term = term * ( (LNV)n * invx );
2158       if (term < LNV_EPSILON*sum) break;
2159       if (term < last_term) {
2160         KAHAN_SUM(sum, term);
2161         /* printf("A  after adding %.20llf, sum = %.20llf\n", term, sum); */
2162       } else {
2163         KAHAN_SUM(sum, (-last_term/3) );
2164         /* printf("A  after adding %.20llf, sum = %.20llf\n", -last_term/3, sum); */
2165         break;
2166       }
2167     }
2168     KAHAN_SUM(sum, LNV_ONE);
2169     val = explnv(x) * sum * invx;
2170   }
2171 
2172   return val;
2173 }
2174 
Li(NV x)2175 NV Li(NV x) {
2176   if (x == 0) return 0;
2177   if (x == 1) return -INFINITY;
2178   if (x == 2) return li2;
2179   if (x < 0) croak("Invalid input to LogarithmicIntegral:  x must be >= 0");
2180   if (x >= NV_MAX) return INFINITY;
2181 
2182   /* Calculate directly using Ramanujan's series. */
2183   if (x > 1) {
2184     const LNV logx = loglnv(x);
2185     LNV sum = 0, inner_sum = 0, old_sum, factorial = 1, power2 = 1;
2186     LNV q, p = -1;
2187     int k = 0, n = 0;
2188 
2189     for (n = 1, k = 0; n < 200; n++) {
2190       factorial *= n;
2191       p *= -logx;
2192       q = factorial * power2;
2193       power2 *= 2;
2194       for (; k <= (n - 1) / 2; k++)
2195         inner_sum += LNV_ONE / (2 * k + 1);
2196       old_sum = sum;
2197       sum += (p / q) * inner_sum;
2198       if (fabslnv(sum - old_sum) <= LNV_EPSILON) break;
2199     }
2200     return euler_mascheroni + loglnv(logx) + sqrtlnv(x) * sum;
2201   }
2202 
2203   return Ei(loglnv(x));
2204 }
2205 
ld_inverse_li(long double lx)2206 static long double ld_inverse_li(long double lx) {
2207   int i;
2208   long double t, term, old_term = 0;
2209   /* Iterate Halley's method until error grows. */
2210   t = (lx <= 2)  ?  2  :  lx * logl(lx);
2211   for (i = 0; i < 4; i++) {
2212     long double dn = Li(t) - lx;
2213     term = dn*logl(t) / (1.0L + dn/(2*t));
2214     if (i > 0 && fabsl(term) >= fabsl(old_term)) { t -= term/4; break; }
2215     old_term = term;
2216     t -= term;
2217   }
2218   return t;
2219 }
2220 
inverse_li(UV x)2221 UV inverse_li(UV x) {
2222   UV r, i;
2223   long double lx = (long double) x;
2224 
2225   if (x <= 2) return x + (x > 0);
2226   r = (UV) ceill( ld_inverse_li(lx) );
2227   /* Meet our more stringent goal of an exact answer. */
2228   i = (x > 4e16) ? 2048 : 128;
2229   if (Li(r-1) >= lx) {
2230     while (Li(r-i) >= lx) r -= i;
2231     for (i = i/2; i > 0; i /= 2)
2232       if (Li(r-i) >= lx) r -= i;
2233   } else {
2234     while (Li(r+i-1) < lx) r += i;
2235     for (i = i/2; i > 0; i /= 2)
2236       if (Li(r+i-1) < lx) r += i;
2237   }
2238   return r;
2239 }
2240 
ld_inverse_R(long double lx)2241 static long double ld_inverse_R(long double lx) {
2242   int i;
2243   long double t, dn, term, old_term = 0;
2244 
2245   /* Rough estimate */
2246   if (lx <= 3.5) {
2247     t = lx + 2.24*(lx-1)/2;
2248   } else {
2249     t = lx * logl(lx);
2250     if      (lx <   50) { t *= 1.2; }
2251     else if (lx < 1000) { t *= 1.15; }
2252     else {   /* use inverse Li (one iteration) for first inverse R approx */
2253       dn = Li(t) - lx;
2254       term = dn * logl(t) / (1.0L + dn/(2*t));
2255       t -= term;
2256     }
2257   }
2258   /* Iterate 1-n rounds of Halley, usually only 3 needed. */
2259   for (i = 0; i < 100; i++) {
2260     dn = RiemannR(t) - lx;
2261 #if 1  /* Use f(t) = li(t) for derivatives */
2262     term = dn * logl(t) / (1.0L + dn/(2*t));
2263 #else  /* Use f(t) = li(t) - li(sqrt(t))/2 for derivatives */
2264     long double logt = logl(t);
2265     long double sqrtt = sqrtl(t);
2266     long double FA = 2 * sqrtt * logt;
2267     long double FB = 2 * sqrtt - 1;
2268     long double ifz = FA / FB;
2269     long double iffz = (logt - 2*FB) / (2 * sqrtt * FA * FA * FA * FA);
2270     term = dn * ifz * (1.0L - dn * iffz);
2271 #endif
2272     if (i > 0 && fabsl(term) >= fabsl(old_term)) { t -= term/4; break; }
2273     old_term = term;
2274     t -= term;
2275   }
2276   return t;
2277 }
2278 
inverse_R(UV x)2279 UV inverse_R(UV x) {
2280   if (x < 2) return x + (x > 0);
2281   return (UV) ceill( ld_inverse_R( (long double) x) );
2282 }
2283 
2284 
2285 /*
2286  * Storing the first 10-20 Zeta values makes sense.  Past that it is purely
2287  * to avoid making the call to generate them ourselves.  We could cache the
2288  * calculated values. These all have 1 subtracted from them.  */
2289 static const long double riemann_zeta_table[] = {
2290   0.6449340668482264364724151666460251892L,  /* zeta(2) */
2291   0.2020569031595942853997381615114499908L,
2292   0.0823232337111381915160036965411679028L,
2293   0.0369277551433699263313654864570341681L,
2294   0.0173430619844491397145179297909205279L,
2295   0.0083492773819228268397975498497967596L,
2296   0.0040773561979443393786852385086524653L,
2297   0.0020083928260822144178527692324120605L,
2298   0.0009945751278180853371459589003190170L,
2299   0.0004941886041194645587022825264699365L,
2300   0.0002460865533080482986379980477396710L,
2301   0.0001227133475784891467518365263573957L,
2302   0.0000612481350587048292585451051353337L,
2303   0.0000305882363070204935517285106450626L,
2304   0.0000152822594086518717325714876367220L,
2305   0.0000076371976378997622736002935630292L,  /* zeta(17)  Past here all we're */
2306   0.0000038172932649998398564616446219397L,  /* zeta(18)  getting is speed.   */
2307   0.0000019082127165539389256569577951013L,
2308   0.0000009539620338727961131520386834493L,
2309   0.0000004769329867878064631167196043730L,
2310   0.0000002384505027277329900036481867530L,
2311   0.0000001192199259653110730677887188823L,
2312   0.0000000596081890512594796124402079358L,
2313   0.0000000298035035146522801860637050694L,
2314   0.0000000149015548283650412346585066307L,
2315   0.0000000074507117898354294919810041706L,
2316   0.0000000037253340247884570548192040184L,
2317   0.0000000018626597235130490064039099454L,
2318   0.0000000009313274324196681828717647350L,
2319   0.0000000004656629065033784072989233251L,
2320   0.0000000002328311833676505492001455976L,
2321   0.0000000001164155017270051977592973835L,
2322   0.0000000000582077208790270088924368599L,
2323   0.0000000000291038504449709968692942523L,
2324   0.0000000000145519218910419842359296322L,
2325   0.0000000000072759598350574810145208690L,
2326   0.0000000000036379795473786511902372363L,
2327   0.0000000000018189896503070659475848321L,
2328   0.0000000000009094947840263889282533118L,
2329   0.0000000000004547473783042154026799112L,
2330   0.0000000000002273736845824652515226821L,
2331   0.0000000000001136868407680227849349105L,
2332   0.0000000000000568434198762758560927718L,
2333   0.0000000000000284217097688930185545507L,
2334   0.0000000000000142108548280316067698343L,
2335   0.00000000000000710542739521085271287735L,
2336   0.00000000000000355271369133711367329847L,
2337   0.00000000000000177635684357912032747335L,
2338   0.000000000000000888178421093081590309609L,
2339   0.000000000000000444089210314381336419777L,
2340   0.000000000000000222044605079804198399932L,
2341   0.000000000000000111022302514106613372055L,
2342   0.0000000000000000555111512484548124372374L,
2343   0.0000000000000000277555756213612417258163L,
2344   0.0000000000000000138777878097252327628391L,
2345 };
2346 #define NPRECALC_ZETA (sizeof(riemann_zeta_table)/sizeof(riemann_zeta_table[0]))
2347 
2348 /* Riemann Zeta on the real line, with 1 subtracted.
2349  * Compare to Math::Cephes zetac.  Also zeta with q=1 and subtracting 1.
2350  *
2351  * The Cephes zeta function uses a series (2k)!/B_2k which converges rapidly
2352  * and has a very wide range of values.  We use it here for some values.
2353  *
2354  * Note: Calculations here are done on long doubles and we try to generate as
2355  *       much accuracy as possible.  They will get returned to Perl as an NV,
2356  *       which is typically a 64-bit double with 15 digits.
2357  *
2358  * For values 0.5 to 5, this code uses the rational Chebyshev approximation
2359  * from Cody and Thacher.  This method is extraordinarily fast and very
2360  * accurate over its range (slightly better than Cephes for most values).  If
2361  * we had quad floats, we could use the 9-term polynomial.
2362  */
ld_riemann_zeta(long double x)2363 long double ld_riemann_zeta(long double x) {
2364   int i;
2365 
2366   if (x < 0)  croak("Invalid input to RiemannZeta:  x must be >= 0");
2367   if (x == 1) return INFINITY;
2368 
2369   if (x == (unsigned int)x) {
2370     int k = x - 2;
2371     if ((k >= 0) && (k < (int)NPRECALC_ZETA))
2372       return riemann_zeta_table[k];
2373   }
2374 
2375   /* Cody / Thacher rational Chebyshev approximation for small values */
2376   if (x >= 0.5 && x <= 5.0) {
2377     static const long double C8p[9] = { 1.287168121482446392809e10L,
2378                                         1.375396932037025111825e10L,
2379                                         5.106655918364406103683e09L,
2380                                         8.561471002433314862469e08L,
2381                                         7.483618124380232984824e07L,
2382                                         4.860106585461882511535e06L,
2383                                         2.739574990221406087728e05L,
2384                                         4.631710843183427123061e03L,
2385                                         5.787581004096660659109e01L };
2386     static const long double C8q[9] = { 2.574336242964846244667e10L,
2387                                         5.938165648679590160003e09L,
2388                                         9.006330373261233439089e08L,
2389                                         8.042536634283289888587e07L,
2390                                         5.609711759541920062814e06L,
2391                                         2.247431202899137523543e05L,
2392                                         7.574578909341537560115e03L,
2393                                        -2.373835781373772623086e01L,
2394                                         1.000000000000000000000L    };
2395     long double sumn = C8p[0]+x*(C8p[1]+x*(C8p[2]+x*(C8p[3]+x*(C8p[4]+x*(C8p[5]+x*(C8p[6]+x*(C8p[7]+x*C8p[8])))))));
2396     long double sumd = C8q[0]+x*(C8q[1]+x*(C8q[2]+x*(C8q[3]+x*(C8q[4]+x*(C8q[5]+x*(C8q[6]+x*(C8q[7]+x*C8q[8])))))));
2397     long double sum = (sumn - (x-1)*sumd) / ((x-1)*sumd);
2398     return sum;
2399   }
2400 
2401   if (x > 17000.0)
2402     return 0.0;
2403 
2404 #if 0
2405   {
2406     KAHAN_INIT(sum);
2407     /* Simple defining series, works well. */
2408     for (i = 5; i <= 1000000; i++) {
2409       long double term = powl(i, -x);
2410       KAHAN_SUM(sum, term);
2411       if (term < LDBL_EPSILON*sum) break;
2412     }
2413     KAHAN_SUM(sum, powl(4, -x) );
2414     KAHAN_SUM(sum, powl(3, -x) );
2415     KAHAN_SUM(sum, powl(2, -x) );
2416     return sum;
2417   }
2418 #endif
2419 
2420   /* The 2n!/B_2k series used by the Cephes library. */
2421   {
2422     /* gp/pari:
2423      *   for(i=1,13,printf("%.38g\n",(2*i)!/bernreal(2*i)))
2424      * MPU:
2425      *   use bignum;
2426      *   say +(factorial(2*$_)/bernreal(2*$_))->bround(38) for 1..13;
2427      */
2428     static const long double A[] = {
2429       12.0L,
2430      -720.0L,
2431       30240.0L,
2432      -1209600.0L,
2433       47900160.0L,
2434      -1892437580.3183791606367583212735166425L,
2435       74724249600.0L,
2436      -2950130727918.1642244954382084600497650L,
2437       116467828143500.67248729113000661089201L,
2438      -4597978722407472.6105457273596737891656L,
2439       181521054019435467.73425331153534235290L,
2440      -7166165256175667011.3346447367083352775L,
2441       282908877253042996618.18640556532523927L,
2442     };
2443     long double a, b, s, t;
2444     const long double w = 10.0;
2445     s = 0.0;
2446     b = 0.0;
2447     for (i = 2; i < 11; i++) {
2448       b = powl( i, -x );
2449       s += b;
2450       if (fabsl(b) < fabsl(LDBL_EPSILON * s))
2451         return s;
2452     }
2453     s = s + b*w/(x-1.0) - 0.5 * b;
2454     a = 1.0;
2455     for (i = 0; i < 13; i++) {
2456       long double k = 2*i;
2457       a *= x + k;
2458       b /= w;
2459       t = a*b/A[i];
2460       s = s + t;
2461       if (fabsl(t) < fabsl(LDBL_EPSILON * s))
2462         break;
2463       a *= x + k + 1.0;
2464       b /= w;
2465     }
2466     return s;
2467   }
2468 }
2469 
RiemannR(long double x)2470 long double RiemannR(long double x) {
2471   long double part_term, term, flogx, ki, old_sum;
2472   unsigned int k;
2473   KAHAN_INIT(sum);
2474 
2475   if (x <= 0) croak("Invalid input to RiemannR:  x must be > 0");
2476 
2477   if (x > 1e19) {
2478     const signed char* amob = range_moebius(0, 100);
2479     KAHAN_SUM(sum, Li(x));
2480     for (k = 2; k <= 100; k++) {
2481       if (amob[k] == 0) continue;
2482       ki = 1.0L / (long double) k;
2483       part_term = powl(x,ki);
2484       if (part_term > LDBL_MAX) return INFINITY;
2485       term = amob[k] * ki * Li(part_term);
2486       old_sum = sum;
2487       KAHAN_SUM(sum, term);
2488       if (fabsl(sum - old_sum) <= LDBL_EPSILON) break;
2489     }
2490     Safefree(amob);
2491     return sum;
2492   }
2493 
2494   KAHAN_SUM(sum, 1.0);
2495   flogx = logl(x);
2496   part_term = 1;
2497 
2498   for (k = 1; k <= 10000; k++) {
2499     ki = (k-1 < NPRECALC_ZETA) ? riemann_zeta_table[k-1] : ld_riemann_zeta(k+1);
2500     part_term *= flogx / k;
2501     term = part_term / (k + k * ki);
2502     old_sum = sum;
2503     KAHAN_SUM(sum, term);
2504     /* printf("R %5d after adding %.18Lg, sum = %.19Lg (%Lg)\n", k, term, sum, fabsl(sum-old_sum)); */
2505     if (fabsl(sum - old_sum) <= LDBL_EPSILON) break;
2506   }
2507 
2508   return sum;
2509 }
2510 
_lambertw_approx(long double x)2511 static long double _lambertw_approx(long double x) {
2512   /* See Veberic 2009 for other approximations */
2513   if (x < -0.060) {  /* Pade(3,2) */
2514     long double ti = 5.4365636569180904707205749L * x + 2.0L;
2515     long double t  = (ti <= 0.0L) ? 0.0L : sqrtl(ti);
2516     long double t2 = t*t;
2517     long double t3 = t*t2;
2518     return (-1.0L + (1.0L/6.0L)*t + (257.0L/720.0L)*t2 + (13.0L/720.0L)*t3) / (1.0L + (5.0L/6.0L)*t + (103.0L/720.0L)*t2);
2519   } else if (x < 1.363) {  /* Winitzki 2003 section 3.5 */
2520     long double l1 = logl(1.0L+x);
2521     return l1 * (1.0L - logl(1.0L+l1) / (2.0L+l1));
2522   } else if (x < 3.7) {    /* Modification of Vargas 2013 */
2523     long double l1 = logl(x);
2524     long double l2 = logl(l1);
2525     return l1 - l2 - logl(1.0L - l2/l1)/2.0L;
2526   } else {                 /* Corless et al. 1993, page 22 */
2527     long double l1 = logl(x);
2528     long double l2 = logl(l1);
2529     long double d1 = 2.0L*l1*l1;
2530     long double d2 = 3.0L*l1*d1;
2531     long double d3 = 2.0L*l1*d2;
2532     long double d4 = 5.0L*l1*d3;
2533     long double w = l1 - l2 + l2/l1 + l2*(l2-2.0L)/d1;
2534     w += l2*(6.0L+l2*(-9.0L+2.0L*l2))/d2;
2535     w += l2*(-12.0L+l2*(36.0L+l2*(-22.0L+3.0L*l2)))/d3;
2536     w += l2*(60.0L+l2*(-300.0L+l2*(350.0L+l2*(-125.0L+12.0L*l2))))/d4;
2537     return w;
2538   }
2539 }
2540 
lambertw(NV x)2541 NV lambertw(NV x) {
2542   long double w;
2543   int i;
2544 
2545   if (x < -0.36787944117145L)
2546     croak("Invalid input to LambertW:  x must be >= -1/e");
2547   if (x == 0.0L) return 0.0L;
2548 
2549   /* Estimate initial value */
2550   w = _lambertw_approx(x);
2551   /* If input is too small, return .99999.... */
2552   if (w <= -1.0L) return -1.0L + 8*LDBL_EPSILON;
2553   /* For very small inputs, don't iterate, return approx directly. */
2554   if (x < -0.36783) return w;
2555 
2556 #if 0  /* Halley */
2557   lastw = w;
2558   for (i = 0; i < 100; i++) {
2559     long double ew = expl(w);
2560     long double wew = w * ew;
2561     long double wewx = wew - x;
2562     long double w1 = w + 1;
2563     w = w - wewx / (ew * w1 - (w+2) * wewx/(2*w1));
2564     if (w != 0.0L && fabsl((w-lastw)/w) <= 8*LDBL_EPSILON) break;
2565     lastw = w;
2566   }
2567 #else  /* Fritsch, see Veberic 2009.  1-2 iterations are enough. */
2568   for (i = 0; i < 6 && w != 0.0L; i++) {
2569     long double w1 = 1 + w;
2570     long double zn = logl((long double)x/w) - w;
2571     long double qn = 2 * w1 * (w1+(2.0L/3.0L)*zn);
2572     long double en = (zn/w1) * (qn-zn)/(qn-2.0L*zn);
2573     /* w *= 1.0L + en;  if (fabsl(en) <= 16*LDBL_EPSILON) break; */
2574     long double wen = w * en;
2575     w += wen;
2576     if (fabsl(wen) <= 64*LDBL_EPSILON) break;
2577   }
2578 #endif
2579 #if LNV_IS_QUAD /* For quadmath, one high precision correction */
2580   if (w != LNV_ZERO) {
2581     LNV lw = w;
2582     LNV w1 = LNV_ONE + lw;
2583     LNV zn = loglnv((LNV)x/lw) - lw;
2584     LNV qn = LNVCONST(2.0) * w1 * (w1+(LNVCONST(2.0)/LNVCONST(3.0))*zn);
2585     LNV en = (zn/w1) * (qn-zn)/(qn-LNVCONST(2.0)*zn);
2586     return lw + lw * en;
2587   }
2588 #endif
2589 
2590   return w;
2591 }
2592 
2593 #if HAVE_STD_U64
2594   #define U64T uint64_t
2595 #else
2596   #define U64T UV
2597 #endif
2598 
2599 /* Spigot from Arndt, Haenel, Winter, and Flammenkamp. */
2600 /* Modified for larger digits and rounding by Dana Jacobsen */
pidigits(int digits)2601 char* pidigits(int digits)
2602 {
2603   char* out;
2604   uint32_t *a, b, c, d, e, g, i, d4, d3, d2, d1;
2605   uint32_t const f = 10000;
2606   U64T d64;  /* 64-bit intermediate for 2*2*10000*b > 2^32 (~30k digits) */
2607 
2608   if (digits <= 0) return 0;
2609   if (digits <= DBL_DIG && digits <= 18) {
2610     Newz(0, out, 19, char);
2611     (void)sprintf(out, "%.*lf", (digits-1), 3.141592653589793238);
2612     return out;
2613   }
2614   digits++;   /* For rounding */
2615   c = 14*(digits/4 + 2);
2616   New(0, out, digits+5+1, char);
2617   *out++ = '3';  /* We'll turn "31415..." into "3.1415..." */
2618   New(0, a, c, uint32_t);
2619   for (b = 0; b < c; b++)  a[b] = 2000;
2620 
2621   d = i = 0;
2622   while ((b = c -= 14) > 0 && i < (uint32_t)digits) {
2623     d = e = d % f;
2624     if (b > 107000) {  /* Use 64-bit intermediate while necessary. */
2625       for (d64 = d; --b > 107000; ) {
2626         g = (b << 1) - 1;
2627         d64 = d64 * b  +  f * (U64T)a[b];
2628         a[b] = d64 % g;
2629         d64 /= g;
2630       }
2631       d = d64;
2632       b++;
2633     }
2634     while (--b > 0) {
2635       g = (b << 1) - 1;
2636       d = d * b  +  f * a[b];
2637       a[b] = d % g;
2638       d /= g;
2639     }
2640     /* sprintf(out+i, "%04d", e+d/f);   i += 4; */
2641     d4 = e + d/f;
2642     if (d4 > 9999) {
2643       d4 -= 10000;
2644       out[i-1]++;
2645       for (b=i-1; out[b] == '0'+1; b--) { out[b]='0'; out[b-1]++; }
2646     }
2647     d3 = d4/10;  d2 = d3/10;  d1 = d2/10;
2648     out[i++] = '0' + d1;
2649     out[i++] = '0' + d2-d1*10;
2650     out[i++] = '0' + d3-d2*10;
2651     out[i++] = '0' + d4-d3*10;
2652   }
2653   Safefree(a);
2654   if (out[digits-1] >= '5') out[digits-2]++;  /* Round */
2655   for (i = digits-2; out[i] == '9'+1; i--)    /* Keep rounding */
2656     { out[i] = '0';  out[i-1]++; }
2657   digits--;  /* Undo the extra digit we used for rounding */
2658   out[digits] = '\0';
2659   *out-- = '.';
2660   return out;
2661 }
2662 
2663 /* 1. Perform signed integer validation on b/blen.
2664  * 2. Compare to a/alen using min or max based on first arg.
2665  * 3. Return 0 to select a, 1 to select b.
2666  */
strnum_minmax(int min,char * a,STRLEN alen,char * b,STRLEN blen)2667 int strnum_minmax(int min, char* a, STRLEN alen, char* b, STRLEN blen)
2668 {
2669   int aneg, bneg;
2670   STRLEN i;
2671   /* a is checked, process b */
2672   if (b == 0 || blen == 0) croak("Parameter must be a positive integer");
2673   bneg = (b[0] == '-');
2674   if (b[0] == '-' || b[0] == '+') { b++; blen--; }
2675   while (blen > 0 && *b == '0') { b++; blen--; }
2676   for (i = 0; i < blen; i++)
2677     if (!isDIGIT(b[i]))
2678       break;
2679   if (blen == 0 || i < blen)
2680     croak("Parameter must be a positive integer");
2681 
2682   if (a == 0) return 1;
2683 
2684   aneg = (a[0] == '-');
2685   if (a[0] == '-' || a[0] == '+') { a++; alen--; }
2686   while (alen > 0 && *a == '0') { a++; alen--; }
2687 
2688   if (aneg != bneg)  return  min  ?  (bneg == 1)  :  (aneg == 1);
2689   if (aneg == 1)  min = !min;
2690   if (alen != blen)  return  min  ?  (alen > blen) :  (blen > alen);
2691 
2692   for (i = 0; i < blen; i++)
2693     if (a[i] != b[i])
2694       return  min  ?  (a[i] > b[i])  :  (b[i] > a[i]);
2695   return 0; /* equal */
2696 }
2697 
from_digit_string(UV * rn,const char * s,int base)2698 int from_digit_string(UV* rn, const char* s, int base)
2699 {
2700   UV max, n = 0;
2701   int i, len;
2702 
2703   /* Skip leading -/+ and zeros */
2704   if (s[0] == '-' || s[0] == '+') s++;
2705   while (s[0] == '0') s++;
2706 
2707   len = strlen(s);
2708   max = (UV_MAX-base+1)/base;
2709 
2710   for (i = 0, len = strlen(s); i < len; i++) {
2711     const char c = s[i];
2712     int d = !isalnum(c) ? 255 : (c <= '9') ? c-'0' : (c <= 'Z') ? c-'A'+10 : c-'a'+10;
2713     if (d >= base) croak("Invalid digit for base %d", base);
2714     if (n > max) return 0;   /* Overflow */
2715     n = n * base + d;
2716   }
2717   *rn = n;
2718   return 1;
2719 }
2720 
from_digit_to_UV(UV * rn,UV * r,int len,int base)2721 int from_digit_to_UV(UV* rn, UV* r, int len, int base)
2722 {
2723   UV d, n = 0;
2724   int i;
2725   if (len < 0 || len > BITS_PER_WORD)
2726     return 0;
2727   for (i = 0; i < len; i++) {
2728     d = r[i];
2729     if (n > (UV_MAX-d)/base) break;  /* overflow */
2730     n = n * base + d;
2731   }
2732   *rn = n;
2733   return (i >= len);
2734 }
2735 
2736 
from_digit_to_str(char ** rstr,UV * r,int len,int base)2737 int from_digit_to_str(char** rstr, UV* r, int len, int base)
2738 {
2739   char *so, *s;
2740   int i;
2741 
2742   if (len < 0 || !(base == 2 || base == 10 || base == 16)) return 0;
2743 
2744   if (r[0] >= (UV) base) return 0;  /* TODO: We don't apply extended carry */
2745 
2746   New(0, so, len + 3, char);
2747   s = so;
2748   if (base == 2 || base == 16) {
2749     *s++ = '0';
2750     *s++ = (base == 2) ? 'b' : 'x';
2751   }
2752   for (i = 0; i < len; i++) {
2753     UV d = r[i];
2754     s[i] = (d < 10) ? '0'+d : 'a'+d-10;
2755   }
2756   s[len] = '\0';
2757   *rstr = so;
2758   return 1;
2759 }
2760 
to_digit_array(int * bits,UV n,int base,int length)2761 int to_digit_array(int* bits, UV n, int base, int length)
2762 {
2763   int d;
2764 
2765   if (base < 2 || length > 128) return -1;
2766 
2767   if (base == 2) {
2768     for (d = 0; n; n >>= 1)
2769       bits[d++] = n & 1;
2770   } else {
2771     for (d = 0; n; n /= base)
2772       bits[d++] = n % base;
2773   }
2774   if (length < 0) length = d;
2775   while (d < length)
2776     bits[d++] = 0;
2777   return length;
2778 }
2779 
to_digit_string(char * s,UV n,int base,int length)2780 int to_digit_string(char* s, UV n, int base, int length)
2781 {
2782   int digits[128];
2783   int i, len = to_digit_array(digits, n, base, length);
2784 
2785   if (len < 0) return -1;
2786   if (base > 36) croak("invalid base for string: %d", base);
2787 
2788   for (i = 0; i < len; i++) {
2789     int dig = digits[len-i-1];
2790     s[i] = (dig < 10) ? '0'+dig : 'a'+dig-10;
2791   }
2792   s[len] = '\0';
2793   return len;
2794 }
2795 
to_string_128(char str[40],IV hi,UV lo)2796 int to_string_128(char str[40], IV hi, UV lo)
2797 {
2798   int i, slen = 0, isneg = 0;
2799 
2800   if (hi < 0) {
2801     isneg = 1;
2802     hi = -(hi+1);
2803     lo = UV_MAX - lo + 1;
2804   }
2805 #if BITS_PER_WORD == 64 && HAVE_UINT128
2806   {
2807     uint128_t dd, sum = (((uint128_t) hi) << 64) + lo;
2808     do {
2809       dd = sum / 10;
2810       str[slen++] = '0' + (sum - dd*10);
2811       sum = dd;
2812     } while (sum);
2813   }
2814 #else
2815   {
2816     UV d, r;
2817     uint32_t a[4];
2818     a[0] = hi >> (BITS_PER_WORD/2);
2819     a[1] = hi & (UV_MAX >> (BITS_PER_WORD/2));
2820     a[2] = lo >> (BITS_PER_WORD/2);
2821     a[3] = lo & (UV_MAX >> (BITS_PER_WORD/2));
2822     do {
2823       r = a[0];
2824       d = r/10;  r = ((r-d*10) << (BITS_PER_WORD/2)) + a[1];  a[0] = d;
2825       d = r/10;  r = ((r-d*10) << (BITS_PER_WORD/2)) + a[2];  a[1] = d;
2826       d = r/10;  r = ((r-d*10) << (BITS_PER_WORD/2)) + a[3];  a[2] = d;
2827       d = r/10;  r = r-d*10;  a[3] = d;
2828       str[slen++] = '0'+(r%10);
2829     } while (a[0] || a[1] || a[2] || a[3]);
2830   }
2831 #endif
2832   /* Reverse the order */
2833   for (i=0; i < slen/2; i++) {
2834     char t=str[i];
2835     str[i]=str[slen-i-1];
2836     str[slen-i-1] = t;
2837   }
2838   /* Prepend a negative sign if needed */
2839   if (isneg) {
2840     for (i = slen; i > 0; i--)
2841       str[i] = str[i-1];
2842     str[0] = '-';
2843     slen++;
2844   }
2845   /* Add terminator */
2846   str[slen] = '\0';
2847   return slen;
2848 }
2849 
2850 /* Oddball primality test.
2851  * In this file rather than primality.c because it uses factoring (!).
2852  * Algorithm from Charles R Greathouse IV, 2015 */
_catalan_v32(uint32_t n,uint32_t p)2853 static INLINE uint32_t _catalan_v32(uint32_t n, uint32_t p) {
2854   uint32_t s = 0;
2855   while (n /= p)  s += n % 2;
2856   return s;
2857 }
_catalan_v(UV n,UV p)2858 static INLINE uint32_t _catalan_v(UV n, UV p) {
2859   uint32_t s = 0;
2860   while (n /= p)  s += n % 2;
2861   return s;
2862 }
_catalan_mult(UV m,UV p,UV n,UV a)2863 static UV _catalan_mult(UV m, UV p, UV n, UV a) {
2864   if (p > a) {
2865     m = mulmod(m, p, n);
2866   } else {
2867     UV pow = (n <= 4294967295UL) ? _catalan_v32(a<<1,p) : _catalan_v(a<<1,p);
2868     m = (pow == 0) ? m
2869       : (pow == 1) ? mulmod(m,p,n)
2870                    : mulmod(m,powmod(p,pow,n),n);
2871   }
2872   return m;
2873 }
_catalan_vtest(UV n,UV p)2874 static int _catalan_vtest(UV n, UV p) {
2875   while (n /= p)
2876     if (n % 2)
2877       return 1;
2878   return 0;
2879 }
is_catalan_pseudoprime(UV n)2880 int is_catalan_pseudoprime(UV n) {
2881   UV m, a;
2882   int i;
2883 
2884   if (n < 2 || ((n % 2) == 0 && n != 2)) return 0;
2885   if (is_prob_prime(n)) return 1;
2886 
2887   m = 1;
2888   a = n >> 1;
2889   /*
2890    * Ideally we could use some of the requirements for a mod 4/8/64 here:
2891    * http://www.combinatorics.net/conf/Z60/sp/sp/Shu-Chung%20Liu.pdf
2892    * But, how do we make +/-2 = X mod n into a solution for x = X mod 8?
2893    *
2894    * We could also exploit the exhaustive testing that shows there only
2895    * exist three below 1e10:  5907, 1194649, and 12327121.
2896    */
2897   {
2898     UV factors[MPU_MAX_FACTORS+1];
2899     int nfactors = factor_exp(n, factors, 0);
2900 #if BITS_PER_WORD == 32
2901     if (nfactors == 2) return 0;  /* Page 9, all 32-bit semiprimes */
2902 #else
2903     if (nfactors == 2) {   /* Conditions from Aebi and Cairns (2008) */
2904       if (n < UVCONST(10000000000)) return 0;     /* Page 9 */
2905       if (2*factors[0]+1 >= factors[1]) return 0; /* Corollary 2 and 3 */
2906     }
2907 #endif
2908     /* Test every factor */
2909     for (i = 0; i < nfactors; i++) {
2910       if (_catalan_vtest(a << 1, factors[i]))
2911         return 0;
2912     }
2913   }
2914   {
2915     UV seg_base, seg_low, seg_high;
2916     unsigned char* segment;
2917     void* ctx;
2918     m = _catalan_mult(m, 2, n, a);
2919     m = _catalan_mult(m, 3, n, a);
2920     m = _catalan_mult(m, 5, n, a);
2921     ctx = start_segment_primes(7, n, &segment);
2922     while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
2923       START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high ) {
2924         m = _catalan_mult(m, p, n, a);
2925       } END_DO_FOR_EACH_SIEVE_PRIME
2926     }
2927     end_segment_primes(ctx);
2928   }
2929   return (a & 1) ? (m==(n-1)) : (m==1);
2930 }
2931 
2932 /* If we have fast CTZ, use this GCD.  See Brent Alg V and FLINT Abhinav Baid */
gcdz(UV x,UV y)2933 UV gcdz(UV x, UV y) {
2934   UV f, x2, y2;
2935 
2936   if (x == 0) return y;
2937 
2938   if (y & 1) {  /* Optimize y odd */
2939     x >>= ctz(x);
2940     while (x != y) {
2941       if (x < y) { y -= x; y >>= ctz(y); }
2942       else       { x -= y; x >>= ctz(x); }
2943     }
2944     return x;
2945   }
2946 
2947   if (y == 0) return x;
2948 
2949   /* Alternately:  f = ctz(x|y); x >>= ctz(x); y >>= ctz(y); */
2950   x2 = ctz(x);
2951   y2 = ctz(y);
2952   f = (x2 <= y2) ? x2 : y2;
2953   x >>= x2;
2954   y >>= y2;
2955 
2956   while (x != y) {
2957     if (x < y) {
2958       y -= x;
2959       y >>= ctz(y);
2960     } else {
2961       x -= y;
2962       x >>= ctz(x);
2963     }
2964   }
2965   return x << f;
2966 }
2967 
2968 /* The intermediate values are so large that we can only stay in 64-bit
2969  * up to 53 or so using the divisor_sum calculations.  So just use a table.
2970  * Save space by just storing the 32-bit values. */
2971 static const int32_t tau_table[] = {
2972   0,1,-24,252,-1472,4830,-6048,-16744,84480,-113643,-115920,534612,-370944,-577738,401856,1217160,987136,-6905934,2727432,10661420,-7109760,-4219488,-12830688,18643272,21288960,-25499225,13865712,-73279080,24647168,128406630,-29211840,-52843168,-196706304,134722224,165742416,-80873520,167282496,-182213314,-255874080,-145589976,408038400,308120442,101267712,-17125708,-786948864,-548895690,-447438528
2973 };
2974 #define NTAU (sizeof(tau_table)/sizeof(tau_table[0]))
ramanujan_tau(UV n)2975 IV ramanujan_tau(UV n) {
2976   return (n < NTAU)  ?  tau_table[n]  :  0;
2977 }
2978 
_count_class_div(UV s,UV b2)2979 static UV _count_class_div(UV s, UV b2) {
2980   UV h = 0, i, ndivisors, *divs, lim;
2981 
2982   lim = isqrt(b2);
2983   if (lim*lim == b2) lim--;
2984   if (s > lim) return 0;
2985 
2986   if ((lim-s) < 70) {  /* Iterate looking for divisors */
2987     for (i = s; i <= lim; i++)
2988       if (b2 % i == 0)
2989         h++;
2990   } else {             /* Walk through all the divisors */
2991     divs = _divisor_list(b2, &ndivisors);
2992     for (i = 0; i < ndivisors && divs[i] <= lim; i++)
2993       if (divs[i] >= s)
2994         h++;
2995     Safefree(divs);
2996   }
2997   return h;
2998 }
2999 
3000 /* Returns 12 * H(n).  See Cohen 5.3.5 or Pari/GP.
3001  * Pari/GP uses a different method for n > 500000, which is quite a bit
3002  * faster, but assumes the GRH. */
hclassno(UV n)3003 IV hclassno(UV n) {
3004   UV nmod4 = n % 4, b2, b, h;
3005   int square;
3006 
3007   if (n == 0) return -1;
3008   if (nmod4 == 1 || nmod4 == 2) return 0;
3009   if (n == 3) return 4;
3010 
3011   b = n & 1;
3012   b2 = (n+1) >> 2;
3013   square = is_perfect_square(b2);
3014 
3015   h = divisor_sum(b2,0) >> 1;
3016   if (b == 1)
3017     h = 1 + square + ((h - 1) << 1);
3018   b += 2;
3019 
3020   for (;  b2 = (n + b*b) >> 2, 3*b2 < n;  b += 2) {
3021     h += (b2 % b == 0)
3022       +  is_perfect_square(b2)
3023       +  (_count_class_div(b+1, b2) << 1);
3024   }
3025   return 12*h + ((b2*3 == n) ? 4 : square && !(n&1) ? 6 : 0);
3026 }
3027 
polygonal_root(UV n,UV k,int * overflow)3028 UV polygonal_root(UV n, UV k, int* overflow) {
3029   UV D, R;
3030   MPUassert(k >= 3, "is_polygonal root < 3");
3031   *overflow = 0;
3032   if (n <= 1) return n;
3033   if (k == 4) return is_perfect_square(n) ? isqrt(n) : 0;
3034   if (k == 3) {
3035     if (n >= UV_MAX/8) *overflow = 1;
3036     D = n << 3;
3037     R = 1;
3038   } else {
3039     if (k > UV_MAX/k || n > UV_MAX/(8*k-16)) *overflow = 1;
3040     D = (8*k-16) * n;
3041     R = (k-4) * (k-4);
3042   }
3043   if (D+R <= D) *overflow = 1;
3044   D += R;
3045   if (*overflow || !is_perfect_square(D)) return 0;
3046   D = isqrt(D) + (k-4);
3047   R = 2*k - 4;
3048   if ((D % R) != 0) return 0;
3049   return D/R;
3050 }
3051 
3052 /* These rank/unrank are O(n^2) algorithms using O(n) in-place space.
3053  * Bonet 2008 gives O(n log n) algorithms using a bit more space.
3054  */
3055 
num_to_perm(UV k,int n,int * vec)3056 int num_to_perm(UV k, int n, int *vec) {
3057   int i, j, t, si = 0;
3058   UV f = factorial(n-1);
3059 
3060   while (f == 0) /* We can handle n! overflow if we have a valid k */
3061     f = factorial(n - 1 - ++si);
3062 
3063   if (k/f >= (UV)n)
3064     k %= f*n;
3065 
3066   for (i = 0; i < n; i++)
3067     vec[i] = i;
3068   for (i = si; i < n-1; i++) {
3069     UV p = k/f;
3070     k -= p*f;
3071     f /= n-i-1;
3072     if (p > 0) {
3073       for (j = i+p, t = vec[j]; j > i; j--)
3074         vec[j] = vec[j-1];
3075       vec[i] = t;
3076     }
3077   }
3078   return 1;
3079 }
3080 
perm_to_num(int n,int * vec,UV * rank)3081 int perm_to_num(int n, int *vec, UV *rank) {
3082   int i, j, k;
3083   UV f, num = 0;
3084   f = factorial(n-1);
3085   if (f == 0) return 0;
3086   for (i = 0; i < n-1; i++) {
3087     for (j = i+1, k = 0; j < n; j++)
3088       if (vec[j] < vec[i])
3089         k++;
3090     if ((UV)k > (UV_MAX-num)/f) return 0;  /* overflow */
3091     num += k*f;
3092     f /= n-i-1;
3093   }
3094   *rank = num;
3095   return 1;
3096 }
3097 
3098 /*
3099  * For k<n, an O(k) time and space method is shown on page 39 of
3100  *    https://www.math.upenn.edu/~wilf/website/CombinatorialAlgorithms.pdf
3101  * Note it requires an O(k) complete shuffle as the results are sorted.
3102  *
3103  * This seems to be 4-100x faster than NumPy's random.{permutation,choice}
3104  * for n under 100k or so.  It's even faster with larger n.  For example
3105  *   from numpy.random import choice;  choice(100000000, 4, replace=False)
3106  * uses 774MB and takes 55 seconds.  We take less than 1 microsecond.
3107  */
randperm(void * ctx,UV n,UV k,UV * S)3108 void randperm(void* ctx, UV n, UV k, UV *S) {
3109   UV i, j;
3110 
3111   if (k > n)  k = n;
3112 
3113   if        (k == 0) {                  /* 0 of n */
3114   } else if (k == 1) {                  /* 1 of n.  Pick one at random */
3115     S[0] = urandomm64(ctx,n);
3116   } else if (k == 2 && n == 2) {        /* 2 of 2.  Flip a coin */
3117     S[0] = urandomb(ctx,1);
3118     S[1] = 1-S[0];
3119   } else if (k == 2) {                  /* 2 of n.  Pick 2 skipping dup */
3120     S[0] = urandomm64(ctx,n);
3121     S[1] = urandomm64(ctx,n-1);
3122     if (S[1] >= S[0]) S[1]++;
3123   } else if (k < n/100 && k < 30) {     /* k of n.  Pick k with loop */
3124     for (i = 0; i < k; i++) {
3125       do {
3126         S[i] = urandomm64(ctx,n);
3127         for (j = 0; j < i; j++)
3128           if (S[j] == S[i])
3129             break;
3130       } while (j < i);
3131     }
3132   } else if (k < n/100 && n > 1000000) {/* k of n.  Pick k with dedup retry */
3133     for (j = 0; j < k; ) {
3134       for (i = j; i < k; i++) /* Fill S[j .. k-1] then sort S */
3135         S[i] = urandomm64(ctx,n);
3136       qsort(S, k, sizeof(UV), _numcmp);
3137       for (j = 0, i = 1; i < k; i++)  /* Find and remove dups.  O(n). */
3138         if (S[j] != S[i])
3139           S[++j] = S[i];
3140       j++;
3141     }
3142     /* S is sorted unique k-selection of 0 to n-1.  Shuffle. */
3143     for (i = 0; i < k; i++) {
3144       j = urandomm64(ctx,k-i);
3145       { UV t = S[i]; S[i] = S[i+j]; S[i+j] = t; }
3146     }
3147   } else if (k < n/4) {                 /* k of n.  Pick k with mask */
3148     uint32_t *mask, smask[8] = {0};
3149     if (n <= 32*8) mask = smask;
3150     else           Newz(0, mask, n/32 + ((n%32)?1:0), uint32_t);
3151     for (i = 0; i < k; i++) {
3152       do {
3153         j = urandomm64(ctx,n);
3154       } while ( mask[j>>5] & (1U << (j&0x1F)) );
3155       S[i] = j;
3156       mask[j>>5] |= (1U << (j&0x1F));
3157     }
3158     if (mask != smask) Safefree(mask);
3159   } else if (k < n) {                   /* k of n.  FYK shuffle n, pick k */
3160     UV *T;
3161     New(0, T, n, UV);
3162     for (i = 0; i < n; i++)
3163       T[i] = i;
3164     for (i = 0; i < k && i <= n-2; i++) {
3165       j = urandomm64(ctx,n-i);
3166       S[i] = T[i+j];
3167       T[i+j] = T[i];
3168     }
3169     Safefree(T);
3170   } else {                              /* n of n.  FYK shuffle. */
3171     for (i = 0; i < n; i++)
3172       S[i] = i;
3173     for (i = 0; i < k && i <= n-2; i++) {
3174       j = urandomm64(ctx,n-i);
3175       { UV t = S[i]; S[i] = S[i+j]; S[i+j] = t; }
3176     }
3177   }
3178 }
3179 
random_factored_integer(void * ctx,UV n,int * nf,UV * factors)3180 UV random_factored_integer(void* ctx, UV n, int *nf, UV *factors) {
3181   UV r, s, nfac;
3182   if (n < 1)
3183     return 0;
3184 #if BITS_PER_WORD == 64 && (USE_MONTMATH || MULMODS_ARE_FAST)
3185   if (1)   /* Our factoring is very fast, just use it */
3186 #elif BITS_PER_WORD == 64
3187   if (n < UVCONST(1000000000000))
3188 #endif
3189   {
3190     r = 1 + urandomm64(ctx, n);
3191     *nf = factor(r, factors);
3192     return r;
3193   }
3194   do {  /* Kalai's algorithm */
3195     for (s = n, r = 1, nfac = 0;  s > 1;  ) {
3196       s = 1 + urandomm64(ctx, s);
3197       if (!is_prime(s)) continue;
3198       if (s > n / r) { r = 0; break; } /* overflow */
3199       r *= s;
3200       factors[nfac++] = s;
3201     }
3202   } while (r == 0 || r > n || (1 + urandomm64(ctx,n)) > r);
3203   *nf = nfac;
3204   return r;
3205 }
3206