1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <unistd.h>
5 #include <float.h>
6
7 /* Use long double to get a little more precision when we're calculating the
8 * math functions -- especially those calculated with a series. Long double
9 * is defined in C89 (ISO C). Note that 'long double' on many platforms is
10 * identical to 'double so it may buy us nothing. But it's worth trying.
11 *
12 * While the type was in C89, math functions using it are in C99. Some
13 * systems didn't really get it right (e.g. NetBSD which left out some
14 * functions for 13 years).
15 */
16 #include <math.h>
17 #if _MSC_VER || defined(__IBMC__) | defined(__IBMCPP__) || (defined(__STDC_VERSION__) && __STDC_VERSION >= 199901L)
18 /* math.h should give us these as functions or macros.
19 *
20 * extern long double fabsl(long double);
21 * extern long double floorl(long double);
22 * extern long double ceill(long double);
23 * extern long double sqrtl(long double);
24 * extern long double powl(long double, long double);
25 * extern long double expl(long double);
26 * extern long double logl(long double);
27 */
28 #else
29 #define fabsl(x) (long double) fabs( (double) (x) )
30 #define floorl(x) (long double) floor( (double) (x) )
31 #define ceill(x) (long double) ceil( (double) (x) )
32 #define sqrtl(x) (long double) sqrt( (double) (x) )
33 #define powl(x, y) (long double) pow( (double) (x), (double) (y) )
34 #define expl(x) (long double) exp( (double) (x) )
35 #define logl(x) (long double) log( (double) (x) )
36 #endif
37
38 #ifdef LDBL_INFINITY
39 #undef INFINITY
40 #define INFINITY LDBL_INFINITY
41 #elif !defined(INFINITY)
42 #define INFINITY (DBL_MAX + DBL_MAX)
43 #endif
44 #ifndef LDBL_EPSILON
45 #define LDBL_EPSILON 1e-16
46 #endif
47 #ifndef LDBL_MAX
48 #define LDBL_MAX DBL_MAX
49 #endif
50
51 #include "ptypes.h"
52 #define FUNC_isqrt 1
53 #define FUNC_icbrt 1
54 #define FUNC_lcm_ui 1
55 #define FUNC_ctz 1
56 #define FUNC_log2floor 1
57 #define FUNC_is_perfect_square
58 #define FUNC_is_perfect_cube
59 #define FUNC_is_perfect_fifth
60 #define FUNC_is_perfect_seventh
61 #define FUNC_next_prime_in_sieve 1
62 #define FUNC_prev_prime_in_sieve 1
63 #define FUNC_ipow 1
64 #include "util.h"
65 #include "sieve.h"
66 #include "primality.h"
67 #include "cache.h"
68 #include "lmo.h"
69 #include "factor.h"
70 #include "mulmod.h"
71 #include "constants.h"
72 #include "montmath.h"
73 #include "csprng.h"
74 #include "keyval.h"
75
76 #define KAHAN_INIT(s) \
77 LNV s ## _y, s ## _t; \
78 LNV s ## _c = 0.0; \
79 LNV s = 0.0;
80
81 #define KAHAN_SUM(s, term) \
82 do { \
83 s ## _y = (term) - s ## _c; \
84 s ## _t = s + s ## _y; \
85 s ## _c = (s ## _t - s) - s ## _y; \
86 s = s ## _t; \
87 } while (0)
88
_numcmp(const void * a,const void * b)89 int _numcmp(const void *a, const void *b) {
90 const UV *x = a, *y = b;
91 return (*x > *y) ? 1 : (*x < *y) ? -1 : 0;
92 }
93
94 static int _verbose = 0;
_XS_set_verbose(int v)95 void _XS_set_verbose(int v) { _verbose = v; }
_XS_get_verbose(void)96 int _XS_get_verbose(void) { return _verbose; }
97
98 static int _call_gmp = 0;
_XS_set_callgmp(int v)99 void _XS_set_callgmp(int v) { _call_gmp = v; }
_XS_get_callgmp(void)100 int _XS_get_callgmp(void) { return _call_gmp; }
101
102 static int _secure = 0;
_XS_set_secure(void)103 void _XS_set_secure(void) { _secure = 1; }
_XS_get_secure(void)104 int _XS_get_secure(void) { return _secure; }
105
106 /* We'll use this little static sieve to quickly answer small values of
107 * is_prime, next_prime, prev_prime, prime_count
108 * for non-threaded Perl it's basically the same as getting the primary
109 * cache. It guarantees we'll have an answer with no waiting on any version.
110 */
111 static const unsigned char prime_sieve30[] =
112 {0x01,0x20,0x10,0x81,0x49,0x24,0xc2,0x06,0x2a,0xb0,0xe1,0x0c,0x15,0x59,0x12,
113 0x61,0x19,0xf3,0x2c,0x2c,0xc4,0x22,0xa6,0x5a,0x95,0x98,0x6d,0x42,0x87,0xe1,
114 0x59,0xa9,0xa9,0x1c,0x52,0xd2,0x21,0xd5,0xb3,0xaa,0x26,0x5c,0x0f,0x60,0xfc,
115 0xab,0x5e,0x07,0xd1,0x02,0xbb,0x16,0x99,0x09,0xec,0xc5,0x47,0xb3,0xd4,0xc5,
116 0xba,0xee,0x40,0xab,0x73,0x3e,0x85,0x4c,0x37,0x43,0x73,0xb0,0xde,0xa7,0x8e,
117 0x8e,0x64,0x3e,0xe8,0x10,0xab,0x69,0xe5,0xf7,0x1a,0x7c,0x73,0xb9,0x8d,0x04,
118 0x51,0x9a,0x6d,0x70,0xa7,0x78,0x2d,0x6d,0x27,0x7e,0x9a,0xd9,0x1c,0x5f,0xee,
119 0xc7,0x38,0xd9,0xc3,0x7e,0x14,0x66,0x72,0xae,0x77,0xc1,0xdb,0x0c,0xcc,0xb2,
120 0xa5,0x74,0xe3,0x58,0xd5,0x4b,0xa7,0xb3,0xb1,0xd9,0x09,0xe6,0x7d,0x23,0x7c,
121 0x3c,0xd3,0x0e,0xc7,0xfd,0x4a,0x32,0x32,0xfd,0x4d,0xb5,0x6b,0xf3,0xa8,0xb3,
122 0x85,0xcf,0xbc,0xf4,0x0e,0x34,0xbb,0x93,0xdb,0x07,0xe6,0xfe,0x6a,0x57,0xa3,
123 0x8c,0x15,0x72,0xdb,0x69,0xd4,0xaf,0x59,0xdd,0xe1,0x3b,0x2e,0xb7,0xf9,0x2b,
124 0xc5,0xd0,0x8b,0x63,0xf8,0x95,0xfa,0x77,0x40,0x97,0xea,0xd1,0x9f,0xaa,0x1c,
125 0x48,0xae,0x67,0xf7,0xeb,0x79,0xa5,0x55,0xba,0xb2,0xb6,0x8f,0xd8,0x2d,0x6c,
126 0x2a,0x35,0x54,0xfd,0x7c,0x9e,0xfa,0xdb,0x31,0x78,0xdd,0x3d,0x56,0x52,0xe7,
127 0x73,0xb2,0x87,0x2e,0x76,0xe9,0x4f,0xa8,0x38,0x9d,0x5d,0x3f,0xcb,0xdb,0xad,
128 0x51,0xa5,0xbf,0xcd,0x72,0xde,0xf7,0xbc,0xcb,0x49,0x2d,0x49,0x26,0xe6,0x1e,
129 0x9f,0x98,0xe5,0xc6,0x9f,0x2f,0xbb,0x85,0x6b,0x65,0xf6,0x77,0x7c,0x57,0x8b,
130 0xaa,0xef,0xd8,0x5e,0xa2,0x97,0xe1,0xdc,0x37,0xcd,0x1f,0xe6,0xfc,0xbb,0x8c,
131 0xb7,0x4e,0xc7,0x3c,0x19,0xd5,0xa8,0x9e,0x67,0x4a,0xe3,0xf5,0x97,0x3a,0x7e,
132 0x70,0x53,0xfd,0xd6,0xe5,0xb8,0x1c,0x6b,0xee,0xb1,0x9b,0xd1,0xeb,0x34,0xc2,
133 0x23,0xeb,0x3a,0xf9,0xef,0x16,0xd6,0x4e,0x7d,0x16,0xcf,0xb8,0x1c,0xcb,0xe6,
134 0x3c,0xda,0xf5,0xcf};
135 #define NPRIME_SIEVE30 (sizeof(prime_sieve30)/sizeof(prime_sieve30[0]))
136
137 static const unsigned short primes_tiny[] =
138 {0,2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,
139 101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,
140 193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,281,283,
141 293,307,311,313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,
142 409,419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503};
143 #define NPRIMES_TINY (sizeof(primes_tiny)/sizeof(primes_tiny[0]))
144
145 /* Return of 2 if n is prime, 0 if not. Do it fast. */
is_prime(UV n)146 int is_prime(UV n)
147 {
148 if (n <= 10)
149 return (n == 2 || n == 3 || n == 5 || n == 7) ? 2 : 0;
150
151 if (n < UVCONST(200000000)) {
152 UV d = n/30;
153 UV m = n - d*30;
154 unsigned char mtab = masktab30[m]; /* Bitmask in mod30 wheel */
155 const unsigned char* sieve;
156
157 /* Return 0 if a multiple of 2, 3, or 5 */
158 if (mtab == 0)
159 return 0;
160
161 /* Check static tiny sieve */
162 if (d < NPRIME_SIEVE30)
163 return (prime_sieve30[d] & mtab) ? 0 : 2;
164
165 if (!(n%7) || !(n%11) || !(n%13)) return 0;
166
167 /* Check primary cache */
168 if (n <= get_prime_cache(0,0)) {
169 int isprime = -1;
170 if (n <= get_prime_cache(0, &sieve))
171 isprime = 2*((sieve[d] & mtab) == 0);
172 release_prime_cache(sieve);
173 if (isprime >= 0)
174 return isprime;
175 }
176 }
177 return is_prob_prime(n);
178 }
179
180
next_prime(UV n)181 UV next_prime(UV n)
182 {
183 UV m, next;
184
185 if (n < 30*NPRIME_SIEVE30) {
186 next = next_prime_in_sieve(prime_sieve30, n, 30*NPRIME_SIEVE30);
187 if (next != 0) return next;
188 }
189
190 if (n >= MPU_MAX_PRIME) return 0; /* Overflow */
191
192 if (n < get_prime_cache(0,0)) {
193 const unsigned char* sieve;
194 UV sieve_size = get_prime_cache(0, &sieve);
195 next = (n < sieve_size) ? next_prime_in_sieve(sieve, n, sieve_size) : 0;
196 release_prime_cache(sieve);
197 if (next != 0) return next;
198 }
199
200 m = n % 30;
201 do { /* Move forward one. */
202 n += wheeladvance30[m];
203 m = nextwheel30[m];
204 } while (!is_prob_prime(n));
205 return n;
206 }
207
208
prev_prime(UV n)209 UV prev_prime(UV n)
210 {
211 UV m, prev;
212
213 if (n < 30*NPRIME_SIEVE30)
214 return prev_prime_in_sieve(prime_sieve30, n);
215
216 if (n < get_prime_cache(0,0)) {
217 const unsigned char* sieve;
218 UV sieve_size = get_prime_cache(0, &sieve);
219 prev = (n < sieve_size) ? prev_prime_in_sieve(sieve, n) : 0;
220 release_prime_cache(sieve);
221 if (prev != 0) return prev;
222 }
223
224 m = n % 30;
225 do { /* Move back one. */
226 n -= wheelretreat30[m];
227 m = prevwheel30[m];
228 } while (!is_prob_prime(n));
229 return n;
230 }
231
232 /******************************************************************************/
233 /* PRINTING */
234 /******************************************************************************/
235
my_sprint(char * ptr,UV val)236 static int my_sprint(char* ptr, UV val) {
237 int nchars;
238 UV t;
239 char *s = ptr;
240 do {
241 t = val / 10; *s++ = (char) ('0' + val - 10 * t);
242 } while ((val = t));
243 nchars = s - ptr + 1; *s = '\n';
244 while (--s > ptr) { char c = *s; *s = *ptr; *ptr++ = c; }
245 return nchars;
246 }
write_buf(int fd,char * buf,char * bend)247 static char* write_buf(int fd, char* buf, char* bend) {
248 int res = (int) write(fd, buf, bend-buf);
249 if (res == -1) croak("print_primes write error");
250 return buf;
251 }
print_primes(UV low,UV high,int fd)252 void print_primes(UV low, UV high, int fd) {
253 char buf[8000+25];
254 char* bend = buf;
255 if ((low <= 2) && (high >= 2)) bend += my_sprint(bend,2);
256 if ((low <= 3) && (high >= 3)) bend += my_sprint(bend,3);
257 if ((low <= 5) && (high >= 5)) bend += my_sprint(bend,5);
258 if (low < 7) low = 7;
259
260 if (low <= high) {
261 unsigned char* segment;
262 UV seg_base, seg_low, seg_high;
263 void* ctx = start_segment_primes(low, high, &segment);
264 while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
265 START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high )
266 bend += my_sprint(bend,p);
267 if (bend-buf > 8000) { bend = write_buf(fd, buf, bend); }
268 END_DO_FOR_EACH_SIEVE_PRIME
269 }
270 end_segment_primes(ctx);
271 }
272 if (bend > buf) { bend = write_buf(fd, buf, bend); }
273 }
274
275 /******************************************************************************/
276 /* TOTIENT, MOEBIUS, MERTENS */
277 /******************************************************************************/
278
279 /* Return a char array with lo-hi+1 elements. mu[k-lo] = µ(k) for k = lo .. hi.
280 * It is the callers responsibility to call Safefree on the result. */
281 #define PGTLO(ip,p,lo) ((ip)>=(lo)) ? (ip) : ((p)*((lo)/(p)) + (((lo)%(p))?(p):0))
range_moebius(UV lo,UV hi)282 signed char* range_moebius(UV lo, UV hi)
283 {
284 signed char* mu;
285 UV i, sqrtn = isqrt(hi), count = hi-lo+1;
286
287 /* Kuznetsov indicates that the Deléglise & Rivat (1996) method can be
288 * modified to work on logs, which allows us to operate with no
289 * intermediate memory at all. Same time as the D&R method, less memory. */
290 unsigned char logp;
291 UV nextlog, nextlogi;
292
293 Newz(0, mu, count, signed char);
294 if (sqrtn*sqrtn != hi && sqrtn < (UVCONST(1)<<(BITS_PER_WORD/2))-1) sqrtn++;
295
296 /* For small ranges, do it by hand */
297 if (hi < 100 || count <= 10 || (hi > (1UL<<25) && count < icbrt(hi)/4)) {
298 for (i = 0; i < count; i++)
299 mu[i] = moebius(lo+i);
300 return mu;
301 }
302
303 logp = 1; nextlog = 3; /* 2+1 */
304 START_DO_FOR_EACH_PRIME(2, sqrtn) {
305 UV p2 = p*p;
306 if (p > nextlog) {
307 logp += 2; /* logp is 1 | ceil(log(p)/log(2)) */
308 nextlog = ((nextlog-1)*4)+1;
309 }
310 for (i = PGTLO(p, p, lo); i >= lo && i <= hi; i += p)
311 mu[i-lo] += logp;
312 for (i = PGTLO(p2, p2, lo); i >= lo && i <= hi; i += p2)
313 mu[i-lo] = 0x80;
314 } END_DO_FOR_EACH_PRIME
315
316 logp = log2floor(lo);
317 nextlogi = (UVCONST(2) << logp) - lo;
318 for (i = 0; i < count; i++) {
319 unsigned char a = mu[i];
320 if (i >= nextlogi) nextlogi = (UVCONST(2) << ++logp) - lo;
321 if (a & 0x80) { a = 0; }
322 else if (a >= logp) { a = 1 - 2*(a&1); }
323 else { a = -1 + 2*(a&1); }
324 mu[i] = a;
325 }
326 if (lo == 0) mu[0] = 0;
327
328 return mu;
329 }
330
range_totient(UV lo,UV hi)331 UV* range_totient(UV lo, UV hi) {
332 UV* totients;
333 UV i, seg_base, seg_low, seg_high, count = hi-lo+1;
334 unsigned char* segment;
335 void* ctx;
336
337 if (hi < lo) croak("range_totient error hi %"UVuf" < lo %"UVuf"\n", hi, lo);
338 New(0, totients, count, UV);
339
340 /* Do via factoring if very small or if we have a small range */
341 if (hi < 100 || count <= 10 || hi/count > 1000) {
342 for (i = 0; i < count; i++)
343 totients[i] = totient(lo+i);
344 return totients;
345 }
346
347 if (hi == UV_MAX) {
348 totients[--count] = totient(UV_MAX);
349 hi--;
350 }
351
352 /* If doing a full sieve, do it monolithic. Faster. */
353 if (lo == 0) {
354 UV* prime;
355 double loghi = log(hi);
356 UV max_index = (hi < 67) ? 18
357 : (hi < 355991) ? 15+(hi/(loghi-1.09))
358 : (hi/loghi) * (1.0+1.0/loghi+2.51/(loghi*loghi));
359 UV j, index, nprimes = 0;
360
361 New(0, prime, max_index, UV); /* could use prime_count_upper(hi) */
362 memset(totients, 0, count * sizeof(UV));
363 for (i = 2; i <= hi/2; i++) {
364 index = 2*i;
365 if ( !(i&1) ) {
366 if (i == 2) { totients[2] = 1; prime[nprimes++] = 2; }
367 totients[index] = totients[i]*2;
368 } else {
369 if (totients[i] == 0) {
370 totients[i] = i-1;
371 prime[nprimes++] = i;
372 }
373 for (j=0; j < nprimes && index <= hi; index = i*prime[++j]) {
374 if (i % prime[j] == 0) {
375 totients[index] = totients[i]*prime[j];
376 break;
377 } else {
378 totients[index] = totients[i]*(prime[j]-1);
379 }
380 }
381 }
382 }
383 Safefree(prime);
384 /* All totient values have been filled in except the primes. Mark them. */
385 for (i = ((hi/2) + 1) | 1; i <= hi; i += 2)
386 if (totients[i] == 0)
387 totients[i] = i-1;
388 totients[1] = 1;
389 totients[0] = 0;
390 return totients;
391 }
392
393 for (i = 0; i < count; i++) {
394 UV v = lo+i, nv = v;
395 if (v % 2 == 0) nv -= nv/2;
396 if (v % 3 == 0) nv -= nv/3;
397 if (v % 5 == 0) nv -= nv/5;
398 totients[i] = nv;
399 }
400
401 ctx = start_segment_primes(7, hi/2, &segment);
402 while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
403 START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high ) {
404 for (i = PGTLO(2*p,p,lo); i >= lo && i <= hi; i += p)
405 totients[i-lo] -= totients[i-lo]/p;
406 } END_DO_FOR_EACH_SIEVE_PRIME
407 }
408 end_segment_primes(ctx);
409
410 /* Fill in all primes */
411 for (i = (lo | 1) - lo; i < count; i += 2)
412 if (totients[i] == i+lo)
413 totients[i]--;
414 if (lo <= 1) totients[1-lo] = 1;
415
416 return totients;
417 }
418
mertens(UV n)419 IV mertens(UV n) {
420 /* See Deléglise and Rivat (1996) for O(n^2/3 log(log(n))^1/3) algorithm.
421 * This implementation uses their lemma 2.1 directly, so is ~ O(n).
422 * In serial it is quite a bit faster than segmented summation of mu
423 * ranges, though the latter seems to be a favored method for GPUs.
424 */
425 UV u, j, m, nmk, maxmu;
426 signed char* mu;
427 short* M; /* 16 bits is enough range for all 32-bit M => 64-bit n */
428 IV sum;
429
430 if (n <= 1) return n;
431 u = isqrt(n);
432 maxmu = (n/(u+1)); /* maxmu lets us handle u < sqrt(n) */
433 if (maxmu < u) maxmu = u;
434 mu = range_moebius(0, maxmu);
435 New(0, M, maxmu+1, short); /* Works up to maxmu < 7613644886 */
436 M[0] = 0;
437 for (j = 1; j <= maxmu; j++)
438 M[j] = M[j-1] + mu[j];
439 sum = M[u];
440 for (m = 1; m <= u; m++) {
441 if (mu[m] != 0) {
442 IV inner_sum = 0;
443 UV lower = (u/m) + 1;
444 UV last_nmk = n/(m*lower);
445 UV this_k = 0;
446 UV next_k = n/(m*1);
447 UV nmkm = m * 2;
448 for (nmk = 1; nmk <= last_nmk; nmk++, nmkm += m) {
449 this_k = next_k;
450 next_k = n/nmkm;
451 inner_sum += M[nmk] * (this_k - next_k);
452 }
453 sum += (mu[m] > 0) ? -inner_sum : inner_sum;
454 }
455 }
456 Safefree(M);
457 Safefree(mu);
458 return sum;
459 }
460
461 /******************************************************************************/
462 /* POWERS and ROOTS */
463 /******************************************************************************/
464
465 /* There are at least 4 ways to do this, plus hybrids.
466 * 1) use a table. Great for 32-bit, too big for 64-bit.
467 * 2) Use pow() to check. Relatively slow and FP is always dangerous.
468 * 3) factor or trial factor. Slow for 64-bit.
469 * 4) Dietzfelbinger algorithm 2.3.5. Quite slow.
470 * This currently uses a hybrid of 1 and 2.
471 */
powerof(UV n)472 int powerof(UV n) {
473 UV t;
474 if ((n <= 3) || (n == UV_MAX)) return 1;
475 if ((n & (n-1)) == 0) return ctz(n); /* powers of 2 */
476 if (is_perfect_square(n)) return 2 * powerof(isqrt(n));
477 if (is_perfect_cube(n)) return 3 * powerof(icbrt(n));
478
479 /* Simple rejection filter for non-powers of 5-37. Rejects 47.85%. */
480 t = n & 511; if ((t*77855451) & (t*4598053) & 862) return 1;
481
482 if (is_perfect_fifth(n)) return 5 * powerof(rootof(n,5));
483 if (is_perfect_seventh(n)) return 7 * powerof(rootof(n,7));
484
485 if (n > 177146 && n <= UVCONST(1977326743)) {
486 switch (n) { /* Check for powers of 11, 13, 17, 19 within 32 bits */
487 case 177147: case 48828125: case 362797056: case 1977326743: return 11;
488 case 1594323: case 1220703125: return 13;
489 case 129140163: return 17;
490 case 1162261467: return 19;
491 default: break;
492 }
493 }
494 #if BITS_PER_WORD == 64
495 if (n >= UVCONST(8589934592)) {
496 /* The Bloom filters reject about 90% of inputs each, about 99% for two.
497 * Bach/Sorenson type sieves do about as well, but are much slower due
498 * to using a powmod. */
499 if ( (t = n %121, !((t*19706187) & (t*61524433) & 876897796)) &&
500 (t = n % 89, !((t*28913398) & (t*69888189) & 2705511937U)) ) {
501 /* (t = n % 67, !((t*117621317) & (t*48719734) & 537242019)) ) { */
502 UV root = rootof(n,11);
503 if (n == ipow(root,11)) return 11;
504 }
505 if ( (t = n %131, !((t*1545928325) & (t*1355660813) & 2771533888U)) &&
506 (t = n % 79, !((t*48902028) & (t*48589927) & 404082779)) ) {
507 /* (t = n % 53, !((t*79918293) & (t*236846524) & 694943819)) ) { */
508 UV root = rootof(n,13);
509 if (n == ipow(root,13)) return 13;
510 }
511 switch (n) {
512 case UVCONST(762939453125):
513 case UVCONST(16926659444736):
514 case UVCONST(232630513987207):
515 case UVCONST(100000000000000000):
516 case UVCONST(505447028499293771):
517 case UVCONST(2218611106740436992):
518 case UVCONST(8650415919381337933): return 17;
519 case UVCONST(19073486328125):
520 case UVCONST(609359740010496):
521 case UVCONST(11398895185373143):
522 case UVCONST(10000000000000000000): return 19;
523 case UVCONST(94143178827):
524 case UVCONST(11920928955078125):
525 case UVCONST(789730223053602816): return 23;
526 case UVCONST(68630377364883): return 29;
527 case UVCONST(617673396283947): return 31;
528 case UVCONST(450283905890997363): return 37;
529 default: break;
530 }
531 }
532 #endif
533 return 1;
534 }
is_power(UV n,UV a)535 int is_power(UV n, UV a)
536 {
537 int ret;
538 if (a > 0) {
539 if (a == 1 || n <= 1) return 1;
540 if ((a % 2) == 0)
541 return !is_perfect_square(n) ? 0 : (a == 2) ? 1 : is_power(isqrt(n),a>>1);
542 if ((a % 3) == 0)
543 return !is_perfect_cube(n) ? 0 : (a == 3) ? 1 : is_power(icbrt(n),a/3);
544 if ((a % 5) == 0)
545 return !is_perfect_fifth(n) ? 0 : (a == 5) ? 1 :is_power(rootof(n,5),a/5);
546 }
547 ret = powerof(n);
548 if (a != 0) return !(ret % a); /* Is the max power divisible by a? */
549 return (ret == 1) ? 0 : ret;
550 }
551
552 #if BITS_PER_WORD == 64
553 #define ROOT_MAX_3 41
554 static const uint32_t root_max[ROOT_MAX_3] = {0,0,0,2642245,65535,7131,1625,565,255,138,84,56,40,30,23,19,15,13,11,10,9,8,7,6,6,5,5,5,4,4,4,4,3,3,3,3,3,3,3,3,3};
555 #else
556 #define ROOT_MAX_3 21
557 static const uint32_t root_max[ROOT_MAX_3] = {0,0,0,1625,255,84,40,23,15,11,9,7,6,5,4,4,3,3,3,3,3};
558 #endif
559
rootof(UV n,UV k)560 UV rootof(UV n, UV k) {
561 UV lo, hi, max;
562 if (k == 0) return 0;
563 if (k == 1) return n;
564 if (k == 2) return isqrt(n);
565 if (k == 3) return icbrt(n);
566
567 /* Bracket between powers of 2, but never exceed max power so ipow works */
568 max = 1 + ((k >= ROOT_MAX_3) ? 2 : root_max[k]);
569 lo = UVCONST(1) << (log2floor(n)/k);
570 hi = ((lo*2) < max) ? lo*2 : max;
571
572 /* Binary search */
573 while (lo < hi) {
574 UV mid = lo + (hi-lo)/2;
575 if (ipow(mid,k) <= n) lo = mid+1;
576 else hi = mid;
577 }
578 return lo-1;
579 }
580
primepower(UV n,UV * prime)581 int primepower(UV n, UV* prime)
582 {
583 int power = 0;
584 if (n < 2) return 0;
585 /* Check for small divisors */
586 if (!(n&1)) {
587 if (n & (n-1)) return 0;
588 *prime = 2;
589 return ctz(n);
590 }
591 if ((n%3) == 0) {
592 /* if (UVCONST(12157665459056928801) % n) return 0; */
593 do { n /= 3; power++; } while (n > 1 && (n%3) == 0);
594 if (n != 1) return 0;
595 *prime = 3;
596 return power;
597 }
598 if ((n%5) == 0) {
599 do { n /= 5; power++; } while (n > 1 && (n%5) == 0);
600 if (n != 1) return 0;
601 *prime = 5;
602 return power;
603 }
604 if ((n%7) == 0) {
605 do { n /= 7; power++; } while (n > 1 && (n%7) == 0);
606 if (n != 1) return 0;
607 *prime = 7;
608 return power;
609 }
610 if (is_prob_prime(n))
611 { *prime = n; return 1; }
612 /* Composite. Test for perfect power with prime root. */
613 power = powerof(n);
614 if (power == 1) power = 0;
615 if (power) {
616 UV root = rootof(n, (UV)power);
617 if (is_prob_prime(root))
618 *prime = root;
619 else
620 power = 0;
621 }
622 return power;
623 }
624
valuation(UV n,UV k)625 UV valuation(UV n, UV k)
626 {
627 UV v = 0;
628 UV kpower = k;
629 if (k < 2 || n < 2) return 0;
630 if (k == 2) return ctz(n);
631 while ( !(n % kpower) ) {
632 kpower *= k;
633 v++;
634 }
635 return v;
636 }
637
logint(UV n,UV b)638 UV logint(UV n, UV b)
639 {
640 /* UV e; for (e=0; n; n /= b) e++; return e-1; */
641 UV v, e = 0;
642 if (b == 2)
643 return log2floor(n);
644 if (n > UV_MAX/b) {
645 n /= b;
646 e = 1;
647 }
648 for (v = b; v <= n; v *= b)
649 e++;
650 return e;
651 }
652
mpu_popcount_string(const char * ptr,uint32_t len)653 UV mpu_popcount_string(const char* ptr, uint32_t len)
654 {
655 uint32_t count = 0, i, j, d, v, power, slen, *s, *sptr;
656
657 while (len > 0 && (*ptr == '0' || *ptr == '+' || *ptr == '-'))
658 { ptr++; len--; }
659
660 /* Create s as array of base 10^8 numbers */
661 slen = (len + 7) / 8;
662 Newz(0, s, slen, uint32_t);
663 for (i = 0; i < slen; i++) { /* Chunks of 8 digits */
664 for (j = 0, d = 0, power = 1; j < 8 && len > 0; j++, power *= 10) {
665 v = ptr[--len] - '0';
666 if (v > 9) croak("Parameter '%s' must be a positive integer",ptr);
667 d += power * v;
668 }
669 s[slen - 1 - i] = d;
670 }
671 /* Repeatedly count and divide by 2 across s */
672 while (slen > 1) {
673 if (s[slen-1] & 1) count++;
674 sptr = s;
675 if (s[0] == 1) {
676 if (--slen == 0) break;
677 *++sptr += 100000000;
678 }
679 for (i = 0; i < slen; i++) {
680 if ( (i+1) < slen && sptr[i] & 1 ) sptr[i+1] += 100000000;
681 s[i] = sptr[i] >> 1;
682 }
683 }
684 /* For final base 10^8 number just do naive popcnt */
685 for (d = s[0]; d > 0; d >>= 1)
686 if (d & 1)
687 count++;
688 Safefree(s);
689 return count;
690 }
691
692
693 /* How many times does 2 divide n? */
694 #define padic2(n) ctz(n)
695 #define IS_MOD8_3OR5(x) (((x)&7)==3 || ((x)&7)==5)
696
kronecker_uu_sign(UV a,UV b,int s)697 static int kronecker_uu_sign(UV a, UV b, int s) {
698 while (a) {
699 int r = padic2(a);
700 if (r) {
701 if ((r&1) && IS_MOD8_3OR5(b)) s = -s;
702 a >>= r;
703 }
704 if (a & b & 2) s = -s;
705 { UV t = b % a; b = a; a = t; }
706 }
707 return (b == 1) ? s : 0;
708 }
709
kronecker_uu(UV a,UV b)710 int kronecker_uu(UV a, UV b) {
711 int r, s;
712 if (b & 1) return kronecker_uu_sign(a, b, 1);
713 if (!(a&1)) return 0;
714 s = 1;
715 r = padic2(b);
716 if (r) {
717 if ((r&1) && IS_MOD8_3OR5(a)) s = -s;
718 b >>= r;
719 }
720 return kronecker_uu_sign(a, b, s);
721 }
722
kronecker_su(IV a,UV b)723 int kronecker_su(IV a, UV b) {
724 int r, s;
725 if (a >= 0) return kronecker_uu(a, b);
726 if (b == 0) return (a == 1 || a == -1) ? 1 : 0;
727 s = 1;
728 r = padic2(b);
729 if (r) {
730 if (!(a&1)) return 0;
731 if ((r&1) && IS_MOD8_3OR5(a)) s = -s;
732 b >>= r;
733 }
734 a %= (IV) b;
735 if (a < 0) a += b;
736 return kronecker_uu_sign(a, b, s);
737 }
738
kronecker_ss(IV a,IV b)739 int kronecker_ss(IV a, IV b) {
740 if (a >= 0 && b >= 0)
741 return (b & 1) ? kronecker_uu_sign(a, b, 1) : kronecker_uu(a,b);
742 if (b >= 0)
743 return kronecker_su(a, b);
744 return kronecker_su(a, -b) * ((a < 0) ? -1 : 1);
745 }
746
factorial(UV n)747 UV factorial(UV n) {
748 UV i, r = 1;
749 if ( (n > 12 && sizeof(UV) <= 4) || (n > 20 && sizeof(UV) <= 8) ) return 0;
750 for (i = 2; i <= n; i++)
751 r *= i;
752 return r;
753 }
754
binomial(UV n,UV k)755 UV binomial(UV n, UV k) { /* Thanks to MJD and RosettaCode for ideas */
756 UV d, g, r = 1;
757 if (k == 0) return 1;
758 if (k == 1) return n;
759 if (k >= n) return (k == n);
760 if (k > n/2) k = n-k;
761 for (d = 1; d <= k; d++) {
762 if (r >= UV_MAX/n) { /* Possible overflow */
763 UV nr, dr; /* reduced numerator / denominator */
764 g = gcd_ui(n, d); nr = n/g; dr = d/g;
765 g = gcd_ui(r, dr); r = r/g; dr = dr/g;
766 if (r >= UV_MAX/nr) return 0; /* Unavoidable overflow */
767 r *= nr;
768 r /= dr;
769 n--;
770 } else {
771 r *= n--;
772 r /= d;
773 }
774 }
775 return r;
776 }
777
stirling3(UV n,UV m)778 UV stirling3(UV n, UV m) { /* Lah numbers */
779 UV f1, f2;
780
781 if (m == n) return 1;
782 if (n == 0 || m == 0 || m > n) return 0;
783 if (m == 1) return factorial(n);
784
785 f1 = binomial(n, m);
786 if (f1 == 0) return 0;
787 f2 = binomial(n-1, m-1);
788 if (f2 == 0 || f1 >= UV_MAX/f2) return 0;
789 f1 *= f2;
790 f2 = factorial(n-m);
791 if (f2 == 0 || f1 >= UV_MAX/f2) return 0;
792 return f1 * f2;
793 }
794
stirling2(UV n,UV m)795 IV stirling2(UV n, UV m) {
796 UV f;
797 IV j, k, t, s = 0;
798
799 if (m == n) return 1;
800 if (n == 0 || m == 0 || m > n) return 0;
801 if (m == 1) return 1;
802
803 if ((f = factorial(m)) == 0) return 0;
804 for (j = 1; j <= (IV)m; j++) {
805 t = binomial(m, j);
806 for (k = 1; k <= (IV)n; k++) {
807 if (t == 0 || j >= IV_MAX/t) return 0;
808 t *= j;
809 }
810 if ((m-j) & 1) t *= -1;
811 s += t;
812 }
813 return s/f;
814 }
815
stirling1(UV n,UV m)816 IV stirling1(UV n, UV m) {
817 IV k, t, b1, b2, s2, s = 0;
818
819 if (m == n) return 1;
820 if (n == 0 || m == 0 || m > n) return 0;
821 if (m == 1) {
822 UV f = factorial(n-1);
823 if (f>(UV)IV_MAX) return 0;
824 return (n&1) ? ((IV)f) : -((IV)f);
825 }
826
827 for (k = 1; k <= (IV)(n-m); k++) {
828 b1 = binomial(k + n - 1, n - m + k);
829 b2 = binomial(2 * n - m, n - m - k);
830 s2 = stirling2(n - m + k, k);
831 if (b1 == 0 || b2 == 0 || s2 == 0 || b1 > IV_MAX/b2) return 0;
832 t = b1 * b2;
833 if (s2 > IV_MAX/t) return 0;
834 t *= s2;
835 s += (k & 1) ? -t : t;
836 }
837 return s;
838 }
839
totient(UV n)840 UV totient(UV n) {
841 UV i, nfacs, totient, lastf, facs[MPU_MAX_FACTORS+1];
842 if (n <= 1) return n;
843 totient = 1;
844 /* phi(2m) = 2phi(m) if m even, phi(m) if m odd */
845 while ((n & 0x3) == 0) { n >>= 1; totient <<= 1; }
846 if ((n & 0x1) == 0) { n >>= 1; }
847 /* factor and calculate totient */
848 nfacs = factor(n, facs);
849 lastf = 0;
850 for (i = 0; i < nfacs; i++) {
851 UV f = facs[i];
852 if (f == lastf) { totient *= f; }
853 else { totient *= f-1; lastf = f; }
854 }
855 return totient;
856 }
857
858 static const UV jordan_overflow[5] =
859 #if BITS_PER_WORD == 64
860 {UVCONST(4294967311), 2642249, 65537, 7133, 1627};
861 #else
862 {UVCONST( 65537), 1627, 257, 85, 41};
863 #endif
jordan_totient(UV k,UV n)864 UV jordan_totient(UV k, UV n) {
865 UV factors[MPU_MAX_FACTORS+1];
866 int nfac, i;
867 UV totient;
868 if (k == 0 || n <= 1) return (n == 1);
869 if (k > 6 || (k > 1 && n >= jordan_overflow[k-2])) return 0;
870
871 totient = 1;
872 /* Similar to Euler totient, shortcut even inputs */
873 while ((n & 0x3) == 0) { n >>= 1; totient *= (1<<k); }
874 if ((n & 0x1) == 0) { n >>= 1; totient *= ((1<<k)-1); }
875 nfac = factor(n,factors);
876 for (i = 0; i < nfac; i++) {
877 UV p = factors[i];
878 UV pk = ipow(p,k);
879 totient *= (pk-1);
880 while (i+1 < nfac && p == factors[i+1]) {
881 i++;
882 totient *= pk;
883 }
884 }
885 return totient;
886 }
887
carmichael_lambda(UV n)888 UV carmichael_lambda(UV n) {
889 UV fac[MPU_MAX_FACTORS+1];
890 int i, nfactors;
891 UV lambda = 1;
892
893 if (n < 8) return totient(n);
894 if ((n & (n-1)) == 0) return n >> 2;
895
896 i = ctz(n);
897 if (i > 0) {
898 n >>= i;
899 lambda <<= (i>2) ? i-2 : i-1;
900 }
901 nfactors = factor(n, fac);
902 for (i = 0; i < nfactors; i++) {
903 UV p = fac[i], pk = p-1;
904 while (i+1 < nfactors && p == fac[i+1]) {
905 i++;
906 pk *= p;
907 }
908 lambda = lcm_ui(lambda, pk);
909 }
910 return lambda;
911 }
912
is_carmichael(UV n)913 int is_carmichael(UV n) {
914 UV fac[MPU_MAX_FACTORS+1];
915 UV exp[MPU_MAX_FACTORS+1];
916 int i, nfactors;
917
918 /* Small or even is not a Carmichael number */
919 if (n < 561 || !(n&1)) return 0;
920
921 /* Simple pre-test for square free (odds only) */
922 if (!(n% 9) || !(n%25) || !(n%49) || !(n%121) || !(n%169))
923 return 0;
924
925 /* Check Korselt's criterion for small divisors */
926 if (!(n% 5) && ((n-1) % 4 != 0)) return 0;
927 if (!(n% 7) && ((n-1) % 6 != 0)) return 0;
928 if (!(n%11) && ((n-1) % 10 != 0)) return 0;
929 if (!(n%13) && ((n-1) % 12 != 0)) return 0;
930 if (!(n%17) && ((n-1) % 16 != 0)) return 0;
931 if (!(n%19) && ((n-1) % 18 != 0)) return 0;
932 if (!(n%23) && ((n-1) % 22 != 0)) return 0;
933
934 /* Fast check without having to factor */
935 if (n > 5000000) {
936 if (!(n%29) && ((n-1) % 28 != 0)) return 0;
937 if (!(n%31) && ((n-1) % 30 != 0)) return 0;
938 if (!(n%37) && ((n-1) % 36 != 0)) return 0;
939 if (!(n%41) && ((n-1) % 40 != 0)) return 0;
940 if (!(n%43) && ((n-1) % 42 != 0)) return 0;
941 if (!is_pseudoprime(n,2)) return 0;
942 }
943
944 nfactors = factor_exp(n, fac, exp);
945 if (nfactors < 3)
946 return 0;
947 for (i = 0; i < nfactors; i++) {
948 if (exp[i] > 1 || ((n-1) % (fac[i]-1)) != 0)
949 return 0;
950 }
951 return 1;
952 }
953
is_quasi_base(int nfactors,UV * fac,UV p,UV b)954 static int is_quasi_base(int nfactors, UV *fac, UV p, UV b) {
955 int i;
956 for (i = 0; i < nfactors; i++) {
957 UV d = fac[i] - b;
958 if (d == 0 || (p % d) != 0)
959 return 0;
960 }
961 return 1;
962 }
963
964 /* Returns number of bases that pass */
is_quasi_carmichael(UV n)965 UV is_quasi_carmichael(UV n) {
966 UV nbases, fac[MPU_MAX_FACTORS+1], exp[MPU_MAX_FACTORS+1];
967 UV spf, lpf, ndivisors, *divs;
968 int i, nfactors;
969
970 if (n < 35) return 0;
971
972 /* Simple pre-test for square free */
973 if (!(n% 4) || !(n% 9) || !(n%25) || !(n%49) || !(n%121) || !(n%169))
974 return 0;
975
976 nfactors = factor_exp(n, fac, exp);
977 /* Must be composite */
978 if (nfactors < 2)
979 return 0;
980 /* Must be square free */
981 for (i = 0; i < nfactors; i++)
982 if (exp[i] > 1)
983 return 0;
984
985 nbases = 0;
986 spf = fac[0];
987 lpf = fac[nfactors-1];
988
989 /* Algorithm from Hiroaki Yamanouchi, 2015 */
990 if (nfactors == 2) {
991 divs = _divisor_list(n / spf - 1, &ndivisors);
992 for (i = 0; i < (int)ndivisors; i++) {
993 UV d = divs[i];
994 UV k = spf - d;
995 if (d >= spf) break;
996 if (is_quasi_base(nfactors, fac, n-k, k))
997 nbases++;
998 }
999 } else {
1000 divs = _divisor_list(lpf * (n / lpf - 1), &ndivisors);
1001 for (i = 0; i < (int)ndivisors; i++) {
1002 UV d = divs[i];
1003 UV k = lpf - d;
1004 if (lpf > d && k >= spf) continue;
1005 if (k != 0 && is_quasi_base(nfactors, fac, n-k, k))
1006 nbases++;
1007 }
1008 }
1009 Safefree(divs);
1010 return nbases;
1011 }
1012
is_semiprime(UV n)1013 int is_semiprime(UV n) {
1014 UV sp, p, n3, factors[2];
1015
1016 if (n < 6) return (n == 4);
1017 if (!(n&1)) return !!is_prob_prime(n>>1);
1018 if (!(n%3)) return !!is_prob_prime(n/3);
1019 if (!(n%5)) return !!is_prob_prime(n/5);
1020 /* 27% of random inputs left */
1021 n3 = icbrt(n);
1022 for (sp = 4; sp < 60; sp++) {
1023 p = primes_tiny[sp];
1024 if (p > n3)
1025 break;
1026 if ((n % p) == 0)
1027 return !!is_prob_prime(n/p);
1028 }
1029 /* 9.8% of random inputs left */
1030 if (is_def_prime(n)) return 0;
1031 if (p > n3) return 1; /* past this, n is a composite and larger than p^3 */
1032 /* 4-8% of random inputs left */
1033 if (factor_one(n, factors, 0, 0) != 2) return 0;
1034 return (is_def_prime(factors[0]) && is_def_prime(factors[1]));
1035 }
1036
is_fundamental(UV n,int neg)1037 int is_fundamental(UV n, int neg) {
1038 UV r = n & 15;
1039 if (r) {
1040 if (!neg) {
1041 switch (r & 3) {
1042 case 0: return (r == 4) ? 0 : is_square_free(n >> 2);
1043 case 1: return is_square_free(n);
1044 default: break;
1045 }
1046 } else {
1047 switch (r & 3) {
1048 case 0: return (r == 12) ? 0 : is_square_free(n >> 2);
1049 case 3: return is_square_free(n);
1050 default: break;
1051 }
1052 }
1053 }
1054 return 0;
1055 }
1056
_totpred(UV n,UV maxd)1057 static int _totpred(UV n, UV maxd) {
1058 UV i, ndivisors, *divs;
1059 int res;
1060
1061 if (n & 1) return 0;
1062 n >>= 1;
1063 if (n == 1) return 1;
1064 if (n < maxd && is_prime(2*n+1)) return 1;
1065
1066 divs = _divisor_list(n, &ndivisors);
1067 for (i = 0, res = 0; i < ndivisors && divs[i] < maxd && res == 0; i++) {
1068 UV r, d = divs[i], p = 2*d+1;
1069 if (!is_prime(p)) continue;
1070 r = n/d;
1071 while (1) {
1072 if (r == p || _totpred(r, d)) { res = 1; break; }
1073 if (r % p) break;
1074 r /= p;
1075 }
1076 }
1077 Safefree(divs);
1078 return res;
1079 }
1080
is_totient(UV n)1081 int is_totient(UV n) {
1082 return (n == 0 || (n & 1)) ? (n==1) : _totpred(n,n);
1083 }
1084
1085
inverse_totient_count(UV n)1086 UV inverse_totient_count(UV n) {
1087 set_t set, sumset;
1088 keyval_t keyval;
1089 UV res, i, ndivisors, *divs;
1090
1091 if (n == 1) return 2;
1092 if (n < 1 || n & 1) return 0;
1093 if (is_prime(n >> 1)) { /* Coleman Remark 3.3 (Thm 3.1) and Prop 6.2 */
1094 if (!is_prime(n+1)) return 0;
1095 if (n >= 10) return 2;
1096 }
1097
1098 divs = _divisor_list(n, &ndivisors);
1099
1100 init_set(&set, 2*ndivisors);
1101 keyval.key = 1; keyval.val = 1;
1102 set_addsum(&set, keyval);
1103
1104 for (i = 0; i < ndivisors; i++) {
1105 UV d = divs[i], p = d+1;
1106 if (is_prime(p)) {
1107 UV j, np = d, v = valuation(n, p);
1108 init_set(&sumset, ndivisors/2);
1109 for (j = 0; j <= v; j++) {
1110 UV k, ndiv = n/np; /* Loop over divisors of n/np */
1111 if (np == 1) {
1112 keyval_t kv; kv.key = 1; kv.val = 1;
1113 set_addsum(&sumset, kv);
1114 } else {
1115 for (k = 0; k < ndivisors && divs[k] <= ndiv; k++) {
1116 UV val, d2 = divs[k];
1117 if ((ndiv % d2) != 0) continue;
1118 val = set_getval(set, d2);
1119 if (val > 0) {
1120 keyval_t kv; kv.key = d2*np; kv.val = val;
1121 set_addsum(&sumset, kv);
1122 }
1123 }
1124 }
1125 /* if (j < v && np > UV_MAX/p) croak("overflow np d %lu", d); */
1126 np *= p;
1127 }
1128 set_merge(&set, sumset);
1129 free_set(&sumset);
1130 }
1131 }
1132 Safefree(divs);
1133 res = set_getval(set, n);
1134 free_set(&set);
1135 return res;
1136 }
1137
inverse_totient_list(UV * ntotients,UV n)1138 UV* inverse_totient_list(UV *ntotients, UV n) {
1139 set_list_t setlist, divlist;
1140 UV i, ndivisors, *divs, *tlist;
1141 UV *totlist = 0;
1142
1143 MPUassert(n <= UV_MAX/7.5, "inverse_totient_list n too large");
1144
1145 if (n == 1) {
1146 New(0, totlist, 2, UV);
1147 totlist[0] = 1; totlist[1] = 2;
1148 *ntotients = 2;
1149 return totlist;
1150 }
1151 if (n < 1 || n & 1) {
1152 *ntotients = 0;
1153 return totlist;
1154 }
1155 if (is_prime(n >> 1)) { /* Coleman Remark 3.3 (Thm 3.1) and Prop 6.2 */
1156 if (!is_prime(n+1)) {
1157 *ntotients = 0;
1158 return totlist;
1159 }
1160 if (n >= 10) {
1161 New(0, totlist, 2, UV);
1162 totlist[0] = n+1; totlist[1] = 2*n+2;
1163 *ntotients = 2;
1164 return totlist;
1165 }
1166 }
1167
1168 divs = _divisor_list(n, &ndivisors);
1169
1170 init_setlist(&setlist, 2*ndivisors);
1171 setlist_addval(&setlist, 1, 1); /* Add 1 => [1] */
1172
1173 for (i = 0; i < ndivisors; i++) {
1174 UV d = divs[i], p = d+1;
1175 if (is_prime(p)) {
1176 UV j, dp = d, pp = p, v = valuation(n, p);
1177 init_setlist(&divlist, ndivisors/2);
1178 for (j = 0; j <= v; j++) {
1179 UV k, ndiv = n/dp; /* Loop over divisors of n/dp */
1180 if (dp == 1) {
1181 setlist_addval(&divlist, 1, 2); /* Add 1 => [2] */
1182 } else {
1183 for (k = 0; k < ndivisors && divs[k] <= ndiv; k++) {
1184 UV nvals, *vals, d2 = divs[k];
1185 if ((ndiv % d2) != 0) continue;
1186 vals = setlist_getlist(&nvals, setlist, d2);
1187 if (vals != 0)
1188 setlist_addlist(&divlist, d2 * dp, nvals, vals, pp);
1189 }
1190 }
1191 dp *= p;
1192 pp *= p;
1193 }
1194 setlist_merge(&setlist, divlist);
1195 free_setlist(&divlist);
1196 }
1197 }
1198 Safefree(divs);
1199 tlist = setlist_getlist(ntotients, setlist, n);
1200 if (tlist != 0 && *ntotients > 0) {
1201 New(0, totlist, *ntotients, UV);
1202 memcpy(totlist, tlist, *ntotients * sizeof(UV));
1203 qsort(totlist, *ntotients, sizeof(UV), _numcmp);
1204 }
1205 free_setlist(&setlist);
1206 return totlist;
1207 }
1208
1209
pillai_v(UV n)1210 UV pillai_v(UV n) {
1211 UV v, fac;
1212 if (n == 0) return 0;
1213 for (v = 8, fac = 5040 % n; v < n-1 && fac != 0; v++) {
1214 fac = (n < HALF_WORD) ? (fac*v) % n : mulmod(fac,v,n);
1215 if (fac == n-1 && (n % v) != 1)
1216 return v;
1217 }
1218 return 0;
1219 }
1220
1221
moebius(UV n)1222 int moebius(UV n) {
1223 UV factors[MPU_MAX_FACTORS+1];
1224 int i, nfactors;
1225
1226 if (n <= 5) return (n == 1) ? 1 : (n % 4) ? -1 : 0;
1227 if (n >= 49 && (!(n % 4) || !(n % 9) || !(n % 25) || !(n % 49)))
1228 return 0;
1229 if (n >= 361 && (!(n % 121) || !(n % 169) || !(n % 289) || !(n % 361)))
1230 return 0;
1231 if (n >= 961 && (!(n % 529) || !(n % 841) || !(n % 961)))
1232 return 0;
1233
1234 nfactors = factor(n, factors);
1235 for (i = 1; i < nfactors; i++)
1236 if (factors[i] == factors[i-1])
1237 return 0;
1238 return (nfactors % 2) ? -1 : 1;
1239 }
1240
exp_mangoldt(UV n)1241 UV exp_mangoldt(UV n) {
1242 UV p;
1243 if (!primepower(n,&p)) return 1; /* Not a prime power */
1244 return p;
1245 }
1246
1247
znorder(UV a,UV n)1248 UV znorder(UV a, UV n) {
1249 UV fac[MPU_MAX_FACTORS+1];
1250 UV exp[MPU_MAX_FACTORS+1];
1251 int i, nfactors;
1252 UV k, phi;
1253
1254 if (n <= 1) return n; /* znorder(x,0) = 0, znorder(x,1) = 1 */
1255 if (a <= 1) return a; /* znorder(0,x) = 0, znorder(1,x) = 1 (x > 1) */
1256 if (gcd_ui(a,n) > 1) return 0;
1257
1258 /* Cohen 1.4.3 using Carmichael Lambda */
1259 phi = carmichael_lambda(n);
1260 nfactors = factor_exp(phi, fac, exp);
1261 k = phi;
1262 #if USE_MONTMATH
1263 if (n & 1) {
1264 const uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
1265 UV ma = mont_geta(a, n);
1266 for (i = 0; i < nfactors; i++) {
1267 UV b, a1, ek, pi = fac[i], ei = exp[i];
1268 b = ipow(pi,ei);
1269 k /= b;
1270 a1 = mont_powmod(ma, k, n);
1271 for (ek = 0; a1 != mont1 && ek++ <= ei; a1 = mont_powmod(a1, pi, n))
1272 k *= pi;
1273 if (ek > ei) return 0;
1274 }
1275 } else
1276 #endif
1277 for (i = 0; i < nfactors; i++) {
1278 UV b, a1, ek, pi = fac[i], ei = exp[i];
1279 b = ipow(pi,ei);
1280 k /= b;
1281 a1 = powmod(a, k, n);
1282 for (ek = 0; a1 != 1 && ek++ <= ei; a1 = powmod(a1, pi, n))
1283 k *= pi;
1284 if (ek > ei) return 0;
1285 }
1286 return k;
1287 }
1288
znprimroot(UV n)1289 UV znprimroot(UV n) {
1290 UV fac[MPU_MAX_FACTORS+1];
1291 UV phi_div_fac[MPU_MAX_FACTORS+1];
1292 UV a, phi, on, r;
1293 int i, nfactors;
1294
1295 if (n <= 4) return (n == 0) ? 0 : n-1;
1296 if (n % 4 == 0) return 0;
1297
1298 on = (n&1) ? n : (n>>1);
1299 a = powerof(on);
1300 r = rootof(on, a);
1301 if (!is_prob_prime(r)) return 0; /* c^a or 2c^a */
1302 phi = (r-1) * (on/r); /* p^a or 2p^a */
1303
1304 nfactors = factor_exp(phi, fac, 0);
1305 for (i = 0; i < nfactors; i++)
1306 phi_div_fac[i] = phi / fac[i];
1307
1308 #if USE_MONTMATH
1309 if (n & 1) {
1310 const uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
1311 for (a = 2; a < n; a++) {
1312 if (a == 4 || a == 8 || a == 9) continue; /* Skip some perfect powers */
1313 /* Skip values we know can't be right: (a|n) = 0 (or 1 for odd primes) */
1314 if (phi == n-1) { if (kronecker_uu(a, n) != -1) continue; }
1315 else { if (gcd_ui(a,n) != 1) continue; }
1316 r = mont_geta(a, n);
1317 for (i = 0; i < nfactors; i++)
1318 if (mont_powmod(r, phi_div_fac[i], n) == mont1)
1319 break;
1320 if (i == nfactors) return a;
1321 }
1322 } else
1323 #endif
1324 for (a = 2; a < n; a++) {
1325 if (a == 4 || a == 8 || a == 9) continue; /* Skip some perfect powers */
1326 /* Skip values we know can't be right: (a|n) = 0 (or 1 for odd primes) */
1327 if (phi == n-1) { if (kronecker_uu(a, n) != -1) continue; }
1328 else { if (gcd_ui(a,n) != 1) continue; }
1329 for (i = 0; i < nfactors; i++)
1330 if (powmod(a, phi_div_fac[i], n) == 1)
1331 break;
1332 if (i == nfactors) return a;
1333 }
1334 return 0;
1335 }
1336
is_primitive_root(UV a,UV n,int nprime)1337 int is_primitive_root(UV a, UV n, int nprime) {
1338 UV s, fac[MPU_MAX_FACTORS+1];
1339 int i, nfacs;
1340
1341 if (n <= 1) return n;
1342 if (a >= n) a %= n;
1343 if (n <= 4) return a == n-1;
1344 if (n % 4 == 0) return 0;
1345
1346 /* Very simple, but not fast:
1347 * s = nprime ? n-1 : totient(n);
1348 * return s == znorder(a, n);
1349 */
1350
1351 if (gcd_ui(a,n) != 1) return 0;
1352 if (nprime) {
1353 s = n-1;
1354 } else {
1355 UV on = (n&1) ? n : (n>>1);
1356 UV k = powerof(on);
1357 UV r = rootof(on, k);
1358 if (!is_prob_prime(r)) return 0; /* c^a or 2c^a */
1359 s = (r-1) * (on/r); /* p^a or 2p^a */
1360 }
1361 if (s == n-1 && kronecker_uu(a,n) != -1) return 0;
1362
1363 /* a^x can be a primitive root only if gcd(x,s) = 1 */
1364 i = is_power(a,0);
1365 if (i > 1 && gcd_ui(i, s) != 1) return 0;
1366
1367 #if USE_MONTMATH
1368 if (n & 1) {
1369 const uint64_t npi = mont_inverse(n), mont1 = mont_get1(n);
1370 a = mont_geta(a, n);
1371 /* Quick check for small factors before full factor */
1372 if ((s % 2) == 0 && mont_powmod(a, s/2, n) == mont1) return 0;
1373 if ((s % 3) == 0 && mont_powmod(a, s/3, n) == mont1) return 0;
1374 if ((s % 5) == 0 && mont_powmod(a, s/5, n) == mont1) return 0;
1375 nfacs = factor_exp(s, fac, 0);
1376 for (i = 0; i < nfacs; i++)
1377 if (fac[i] > 5 && mont_powmod(a, s/fac[i], n) == mont1)
1378 return 0;
1379 } else
1380 #endif
1381 {
1382 /* Quick check for small factors before full factor */
1383 if ((s % 2) == 0 && powmod(a, s/2, n) == 1) return 0;
1384 if ((s % 3) == 0 && powmod(a, s/3, n) == 1) return 0;
1385 if ((s % 5) == 0 && powmod(a, s/5, n) == 1) return 0;
1386 /* Complete factor and check each one not found above. */
1387 nfacs = factor_exp(s, fac, 0);
1388 for (i = 0; i < nfacs; i++)
1389 if (fac[i] > 5 && powmod(a, s/fac[i], n) == 1)
1390 return 0;
1391 }
1392 return 1;
1393 }
1394
gcdext(IV a,IV b,IV * u,IV * v,IV * cs,IV * ct)1395 IV gcdext(IV a, IV b, IV* u, IV* v, IV* cs, IV* ct) {
1396 IV s = 0; IV os = 1;
1397 IV t = 1; IV ot = 0;
1398 IV r = b; IV or = a;
1399 if (a == 0 && b == 0) { os = 0; t = 0; }
1400 while (r != 0) {
1401 IV quot = or / r;
1402 { IV tmp = r; r = or - quot * r; or = tmp; }
1403 { IV tmp = s; s = os - quot * s; os = tmp; }
1404 { IV tmp = t; t = ot - quot * t; ot = tmp; }
1405 }
1406 if (or < 0) /* correct sign */
1407 { or = -or; os = -os; ot = -ot; }
1408 if (u != 0) *u = os;
1409 if (v != 0) *v = ot;
1410 if (cs != 0) *cs = s;
1411 if (ct != 0) *ct = t;
1412 return or;
1413 }
1414
1415 /* Calculate 1/a mod n. */
modinverse(UV a,UV n)1416 UV modinverse(UV a, UV n) {
1417 IV t = 0; UV nt = 1;
1418 UV r = n; UV nr = a;
1419 while (nr != 0) {
1420 UV quot = r / nr;
1421 { UV tmp = nt; nt = t - quot*nt; t = tmp; }
1422 { UV tmp = nr; nr = r - quot*nr; r = tmp; }
1423 }
1424 if (r > 1) return 0; /* No inverse */
1425 if (t < 0) t += n;
1426 return t;
1427 }
1428
divmod(UV a,UV b,UV n)1429 UV divmod(UV a, UV b, UV n) { /* a / b mod n */
1430 UV binv = modinverse(b, n);
1431 if (binv == 0) return 0;
1432 return mulmod(a, binv, n);
1433 }
1434
_powfactor(UV p,UV d,UV m)1435 static UV _powfactor(UV p, UV d, UV m) {
1436 UV e = 0;
1437 do { d /= p; e += d; } while (d > 0);
1438 return powmod(p, e, m);
1439 }
1440
factorialmod(UV n,UV m)1441 UV factorialmod(UV n, UV m) { /* n! mod m */
1442 UV i, d = n, res = 1;
1443
1444 if (n >= m || m == 1) return 0;
1445
1446 if (n <= 10) { /* Keep things simple for small n */
1447 for (i = 2; i <= n && res != 0; i++)
1448 res = (res * i) % m;
1449 return res;
1450 }
1451
1452 if (n > m/2 && is_prime(m)) /* Check if we can go backwards */
1453 d = m-n-1;
1454 if (d < 2)
1455 return (d == 0) ? m-1 : 1; /* Wilson's Theorem: n = m-1 and n = m-2 */
1456
1457 if (d == n && d > 5000000) { /* Check for composite m that leads to 0 */
1458 UV fac[MPU_MAX_FACTORS+1], exp[MPU_MAX_FACTORS+1];
1459 int j, k, nfacs = factor_exp(m, fac, exp);
1460 for (j = 0; j < nfacs; j++) {
1461 UV t = fac[j];
1462 for (k = 1; (UV)k < exp[j]; k++)
1463 t *= fac[j];
1464 if (n >= t) return 0;
1465 }
1466 }
1467
1468 #if USE_MONTMATH
1469 if (m & 1 && d < 40000) {
1470 const uint64_t npi = mont_inverse(m), mont1 = mont_get1(m);
1471 uint64_t monti = mont1;
1472 res = mont1;
1473 for (i = 2; i <= d && res != 0; i++) {
1474 monti = addmod(monti,mont1,m);
1475 res = mont_mulmod(res,monti,m);
1476 }
1477 res = mont_recover(res, m);
1478 } else
1479 #endif
1480 if (d < 10000) {
1481 for (i = 2; i <= d && res != 0; i++)
1482 res = mulmod(res,i,m);
1483 } else {
1484 #if 0 /* Monolithic prime walk */
1485 START_DO_FOR_EACH_PRIME(2, d) {
1486 UV k = (p > (d>>1)) ? p : _powfactor(p, d, m);
1487 res = mulmod(res, k, m);
1488 if (res == 0) break;
1489 } END_DO_FOR_EACH_PRIME;
1490 #else /* Segmented prime walk */
1491 unsigned char* segment;
1492 UV seg_base, seg_low, seg_high;
1493 void* ctx = start_segment_primes(7, d, &segment);
1494 for (i = 1; i <= 3; i++) /* Handle 2,3,5 assume d>10*/
1495 res = mulmod(res, _powfactor(2*i - (i>1), d, m), m);
1496 while (res != 0 && next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
1497 START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high )
1498 UV k = (p > (d>>1)) ? p : _powfactor(p, d, m);
1499 res = mulmod(res, k, m);
1500 if (res == 0) break;
1501 END_DO_FOR_EACH_SIEVE_PRIME
1502 }
1503 end_segment_primes(ctx);
1504 #endif
1505 }
1506
1507 if (d != n && res != 0) { /* Handle backwards case */
1508 if (!(d&1)) res = submod(m,res,m);
1509 res = modinverse(res,m);
1510 }
1511
1512 return res;
1513 }
1514
verify_sqrtmod(UV s,UV * rs,UV a,UV p)1515 static int verify_sqrtmod(UV s, UV *rs, UV a, UV p) {
1516 if (p-s < s) s = p-s;
1517 if (mulmod(s, s, p) != a) return 0;
1518 *rs = s;
1519 return 1;
1520 }
1521 #if !USE_MONTMATH
_sqrtmod_prime(UV a,UV p)1522 UV _sqrtmod_prime(UV a, UV p) {
1523 if ((p % 4) == 3) {
1524 return powmod(a, (p+1)>>2, p);
1525 }
1526 if ((p % 8) == 5) { /* Atkin's algorithm. Faster than Legendre. */
1527 UV a2, alpha, beta, b;
1528 a2 = addmod(a,a,p);
1529 alpha = powmod(a2,(p-5)>>3,p);
1530 beta = mulmod(a2,sqrmod(alpha,p),p);
1531 b = mulmod(alpha, mulmod(a, (beta ? beta-1 : p-1), p), p);
1532 return b;
1533 }
1534 if ((p % 16) == 9) { /* Müller's algorithm extending Atkin */
1535 UV a2, alpha, beta, b, d = 1;
1536 a2 = addmod(a,a,p);
1537 alpha = powmod(a2, (p-9)>>4, p);
1538 beta = mulmod(a2, sqrmod(alpha,p), p);
1539 if (sqrmod(beta,p) != p-1) {
1540 do { d += 2; } while (kronecker_uu(d,p) != -1 && d < p);
1541 alpha = mulmod(alpha, powmod(d,(p-9)>>3,p), p);
1542 beta = mulmod(a2, mulmod(sqrmod(d,p),sqrmod(alpha,p),p), p);
1543 }
1544 b = mulmod(alpha, mulmod(a, mulmod(d,(beta ? beta-1 : p-1),p),p),p);
1545 return b;
1546 }
1547
1548 /* Verify Euler condition for odd p */
1549 if ((p & 1) && powmod(a,(p-1)>>1,p) != 1) return 0;
1550
1551 {
1552 UV x, q, e, t, z, r, m, b;
1553 q = p-1;
1554 e = valuation(q, 2);
1555 q >>= e;
1556 t = 3;
1557 while (kronecker_uu(t, p) != -1) {
1558 t += 2;
1559 if (t == 201) { /* exit if p looks like a composite */
1560 if ((p % 2) == 0 || powmod(2, p-1, p) != 1 || powmod(3, p-1, p) != 1)
1561 return 0;
1562 } else if (t >= 20000) { /* should never happen */
1563 return 0;
1564 }
1565 }
1566 z = powmod(t, q, p);
1567 b = powmod(a, q, p);
1568 r = e;
1569 q = (q+1) >> 1;
1570 x = powmod(a, q, p);
1571 while (b != 1) {
1572 t = b;
1573 for (m = 0; m < r && t != 1; m++)
1574 t = sqrmod(t, p);
1575 if (m >= r) break;
1576 t = powmod(z, UVCONST(1) << (r-m-1), p);
1577 x = mulmod(x, t, p);
1578 z = mulmod(t, t, p);
1579 b = mulmod(b, z, p);
1580 r = m;
1581 }
1582 return x;
1583 }
1584 return 0;
1585 }
1586 #else
_sqrtmod_prime(UV a,UV p)1587 UV _sqrtmod_prime(UV a, UV p) {
1588 const uint64_t npi = mont_inverse(p), mont1 = mont_get1(p);
1589 a = mont_geta(a,p);
1590
1591 if ((p % 4) == 3) {
1592 UV b = mont_powmod(a, (p+1)>>2, p);
1593 return mont_recover(b, p);
1594 }
1595
1596 if ((p % 8) == 5) { /* Atkin's algorithm. Faster than Legendre. */
1597 UV a2, alpha, beta, b;
1598 a2 = addmod(a,a,p);
1599 alpha = mont_powmod(a2,(p-5)>>3,p);
1600 beta = mont_mulmod(a2,mont_sqrmod(alpha,p),p);
1601 beta = submod(beta, mont1, p);
1602 b = mont_mulmod(alpha, mont_mulmod(a, beta, p), p);
1603 return mont_recover(b, p);
1604 }
1605 if ((p % 16) == 9) { /* Müller's algorithm extending Atkin */
1606 UV a2, alpha, beta, b, d = 1;
1607 a2 = addmod(a,a,p);
1608 alpha = mont_powmod(a2, (p-9)>>4, p);
1609 beta = mont_mulmod(a2, mont_sqrmod(alpha,p), p);
1610 if (mont_sqrmod(beta,p) != submod(0,mont1,p)) {
1611 do { d += 2; } while (kronecker_uu(d,p) != -1 && d < p);
1612 d = mont_geta(d,p);
1613 alpha = mont_mulmod(alpha, mont_powmod(d,(p-9)>>3,p), p);
1614 beta = mont_mulmod(a2, mont_mulmod(mont_sqrmod(d,p),mont_sqrmod(alpha,p),p), p);
1615 beta = mont_mulmod(submod(beta,mont1,p), d, p);
1616 } else {
1617 beta = submod(beta, mont1, p);
1618 }
1619 b = mont_mulmod(alpha, mont_mulmod(a, beta, p), p);
1620 return mont_recover(b, p);
1621 }
1622
1623 /* Verify Euler condition for odd p */
1624 if ((p & 1) && mont_powmod(a,(p-1)>>1,p) != mont1) return 0;
1625
1626 {
1627 UV x, q, e, t, z, r, m, b;
1628 q = p-1;
1629 e = valuation(q, 2);
1630 q >>= e;
1631 t = 3;
1632 while (kronecker_uu(t, p) != -1) {
1633 t += 2;
1634 if (t == 201) { /* exit if p looks like a composite */
1635 if ((p % 2) == 0 || powmod(2, p-1, p) != 1 || powmod(3, p-1, p) != 1)
1636 return 0;
1637 } else if (t >= 20000) { /* should never happen */
1638 return 0;
1639 }
1640 }
1641 t = mont_geta(t, p);
1642 z = mont_powmod(t, q, p);
1643 b = mont_powmod(a, q, p);
1644 r = e;
1645 q = (q+1) >> 1;
1646 x = mont_powmod(a, q, p);
1647 while (b != mont1) {
1648 t = b;
1649 for (m = 0; m < r && t != mont1; m++)
1650 t = mont_sqrmod(t, p);
1651 if (m >= r) break;
1652 t = mont_powmod(z, UVCONST(1) << (r-m-1), p);
1653 x = mont_mulmod(x, t, p);
1654 z = mont_mulmod(t, t, p);
1655 b = mont_mulmod(b, z, p);
1656 r = m;
1657 }
1658 return mont_recover(x, p);
1659 }
1660 return 0;
1661 }
1662 #endif
1663
sqrtmod(UV * s,UV a,UV p)1664 int sqrtmod(UV *s, UV a, UV p) {
1665 if (p == 0) return 0;
1666 if (a >= p) a %= p;
1667 if (p <= 2 || a <= 1) return verify_sqrtmod(a, s,a,p);
1668
1669 return verify_sqrtmod(_sqrtmod_prime(a,p), s,a,p);
1670 }
1671
sqrtmod_composite(UV * s,UV a,UV n)1672 int sqrtmod_composite(UV *s, UV a, UV n) {
1673 UV fac[MPU_MAX_FACTORS+1];
1674 UV exp[MPU_MAX_FACTORS+1];
1675 UV sqr[MPU_MAX_FACTORS+1];
1676 UV p, j, k, gcdan;
1677 int i, nfactors;
1678
1679 if (n == 0) return 0;
1680 if (a >= n) a %= n;
1681 if (n <= 2 || a <= 1) return verify_sqrtmod(a, s,a,n);
1682
1683 /* Simple existence check. It's still possible no solution exists.*/
1684 if (kronecker_uu(a, ((n%4) == 2) ? n/2 : n) == -1) return 0;
1685
1686 /* if 8|n 'a' must = 1 mod 8, else if 4|n 'a' must = 1 mod 4 */
1687 if ((n % 4) == 0) {
1688 if ((n % 8) == 0) {
1689 if ((a % 8) != 1) return 0;
1690 } else {
1691 if ((a % 4) != 1) return 0;
1692 }
1693 }
1694
1695 /* More detailed existence check before factoring. Still possible. */
1696 gcdan = gcd_ui(a, n);
1697 if (gcdan == 1) {
1698 if ((n % 3) == 0 && kronecker_uu(a, 3) != 1) return 0;
1699 if ((n % 5) == 0 && kronecker_uu(a, 5) != 1) return 0;
1700 if ((n % 7) == 0 && kronecker_uu(a, 7) != 1) return 0;
1701 }
1702
1703 /* Factor n */
1704 nfactors = factor_exp(n, fac, exp);
1705
1706 /* If gcd(a,n)==1, this answers comclusively if a solution exists. */
1707 if (gcdan == 1) {
1708 for (i = 0; i < nfactors; i++)
1709 if (fac[i] > 7 && kronecker_uu(a, fac[i]) != 1) return 0;
1710 }
1711
1712 for (i = 0; i < nfactors; i++) {
1713
1714 /* Powers of 2 */
1715 if (fac[i] == 2) {
1716 if (exp[i] == 1) {
1717 sqr[i] = a & 1;
1718 } else if (exp[i] == 2) {
1719 sqr[i] = 1; /* and 3 */
1720 } else {
1721 UV this_roots[256], next_roots[256];
1722 UV nthis = 0, nnext = 0;
1723 this_roots[nthis++] = 1;
1724 this_roots[nthis++] = 3;
1725 for (j = 2; j < exp[i]; j++) {
1726 p = UVCONST(1) << (j+1);
1727 nnext = 0;
1728 for (k = 0; k < nthis && nnext < 254; k++) {
1729 UV r = this_roots[k];
1730 if (sqrmod(r,p) == (a % p))
1731 next_roots[nnext++] = r;
1732 if (sqrmod(p-r,p) == (a % p))
1733 next_roots[nnext++] = p-r;
1734 }
1735 if (nnext == 0) return 0;
1736 /* copy next exponent's found roots to this one */
1737 nthis = nnext;
1738 for (k = 0; k < nnext; k++)
1739 this_roots[k] = next_roots[k];
1740 }
1741 sqr[i] = this_roots[0];
1742 }
1743 continue;
1744 }
1745
1746 /* p is an odd prime */
1747 p = fac[i];
1748 if (!sqrtmod(&(sqr[i]), a, p))
1749 return 0;
1750
1751 /* Lift solution of x^2 = a mod p to x^2 = a mod p^e */
1752 for (j = 1; j < exp[i]; j++) {
1753 UV xk2, yk, expect, sol;
1754 xk2 = addmod(sqr[i],sqr[i],p);
1755 yk = modinverse(xk2, p);
1756 expect = mulmod(xk2,yk,p);
1757 p *= fac[i];
1758 sol = submod(sqr[i], mulmod(submod(sqrmod(sqr[i],p), a % p, p), yk, p), p);
1759 if (expect != 1 || sqrmod(sol,p) != (a % p)) {
1760 /* printf("a %lu failure to lift to %lu^%d\n", a, fac[i], j+1); */
1761 return 0;
1762 }
1763 sqr[i] = sol;
1764 }
1765 }
1766
1767 /* raise fac[i] */
1768 for (i = 0; i < nfactors; i++)
1769 fac[i] = ipow(fac[i], exp[i]);
1770
1771 p = chinese(sqr, fac, nfactors, &i);
1772 return (i == 1) ? verify_sqrtmod(p, s, a, n) : 0;
1773 }
1774
1775 /* works only for co-prime inputs and also slower than the algorithm below,
1776 * but handles the case where IV_MAX < lcm <= UV_MAX.
1777 */
_simple_chinese(UV * a,UV * n,UV num,int * status)1778 static UV _simple_chinese(UV* a, UV* n, UV num, int* status) {
1779 UV i, lcm = 1, res = 0;
1780 *status = 0;
1781 if (num == 0) return 0;
1782
1783 for (i = 0; i < num; i++) {
1784 UV ni = n[i];
1785 UV gcd = gcd_ui(lcm, ni);
1786 if (gcd != 1) return 0; /* not coprime */
1787 ni /= gcd;
1788 if (ni > (UV_MAX/lcm)) return 0; /* lcm overflow */
1789 lcm *= ni;
1790 }
1791 for (i = 0; i < num; i++) {
1792 UV p, inverse, term;
1793 p = lcm / n[i];
1794 inverse = modinverse(p, n[i]);
1795 if (inverse == 0) return 0; /* n's coprime so should never happen */
1796 term = mulmod(p, mulmod(a[i], inverse, lcm), lcm);
1797 res = addmod(res, term, lcm);
1798 }
1799 *status = 1;
1800 return res;
1801 }
1802
1803 /* status: 1 ok, -1 no inverse, 0 overflow */
chinese(UV * a,UV * n,UV num,int * status)1804 UV chinese(UV* a, UV* n, UV num, int* status) {
1805 static unsigned short sgaps[] = {7983,3548,1577,701,301,132,57,23,10,4,1,0};
1806 UV gcd, i, j, lcm, sum, gi, gap;
1807 *status = 1;
1808 if (num == 0) return 0;
1809
1810 /* Sort modulii, largest first */
1811 for (gi = 0, gap = sgaps[gi]; gap >= 1; gap = sgaps[++gi]) {
1812 for (i = gap; i < num; i++) {
1813 UV tn = n[i], ta = a[i];
1814 for (j = i; j >= gap && n[j-gap] < tn; j -= gap)
1815 { n[j] = n[j-gap]; a[j] = a[j-gap]; }
1816 n[j] = tn; a[j] = ta;
1817 }
1818 }
1819
1820 if (n[0] > IV_MAX) return _simple_chinese(a,n,num,status);
1821 lcm = n[0]; sum = a[0] % n[0];
1822 for (i = 1; i < num; i++) {
1823 IV u, v, t, s;
1824 UV vs, ut;
1825 gcd = gcdext(lcm, n[i], &u, &v, &s, &t);
1826 if (gcd != 1 && ((sum % gcd) != (a[i] % gcd))) { *status = -1; return 0; }
1827 if (s < 0) s = -s;
1828 if (t < 0) t = -t;
1829 if (s > (IV)(IV_MAX/lcm)) return _simple_chinese(a,n,num,status);
1830 lcm *= s;
1831 if (u < 0) u += lcm;
1832 if (v < 0) v += lcm;
1833 vs = mulmod((UV)v, (UV)s, lcm);
1834 ut = mulmod((UV)u, (UV)t, lcm);
1835 sum = addmod( mulmod(vs, sum, lcm), mulmod(ut, a[i], lcm), lcm );
1836 }
1837 return sum;
1838 }
1839
chebyshev_psi(UV n)1840 NV chebyshev_psi(UV n)
1841 {
1842 UV k;
1843 KAHAN_INIT(sum);
1844
1845 for (k = log2floor(n); k > 0; k--) {
1846 KAHAN_SUM(sum, chebyshev_theta(rootof(n,k)));
1847 }
1848 return sum;
1849 }
1850
1851 #if BITS_PER_WORD == 64
1852 typedef struct {
1853 UV n;
1854 LNV theta;
1855 } cheby_theta_t;
1856 static const cheby_theta_t _cheby_theta[] = { /* >= quad math precision */
1857 { UVCONST( 67108864),LNVCONST( 67100507.6357700963903836828562472350035880) },
1858 { UVCONST( 100000000),LNVCONST( 99987730.0180220043832124342600487053812729) },
1859 { UVCONST( 134217728),LNVCONST( 134204014.5735572091791081610859055728165544) },
1860 { UVCONST( 268435456),LNVCONST( 268419741.6134308193112682817754501071404173) },
1861 { UVCONST( 536870912),LNVCONST( 536842885.8045763840625719515011160692495056) },
1862 { UVCONST( 1000000000),LNVCONST( 999968978.5775661447991262386023331863364793) },
1863 { UVCONST( 1073741824),LNVCONST( 1073716064.8860663337617909073555831842945484) },
1864 { UVCONST( 2147483648),LNVCONST( 2147432200.2475857676814950053003448716360822) },
1865 { UVCONST( 4294967296),LNVCONST( 4294889489.1735446386752045191908417183337361) },
1866 { UVCONST( 8589934592),LNVCONST( 8589863179.5654263491545135406516173629373070) },
1867 { UVCONST( 10000000000),LNVCONST( 9999939830.6577573841592219954033850595228736) },
1868 { UVCONST( 12884901888),LNVCONST( 12884796620.4324254952601520445848183460347362) },
1869 { UVCONST( 17179869184),LNVCONST( 17179757715.9924077567777285147574707468995695) },
1870 { UVCONST( 21474836480),LNVCONST( 21474693322.0998273969188369449626287713082943) },
1871 { UVCONST( 25769803776),LNVCONST( 25769579799.3751535467593954636665656772211515) },
1872 { UVCONST( 30064771072),LNVCONST( 30064545001.2305211029215168703433831598544454) },
1873 { UVCONST( 34359738368),LNVCONST( 34359499180.0126643918259085362039638823175054) },
1874 { UVCONST( 51539607552),LNVCONST( 51539356394.9531019037592855639826469993402730) },
1875 { UVCONST( 68719476736),LNVCONST( 68719165213.6369838785284711480925219076501720) },
1876 { UVCONST( 85899345920),LNVCONST( 85899083852.3471545629838432726841470626910905) },
1877 { UVCONST( 100000000000),LNVCONST( 99999737653.1074446948519125729820679772770146) },
1878 { UVCONST( 103079215104),LNVCONST(103079022007.113299711630969211422868856259124) },
1879 { UVCONST( 120259084288),LNVCONST(120258614516.787336970535750737470005730125261) },
1880 { UVCONST( 137438953472),LNVCONST(137438579206.444595884982301543904849253294539) },
1881 { UVCONST( 171798691840),LNVCONST(171798276885.585945657918751085729734540334501) },
1882 { UVCONST( 206158430208),LNVCONST(206158003808.160276853604927822609009916573462) },
1883 { UVCONST( 240518168576),LNVCONST(240517893445.995868018331936763125264759516048) },
1884 { UVCONST( 274877906944),LNVCONST(274877354651.045354829956619821889825596300686) },
1885 { UVCONST( 309237645312),LNVCONST(309237050379.850690561796126460858271984023198) },
1886 { UVCONST( 343597383680),LNVCONST(343596855806.595496630500062749631211394707114) },
1887 { UVCONST( 377957122048),LNVCONST(377956498560.227794386327526022452943941537993) },
1888 { UVCONST( 412316860416),LNVCONST(412316008796.349553568121442261222464590518293) },
1889 { UVCONST( 446676598784),LNVCONST(446675972485.936512329625489223180824947531484) },
1890 { UVCONST( 481036337152),LNVCONST(481035608287.572961376833237046440177624505864) },
1891 { UVCONST( 515396075520),LNVCONST(515395302740.633513931333424447688399032397200) },
1892 { UVCONST( 549755813888),LNVCONST(549755185085.539613556787409928561107952681488) },
1893 { UVCONST( 584115552256),LNVCONST(584115015741.698143680148976236958207248900725) },
1894 { UVCONST( 618475290624),LNVCONST(618474400071.621528348965919774195984612254220) },
1895 { UVCONST( 652835028992),LNVCONST(652834230470.583317059774197550110194348469358) },
1896 { UVCONST( 687194767360),LNVCONST(687193697328.927006867624832386534836384752774) },
1897 { UVCONST( 721554505728),LNVCONST(721553211683.605313067593521060195071837766347) },
1898 { UVCONST( 755914244096),LNVCONST(755913502349.878525212441903698096011352015192) },
1899 { UVCONST( 790273982464),LNVCONST(790273042590.053075430445971969285969445183076) },
1900 { UVCONST( 824633720832),LNVCONST(824633080997.428352876758261549475609957696369) },
1901 { UVCONST( 858993459200),LNVCONST(858992716288.318498931165663742671579465316192) },
1902 { UVCONST( 893353197568),LNVCONST(893352235882.851072417721659027263613727927680) },
1903 { UVCONST( 927712935936),LNVCONST(927711881043.628817668337317445143018372892386) },
1904 { UVCONST( 962072674304),LNVCONST(962071726126.508938539006575212272731584070786) },
1905 { UVCONST( 996432412672),LNVCONST(996431411588.361462717402562171913706963939018) },
1906 { UVCONST( 1099511627776),LNVCONST(1099510565082.05800550569923209414874779035972) },
1907 { UVCONST( 1168231104512),LNVCONST(1168230478726.83399452743801182220790107593115) },
1908 { UVCONST( 1236950581248),LNVCONST(1236949680081.02610603189530371762093291521116) },
1909 { UVCONST( 1305670057984),LNVCONST(1305668780900.04255251887970870257110498423202) },
1910 { UVCONST( 1374389534720),LNVCONST(1374388383792.63751003694755359184583212193880) },
1911 { UVCONST( 1443109011456),LNVCONST(1443107961091.80955496949174183091839841371227) },
1912 { UVCONST( 1511828488192),LNVCONST(1511827317611.91227277802426032456922797572429) },
1913 { UVCONST( 1580547964928),LNVCONST(1580546753969.30607547506449941085747942395437) },
1914 { UVCONST( 1649267441664),LNVCONST(1649265973878.75361554498682516738256005501353) },
1915 { UVCONST( 1717986918400),LNVCONST(1717985403764.24562741452793071287954107946922) },
1916 { UVCONST( 1786706395136),LNVCONST(1786704769212.04241689416220650800274263053933) },
1917 { UVCONST( 1855425871872),LNVCONST(1855425013030.54920163513184322741954734357404) },
1918 { UVCONST( 1924145348608),LNVCONST(1924143701943.02957992419280264060220278182021) },
1919 { UVCONST( 1992864825344),LNVCONST(1992863373568.84039296068619447120308124302086) },
1920 { UVCONST( 2061584302080),LNVCONST(2061583632335.91985095534685076604018573279204) },
1921 { UVCONST( 2130303778816),LNVCONST(2113122935598.01727180199783433992649406589029) },
1922 { UVCONST( 2199023255552),LNVCONST(2199021399611.18488312543276191461914978761981) },
1923 { UVCONST( 2267742732288),LNVCONST(2267740947106.05038218811506263712808318234921) },
1924 { UVCONST( 2336462209024),LNVCONST(2336460081480.34962633829077377680844065198307) },
1925 { UVCONST( 2405181685760),LNVCONST(2405179969505.38642629423585641169740223940265) },
1926 { UVCONST( 2473901162496),LNVCONST(2473899311193.37872375168104562948639924654178) },
1927 { UVCONST( 2542620639232),LNVCONST(2542619362554.88893589220737167756411653816418) },
1928 { UVCONST( 2611340115968),LNVCONST(2611338370515.94936514022501267847930999670553) },
1929 { UVCONST( 2680059592704),LNVCONST(2680057722824.52981820001574883706268873541107) },
1930 { UVCONST( 2748779069440),LNVCONST(2748777610452.18903407570165081726781627254885) },
1931 { UVCONST( 2817498546176),LNVCONST(2817497017165.31924616507392971415494161401775) },
1932 { UVCONST( 2886218022912),LNVCONST(2886216579432.32232322707222172612181994322081) },
1933 { UVCONST( 2954937499648),LNVCONST(2954936100812.97301730406598982753121204977388) },
1934 { UVCONST( 3023656976384),LNVCONST(3023654789503.82041452274471455184651411931920) },
1935 { UVCONST( 3298534883328),LNVCONST(3298533215621.76606493931157388037915263658637) },
1936 { UVCONST( 3573412790272),LNVCONST(3573411344351.74163523704886736624674718378131) },
1937 { UVCONST( 3848290697216),LNVCONST(3848288415701.82534219216958446478503907262807) },
1938 { UVCONST( 4123168604160),LNVCONST(4123166102085.86116301709394219323327831487542) },
1939 { UVCONST( 4398046511104),LNVCONST(4398044965678.05143041707871320554940671182665) },
1940 { UVCONST( 4672924418048),LNVCONST(4672922414672.04998927945349278916525727295687) },
1941 { UVCONST( 4947802324992),LNVCONST(4947800056419.04384937181159608905993450182729) },
1942 { UVCONST( 5222680231936),LNVCONST(5222678728087.69487334278665824384732845008859) },
1943 { UVCONST( 5497558138880),LNVCONST(5497555766573.55159115560501595606332808978878) },
1944 { UVCONST( 5772436045824),LNVCONST(5772433560746.27053256770924553245647027548204) },
1945 { UVCONST( 6047313952768),LNVCONST(6047310750621.24497633828761530843255989494448) },
1946 { UVCONST( 6322191859712),LNVCONST(6322189275338.39747421237532473168802646234745) },
1947 { UVCONST( 6597069766656),LNVCONST(6579887620000.56226807898107616294821989189226) },
1948 { UVCONST( 6871947673600),LNVCONST(6871945430474.61791600096091374271286154432006) },
1949 { UVCONST( 7146825580544),LNVCONST(7146823258390.34361980709600216319269118247416) },
1950 { UVCONST( 7421703487488),LNVCONST(7421700443390.35536080251964387835425662360121) },
1951 { UVCONST( 7696581394432),LNVCONST(7696578975137.73249441643024336954233783264803) },
1952 { UVCONST( 7971459301376),LNVCONST(7971457197928.90863708984184849978605273042512) },
1953 { UVCONST( 8246337208320),LNVCONST(8246333982863.77146812177727648999195989358960) },
1954 { UVCONST( 8521215115264),LNVCONST(8529802085075.55635100929751669785228592926043) },
1955 { UVCONST( 8796093022208),LNVCONST(8796089836425.34909684634625258535266362465034) },
1956 { UVCONST( 9345848836096),LNVCONST(9345845828116.77456046925508587313) },
1957 { UVCONST( 9895604649984),LNVCONST(9895601077915.26821447819584407150) },
1958 { UVCONST(10000000000000),LNVCONST(9999996988293.03419965318214160284) },
1959 { UVCONST(15000000000000),LNVCONST(14999996482301.7098815115045166858) },
1960 { UVCONST(20000000000000),LNVCONST(19999995126082.2286880312461318496) },
1961 { UVCONST(25000000000000),LNVCONST(24999994219058.4086216020475916538) },
1962 { UVCONST(30000000000000),LNVCONST(29999995531389.8454274046657200568) },
1963 { UVCONST(35000000000000),LNVCONST(34999992921190.8049427456456479005) },
1964 { UVCONST(40000000000000),LNVCONST(39999993533724.3168289589273168844) },
1965 { UVCONST(45000000000000),LNVCONST(44999993567606.9795798378256194424) },
1966 { UVCONST(50000000000000),LNVCONST(49999992543194.2636545758235373677) },
1967 { UVCONST(55000000000000),LNVCONST(54999990847877.2435105757522625171) },
1968 { UVCONST(60000000000000),LNVCONST(59999990297033.6261976055811111726) },
1969 { UVCONST(65000000000000),LNVCONST(64999990861395.5522142429859245014) },
1970 { UVCONST(70000000000000),LNVCONST(69999994316409.8717306521862685981) },
1971 { UVCONST(75000000000000),LNVCONST(74999990126219.8344899338374090165) },
1972 { UVCONST(80000000000000),LNVCONST(79999990160858.3042387288372250950) },
1973 { UVCONST(85000000000000),LNVCONST(84999987096970.5915212896832780715) },
1974 { UVCONST(90000000000000),LNVCONST(89999989501395.0738966599857919767) },
1975 { UVCONST(95000000000000),LNVCONST(94999990785908.6672552042792168144) },
1976 { UVCONST(100000000000000),LNVCONST(99999990573246.9785384070303475639) },
1977 };
1978 #define NCHEBY_VALS (sizeof(_cheby_theta)/sizeof(_cheby_theta[0]))
1979 #endif
1980
chebyshev_theta(UV n)1981 NV chebyshev_theta(UV n)
1982 {
1983 uint16_t i = 0;
1984 UV tp, startn, seg_base, seg_low, seg_high;
1985 unsigned char* segment;
1986 void* ctx;
1987 LNV initial_sum, prod = LNV_ONE;
1988 KAHAN_INIT(sum);
1989
1990 if (n < 500) {
1991 for (i = 1; (tp = primes_tiny[i]) <= n; i++) {
1992 KAHAN_SUM(sum, loglnv(tp));
1993 }
1994 return sum;
1995 }
1996
1997 #if defined NCHEBY_VALS
1998 if (n >= _cheby_theta[0].n) {
1999 for (i = 1; i < NCHEBY_VALS; i++)
2000 if (n < _cheby_theta[i].n)
2001 break;
2002 startn = _cheby_theta[i-1].n;
2003 initial_sum = _cheby_theta[i-1].theta;
2004 } else
2005 #endif
2006 {
2007 KAHAN_SUM(sum, loglnv(2*3*5*7*11*13));
2008 startn = 17;
2009 initial_sum = 0;
2010 }
2011
2012 ctx = start_segment_primes(startn, n, &segment);
2013 #if 0
2014 while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
2015 START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high ) {
2016 KAHAN_SUM(sum, loglnv(p));
2017 } END_DO_FOR_EACH_SIEVE_PRIME
2018 }
2019 #else
2020 while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
2021 START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high ) {
2022 prod *= (LNV) p;
2023 if (++i >= (LNV_IS_QUAD ? 64 : 8)) {
2024 KAHAN_SUM(sum, loglnv(prod));
2025 prod = LNV_ONE;
2026 i = 0;
2027 }
2028 } END_DO_FOR_EACH_SIEVE_PRIME
2029 }
2030 if (prod > 1.0) { KAHAN_SUM(sum, loglnv(prod)); prod = LNV_ONE; }
2031 #endif
2032 end_segment_primes(ctx);
2033
2034 if (initial_sum > 0) KAHAN_SUM(sum, initial_sum);
2035 return sum;
2036 }
2037
2038
2039
2040 /*
2041 * See:
2042 * "Multiple-Precision Exponential Integral and Related Functions"
2043 * by David M. Smith
2044 * "On the Evaluation of the Complex-Valued Exponential Integral"
2045 * by Vincent Pegoraro and Philipp Slusallek
2046 * "Numerical Recipes" 3rd edition
2047 * by William H. Press et al.
2048 * "Rational Chevyshev Approximations for the Exponential Integral E_1(x)"
2049 * by W. J. Cody and Henry C. Thacher, Jr.
2050 *
2051 * Any mistakes here are completely my fault. This code has not been
2052 * verified for anything serious. For better results, see:
2053 * http://www.trnicely.net/pi/pix_0000.htm
2054 * which although the author claims are demonstration programs, will
2055 * undoubtedly produce more reliable results than this code does (I don't
2056 * know of any obvious issues with this code, but it just hasn't been used
2057 * by many people).
2058 */
2059
2060 static LNV const euler_mascheroni = LNVCONST(0.57721566490153286060651209008240243104215933593992);
2061 static LNV const li2 = LNVCONST(1.045163780117492784844588889194613136522615578151);
2062
Ei(NV x)2063 NV Ei(NV x) {
2064 LNV val, term;
2065 unsigned int n;
2066 KAHAN_INIT(sum);
2067
2068 if (x == 0) croak("Invalid input to ExponentialIntegral: x must be != 0");
2069 /* Protect against messed up rounding modes */
2070 if (x >= 12000) return INFINITY;
2071 if (x <= -12000) return 0;
2072
2073 if (x < -1) {
2074 /* Continued fraction, good for x < -1 */
2075 LNV lc = 0;
2076 LNV ld = LNV_ONE / (LNV_ONE - (LNV)x);
2077 val = ld * (-explnv(x));
2078 for (n = 1; n <= 100000; n++) {
2079 LNV old, t, n2;
2080 t = (LNV)(2*n + 1) - (LNV) x;
2081 n2 = n * n;
2082 lc = LNV_ONE / (t - n2 * lc);
2083 ld = LNV_ONE / (t - n2 * ld);
2084 old = val;
2085 val *= ld/lc;
2086 if ( fabslnv(val-old) <= LNV_EPSILON*fabslnv(val) )
2087 break;
2088 }
2089 } else if (x < 0) {
2090 /* Rational Chebyshev approximation (Cody, Thacher), good for -1 < x < 0 */
2091 static const LNV C6p[7] = { LNVCONST(-148151.02102575750838086),
2092 LNVCONST( 150260.59476436982420737),
2093 LNVCONST( 89904.972007457256553251),
2094 LNVCONST( 15924.175980637303639884),
2095 LNVCONST( 2150.0672908092918123209),
2096 LNVCONST( 116.69552669734461083368),
2097 LNVCONST( 5.0196785185439843791020) };
2098 static const LNV C6q[7] = { LNVCONST( 256664.93484897117319268),
2099 LNVCONST( 184340.70063353677359298),
2100 LNVCONST( 52440.529172056355429883),
2101 LNVCONST( 8125.8035174768735759866),
2102 LNVCONST( 750.43163907103936624165),
2103 LNVCONST( 40.205465640027706061433),
2104 LNVCONST( 1.0000000000000000000000) };
2105 LNV sumn = C6p[0]-x*(C6p[1]-x*(C6p[2]-x*(C6p[3]-x*(C6p[4]-x*(C6p[5]-x*C6p[6])))));
2106 LNV sumd = C6q[0]-x*(C6q[1]-x*(C6q[2]-x*(C6q[3]-x*(C6q[4]-x*(C6q[5]-x*C6q[6])))));
2107 val = loglnv(-x) - sumn/sumd;
2108 } else if (x < (-2 * loglnv(LNV_EPSILON))) {
2109 /* Convergent series. Accurate but slow especially with large x. */
2110 LNV fact_n = x;
2111 for (n = 2; n <= 200; n++) {
2112 LNV invn = LNV_ONE / n;
2113 fact_n *= (LNV)x * invn;
2114 term = fact_n * invn;
2115 KAHAN_SUM(sum, term);
2116 /* printf("C after adding %.20Lf, val = %.20Lf\n", term, sum); */
2117 if (term < LNV_EPSILON*sum) break;
2118 }
2119 KAHAN_SUM(sum, euler_mascheroni);
2120 KAHAN_SUM(sum, loglnv(x));
2121 KAHAN_SUM(sum, x);
2122 val = sum;
2123 } else if (x >= 24) {
2124 /* Cody / Thacher rational Chebyshev */
2125 static const LNV P2[10] = {
2126 LNVCONST( 1.75338801265465972390E02),
2127 LNVCONST(-2.23127670777632409550E02),
2128 LNVCONST(-1.81949664929868906455E01),
2129 LNVCONST(-2.79798528624305389340E01),
2130 LNVCONST(-7.63147701620253630855E00),
2131 LNVCONST(-1.52856623636929636839E01),
2132 LNVCONST(-7.06810977895029358836E00),
2133 LNVCONST(-5.00006640413131002475E00),
2134 LNVCONST(-3.00000000320981265753E00),
2135 LNVCONST( 1.00000000000000485503E00) };
2136 static const LNV Q2[9] = {
2137 LNVCONST( 3.97845977167414720840E04),
2138 LNVCONST( 3.97277109100414518365E00),
2139 LNVCONST( 1.37790390235747998793E02),
2140 LNVCONST( 1.17179220502086455287E02),
2141 LNVCONST( 7.04831847180424675988E01),
2142 LNVCONST(-1.20187763547154743238E01),
2143 LNVCONST(-7.99243595776339741065E00),
2144 LNVCONST(-2.99999894040324959612E00),
2145 LNVCONST( 1.99999999999048104167E00) };
2146 LNV invx = LNV_ONE / x, frac = 0.0;
2147 for (n = 0; n <= 8; n++)
2148 frac = Q2[n] / (P2[n] + x + frac);
2149 frac += P2[9];
2150 val = explnv(x) * (invx + invx*invx*frac);
2151 } else {
2152 /* Asymptotic divergent series */
2153 LNV invx = LNV_ONE / x;
2154 term = 1.0;
2155 for (n = 1; n <= 200; n++) {
2156 LNV last_term = term;
2157 term = term * ( (LNV)n * invx );
2158 if (term < LNV_EPSILON*sum) break;
2159 if (term < last_term) {
2160 KAHAN_SUM(sum, term);
2161 /* printf("A after adding %.20llf, sum = %.20llf\n", term, sum); */
2162 } else {
2163 KAHAN_SUM(sum, (-last_term/3) );
2164 /* printf("A after adding %.20llf, sum = %.20llf\n", -last_term/3, sum); */
2165 break;
2166 }
2167 }
2168 KAHAN_SUM(sum, LNV_ONE);
2169 val = explnv(x) * sum * invx;
2170 }
2171
2172 return val;
2173 }
2174
Li(NV x)2175 NV Li(NV x) {
2176 if (x == 0) return 0;
2177 if (x == 1) return -INFINITY;
2178 if (x == 2) return li2;
2179 if (x < 0) croak("Invalid input to LogarithmicIntegral: x must be >= 0");
2180 if (x >= NV_MAX) return INFINITY;
2181
2182 /* Calculate directly using Ramanujan's series. */
2183 if (x > 1) {
2184 const LNV logx = loglnv(x);
2185 LNV sum = 0, inner_sum = 0, old_sum, factorial = 1, power2 = 1;
2186 LNV q, p = -1;
2187 int k = 0, n = 0;
2188
2189 for (n = 1, k = 0; n < 200; n++) {
2190 factorial *= n;
2191 p *= -logx;
2192 q = factorial * power2;
2193 power2 *= 2;
2194 for (; k <= (n - 1) / 2; k++)
2195 inner_sum += LNV_ONE / (2 * k + 1);
2196 old_sum = sum;
2197 sum += (p / q) * inner_sum;
2198 if (fabslnv(sum - old_sum) <= LNV_EPSILON) break;
2199 }
2200 return euler_mascheroni + loglnv(logx) + sqrtlnv(x) * sum;
2201 }
2202
2203 return Ei(loglnv(x));
2204 }
2205
ld_inverse_li(long double lx)2206 static long double ld_inverse_li(long double lx) {
2207 int i;
2208 long double t, term, old_term = 0;
2209 /* Iterate Halley's method until error grows. */
2210 t = (lx <= 2) ? 2 : lx * logl(lx);
2211 for (i = 0; i < 4; i++) {
2212 long double dn = Li(t) - lx;
2213 term = dn*logl(t) / (1.0L + dn/(2*t));
2214 if (i > 0 && fabsl(term) >= fabsl(old_term)) { t -= term/4; break; }
2215 old_term = term;
2216 t -= term;
2217 }
2218 return t;
2219 }
2220
inverse_li(UV x)2221 UV inverse_li(UV x) {
2222 UV r, i;
2223 long double lx = (long double) x;
2224
2225 if (x <= 2) return x + (x > 0);
2226 r = (UV) ceill( ld_inverse_li(lx) );
2227 /* Meet our more stringent goal of an exact answer. */
2228 i = (x > 4e16) ? 2048 : 128;
2229 if (Li(r-1) >= lx) {
2230 while (Li(r-i) >= lx) r -= i;
2231 for (i = i/2; i > 0; i /= 2)
2232 if (Li(r-i) >= lx) r -= i;
2233 } else {
2234 while (Li(r+i-1) < lx) r += i;
2235 for (i = i/2; i > 0; i /= 2)
2236 if (Li(r+i-1) < lx) r += i;
2237 }
2238 return r;
2239 }
2240
ld_inverse_R(long double lx)2241 static long double ld_inverse_R(long double lx) {
2242 int i;
2243 long double t, dn, term, old_term = 0;
2244
2245 /* Rough estimate */
2246 if (lx <= 3.5) {
2247 t = lx + 2.24*(lx-1)/2;
2248 } else {
2249 t = lx * logl(lx);
2250 if (lx < 50) { t *= 1.2; }
2251 else if (lx < 1000) { t *= 1.15; }
2252 else { /* use inverse Li (one iteration) for first inverse R approx */
2253 dn = Li(t) - lx;
2254 term = dn * logl(t) / (1.0L + dn/(2*t));
2255 t -= term;
2256 }
2257 }
2258 /* Iterate 1-n rounds of Halley, usually only 3 needed. */
2259 for (i = 0; i < 100; i++) {
2260 dn = RiemannR(t) - lx;
2261 #if 1 /* Use f(t) = li(t) for derivatives */
2262 term = dn * logl(t) / (1.0L + dn/(2*t));
2263 #else /* Use f(t) = li(t) - li(sqrt(t))/2 for derivatives */
2264 long double logt = logl(t);
2265 long double sqrtt = sqrtl(t);
2266 long double FA = 2 * sqrtt * logt;
2267 long double FB = 2 * sqrtt - 1;
2268 long double ifz = FA / FB;
2269 long double iffz = (logt - 2*FB) / (2 * sqrtt * FA * FA * FA * FA);
2270 term = dn * ifz * (1.0L - dn * iffz);
2271 #endif
2272 if (i > 0 && fabsl(term) >= fabsl(old_term)) { t -= term/4; break; }
2273 old_term = term;
2274 t -= term;
2275 }
2276 return t;
2277 }
2278
inverse_R(UV x)2279 UV inverse_R(UV x) {
2280 if (x < 2) return x + (x > 0);
2281 return (UV) ceill( ld_inverse_R( (long double) x) );
2282 }
2283
2284
2285 /*
2286 * Storing the first 10-20 Zeta values makes sense. Past that it is purely
2287 * to avoid making the call to generate them ourselves. We could cache the
2288 * calculated values. These all have 1 subtracted from them. */
2289 static const long double riemann_zeta_table[] = {
2290 0.6449340668482264364724151666460251892L, /* zeta(2) */
2291 0.2020569031595942853997381615114499908L,
2292 0.0823232337111381915160036965411679028L,
2293 0.0369277551433699263313654864570341681L,
2294 0.0173430619844491397145179297909205279L,
2295 0.0083492773819228268397975498497967596L,
2296 0.0040773561979443393786852385086524653L,
2297 0.0020083928260822144178527692324120605L,
2298 0.0009945751278180853371459589003190170L,
2299 0.0004941886041194645587022825264699365L,
2300 0.0002460865533080482986379980477396710L,
2301 0.0001227133475784891467518365263573957L,
2302 0.0000612481350587048292585451051353337L,
2303 0.0000305882363070204935517285106450626L,
2304 0.0000152822594086518717325714876367220L,
2305 0.0000076371976378997622736002935630292L, /* zeta(17) Past here all we're */
2306 0.0000038172932649998398564616446219397L, /* zeta(18) getting is speed. */
2307 0.0000019082127165539389256569577951013L,
2308 0.0000009539620338727961131520386834493L,
2309 0.0000004769329867878064631167196043730L,
2310 0.0000002384505027277329900036481867530L,
2311 0.0000001192199259653110730677887188823L,
2312 0.0000000596081890512594796124402079358L,
2313 0.0000000298035035146522801860637050694L,
2314 0.0000000149015548283650412346585066307L,
2315 0.0000000074507117898354294919810041706L,
2316 0.0000000037253340247884570548192040184L,
2317 0.0000000018626597235130490064039099454L,
2318 0.0000000009313274324196681828717647350L,
2319 0.0000000004656629065033784072989233251L,
2320 0.0000000002328311833676505492001455976L,
2321 0.0000000001164155017270051977592973835L,
2322 0.0000000000582077208790270088924368599L,
2323 0.0000000000291038504449709968692942523L,
2324 0.0000000000145519218910419842359296322L,
2325 0.0000000000072759598350574810145208690L,
2326 0.0000000000036379795473786511902372363L,
2327 0.0000000000018189896503070659475848321L,
2328 0.0000000000009094947840263889282533118L,
2329 0.0000000000004547473783042154026799112L,
2330 0.0000000000002273736845824652515226821L,
2331 0.0000000000001136868407680227849349105L,
2332 0.0000000000000568434198762758560927718L,
2333 0.0000000000000284217097688930185545507L,
2334 0.0000000000000142108548280316067698343L,
2335 0.00000000000000710542739521085271287735L,
2336 0.00000000000000355271369133711367329847L,
2337 0.00000000000000177635684357912032747335L,
2338 0.000000000000000888178421093081590309609L,
2339 0.000000000000000444089210314381336419777L,
2340 0.000000000000000222044605079804198399932L,
2341 0.000000000000000111022302514106613372055L,
2342 0.0000000000000000555111512484548124372374L,
2343 0.0000000000000000277555756213612417258163L,
2344 0.0000000000000000138777878097252327628391L,
2345 };
2346 #define NPRECALC_ZETA (sizeof(riemann_zeta_table)/sizeof(riemann_zeta_table[0]))
2347
2348 /* Riemann Zeta on the real line, with 1 subtracted.
2349 * Compare to Math::Cephes zetac. Also zeta with q=1 and subtracting 1.
2350 *
2351 * The Cephes zeta function uses a series (2k)!/B_2k which converges rapidly
2352 * and has a very wide range of values. We use it here for some values.
2353 *
2354 * Note: Calculations here are done on long doubles and we try to generate as
2355 * much accuracy as possible. They will get returned to Perl as an NV,
2356 * which is typically a 64-bit double with 15 digits.
2357 *
2358 * For values 0.5 to 5, this code uses the rational Chebyshev approximation
2359 * from Cody and Thacher. This method is extraordinarily fast and very
2360 * accurate over its range (slightly better than Cephes for most values). If
2361 * we had quad floats, we could use the 9-term polynomial.
2362 */
ld_riemann_zeta(long double x)2363 long double ld_riemann_zeta(long double x) {
2364 int i;
2365
2366 if (x < 0) croak("Invalid input to RiemannZeta: x must be >= 0");
2367 if (x == 1) return INFINITY;
2368
2369 if (x == (unsigned int)x) {
2370 int k = x - 2;
2371 if ((k >= 0) && (k < (int)NPRECALC_ZETA))
2372 return riemann_zeta_table[k];
2373 }
2374
2375 /* Cody / Thacher rational Chebyshev approximation for small values */
2376 if (x >= 0.5 && x <= 5.0) {
2377 static const long double C8p[9] = { 1.287168121482446392809e10L,
2378 1.375396932037025111825e10L,
2379 5.106655918364406103683e09L,
2380 8.561471002433314862469e08L,
2381 7.483618124380232984824e07L,
2382 4.860106585461882511535e06L,
2383 2.739574990221406087728e05L,
2384 4.631710843183427123061e03L,
2385 5.787581004096660659109e01L };
2386 static const long double C8q[9] = { 2.574336242964846244667e10L,
2387 5.938165648679590160003e09L,
2388 9.006330373261233439089e08L,
2389 8.042536634283289888587e07L,
2390 5.609711759541920062814e06L,
2391 2.247431202899137523543e05L,
2392 7.574578909341537560115e03L,
2393 -2.373835781373772623086e01L,
2394 1.000000000000000000000L };
2395 long double sumn = C8p[0]+x*(C8p[1]+x*(C8p[2]+x*(C8p[3]+x*(C8p[4]+x*(C8p[5]+x*(C8p[6]+x*(C8p[7]+x*C8p[8])))))));
2396 long double sumd = C8q[0]+x*(C8q[1]+x*(C8q[2]+x*(C8q[3]+x*(C8q[4]+x*(C8q[5]+x*(C8q[6]+x*(C8q[7]+x*C8q[8])))))));
2397 long double sum = (sumn - (x-1)*sumd) / ((x-1)*sumd);
2398 return sum;
2399 }
2400
2401 if (x > 17000.0)
2402 return 0.0;
2403
2404 #if 0
2405 {
2406 KAHAN_INIT(sum);
2407 /* Simple defining series, works well. */
2408 for (i = 5; i <= 1000000; i++) {
2409 long double term = powl(i, -x);
2410 KAHAN_SUM(sum, term);
2411 if (term < LDBL_EPSILON*sum) break;
2412 }
2413 KAHAN_SUM(sum, powl(4, -x) );
2414 KAHAN_SUM(sum, powl(3, -x) );
2415 KAHAN_SUM(sum, powl(2, -x) );
2416 return sum;
2417 }
2418 #endif
2419
2420 /* The 2n!/B_2k series used by the Cephes library. */
2421 {
2422 /* gp/pari:
2423 * for(i=1,13,printf("%.38g\n",(2*i)!/bernreal(2*i)))
2424 * MPU:
2425 * use bignum;
2426 * say +(factorial(2*$_)/bernreal(2*$_))->bround(38) for 1..13;
2427 */
2428 static const long double A[] = {
2429 12.0L,
2430 -720.0L,
2431 30240.0L,
2432 -1209600.0L,
2433 47900160.0L,
2434 -1892437580.3183791606367583212735166425L,
2435 74724249600.0L,
2436 -2950130727918.1642244954382084600497650L,
2437 116467828143500.67248729113000661089201L,
2438 -4597978722407472.6105457273596737891656L,
2439 181521054019435467.73425331153534235290L,
2440 -7166165256175667011.3346447367083352775L,
2441 282908877253042996618.18640556532523927L,
2442 };
2443 long double a, b, s, t;
2444 const long double w = 10.0;
2445 s = 0.0;
2446 b = 0.0;
2447 for (i = 2; i < 11; i++) {
2448 b = powl( i, -x );
2449 s += b;
2450 if (fabsl(b) < fabsl(LDBL_EPSILON * s))
2451 return s;
2452 }
2453 s = s + b*w/(x-1.0) - 0.5 * b;
2454 a = 1.0;
2455 for (i = 0; i < 13; i++) {
2456 long double k = 2*i;
2457 a *= x + k;
2458 b /= w;
2459 t = a*b/A[i];
2460 s = s + t;
2461 if (fabsl(t) < fabsl(LDBL_EPSILON * s))
2462 break;
2463 a *= x + k + 1.0;
2464 b /= w;
2465 }
2466 return s;
2467 }
2468 }
2469
RiemannR(long double x)2470 long double RiemannR(long double x) {
2471 long double part_term, term, flogx, ki, old_sum;
2472 unsigned int k;
2473 KAHAN_INIT(sum);
2474
2475 if (x <= 0) croak("Invalid input to RiemannR: x must be > 0");
2476
2477 if (x > 1e19) {
2478 const signed char* amob = range_moebius(0, 100);
2479 KAHAN_SUM(sum, Li(x));
2480 for (k = 2; k <= 100; k++) {
2481 if (amob[k] == 0) continue;
2482 ki = 1.0L / (long double) k;
2483 part_term = powl(x,ki);
2484 if (part_term > LDBL_MAX) return INFINITY;
2485 term = amob[k] * ki * Li(part_term);
2486 old_sum = sum;
2487 KAHAN_SUM(sum, term);
2488 if (fabsl(sum - old_sum) <= LDBL_EPSILON) break;
2489 }
2490 Safefree(amob);
2491 return sum;
2492 }
2493
2494 KAHAN_SUM(sum, 1.0);
2495 flogx = logl(x);
2496 part_term = 1;
2497
2498 for (k = 1; k <= 10000; k++) {
2499 ki = (k-1 < NPRECALC_ZETA) ? riemann_zeta_table[k-1] : ld_riemann_zeta(k+1);
2500 part_term *= flogx / k;
2501 term = part_term / (k + k * ki);
2502 old_sum = sum;
2503 KAHAN_SUM(sum, term);
2504 /* printf("R %5d after adding %.18Lg, sum = %.19Lg (%Lg)\n", k, term, sum, fabsl(sum-old_sum)); */
2505 if (fabsl(sum - old_sum) <= LDBL_EPSILON) break;
2506 }
2507
2508 return sum;
2509 }
2510
_lambertw_approx(long double x)2511 static long double _lambertw_approx(long double x) {
2512 /* See Veberic 2009 for other approximations */
2513 if (x < -0.060) { /* Pade(3,2) */
2514 long double ti = 5.4365636569180904707205749L * x + 2.0L;
2515 long double t = (ti <= 0.0L) ? 0.0L : sqrtl(ti);
2516 long double t2 = t*t;
2517 long double t3 = t*t2;
2518 return (-1.0L + (1.0L/6.0L)*t + (257.0L/720.0L)*t2 + (13.0L/720.0L)*t3) / (1.0L + (5.0L/6.0L)*t + (103.0L/720.0L)*t2);
2519 } else if (x < 1.363) { /* Winitzki 2003 section 3.5 */
2520 long double l1 = logl(1.0L+x);
2521 return l1 * (1.0L - logl(1.0L+l1) / (2.0L+l1));
2522 } else if (x < 3.7) { /* Modification of Vargas 2013 */
2523 long double l1 = logl(x);
2524 long double l2 = logl(l1);
2525 return l1 - l2 - logl(1.0L - l2/l1)/2.0L;
2526 } else { /* Corless et al. 1993, page 22 */
2527 long double l1 = logl(x);
2528 long double l2 = logl(l1);
2529 long double d1 = 2.0L*l1*l1;
2530 long double d2 = 3.0L*l1*d1;
2531 long double d3 = 2.0L*l1*d2;
2532 long double d4 = 5.0L*l1*d3;
2533 long double w = l1 - l2 + l2/l1 + l2*(l2-2.0L)/d1;
2534 w += l2*(6.0L+l2*(-9.0L+2.0L*l2))/d2;
2535 w += l2*(-12.0L+l2*(36.0L+l2*(-22.0L+3.0L*l2)))/d3;
2536 w += l2*(60.0L+l2*(-300.0L+l2*(350.0L+l2*(-125.0L+12.0L*l2))))/d4;
2537 return w;
2538 }
2539 }
2540
lambertw(NV x)2541 NV lambertw(NV x) {
2542 long double w;
2543 int i;
2544
2545 if (x < -0.36787944117145L)
2546 croak("Invalid input to LambertW: x must be >= -1/e");
2547 if (x == 0.0L) return 0.0L;
2548
2549 /* Estimate initial value */
2550 w = _lambertw_approx(x);
2551 /* If input is too small, return .99999.... */
2552 if (w <= -1.0L) return -1.0L + 8*LDBL_EPSILON;
2553 /* For very small inputs, don't iterate, return approx directly. */
2554 if (x < -0.36783) return w;
2555
2556 #if 0 /* Halley */
2557 lastw = w;
2558 for (i = 0; i < 100; i++) {
2559 long double ew = expl(w);
2560 long double wew = w * ew;
2561 long double wewx = wew - x;
2562 long double w1 = w + 1;
2563 w = w - wewx / (ew * w1 - (w+2) * wewx/(2*w1));
2564 if (w != 0.0L && fabsl((w-lastw)/w) <= 8*LDBL_EPSILON) break;
2565 lastw = w;
2566 }
2567 #else /* Fritsch, see Veberic 2009. 1-2 iterations are enough. */
2568 for (i = 0; i < 6 && w != 0.0L; i++) {
2569 long double w1 = 1 + w;
2570 long double zn = logl((long double)x/w) - w;
2571 long double qn = 2 * w1 * (w1+(2.0L/3.0L)*zn);
2572 long double en = (zn/w1) * (qn-zn)/(qn-2.0L*zn);
2573 /* w *= 1.0L + en; if (fabsl(en) <= 16*LDBL_EPSILON) break; */
2574 long double wen = w * en;
2575 w += wen;
2576 if (fabsl(wen) <= 64*LDBL_EPSILON) break;
2577 }
2578 #endif
2579 #if LNV_IS_QUAD /* For quadmath, one high precision correction */
2580 if (w != LNV_ZERO) {
2581 LNV lw = w;
2582 LNV w1 = LNV_ONE + lw;
2583 LNV zn = loglnv((LNV)x/lw) - lw;
2584 LNV qn = LNVCONST(2.0) * w1 * (w1+(LNVCONST(2.0)/LNVCONST(3.0))*zn);
2585 LNV en = (zn/w1) * (qn-zn)/(qn-LNVCONST(2.0)*zn);
2586 return lw + lw * en;
2587 }
2588 #endif
2589
2590 return w;
2591 }
2592
2593 #if HAVE_STD_U64
2594 #define U64T uint64_t
2595 #else
2596 #define U64T UV
2597 #endif
2598
2599 /* Spigot from Arndt, Haenel, Winter, and Flammenkamp. */
2600 /* Modified for larger digits and rounding by Dana Jacobsen */
pidigits(int digits)2601 char* pidigits(int digits)
2602 {
2603 char* out;
2604 uint32_t *a, b, c, d, e, g, i, d4, d3, d2, d1;
2605 uint32_t const f = 10000;
2606 U64T d64; /* 64-bit intermediate for 2*2*10000*b > 2^32 (~30k digits) */
2607
2608 if (digits <= 0) return 0;
2609 if (digits <= DBL_DIG && digits <= 18) {
2610 Newz(0, out, 19, char);
2611 (void)sprintf(out, "%.*lf", (digits-1), 3.141592653589793238);
2612 return out;
2613 }
2614 digits++; /* For rounding */
2615 c = 14*(digits/4 + 2);
2616 New(0, out, digits+5+1, char);
2617 *out++ = '3'; /* We'll turn "31415..." into "3.1415..." */
2618 New(0, a, c, uint32_t);
2619 for (b = 0; b < c; b++) a[b] = 2000;
2620
2621 d = i = 0;
2622 while ((b = c -= 14) > 0 && i < (uint32_t)digits) {
2623 d = e = d % f;
2624 if (b > 107000) { /* Use 64-bit intermediate while necessary. */
2625 for (d64 = d; --b > 107000; ) {
2626 g = (b << 1) - 1;
2627 d64 = d64 * b + f * (U64T)a[b];
2628 a[b] = d64 % g;
2629 d64 /= g;
2630 }
2631 d = d64;
2632 b++;
2633 }
2634 while (--b > 0) {
2635 g = (b << 1) - 1;
2636 d = d * b + f * a[b];
2637 a[b] = d % g;
2638 d /= g;
2639 }
2640 /* sprintf(out+i, "%04d", e+d/f); i += 4; */
2641 d4 = e + d/f;
2642 if (d4 > 9999) {
2643 d4 -= 10000;
2644 out[i-1]++;
2645 for (b=i-1; out[b] == '0'+1; b--) { out[b]='0'; out[b-1]++; }
2646 }
2647 d3 = d4/10; d2 = d3/10; d1 = d2/10;
2648 out[i++] = '0' + d1;
2649 out[i++] = '0' + d2-d1*10;
2650 out[i++] = '0' + d3-d2*10;
2651 out[i++] = '0' + d4-d3*10;
2652 }
2653 Safefree(a);
2654 if (out[digits-1] >= '5') out[digits-2]++; /* Round */
2655 for (i = digits-2; out[i] == '9'+1; i--) /* Keep rounding */
2656 { out[i] = '0'; out[i-1]++; }
2657 digits--; /* Undo the extra digit we used for rounding */
2658 out[digits] = '\0';
2659 *out-- = '.';
2660 return out;
2661 }
2662
2663 /* 1. Perform signed integer validation on b/blen.
2664 * 2. Compare to a/alen using min or max based on first arg.
2665 * 3. Return 0 to select a, 1 to select b.
2666 */
strnum_minmax(int min,char * a,STRLEN alen,char * b,STRLEN blen)2667 int strnum_minmax(int min, char* a, STRLEN alen, char* b, STRLEN blen)
2668 {
2669 int aneg, bneg;
2670 STRLEN i;
2671 /* a is checked, process b */
2672 if (b == 0 || blen == 0) croak("Parameter must be a positive integer");
2673 bneg = (b[0] == '-');
2674 if (b[0] == '-' || b[0] == '+') { b++; blen--; }
2675 while (blen > 0 && *b == '0') { b++; blen--; }
2676 for (i = 0; i < blen; i++)
2677 if (!isDIGIT(b[i]))
2678 break;
2679 if (blen == 0 || i < blen)
2680 croak("Parameter must be a positive integer");
2681
2682 if (a == 0) return 1;
2683
2684 aneg = (a[0] == '-');
2685 if (a[0] == '-' || a[0] == '+') { a++; alen--; }
2686 while (alen > 0 && *a == '0') { a++; alen--; }
2687
2688 if (aneg != bneg) return min ? (bneg == 1) : (aneg == 1);
2689 if (aneg == 1) min = !min;
2690 if (alen != blen) return min ? (alen > blen) : (blen > alen);
2691
2692 for (i = 0; i < blen; i++)
2693 if (a[i] != b[i])
2694 return min ? (a[i] > b[i]) : (b[i] > a[i]);
2695 return 0; /* equal */
2696 }
2697
from_digit_string(UV * rn,const char * s,int base)2698 int from_digit_string(UV* rn, const char* s, int base)
2699 {
2700 UV max, n = 0;
2701 int i, len;
2702
2703 /* Skip leading -/+ and zeros */
2704 if (s[0] == '-' || s[0] == '+') s++;
2705 while (s[0] == '0') s++;
2706
2707 len = strlen(s);
2708 max = (UV_MAX-base+1)/base;
2709
2710 for (i = 0, len = strlen(s); i < len; i++) {
2711 const char c = s[i];
2712 int d = !isalnum(c) ? 255 : (c <= '9') ? c-'0' : (c <= 'Z') ? c-'A'+10 : c-'a'+10;
2713 if (d >= base) croak("Invalid digit for base %d", base);
2714 if (n > max) return 0; /* Overflow */
2715 n = n * base + d;
2716 }
2717 *rn = n;
2718 return 1;
2719 }
2720
from_digit_to_UV(UV * rn,UV * r,int len,int base)2721 int from_digit_to_UV(UV* rn, UV* r, int len, int base)
2722 {
2723 UV d, n = 0;
2724 int i;
2725 if (len < 0 || len > BITS_PER_WORD)
2726 return 0;
2727 for (i = 0; i < len; i++) {
2728 d = r[i];
2729 if (n > (UV_MAX-d)/base) break; /* overflow */
2730 n = n * base + d;
2731 }
2732 *rn = n;
2733 return (i >= len);
2734 }
2735
2736
from_digit_to_str(char ** rstr,UV * r,int len,int base)2737 int from_digit_to_str(char** rstr, UV* r, int len, int base)
2738 {
2739 char *so, *s;
2740 int i;
2741
2742 if (len < 0 || !(base == 2 || base == 10 || base == 16)) return 0;
2743
2744 if (r[0] >= (UV) base) return 0; /* TODO: We don't apply extended carry */
2745
2746 New(0, so, len + 3, char);
2747 s = so;
2748 if (base == 2 || base == 16) {
2749 *s++ = '0';
2750 *s++ = (base == 2) ? 'b' : 'x';
2751 }
2752 for (i = 0; i < len; i++) {
2753 UV d = r[i];
2754 s[i] = (d < 10) ? '0'+d : 'a'+d-10;
2755 }
2756 s[len] = '\0';
2757 *rstr = so;
2758 return 1;
2759 }
2760
to_digit_array(int * bits,UV n,int base,int length)2761 int to_digit_array(int* bits, UV n, int base, int length)
2762 {
2763 int d;
2764
2765 if (base < 2 || length > 128) return -1;
2766
2767 if (base == 2) {
2768 for (d = 0; n; n >>= 1)
2769 bits[d++] = n & 1;
2770 } else {
2771 for (d = 0; n; n /= base)
2772 bits[d++] = n % base;
2773 }
2774 if (length < 0) length = d;
2775 while (d < length)
2776 bits[d++] = 0;
2777 return length;
2778 }
2779
to_digit_string(char * s,UV n,int base,int length)2780 int to_digit_string(char* s, UV n, int base, int length)
2781 {
2782 int digits[128];
2783 int i, len = to_digit_array(digits, n, base, length);
2784
2785 if (len < 0) return -1;
2786 if (base > 36) croak("invalid base for string: %d", base);
2787
2788 for (i = 0; i < len; i++) {
2789 int dig = digits[len-i-1];
2790 s[i] = (dig < 10) ? '0'+dig : 'a'+dig-10;
2791 }
2792 s[len] = '\0';
2793 return len;
2794 }
2795
to_string_128(char str[40],IV hi,UV lo)2796 int to_string_128(char str[40], IV hi, UV lo)
2797 {
2798 int i, slen = 0, isneg = 0;
2799
2800 if (hi < 0) {
2801 isneg = 1;
2802 hi = -(hi+1);
2803 lo = UV_MAX - lo + 1;
2804 }
2805 #if BITS_PER_WORD == 64 && HAVE_UINT128
2806 {
2807 uint128_t dd, sum = (((uint128_t) hi) << 64) + lo;
2808 do {
2809 dd = sum / 10;
2810 str[slen++] = '0' + (sum - dd*10);
2811 sum = dd;
2812 } while (sum);
2813 }
2814 #else
2815 {
2816 UV d, r;
2817 uint32_t a[4];
2818 a[0] = hi >> (BITS_PER_WORD/2);
2819 a[1] = hi & (UV_MAX >> (BITS_PER_WORD/2));
2820 a[2] = lo >> (BITS_PER_WORD/2);
2821 a[3] = lo & (UV_MAX >> (BITS_PER_WORD/2));
2822 do {
2823 r = a[0];
2824 d = r/10; r = ((r-d*10) << (BITS_PER_WORD/2)) + a[1]; a[0] = d;
2825 d = r/10; r = ((r-d*10) << (BITS_PER_WORD/2)) + a[2]; a[1] = d;
2826 d = r/10; r = ((r-d*10) << (BITS_PER_WORD/2)) + a[3]; a[2] = d;
2827 d = r/10; r = r-d*10; a[3] = d;
2828 str[slen++] = '0'+(r%10);
2829 } while (a[0] || a[1] || a[2] || a[3]);
2830 }
2831 #endif
2832 /* Reverse the order */
2833 for (i=0; i < slen/2; i++) {
2834 char t=str[i];
2835 str[i]=str[slen-i-1];
2836 str[slen-i-1] = t;
2837 }
2838 /* Prepend a negative sign if needed */
2839 if (isneg) {
2840 for (i = slen; i > 0; i--)
2841 str[i] = str[i-1];
2842 str[0] = '-';
2843 slen++;
2844 }
2845 /* Add terminator */
2846 str[slen] = '\0';
2847 return slen;
2848 }
2849
2850 /* Oddball primality test.
2851 * In this file rather than primality.c because it uses factoring (!).
2852 * Algorithm from Charles R Greathouse IV, 2015 */
_catalan_v32(uint32_t n,uint32_t p)2853 static INLINE uint32_t _catalan_v32(uint32_t n, uint32_t p) {
2854 uint32_t s = 0;
2855 while (n /= p) s += n % 2;
2856 return s;
2857 }
_catalan_v(UV n,UV p)2858 static INLINE uint32_t _catalan_v(UV n, UV p) {
2859 uint32_t s = 0;
2860 while (n /= p) s += n % 2;
2861 return s;
2862 }
_catalan_mult(UV m,UV p,UV n,UV a)2863 static UV _catalan_mult(UV m, UV p, UV n, UV a) {
2864 if (p > a) {
2865 m = mulmod(m, p, n);
2866 } else {
2867 UV pow = (n <= 4294967295UL) ? _catalan_v32(a<<1,p) : _catalan_v(a<<1,p);
2868 m = (pow == 0) ? m
2869 : (pow == 1) ? mulmod(m,p,n)
2870 : mulmod(m,powmod(p,pow,n),n);
2871 }
2872 return m;
2873 }
_catalan_vtest(UV n,UV p)2874 static int _catalan_vtest(UV n, UV p) {
2875 while (n /= p)
2876 if (n % 2)
2877 return 1;
2878 return 0;
2879 }
is_catalan_pseudoprime(UV n)2880 int is_catalan_pseudoprime(UV n) {
2881 UV m, a;
2882 int i;
2883
2884 if (n < 2 || ((n % 2) == 0 && n != 2)) return 0;
2885 if (is_prob_prime(n)) return 1;
2886
2887 m = 1;
2888 a = n >> 1;
2889 /*
2890 * Ideally we could use some of the requirements for a mod 4/8/64 here:
2891 * http://www.combinatorics.net/conf/Z60/sp/sp/Shu-Chung%20Liu.pdf
2892 * But, how do we make +/-2 = X mod n into a solution for x = X mod 8?
2893 *
2894 * We could also exploit the exhaustive testing that shows there only
2895 * exist three below 1e10: 5907, 1194649, and 12327121.
2896 */
2897 {
2898 UV factors[MPU_MAX_FACTORS+1];
2899 int nfactors = factor_exp(n, factors, 0);
2900 #if BITS_PER_WORD == 32
2901 if (nfactors == 2) return 0; /* Page 9, all 32-bit semiprimes */
2902 #else
2903 if (nfactors == 2) { /* Conditions from Aebi and Cairns (2008) */
2904 if (n < UVCONST(10000000000)) return 0; /* Page 9 */
2905 if (2*factors[0]+1 >= factors[1]) return 0; /* Corollary 2 and 3 */
2906 }
2907 #endif
2908 /* Test every factor */
2909 for (i = 0; i < nfactors; i++) {
2910 if (_catalan_vtest(a << 1, factors[i]))
2911 return 0;
2912 }
2913 }
2914 {
2915 UV seg_base, seg_low, seg_high;
2916 unsigned char* segment;
2917 void* ctx;
2918 m = _catalan_mult(m, 2, n, a);
2919 m = _catalan_mult(m, 3, n, a);
2920 m = _catalan_mult(m, 5, n, a);
2921 ctx = start_segment_primes(7, n, &segment);
2922 while (next_segment_primes(ctx, &seg_base, &seg_low, &seg_high)) {
2923 START_DO_FOR_EACH_SIEVE_PRIME( segment, seg_base, seg_low, seg_high ) {
2924 m = _catalan_mult(m, p, n, a);
2925 } END_DO_FOR_EACH_SIEVE_PRIME
2926 }
2927 end_segment_primes(ctx);
2928 }
2929 return (a & 1) ? (m==(n-1)) : (m==1);
2930 }
2931
2932 /* If we have fast CTZ, use this GCD. See Brent Alg V and FLINT Abhinav Baid */
gcdz(UV x,UV y)2933 UV gcdz(UV x, UV y) {
2934 UV f, x2, y2;
2935
2936 if (x == 0) return y;
2937
2938 if (y & 1) { /* Optimize y odd */
2939 x >>= ctz(x);
2940 while (x != y) {
2941 if (x < y) { y -= x; y >>= ctz(y); }
2942 else { x -= y; x >>= ctz(x); }
2943 }
2944 return x;
2945 }
2946
2947 if (y == 0) return x;
2948
2949 /* Alternately: f = ctz(x|y); x >>= ctz(x); y >>= ctz(y); */
2950 x2 = ctz(x);
2951 y2 = ctz(y);
2952 f = (x2 <= y2) ? x2 : y2;
2953 x >>= x2;
2954 y >>= y2;
2955
2956 while (x != y) {
2957 if (x < y) {
2958 y -= x;
2959 y >>= ctz(y);
2960 } else {
2961 x -= y;
2962 x >>= ctz(x);
2963 }
2964 }
2965 return x << f;
2966 }
2967
2968 /* The intermediate values are so large that we can only stay in 64-bit
2969 * up to 53 or so using the divisor_sum calculations. So just use a table.
2970 * Save space by just storing the 32-bit values. */
2971 static const int32_t tau_table[] = {
2972 0,1,-24,252,-1472,4830,-6048,-16744,84480,-113643,-115920,534612,-370944,-577738,401856,1217160,987136,-6905934,2727432,10661420,-7109760,-4219488,-12830688,18643272,21288960,-25499225,13865712,-73279080,24647168,128406630,-29211840,-52843168,-196706304,134722224,165742416,-80873520,167282496,-182213314,-255874080,-145589976,408038400,308120442,101267712,-17125708,-786948864,-548895690,-447438528
2973 };
2974 #define NTAU (sizeof(tau_table)/sizeof(tau_table[0]))
ramanujan_tau(UV n)2975 IV ramanujan_tau(UV n) {
2976 return (n < NTAU) ? tau_table[n] : 0;
2977 }
2978
_count_class_div(UV s,UV b2)2979 static UV _count_class_div(UV s, UV b2) {
2980 UV h = 0, i, ndivisors, *divs, lim;
2981
2982 lim = isqrt(b2);
2983 if (lim*lim == b2) lim--;
2984 if (s > lim) return 0;
2985
2986 if ((lim-s) < 70) { /* Iterate looking for divisors */
2987 for (i = s; i <= lim; i++)
2988 if (b2 % i == 0)
2989 h++;
2990 } else { /* Walk through all the divisors */
2991 divs = _divisor_list(b2, &ndivisors);
2992 for (i = 0; i < ndivisors && divs[i] <= lim; i++)
2993 if (divs[i] >= s)
2994 h++;
2995 Safefree(divs);
2996 }
2997 return h;
2998 }
2999
3000 /* Returns 12 * H(n). See Cohen 5.3.5 or Pari/GP.
3001 * Pari/GP uses a different method for n > 500000, which is quite a bit
3002 * faster, but assumes the GRH. */
hclassno(UV n)3003 IV hclassno(UV n) {
3004 UV nmod4 = n % 4, b2, b, h;
3005 int square;
3006
3007 if (n == 0) return -1;
3008 if (nmod4 == 1 || nmod4 == 2) return 0;
3009 if (n == 3) return 4;
3010
3011 b = n & 1;
3012 b2 = (n+1) >> 2;
3013 square = is_perfect_square(b2);
3014
3015 h = divisor_sum(b2,0) >> 1;
3016 if (b == 1)
3017 h = 1 + square + ((h - 1) << 1);
3018 b += 2;
3019
3020 for (; b2 = (n + b*b) >> 2, 3*b2 < n; b += 2) {
3021 h += (b2 % b == 0)
3022 + is_perfect_square(b2)
3023 + (_count_class_div(b+1, b2) << 1);
3024 }
3025 return 12*h + ((b2*3 == n) ? 4 : square && !(n&1) ? 6 : 0);
3026 }
3027
polygonal_root(UV n,UV k,int * overflow)3028 UV polygonal_root(UV n, UV k, int* overflow) {
3029 UV D, R;
3030 MPUassert(k >= 3, "is_polygonal root < 3");
3031 *overflow = 0;
3032 if (n <= 1) return n;
3033 if (k == 4) return is_perfect_square(n) ? isqrt(n) : 0;
3034 if (k == 3) {
3035 if (n >= UV_MAX/8) *overflow = 1;
3036 D = n << 3;
3037 R = 1;
3038 } else {
3039 if (k > UV_MAX/k || n > UV_MAX/(8*k-16)) *overflow = 1;
3040 D = (8*k-16) * n;
3041 R = (k-4) * (k-4);
3042 }
3043 if (D+R <= D) *overflow = 1;
3044 D += R;
3045 if (*overflow || !is_perfect_square(D)) return 0;
3046 D = isqrt(D) + (k-4);
3047 R = 2*k - 4;
3048 if ((D % R) != 0) return 0;
3049 return D/R;
3050 }
3051
3052 /* These rank/unrank are O(n^2) algorithms using O(n) in-place space.
3053 * Bonet 2008 gives O(n log n) algorithms using a bit more space.
3054 */
3055
num_to_perm(UV k,int n,int * vec)3056 int num_to_perm(UV k, int n, int *vec) {
3057 int i, j, t, si = 0;
3058 UV f = factorial(n-1);
3059
3060 while (f == 0) /* We can handle n! overflow if we have a valid k */
3061 f = factorial(n - 1 - ++si);
3062
3063 if (k/f >= (UV)n)
3064 k %= f*n;
3065
3066 for (i = 0; i < n; i++)
3067 vec[i] = i;
3068 for (i = si; i < n-1; i++) {
3069 UV p = k/f;
3070 k -= p*f;
3071 f /= n-i-1;
3072 if (p > 0) {
3073 for (j = i+p, t = vec[j]; j > i; j--)
3074 vec[j] = vec[j-1];
3075 vec[i] = t;
3076 }
3077 }
3078 return 1;
3079 }
3080
perm_to_num(int n,int * vec,UV * rank)3081 int perm_to_num(int n, int *vec, UV *rank) {
3082 int i, j, k;
3083 UV f, num = 0;
3084 f = factorial(n-1);
3085 if (f == 0) return 0;
3086 for (i = 0; i < n-1; i++) {
3087 for (j = i+1, k = 0; j < n; j++)
3088 if (vec[j] < vec[i])
3089 k++;
3090 if ((UV)k > (UV_MAX-num)/f) return 0; /* overflow */
3091 num += k*f;
3092 f /= n-i-1;
3093 }
3094 *rank = num;
3095 return 1;
3096 }
3097
3098 /*
3099 * For k<n, an O(k) time and space method is shown on page 39 of
3100 * https://www.math.upenn.edu/~wilf/website/CombinatorialAlgorithms.pdf
3101 * Note it requires an O(k) complete shuffle as the results are sorted.
3102 *
3103 * This seems to be 4-100x faster than NumPy's random.{permutation,choice}
3104 * for n under 100k or so. It's even faster with larger n. For example
3105 * from numpy.random import choice; choice(100000000, 4, replace=False)
3106 * uses 774MB and takes 55 seconds. We take less than 1 microsecond.
3107 */
randperm(void * ctx,UV n,UV k,UV * S)3108 void randperm(void* ctx, UV n, UV k, UV *S) {
3109 UV i, j;
3110
3111 if (k > n) k = n;
3112
3113 if (k == 0) { /* 0 of n */
3114 } else if (k == 1) { /* 1 of n. Pick one at random */
3115 S[0] = urandomm64(ctx,n);
3116 } else if (k == 2 && n == 2) { /* 2 of 2. Flip a coin */
3117 S[0] = urandomb(ctx,1);
3118 S[1] = 1-S[0];
3119 } else if (k == 2) { /* 2 of n. Pick 2 skipping dup */
3120 S[0] = urandomm64(ctx,n);
3121 S[1] = urandomm64(ctx,n-1);
3122 if (S[1] >= S[0]) S[1]++;
3123 } else if (k < n/100 && k < 30) { /* k of n. Pick k with loop */
3124 for (i = 0; i < k; i++) {
3125 do {
3126 S[i] = urandomm64(ctx,n);
3127 for (j = 0; j < i; j++)
3128 if (S[j] == S[i])
3129 break;
3130 } while (j < i);
3131 }
3132 } else if (k < n/100 && n > 1000000) {/* k of n. Pick k with dedup retry */
3133 for (j = 0; j < k; ) {
3134 for (i = j; i < k; i++) /* Fill S[j .. k-1] then sort S */
3135 S[i] = urandomm64(ctx,n);
3136 qsort(S, k, sizeof(UV), _numcmp);
3137 for (j = 0, i = 1; i < k; i++) /* Find and remove dups. O(n). */
3138 if (S[j] != S[i])
3139 S[++j] = S[i];
3140 j++;
3141 }
3142 /* S is sorted unique k-selection of 0 to n-1. Shuffle. */
3143 for (i = 0; i < k; i++) {
3144 j = urandomm64(ctx,k-i);
3145 { UV t = S[i]; S[i] = S[i+j]; S[i+j] = t; }
3146 }
3147 } else if (k < n/4) { /* k of n. Pick k with mask */
3148 uint32_t *mask, smask[8] = {0};
3149 if (n <= 32*8) mask = smask;
3150 else Newz(0, mask, n/32 + ((n%32)?1:0), uint32_t);
3151 for (i = 0; i < k; i++) {
3152 do {
3153 j = urandomm64(ctx,n);
3154 } while ( mask[j>>5] & (1U << (j&0x1F)) );
3155 S[i] = j;
3156 mask[j>>5] |= (1U << (j&0x1F));
3157 }
3158 if (mask != smask) Safefree(mask);
3159 } else if (k < n) { /* k of n. FYK shuffle n, pick k */
3160 UV *T;
3161 New(0, T, n, UV);
3162 for (i = 0; i < n; i++)
3163 T[i] = i;
3164 for (i = 0; i < k && i <= n-2; i++) {
3165 j = urandomm64(ctx,n-i);
3166 S[i] = T[i+j];
3167 T[i+j] = T[i];
3168 }
3169 Safefree(T);
3170 } else { /* n of n. FYK shuffle. */
3171 for (i = 0; i < n; i++)
3172 S[i] = i;
3173 for (i = 0; i < k && i <= n-2; i++) {
3174 j = urandomm64(ctx,n-i);
3175 { UV t = S[i]; S[i] = S[i+j]; S[i+j] = t; }
3176 }
3177 }
3178 }
3179
random_factored_integer(void * ctx,UV n,int * nf,UV * factors)3180 UV random_factored_integer(void* ctx, UV n, int *nf, UV *factors) {
3181 UV r, s, nfac;
3182 if (n < 1)
3183 return 0;
3184 #if BITS_PER_WORD == 64 && (USE_MONTMATH || MULMODS_ARE_FAST)
3185 if (1) /* Our factoring is very fast, just use it */
3186 #elif BITS_PER_WORD == 64
3187 if (n < UVCONST(1000000000000))
3188 #endif
3189 {
3190 r = 1 + urandomm64(ctx, n);
3191 *nf = factor(r, factors);
3192 return r;
3193 }
3194 do { /* Kalai's algorithm */
3195 for (s = n, r = 1, nfac = 0; s > 1; ) {
3196 s = 1 + urandomm64(ctx, s);
3197 if (!is_prime(s)) continue;
3198 if (s > n / r) { r = 0; break; } /* overflow */
3199 r *= s;
3200 factors[nfac++] = s;
3201 }
3202 } while (r == 0 || r > n || (1 + urandomm64(ctx,n)) > r);
3203 *nf = nfac;
3204 return r;
3205 }
3206