1 /* factor -- print prime factors of n.
2 Copyright (C) 1986-2020 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
16
17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
18 Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
19 Arbitrary-precision code adapted by James Youngman from Torbjörn
20 Granlund's factorize.c, from GNU MP version 4.2.2.
21 In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
22 Contains code from GNU MP. */
23
24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
25 or, with GMP, numbers of any size.
26
27 Code organisation:
28
29 There are several variants of many functions, for handling one word, two
30 words, and GMP's mpz_t type. If the one-word variant is called foo, the
31 two-word variant will be foo2, and the one for mpz_t will be mp_foo. In
32 some cases, the plain function variants will handle both one-word and
33 two-word numbers, evidenced by function arguments.
34
35 The factoring code for two words will fall into the code for one word when
36 progress allows that.
37
38 Using GMP is optional. Define HAVE_GMP to make this code include GMP
39 factoring code. The GMP factoring code is based on GMP's demos/factorize.c
40 (last synced 2012-09-07). The GMP-based factoring code will stay in GMP
41 factoring code even if numbers get small enough for using the two-word
42 code.
43
44 Algorithm:
45
46 (1) Perform trial division using a small primes table, but without hardware
47 division since the primes table store inverses modulo the word base.
48 (The GMP variant of this code doesn't make use of the precomputed
49 inverses, but instead relies on GMP for fast divisibility testing.)
50 (2) Check the nature of any non-factored part using Miller-Rabin for
51 detecting composites, and Lucas for detecting primes.
52 (3) Factor any remaining composite part using the Pollard-Brent rho
53 algorithm or if USE_SQUFOF is defined to 1, try that first.
54 Status of found factors are checked again using Miller-Rabin and Lucas.
55
56 We prefer using Hensel norm in the divisions, not the more familiar
57 Euclidian norm, since the former leads to much faster code. In the
58 Pollard-Brent rho code and the prime testing code, we use Montgomery's
59 trick of multiplying all n-residues by the word base, allowing cheap Hensel
60 reductions mod n.
61
62 Improvements:
63
64 * Use modular inverses also for exact division in the Lucas code, and
65 elsewhere. A problem is to locate the inverses not from an index, but
66 from a prime. We might instead compute the inverse on-the-fly.
67
68 * Tune trial division table size (not forgetting that this is a standalone
69 program where the table will be read from disk for each invocation).
70
71 * Implement less naive powm, using k-ary exponentiation for k = 3 or
72 perhaps k = 4.
73
74 * Try to speed trial division code for single uintmax_t numbers, i.e., the
75 code using DIVBLOCK. It currently runs at 2 cycles per prime (Intel SBR,
76 IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
77 using gcc 4.6 and 4.7. Some software pipelining should help; 1, 2, and 4
78 respectively cycles ought to be possible.
79
80 * The redcify function could be vastly improved by using (plain Euclidian)
81 pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
82 GMP's gmp-impl.h). The redcify2 function could be vastly improved using
83 similar methoods. These functions currently dominate run time when using
84 the -w option.
85 */
86
87 /* Whether to recursively factor to prove primality,
88 or run faster probabilistic tests. */
89 #ifndef PROVE_PRIMALITY
90 # define PROVE_PRIMALITY 1
91 #endif
92
93 /* Faster for certain ranges but less general. */
94 #ifndef USE_SQUFOF
95 # define USE_SQUFOF 0
96 #endif
97
98 /* Output SQUFOF statistics. */
99 #ifndef STAT_SQUFOF
100 # define STAT_SQUFOF 0
101 #endif
102
103
104 #include <config.h>
105 #include <getopt.h>
106 #include <stdio.h>
107 #if HAVE_GMP
108 # include <gmp.h>
109 # if !HAVE_DECL_MPZ_INITS
110 # include <stdarg.h>
111 # endif
112 #endif
113
114 #include <assert.h>
115
116 #include "system.h"
117 #include "die.h"
118 #include "error.h"
119 #include "full-write.h"
120 #include "quote.h"
121 #include "readtokens.h"
122 #include "xstrtol.h"
123
124 /* The official name of this program (e.g., no 'g' prefix). */
125 #define PROGRAM_NAME "factor"
126
127 #define AUTHORS \
128 proper_name ("Paul Rubin"), \
129 proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"), \
130 proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
131
132 /* Token delimiters when reading from a file. */
133 #define DELIM "\n\t "
134
135 #ifndef USE_LONGLONG_H
136 /* With the way we use longlong.h, it's only safe to use
137 when UWtype = UHWtype, as there were various cases
138 (as can be seen in the history for longlong.h) where
139 for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
140 to avoid compile time or run time issues. */
141 # if LONG_MAX == INTMAX_MAX
142 # define USE_LONGLONG_H 1
143 # endif
144 #endif
145
146 #if USE_LONGLONG_H
147
148 /* Make definitions for longlong.h to make it do what it can do for us */
149
150 /* bitcount for uintmax_t */
151 # if UINTMAX_MAX == UINT32_MAX
152 # define W_TYPE_SIZE 32
153 # elif UINTMAX_MAX == UINT64_MAX
154 # define W_TYPE_SIZE 64
155 # elif UINTMAX_MAX == UINT128_MAX
156 # define W_TYPE_SIZE 128
157 # endif
158
159 # define UWtype uintmax_t
160 # define UHWtype unsigned long int
161 # undef UDWtype
162 # if HAVE_ATTRIBUTE_MODE
163 typedef unsigned int UQItype __attribute__ ((mode (QI)));
164 typedef int SItype __attribute__ ((mode (SI)));
165 typedef unsigned int USItype __attribute__ ((mode (SI)));
166 typedef int DItype __attribute__ ((mode (DI)));
167 typedef unsigned int UDItype __attribute__ ((mode (DI)));
168 # else
169 typedef unsigned char UQItype;
170 typedef long SItype;
171 typedef unsigned long int USItype;
172 # if HAVE_LONG_LONG_INT
173 typedef long long int DItype;
174 typedef unsigned long long int UDItype;
175 # else /* Assume `long' gives us a wide enough type. Needed for hppa2.0w. */
176 typedef long int DItype;
177 typedef unsigned long int UDItype;
178 # endif
179 # endif
180 # define LONGLONG_STANDALONE /* Don't require GMP's longlong.h mdep files */
181 # define ASSERT(x) /* FIXME make longlong.h really standalone */
182 # define __GMP_DECLSPEC /* FIXME make longlong.h really standalone */
183 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
184 # ifndef __GMP_GNUC_PREREQ
185 # define __GMP_GNUC_PREREQ(a,b) 1
186 # endif
187
188 /* These stub macros are only used in longlong.h in certain system compiler
189 combinations, so ensure usage to avoid -Wunused-macros warnings. */
190 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
191 ASSERT (1)
192 __GMP_DECLSPEC
193 # endif
194
195 # if _ARCH_PPC
196 # define HAVE_HOST_CPU_FAMILY_powerpc 1
197 # endif
198 # include "longlong.h"
199 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
200 const unsigned char factor_clz_tab[129] =
201 {
202 1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
203 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
204 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
205 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
206 9
207 };
208 # endif
209
210 #else /* not USE_LONGLONG_H */
211
212 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
213 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
214 # define __ll_lowpart(t) ((uintmax_t) (t) & (__ll_B - 1))
215 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
216
217 #endif
218
219 #if !defined __clz_tab && !defined UHWtype
220 /* Without this seemingly useless conditional, gcc -Wunused-macros
221 warns that each of the two tested macros is unused on Fedora 18.
222 FIXME: this is just an ugly band-aid. Fix it properly. */
223 #endif
224
225 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
226 #define MAX_NFACTS 26
227
228 enum
229 {
230 DEV_DEBUG_OPTION = CHAR_MAX + 1
231 };
232
233 static struct option const long_options[] =
234 {
235 {"-debug", no_argument, NULL, DEV_DEBUG_OPTION},
236 {GETOPT_HELP_OPTION_DECL},
237 {GETOPT_VERSION_OPTION_DECL},
238 {NULL, 0, NULL, 0}
239 };
240
241 struct factors
242 {
243 uintmax_t plarge[2]; /* Can have a single large factor */
244 uintmax_t p[MAX_NFACTS];
245 unsigned char e[MAX_NFACTS];
246 unsigned char nfactors;
247 };
248
249 #if HAVE_GMP
250 struct mp_factors
251 {
252 mpz_t *p;
253 unsigned long int *e;
254 unsigned long int nfactors;
255 };
256 #endif
257
258 static void factor (uintmax_t, uintmax_t, struct factors *);
259
260 #ifndef umul_ppmm
261 # define umul_ppmm(w1, w0, u, v) \
262 do { \
263 uintmax_t __x0, __x1, __x2, __x3; \
264 unsigned long int __ul, __vl, __uh, __vh; \
265 uintmax_t __u = (u), __v = (v); \
266 \
267 __ul = __ll_lowpart (__u); \
268 __uh = __ll_highpart (__u); \
269 __vl = __ll_lowpart (__v); \
270 __vh = __ll_highpart (__v); \
271 \
272 __x0 = (uintmax_t) __ul * __vl; \
273 __x1 = (uintmax_t) __ul * __vh; \
274 __x2 = (uintmax_t) __uh * __vl; \
275 __x3 = (uintmax_t) __uh * __vh; \
276 \
277 __x1 += __ll_highpart (__x0);/* this can't give carry */ \
278 __x1 += __x2; /* but this indeed can */ \
279 if (__x1 < __x2) /* did we get it? */ \
280 __x3 += __ll_B; /* yes, add it in the proper pos. */ \
281 \
282 (w1) = __x3 + __ll_highpart (__x1); \
283 (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0); \
284 } while (0)
285 #endif
286
287 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
288 /* Define our own, not needing normalization. This function is
289 currently not performance critical, so keep it simple. Similar to
290 the mod macro below. */
291 # undef udiv_qrnnd
292 # define udiv_qrnnd(q, r, n1, n0, d) \
293 do { \
294 uintmax_t __d1, __d0, __q, __r1, __r0; \
295 \
296 assert ((n1) < (d)); \
297 __d1 = (d); __d0 = 0; \
298 __r1 = (n1); __r0 = (n0); \
299 __q = 0; \
300 for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--) \
301 { \
302 rsh2 (__d1, __d0, __d1, __d0, 1); \
303 __q <<= 1; \
304 if (ge2 (__r1, __r0, __d1, __d0)) \
305 { \
306 __q++; \
307 sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0); \
308 } \
309 } \
310 (r) = __r0; \
311 (q) = __q; \
312 } while (0)
313 #endif
314
315 #if !defined add_ssaaaa
316 # define add_ssaaaa(sh, sl, ah, al, bh, bl) \
317 do { \
318 uintmax_t _add_x; \
319 _add_x = (al) + (bl); \
320 (sh) = (ah) + (bh) + (_add_x < (al)); \
321 (sl) = _add_x; \
322 } while (0)
323 #endif
324
325 #define rsh2(rh, rl, ah, al, cnt) \
326 do { \
327 (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt)); \
328 (rh) = (ah) >> (cnt); \
329 } while (0)
330
331 #define lsh2(rh, rl, ah, al, cnt) \
332 do { \
333 (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt))); \
334 (rl) = (al) << (cnt); \
335 } while (0)
336
337 #define ge2(ah, al, bh, bl) \
338 ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
339
340 #define gt2(ah, al, bh, bl) \
341 ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
342
343 #ifndef sub_ddmmss
344 # define sub_ddmmss(rh, rl, ah, al, bh, bl) \
345 do { \
346 uintmax_t _cy; \
347 _cy = (al) < (bl); \
348 (rl) = (al) - (bl); \
349 (rh) = (ah) - (bh) - _cy; \
350 } while (0)
351 #endif
352
353 #ifndef count_leading_zeros
354 # define count_leading_zeros(count, x) do { \
355 uintmax_t __clz_x = (x); \
356 unsigned int __clz_c; \
357 for (__clz_c = 0; \
358 (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0; \
359 __clz_c += 8) \
360 __clz_x <<= 8; \
361 for (; (intmax_t)__clz_x >= 0; __clz_c++) \
362 __clz_x <<= 1; \
363 (count) = __clz_c; \
364 } while (0)
365 #endif
366
367 #ifndef count_trailing_zeros
368 # define count_trailing_zeros(count, x) do { \
369 uintmax_t __ctz_x = (x); \
370 unsigned int __ctz_c = 0; \
371 while ((__ctz_x & 1) == 0) \
372 { \
373 __ctz_x >>= 1; \
374 __ctz_c++; \
375 } \
376 (count) = __ctz_c; \
377 } while (0)
378 #endif
379
380 /* Requires that a < n and b <= n */
381 #define submod(r,a,b,n) \
382 do { \
383 uintmax_t _t = - (uintmax_t) (a < b); \
384 (r) = ((n) & _t) + (a) - (b); \
385 } while (0)
386
387 #define addmod(r,a,b,n) \
388 submod ((r), (a), ((n) - (b)), (n))
389
390 /* Modular two-word addition and subtraction. For performance reasons, the
391 most significant bit of n1 must be clear. The destination variables must be
392 distinct from the mod operand. */
393 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0) \
394 do { \
395 add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0)); \
396 if (ge2 ((r1), (r0), (n1), (n0))) \
397 sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0)); \
398 } while (0)
399 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0) \
400 do { \
401 sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0)); \
402 if ((intmax_t) (r1) < 0) \
403 add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0)); \
404 } while (0)
405
406 #define HIGHBIT_TO_MASK(x) \
407 (((intmax_t)-1 >> 1) < 0 \
408 ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1)) \
409 : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1)) \
410 ? UINTMAX_MAX : (uintmax_t) 0))
411
412 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
413 Requires that d1 != 0. */
414 static uintmax_t
mod2(uintmax_t * r1,uintmax_t a1,uintmax_t a0,uintmax_t d1,uintmax_t d0)415 mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
416 {
417 int cntd, cnta;
418
419 assert (d1 != 0);
420
421 if (a1 == 0)
422 {
423 *r1 = 0;
424 return a0;
425 }
426
427 count_leading_zeros (cntd, d1);
428 count_leading_zeros (cnta, a1);
429 int cnt = cntd - cnta;
430 lsh2 (d1, d0, d1, d0, cnt);
431 for (int i = 0; i < cnt; i++)
432 {
433 if (ge2 (a1, a0, d1, d0))
434 sub_ddmmss (a1, a0, a1, a0, d1, d0);
435 rsh2 (d1, d0, d1, d0, 1);
436 }
437
438 *r1 = a1;
439 return a0;
440 }
441
442 static uintmax_t _GL_ATTRIBUTE_CONST
gcd_odd(uintmax_t a,uintmax_t b)443 gcd_odd (uintmax_t a, uintmax_t b)
444 {
445 if ( (b & 1) == 0)
446 {
447 uintmax_t t = b;
448 b = a;
449 a = t;
450 }
451 if (a == 0)
452 return b;
453
454 /* Take out least significant one bit, to make room for sign */
455 b >>= 1;
456
457 for (;;)
458 {
459 uintmax_t t;
460 uintmax_t bgta;
461
462 while ((a & 1) == 0)
463 a >>= 1;
464 a >>= 1;
465
466 t = a - b;
467 if (t == 0)
468 return (a << 1) + 1;
469
470 bgta = HIGHBIT_TO_MASK (t);
471
472 /* b <-- min (a, b) */
473 b += (bgta & t);
474
475 /* a <-- |a - b| */
476 a = (t ^ bgta) - bgta;
477 }
478 }
479
480 static uintmax_t
gcd2_odd(uintmax_t * r1,uintmax_t a1,uintmax_t a0,uintmax_t b1,uintmax_t b0)481 gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
482 {
483 assert (b0 & 1);
484
485 if ( (a0 | a1) == 0)
486 {
487 *r1 = b1;
488 return b0;
489 }
490
491 while ((a0 & 1) == 0)
492 rsh2 (a1, a0, a1, a0, 1);
493
494 for (;;)
495 {
496 if ((b1 | a1) == 0)
497 {
498 *r1 = 0;
499 return gcd_odd (b0, a0);
500 }
501
502 if (gt2 (a1, a0, b1, b0))
503 {
504 sub_ddmmss (a1, a0, a1, a0, b1, b0);
505 do
506 rsh2 (a1, a0, a1, a0, 1);
507 while ((a0 & 1) == 0);
508 }
509 else if (gt2 (b1, b0, a1, a0))
510 {
511 sub_ddmmss (b1, b0, b1, b0, a1, a0);
512 do
513 rsh2 (b1, b0, b1, b0, 1);
514 while ((b0 & 1) == 0);
515 }
516 else
517 break;
518 }
519
520 *r1 = a1;
521 return a0;
522 }
523
524 static void
factor_insert_multiplicity(struct factors * factors,uintmax_t prime,unsigned int m)525 factor_insert_multiplicity (struct factors *factors,
526 uintmax_t prime, unsigned int m)
527 {
528 unsigned int nfactors = factors->nfactors;
529 uintmax_t *p = factors->p;
530 unsigned char *e = factors->e;
531
532 /* Locate position for insert new or increment e. */
533 int i;
534 for (i = nfactors - 1; i >= 0; i--)
535 {
536 if (p[i] <= prime)
537 break;
538 }
539
540 if (i < 0 || p[i] != prime)
541 {
542 for (int j = nfactors - 1; j > i; j--)
543 {
544 p[j + 1] = p[j];
545 e[j + 1] = e[j];
546 }
547 p[i + 1] = prime;
548 e[i + 1] = m;
549 factors->nfactors = nfactors + 1;
550 }
551 else
552 {
553 e[i] += m;
554 }
555 }
556
557 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
558
559 static void
factor_insert_large(struct factors * factors,uintmax_t p1,uintmax_t p0)560 factor_insert_large (struct factors *factors,
561 uintmax_t p1, uintmax_t p0)
562 {
563 if (p1 > 0)
564 {
565 assert (factors->plarge[1] == 0);
566 factors->plarge[0] = p0;
567 factors->plarge[1] = p1;
568 }
569 else
570 factor_insert (factors, p0);
571 }
572
573 #if HAVE_GMP
574
575 # if !HAVE_DECL_MPZ_INITS
576
577 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
578 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
579
580 static void
mpz_va_init(void (* mpz_single_init)(mpz_t),...)581 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
582 {
583 va_list ap;
584
585 va_start (ap, mpz_single_init);
586
587 mpz_t *mpz;
588 while ((mpz = va_arg (ap, mpz_t *)))
589 mpz_single_init (*mpz);
590
591 va_end (ap);
592 }
593 # endif
594
595 static void mp_factor (mpz_t, struct mp_factors *);
596
597 static void
mp_factor_init(struct mp_factors * factors)598 mp_factor_init (struct mp_factors *factors)
599 {
600 factors->p = NULL;
601 factors->e = NULL;
602 factors->nfactors = 0;
603 }
604
605 static void
mp_factor_clear(struct mp_factors * factors)606 mp_factor_clear (struct mp_factors *factors)
607 {
608 for (unsigned int i = 0; i < factors->nfactors; i++)
609 mpz_clear (factors->p[i]);
610
611 free (factors->p);
612 free (factors->e);
613 }
614
615 static void
mp_factor_insert(struct mp_factors * factors,mpz_t prime)616 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
617 {
618 unsigned long int nfactors = factors->nfactors;
619 mpz_t *p = factors->p;
620 unsigned long int *e = factors->e;
621 long i;
622
623 /* Locate position for insert new or increment e. */
624 for (i = nfactors - 1; i >= 0; i--)
625 {
626 if (mpz_cmp (p[i], prime) <= 0)
627 break;
628 }
629
630 if (i < 0 || mpz_cmp (p[i], prime) != 0)
631 {
632 p = xrealloc (p, (nfactors + 1) * sizeof p[0]);
633 e = xrealloc (e, (nfactors + 1) * sizeof e[0]);
634
635 mpz_init (p[nfactors]);
636 for (long j = nfactors - 1; j > i; j--)
637 {
638 mpz_set (p[j + 1], p[j]);
639 e[j + 1] = e[j];
640 }
641 mpz_set (p[i + 1], prime);
642 e[i + 1] = 1;
643
644 factors->p = p;
645 factors->e = e;
646 factors->nfactors = nfactors + 1;
647 }
648 else
649 {
650 e[i] += 1;
651 }
652 }
653
654 static void
mp_factor_insert_ui(struct mp_factors * factors,unsigned long int prime)655 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
656 {
657 mpz_t pz;
658
659 mpz_init_set_ui (pz, prime);
660 mp_factor_insert (factors, pz);
661 mpz_clear (pz);
662 }
663 #endif /* HAVE_GMP */
664
665
666 /* Number of bits in an uintmax_t. */
667 enum { W = sizeof (uintmax_t) * CHAR_BIT };
668
669 /* Verify that uintmax_t does not have holes in its representation. */
670 verify (UINTMAX_MAX >> (W - 1) == 1);
671
672 #define P(a,b,c,d) a,
673 static const unsigned char primes_diff[] = {
674 #include "primes.h"
675 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
676 };
677 #undef P
678
679 #define PRIMES_PTAB_ENTRIES \
680 (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
681
682 #define P(a,b,c,d) b,
683 static const unsigned char primes_diff8[] = {
684 #include "primes.h"
685 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
686 };
687 #undef P
688
689 struct primes_dtab
690 {
691 uintmax_t binv, lim;
692 };
693
694 #define P(a,b,c,d) {c,d},
695 static const struct primes_dtab primes_dtab[] = {
696 #include "primes.h"
697 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
698 };
699 #undef P
700
701 /* Verify that uintmax_t is not wider than
702 the integers used to generate primes.h. */
703 verify (W <= WIDE_UINT_BITS);
704
705 /* debugging for developers. Enables devmsg().
706 This flag is used only in the GMP code. */
707 static bool dev_debug = false;
708
709 /* Prove primality or run probabilistic tests. */
710 static bool flag_prove_primality = PROVE_PRIMALITY;
711
712 /* Number of Miller-Rabin tests to run when not proving primality. */
713 #define MR_REPS 25
714
715 static void
factor_insert_refind(struct factors * factors,uintmax_t p,unsigned int i,unsigned int off)716 factor_insert_refind (struct factors *factors, uintmax_t p, unsigned int i,
717 unsigned int off)
718 {
719 for (unsigned int j = 0; j < off; j++)
720 p += primes_diff[i + j];
721 factor_insert (factors, p);
722 }
723
724 /* Trial division with odd primes uses the following trick.
725
726 Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
727 consider the case t < B (this is the second loop below).
728
729 From our tables we get
730
731 binv = p^{-1} (mod B)
732 lim = floor ( (B-1) / p ).
733
734 First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
735 (and all quotients in this range occur for some t).
736
737 Then t = q * p is true also (mod B), and p is invertible we get
738
739 q = t * binv (mod B).
740
741 Next, assume that t is *not* divisible by p. Since multiplication
742 by binv (mod B) is a one-to-one mapping,
743
744 t * binv (mod B) > lim,
745
746 because all the smaller values are already taken.
747
748 This can be summed up by saying that the function
749
750 q(t) = binv * t (mod B)
751
752 is a permutation of the range 0 <= t < B, with the curious property
753 that it maps the multiples of p onto the range 0 <= q <= lim, in
754 order, and the non-multiples of p onto the range lim < q < B.
755 */
756
757 static uintmax_t
factor_using_division(uintmax_t * t1p,uintmax_t t1,uintmax_t t0,struct factors * factors)758 factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
759 struct factors *factors)
760 {
761 if (t0 % 2 == 0)
762 {
763 unsigned int cnt;
764
765 if (t0 == 0)
766 {
767 count_trailing_zeros (cnt, t1);
768 t0 = t1 >> cnt;
769 t1 = 0;
770 cnt += W_TYPE_SIZE;
771 }
772 else
773 {
774 count_trailing_zeros (cnt, t0);
775 rsh2 (t1, t0, t1, t0, cnt);
776 }
777
778 factor_insert_multiplicity (factors, 2, cnt);
779 }
780
781 uintmax_t p = 3;
782 unsigned int i;
783 for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
784 {
785 for (;;)
786 {
787 uintmax_t q1, q0, hi, lo _GL_UNUSED;
788
789 q0 = t0 * primes_dtab[i].binv;
790 umul_ppmm (hi, lo, q0, p);
791 if (hi > t1)
792 break;
793 hi = t1 - hi;
794 q1 = hi * primes_dtab[i].binv;
795 if (LIKELY (q1 > primes_dtab[i].lim))
796 break;
797 t1 = q1; t0 = q0;
798 factor_insert (factors, p);
799 }
800 p += primes_diff[i + 1];
801 }
802 if (t1p)
803 *t1p = t1;
804
805 #define DIVBLOCK(I) \
806 do { \
807 for (;;) \
808 { \
809 q = t0 * pd[I].binv; \
810 if (LIKELY (q > pd[I].lim)) \
811 break; \
812 t0 = q; \
813 factor_insert_refind (factors, p, i + 1, I); \
814 } \
815 } while (0)
816
817 for (; i < PRIMES_PTAB_ENTRIES; i += 8)
818 {
819 uintmax_t q;
820 const struct primes_dtab *pd = &primes_dtab[i];
821 DIVBLOCK (0);
822 DIVBLOCK (1);
823 DIVBLOCK (2);
824 DIVBLOCK (3);
825 DIVBLOCK (4);
826 DIVBLOCK (5);
827 DIVBLOCK (6);
828 DIVBLOCK (7);
829
830 p += primes_diff8[i];
831 if (p * p > t0)
832 break;
833 }
834
835 return t0;
836 }
837
838 #if HAVE_GMP
839 static void
mp_factor_using_division(mpz_t t,struct mp_factors * factors)840 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
841 {
842 mpz_t q;
843 unsigned long int p;
844
845 devmsg ("[trial division] ");
846
847 mpz_init (q);
848
849 p = mpz_scan1 (t, 0);
850 mpz_div_2exp (t, t, p);
851 while (p)
852 {
853 mp_factor_insert_ui (factors, 2);
854 --p;
855 }
856
857 p = 3;
858 for (unsigned int i = 1; i <= PRIMES_PTAB_ENTRIES;)
859 {
860 if (! mpz_divisible_ui_p (t, p))
861 {
862 p += primes_diff[i++];
863 if (mpz_cmp_ui (t, p * p) < 0)
864 break;
865 }
866 else
867 {
868 mpz_tdiv_q_ui (t, t, p);
869 mp_factor_insert_ui (factors, p);
870 }
871 }
872
873 mpz_clear (q);
874 }
875 #endif
876
877 /* Entry i contains (2i+1)^(-1) mod 2^8. */
878 static const unsigned char binvert_table[128] =
879 {
880 0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
881 0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
882 0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
883 0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
884 0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
885 0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
886 0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
887 0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
888 0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
889 0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
890 0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
891 0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
892 0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
893 0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
894 0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
895 0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
896 };
897
898 /* Compute n^(-1) mod B, using a Newton iteration. */
899 #define binv(inv,n) \
900 do { \
901 uintmax_t __n = (n); \
902 uintmax_t __inv; \
903 \
904 __inv = binvert_table[(__n / 2) & 0x7F]; /* 8 */ \
905 if (W_TYPE_SIZE > 8) __inv = 2 * __inv - __inv * __inv * __n; \
906 if (W_TYPE_SIZE > 16) __inv = 2 * __inv - __inv * __inv * __n; \
907 if (W_TYPE_SIZE > 32) __inv = 2 * __inv - __inv * __inv * __n; \
908 \
909 if (W_TYPE_SIZE > 64) \
910 { \
911 int __invbits = 64; \
912 do { \
913 __inv = 2 * __inv - __inv * __inv * __n; \
914 __invbits *= 2; \
915 } while (__invbits < W_TYPE_SIZE); \
916 } \
917 \
918 (inv) = __inv; \
919 } while (0)
920
921 /* q = u / d, assuming d|u. */
922 #define divexact_21(q1, q0, u1, u0, d) \
923 do { \
924 uintmax_t _di, _q0; \
925 binv (_di, (d)); \
926 _q0 = (u0) * _di; \
927 if ((u1) >= (d)) \
928 { \
929 uintmax_t _p1, _p0 _GL_UNUSED; \
930 umul_ppmm (_p1, _p0, _q0, d); \
931 (q1) = ((u1) - _p1) * _di; \
932 (q0) = _q0; \
933 } \
934 else \
935 { \
936 (q0) = _q0; \
937 (q1) = 0; \
938 } \
939 } while (0)
940
941 /* x B (mod n). */
942 #define redcify(r_prim, r, n) \
943 do { \
944 uintmax_t _redcify_q _GL_UNUSED; \
945 udiv_qrnnd (_redcify_q, r_prim, r, 0, n); \
946 } while (0)
947
948 /* x B^2 (mod n). Requires x > 0, n1 < B/2 */
949 #define redcify2(r1, r0, x, n1, n0) \
950 do { \
951 uintmax_t _r1, _r0, _i; \
952 if ((x) < (n1)) \
953 { \
954 _r1 = (x); _r0 = 0; \
955 _i = W_TYPE_SIZE; \
956 } \
957 else \
958 { \
959 _r1 = 0; _r0 = (x); \
960 _i = 2*W_TYPE_SIZE; \
961 } \
962 while (_i-- > 0) \
963 { \
964 lsh2 (_r1, _r0, _r1, _r0, 1); \
965 if (ge2 (_r1, _r0, (n1), (n0))) \
966 sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0)); \
967 } \
968 (r1) = _r1; \
969 (r0) = _r0; \
970 } while (0)
971
972 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
973 Both a and b must be in redc form, the result will be in redc form too. */
974 static inline uintmax_t
mulredc(uintmax_t a,uintmax_t b,uintmax_t m,uintmax_t mi)975 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
976 {
977 uintmax_t rh, rl, q, th, tl _GL_UNUSED, xh;
978
979 umul_ppmm (rh, rl, a, b);
980 q = rl * mi;
981 umul_ppmm (th, tl, q, m);
982 xh = rh - th;
983 if (rh < th)
984 xh += m;
985
986 return xh;
987 }
988
989 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
990 Both a and b must be in redc form, the result will be in redc form too.
991 For performance reasons, the most significant bit of m must be clear. */
992 static uintmax_t
mulredc2(uintmax_t * r1p,uintmax_t a1,uintmax_t a0,uintmax_t b1,uintmax_t b0,uintmax_t m1,uintmax_t m0,uintmax_t mi)993 mulredc2 (uintmax_t *r1p,
994 uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
995 uintmax_t m1, uintmax_t m0, uintmax_t mi)
996 {
997 uintmax_t r1, r0, q, p1, p0 _GL_UNUSED, t1, t0, s1, s0;
998 mi = -mi;
999 assert ( (a1 >> (W_TYPE_SIZE - 1)) == 0);
1000 assert ( (b1 >> (W_TYPE_SIZE - 1)) == 0);
1001 assert ( (m1 >> (W_TYPE_SIZE - 1)) == 0);
1002
1003 /* First compute a0 * <b1, b0> B^{-1}
1004 +-----+
1005 |a0 b0|
1006 +--+--+--+
1007 |a0 b1|
1008 +--+--+--+
1009 |q0 m0|
1010 +--+--+--+
1011 |q0 m1|
1012 -+--+--+--+
1013 |r1|r0| 0|
1014 +--+--+--+
1015 */
1016 umul_ppmm (t1, t0, a0, b0);
1017 umul_ppmm (r1, r0, a0, b1);
1018 q = mi * t0;
1019 umul_ppmm (p1, p0, q, m0);
1020 umul_ppmm (s1, s0, q, m1);
1021 r0 += (t0 != 0); /* Carry */
1022 add_ssaaaa (r1, r0, r1, r0, 0, p1);
1023 add_ssaaaa (r1, r0, r1, r0, 0, t1);
1024 add_ssaaaa (r1, r0, r1, r0, s1, s0);
1025
1026 /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1027 +-----+
1028 |a1 b0|
1029 +--+--+
1030 |r1|r0|
1031 +--+--+--+
1032 |a1 b1|
1033 +--+--+--+
1034 |q1 m0|
1035 +--+--+--+
1036 |q1 m1|
1037 -+--+--+--+
1038 |r1|r0| 0|
1039 +--+--+--+
1040 */
1041 umul_ppmm (t1, t0, a1, b0);
1042 umul_ppmm (s1, s0, a1, b1);
1043 add_ssaaaa (t1, t0, t1, t0, 0, r0);
1044 q = mi * t0;
1045 add_ssaaaa (r1, r0, s1, s0, 0, r1);
1046 umul_ppmm (p1, p0, q, m0);
1047 umul_ppmm (s1, s0, q, m1);
1048 r0 += (t0 != 0); /* Carry */
1049 add_ssaaaa (r1, r0, r1, r0, 0, p1);
1050 add_ssaaaa (r1, r0, r1, r0, 0, t1);
1051 add_ssaaaa (r1, r0, r1, r0, s1, s0);
1052
1053 if (ge2 (r1, r0, m1, m0))
1054 sub_ddmmss (r1, r0, r1, r0, m1, m0);
1055
1056 *r1p = r1;
1057 return r0;
1058 }
1059
1060 static uintmax_t _GL_ATTRIBUTE_CONST
powm(uintmax_t b,uintmax_t e,uintmax_t n,uintmax_t ni,uintmax_t one)1061 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1062 {
1063 uintmax_t y = one;
1064
1065 if (e & 1)
1066 y = b;
1067
1068 while (e != 0)
1069 {
1070 b = mulredc (b, b, n, ni);
1071 e >>= 1;
1072
1073 if (e & 1)
1074 y = mulredc (y, b, n, ni);
1075 }
1076
1077 return y;
1078 }
1079
1080 static uintmax_t
powm2(uintmax_t * r1m,const uintmax_t * bp,const uintmax_t * ep,const uintmax_t * np,uintmax_t ni,const uintmax_t * one)1081 powm2 (uintmax_t *r1m,
1082 const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1083 uintmax_t ni, const uintmax_t *one)
1084 {
1085 uintmax_t r1, r0, b1, b0, n1, n0;
1086 unsigned int i;
1087 uintmax_t e;
1088
1089 b0 = bp[0];
1090 b1 = bp[1];
1091 n0 = np[0];
1092 n1 = np[1];
1093
1094 r0 = one[0];
1095 r1 = one[1];
1096
1097 for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1098 {
1099 if (e & 1)
1100 {
1101 r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1102 r1 = *r1m;
1103 }
1104 b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1105 b1 = *r1m;
1106 }
1107 for (e = ep[1]; e > 0; e >>= 1)
1108 {
1109 if (e & 1)
1110 {
1111 r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1112 r1 = *r1m;
1113 }
1114 b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1115 b1 = *r1m;
1116 }
1117 *r1m = r1;
1118 return r0;
1119 }
1120
1121 static bool _GL_ATTRIBUTE_CONST
millerrabin(uintmax_t n,uintmax_t ni,uintmax_t b,uintmax_t q,unsigned int k,uintmax_t one)1122 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1123 unsigned int k, uintmax_t one)
1124 {
1125 uintmax_t y = powm (b, q, n, ni, one);
1126
1127 uintmax_t nm1 = n - one; /* -1, but in redc representation. */
1128
1129 if (y == one || y == nm1)
1130 return true;
1131
1132 for (unsigned int i = 1; i < k; i++)
1133 {
1134 y = mulredc (y, y, n, ni);
1135
1136 if (y == nm1)
1137 return true;
1138 if (y == one)
1139 return false;
1140 }
1141 return false;
1142 }
1143
1144 static bool
millerrabin2(const uintmax_t * np,uintmax_t ni,const uintmax_t * bp,const uintmax_t * qp,unsigned int k,const uintmax_t * one)1145 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1146 const uintmax_t *qp, unsigned int k, const uintmax_t *one)
1147 {
1148 uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1149
1150 y0 = powm2 (&r1m, bp, qp, np, ni, one);
1151 y1 = r1m;
1152
1153 if (y0 == one[0] && y1 == one[1])
1154 return true;
1155
1156 sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1157
1158 if (y0 == nm1_0 && y1 == nm1_1)
1159 return true;
1160
1161 for (unsigned int i = 1; i < k; i++)
1162 {
1163 y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1164 y1 = r1m;
1165
1166 if (y0 == nm1_0 && y1 == nm1_1)
1167 return true;
1168 if (y0 == one[0] && y1 == one[1])
1169 return false;
1170 }
1171 return false;
1172 }
1173
1174 #if HAVE_GMP
1175 static bool
mp_millerrabin(mpz_srcptr n,mpz_srcptr nm1,mpz_ptr x,mpz_ptr y,mpz_srcptr q,unsigned long int k)1176 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1177 mpz_srcptr q, unsigned long int k)
1178 {
1179 mpz_powm (y, x, q, n);
1180
1181 if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1182 return true;
1183
1184 for (unsigned long int i = 1; i < k; i++)
1185 {
1186 mpz_powm_ui (y, y, 2, n);
1187 if (mpz_cmp (y, nm1) == 0)
1188 return true;
1189 if (mpz_cmp_ui (y, 1) == 0)
1190 return false;
1191 }
1192 return false;
1193 }
1194 #endif
1195
1196 /* Lucas' prime test. The number of iterations vary greatly, up to a few dozen
1197 have been observed. The average seem to be about 2. */
1198 static bool
prime_p(uintmax_t n)1199 prime_p (uintmax_t n)
1200 {
1201 int k;
1202 bool is_prime;
1203 uintmax_t a_prim, one, ni;
1204 struct factors factors;
1205
1206 if (n <= 1)
1207 return false;
1208
1209 /* We have already casted out small primes. */
1210 if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1211 return true;
1212
1213 /* Precomputation for Miller-Rabin. */
1214 uintmax_t q = n - 1;
1215 for (k = 0; (q & 1) == 0; k++)
1216 q >>= 1;
1217
1218 uintmax_t a = 2;
1219 binv (ni, n); /* ni <- 1/n mod B */
1220 redcify (one, 1, n);
1221 addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1222
1223 /* Perform a Miller-Rabin test, finds most composites quickly. */
1224 if (!millerrabin (n, ni, a_prim, q, k, one))
1225 return false;
1226
1227 if (flag_prove_primality)
1228 {
1229 /* Factor n-1 for Lucas. */
1230 factor (0, n - 1, &factors);
1231 }
1232
1233 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1234 number composite. */
1235 for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1236 {
1237 if (flag_prove_primality)
1238 {
1239 is_prime = true;
1240 for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1241 {
1242 is_prime
1243 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1244 }
1245 }
1246 else
1247 {
1248 /* After enough Miller-Rabin runs, be content. */
1249 is_prime = (r == MR_REPS - 1);
1250 }
1251
1252 if (is_prime)
1253 return true;
1254
1255 a += primes_diff[r]; /* Establish new base. */
1256
1257 /* The following is equivalent to redcify (a_prim, a, n). It runs faster
1258 on most processors, since it avoids udiv_qrnnd. If we go down the
1259 udiv_qrnnd_preinv path, this code should be replaced. */
1260 {
1261 uintmax_t s1, s0;
1262 umul_ppmm (s1, s0, one, a);
1263 if (LIKELY (s1 == 0))
1264 a_prim = s0 % n;
1265 else
1266 {
1267 uintmax_t dummy _GL_UNUSED;
1268 udiv_qrnnd (dummy, a_prim, s1, s0, n);
1269 }
1270 }
1271
1272 if (!millerrabin (n, ni, a_prim, q, k, one))
1273 return false;
1274 }
1275
1276 error (0, 0, _("Lucas prime test failure. This should not happen"));
1277 abort ();
1278 }
1279
1280 static bool
prime2_p(uintmax_t n1,uintmax_t n0)1281 prime2_p (uintmax_t n1, uintmax_t n0)
1282 {
1283 uintmax_t q[2], nm1[2];
1284 uintmax_t a_prim[2];
1285 uintmax_t one[2];
1286 uintmax_t na[2];
1287 uintmax_t ni;
1288 unsigned int k;
1289 struct factors factors;
1290
1291 if (n1 == 0)
1292 return prime_p (n0);
1293
1294 nm1[1] = n1 - (n0 == 0);
1295 nm1[0] = n0 - 1;
1296 if (nm1[0] == 0)
1297 {
1298 count_trailing_zeros (k, nm1[1]);
1299
1300 q[0] = nm1[1] >> k;
1301 q[1] = 0;
1302 k += W_TYPE_SIZE;
1303 }
1304 else
1305 {
1306 count_trailing_zeros (k, nm1[0]);
1307 rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1308 }
1309
1310 uintmax_t a = 2;
1311 binv (ni, n0);
1312 redcify2 (one[1], one[0], 1, n1, n0);
1313 addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1314
1315 /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1316 na[0] = n0;
1317 na[1] = n1;
1318
1319 if (!millerrabin2 (na, ni, a_prim, q, k, one))
1320 return false;
1321
1322 if (flag_prove_primality)
1323 {
1324 /* Factor n-1 for Lucas. */
1325 factor (nm1[1], nm1[0], &factors);
1326 }
1327
1328 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1329 number composite. */
1330 for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1331 {
1332 bool is_prime;
1333 uintmax_t e[2], y[2];
1334
1335 if (flag_prove_primality)
1336 {
1337 is_prime = true;
1338 if (factors.plarge[1])
1339 {
1340 uintmax_t pi;
1341 binv (pi, factors.plarge[0]);
1342 e[0] = pi * nm1[0];
1343 e[1] = 0;
1344 y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1345 is_prime = (y[0] != one[0] || y[1] != one[1]);
1346 }
1347 for (unsigned int i = 0; i < factors.nfactors && is_prime; i++)
1348 {
1349 /* FIXME: We always have the factor 2. Do we really need to
1350 handle it here? We have done the same powering as part
1351 of millerrabin. */
1352 if (factors.p[i] == 2)
1353 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1354 else
1355 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1356 y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1357 is_prime = (y[0] != one[0] || y[1] != one[1]);
1358 }
1359 }
1360 else
1361 {
1362 /* After enough Miller-Rabin runs, be content. */
1363 is_prime = (r == MR_REPS - 1);
1364 }
1365
1366 if (is_prime)
1367 return true;
1368
1369 a += primes_diff[r]; /* Establish new base. */
1370 redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1371
1372 if (!millerrabin2 (na, ni, a_prim, q, k, one))
1373 return false;
1374 }
1375
1376 error (0, 0, _("Lucas prime test failure. This should not happen"));
1377 abort ();
1378 }
1379
1380 #if HAVE_GMP
1381 static bool
mp_prime_p(mpz_t n)1382 mp_prime_p (mpz_t n)
1383 {
1384 bool is_prime;
1385 mpz_t q, a, nm1, tmp;
1386 struct mp_factors factors;
1387
1388 if (mpz_cmp_ui (n, 1) <= 0)
1389 return false;
1390
1391 /* We have already casted out small primes. */
1392 if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1393 return true;
1394
1395 mpz_inits (q, a, nm1, tmp, NULL);
1396
1397 /* Precomputation for Miller-Rabin. */
1398 mpz_sub_ui (nm1, n, 1);
1399
1400 /* Find q and k, where q is odd and n = 1 + 2**k * q. */
1401 unsigned long int k = mpz_scan1 (nm1, 0);
1402 mpz_tdiv_q_2exp (q, nm1, k);
1403
1404 mpz_set_ui (a, 2);
1405
1406 /* Perform a Miller-Rabin test, finds most composites quickly. */
1407 if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1408 {
1409 is_prime = false;
1410 goto ret2;
1411 }
1412
1413 if (flag_prove_primality)
1414 {
1415 /* Factor n-1 for Lucas. */
1416 mpz_set (tmp, nm1);
1417 mp_factor (tmp, &factors);
1418 }
1419
1420 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1421 number composite. */
1422 for (unsigned int r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1423 {
1424 if (flag_prove_primality)
1425 {
1426 is_prime = true;
1427 for (unsigned long int i = 0; i < factors.nfactors && is_prime; i++)
1428 {
1429 mpz_divexact (tmp, nm1, factors.p[i]);
1430 mpz_powm (tmp, a, tmp, n);
1431 is_prime = mpz_cmp_ui (tmp, 1) != 0;
1432 }
1433 }
1434 else
1435 {
1436 /* After enough Miller-Rabin runs, be content. */
1437 is_prime = (r == MR_REPS - 1);
1438 }
1439
1440 if (is_prime)
1441 goto ret1;
1442
1443 mpz_add_ui (a, a, primes_diff[r]); /* Establish new base. */
1444
1445 if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1446 {
1447 is_prime = false;
1448 goto ret1;
1449 }
1450 }
1451
1452 error (0, 0, _("Lucas prime test failure. This should not happen"));
1453 abort ();
1454
1455 ret1:
1456 if (flag_prove_primality)
1457 mp_factor_clear (&factors);
1458 ret2:
1459 mpz_clears (q, a, nm1, tmp, NULL);
1460
1461 return is_prime;
1462 }
1463 #endif
1464
1465 static void
factor_using_pollard_rho(uintmax_t n,unsigned long int a,struct factors * factors)1466 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1467 struct factors *factors)
1468 {
1469 uintmax_t x, z, y, P, t, ni, g;
1470
1471 unsigned long int k = 1;
1472 unsigned long int l = 1;
1473
1474 redcify (P, 1, n);
1475 addmod (x, P, P, n); /* i.e., redcify(2) */
1476 y = z = x;
1477
1478 while (n != 1)
1479 {
1480 assert (a < n);
1481
1482 binv (ni, n); /* FIXME: when could we use old 'ni' value? */
1483
1484 for (;;)
1485 {
1486 do
1487 {
1488 x = mulredc (x, x, n, ni);
1489 addmod (x, x, a, n);
1490
1491 submod (t, z, x, n);
1492 P = mulredc (P, t, n, ni);
1493
1494 if (k % 32 == 1)
1495 {
1496 if (gcd_odd (P, n) != 1)
1497 goto factor_found;
1498 y = x;
1499 }
1500 }
1501 while (--k != 0);
1502
1503 z = x;
1504 k = l;
1505 l = 2 * l;
1506 for (unsigned long int i = 0; i < k; i++)
1507 {
1508 x = mulredc (x, x, n, ni);
1509 addmod (x, x, a, n);
1510 }
1511 y = x;
1512 }
1513
1514 factor_found:
1515 do
1516 {
1517 y = mulredc (y, y, n, ni);
1518 addmod (y, y, a, n);
1519
1520 submod (t, z, y, n);
1521 g = gcd_odd (t, n);
1522 }
1523 while (g == 1);
1524
1525 if (n == g)
1526 {
1527 /* Found n itself as factor. Restart with different params. */
1528 factor_using_pollard_rho (n, a + 1, factors);
1529 return;
1530 }
1531
1532 n = n / g;
1533
1534 if (!prime_p (g))
1535 factor_using_pollard_rho (g, a + 1, factors);
1536 else
1537 factor_insert (factors, g);
1538
1539 if (prime_p (n))
1540 {
1541 factor_insert (factors, n);
1542 break;
1543 }
1544
1545 x = x % n;
1546 z = z % n;
1547 y = y % n;
1548 }
1549 }
1550
1551 static void
factor_using_pollard_rho2(uintmax_t n1,uintmax_t n0,unsigned long int a,struct factors * factors)1552 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1553 struct factors *factors)
1554 {
1555 uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1556
1557 unsigned long int k = 1;
1558 unsigned long int l = 1;
1559
1560 redcify2 (P1, P0, 1, n1, n0);
1561 addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1562 y1 = z1 = x1;
1563 y0 = z0 = x0;
1564
1565 while (n1 != 0 || n0 != 1)
1566 {
1567 binv (ni, n0);
1568
1569 for (;;)
1570 {
1571 do
1572 {
1573 x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1574 x1 = r1m;
1575 addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1576
1577 submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1578 P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1579 P1 = r1m;
1580
1581 if (k % 32 == 1)
1582 {
1583 g0 = gcd2_odd (&g1, P1, P0, n1, n0);
1584 if (g1 != 0 || g0 != 1)
1585 goto factor_found;
1586 y1 = x1; y0 = x0;
1587 }
1588 }
1589 while (--k != 0);
1590
1591 z1 = x1; z0 = x0;
1592 k = l;
1593 l = 2 * l;
1594 for (unsigned long int i = 0; i < k; i++)
1595 {
1596 x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1597 x1 = r1m;
1598 addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1599 }
1600 y1 = x1; y0 = x0;
1601 }
1602
1603 factor_found:
1604 do
1605 {
1606 y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1607 y1 = r1m;
1608 addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1609
1610 submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1611 g0 = gcd2_odd (&g1, t1, t0, n1, n0);
1612 }
1613 while (g1 == 0 && g0 == 1);
1614
1615 if (g1 == 0)
1616 {
1617 /* The found factor is one word, and > 1. */
1618 divexact_21 (n1, n0, n1, n0, g0); /* n = n / g */
1619
1620 if (!prime_p (g0))
1621 factor_using_pollard_rho (g0, a + 1, factors);
1622 else
1623 factor_insert (factors, g0);
1624 }
1625 else
1626 {
1627 /* The found factor is two words. This is highly unlikely, thus hard
1628 to trigger. Please be careful before you change this code! */
1629 uintmax_t ginv;
1630
1631 if (n1 == g1 && n0 == g0)
1632 {
1633 /* Found n itself as factor. Restart with different params. */
1634 factor_using_pollard_rho2 (n1, n0, a + 1, factors);
1635 return;
1636 }
1637
1638 binv (ginv, g0); /* Compute n = n / g. Since the result will */
1639 n0 = ginv * n0; /* fit one word, we can compute the quotient */
1640 n1 = 0; /* modulo B, ignoring the high divisor word. */
1641
1642 if (!prime2_p (g1, g0))
1643 factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1644 else
1645 factor_insert_large (factors, g1, g0);
1646 }
1647
1648 if (n1 == 0)
1649 {
1650 if (prime_p (n0))
1651 {
1652 factor_insert (factors, n0);
1653 break;
1654 }
1655
1656 factor_using_pollard_rho (n0, a, factors);
1657 return;
1658 }
1659
1660 if (prime2_p (n1, n0))
1661 {
1662 factor_insert_large (factors, n1, n0);
1663 break;
1664 }
1665
1666 x0 = mod2 (&x1, x1, x0, n1, n0);
1667 z0 = mod2 (&z1, z1, z0, n1, n0);
1668 y0 = mod2 (&y1, y1, y0, n1, n0);
1669 }
1670 }
1671
1672 #if HAVE_GMP
1673 static void
mp_factor_using_pollard_rho(mpz_t n,unsigned long int a,struct mp_factors * factors)1674 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1675 struct mp_factors *factors)
1676 {
1677 mpz_t x, z, y, P;
1678 mpz_t t, t2;
1679
1680 devmsg ("[pollard-rho (%lu)] ", a);
1681
1682 mpz_inits (t, t2, NULL);
1683 mpz_init_set_si (y, 2);
1684 mpz_init_set_si (x, 2);
1685 mpz_init_set_si (z, 2);
1686 mpz_init_set_ui (P, 1);
1687
1688 unsigned long long int k = 1;
1689 unsigned long long int l = 1;
1690
1691 while (mpz_cmp_ui (n, 1) != 0)
1692 {
1693 for (;;)
1694 {
1695 do
1696 {
1697 mpz_mul (t, x, x);
1698 mpz_mod (x, t, n);
1699 mpz_add_ui (x, x, a);
1700
1701 mpz_sub (t, z, x);
1702 mpz_mul (t2, P, t);
1703 mpz_mod (P, t2, n);
1704
1705 if (k % 32 == 1)
1706 {
1707 mpz_gcd (t, P, n);
1708 if (mpz_cmp_ui (t, 1) != 0)
1709 goto factor_found;
1710 mpz_set (y, x);
1711 }
1712 }
1713 while (--k != 0);
1714
1715 mpz_set (z, x);
1716 k = l;
1717 l = 2 * l;
1718 for (unsigned long long int i = 0; i < k; i++)
1719 {
1720 mpz_mul (t, x, x);
1721 mpz_mod (x, t, n);
1722 mpz_add_ui (x, x, a);
1723 }
1724 mpz_set (y, x);
1725 }
1726
1727 factor_found:
1728 do
1729 {
1730 mpz_mul (t, y, y);
1731 mpz_mod (y, t, n);
1732 mpz_add_ui (y, y, a);
1733
1734 mpz_sub (t, z, y);
1735 mpz_gcd (t, t, n);
1736 }
1737 while (mpz_cmp_ui (t, 1) == 0);
1738
1739 mpz_divexact (n, n, t); /* divide by t, before t is overwritten */
1740
1741 if (!mp_prime_p (t))
1742 {
1743 devmsg ("[composite factor--restarting pollard-rho] ");
1744 mp_factor_using_pollard_rho (t, a + 1, factors);
1745 }
1746 else
1747 {
1748 mp_factor_insert (factors, t);
1749 }
1750
1751 if (mp_prime_p (n))
1752 {
1753 mp_factor_insert (factors, n);
1754 break;
1755 }
1756
1757 mpz_mod (x, x, n);
1758 mpz_mod (z, z, n);
1759 mpz_mod (y, y, n);
1760 }
1761
1762 mpz_clears (P, t2, t, z, x, y, NULL);
1763 }
1764 #endif
1765
1766 #if USE_SQUFOF
1767 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)? If
1768 algorithm is replaced, consider also returning the remainder. */
1769 static uintmax_t _GL_ATTRIBUTE_CONST
isqrt(uintmax_t n)1770 isqrt (uintmax_t n)
1771 {
1772 uintmax_t x;
1773 unsigned c;
1774 if (n == 0)
1775 return 0;
1776
1777 count_leading_zeros (c, n);
1778
1779 /* Make x > sqrt(n). This will be invariant through the loop. */
1780 x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) / 2);
1781
1782 for (;;)
1783 {
1784 uintmax_t y = (x + n/x) / 2;
1785 if (y >= x)
1786 return x;
1787
1788 x = y;
1789 }
1790 }
1791
1792 static uintmax_t _GL_ATTRIBUTE_CONST
isqrt2(uintmax_t nh,uintmax_t nl)1793 isqrt2 (uintmax_t nh, uintmax_t nl)
1794 {
1795 unsigned int shift;
1796 uintmax_t x;
1797
1798 /* Ensures the remainder fits in an uintmax_t. */
1799 assert (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1800
1801 if (nh == 0)
1802 return isqrt (nl);
1803
1804 count_leading_zeros (shift, nh);
1805 shift &= ~1;
1806
1807 /* Make x > sqrt(n) */
1808 x = isqrt ( (nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1809 x <<= (W_TYPE_SIZE - shift) / 2;
1810
1811 /* Do we need more than one iteration? */
1812 for (;;)
1813 {
1814 uintmax_t r _GL_UNUSED;
1815 uintmax_t q, y;
1816 udiv_qrnnd (q, r, nh, nl, x);
1817 y = (x + q) / 2;
1818
1819 if (y >= x)
1820 {
1821 uintmax_t hi, lo;
1822 umul_ppmm (hi, lo, x + 1, x + 1);
1823 assert (gt2 (hi, lo, nh, nl));
1824
1825 umul_ppmm (hi, lo, x, x);
1826 assert (ge2 (nh, nl, hi, lo));
1827 sub_ddmmss (hi, lo, nh, nl, hi, lo);
1828 assert (hi == 0);
1829
1830 return x;
1831 }
1832
1833 x = y;
1834 }
1835 }
1836
1837 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1838 # define MAGIC64 0x0202021202030213ULL
1839 # define MAGIC63 0x0402483012450293ULL
1840 # define MAGIC65 0x218a019866014613ULL
1841 # define MAGIC11 0x23b
1842
1843 /* Return the square root if the input is a square, otherwise 0. */
1844 static uintmax_t _GL_ATTRIBUTE_CONST
is_square(uintmax_t x)1845 is_square (uintmax_t x)
1846 {
1847 /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1848 computing the square root. */
1849 if (((MAGIC64 >> (x & 63)) & 1)
1850 && ((MAGIC63 >> (x % 63)) & 1)
1851 /* Both 0 and 64 are squares mod (65) */
1852 && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1853 && ((MAGIC11 >> (x % 11) & 1)))
1854 {
1855 uintmax_t r = isqrt (x);
1856 if (r*r == x)
1857 return r;
1858 }
1859 return 0;
1860 }
1861
1862 /* invtab[i] = floor(0x10000 / (0x100 + i) */
1863 static const unsigned short invtab[0x81] =
1864 {
1865 0x200,
1866 0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1867 0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1868 0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1869 0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1870 0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1871 0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1872 0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1873 0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1874 0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1875 0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1876 0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1877 0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1878 0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1879 0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1880 0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1881 0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1882 };
1883
1884 /* Compute q = [u/d], r = u mod d. Avoids slow hardware division for the case
1885 that q < 0x40; here it instead uses a table of (Euclidian) inverses. */
1886 # define div_smallq(q, r, u, d) \
1887 do { \
1888 if ((u) / 0x40 < (d)) \
1889 { \
1890 int _cnt; \
1891 uintmax_t _dinv, _mask, _q, _r; \
1892 count_leading_zeros (_cnt, (d)); \
1893 _r = (u); \
1894 if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8))) \
1895 { \
1896 _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80]; \
1897 _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt); \
1898 } \
1899 else \
1900 { \
1901 _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f]; \
1902 _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11; \
1903 } \
1904 _r -= _q*(d); \
1905 \
1906 _mask = -(uintmax_t) (_r >= (d)); \
1907 (r) = _r - (_mask & (d)); \
1908 (q) = _q - _mask; \
1909 assert ( (q) * (d) + (r) == u); \
1910 } \
1911 else \
1912 { \
1913 uintmax_t _q = (u) / (d); \
1914 (r) = (u) - _q * (d); \
1915 (q) = _q; \
1916 } \
1917 } while (0)
1918
1919 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1920 = 3025, P = 1737, representing F_{18} = (-6314, 2* 1737, 3025),
1921 with 3025 = 55^2.
1922
1923 Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1924 representing G_0 = (-55, 2*4652, 8653).
1925
1926 In the notation of the paper:
1927
1928 S_{-1} = 55, S_0 = 8653, R_0 = 4652
1929
1930 Put
1931
1932 t_0 = floor([q_0 + R_0] / S0) = 1
1933 R_1 = t_0 * S_0 - R_0 = 4001
1934 S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1935 */
1936
1937 /* Multipliers, in order of efficiency:
1938 0.7268 3*5*7*11 = 1155 = 3 (mod 4)
1939 0.7317 3*5*7 = 105 = 1
1940 0.7820 3*5*11 = 165 = 1
1941 0.7872 3*5 = 15 = 3
1942 0.8101 3*7*11 = 231 = 3
1943 0.8155 3*7 = 21 = 1
1944 0.8284 5*7*11 = 385 = 1
1945 0.8339 5*7 = 35 = 3
1946 0.8716 3*11 = 33 = 1
1947 0.8774 3 = 3 = 3
1948 0.8913 5*11 = 55 = 3
1949 0.8972 5 = 5 = 1
1950 0.9233 7*11 = 77 = 1
1951 0.9295 7 = 7 = 3
1952 0.9934 11 = 11 = 3
1953 */
1954 # define QUEUE_SIZE 50
1955 #endif
1956
1957 #if STAT_SQUFOF
1958 # define Q_FREQ_SIZE 50
1959 /* Element 0 keeps the total */
1960 static unsigned int q_freq[Q_FREQ_SIZE + 1];
1961 # define MIN(a,b) ((a) < (b) ? (a) : (b))
1962 #endif
1963
1964 #if USE_SQUFOF
1965 /* Return true on success. Expected to fail only for numbers
1966 >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1967 static bool
factor_using_squfof(uintmax_t n1,uintmax_t n0,struct factors * factors)1968 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1969 {
1970 /* Uses algorithm and notation from
1971
1972 SQUARE FORM FACTORIZATION
1973 JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1974
1975 https://homes.cerias.purdue.edu/~ssw/squfof.pdf
1976 */
1977
1978 static const unsigned int multipliers_1[] =
1979 { /* = 1 (mod 4) */
1980 105, 165, 21, 385, 33, 5, 77, 1, 0
1981 };
1982 static const unsigned int multipliers_3[] =
1983 { /* = 3 (mod 4) */
1984 1155, 15, 231, 35, 3, 55, 7, 11, 0
1985 };
1986
1987 const unsigned int *m;
1988
1989 struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1990
1991 if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1992 return false;
1993
1994 uintmax_t sqrt_n = isqrt2 (n1, n0);
1995
1996 if (n0 == sqrt_n * sqrt_n)
1997 {
1998 uintmax_t p1, p0;
1999
2000 umul_ppmm (p1, p0, sqrt_n, sqrt_n);
2001 assert (p0 == n0);
2002
2003 if (n1 == p1)
2004 {
2005 if (prime_p (sqrt_n))
2006 factor_insert_multiplicity (factors, sqrt_n, 2);
2007 else
2008 {
2009 struct factors f;
2010
2011 f.nfactors = 0;
2012 if (!factor_using_squfof (0, sqrt_n, &f))
2013 {
2014 /* Try pollard rho instead */
2015 factor_using_pollard_rho (sqrt_n, 1, &f);
2016 }
2017 /* Duplicate the new factors */
2018 for (unsigned int i = 0; i < f.nfactors; i++)
2019 factor_insert_multiplicity (factors, f.p[i], 2*f.e[i]);
2020 }
2021 return true;
2022 }
2023 }
2024
2025 /* Select multipliers so we always get n * mu = 3 (mod 4) */
2026 for (m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
2027 *m; m++)
2028 {
2029 uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
2030 unsigned int i;
2031 unsigned int mu = *m;
2032 unsigned int qpos = 0;
2033
2034 assert (mu * n0 % 4 == 3);
2035
2036 /* In the notation of the paper, with mu * n == 3 (mod 4), we
2037 get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2038 I understand it, the necessary bound is 4 \mu^3 < n, or 32
2039 mu^3 < n.
2040
2041 However, this seems insufficient: With n = 37243139 and mu =
2042 105, we get a trivial factor, from the square 38809 = 197^2,
2043 without any corresponding Q earlier in the iteration.
2044
2045 Requiring 64 mu^3 < n seems sufficient. */
2046 if (n1 == 0)
2047 {
2048 if ((uintmax_t) mu*mu*mu >= n0 / 64)
2049 continue;
2050 }
2051 else
2052 {
2053 if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2054 continue;
2055 }
2056 umul_ppmm (Dh, Dl, n0, mu);
2057 Dh += n1 * mu;
2058
2059 assert (Dl % 4 != 1);
2060 assert (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2061
2062 S = isqrt2 (Dh, Dl);
2063
2064 Q1 = 1;
2065 P = S;
2066
2067 /* Square root remainder fits in one word, so ignore high part. */
2068 Q = Dl - P*P;
2069 /* FIXME: When can this differ from floor(sqrt(2 sqrt(D)))? */
2070 L = isqrt (2*S);
2071 B = 2*L;
2072 L1 = mu * 2 * L;
2073
2074 /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2075 4 D. */
2076
2077 for (i = 0; i <= B; i++)
2078 {
2079 uintmax_t q, P1, t, rem;
2080
2081 div_smallq (q, rem, S+P, Q);
2082 P1 = S - rem; /* P1 = q*Q - P */
2083
2084 IF_LINT (assert (q > 0 && Q > 0));
2085
2086 # if STAT_SQUFOF
2087 q_freq[0]++;
2088 q_freq[MIN (q, Q_FREQ_SIZE)]++;
2089 # endif
2090
2091 if (Q <= L1)
2092 {
2093 uintmax_t g = Q;
2094
2095 if ( (Q & 1) == 0)
2096 g /= 2;
2097
2098 g /= gcd_odd (g, mu);
2099
2100 if (g <= L)
2101 {
2102 if (qpos >= QUEUE_SIZE)
2103 die (EXIT_FAILURE, 0, _("squfof queue overflow"));
2104 queue[qpos].Q = g;
2105 queue[qpos].P = P % g;
2106 qpos++;
2107 }
2108 }
2109
2110 /* I think the difference can be either sign, but mod
2111 2^W_TYPE_SIZE arithmetic should be fine. */
2112 t = Q1 + q * (P - P1);
2113 Q1 = Q;
2114 Q = t;
2115 P = P1;
2116
2117 if ( (i & 1) == 0)
2118 {
2119 uintmax_t r = is_square (Q);
2120 if (r)
2121 {
2122 for (unsigned int j = 0; j < qpos; j++)
2123 {
2124 if (queue[j].Q == r)
2125 {
2126 if (r == 1)
2127 /* Traversed entire cycle. */
2128 goto next_multiplier;
2129
2130 /* Need the absolute value for divisibility test. */
2131 if (P >= queue[j].P)
2132 t = P - queue[j].P;
2133 else
2134 t = queue[j].P - P;
2135 if (t % r == 0)
2136 {
2137 /* Delete entries up to and including entry
2138 j, which matched. */
2139 memmove (queue, queue + j + 1,
2140 (qpos - j - 1) * sizeof (queue[0]));
2141 qpos -= (j + 1);
2142 }
2143 goto next_i;
2144 }
2145 }
2146
2147 /* We have found a square form, which should give a
2148 factor. */
2149 Q1 = r;
2150 assert (S >= P); /* What signs are possible? */
2151 P += r * ((S - P) / r);
2152
2153 /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2154 for the case D = 2N. */
2155 /* Compute Q = (D - P*P) / Q1, but we need double
2156 precision. */
2157 uintmax_t hi, lo;
2158 umul_ppmm (hi, lo, P, P);
2159 sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2160 udiv_qrnnd (Q, rem, hi, lo, Q1);
2161 assert (rem == 0);
2162
2163 for (;;)
2164 {
2165 /* Note: There appears to by a typo in the paper,
2166 Step 4a in the algorithm description says q <--
2167 floor([S+P]/\hat Q), but looking at the equations
2168 in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2169 (In this code, \hat Q is Q1). */
2170 div_smallq (q, rem, S+P, Q);
2171 P1 = S - rem; /* P1 = q*Q - P */
2172
2173 # if STAT_SQUFOF
2174 q_freq[0]++;
2175 q_freq[MIN (q, Q_FREQ_SIZE)]++;
2176 # endif
2177 if (P == P1)
2178 break;
2179 t = Q1 + q * (P - P1);
2180 Q1 = Q;
2181 Q = t;
2182 P = P1;
2183 }
2184
2185 if ( (Q & 1) == 0)
2186 Q /= 2;
2187 Q /= gcd_odd (Q, mu);
2188
2189 assert (Q > 1 && (n1 || Q < n0));
2190
2191 if (prime_p (Q))
2192 factor_insert (factors, Q);
2193 else if (!factor_using_squfof (0, Q, factors))
2194 factor_using_pollard_rho (Q, 2, factors);
2195
2196 divexact_21 (n1, n0, n1, n0, Q);
2197
2198 if (prime2_p (n1, n0))
2199 factor_insert_large (factors, n1, n0);
2200 else
2201 {
2202 if (!factor_using_squfof (n1, n0, factors))
2203 {
2204 if (n1 == 0)
2205 factor_using_pollard_rho (n0, 1, factors);
2206 else
2207 factor_using_pollard_rho2 (n1, n0, 1, factors);
2208 }
2209 }
2210
2211 return true;
2212 }
2213 }
2214 next_i:;
2215 }
2216 next_multiplier:;
2217 }
2218 return false;
2219 }
2220 #endif
2221
2222 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2223 results in FACTORS. */
2224 static void
factor(uintmax_t t1,uintmax_t t0,struct factors * factors)2225 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2226 {
2227 factors->nfactors = 0;
2228 factors->plarge[1] = 0;
2229
2230 if (t1 == 0 && t0 < 2)
2231 return;
2232
2233 t0 = factor_using_division (&t1, t1, t0, factors);
2234
2235 if (t1 == 0 && t0 < 2)
2236 return;
2237
2238 if (prime2_p (t1, t0))
2239 factor_insert_large (factors, t1, t0);
2240 else
2241 {
2242 #if USE_SQUFOF
2243 if (factor_using_squfof (t1, t0, factors))
2244 return;
2245 #endif
2246
2247 if (t1 == 0)
2248 factor_using_pollard_rho (t0, 1, factors);
2249 else
2250 factor_using_pollard_rho2 (t1, t0, 1, factors);
2251 }
2252 }
2253
2254 #if HAVE_GMP
2255 /* Use Pollard-rho to compute the prime factors of
2256 arbitrary-precision T, and put the results in FACTORS. */
2257 static void
mp_factor(mpz_t t,struct mp_factors * factors)2258 mp_factor (mpz_t t, struct mp_factors *factors)
2259 {
2260 mp_factor_init (factors);
2261
2262 if (mpz_sgn (t) != 0)
2263 {
2264 mp_factor_using_division (t, factors);
2265
2266 if (mpz_cmp_ui (t, 1) != 0)
2267 {
2268 devmsg ("[is number prime?] ");
2269 if (mp_prime_p (t))
2270 mp_factor_insert (factors, t);
2271 else
2272 mp_factor_using_pollard_rho (t, 1, factors);
2273 }
2274 }
2275 }
2276 #endif
2277
2278 static strtol_error
strto2uintmax(uintmax_t * hip,uintmax_t * lop,const char * s)2279 strto2uintmax (uintmax_t *hip, uintmax_t *lop, const char *s)
2280 {
2281 unsigned int lo_carry;
2282 uintmax_t hi = 0, lo = 0;
2283
2284 strtol_error err = LONGINT_INVALID;
2285
2286 /* Skip initial spaces and '+'. */
2287 for (;;)
2288 {
2289 char c = *s;
2290 if (c == ' ')
2291 s++;
2292 else if (c == '+')
2293 {
2294 s++;
2295 break;
2296 }
2297 else
2298 break;
2299 }
2300
2301 /* Initial scan for invalid digits. */
2302 const char *p = s;
2303 for (;;)
2304 {
2305 unsigned int c = *p++;
2306 if (c == 0)
2307 break;
2308
2309 if (UNLIKELY (!ISDIGIT (c)))
2310 {
2311 err = LONGINT_INVALID;
2312 break;
2313 }
2314
2315 err = LONGINT_OK; /* we've seen at least one valid digit */
2316 }
2317
2318 for (;err == LONGINT_OK;)
2319 {
2320 unsigned int c = *s++;
2321 if (c == 0)
2322 break;
2323
2324 c -= '0';
2325
2326 if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2327 {
2328 err = LONGINT_OVERFLOW;
2329 break;
2330 }
2331 hi = 10 * hi;
2332
2333 lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2334 lo_carry += 10 * lo < 2 * lo;
2335
2336 lo = 10 * lo;
2337 lo += c;
2338
2339 lo_carry += lo < c;
2340 hi += lo_carry;
2341 if (UNLIKELY (hi < lo_carry))
2342 {
2343 err = LONGINT_OVERFLOW;
2344 break;
2345 }
2346 }
2347
2348 *hip = hi;
2349 *lop = lo;
2350
2351 return err;
2352 }
2353
2354 /* Structure and routines for buffering and outputting full lines,
2355 to support parallel operation efficiently. */
2356 static struct lbuf_
2357 {
2358 char *buf;
2359 char *end;
2360 } lbuf;
2361
2362 /* 512 is chosen to give good performance,
2363 and also is the max guaranteed size that
2364 consumers can read atomically through pipes.
2365 Also it's big enough to cater for max line length
2366 even with 128 bit uintmax_t. */
2367 #define FACTOR_PIPE_BUF 512
2368
2369 static void
lbuf_alloc(void)2370 lbuf_alloc (void)
2371 {
2372 if (lbuf.buf)
2373 return;
2374
2375 /* Double to ensure enough space for
2376 previous numbers + next number. */
2377 lbuf.buf = xmalloc (FACTOR_PIPE_BUF * 2);
2378 lbuf.end = lbuf.buf;
2379 }
2380
2381 /* Write complete LBUF to standard output. */
2382 static void
lbuf_flush(void)2383 lbuf_flush (void)
2384 {
2385 size_t size = lbuf.end - lbuf.buf;
2386 if (full_write (STDOUT_FILENO, lbuf.buf, size) != size)
2387 die (EXIT_FAILURE, errno, "%s", _("write error"));
2388 lbuf.end = lbuf.buf;
2389 }
2390
2391 /* Add a character C to LBUF and if it's a newline
2392 and enough bytes are already buffered,
2393 then write atomically to standard output. */
2394 static void
lbuf_putc(char c)2395 lbuf_putc (char c)
2396 {
2397 *lbuf.end++ = c;
2398
2399 if (c == '\n')
2400 {
2401 size_t buffered = lbuf.end - lbuf.buf;
2402
2403 /* Provide immediate output for interactive use. */
2404 static int line_buffered = -1;
2405 if (line_buffered == -1)
2406 line_buffered = isatty (STDIN_FILENO) || isatty (STDOUT_FILENO);
2407 if (line_buffered)
2408 lbuf_flush ();
2409 else if (buffered >= FACTOR_PIPE_BUF)
2410 {
2411 /* Write output in <= PIPE_BUF chunks
2412 so consumers can read atomically. */
2413 char const *tend = lbuf.end;
2414
2415 /* Since a umaxint_t's factors must fit in 512
2416 we're guaranteed to find a newline here. */
2417 char *tlend = lbuf.buf + FACTOR_PIPE_BUF;
2418 while (*--tlend != '\n');
2419 tlend++;
2420
2421 lbuf.end = tlend;
2422 lbuf_flush ();
2423
2424 /* Buffer the remainder. */
2425 memcpy (lbuf.buf, tlend, tend - tlend);
2426 lbuf.end = lbuf.buf + (tend - tlend);
2427 }
2428 }
2429 }
2430
2431 /* Buffer an int to the internal LBUF. */
2432 static void
lbuf_putint(uintmax_t i,size_t min_width)2433 lbuf_putint (uintmax_t i, size_t min_width)
2434 {
2435 char buf[INT_BUFSIZE_BOUND (uintmax_t)];
2436 char const *umaxstr = umaxtostr (i, buf);
2437 size_t width = sizeof (buf) - (umaxstr - buf) - 1;
2438 size_t z = width;
2439
2440 for (; z < min_width; z++)
2441 *lbuf.end++ = '0';
2442
2443 memcpy (lbuf.end, umaxstr, width);
2444 lbuf.end += width;
2445 }
2446
2447 static void
print_uintmaxes(uintmax_t t1,uintmax_t t0)2448 print_uintmaxes (uintmax_t t1, uintmax_t t0)
2449 {
2450 uintmax_t q, r;
2451
2452 if (t1 == 0)
2453 lbuf_putint (t0, 0);
2454 else
2455 {
2456 /* Use very plain code here since it seems hard to write fast code
2457 without assuming a specific word size. */
2458 q = t1 / 1000000000;
2459 r = t1 % 1000000000;
2460 udiv_qrnnd (t0, r, r, t0, 1000000000);
2461 print_uintmaxes (q, t0);
2462 lbuf_putint (r, 9);
2463 }
2464 }
2465
2466 /* Single-precision factoring */
2467 static void
print_factors_single(uintmax_t t1,uintmax_t t0)2468 print_factors_single (uintmax_t t1, uintmax_t t0)
2469 {
2470 struct factors factors;
2471
2472 print_uintmaxes (t1, t0);
2473 lbuf_putc (':');
2474
2475 factor (t1, t0, &factors);
2476
2477 for (unsigned int j = 0; j < factors.nfactors; j++)
2478 for (unsigned int k = 0; k < factors.e[j]; k++)
2479 {
2480 lbuf_putc (' ');
2481 print_uintmaxes (0, factors.p[j]);
2482 }
2483
2484 if (factors.plarge[1])
2485 {
2486 lbuf_putc (' ');
2487 print_uintmaxes (factors.plarge[1], factors.plarge[0]);
2488 }
2489
2490 lbuf_putc ('\n');
2491 }
2492
2493 /* Emit the factors of the indicated number. If we have the option of using
2494 either algorithm, we select on the basis of the length of the number.
2495 For longer numbers, we prefer the MP algorithm even if the native algorithm
2496 has enough digits, because the algorithm is better. The turnover point
2497 depends on the value. */
2498 static bool
print_factors(const char * input)2499 print_factors (const char *input)
2500 {
2501 uintmax_t t1, t0;
2502
2503 /* Try converting the number to one or two words. If it fails, use GMP or
2504 print an error message. The 2nd condition checks that the most
2505 significant bit of the two-word number is clear, in a typesize neutral
2506 way. */
2507 strtol_error err = strto2uintmax (&t1, &t0, input);
2508
2509 switch (err)
2510 {
2511 case LONGINT_OK:
2512 if (((t1 << 1) >> 1) == t1)
2513 {
2514 devmsg ("[using single-precision arithmetic] ");
2515 print_factors_single (t1, t0);
2516 return true;
2517 }
2518 break;
2519
2520 case LONGINT_OVERFLOW:
2521 /* Try GMP. */
2522 break;
2523
2524 default:
2525 error (0, 0, _("%s is not a valid positive integer"), quote (input));
2526 return false;
2527 }
2528
2529 #if HAVE_GMP
2530 devmsg ("[using arbitrary-precision arithmetic] ");
2531 mpz_t t;
2532 struct mp_factors factors;
2533
2534 mpz_init_set_str (t, input, 10);
2535
2536 gmp_printf ("%Zd:", t);
2537 mp_factor (t, &factors);
2538
2539 for (unsigned int j = 0; j < factors.nfactors; j++)
2540 for (unsigned int k = 0; k < factors.e[j]; k++)
2541 gmp_printf (" %Zd", factors.p[j]);
2542
2543 mp_factor_clear (&factors);
2544 mpz_clear (t);
2545 putchar ('\n');
2546 fflush (stdout);
2547 return true;
2548 #else
2549 error (0, 0, _("%s is too large"), quote (input));
2550 return false;
2551 #endif
2552 }
2553
2554 void
usage(int status)2555 usage (int status)
2556 {
2557 if (status != EXIT_SUCCESS)
2558 emit_try_help ();
2559 else
2560 {
2561 printf (_("\
2562 Usage: %s [NUMBER]...\n\
2563 or: %s OPTION\n\
2564 "),
2565 program_name, program_name);
2566 fputs (_("\
2567 Print the prime factors of each specified integer NUMBER. If none\n\
2568 are specified on the command line, read them from standard input.\n\
2569 \n\
2570 "), stdout);
2571 fputs (HELP_OPTION_DESCRIPTION, stdout);
2572 fputs (VERSION_OPTION_DESCRIPTION, stdout);
2573 emit_ancillary_info (PROGRAM_NAME);
2574 }
2575 exit (status);
2576 }
2577
2578 static bool
do_stdin(void)2579 do_stdin (void)
2580 {
2581 bool ok = true;
2582 token_buffer tokenbuffer;
2583
2584 init_tokenbuffer (&tokenbuffer);
2585
2586 while (true)
2587 {
2588 size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2589 &tokenbuffer);
2590 if (token_length == (size_t) -1)
2591 break;
2592 ok &= print_factors (tokenbuffer.buffer);
2593 }
2594 free (tokenbuffer.buffer);
2595
2596 return ok;
2597 }
2598
2599 int
main(int argc,char ** argv)2600 main (int argc, char **argv)
2601 {
2602 initialize_main (&argc, &argv);
2603 set_program_name (argv[0]);
2604 setlocale (LC_ALL, "");
2605 bindtextdomain (PACKAGE, LOCALEDIR);
2606 textdomain (PACKAGE);
2607
2608 lbuf_alloc ();
2609 atexit (close_stdout);
2610 atexit (lbuf_flush);
2611
2612 int c;
2613 while ((c = getopt_long (argc, argv, "", long_options, NULL)) != -1)
2614 {
2615 switch (c)
2616 {
2617 case DEV_DEBUG_OPTION:
2618 dev_debug = true;
2619 break;
2620
2621 case_GETOPT_HELP_CHAR;
2622
2623 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2624
2625 default:
2626 usage (EXIT_FAILURE);
2627 }
2628 }
2629
2630 #if STAT_SQUFOF
2631 memset (q_freq, 0, sizeof (q_freq));
2632 #endif
2633
2634 bool ok;
2635 if (argc <= optind)
2636 ok = do_stdin ();
2637 else
2638 {
2639 ok = true;
2640 for (int i = optind; i < argc; i++)
2641 if (! print_factors (argv[i]))
2642 ok = false;
2643 }
2644
2645 #if STAT_SQUFOF
2646 if (q_freq[0] > 0)
2647 {
2648 double acc_f;
2649 printf ("q freq. cum. freq.(total: %d)\n", q_freq[0]);
2650 for (unsigned int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2651 {
2652 double f = (double) q_freq[i] / q_freq[0];
2653 acc_f += f;
2654 printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2655 100.0 * f, 100.0 * acc_f);
2656 }
2657 }
2658 #endif
2659
2660 return ok ? EXIT_SUCCESS : EXIT_FAILURE;
2661 }
2662