1 #include <gmp.h>
2 /*
3   Polynomial selection using Kleinjung's algorithm (cf slides presented
4   at the CADO Workshop in October 2008, Nancy, France).
5 
6   [1. Run and parameters]
7 
8   The parameters are similar to those in polyselect2.c, except the following,
9 
10   "-nq xxx" denotes the number of special-q's trials for each ad;
11 
12   Please report bugs to the Bug Tracking System on:
13   https://gforge.inria.fr/tracker/?atid=7442&group_id=2065&func=browse
14 */
15 
16 #define EMIT_ADDRESSABLE_shash_add
17 
18 #include "cado.h" // IWYU pragma: keep
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <limits.h>
22 #include <pthread.h>
23 #include <float.h>      // DBL_MAX
24 #include <math.h> // pow
25 #include "mpz_poly.h"
26 #include "area.h"
27 #include "polyselect_str.h"
28 #include "polyselect_arith.h"
29 #include "modredc_ul.h"
30 #include "mpz_vector.h"
31 #include "roots_mod.h"  // roots_mod_uint64
32 #include "timing.h"     // milliseconds
33 #include "verbose.h"    // verbose_decl_usage
34 #include "auxiliary.h"  // DEFAULT_INCR
35 
36 #define TARGET_TIME 10000000 /* print stats every TARGET_TIME milliseconds */
37 #define NEW_ROOTSIEVE
38 #define INIT_FACTOR 8UL
39 #define PREFIX_HASH
40 //#define DEBUG_POLYSELECT
41 
42 #ifdef NEW_ROOTSIEVE
43 #include "ropt.h"
44 #include "macros.h"
45 #include "params.h"
46 #endif
47 
48 #ifdef PREFIX_HASH
49 char *phash = "# ";
50 #else
51 char *phash = "";
52 #endif
53 
54 #define BATCH_SIZE 20 /* number of special (q, r) per batch */
55 
56 /* Read-Only */
57 uint32_t *Primes = NULL;
58 unsigned long lenPrimes = 1; // length of Primes[]
59 int nq = INT_MAX;
60 static int verbose = 0;
61 static unsigned long incr = DEFAULT_INCR;
62 cado_poly best_poly, curr_poly;
63 double best_E = DBL_MAX; /* combined score E (the smaller the better) */
64 double aver_E = 0.0;
65 unsigned long found = 0; /* number of polynomials found so far */
66 
67 /* read-write global variables */
68 static pthread_mutex_t lock=PTHREAD_MUTEX_INITIALIZER; /* used as mutual exclusion
69                                                    lock for those variables */
70 
71 mpz_t maxS; /* maximun skewness. O for default max */
72 
73 /* inline function */
74 extern void shash_add (shash_t, uint64_t);
75 
76 /* -- functions starts here -- */
77 
78 static void
mutex_lock(pthread_mutex_t * lock)79 mutex_lock(pthread_mutex_t *lock)
80 {
81   pthread_mutex_lock (lock);
82 }
83 
84 static void
mutex_unlock(pthread_mutex_t * lock)85 mutex_unlock(pthread_mutex_t *lock)
86 {
87   pthread_mutex_unlock (lock);
88 }
89 
90 /* crt, set r and qqz */
91 void
crt_sq(mpz_t qqz,mpz_t r,unsigned long * q,unsigned long * rq,unsigned long lq)92 crt_sq ( mpz_t qqz,
93          mpz_t r,
94          unsigned long *q,
95          unsigned long *rq,
96          unsigned long lq )
97 {
98   mpz_t prod, pprod, mod, inv, sum;
99   unsigned long i;
100   unsigned long qq[lq];
101 
102   mpz_init_set_ui (prod, 1);
103   mpz_init (pprod);
104   mpz_init (mod);
105   mpz_init (inv);
106   mpz_init_set_ui (sum, 0);
107 
108   for (i = 0; i < lq; i ++) {
109     qq[i] = q[i] * q[i]; // q small
110     mpz_mul_ui (prod, prod, qq[i]);
111   }
112 
113   for (i = 0; i < lq; i ++) {
114     mpz_divexact_ui (pprod, prod, qq[i]);
115     mpz_set_ui (mod, qq[i]);
116     mpz_invert (inv, pprod, mod);
117     mpz_mul_ui (inv, inv, rq[i]);
118     mpz_mul (inv, inv, pprod);
119     mpz_add (sum, sum, inv);
120   }
121 
122   mpz_mod (sum, sum, prod);
123   mpz_set (r, sum);
124   mpz_set (qqz, prod);
125 
126   mpz_clear (prod);
127   mpz_clear (pprod);
128   mpz_clear (mod);
129   mpz_clear (inv);
130   mpz_clear (sum);
131 }
132 
133 /* check that l/2 <= d*m0/P^2, where l = p1 * p2 * q with P <= p1, p2 <= 2P
134    q is the product of special-q primes. It suffices to check that
135    q <= d*m0/(2P^4). */
136 static int
check_parameters(mpz_t m0,unsigned long d,unsigned long lq)137 check_parameters (mpz_t m0, unsigned long d, unsigned long lq)
138 {
139   double maxq = 1.0, maxP;
140   int k = lq;
141 
142   while (k > 0)
143     maxq *= (double) SPECIAL_Q[LEN_SPECIAL_Q - 1 - (k--)];
144 
145   maxP = (double) Primes[lenPrimes - 1];
146   if (2.0 * pow (maxP, 4.0) * maxq >= (double) d * mpz_get_d (m0))
147     return 0;
148 
149   if (maxq > pow (maxP, 2.0))
150     return 0;
151 
152   return 1;
153 }
154 
155 
156 /* Compute maximun skewness, which in floor(N^(1/d^2)) */
157 void
compute_default_max_skew(mpz_t skew,mpz_t N,int d)158 compute_default_max_skew (mpz_t skew, mpz_t N, int d)
159 {
160   mpz_root (skew, N, (unsigned long) d*d);
161 }
162 
163 
164 /* rq is a root of N = (m0 + rq)^d mod (q^2) */
165 void
match(unsigned long p1,unsigned long p2,int64_t i,mpz_t m0,mpz_t ad,unsigned long d,mpz_t N,unsigned long q,mpz_t rq)166 match (unsigned long p1, unsigned long p2, int64_t i, mpz_t m0,
167        mpz_t ad, unsigned long d, mpz_t N, unsigned long q, mpz_t rq)
168 {
169 #if 0
170   unsigned long j;
171   int cmp;
172   double skew, logmu, E;
173   mpz_poly F;
174 #endif
175 
176   mpz_t l, r, k, mprime, Nprime, C, l2, tmp, r1, r0, t, adm1, m, skew, root;
177   mpz_vector_t a, b, reduced_a, reduced_b;
178   mpz_poly f, g;
179 
180   mpz_init (root);
181   mpz_init (m);
182   mpz_init (adm1);
183   mpz_init (l);
184   mpz_init (l2);
185   mpz_init (r);
186   mpz_init (k);
187   mpz_init (mprime);
188   mpz_init (Nprime);
189   mpz_init (C);
190   mpz_init (tmp);
191   mpz_init (r1);
192   mpz_init (t);
193   mpz_init (r0);
194   mpz_init (skew);
195 
196   mpz_vector_init (a, d+1);
197   mpz_vector_init (b, d+1);
198   mpz_vector_init (reduced_a, d+1);
199   mpz_vector_init (reduced_b, d+1);
200 
201   mpz_poly_init (f, d);
202   mpz_poly_init (g, d);
203 
204 #ifdef DEBUG_POLYSELECT
205   gmp_printf ("#### MATCH ######\nN = %Zd\nd = %d\nad = %Zd\n"
206               "p1 = %lu\np2 = %lu\nq = %lu\ni = %" PRId64 "\n"
207               "rq = %Zd\n", N, d, ad, p1, p2, q, i, rq);
208 #endif
209 
210   /* l = p1*p2*q */
211   mpz_set_ui (l, p1);
212   mpz_mul_ui (l, l, p2);
213   mpz_mul_ui (l, l, q);
214 #ifdef DEBUG_POLYSELECT
215   gmp_printf ("l = p1 * p2 * q\nl == %Zd # has %lu bits\n", l,
216               mpz_sizeinbase(l, 2));
217 #endif
218   mpz_mul (l2, l, l); /* l2 = l^2 */
219 
220   /* r = rq + i*q^2 */
221   mpz_set_si (r, i);
222   mpz_mul_ui (r, r, q);
223   mpz_mul_ui (r, r, q);
224   mpz_add (r, r, rq);
225 #ifdef DEBUG_POLYSELECT
226   gmp_printf ("r = rq + i * q^2 \nr == %Zd # has %lu bits\n", r,
227               mpz_sizeinbase(r, 2));
228 #endif
229 
230   /* k = d^d * ad^(d-1) */
231   mpz_mul_ui (k, ad, d);
232   mpz_pow_ui (k, k, d-1);
233   mpz_mul_ui (k, k, d);
234 #ifdef DEBUG_POLYSELECT
235   gmp_printf ("k = d^d * ad^(d-1)\nk == %Zd\n", k);
236 #endif
237 
238   /* Nprime = k * N */
239   mpz_mul (Nprime, k, N);
240 #ifdef DEBUG_POLYSELECT
241   gmp_printf ("Nprime = k * N\nNprime == %Zd\n", Nprime);
242 #endif
243 
244   /* mprime = m0 + r */
245   mpz_add (mprime, m0, r);
246 #ifdef DEBUG_POLYSELECT
247   gmp_printf ("m0 = %Zd\nmprime = m0 + r\nmprime == %Zd\n", m0, mprime);
248 #endif
249 
250   /* C = mprime^d - Nprime */
251   mpz_pow_ui (C, mprime, d);
252   mpz_sub (C, C, Nprime);
253   ASSERT_ALWAYS (mpz_divisible_p (C, l2));
254 #ifdef DEBUG_POLYSELECT
255   gmp_printf ("(mprime^d - Nprime) %% l^2 == 0\n");
256 #endif
257 
258   /* adm1 is such that mprime = d*ad*m + adm1*l and -d*ad/2 <= adm1 < d*ad/2
259      We have adm1 = mprime/l mod (d*ad). */
260   mpz_mul_ui (tmp, ad, d); /* tmp = d*ad */
261   if (mpz_invert (adm1, l, tmp) == 0)
262   {
263     fprintf (stderr, "Error in 1/l mod (d*ad)\n");
264     abort();
265   }
266   mpz_mul (adm1, adm1, mprime);
267   mpz_mod (adm1, adm1, tmp);
268   mpz_mul_2exp (t, adm1, 1);
269   if (mpz_cmp (t, tmp) >= 0)
270     mpz_sub (adm1, adm1, tmp);
271 #ifdef DEBUG_POLYSELECT
272   gmp_printf ("adm1 = %Zd\n", adm1);
273 #endif
274 
275   /* m = (mprime - adm1 * l)/ (d * ad) */
276   mpz_mul (m, adm1, l);
277   mpz_sub (m, mprime, m);
278   ASSERT_ALWAYS (mpz_divisible_ui_p (m, d));
279   mpz_divexact_ui (m, m, d);
280   ASSERT_ALWAYS (mpz_divisible_p (m, ad));
281   mpz_divexact (m, m, ad);
282 #ifdef DEBUG_POLYSELECT
283   gmp_printf ("m = (mprime - adm1*l) / (d*ad)\nm == %Zd\n", m);
284 #endif
285 
286   /* Set vector a = (-m, l, 0, ..., 0) */
287   mpz_neg (tmp, m);
288   mpz_vector_setcoordinate (a, 0, tmp); /* a[0] = -m */
289   mpz_vector_setcoordinate (a, 1, l); /* a[1] = -l */
290   for (unsigned int j = 2; j <= d; j++)
291     mpz_vector_setcoordinate_ui (a, j, 0); /* a[j] = 0 */
292 
293   /* Set vector b = (a0, a1, ..., ai, ..., adm1, ad) */
294   mpz_vector_setcoordinate (b, d, ad);  /* b[d] = ad */
295   mpz_vector_setcoordinate (b, d-1, adm1); /* b[d-1] = adm1 */
296 
297 
298 
299 
300   mpz_pow_ui (t, m, d);
301   mpz_mul (t, t, ad);
302   mpz_sub (t, N, t);
303   ASSERT_ALWAYS (mpz_divisible_p (t, l));
304 
305   mpz_divexact (t, t, l);
306   mpz_pow_ui (tmp, m, d-1);
307   mpz_mul (tmp, tmp, adm1);
308   mpz_sub (t, t, tmp);
309   for (int j = d - 2; j > 0; j--)
310   {
311     ASSERT_ALWAYS (mpz_divisible_p (t, l));
312     mpz_divexact (t, t, l);
313     /* t = a_j*m^j + l*R thus a_j = t/m^j mod l */
314     mpz_pow_ui (tmp, m, j);
315     /* fdiv rounds toward -infinity: r1 = floor(t/tmp) */
316     mpz_fdiv_q (r1, t, tmp); /* t -> r1 * tmp + t */
317     mpz_invert (k, tmp, l); /* search r1 + k such that */
318 
319     mpz_mul (k, k, t);
320     mpz_sub (k, k, r1);
321     mpz_mod (k, k, l);
322 
323     mpz_mul_2exp (k, k, 1);
324     int cmp = mpz_cmp (k, l);
325     mpz_div_2exp (k, k, 1);
326     if (cmp >= 0)
327       mpz_sub (k, k, l);
328     mpz_add (r1, r1, k);
329     mpz_vector_setcoordinate (b, j, r1);
330     /* subtract r1*m^j */
331     mpz_submul (t, tmp, r1);
332   }
333   ASSERT_ALWAYS (mpz_divisible_p (t, l));
334   mpz_divexact (t, t, l);
335   mpz_vector_setcoordinate (b, 0, t);
336 
337 
338   mpz_vector_get_mpz_poly(f, a);
339   mpz_vector_get_mpz_poly(g, b);
340 #ifdef DEBUG_POLYSELECT
341   printf ("a = ");
342   mpz_poly_fprintf (stdout, f);
343   printf ("b = ");
344   mpz_poly_fprintf (stdout, g);
345 #endif
346 
347   mpz_vector_reduce_with_max_skew (reduced_a, reduced_b, skew, a, b, maxS, d);
348 
349   mpz_vector_get_mpz_poly(f, reduced_a);
350   mpz_vector_get_mpz_poly(g, reduced_b);
351 
352 #ifdef DEBUG_POLYSELECT
353   gmp_printf ("skew = %Zd\nf = ", skew);
354   mpz_poly_fprintf (stdout, f);
355   printf ("g = ");
356   mpz_poly_fprintf (stdout, g);
357 #endif
358 
359   mpz_invert (root, l, N);
360   mpz_mul (root, root, m);
361   mpz_mod (root, root, N);
362 #ifdef DEBUG_POLYSELECT
363   gmp_printf ("root = (m / l) %% N\nroot == %Zd\n", root);
364 
365   gmp_printf ("## Begin poly file for ad = %Zd and l = %Zd\n", ad, l);
366 #endif
367 
368   double skewness, logmu[2], alpha[2], E;
369 
370   skewness = L2_combined_skewness2 (f, g, SKEWNESS_DEFAULT_PREC);
371   logmu[0] = L2_lognorm (g, skewness);
372   logmu[1] = L2_lognorm (f, skewness);
373   alpha[0] = get_alpha (g, ALPHA_BOUND_SMALL);
374   alpha[1] = get_alpha (f, ALPHA_BOUND_SMALL);
375   E = logmu[1] + alpha[1] + logmu[0] + alpha[0];
376 
377   found ++;
378   aver_E += E;
379 
380   if (E < best_E)
381     {
382       mutex_lock (&lock);
383       best_E = E;
384       gmp_printf("n: %Zd\n", N);
385       for (int i = 0; i <= f->deg; i++)
386         gmp_printf ("c%d: %Zd\n", i, f->coeff[i]);
387       for (int i = 0; i <= g->deg; i++)
388         gmp_printf ("Y%d: %Zd\n", i, g->coeff[i]);
389       printf ("skew: %1.2f\n", skewness);
390       printf ("# f lognorm %1.2f, alpha %1.2f, score %1.2f\n",
391               logmu[1], alpha[1], logmu[1] + alpha[1]);
392       printf ("# g lognorm %1.2f, alpha %1.2f, score %1.2f\n",
393               logmu[0], alpha[0], logmu[0] + alpha[0]);
394       printf ("# f+g score %1.2f\n", E);
395       printf ("# found %lu polynomial(s) so far, aver. E = %1.2f\n",
396               found, aver_E / (double) found);
397 #ifdef DEBUG_POLYSELECT
398       printf ("## End poly file\n");
399 #else
400       printf ("\n");
401 #endif
402       mutex_unlock (&lock);
403     }
404 
405   mpz_clear (root);
406   mpz_clear (skew);
407   mpz_clear (adm1);
408   mpz_clear (l);
409   mpz_clear (l2);
410   mpz_clear (C);
411   mpz_clear (r);
412   mpz_clear (k);
413   mpz_clear (mprime);
414   mpz_clear (m);
415   mpz_clear (Nprime);
416   mpz_clear (tmp);
417   mpz_clear (r1);
418   mpz_clear (r0);
419   mpz_clear (t);
420 
421   mpz_poly_clear (f);
422   mpz_poly_clear (g);
423 
424   mpz_vector_clear (a);
425   mpz_vector_clear (b);
426   mpz_vector_clear (reduced_a);
427   mpz_vector_clear (reduced_b);
428 }
429 
430 void
gmp_match(uint32_t p1,uint32_t p2,int64_t i,mpz_t m0,mpz_t ad,unsigned long d,mpz_t N,uint64_t q,mpz_t rq)431 gmp_match (uint32_t p1, uint32_t p2, int64_t i, mpz_t m0,
432 	   mpz_t ad, unsigned long d, mpz_t N, uint64_t q, mpz_t rq)
433 {
434   match (p1, p2, i, m0, ad, d, N, q, rq);
435 }
436 
437 
438 /* find collisions between "P" primes, return number of loops */
439 static inline unsigned long
collision_on_p(header_t header,proots_t R)440 collision_on_p ( header_t header,
441                  proots_t R )
442 {
443   unsigned long i, j, nprimes, p, nrp, c = 0, tot_roots = 0;
444   uint64_t *rp;
445   int64_t ppl = 0, u, umax;
446   mpz_t *f, tmp;
447   int found = 0;
448   shash_t H;
449   int st = milliseconds ();
450 
451   /* init f for roots computation */
452   mpz_init_set_ui (tmp, 0);
453   f = (mpz_t*) malloc ((header->d + 1) * sizeof (mpz_t));
454   if (f == NULL) {
455     fprintf (stderr, "Error, cannot allocate memory in collision_on_p\n");
456     exit (1);
457   }
458   for (i = 0; i <= header->d; i++)
459     mpz_init (f[i]);
460   mpz_set_ui (f[header->d], 1);
461 
462   rp = (uint64_t*) malloc (header->d * sizeof (uint64_t));
463   if (rp == NULL) {
464     fprintf (stderr, "Error, cannot allocate memory in collision_on_p\n");
465     exit (1);
466   }
467 
468   shash_init (H, 4 * lenPrimes);
469   shash_reset (H);
470   umax = (int64_t) Primes[lenPrimes - 1] * (int64_t) Primes[lenPrimes - 1];
471   for (nprimes = 0; nprimes < lenPrimes; nprimes ++)
472     {
473       p = Primes[nprimes];
474       ppl = (int64_t) p * (int64_t) p;
475 
476       /* add fake roots to keep indices */
477       if (header_skip (header, p))
478         {
479           R->nr[nprimes] = 0; // nr = 0.
480           R->roots[nprimes] = NULL;
481           continue;
482         }
483 
484       nrp = roots_mod_uint64 (rp, mpz_fdiv_ui (header->Ntilde, p), header->d,
485                               p);
486       tot_roots += nrp;
487       roots_lift (rp, header->Ntilde, header->d, header->m0, p, nrp);
488       proots_add (R, nrp, rp, nprimes);
489       for (j = 0; j < nrp; j++, c++)
490             {
491               for (u = (int64_t) rp[j]; u < umax; u += ppl)
492                 shash_add (H, u);
493               for (u = ppl - (int64_t) rp[j]; u < umax; u += ppl)
494                 shash_add (H, -u);
495             }
496         }
497   found = shash_find_collision (H);
498   shash_clear (H);
499   free (rp);
500 
501   if (verbose > 2)
502     printf ("# computing %lu p-roots took %dms\n", tot_roots, st);
503 
504   if (found) /* do the real work */
505     {
506       hash_t H;
507 
508       hash_init (H, INIT_FACTOR * lenPrimes);
509       for (nprimes = 0; nprimes < lenPrimes; nprimes ++)
510         {
511           nrp = R->nr[nprimes];
512           if (nrp == 0)
513             continue;
514           p = Primes[nprimes];
515           ppl = (int64_t) p * (int64_t) p;
516           rp = R->roots[nprimes];
517 
518           for (j = 0; j < nrp; j++)
519             {
520               for (u = (int64_t) rp[j]; u < umax; u += ppl)
521                 hash_add (H, p, u, header->m0, header->ad, header->d,
522                           header->N, 1, tmp);
523               for (u = ppl - (int64_t) rp[j]; u < umax; u += ppl)
524                 hash_add (H, p, -u, header->m0, header->ad,
525                           header->d, header->N, 1, tmp);
526             }
527         }
528 #ifdef DEBUG_POLYSELECT
529       printf ("# collision_on_p took %lums\n", milliseconds () - st);
530       gmp_printf ("# p hash_size: %u for ad = %Zd\n", H->size, header->ad);
531 #endif
532 
533 #ifdef DEBUG_HASH_TABLE
534       printf ("# p hash_size: %u, hash_alloc: %u\n", H->size, H->alloc);
535       printf ("# hash table coll: %lu, all_coll: %lu\n", H->coll, H->coll_all);
536 #endif
537       hash_clear (H);
538     }
539 
540   for (i = 0; i <= header->d; i++)
541     mpz_clear (f[i]);
542   free (f);
543   mpz_clear (tmp);
544 
545   return c;
546 }
547 
548 
549 /* collision on each special-q, call collision_on_batch_p() */
550 static inline void
collision_on_each_sq(header_t header,proots_t R,unsigned long q,mpz_t rqqz,unsigned long * inv_qq)551 collision_on_each_sq ( header_t header,
552                        proots_t R,
553                        unsigned long q,
554                        mpz_t rqqz,
555                        unsigned long *inv_qq )
556 {
557   shash_t H;
558   uint64_t **cur1, **cur2, *ccur1, *ccur2;
559   long *pc, *epc;
560   uint64_t pp;
561   int64_t ppl, neg_umax, umax, v1, v2, nv;
562   unsigned long p, nprimes, c;
563   uint8_t vpnr, *pnr, nr, j;
564   uint32_t *pprimes, i;
565   int found;
566 
567 #ifdef DEBUG_POLYSELECT
568   int st = milliseconds();
569 #endif
570 #if SHASH_NBUCKETS == 256
571 #define CURRENT(V) (H->current + (uint8_t) (V))
572 #else
573 #define CURRENT(V) (H->current + ((V) & (SHASH_NBUCKETS - 1)))
574 #endif
575   /*
576   uint64_t t1, t2;
577   static uint64_t sum1 = 0, sum2 = 0;
578   */
579 
580   shash_init (H, 4 * lenPrimes);
581   shash_reset (H);
582 
583   pc = (long *) inv_qq;
584   nv = *pc;
585   pprimes = Primes - 1;
586   pnr = R->nr;
587   R->nr[R->size] = 0xff; /* I use guard to end */
588   umax = Primes[lenPrimes - 1];
589   umax *= umax;
590   neg_umax = -umax;
591 
592   /* This define inserts 2 values v1 and v2 with a interlace.
593      The goal is to have a little time to read ccurX from L0
594      cache before to use it. The best seems a
595      three read interlacing in fact, two seems too short. */
596 #define INSERT_2I(I1,I2)                                                \
597   do {                                                                  \
598     cur1 = CURRENT(I1); ccur1 = *cur1;					\
599     cur2 = CURRENT(I2); ccur2 = *cur2;					\
600     *ccur1++ = I1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;	\
601     *ccur2++ = I2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2;	\
602   } while (0)
603   /* This version is slow because ccur1 is used immediatly after
604      it has been read from L0 cache -> 3 ticks of latency on P4 Nehalem. */
605 #define INSERT_I(I)						\
606   do {								\
607     cur1 = CURRENT(I); ccur1 = *cur1; *ccur1++ = I;		\
608     __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;		\
609   } while (0)
610 
611   int64_t b;
612   b = (int64_t) ((double) umax * 0.3333333333333333);
613   do {
614     do {
615       vpnr = *pnr++;
616       pprimes++;
617     } while (!vpnr);
618     if (UNLIKELY(vpnr == 0xff)) goto bend;
619     ppl = *pprimes;
620     __builtin_prefetch(((void *) pnr) + 0x040, 0, 3);
621     __builtin_prefetch(((void *) pprimes) + 0x80, 0, 3);
622     __builtin_prefetch(((void *) pc) + 0x100, 0, 3);
623     ppl *= ppl;
624     epc = pc + vpnr;
625     if (UNLIKELY(ppl > b)) { b = umax >> 1; goto iter2; }
626     do {
627       v1 = nv;                    cur1 = CURRENT(v1); ccur1 = *cur1;
628       v2 = v1 - ppl;              cur2 = CURRENT(v2); ccur2 = *cur2;
629       nv = *++pc;
630       *ccur1++ = v1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;
631       *ccur2++ = v2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2;
632       v1 += ppl;                  cur1 = CURRENT(v1); ccur1 = *cur1;
633       v2 -= ppl;                  cur2 = CURRENT(v2); ccur2 = *cur2;
634       *ccur1++ = v1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;
635       *ccur2++ = v2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2;
636       v1 += ppl;                  cur1 = CURRENT(v1); ccur1 = *cur1;
637       v2 -= ppl;                  cur2 = CURRENT(v2); ccur2 = *cur2;
638       *ccur1++ = v1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;
639       *ccur2++ = v2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2;
640       v1 += ppl; v2 -= ppl;
641       if (LIKELY (v1 > umax)) {
642 	if (UNLIKELY (v2 >= neg_umax)) INSERT_I(v2);
643       } else if (UNLIKELY (v2 >= neg_umax)) INSERT_2I(v1, v2);
644       else INSERT_I(v1);
645     } while (pc != epc);
646   } while (1);
647 
648   do {
649     do {
650       vpnr = *pnr++;
651       pprimes++;
652     } while (!vpnr);
653     if (UNLIKELY(vpnr == 0xff)) goto bend;
654     ppl = *pprimes;
655     __builtin_prefetch(((void *) pnr) + 0x040, 0, 3);
656     __builtin_prefetch(((void *) pprimes) + 0x100, 0, 3);
657     __builtin_prefetch(((void *) pc) + 0x280, 0, 3);
658     ppl *= ppl;
659     epc = pc + vpnr;
660   iter2:
661     if (UNLIKELY(ppl > b)) goto iter1;
662     do {
663       v1 = nv;                    cur1 = CURRENT(v1); ccur1 = *cur1;
664       v2 = v1 - ppl;              cur2 = CURRENT(v2); ccur2 = *cur2;
665       nv = *++pc;
666       *ccur1++ = v1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;
667       *ccur2++ = v2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2;
668       v1 += ppl;                  cur1 = CURRENT(v1); ccur1 = *cur1;
669       v2 -= ppl;                  cur2 = CURRENT(v2); ccur2 = *cur2;
670       *ccur1++ = v1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;
671       *ccur2++ = v2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2;
672       v1 += ppl; v2 -= ppl;
673       if (LIKELY (v1 > umax)) {
674 	if (UNLIKELY (v2 >= neg_umax)) INSERT_I(v2);
675       } else if (UNLIKELY (v2 >= neg_umax)) INSERT_2I(v1, v2);
676       else INSERT_I(v1);
677     } while (pc != epc);
678   } while (1);
679 
680   do {
681     do {
682       vpnr = *pnr++;
683       pprimes++;
684     } while (!vpnr);
685     if (UNLIKELY(vpnr == 0xff)) goto bend;
686     ppl = *pprimes;
687     __builtin_prefetch(((void *) pnr) + 0x040, 0, 3);
688     __builtin_prefetch(((void *) pprimes) + 0x100, 0, 3);
689     __builtin_prefetch(((void *) pc) + 0x280, 0, 3);
690     ppl *= ppl;
691     epc = pc + vpnr;
692   iter1:
693     do {
694       v1 = nv;                    cur1 = CURRENT(v1); ccur1 = *cur1;
695       v2 = v1 - ppl;              cur2 = CURRENT(v2); ccur2 = *cur2;
696       nv = *++pc;
697       *ccur1++ = v1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;
698       *ccur2++ = v2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2;
699       v1 += ppl; v2 -= ppl;
700       if (LIKELY (v1 > umax)) {
701 	if (UNLIKELY (v2 >= neg_umax)) INSERT_I(v2);
702       } else if (UNLIKELY (v2 >= neg_umax)) INSERT_2I(v1, v2);
703       else INSERT_I(v1);
704     } while (pc != epc);
705   } while (1);
706 
707  bend:
708 #undef INSERT_2I
709 #undef INSERT_I
710 
711   for (i = 0; i < SHASH_NBUCKETS; i++) ASSERT (H->current[i] <= H->base[i+1]);
712 
713   found = shash_find_collision (H);
714   shash_clear (H);
715 
716   if (found) /* do the real work */
717     {
718       hash_t H;
719 
720       hash_init (H, INIT_FACTOR * lenPrimes);
721 
722       umax = (int64_t) Primes[lenPrimes - 1] * (int64_t) Primes[lenPrimes - 1];
723       for (nprimes = c = 0; nprimes < lenPrimes; nprimes ++)
724         {
725           p = Primes[nprimes];
726           if (header_skip (header, p))
727             continue;
728           pp = p * p;
729           ppl = (long) pp;
730           nr = R->nr[nprimes];
731           for (j = 0; j < nr; j++, c++)
732             {
733               v1 = (long) inv_qq[c];
734               for (v2 = v1; v2 < umax; v2 += ppl)
735                 hash_add (H, p, v2, header->m0, header->ad, header->d,
736                           header->N, q, rqqz);
737               for (v2 = ppl - v1; v2 < umax; v2 += ppl)
738                 hash_add (H, p, -v2, header->m0, header->ad, header->d,
739                           header->N, q, rqqz);
740             }
741         }
742       hash_clear (H);
743     }
744 
745 #ifdef DEBUG_POLYSELECT
746   printf ("# inner collision_on_each_sq took %lums\n", milliseconds () - st);
747   printf ("# - q hash_alloc (q=%lu): %u\n", q, H->alloc);
748 #endif
749 
750 #ifdef DEBUG_HASH_TABLE
751   printf ("# p hash_size: %u, hash_alloc: %u\n", H->size, H->alloc);
752   printf ("# hash table coll: %lu, all_coll: %lu\n", H->coll, H->coll_all);
753 #endif
754 }
755 
756 
757 /* Given p, rp, q, invqq[], for each rq of q, compute (rp - rq) / q^2 */
758 static inline void
collision_on_each_sq_r(header_t header,proots_t R,unsigned long q,mpz_t * rqqz,unsigned long * inv_qq,unsigned long number_pr,int count)759 collision_on_each_sq_r ( header_t header,
760                          proots_t R,
761                          unsigned long q,
762                          mpz_t *rqqz,
763                          unsigned long *inv_qq,
764                          unsigned long number_pr,
765                          int count )
766 {
767   if (count == 0)
768     return;
769 
770   uint8_t i, nr, *pnr;
771   unsigned long nprimes, p, c = 0, rp, rqi;
772   int k;
773   uint64_t pp;
774   unsigned long **tinv_qq = malloc (count * sizeof (unsigned long*));
775 
776   if (!tinv_qq)
777   {
778     fprintf (stderr, "Error, cannot allocate memory in %s\n", __FUNCTION__);
779     exit (1);
780   }
781   for (k = 0; k < count; k++) {
782     /* number_pr + 1 for guard for pre-load in collision_on_each_sq (nv) */
783     tinv_qq[k] = malloc ((number_pr + 1) * sizeof (unsigned long));
784     tinv_qq[k][number_pr] = 0;
785   }
786 
787   int st = milliseconds();
788   pnr = R->nr;
789 
790   /* for each rp, compute (rp-rq)*1/q^2 (mod p^2) */
791   for (nprimes = 0; nprimes < lenPrimes; nprimes ++)
792   {
793     if (!pnr[nprimes]) continue;
794     nr = pnr[nprimes];
795     p = Primes[nprimes];
796     pp = p*p;
797 
798     modulusredcul_t modpp;
799     residueredcul_t res_rqi, res_rp, res_tmp;
800     modredcul_initmod_ul_raw (modpp, pp);
801     modredcul_init (res_rqi, modpp);
802     modredcul_init (res_rp, modpp);
803     modredcul_init (res_tmp, modpp);
804 
805     for (k = 0; k < count; k ++)
806     {
807       rqi = mpz_fdiv_ui (rqqz[k], pp);
808       modredcul_intset_ul (res_rqi, rqi);
809       modredcul_intset_ul (res_tmp, inv_qq[nprimes]);
810       for (i = 0; i < nr; i ++, c++)
811       {
812         rp = R->roots[nprimes][i];
813         modredcul_intset_ul (res_rp, rp);
814         /* rp - rq */
815         modredcul_sub (res_rp, res_rp, res_rqi, modpp);
816         /* res_rp = (rp - rq) / q[i]^2 */
817         modredcul_mul (res_rp, res_rp, res_tmp, modpp);
818         tinv_qq[k][c] = modredcul_intget_ul (res_rp);
819       }
820       c -= nr;
821     }
822     c += nr;
823 
824     modredcul_clear (res_rp, modpp);
825     modredcul_clear (res_rqi, modpp);
826     modredcul_clear (res_tmp, modpp);
827     modredcul_clearmod (modpp);
828   }
829 
830   if (verbose > 2) {
831     printf ("#  substage: batch %d many (rp-rq)*1/q^2 took %lums\n",
832             count, milliseconds () - st);
833     st = milliseconds();
834   }
835 
836   /* core function to find collisions */
837   for (k = 0; k < count; k ++) {
838     collision_on_each_sq (header, R, q, rqqz[k], tinv_qq[k]);
839   }
840 
841   if (verbose > 2)
842     printf ("#  substage: collision-detection %d many rq took %lums\n",
843             count, milliseconds () - st);
844 
845   for (k = 0; k < count; k++)
846     free (tinv_qq[k]);
847   free (tinv_qq);
848 }
849 
850 
851 /* Next combination */
852 static inline unsigned int
aux_nextcomb(unsigned int * ind,unsigned int len_q,unsigned int * len_nr)853 aux_nextcomb ( unsigned int *ind,
854                unsigned int len_q,
855                unsigned int *len_nr )
856 {
857   unsigned int i;
858 
859   /* bottom change first */
860   for (i = len_q - 1; ; i--) {
861     if (ind[i] < (len_nr[i] - 1)) {
862       ind[i]++;
863       return 1;
864     }
865     else {
866       if (i == 0)
867         break;
868       ind[i] = 0;
869     }
870   }
871   return 0;
872 }
873 
874 
875 /* Compute crted rq */
876 static inline void
aux_return_rq(qroots_t SQ_R,unsigned long * idx_q,unsigned int * idx_nr,unsigned long k,mpz_t qqz,mpz_t rqqz,unsigned long lq)877 aux_return_rq ( qroots_t SQ_R,
878                 unsigned long *idx_q,
879                 unsigned int *idx_nr,
880                 unsigned long k,
881                 mpz_t qqz,
882                 mpz_t rqqz,
883                 unsigned long lq )
884 {
885   unsigned long i, q[k], rq[k];
886 
887   /* q and roots */
888   for (i = 0; i < k; i ++) {
889     q[i] = SQ_R->q[idx_q[i]];
890     rq[i] = SQ_R->roots[idx_q[i]][idx_nr[i]];
891   }
892 
893   /* crt roots */
894   crt_sq (qqz, rqqz, q, rq, lq);
895 
896   return;
897 }
898 
899 
900 /* Consider each rq */
901 static inline void
collision_on_batch_sq_r(header_t header,proots_t R,qroots_t SQ_R,unsigned long q,unsigned long * idx_q,unsigned long * inv_qq,unsigned long number_pr,int * curr_nq,unsigned long lq)902 collision_on_batch_sq_r ( header_t header,
903                           proots_t R,
904                           qroots_t SQ_R,
905                           unsigned long q,
906                           unsigned long *idx_q,
907                           unsigned long *inv_qq,
908                           unsigned long number_pr,
909                           int *curr_nq,
910                           unsigned long lq )
911 {
912   int count;
913   unsigned int ind_qr[lq]; /* indices of roots for each small q */
914   unsigned int len_qnr[lq]; /* for each small q, number of roots */
915   unsigned long i;
916   mpz_t qqz, rqqz[BATCH_SIZE];
917 
918   mpz_init (qqz);
919   for (i = 0; i < BATCH_SIZE; i ++)
920     mpz_init (rqqz[i]);
921 
922   /* initialization indices */
923   for (i = 0; i < lq; i ++) {
924     ind_qr[i] = 0;
925     len_qnr[i] = SQ_R->nr[idx_q[i]];
926   }
927 
928 #if 0
929   printf ("q: %lu, ", q);
930   for (i = 0; i < lq; i ++)
931     printf ("%u ", SQ_R->q[idx_q[i]]);
932   printf (", ");
933   for (i = 0; i < lq; i ++)
934     printf ("%u ", SQ_R->nr[idx_q[i]]);
935   printf ("\n");
936 #endif
937 
938   /* we proceed with BATCH_SIZE many rq for each time */
939   i = count = 0;
940   int re = 1, num_rq;
941   while (re) {
942     /* compute BATCH_SIZE such many rqqz[] */
943     num_rq = 0;
944     for (count = 0; count < BATCH_SIZE; count ++)
945     {
946         aux_return_rq (SQ_R, idx_q, ind_qr, lq, qqz, rqqz[count], lq);
947         re = aux_nextcomb (ind_qr, lq, len_qnr);
948         (*curr_nq)++;
949         num_rq ++;
950         if ((*curr_nq) >= nq)
951           re = 0;
952         if (!re)
953           break;
954     }
955 
956     /* core function for a fixed qq and several rqqz[] */
957     collision_on_each_sq_r (header, R, q, rqqz, inv_qq, number_pr, num_rq);
958   }
959 
960   mpz_clear (qqz);
961   for (i = 0; i < BATCH_SIZE; i ++)
962     mpz_clear (rqqz[i]);
963 }
964 
965 
966 /* SQ inversion, write 1/q^2 (mod p_i^2) to invqq[i] */
967 static inline void
collision_on_batch_sq(header_t header,proots_t R,qroots_t SQ_R,unsigned long q,unsigned long * idx_q,unsigned long number_pr,unsigned long lq)968 collision_on_batch_sq ( header_t header,
969                         proots_t R,
970                         qroots_t SQ_R,
971                         unsigned long q,
972                         unsigned long *idx_q,
973                         unsigned long number_pr,
974                         unsigned long lq )
975 {
976   unsigned nr;
977   int curr_nq = 0;
978   uint64_t pp;
979   unsigned long nprimes, p;
980   unsigned long *invqq = malloc (lenPrimes * sizeof (unsigned long));
981   if (!invqq) {
982     fprintf (stderr, "Error, cannot allocate memory in %s\n", __FUNCTION__);
983     exit (1);
984   }
985 
986   int st = milliseconds();
987 
988   /* Step 1: inversion */
989   for (nprimes = 0; nprimes < lenPrimes; nprimes ++) {
990 
991     p = Primes[nprimes];
992     if (header_skip (header, p))
993       continue;
994     nr = R->nr[nprimes];
995     if (nr == 0)
996       continue;
997     pp = p * p;
998 
999     modulusredcul_t modpp;
1000     residueredcul_t qq, tmp;
1001     modredcul_initmod_ul (modpp, pp);
1002     modredcul_init (qq, modpp);
1003     modredcul_init (tmp, modpp);
1004 
1005     /* q^2/B (mod pp) */
1006     modredcul_intset_ul (tmp, q);
1007     modredcul_sqr (qq, tmp, modpp);
1008     /* B/q^2 (mod pp) */
1009     modredcul_intinv (tmp, qq, modpp);
1010     invqq[nprimes] = modredcul_intget_ul (tmp);
1011 
1012     modredcul_clear (tmp, modpp);
1013     modredcul_clear (qq, modpp);
1014     modredcul_clearmod (modpp);
1015   }
1016 
1017   if (verbose > 2)
1018     printf ("# stage (1/q^2 inversion) for %lu primes took %lums\n",
1019             lenPrimes, milliseconds () - st);
1020 
1021   /* Step 2: find collisions on q. */
1022   int st2 = milliseconds();
1023 
1024   collision_on_batch_sq_r ( header, R, SQ_R, q, idx_q, invqq, number_pr,
1025                             &curr_nq, lq );
1026   if (verbose > 2)
1027     printf ("#  stage (special-q) for %d special-q's took %lums\n",
1028             curr_nq, milliseconds() - st2);
1029 
1030   free (invqq);
1031 }
1032 
1033 
1034 /* collision on special-q, call collision_on_batch_sq */
1035 static inline void
collision_on_sq(header_t header,proots_t R,unsigned long c)1036 collision_on_sq ( header_t header,
1037                   proots_t R,
1038                   unsigned long c )
1039 {
1040   int prod = 1;
1041   unsigned int i;
1042   unsigned long j, lq = 0UL;
1043   qroots_t SQ_R;
1044 
1045   /* init special-q roots */
1046   qroots_init (SQ_R);
1047   comp_sq_roots (header, SQ_R);
1048   //qroots_print (SQ_R);
1049 
1050   /* find a suitable lq */
1051   for (i = 0; i < SQ_R->size; i++) {
1052     if (prod < nq) {
1053       if (!check_parameters (header->m0, header->d, lq))
1054         break;
1055       prod *= SQ_R->nr[i];
1056       lq ++;
1057     }
1058   }
1059 
1060   /* lq < 8 for the moment */
1061   if (lq > 7)
1062     lq = 7;
1063   if (lq < 1)
1064     lq = 1;
1065 
1066   unsigned long q, idx_q[lq];
1067 
1068   for (j = 0; j < lq; j ++)
1069     idx_q[j] = j;
1070   q = return_q_norq (SQ_R, idx_q, lq);
1071 
1072   /* collision batch */
1073   collision_on_batch_sq (header, R, SQ_R, q, idx_q, c, lq);
1074 
1075   /* clean */
1076   qroots_clear (SQ_R);
1077   return;
1078 }
1079 
1080 
1081 static void
newAlgo(mpz_t N,unsigned long d,mpz_t ad)1082 newAlgo (mpz_t N, unsigned long d, mpz_t ad)
1083 {
1084   unsigned long c = 0;
1085   header_t header;
1086   proots_t R;
1087 
1088   header_init (header, N, d, ad);
1089   proots_init (R, lenPrimes);
1090 
1091   c = collision_on_p (header, R);
1092   if (nq > 0)
1093     collision_on_sq (header, R, c);
1094 
1095   proots_clear (R, lenPrimes);
1096   header_clear (header);
1097 }
1098 
1099 void*
one_thread(void * args)1100 one_thread (void* args)
1101 {
1102   tab_t *tab = (tab_t*) args;
1103   newAlgo (tab[0]->N, tab[0]->d, tab[0]->ad);
1104   return NULL;
1105 }
1106 
1107 static void
declare_usage(param_list pl)1108 declare_usage(param_list pl)
1109 {
1110   param_list_decl_usage(pl, "degree", "polynomial degree (2 or 3, "
1111                                       "default is 3)");
1112   param_list_decl_usage(pl, "n", "(required, alias N) input number");
1113   param_list_decl_usage(pl, "P", "(required) deg-1 coeff of g(x) has two prime factors in [P,2P]\n");
1114 
1115   param_list_decl_usage(pl, "admax", "max value for ad (+1)");
1116   param_list_decl_usage(pl, "admin", "min value for ad (default 0)");
1117   param_list_decl_usage(pl, "incr", "increment of ad (default 60)");
1118   param_list_decl_usage(pl, "skewness", "maximun skewness possible "
1119                                         "(default N^(1/9))");
1120   param_list_decl_usage(pl, "maxtime", "stop the search after maxtime seconds");
1121 
1122   char str[200];
1123   snprintf (str, 200, "maximum number of special-q's considered\n"
1124             "               for each ad (default %d)", INT_MAX);
1125   param_list_decl_usage(pl, "nq", str);
1126   param_list_decl_usage(pl, "keep", "number of polynomials kept (default 10)");
1127   snprintf(str, 200, "time interval (seconds) for printing statistics (default %d)", TARGET_TIME / 1000);
1128   param_list_decl_usage(pl, "s", str);
1129   param_list_decl_usage(pl, "t", "number of threads to use (default 1)");
1130   param_list_decl_usage(pl, "v", "verbose mode");
1131   param_list_decl_usage(pl, "q", "quiet mode");
1132   snprintf (str, 200, "sieving area (default %.2e)", AREA);
1133   param_list_decl_usage(pl, "area", str);
1134   snprintf (str, 200, "algebraic smoothness bound (default %.2e)", BOUND_F);
1135   param_list_decl_usage(pl, "Bf", str);
1136   snprintf (str, 200, "rational smoothness bound (default %.2e)", BOUND_G);
1137   param_list_decl_usage(pl, "Bg", str);
1138   verbose_decl_usage(pl);
1139 }
1140 
1141 static void
usage(const char * argv,const char * missing,param_list pl)1142 usage (const char *argv, const char * missing, param_list pl)
1143 {
1144   if (missing) {
1145     fprintf(stderr, "\nError: missing or invalid parameter \"-%s\"\n",
1146         missing);
1147   }
1148   param_list_print_usage(pl, argv, stderr);
1149   exit (EXIT_FAILURE);
1150 }
1151 
1152 int
main(int argc,char * argv[])1153 main (int argc, char *argv[])
1154 {
1155   char **argv0 = argv;
1156   double st0 = seconds (), maxtime = DBL_MAX;
1157   mpz_t N, admin, admax;
1158   unsigned int d = 3;
1159   unsigned long P = 0;
1160   int quiet = 0, tries = 0, i, nthreads = 1, st,
1161     target_time = TARGET_TIME;
1162   tab_t *T;
1163   pthread_t *tid;
1164 
1165   mpz_init (N);
1166   mpz_init (maxS);
1167   cado_poly_init (best_poly);
1168   cado_poly_init (curr_poly);
1169 
1170   /* read params */
1171   param_list pl;
1172   param_list_init (pl);
1173 
1174   declare_usage(pl);
1175 
1176   param_list_configure_switch (pl, "-v", &verbose);
1177   param_list_configure_switch (pl, "-q", &quiet);
1178   param_list_configure_alias(pl, "incr", "-i");
1179   param_list_configure_alias(pl, "n", "-N");
1180   param_list_configure_alias(pl, "degree", "-d");
1181 
1182   if (argc == 1)
1183     usage (argv0[0], NULL, pl);
1184 
1185   argv++, argc--;
1186   for ( ; argc; ) {
1187     if (param_list_update_cmdline (pl, &argc, &argv)) continue;
1188     fprintf (stderr, "Unhandled parameter %s\n", argv[0]);
1189     usage (argv0[0], NULL, pl);
1190   }
1191 
1192   /* parse and check N in the first place */
1193   int have_n = param_list_parse_mpz(pl, "n", N);
1194 
1195   if (!have_n) {
1196     printf ("# Reading n from stdin\n");
1197     param_list_read_stream(pl, stdin, 0);
1198     have_n = param_list_parse_mpz(pl, "n", N);
1199   }
1200 
1201   if (!have_n) {
1202       fprintf(stderr, "No n defined ; sorry.\n");
1203       exit(1);
1204   }
1205 
1206   if (mpz_cmp_ui (N, 0) <= 0) usage(argv0[0], "n", pl);
1207 
1208   param_list_parse_ulong(pl, "P", &P);
1209   if (P == 0) usage(argv0[0], "P", pl);
1210 
1211   param_list_parse_int (pl, "t", &nthreads);
1212   param_list_parse_int (pl, "nq", &nq);
1213   param_list_parse_int (pl, "s", &target_time);
1214   param_list_parse_uint (pl, "degree", &d);
1215   ASSERT_ALWAYS (2 <= d && d <= 3);
1216   if (param_list_parse_double (pl, "area", &area) == 0) /* no -area */
1217     area = AREA;
1218   if (param_list_parse_double (pl, "Bf", &bound_f) == 0) /* no -Bf */
1219     bound_f = BOUND_F;
1220   if (param_list_parse_double (pl, "Bg", &bound_g) == 0) /* no -Bg */
1221     bound_g = BOUND_G;
1222 
1223   mpz_init (admin);
1224   param_list_parse_mpz (pl, "admin", admin);
1225 
1226   mpz_init (admax);
1227   if (param_list_parse_mpz (pl, "admax", admax) == 0) /* no -admax */
1228     mpz_set (admax, N);
1229 
1230   param_list_parse_ulong (pl, "incr", &incr);
1231   param_list_parse_double (pl, "maxtime", &maxtime);
1232 
1233   if (!param_list_parse_mpz(pl, "skewness", maxS))
1234     mpz_set_ui (maxS, 0);
1235   else if (mpz_cmp_ui (maxS, 1) < 0)
1236   {
1237     gmp_fprintf(stderr, "Error, skewness (%Zd) should be greater or equal "
1238                         "to 1\n", maxS);
1239     abort();
1240   }
1241 
1242   if (param_list_warn_unused(pl))
1243     usage (argv0[0], NULL, pl);
1244 
1245   /* print command line */
1246   verbose_interpret_parameters(pl);
1247   param_list_print_command_line (stdout, pl);
1248 
1249   /* check lq and nq */
1250   if (nq < 0) {
1251     fprintf (stderr, "Error, number of special-q's should >= 0\n");
1252     exit (1);
1253   }
1254 
1255   /* allocate threads */
1256   tid = malloc (nthreads * sizeof (pthread_t));
1257   ASSERT_ALWAYS(tid != NULL);
1258 
1259   /* quiet mode */
1260   if (quiet == 1)
1261     verbose = -1;
1262 
1263   /* set cpoly */
1264   mpz_set (best_poly->n, N);
1265   mpz_set (curr_poly->n, N);
1266   best_poly->pols[ALG_SIDE]->deg = d;
1267   best_poly->pols[RAT_SIDE]->deg = d;
1268   curr_poly->pols[ALG_SIDE]->deg = d;
1269   curr_poly->pols[RAT_SIDE]->deg = d;
1270 
1271   /* Compute maxS: use maxS if maxS argument is greater than 0 and lesser than
1272      default value */
1273   mpz_t tmp;
1274   mpz_init (tmp);
1275   compute_default_max_skew (tmp, N, 2);
1276   if (mpz_cmp_ui(maxS, 0) == 0 || mpz_cmp(maxS, tmp) > 0)
1277     mpz_set (maxS, tmp);
1278 
1279   /* Initialize primes. Since we use modredc to perform computations mod p^2
1280      for p < 2P, we need that 4P^2 fits in an unsigned long. */
1281   double Pd;
1282   Pd = (double) P;
1283   if (4.0 * Pd * Pd >= (double) ULONG_MAX)
1284     {
1285       mpz_set_ui (tmp, ULONG_MAX >> 2);
1286       mpz_sqrt (tmp, tmp);
1287       gmp_fprintf (stderr, "Error, too large value of P, maximum is %Zd\n",
1288                    tmp);
1289       exit (1);
1290     }
1291   mpz_clear (tmp);
1292   if (P <= (unsigned long) SPECIAL_Q[LEN_SPECIAL_Q - 2]) {
1293     fprintf (stderr, "Error, too small value of P, need P > %u\n",
1294              SPECIAL_Q[LEN_SPECIAL_Q - 2]);
1295     exit (1);
1296   }
1297 
1298   /* detect L1 cache size */
1299   ropt_L1_cachesize ();
1300 
1301   st = milliseconds ();
1302   lenPrimes = initPrimes (P, &Primes);
1303 
1304   printf ( "# Info: initializing %lu P primes took %lums,"
1305            " nq=%d, target_time=%d\n",
1306            lenPrimes, milliseconds () - st, nq, target_time / 1000 );
1307 
1308 
1309   printf ( "# Info: estimated peak memory=%.2fMB (%d thread(s),"
1310            " batch %d inversions on SQ)\n",
1311            (double) (nthreads * (BATCH_SIZE * 2 + INIT_FACTOR) * lenPrimes
1312            * (sizeof(uint32_t) + sizeof(uint64_t)) / 1024 / 1024),
1313            nthreads, BATCH_SIZE );
1314 
1315   //printPrimes (Primes, lenPrimes);
1316 
1317   /* init tabs_t for threads */
1318   T = malloc (nthreads * sizeof (tab_t));
1319   if (T == NULL)
1320   {
1321     fprintf (stderr, "Error, cannot allocate memory in main\n");
1322     exit (1);
1323   }
1324   for (i = 0; i < nthreads ; i++)
1325   {
1326     mpz_init_set (T[i]->N, N);
1327     mpz_init (T[i]->ad);
1328     T[i]->d = d;
1329     T[i]->thread = i;
1330   }
1331 
1332   if (incr <= 0)
1333   {
1334     fprintf (stderr, "Error, incr should be positive\n");
1335     exit (1);
1336   }
1337 
1338   /* admin should be nonnegative */
1339   if (mpz_cmp_ui (admin, 0) < 0)
1340     {
1341       fprintf (stderr, "Error, admin should be nonnegative\n");
1342       exit (1);
1343     }
1344 
1345   /* ensure admin > 0 and admin is a multiple of incr */
1346   if (mpz_cmp_ui (admin, 0) == 0)
1347     mpz_set_ui (admin, incr);
1348   else if (mpz_fdiv_ui (admin, incr) != 0)
1349     mpz_add_ui (admin, admin, incr - mpz_fdiv_ui (admin, incr));
1350 
1351   while (mpz_cmp (admin, admax) < 0 && seconds () - st0 <= maxtime)
1352   {
1353     for (i = 0; i < nthreads ; i++)
1354     {
1355       tries ++;
1356       if (verbose >= 1)
1357         gmp_printf ("# %d ad=%Zd\n", tries, admin);
1358       mpz_set (T[i]->ad, admin);
1359       pthread_create (&tid[i], NULL, one_thread, (void *) (T+i));
1360       mpz_add_ui (admin, admin, incr);
1361     }
1362     for (i = 0 ; i < nthreads ; i++)
1363       pthread_join(tid[i], NULL);
1364   }
1365 
1366   printf ("# Stat: tried %d ad-value(s), found %lu polynomial(s)\n", tries,
1367           found);
1368 
1369   /* print total time (this gets parsed by the scripts) */
1370   printf ("# Stat: total phase took %.2fs\n", seconds () - st0);
1371 
1372   if (best_E == DBL_MAX)
1373     /* This line is required by the script: */
1374     printf ("# No polynomial found, please increase the ad range or decrease P\n");
1375 
1376   for (i = 0; i < nthreads ; i++)
1377     {
1378       mpz_clear (T[i]->N);
1379       mpz_clear (T[i]->ad);
1380     }
1381   free (T);
1382   mpz_clear (N);
1383   mpz_clear (admin);
1384   mpz_clear (admax);
1385   mpz_clear (maxS);
1386   clearPrimes (&Primes);
1387   cado_poly_clear (best_poly);
1388   cado_poly_clear (curr_poly);
1389   param_list_clear (pl);
1390   free (tid);
1391 
1392   return 0;
1393 }
1394