1 #include <gmp.h>
2 /*
3 Polynomial selection using Kleinjung's algorithm (cf slides presented
4 at the CADO Workshop in October 2008, Nancy, France).
5
6 [1. Run and parameters]
7
8 The parameters are similar to those in polyselect2.c, except the following,
9
10 "-nq xxx" denotes the number of special-q's trials for each ad;
11
12 Please report bugs to the Bug Tracking System on:
13 https://gforge.inria.fr/tracker/?atid=7442&group_id=2065&func=browse
14 */
15
16 #define EMIT_ADDRESSABLE_shash_add
17
18 #include "cado.h" // IWYU pragma: keep
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <limits.h>
22 #include <pthread.h>
23 #include <float.h> // DBL_MAX
24 #include <math.h> // pow
25 #include "mpz_poly.h"
26 #include "area.h"
27 #include "polyselect_str.h"
28 #include "polyselect_arith.h"
29 #include "modredc_ul.h"
30 #include "mpz_vector.h"
31 #include "roots_mod.h" // roots_mod_uint64
32 #include "timing.h" // milliseconds
33 #include "verbose.h" // verbose_decl_usage
34 #include "auxiliary.h" // DEFAULT_INCR
35
36 #define TARGET_TIME 10000000 /* print stats every TARGET_TIME milliseconds */
37 #define NEW_ROOTSIEVE
38 #define INIT_FACTOR 8UL
39 #define PREFIX_HASH
40 //#define DEBUG_POLYSELECT
41
42 #ifdef NEW_ROOTSIEVE
43 #include "ropt.h"
44 #include "macros.h"
45 #include "params.h"
46 #endif
47
48 #ifdef PREFIX_HASH
49 char *phash = "# ";
50 #else
51 char *phash = "";
52 #endif
53
54 #define BATCH_SIZE 20 /* number of special (q, r) per batch */
55
56 /* Read-Only */
57 uint32_t *Primes = NULL;
58 unsigned long lenPrimes = 1; // length of Primes[]
59 int nq = INT_MAX;
60 static int verbose = 0;
61 static unsigned long incr = DEFAULT_INCR;
62 cado_poly best_poly, curr_poly;
63 double best_E = DBL_MAX; /* combined score E (the smaller the better) */
64 double aver_E = 0.0;
65 unsigned long found = 0; /* number of polynomials found so far */
66
67 /* read-write global variables */
68 static pthread_mutex_t lock=PTHREAD_MUTEX_INITIALIZER; /* used as mutual exclusion
69 lock for those variables */
70
71 mpz_t maxS; /* maximun skewness. O for default max */
72
73 /* inline function */
74 extern void shash_add (shash_t, uint64_t);
75
76 /* -- functions starts here -- */
77
78 static void
mutex_lock(pthread_mutex_t * lock)79 mutex_lock(pthread_mutex_t *lock)
80 {
81 pthread_mutex_lock (lock);
82 }
83
84 static void
mutex_unlock(pthread_mutex_t * lock)85 mutex_unlock(pthread_mutex_t *lock)
86 {
87 pthread_mutex_unlock (lock);
88 }
89
90 /* crt, set r and qqz */
91 void
crt_sq(mpz_t qqz,mpz_t r,unsigned long * q,unsigned long * rq,unsigned long lq)92 crt_sq ( mpz_t qqz,
93 mpz_t r,
94 unsigned long *q,
95 unsigned long *rq,
96 unsigned long lq )
97 {
98 mpz_t prod, pprod, mod, inv, sum;
99 unsigned long i;
100 unsigned long qq[lq];
101
102 mpz_init_set_ui (prod, 1);
103 mpz_init (pprod);
104 mpz_init (mod);
105 mpz_init (inv);
106 mpz_init_set_ui (sum, 0);
107
108 for (i = 0; i < lq; i ++) {
109 qq[i] = q[i] * q[i]; // q small
110 mpz_mul_ui (prod, prod, qq[i]);
111 }
112
113 for (i = 0; i < lq; i ++) {
114 mpz_divexact_ui (pprod, prod, qq[i]);
115 mpz_set_ui (mod, qq[i]);
116 mpz_invert (inv, pprod, mod);
117 mpz_mul_ui (inv, inv, rq[i]);
118 mpz_mul (inv, inv, pprod);
119 mpz_add (sum, sum, inv);
120 }
121
122 mpz_mod (sum, sum, prod);
123 mpz_set (r, sum);
124 mpz_set (qqz, prod);
125
126 mpz_clear (prod);
127 mpz_clear (pprod);
128 mpz_clear (mod);
129 mpz_clear (inv);
130 mpz_clear (sum);
131 }
132
133 /* check that l/2 <= d*m0/P^2, where l = p1 * p2 * q with P <= p1, p2 <= 2P
134 q is the product of special-q primes. It suffices to check that
135 q <= d*m0/(2P^4). */
136 static int
check_parameters(mpz_t m0,unsigned long d,unsigned long lq)137 check_parameters (mpz_t m0, unsigned long d, unsigned long lq)
138 {
139 double maxq = 1.0, maxP;
140 int k = lq;
141
142 while (k > 0)
143 maxq *= (double) SPECIAL_Q[LEN_SPECIAL_Q - 1 - (k--)];
144
145 maxP = (double) Primes[lenPrimes - 1];
146 if (2.0 * pow (maxP, 4.0) * maxq >= (double) d * mpz_get_d (m0))
147 return 0;
148
149 if (maxq > pow (maxP, 2.0))
150 return 0;
151
152 return 1;
153 }
154
155
156 /* Compute maximun skewness, which in floor(N^(1/d^2)) */
157 void
compute_default_max_skew(mpz_t skew,mpz_t N,int d)158 compute_default_max_skew (mpz_t skew, mpz_t N, int d)
159 {
160 mpz_root (skew, N, (unsigned long) d*d);
161 }
162
163
164 /* rq is a root of N = (m0 + rq)^d mod (q^2) */
165 void
match(unsigned long p1,unsigned long p2,int64_t i,mpz_t m0,mpz_t ad,unsigned long d,mpz_t N,unsigned long q,mpz_t rq)166 match (unsigned long p1, unsigned long p2, int64_t i, mpz_t m0,
167 mpz_t ad, unsigned long d, mpz_t N, unsigned long q, mpz_t rq)
168 {
169 #if 0
170 unsigned long j;
171 int cmp;
172 double skew, logmu, E;
173 mpz_poly F;
174 #endif
175
176 mpz_t l, r, k, mprime, Nprime, C, l2, tmp, r1, r0, t, adm1, m, skew, root;
177 mpz_vector_t a, b, reduced_a, reduced_b;
178 mpz_poly f, g;
179
180 mpz_init (root);
181 mpz_init (m);
182 mpz_init (adm1);
183 mpz_init (l);
184 mpz_init (l2);
185 mpz_init (r);
186 mpz_init (k);
187 mpz_init (mprime);
188 mpz_init (Nprime);
189 mpz_init (C);
190 mpz_init (tmp);
191 mpz_init (r1);
192 mpz_init (t);
193 mpz_init (r0);
194 mpz_init (skew);
195
196 mpz_vector_init (a, d+1);
197 mpz_vector_init (b, d+1);
198 mpz_vector_init (reduced_a, d+1);
199 mpz_vector_init (reduced_b, d+1);
200
201 mpz_poly_init (f, d);
202 mpz_poly_init (g, d);
203
204 #ifdef DEBUG_POLYSELECT
205 gmp_printf ("#### MATCH ######\nN = %Zd\nd = %d\nad = %Zd\n"
206 "p1 = %lu\np2 = %lu\nq = %lu\ni = %" PRId64 "\n"
207 "rq = %Zd\n", N, d, ad, p1, p2, q, i, rq);
208 #endif
209
210 /* l = p1*p2*q */
211 mpz_set_ui (l, p1);
212 mpz_mul_ui (l, l, p2);
213 mpz_mul_ui (l, l, q);
214 #ifdef DEBUG_POLYSELECT
215 gmp_printf ("l = p1 * p2 * q\nl == %Zd # has %lu bits\n", l,
216 mpz_sizeinbase(l, 2));
217 #endif
218 mpz_mul (l2, l, l); /* l2 = l^2 */
219
220 /* r = rq + i*q^2 */
221 mpz_set_si (r, i);
222 mpz_mul_ui (r, r, q);
223 mpz_mul_ui (r, r, q);
224 mpz_add (r, r, rq);
225 #ifdef DEBUG_POLYSELECT
226 gmp_printf ("r = rq + i * q^2 \nr == %Zd # has %lu bits\n", r,
227 mpz_sizeinbase(r, 2));
228 #endif
229
230 /* k = d^d * ad^(d-1) */
231 mpz_mul_ui (k, ad, d);
232 mpz_pow_ui (k, k, d-1);
233 mpz_mul_ui (k, k, d);
234 #ifdef DEBUG_POLYSELECT
235 gmp_printf ("k = d^d * ad^(d-1)\nk == %Zd\n", k);
236 #endif
237
238 /* Nprime = k * N */
239 mpz_mul (Nprime, k, N);
240 #ifdef DEBUG_POLYSELECT
241 gmp_printf ("Nprime = k * N\nNprime == %Zd\n", Nprime);
242 #endif
243
244 /* mprime = m0 + r */
245 mpz_add (mprime, m0, r);
246 #ifdef DEBUG_POLYSELECT
247 gmp_printf ("m0 = %Zd\nmprime = m0 + r\nmprime == %Zd\n", m0, mprime);
248 #endif
249
250 /* C = mprime^d - Nprime */
251 mpz_pow_ui (C, mprime, d);
252 mpz_sub (C, C, Nprime);
253 ASSERT_ALWAYS (mpz_divisible_p (C, l2));
254 #ifdef DEBUG_POLYSELECT
255 gmp_printf ("(mprime^d - Nprime) %% l^2 == 0\n");
256 #endif
257
258 /* adm1 is such that mprime = d*ad*m + adm1*l and -d*ad/2 <= adm1 < d*ad/2
259 We have adm1 = mprime/l mod (d*ad). */
260 mpz_mul_ui (tmp, ad, d); /* tmp = d*ad */
261 if (mpz_invert (adm1, l, tmp) == 0)
262 {
263 fprintf (stderr, "Error in 1/l mod (d*ad)\n");
264 abort();
265 }
266 mpz_mul (adm1, adm1, mprime);
267 mpz_mod (adm1, adm1, tmp);
268 mpz_mul_2exp (t, adm1, 1);
269 if (mpz_cmp (t, tmp) >= 0)
270 mpz_sub (adm1, adm1, tmp);
271 #ifdef DEBUG_POLYSELECT
272 gmp_printf ("adm1 = %Zd\n", adm1);
273 #endif
274
275 /* m = (mprime - adm1 * l)/ (d * ad) */
276 mpz_mul (m, adm1, l);
277 mpz_sub (m, mprime, m);
278 ASSERT_ALWAYS (mpz_divisible_ui_p (m, d));
279 mpz_divexact_ui (m, m, d);
280 ASSERT_ALWAYS (mpz_divisible_p (m, ad));
281 mpz_divexact (m, m, ad);
282 #ifdef DEBUG_POLYSELECT
283 gmp_printf ("m = (mprime - adm1*l) / (d*ad)\nm == %Zd\n", m);
284 #endif
285
286 /* Set vector a = (-m, l, 0, ..., 0) */
287 mpz_neg (tmp, m);
288 mpz_vector_setcoordinate (a, 0, tmp); /* a[0] = -m */
289 mpz_vector_setcoordinate (a, 1, l); /* a[1] = -l */
290 for (unsigned int j = 2; j <= d; j++)
291 mpz_vector_setcoordinate_ui (a, j, 0); /* a[j] = 0 */
292
293 /* Set vector b = (a0, a1, ..., ai, ..., adm1, ad) */
294 mpz_vector_setcoordinate (b, d, ad); /* b[d] = ad */
295 mpz_vector_setcoordinate (b, d-1, adm1); /* b[d-1] = adm1 */
296
297
298
299
300 mpz_pow_ui (t, m, d);
301 mpz_mul (t, t, ad);
302 mpz_sub (t, N, t);
303 ASSERT_ALWAYS (mpz_divisible_p (t, l));
304
305 mpz_divexact (t, t, l);
306 mpz_pow_ui (tmp, m, d-1);
307 mpz_mul (tmp, tmp, adm1);
308 mpz_sub (t, t, tmp);
309 for (int j = d - 2; j > 0; j--)
310 {
311 ASSERT_ALWAYS (mpz_divisible_p (t, l));
312 mpz_divexact (t, t, l);
313 /* t = a_j*m^j + l*R thus a_j = t/m^j mod l */
314 mpz_pow_ui (tmp, m, j);
315 /* fdiv rounds toward -infinity: r1 = floor(t/tmp) */
316 mpz_fdiv_q (r1, t, tmp); /* t -> r1 * tmp + t */
317 mpz_invert (k, tmp, l); /* search r1 + k such that */
318
319 mpz_mul (k, k, t);
320 mpz_sub (k, k, r1);
321 mpz_mod (k, k, l);
322
323 mpz_mul_2exp (k, k, 1);
324 int cmp = mpz_cmp (k, l);
325 mpz_div_2exp (k, k, 1);
326 if (cmp >= 0)
327 mpz_sub (k, k, l);
328 mpz_add (r1, r1, k);
329 mpz_vector_setcoordinate (b, j, r1);
330 /* subtract r1*m^j */
331 mpz_submul (t, tmp, r1);
332 }
333 ASSERT_ALWAYS (mpz_divisible_p (t, l));
334 mpz_divexact (t, t, l);
335 mpz_vector_setcoordinate (b, 0, t);
336
337
338 mpz_vector_get_mpz_poly(f, a);
339 mpz_vector_get_mpz_poly(g, b);
340 #ifdef DEBUG_POLYSELECT
341 printf ("a = ");
342 mpz_poly_fprintf (stdout, f);
343 printf ("b = ");
344 mpz_poly_fprintf (stdout, g);
345 #endif
346
347 mpz_vector_reduce_with_max_skew (reduced_a, reduced_b, skew, a, b, maxS, d);
348
349 mpz_vector_get_mpz_poly(f, reduced_a);
350 mpz_vector_get_mpz_poly(g, reduced_b);
351
352 #ifdef DEBUG_POLYSELECT
353 gmp_printf ("skew = %Zd\nf = ", skew);
354 mpz_poly_fprintf (stdout, f);
355 printf ("g = ");
356 mpz_poly_fprintf (stdout, g);
357 #endif
358
359 mpz_invert (root, l, N);
360 mpz_mul (root, root, m);
361 mpz_mod (root, root, N);
362 #ifdef DEBUG_POLYSELECT
363 gmp_printf ("root = (m / l) %% N\nroot == %Zd\n", root);
364
365 gmp_printf ("## Begin poly file for ad = %Zd and l = %Zd\n", ad, l);
366 #endif
367
368 double skewness, logmu[2], alpha[2], E;
369
370 skewness = L2_combined_skewness2 (f, g, SKEWNESS_DEFAULT_PREC);
371 logmu[0] = L2_lognorm (g, skewness);
372 logmu[1] = L2_lognorm (f, skewness);
373 alpha[0] = get_alpha (g, ALPHA_BOUND_SMALL);
374 alpha[1] = get_alpha (f, ALPHA_BOUND_SMALL);
375 E = logmu[1] + alpha[1] + logmu[0] + alpha[0];
376
377 found ++;
378 aver_E += E;
379
380 if (E < best_E)
381 {
382 mutex_lock (&lock);
383 best_E = E;
384 gmp_printf("n: %Zd\n", N);
385 for (int i = 0; i <= f->deg; i++)
386 gmp_printf ("c%d: %Zd\n", i, f->coeff[i]);
387 for (int i = 0; i <= g->deg; i++)
388 gmp_printf ("Y%d: %Zd\n", i, g->coeff[i]);
389 printf ("skew: %1.2f\n", skewness);
390 printf ("# f lognorm %1.2f, alpha %1.2f, score %1.2f\n",
391 logmu[1], alpha[1], logmu[1] + alpha[1]);
392 printf ("# g lognorm %1.2f, alpha %1.2f, score %1.2f\n",
393 logmu[0], alpha[0], logmu[0] + alpha[0]);
394 printf ("# f+g score %1.2f\n", E);
395 printf ("# found %lu polynomial(s) so far, aver. E = %1.2f\n",
396 found, aver_E / (double) found);
397 #ifdef DEBUG_POLYSELECT
398 printf ("## End poly file\n");
399 #else
400 printf ("\n");
401 #endif
402 mutex_unlock (&lock);
403 }
404
405 mpz_clear (root);
406 mpz_clear (skew);
407 mpz_clear (adm1);
408 mpz_clear (l);
409 mpz_clear (l2);
410 mpz_clear (C);
411 mpz_clear (r);
412 mpz_clear (k);
413 mpz_clear (mprime);
414 mpz_clear (m);
415 mpz_clear (Nprime);
416 mpz_clear (tmp);
417 mpz_clear (r1);
418 mpz_clear (r0);
419 mpz_clear (t);
420
421 mpz_poly_clear (f);
422 mpz_poly_clear (g);
423
424 mpz_vector_clear (a);
425 mpz_vector_clear (b);
426 mpz_vector_clear (reduced_a);
427 mpz_vector_clear (reduced_b);
428 }
429
430 void
gmp_match(uint32_t p1,uint32_t p2,int64_t i,mpz_t m0,mpz_t ad,unsigned long d,mpz_t N,uint64_t q,mpz_t rq)431 gmp_match (uint32_t p1, uint32_t p2, int64_t i, mpz_t m0,
432 mpz_t ad, unsigned long d, mpz_t N, uint64_t q, mpz_t rq)
433 {
434 match (p1, p2, i, m0, ad, d, N, q, rq);
435 }
436
437
438 /* find collisions between "P" primes, return number of loops */
439 static inline unsigned long
collision_on_p(header_t header,proots_t R)440 collision_on_p ( header_t header,
441 proots_t R )
442 {
443 unsigned long i, j, nprimes, p, nrp, c = 0, tot_roots = 0;
444 uint64_t *rp;
445 int64_t ppl = 0, u, umax;
446 mpz_t *f, tmp;
447 int found = 0;
448 shash_t H;
449 int st = milliseconds ();
450
451 /* init f for roots computation */
452 mpz_init_set_ui (tmp, 0);
453 f = (mpz_t*) malloc ((header->d + 1) * sizeof (mpz_t));
454 if (f == NULL) {
455 fprintf (stderr, "Error, cannot allocate memory in collision_on_p\n");
456 exit (1);
457 }
458 for (i = 0; i <= header->d; i++)
459 mpz_init (f[i]);
460 mpz_set_ui (f[header->d], 1);
461
462 rp = (uint64_t*) malloc (header->d * sizeof (uint64_t));
463 if (rp == NULL) {
464 fprintf (stderr, "Error, cannot allocate memory in collision_on_p\n");
465 exit (1);
466 }
467
468 shash_init (H, 4 * lenPrimes);
469 shash_reset (H);
470 umax = (int64_t) Primes[lenPrimes - 1] * (int64_t) Primes[lenPrimes - 1];
471 for (nprimes = 0; nprimes < lenPrimes; nprimes ++)
472 {
473 p = Primes[nprimes];
474 ppl = (int64_t) p * (int64_t) p;
475
476 /* add fake roots to keep indices */
477 if (header_skip (header, p))
478 {
479 R->nr[nprimes] = 0; // nr = 0.
480 R->roots[nprimes] = NULL;
481 continue;
482 }
483
484 nrp = roots_mod_uint64 (rp, mpz_fdiv_ui (header->Ntilde, p), header->d,
485 p);
486 tot_roots += nrp;
487 roots_lift (rp, header->Ntilde, header->d, header->m0, p, nrp);
488 proots_add (R, nrp, rp, nprimes);
489 for (j = 0; j < nrp; j++, c++)
490 {
491 for (u = (int64_t) rp[j]; u < umax; u += ppl)
492 shash_add (H, u);
493 for (u = ppl - (int64_t) rp[j]; u < umax; u += ppl)
494 shash_add (H, -u);
495 }
496 }
497 found = shash_find_collision (H);
498 shash_clear (H);
499 free (rp);
500
501 if (verbose > 2)
502 printf ("# computing %lu p-roots took %dms\n", tot_roots, st);
503
504 if (found) /* do the real work */
505 {
506 hash_t H;
507
508 hash_init (H, INIT_FACTOR * lenPrimes);
509 for (nprimes = 0; nprimes < lenPrimes; nprimes ++)
510 {
511 nrp = R->nr[nprimes];
512 if (nrp == 0)
513 continue;
514 p = Primes[nprimes];
515 ppl = (int64_t) p * (int64_t) p;
516 rp = R->roots[nprimes];
517
518 for (j = 0; j < nrp; j++)
519 {
520 for (u = (int64_t) rp[j]; u < umax; u += ppl)
521 hash_add (H, p, u, header->m0, header->ad, header->d,
522 header->N, 1, tmp);
523 for (u = ppl - (int64_t) rp[j]; u < umax; u += ppl)
524 hash_add (H, p, -u, header->m0, header->ad,
525 header->d, header->N, 1, tmp);
526 }
527 }
528 #ifdef DEBUG_POLYSELECT
529 printf ("# collision_on_p took %lums\n", milliseconds () - st);
530 gmp_printf ("# p hash_size: %u for ad = %Zd\n", H->size, header->ad);
531 #endif
532
533 #ifdef DEBUG_HASH_TABLE
534 printf ("# p hash_size: %u, hash_alloc: %u\n", H->size, H->alloc);
535 printf ("# hash table coll: %lu, all_coll: %lu\n", H->coll, H->coll_all);
536 #endif
537 hash_clear (H);
538 }
539
540 for (i = 0; i <= header->d; i++)
541 mpz_clear (f[i]);
542 free (f);
543 mpz_clear (tmp);
544
545 return c;
546 }
547
548
549 /* collision on each special-q, call collision_on_batch_p() */
550 static inline void
collision_on_each_sq(header_t header,proots_t R,unsigned long q,mpz_t rqqz,unsigned long * inv_qq)551 collision_on_each_sq ( header_t header,
552 proots_t R,
553 unsigned long q,
554 mpz_t rqqz,
555 unsigned long *inv_qq )
556 {
557 shash_t H;
558 uint64_t **cur1, **cur2, *ccur1, *ccur2;
559 long *pc, *epc;
560 uint64_t pp;
561 int64_t ppl, neg_umax, umax, v1, v2, nv;
562 unsigned long p, nprimes, c;
563 uint8_t vpnr, *pnr, nr, j;
564 uint32_t *pprimes, i;
565 int found;
566
567 #ifdef DEBUG_POLYSELECT
568 int st = milliseconds();
569 #endif
570 #if SHASH_NBUCKETS == 256
571 #define CURRENT(V) (H->current + (uint8_t) (V))
572 #else
573 #define CURRENT(V) (H->current + ((V) & (SHASH_NBUCKETS - 1)))
574 #endif
575 /*
576 uint64_t t1, t2;
577 static uint64_t sum1 = 0, sum2 = 0;
578 */
579
580 shash_init (H, 4 * lenPrimes);
581 shash_reset (H);
582
583 pc = (long *) inv_qq;
584 nv = *pc;
585 pprimes = Primes - 1;
586 pnr = R->nr;
587 R->nr[R->size] = 0xff; /* I use guard to end */
588 umax = Primes[lenPrimes - 1];
589 umax *= umax;
590 neg_umax = -umax;
591
592 /* This define inserts 2 values v1 and v2 with a interlace.
593 The goal is to have a little time to read ccurX from L0
594 cache before to use it. The best seems a
595 three read interlacing in fact, two seems too short. */
596 #define INSERT_2I(I1,I2) \
597 do { \
598 cur1 = CURRENT(I1); ccur1 = *cur1; \
599 cur2 = CURRENT(I2); ccur2 = *cur2; \
600 *ccur1++ = I1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1; \
601 *ccur2++ = I2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2; \
602 } while (0)
603 /* This version is slow because ccur1 is used immediatly after
604 it has been read from L0 cache -> 3 ticks of latency on P4 Nehalem. */
605 #define INSERT_I(I) \
606 do { \
607 cur1 = CURRENT(I); ccur1 = *cur1; *ccur1++ = I; \
608 __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1; \
609 } while (0)
610
611 int64_t b;
612 b = (int64_t) ((double) umax * 0.3333333333333333);
613 do {
614 do {
615 vpnr = *pnr++;
616 pprimes++;
617 } while (!vpnr);
618 if (UNLIKELY(vpnr == 0xff)) goto bend;
619 ppl = *pprimes;
620 __builtin_prefetch(((void *) pnr) + 0x040, 0, 3);
621 __builtin_prefetch(((void *) pprimes) + 0x80, 0, 3);
622 __builtin_prefetch(((void *) pc) + 0x100, 0, 3);
623 ppl *= ppl;
624 epc = pc + vpnr;
625 if (UNLIKELY(ppl > b)) { b = umax >> 1; goto iter2; }
626 do {
627 v1 = nv; cur1 = CURRENT(v1); ccur1 = *cur1;
628 v2 = v1 - ppl; cur2 = CURRENT(v2); ccur2 = *cur2;
629 nv = *++pc;
630 *ccur1++ = v1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;
631 *ccur2++ = v2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2;
632 v1 += ppl; cur1 = CURRENT(v1); ccur1 = *cur1;
633 v2 -= ppl; cur2 = CURRENT(v2); ccur2 = *cur2;
634 *ccur1++ = v1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;
635 *ccur2++ = v2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2;
636 v1 += ppl; cur1 = CURRENT(v1); ccur1 = *cur1;
637 v2 -= ppl; cur2 = CURRENT(v2); ccur2 = *cur2;
638 *ccur1++ = v1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;
639 *ccur2++ = v2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2;
640 v1 += ppl; v2 -= ppl;
641 if (LIKELY (v1 > umax)) {
642 if (UNLIKELY (v2 >= neg_umax)) INSERT_I(v2);
643 } else if (UNLIKELY (v2 >= neg_umax)) INSERT_2I(v1, v2);
644 else INSERT_I(v1);
645 } while (pc != epc);
646 } while (1);
647
648 do {
649 do {
650 vpnr = *pnr++;
651 pprimes++;
652 } while (!vpnr);
653 if (UNLIKELY(vpnr == 0xff)) goto bend;
654 ppl = *pprimes;
655 __builtin_prefetch(((void *) pnr) + 0x040, 0, 3);
656 __builtin_prefetch(((void *) pprimes) + 0x100, 0, 3);
657 __builtin_prefetch(((void *) pc) + 0x280, 0, 3);
658 ppl *= ppl;
659 epc = pc + vpnr;
660 iter2:
661 if (UNLIKELY(ppl > b)) goto iter1;
662 do {
663 v1 = nv; cur1 = CURRENT(v1); ccur1 = *cur1;
664 v2 = v1 - ppl; cur2 = CURRENT(v2); ccur2 = *cur2;
665 nv = *++pc;
666 *ccur1++ = v1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;
667 *ccur2++ = v2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2;
668 v1 += ppl; cur1 = CURRENT(v1); ccur1 = *cur1;
669 v2 -= ppl; cur2 = CURRENT(v2); ccur2 = *cur2;
670 *ccur1++ = v1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;
671 *ccur2++ = v2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2;
672 v1 += ppl; v2 -= ppl;
673 if (LIKELY (v1 > umax)) {
674 if (UNLIKELY (v2 >= neg_umax)) INSERT_I(v2);
675 } else if (UNLIKELY (v2 >= neg_umax)) INSERT_2I(v1, v2);
676 else INSERT_I(v1);
677 } while (pc != epc);
678 } while (1);
679
680 do {
681 do {
682 vpnr = *pnr++;
683 pprimes++;
684 } while (!vpnr);
685 if (UNLIKELY(vpnr == 0xff)) goto bend;
686 ppl = *pprimes;
687 __builtin_prefetch(((void *) pnr) + 0x040, 0, 3);
688 __builtin_prefetch(((void *) pprimes) + 0x100, 0, 3);
689 __builtin_prefetch(((void *) pc) + 0x280, 0, 3);
690 ppl *= ppl;
691 epc = pc + vpnr;
692 iter1:
693 do {
694 v1 = nv; cur1 = CURRENT(v1); ccur1 = *cur1;
695 v2 = v1 - ppl; cur2 = CURRENT(v2); ccur2 = *cur2;
696 nv = *++pc;
697 *ccur1++ = v1; __builtin_prefetch(ccur1, 1, 3); *cur1 = ccur1;
698 *ccur2++ = v2; __builtin_prefetch(ccur2, 1, 3); *cur2 = ccur2;
699 v1 += ppl; v2 -= ppl;
700 if (LIKELY (v1 > umax)) {
701 if (UNLIKELY (v2 >= neg_umax)) INSERT_I(v2);
702 } else if (UNLIKELY (v2 >= neg_umax)) INSERT_2I(v1, v2);
703 else INSERT_I(v1);
704 } while (pc != epc);
705 } while (1);
706
707 bend:
708 #undef INSERT_2I
709 #undef INSERT_I
710
711 for (i = 0; i < SHASH_NBUCKETS; i++) ASSERT (H->current[i] <= H->base[i+1]);
712
713 found = shash_find_collision (H);
714 shash_clear (H);
715
716 if (found) /* do the real work */
717 {
718 hash_t H;
719
720 hash_init (H, INIT_FACTOR * lenPrimes);
721
722 umax = (int64_t) Primes[lenPrimes - 1] * (int64_t) Primes[lenPrimes - 1];
723 for (nprimes = c = 0; nprimes < lenPrimes; nprimes ++)
724 {
725 p = Primes[nprimes];
726 if (header_skip (header, p))
727 continue;
728 pp = p * p;
729 ppl = (long) pp;
730 nr = R->nr[nprimes];
731 for (j = 0; j < nr; j++, c++)
732 {
733 v1 = (long) inv_qq[c];
734 for (v2 = v1; v2 < umax; v2 += ppl)
735 hash_add (H, p, v2, header->m0, header->ad, header->d,
736 header->N, q, rqqz);
737 for (v2 = ppl - v1; v2 < umax; v2 += ppl)
738 hash_add (H, p, -v2, header->m0, header->ad, header->d,
739 header->N, q, rqqz);
740 }
741 }
742 hash_clear (H);
743 }
744
745 #ifdef DEBUG_POLYSELECT
746 printf ("# inner collision_on_each_sq took %lums\n", milliseconds () - st);
747 printf ("# - q hash_alloc (q=%lu): %u\n", q, H->alloc);
748 #endif
749
750 #ifdef DEBUG_HASH_TABLE
751 printf ("# p hash_size: %u, hash_alloc: %u\n", H->size, H->alloc);
752 printf ("# hash table coll: %lu, all_coll: %lu\n", H->coll, H->coll_all);
753 #endif
754 }
755
756
757 /* Given p, rp, q, invqq[], for each rq of q, compute (rp - rq) / q^2 */
758 static inline void
collision_on_each_sq_r(header_t header,proots_t R,unsigned long q,mpz_t * rqqz,unsigned long * inv_qq,unsigned long number_pr,int count)759 collision_on_each_sq_r ( header_t header,
760 proots_t R,
761 unsigned long q,
762 mpz_t *rqqz,
763 unsigned long *inv_qq,
764 unsigned long number_pr,
765 int count )
766 {
767 if (count == 0)
768 return;
769
770 uint8_t i, nr, *pnr;
771 unsigned long nprimes, p, c = 0, rp, rqi;
772 int k;
773 uint64_t pp;
774 unsigned long **tinv_qq = malloc (count * sizeof (unsigned long*));
775
776 if (!tinv_qq)
777 {
778 fprintf (stderr, "Error, cannot allocate memory in %s\n", __FUNCTION__);
779 exit (1);
780 }
781 for (k = 0; k < count; k++) {
782 /* number_pr + 1 for guard for pre-load in collision_on_each_sq (nv) */
783 tinv_qq[k] = malloc ((number_pr + 1) * sizeof (unsigned long));
784 tinv_qq[k][number_pr] = 0;
785 }
786
787 int st = milliseconds();
788 pnr = R->nr;
789
790 /* for each rp, compute (rp-rq)*1/q^2 (mod p^2) */
791 for (nprimes = 0; nprimes < lenPrimes; nprimes ++)
792 {
793 if (!pnr[nprimes]) continue;
794 nr = pnr[nprimes];
795 p = Primes[nprimes];
796 pp = p*p;
797
798 modulusredcul_t modpp;
799 residueredcul_t res_rqi, res_rp, res_tmp;
800 modredcul_initmod_ul_raw (modpp, pp);
801 modredcul_init (res_rqi, modpp);
802 modredcul_init (res_rp, modpp);
803 modredcul_init (res_tmp, modpp);
804
805 for (k = 0; k < count; k ++)
806 {
807 rqi = mpz_fdiv_ui (rqqz[k], pp);
808 modredcul_intset_ul (res_rqi, rqi);
809 modredcul_intset_ul (res_tmp, inv_qq[nprimes]);
810 for (i = 0; i < nr; i ++, c++)
811 {
812 rp = R->roots[nprimes][i];
813 modredcul_intset_ul (res_rp, rp);
814 /* rp - rq */
815 modredcul_sub (res_rp, res_rp, res_rqi, modpp);
816 /* res_rp = (rp - rq) / q[i]^2 */
817 modredcul_mul (res_rp, res_rp, res_tmp, modpp);
818 tinv_qq[k][c] = modredcul_intget_ul (res_rp);
819 }
820 c -= nr;
821 }
822 c += nr;
823
824 modredcul_clear (res_rp, modpp);
825 modredcul_clear (res_rqi, modpp);
826 modredcul_clear (res_tmp, modpp);
827 modredcul_clearmod (modpp);
828 }
829
830 if (verbose > 2) {
831 printf ("# substage: batch %d many (rp-rq)*1/q^2 took %lums\n",
832 count, milliseconds () - st);
833 st = milliseconds();
834 }
835
836 /* core function to find collisions */
837 for (k = 0; k < count; k ++) {
838 collision_on_each_sq (header, R, q, rqqz[k], tinv_qq[k]);
839 }
840
841 if (verbose > 2)
842 printf ("# substage: collision-detection %d many rq took %lums\n",
843 count, milliseconds () - st);
844
845 for (k = 0; k < count; k++)
846 free (tinv_qq[k]);
847 free (tinv_qq);
848 }
849
850
851 /* Next combination */
852 static inline unsigned int
aux_nextcomb(unsigned int * ind,unsigned int len_q,unsigned int * len_nr)853 aux_nextcomb ( unsigned int *ind,
854 unsigned int len_q,
855 unsigned int *len_nr )
856 {
857 unsigned int i;
858
859 /* bottom change first */
860 for (i = len_q - 1; ; i--) {
861 if (ind[i] < (len_nr[i] - 1)) {
862 ind[i]++;
863 return 1;
864 }
865 else {
866 if (i == 0)
867 break;
868 ind[i] = 0;
869 }
870 }
871 return 0;
872 }
873
874
875 /* Compute crted rq */
876 static inline void
aux_return_rq(qroots_t SQ_R,unsigned long * idx_q,unsigned int * idx_nr,unsigned long k,mpz_t qqz,mpz_t rqqz,unsigned long lq)877 aux_return_rq ( qroots_t SQ_R,
878 unsigned long *idx_q,
879 unsigned int *idx_nr,
880 unsigned long k,
881 mpz_t qqz,
882 mpz_t rqqz,
883 unsigned long lq )
884 {
885 unsigned long i, q[k], rq[k];
886
887 /* q and roots */
888 for (i = 0; i < k; i ++) {
889 q[i] = SQ_R->q[idx_q[i]];
890 rq[i] = SQ_R->roots[idx_q[i]][idx_nr[i]];
891 }
892
893 /* crt roots */
894 crt_sq (qqz, rqqz, q, rq, lq);
895
896 return;
897 }
898
899
900 /* Consider each rq */
901 static inline void
collision_on_batch_sq_r(header_t header,proots_t R,qroots_t SQ_R,unsigned long q,unsigned long * idx_q,unsigned long * inv_qq,unsigned long number_pr,int * curr_nq,unsigned long lq)902 collision_on_batch_sq_r ( header_t header,
903 proots_t R,
904 qroots_t SQ_R,
905 unsigned long q,
906 unsigned long *idx_q,
907 unsigned long *inv_qq,
908 unsigned long number_pr,
909 int *curr_nq,
910 unsigned long lq )
911 {
912 int count;
913 unsigned int ind_qr[lq]; /* indices of roots for each small q */
914 unsigned int len_qnr[lq]; /* for each small q, number of roots */
915 unsigned long i;
916 mpz_t qqz, rqqz[BATCH_SIZE];
917
918 mpz_init (qqz);
919 for (i = 0; i < BATCH_SIZE; i ++)
920 mpz_init (rqqz[i]);
921
922 /* initialization indices */
923 for (i = 0; i < lq; i ++) {
924 ind_qr[i] = 0;
925 len_qnr[i] = SQ_R->nr[idx_q[i]];
926 }
927
928 #if 0
929 printf ("q: %lu, ", q);
930 for (i = 0; i < lq; i ++)
931 printf ("%u ", SQ_R->q[idx_q[i]]);
932 printf (", ");
933 for (i = 0; i < lq; i ++)
934 printf ("%u ", SQ_R->nr[idx_q[i]]);
935 printf ("\n");
936 #endif
937
938 /* we proceed with BATCH_SIZE many rq for each time */
939 i = count = 0;
940 int re = 1, num_rq;
941 while (re) {
942 /* compute BATCH_SIZE such many rqqz[] */
943 num_rq = 0;
944 for (count = 0; count < BATCH_SIZE; count ++)
945 {
946 aux_return_rq (SQ_R, idx_q, ind_qr, lq, qqz, rqqz[count], lq);
947 re = aux_nextcomb (ind_qr, lq, len_qnr);
948 (*curr_nq)++;
949 num_rq ++;
950 if ((*curr_nq) >= nq)
951 re = 0;
952 if (!re)
953 break;
954 }
955
956 /* core function for a fixed qq and several rqqz[] */
957 collision_on_each_sq_r (header, R, q, rqqz, inv_qq, number_pr, num_rq);
958 }
959
960 mpz_clear (qqz);
961 for (i = 0; i < BATCH_SIZE; i ++)
962 mpz_clear (rqqz[i]);
963 }
964
965
966 /* SQ inversion, write 1/q^2 (mod p_i^2) to invqq[i] */
967 static inline void
collision_on_batch_sq(header_t header,proots_t R,qroots_t SQ_R,unsigned long q,unsigned long * idx_q,unsigned long number_pr,unsigned long lq)968 collision_on_batch_sq ( header_t header,
969 proots_t R,
970 qroots_t SQ_R,
971 unsigned long q,
972 unsigned long *idx_q,
973 unsigned long number_pr,
974 unsigned long lq )
975 {
976 unsigned nr;
977 int curr_nq = 0;
978 uint64_t pp;
979 unsigned long nprimes, p;
980 unsigned long *invqq = malloc (lenPrimes * sizeof (unsigned long));
981 if (!invqq) {
982 fprintf (stderr, "Error, cannot allocate memory in %s\n", __FUNCTION__);
983 exit (1);
984 }
985
986 int st = milliseconds();
987
988 /* Step 1: inversion */
989 for (nprimes = 0; nprimes < lenPrimes; nprimes ++) {
990
991 p = Primes[nprimes];
992 if (header_skip (header, p))
993 continue;
994 nr = R->nr[nprimes];
995 if (nr == 0)
996 continue;
997 pp = p * p;
998
999 modulusredcul_t modpp;
1000 residueredcul_t qq, tmp;
1001 modredcul_initmod_ul (modpp, pp);
1002 modredcul_init (qq, modpp);
1003 modredcul_init (tmp, modpp);
1004
1005 /* q^2/B (mod pp) */
1006 modredcul_intset_ul (tmp, q);
1007 modredcul_sqr (qq, tmp, modpp);
1008 /* B/q^2 (mod pp) */
1009 modredcul_intinv (tmp, qq, modpp);
1010 invqq[nprimes] = modredcul_intget_ul (tmp);
1011
1012 modredcul_clear (tmp, modpp);
1013 modredcul_clear (qq, modpp);
1014 modredcul_clearmod (modpp);
1015 }
1016
1017 if (verbose > 2)
1018 printf ("# stage (1/q^2 inversion) for %lu primes took %lums\n",
1019 lenPrimes, milliseconds () - st);
1020
1021 /* Step 2: find collisions on q. */
1022 int st2 = milliseconds();
1023
1024 collision_on_batch_sq_r ( header, R, SQ_R, q, idx_q, invqq, number_pr,
1025 &curr_nq, lq );
1026 if (verbose > 2)
1027 printf ("# stage (special-q) for %d special-q's took %lums\n",
1028 curr_nq, milliseconds() - st2);
1029
1030 free (invqq);
1031 }
1032
1033
1034 /* collision on special-q, call collision_on_batch_sq */
1035 static inline void
collision_on_sq(header_t header,proots_t R,unsigned long c)1036 collision_on_sq ( header_t header,
1037 proots_t R,
1038 unsigned long c )
1039 {
1040 int prod = 1;
1041 unsigned int i;
1042 unsigned long j, lq = 0UL;
1043 qroots_t SQ_R;
1044
1045 /* init special-q roots */
1046 qroots_init (SQ_R);
1047 comp_sq_roots (header, SQ_R);
1048 //qroots_print (SQ_R);
1049
1050 /* find a suitable lq */
1051 for (i = 0; i < SQ_R->size; i++) {
1052 if (prod < nq) {
1053 if (!check_parameters (header->m0, header->d, lq))
1054 break;
1055 prod *= SQ_R->nr[i];
1056 lq ++;
1057 }
1058 }
1059
1060 /* lq < 8 for the moment */
1061 if (lq > 7)
1062 lq = 7;
1063 if (lq < 1)
1064 lq = 1;
1065
1066 unsigned long q, idx_q[lq];
1067
1068 for (j = 0; j < lq; j ++)
1069 idx_q[j] = j;
1070 q = return_q_norq (SQ_R, idx_q, lq);
1071
1072 /* collision batch */
1073 collision_on_batch_sq (header, R, SQ_R, q, idx_q, c, lq);
1074
1075 /* clean */
1076 qroots_clear (SQ_R);
1077 return;
1078 }
1079
1080
1081 static void
newAlgo(mpz_t N,unsigned long d,mpz_t ad)1082 newAlgo (mpz_t N, unsigned long d, mpz_t ad)
1083 {
1084 unsigned long c = 0;
1085 header_t header;
1086 proots_t R;
1087
1088 header_init (header, N, d, ad);
1089 proots_init (R, lenPrimes);
1090
1091 c = collision_on_p (header, R);
1092 if (nq > 0)
1093 collision_on_sq (header, R, c);
1094
1095 proots_clear (R, lenPrimes);
1096 header_clear (header);
1097 }
1098
1099 void*
one_thread(void * args)1100 one_thread (void* args)
1101 {
1102 tab_t *tab = (tab_t*) args;
1103 newAlgo (tab[0]->N, tab[0]->d, tab[0]->ad);
1104 return NULL;
1105 }
1106
1107 static void
declare_usage(param_list pl)1108 declare_usage(param_list pl)
1109 {
1110 param_list_decl_usage(pl, "degree", "polynomial degree (2 or 3, "
1111 "default is 3)");
1112 param_list_decl_usage(pl, "n", "(required, alias N) input number");
1113 param_list_decl_usage(pl, "P", "(required) deg-1 coeff of g(x) has two prime factors in [P,2P]\n");
1114
1115 param_list_decl_usage(pl, "admax", "max value for ad (+1)");
1116 param_list_decl_usage(pl, "admin", "min value for ad (default 0)");
1117 param_list_decl_usage(pl, "incr", "increment of ad (default 60)");
1118 param_list_decl_usage(pl, "skewness", "maximun skewness possible "
1119 "(default N^(1/9))");
1120 param_list_decl_usage(pl, "maxtime", "stop the search after maxtime seconds");
1121
1122 char str[200];
1123 snprintf (str, 200, "maximum number of special-q's considered\n"
1124 " for each ad (default %d)", INT_MAX);
1125 param_list_decl_usage(pl, "nq", str);
1126 param_list_decl_usage(pl, "keep", "number of polynomials kept (default 10)");
1127 snprintf(str, 200, "time interval (seconds) for printing statistics (default %d)", TARGET_TIME / 1000);
1128 param_list_decl_usage(pl, "s", str);
1129 param_list_decl_usage(pl, "t", "number of threads to use (default 1)");
1130 param_list_decl_usage(pl, "v", "verbose mode");
1131 param_list_decl_usage(pl, "q", "quiet mode");
1132 snprintf (str, 200, "sieving area (default %.2e)", AREA);
1133 param_list_decl_usage(pl, "area", str);
1134 snprintf (str, 200, "algebraic smoothness bound (default %.2e)", BOUND_F);
1135 param_list_decl_usage(pl, "Bf", str);
1136 snprintf (str, 200, "rational smoothness bound (default %.2e)", BOUND_G);
1137 param_list_decl_usage(pl, "Bg", str);
1138 verbose_decl_usage(pl);
1139 }
1140
1141 static void
usage(const char * argv,const char * missing,param_list pl)1142 usage (const char *argv, const char * missing, param_list pl)
1143 {
1144 if (missing) {
1145 fprintf(stderr, "\nError: missing or invalid parameter \"-%s\"\n",
1146 missing);
1147 }
1148 param_list_print_usage(pl, argv, stderr);
1149 exit (EXIT_FAILURE);
1150 }
1151
1152 int
main(int argc,char * argv[])1153 main (int argc, char *argv[])
1154 {
1155 char **argv0 = argv;
1156 double st0 = seconds (), maxtime = DBL_MAX;
1157 mpz_t N, admin, admax;
1158 unsigned int d = 3;
1159 unsigned long P = 0;
1160 int quiet = 0, tries = 0, i, nthreads = 1, st,
1161 target_time = TARGET_TIME;
1162 tab_t *T;
1163 pthread_t *tid;
1164
1165 mpz_init (N);
1166 mpz_init (maxS);
1167 cado_poly_init (best_poly);
1168 cado_poly_init (curr_poly);
1169
1170 /* read params */
1171 param_list pl;
1172 param_list_init (pl);
1173
1174 declare_usage(pl);
1175
1176 param_list_configure_switch (pl, "-v", &verbose);
1177 param_list_configure_switch (pl, "-q", &quiet);
1178 param_list_configure_alias(pl, "incr", "-i");
1179 param_list_configure_alias(pl, "n", "-N");
1180 param_list_configure_alias(pl, "degree", "-d");
1181
1182 if (argc == 1)
1183 usage (argv0[0], NULL, pl);
1184
1185 argv++, argc--;
1186 for ( ; argc; ) {
1187 if (param_list_update_cmdline (pl, &argc, &argv)) continue;
1188 fprintf (stderr, "Unhandled parameter %s\n", argv[0]);
1189 usage (argv0[0], NULL, pl);
1190 }
1191
1192 /* parse and check N in the first place */
1193 int have_n = param_list_parse_mpz(pl, "n", N);
1194
1195 if (!have_n) {
1196 printf ("# Reading n from stdin\n");
1197 param_list_read_stream(pl, stdin, 0);
1198 have_n = param_list_parse_mpz(pl, "n", N);
1199 }
1200
1201 if (!have_n) {
1202 fprintf(stderr, "No n defined ; sorry.\n");
1203 exit(1);
1204 }
1205
1206 if (mpz_cmp_ui (N, 0) <= 0) usage(argv0[0], "n", pl);
1207
1208 param_list_parse_ulong(pl, "P", &P);
1209 if (P == 0) usage(argv0[0], "P", pl);
1210
1211 param_list_parse_int (pl, "t", &nthreads);
1212 param_list_parse_int (pl, "nq", &nq);
1213 param_list_parse_int (pl, "s", &target_time);
1214 param_list_parse_uint (pl, "degree", &d);
1215 ASSERT_ALWAYS (2 <= d && d <= 3);
1216 if (param_list_parse_double (pl, "area", &area) == 0) /* no -area */
1217 area = AREA;
1218 if (param_list_parse_double (pl, "Bf", &bound_f) == 0) /* no -Bf */
1219 bound_f = BOUND_F;
1220 if (param_list_parse_double (pl, "Bg", &bound_g) == 0) /* no -Bg */
1221 bound_g = BOUND_G;
1222
1223 mpz_init (admin);
1224 param_list_parse_mpz (pl, "admin", admin);
1225
1226 mpz_init (admax);
1227 if (param_list_parse_mpz (pl, "admax", admax) == 0) /* no -admax */
1228 mpz_set (admax, N);
1229
1230 param_list_parse_ulong (pl, "incr", &incr);
1231 param_list_parse_double (pl, "maxtime", &maxtime);
1232
1233 if (!param_list_parse_mpz(pl, "skewness", maxS))
1234 mpz_set_ui (maxS, 0);
1235 else if (mpz_cmp_ui (maxS, 1) < 0)
1236 {
1237 gmp_fprintf(stderr, "Error, skewness (%Zd) should be greater or equal "
1238 "to 1\n", maxS);
1239 abort();
1240 }
1241
1242 if (param_list_warn_unused(pl))
1243 usage (argv0[0], NULL, pl);
1244
1245 /* print command line */
1246 verbose_interpret_parameters(pl);
1247 param_list_print_command_line (stdout, pl);
1248
1249 /* check lq and nq */
1250 if (nq < 0) {
1251 fprintf (stderr, "Error, number of special-q's should >= 0\n");
1252 exit (1);
1253 }
1254
1255 /* allocate threads */
1256 tid = malloc (nthreads * sizeof (pthread_t));
1257 ASSERT_ALWAYS(tid != NULL);
1258
1259 /* quiet mode */
1260 if (quiet == 1)
1261 verbose = -1;
1262
1263 /* set cpoly */
1264 mpz_set (best_poly->n, N);
1265 mpz_set (curr_poly->n, N);
1266 best_poly->pols[ALG_SIDE]->deg = d;
1267 best_poly->pols[RAT_SIDE]->deg = d;
1268 curr_poly->pols[ALG_SIDE]->deg = d;
1269 curr_poly->pols[RAT_SIDE]->deg = d;
1270
1271 /* Compute maxS: use maxS if maxS argument is greater than 0 and lesser than
1272 default value */
1273 mpz_t tmp;
1274 mpz_init (tmp);
1275 compute_default_max_skew (tmp, N, 2);
1276 if (mpz_cmp_ui(maxS, 0) == 0 || mpz_cmp(maxS, tmp) > 0)
1277 mpz_set (maxS, tmp);
1278
1279 /* Initialize primes. Since we use modredc to perform computations mod p^2
1280 for p < 2P, we need that 4P^2 fits in an unsigned long. */
1281 double Pd;
1282 Pd = (double) P;
1283 if (4.0 * Pd * Pd >= (double) ULONG_MAX)
1284 {
1285 mpz_set_ui (tmp, ULONG_MAX >> 2);
1286 mpz_sqrt (tmp, tmp);
1287 gmp_fprintf (stderr, "Error, too large value of P, maximum is %Zd\n",
1288 tmp);
1289 exit (1);
1290 }
1291 mpz_clear (tmp);
1292 if (P <= (unsigned long) SPECIAL_Q[LEN_SPECIAL_Q - 2]) {
1293 fprintf (stderr, "Error, too small value of P, need P > %u\n",
1294 SPECIAL_Q[LEN_SPECIAL_Q - 2]);
1295 exit (1);
1296 }
1297
1298 /* detect L1 cache size */
1299 ropt_L1_cachesize ();
1300
1301 st = milliseconds ();
1302 lenPrimes = initPrimes (P, &Primes);
1303
1304 printf ( "# Info: initializing %lu P primes took %lums,"
1305 " nq=%d, target_time=%d\n",
1306 lenPrimes, milliseconds () - st, nq, target_time / 1000 );
1307
1308
1309 printf ( "# Info: estimated peak memory=%.2fMB (%d thread(s),"
1310 " batch %d inversions on SQ)\n",
1311 (double) (nthreads * (BATCH_SIZE * 2 + INIT_FACTOR) * lenPrimes
1312 * (sizeof(uint32_t) + sizeof(uint64_t)) / 1024 / 1024),
1313 nthreads, BATCH_SIZE );
1314
1315 //printPrimes (Primes, lenPrimes);
1316
1317 /* init tabs_t for threads */
1318 T = malloc (nthreads * sizeof (tab_t));
1319 if (T == NULL)
1320 {
1321 fprintf (stderr, "Error, cannot allocate memory in main\n");
1322 exit (1);
1323 }
1324 for (i = 0; i < nthreads ; i++)
1325 {
1326 mpz_init_set (T[i]->N, N);
1327 mpz_init (T[i]->ad);
1328 T[i]->d = d;
1329 T[i]->thread = i;
1330 }
1331
1332 if (incr <= 0)
1333 {
1334 fprintf (stderr, "Error, incr should be positive\n");
1335 exit (1);
1336 }
1337
1338 /* admin should be nonnegative */
1339 if (mpz_cmp_ui (admin, 0) < 0)
1340 {
1341 fprintf (stderr, "Error, admin should be nonnegative\n");
1342 exit (1);
1343 }
1344
1345 /* ensure admin > 0 and admin is a multiple of incr */
1346 if (mpz_cmp_ui (admin, 0) == 0)
1347 mpz_set_ui (admin, incr);
1348 else if (mpz_fdiv_ui (admin, incr) != 0)
1349 mpz_add_ui (admin, admin, incr - mpz_fdiv_ui (admin, incr));
1350
1351 while (mpz_cmp (admin, admax) < 0 && seconds () - st0 <= maxtime)
1352 {
1353 for (i = 0; i < nthreads ; i++)
1354 {
1355 tries ++;
1356 if (verbose >= 1)
1357 gmp_printf ("# %d ad=%Zd\n", tries, admin);
1358 mpz_set (T[i]->ad, admin);
1359 pthread_create (&tid[i], NULL, one_thread, (void *) (T+i));
1360 mpz_add_ui (admin, admin, incr);
1361 }
1362 for (i = 0 ; i < nthreads ; i++)
1363 pthread_join(tid[i], NULL);
1364 }
1365
1366 printf ("# Stat: tried %d ad-value(s), found %lu polynomial(s)\n", tries,
1367 found);
1368
1369 /* print total time (this gets parsed by the scripts) */
1370 printf ("# Stat: total phase took %.2fs\n", seconds () - st0);
1371
1372 if (best_E == DBL_MAX)
1373 /* This line is required by the script: */
1374 printf ("# No polynomial found, please increase the ad range or decrease P\n");
1375
1376 for (i = 0; i < nthreads ; i++)
1377 {
1378 mpz_clear (T[i]->N);
1379 mpz_clear (T[i]->ad);
1380 }
1381 free (T);
1382 mpz_clear (N);
1383 mpz_clear (admin);
1384 mpz_clear (admax);
1385 mpz_clear (maxS);
1386 clearPrimes (&Primes);
1387 cado_poly_clear (best_poly);
1388 cado_poly_clear (curr_poly);
1389 param_list_clear (pl);
1390 free (tid);
1391
1392 return 0;
1393 }
1394