1 /**
2 * @file ropt_stage2.c
3 * Root sieve.
4 */
5
6
7 #include "cado.h" // IWYU pragma: keep
8 #include <stdio.h> // fprintf stderr
9 #include <stdlib.h>
10 #include <math.h>
11 #include <gmp.h>
12 #include "ropt_stage2.h"
13 #include "mpz_poly.h"
14 #include "auxiliary.h" // print_poly_fg
15 #include "ropt_arith.h" // compute_fuv_mp
16 #include "ropt_param.h" // SUP_ALPHA
17 #include "ropt_tree.h" // node ...
18 #include "ropt_str.h" // ropt_s2param_t
19 #include "size_optimization.h"
20 #include "timing.h" // for milliseconds
21
22 //#define TIMING_ROPT_STAGE2
23 /**
24 * Print sieve array for debugging.
25 */
26 #if 0
27 void
28 print_sievearray ( double **A,
29 long A0,
30 long A1,
31 long B0,
32 long B1,
33 unsigned long K_ST,
34 unsigned long J_ST,
35 unsigned long MOD )
36 {
37 long i, j;
38
39 for (i = 0; i < B1 - B0 + 1; i ++) {
40 for (j = 0; j < A1 - A0 + 1; j ++) {
41 if (j % 16 == 0) {
42 printf ("\n");
43 printf ("[%3ld*%lu+%lu, %4ld*%lu+%lu]=[%4ld, %4ld]: ",
44 i + B0, MOD, K_ST, j + A0, MOD,
45 J_ST, (i + B0)*MOD+K_ST, (j + A0)*MOD+J_ST);
46 }
47 if (j % 4 == 0)
48 printf (" ");
49 printf ("%1.1f ", A[i][j]);
50 }
51 printf ("\n");
52 }
53 printf ("\n");
54 }
55 #endif
56
57
58 /**
59 * Init the sieve array.
60 */
61 static inline void
sievearray_init(sievearray_t sievearray,unsigned int i,unsigned int j)62 sievearray_init ( sievearray_t sievearray,
63 unsigned int i,
64 unsigned int j )
65 {
66 /* sage: sum ([p/(p^2-1)*log(p) for p in prime_range(200)])
67 4.2774597204802731 */
68 float tmpf = SUP_ALPHA;
69 int16_t tmpu = (int16_t) ceil (tmpf * 1000.0);
70 unsigned long len = i * j, k;
71
72 /* allocate matrix A. */
73 sievearray->len_i = i;
74 sievearray->len_j = j;
75 sievearray->array = (int16_t *) malloc ( len * sizeof (int16_t) );
76
77 if ( sievearray->array != NULL ) {
78 for (k = 0; k < len; k++)
79 sievearray->array[k] = tmpu;
80 }
81 else {
82 fprintf ( stderr, "Error, cannot allocate memory in "
83 "sievearray_init().\n" );
84 exit (1);
85 }
86 }
87
88
89 /**
90 * Re-set sieve array.
91 */
92 static inline void
sievearray_reset(sievearray_t sievearray)93 sievearray_reset ( sievearray_t sievearray )
94 {
95 float tmpf = SUP_ALPHA;
96 int16_t tmpu = (int16_t) ceil (tmpf * 1000.0);
97 unsigned long k, len = sievearray->len_i * sievearray->len_j;
98
99 if (sievearray->array != NULL) {
100 for (k = 0; k < len; k++)
101 sievearray->array[k] = tmpu;
102 }
103 else {
104 fprintf (stderr, "Error, null memory in sievearray_reset().\n");
105 exit (1);
106 }
107 }
108
109
110 /**
111 * Free sieve array.
112 */
113 static inline void
sievearray_free(sievearray_t sievearray)114 sievearray_free ( sievearray_t sievearray )
115 {
116 free (sievearray->array);
117 }
118
119
120 /**
121 * Root sieve on ARRAY along the j-line.
122 */
123 static inline long
rootsieve_run_line(int16_t * sa,long j_bound,long j,unsigned int pe,int16_t sub)124 rootsieve_run_line ( int16_t *sa,
125 long j_bound,
126 long j,
127 unsigned int pe,
128 int16_t sub )
129 {
130 long pel = (long) pe;
131
132 /* bounding j with B0 < tmp + p*j < B1 */
133 while ( j <= j_bound ) {
134 sa[j] = sa[j] - sub;
135 j += pel;
136 }
137 return j;
138 }
139
140
141 /**
142 * Compute the indices for some special cases
143 * Note that pl < pe
144 */
145 static inline long
rootsieve_run_multroot_lift_idx(unsigned int v,long Bmin,mpz_t B,mpz_t MOD,unsigned int pl,unsigned int pe)146 rootsieve_run_multroot_lift_idx ( unsigned int v,
147 long Bmin,
148 mpz_t B,
149 mpz_t MOD,
150 unsigned int pl,
151 unsigned int pe )
152 {
153 if (pl >= pe) {
154 fprintf ( stderr, "Non-invertible element in "
155 "rootsieve_run_multroot_lift_idx().\n" );
156 exit(1);
157 }
158 /* findthe val_p (MOD), which must be < e */
159 unsigned int a, b, c;
160 long j_idx;
161
162 /* (B-v)/p^l + (MOD/p^l)*k = 0 (mod p^{e-l})
163 Note (B-v)/p^l is exact due to our constructions. */
164 a = (unsigned int) mpz_fdiv_ui (B, pe);
165 a = (a + pe - v) / pl; // +pe ensure it is positive
166 b = (unsigned int) mpz_fdiv_ui (MOD, pe);
167 b = b /pl;
168 pl = pe / pl; // it is now p^{e-l}
169 c = (unsigned int) solve_lineq (a, b, 0, pl);
170 j_idx = (long) ceil (((double) Bmin - (double) c) / (double) pl)
171 * (long) pl + (long) c;
172 j_idx = ab2ij (Bmin, j_idx);
173 return j_idx;
174 }
175
176
177 #define DEBUG_MULTROOT_LIFT 0
178 /**
179 * rootsieve_run_multroot_lift for the multiple root. Since we are
180 * sure that level 1 node only contains one multiple root
181 * r, we don't need to (and can't, otherwise, may count
182 * repeatedly) consider single root at all.
183 */
184 static inline void
rootsieve_run_multroot_lift(node * currnode,sievearray_t sa,unsigned int * f_ui,unsigned int * g_ui,unsigned int * fuv_ui,int d,ropt_s2param_t s2param,unsigned int p,unsigned int max_e,unsigned int curr_e,int16_t sub)185 rootsieve_run_multroot_lift ( node *currnode,
186 sievearray_t sa,
187 unsigned int *f_ui,
188 unsigned int *g_ui,
189 unsigned int *fuv_ui,
190 int d,
191 ropt_s2param_t s2param,
192 unsigned int p,
193 unsigned int max_e,
194 unsigned int curr_e,
195 int16_t sub )
196 {
197 /* recursion end */
198 if (currnode == NULL || curr_e > max_e || sub == 0)
199 return;
200
201 /* variables */
202 int16_t subtmp;
203 unsigned int i, j, k, l, nroots, pe, pl, pem1, fr, gr, step;
204 node *tmpnode = NULL, *tmpnode2 = NULL;
205 long i_idx, j_idx;
206
207 /* compute p^e */
208 pe = pem1 = 1;
209 for (k = 0; k < curr_e - 1; k ++)
210 pem1 = pem1 * p;
211 pe = pem1 * p;
212
213 /* val of p in MOD */
214 l = 0;
215 pl = p;
216 while (mpz_divisible_ui_p(s2param->MOD, pl) != 0) {
217 l ++;
218 pl *= p;
219 } // pl is p^{l+1}
220 pl /= p;
221
222 /* loop until all siblings are checked. */
223 while ( currnode != NULL ) {
224
225 /* loop all (lifted multiple) roots */
226 for (nroots = 0; nroots < currnode->nr; nroots++) {
227
228 /* compute g(r) */
229 gr = eval_poly_ui_mod (g_ui, 1, currnode->r[nroots], pe);
230 if (gr % p == 0)
231 continue;
232
233 /* compute f_uv(x) and then evaluate it at r. */
234 compute_fuv_ui ( fuv_ui, f_ui, g_ui, d,
235 currnode->u, currnode->v, pe );
236 fr = eval_poly_ui_mod (fuv_ui, d, currnode->r[nroots], pe);
237 fr = fr / pem1;
238
239 /* solve on fr + gr*x = 0 (mod p), where x = uu*r + vv. */
240 fr = (unsigned int) solve_lineq (fr, gr, 0, p); //fr = fr % p;
241
242 /* solve (i, j) in fr = i*r + j (mod p).
243 - if r is not invertible, loop i with fixed j;
244 - otherwise, loop j to solve i. */
245 if (currnode->r[nroots] % p == 0) {
246
247 for (i = 0; i < p; i ++) {
248
249 /* if p|MOD and current pe|MOD:
250 B + MOD*k = v (mod pe), so B = v (mod pe)
251 A + MOD*k = u (mod pe), so A = u (mod pe)
252 otherwise, no need to record this (u, v) at all. */
253 if (mpz_divisible_ui_p (s2param->MOD, p) != 0) {
254 if (mpz_divisible_ui_p (s2param->MOD, pe) != 0) {
255 if ( mpz_fdiv_ui (s2param->B, pe) !=
256 (currnode->v + fr * pem1) ||
257 mpz_fdiv_ui (s2param->A, pe) !=
258 (currnode->u + i * pem1) )
259 {
260 #if DEBUG_MULTROOT_LIFT
261 fprintf ( stderr, "skip found (%u, %u) in level %u.\n",
262 currnode->u + i * pem1,
263 currnode->v + fr * pem1,
264 curr_e );
265 #endif
266 continue;
267 }
268 }
269 }
270
271 /* r is a multiple root, add r + k * p^{e-1}. */
272 for (k = 0; k < p; k ++) {
273
274 #if DEBUG_MULTROOT_LIFT
275 fprintf ( stderr, "level %u, (%u, %u), r: %u\n",
276 curr_e,
277 currnode->u + i * pem1,
278 currnode->v + fr * pem1,
279 currnode->r[nroots] + k * pem1 );
280 #endif
281
282 insert_node ( currnode, &tmpnode,
283 currnode->u + i * pem1,
284 currnode->v + fr * pem1,
285 currnode->r[nroots] + k * pem1,
286 curr_e, p, pe, 0 );
287 }
288
289 /* r is a multiple root, add r + k * p^{e-1}. */
290 for (k = 0; k < p; k ++) {
291 insert_node ( currnode, &tmpnode, currnode->u + pem1 * i,
292 currnode->v + pem1 * fr,
293 currnode->r[nroots] + k * pem1,
294 curr_e, p, pe, 2 );
295 }
296 }
297 }
298 /* p -|- (currnode->r[nroots]), loop j to solve i. */
299 else {
300 for (j = 0; j < p; j ++) {
301
302 /* given j, solve i in fr = i*r + j (mod p). */
303 i = solve_lineq (j, currnode->r[nroots], fr, p);
304
305 /* if p|MOD and current pe|MOD:
306 B + MOD*k = v (mod pe), so B = v (mod pe)
307 A + MOD*k = u (mod pe), so A = u (mod pe)
308 otherwise, no need to record this (u, v) at all. */
309 if (mpz_divisible_ui_p (s2param->MOD, p) != 0) {
310 if (mpz_divisible_ui_p (s2param->MOD, pe) != 0) {
311 if ( mpz_fdiv_ui (s2param->B, pe) !=
312 (currnode->v + j * pem1) ||
313 mpz_fdiv_ui (s2param->A, pe) !=
314 (currnode->u + i * pem1) )
315 {
316 #if DEBUG_MULTROOT_LIFT
317 fprintf ( stderr, "skip found (%u, %u) in level %u.\n",
318 currnode->u + i * pem1,
319 currnode->v + fr * pem1,
320 curr_e );
321 #endif
322 continue;
323 }
324 }
325 }
326
327 /* r is a multiple root, add r + k * p^{e-1}. */
328 for (k = 0; k < p; k ++) {
329 #if DEBUG_MULTROOT_LIFT
330 fprintf ( stderr, "level %u, (%u, %u), r: %u\n",
331 curr_e, currnode->u + i * pem1,
332 currnode->v + j * pem1,
333 currnode->r[nroots] + k * pem1 );
334 #endif
335 insert_node ( currnode, &tmpnode, currnode->u + pem1 * i,
336 currnode->v + pem1 * j,
337 currnode->r[nroots] + k * pem1,
338 curr_e, p, pe, 2 );
339 }
340 }
341 }
342 } // next root of current (u, v)
343
344 /* recursieve to next level, curr_e + 1 */
345 rootsieve_run_multroot_lift ( currnode->firstchild,
346 sa,
347 f_ui,
348 g_ui,
349 fuv_ui,
350 d,
351 s2param,
352 p,
353 max_e,
354 curr_e + 1,
355 sub );
356
357 /* we are in the second level from bottom, consider all
358 children of this node (bottom nodes) and sieve */
359 if (curr_e == max_e) {
360
361 tmpnode = currnode->firstchild;
362
363 while (tmpnode != NULL) {
364
365 /* p|MOD, the inverse doesn't exist */
366 if (mpz_divisible_ui_p (s2param->MOD, p) != 0) {
367
368 /* This shouldn't be called since we didn't record them
369 during the lift. Only for safty purpose. */
370 if (mpz_divisible_ui_p (s2param->MOD, pe) != 0) {
371 tmpnode2 = tmpnode;
372 tmpnode = tmpnode->nextsibling;
373 #if DEBUG_MULTROOT_LIFT
374 fprintf (stderr, "deleting bottomnode (%u, %u) with %u "
375 "roots in level %u.\n",
376 tmpnode2->u, tmpnode2->v, tmpnode2->nr, curr_e);
377 #endif
378 free_node (&tmpnode2);
379 continue;
380 }
381 /* If current pe-|-MOD, then in B + MOD*x = v (mod pe),
382 it is still possible to solve such x. */
383 else {
384 j_idx = rootsieve_run_multroot_lift_idx
385 ( tmpnode->v, s2param->Bmin,
386 s2param->B, s2param->MOD, pl, pe );
387
388 i_idx = rootsieve_run_multroot_lift_idx
389 ( tmpnode->u, s2param->Amin,
390 s2param->A, s2param->MOD, pl, pe );
391 step = pe / pl;
392 }
393 }
394 /* p-|-MOD, the inverse exists */
395 else {
396 j_idx = uv2ij_mod (s2param->B, s2param->Bmin,
397 s2param->MOD, tmpnode->v, pe);
398 i_idx = uv2ij_mod (s2param->A, s2param->Amin,
399 s2param->MOD, tmpnode->u, pe);
400 step = pe;
401 }
402
403 /* be careful about this */
404 subtmp = (int16_t) ceil ( (double) sub / (double) pem1
405 * tmpnode->nr );
406
407 /* For each i block */
408 long old_j = j_idx;
409 while (i_idx < (long) sa->len_i) {
410 j_idx = old_j;
411 rootsieve_run_line ( sa->array,
412 i_idx * sa->len_j + sa->len_j - 1,
413 i_idx * sa->len_j + j_idx,
414 step,
415 subtmp );
416 i_idx += step;
417 }
418
419 tmpnode2 = tmpnode;
420 tmpnode = tmpnode->nextsibling;
421
422 #if DEBUG_MULTROOT_LIFT
423 fprintf ( stderr, "deleting bottomnode (%u, %u) with %u "
424 "roots in level %u AND ",
425 tmpnode2->u, tmpnode2->v, tmpnode2->nr, curr_e );
426 fprintf ( stderr, "sieving bottomnode %d in steps %u.\n",
427 subtmp, step ); // pe
428 #endif
429
430 free_node (&tmpnode2);
431 }
432 }
433
434 /* delete current node and move to next sibling. */
435 tmpnode = currnode;
436 currnode = currnode->nextsibling;
437 if (currnode != NULL)
438 (currnode->parent)->firstchild = currnode;
439
440 /* p|MOD, the inverse doesn't exist */
441 if (mpz_divisible_ui_p (s2param->MOD, p) != 0) {
442
443 /* This shouldn't be called since we didn't record them
444 during the lift. Only for safty purpose. */
445 if (mpz_divisible_ui_p (s2param->MOD, pem1) != 0) {
446 #if DEBUG_MULTROOT_LIFT
447 fprintf (stderr, "deleting (%u, %u) with %u roots in level %u.\n",
448 tmpnode->u, tmpnode->v, tmpnode->nr, curr_e - 1);
449 #endif
450 free_node (&tmpnode);
451 continue;
452 }
453 /* If current pem1-|-MOD, then in B + MOD*x = v (mod pem1),
454 it is still possible to solve such x. */
455 else {
456 j_idx = rootsieve_run_multroot_lift_idx
457 ( tmpnode->v, s2param->Bmin,
458 s2param->B, s2param->MOD, pl, pem1 );
459 i_idx = rootsieve_run_multroot_lift_idx
460 ( tmpnode->u, s2param->Amin,
461 s2param->A, s2param->MOD, pl, pem1 );
462 step = pem1 / pl;
463 }
464 }
465 /* p-|-MOD, the inverse exists */
466 else {
467 j_idx = uv2ij_mod (s2param->B, s2param->Bmin,
468 s2param->MOD, tmpnode->v, pem1);
469 i_idx = uv2ij_mod (s2param->A, s2param->Amin,
470 s2param->MOD, tmpnode->u, pem1);
471 step = pem1;
472 }
473
474 /* be careful about this */
475 subtmp = (int16_t) ceil ( (double) sub / (double) pem1 *
476 (double) p * tmpnode->nr );
477
478 /* For each i block */
479 long old_j = j_idx;
480 while (i_idx < (long) sa->len_i) {
481
482 j_idx = old_j;
483 rootsieve_run_line ( sa->array,
484 i_idx * sa->len_j + sa->len_j - 1,
485 i_idx * sa->len_j + j_idx,
486 step, subtmp );
487 i_idx += step;
488 }
489
490 #if DEBUG_MULTROOT_LIFT
491 fprintf (stderr, "deleting (%u, %u) with %u roots in level %u AND ",
492 tmpnode->u, tmpnode->v, tmpnode->nr, curr_e - 1);
493 fprintf (stderr, "sieving %d in steps %u.\n", subtmp, step); //pem1
494 #endif
495
496 free_node (&tmpnode);
497 }
498
499 return;
500 }
501
502
503 /**
504 * For multiple root r of the (i, j) (mod p).
505 */
506 static inline void
rootsieve_run_multroot(sievearray_t sa,ropt_poly_t rs,ropt_s2param_t s2param,unsigned int u,unsigned int i,long j,unsigned int r,unsigned int p,unsigned int e,int16_t sub)507 rootsieve_run_multroot ( sievearray_t sa,
508 ropt_poly_t rs,
509 ropt_s2param_t s2param,
510 unsigned int u,
511 unsigned int i,
512 long j,
513 unsigned int r,
514 unsigned int p,
515 unsigned int e,
516 int16_t sub )
517 {
518 /* sieve f(r) + ((u+i*p)*r + (B + MOD * (j+k*p)))*g(r) for i, k,
519 we sieve (mod p) case in the beginning since r would
520 be a multiple root for all (j + k*p).
521 In the rootsieve_v(), it must be that
522 roottype_flag = 1, e = 1; We can sieve starting from j.
523 Note if roottype_flag = 2, e must > 1. We are not here. */
524 if (e <= 1) {
525
526 /* i is kept the same, next_i denotes the u-dim sieve */
527 unsigned int next_i = i;
528 long old_j = j;
529
530 /* For each i block */
531 while (next_i < sa->len_i) {
532
533 j = old_j;
534 rootsieve_run_line ( sa->array,
535 next_i * sa->len_j + sa->len_j - 1,
536 next_i * sa->len_j + j, p, sub );
537 next_i += p;
538 }
539 return;
540 }
541
542 /* some variables */
543 unsigned int v, pe, *f_ui, *g_ui, *fuv_ui;
544 mpz_t tmpz;
545
546 mpz_init_set_ui (tmpz, 0UL);
547
548 /* compute p^e */
549 pe = 1;
550 for (v = 0; v < e ; v ++)
551 pe = pe * p;
552
553 /* use s.p instead of m.p */
554 f_ui = (unsigned int*) malloc ((rs->d + 1) * sizeof (unsigned int));
555 fuv_ui = (unsigned int*) malloc ((rs->d + 1) * sizeof (unsigned int));
556 g_ui = (unsigned int*) malloc ((2) * sizeof (unsigned int));
557 if ((f_ui == NULL) || (g_ui == NULL) || (fuv_ui == NULL)) {
558 fprintf (stderr, "Error, cannot allocate memory in "
559 "rootsieve_run_multroot(). \n");
560 exit (1);
561 }
562 reduce_poly_ul (f_ui, rs->f, rs->d, pe);
563 reduce_poly_ul (g_ui, rs->g, 1, pe);
564
565 /* j -> v (mod p) */
566 ij2uv (s2param->B, s2param->MOD, s2param->Bmin, j, tmpz);
567 v = (unsigned int) mpz_fdiv_ui (tmpz, p);
568 // v = (unsigned int) ( (tmp >= 0) ? tmp : tmp + (int) p );
569
570 #if DEBUG_MULTROOT_LIFT
571 fprintf (stderr, "\n f(%u) + ( %u * %u + %u ) * g(%u) = 0 (mod %u)\n",
572 r, u, r, v, r, p);
573 #endif
574
575 /* we've already known that r is a multiple root for f_{u, v}. */
576 node *tmpnode, *root;
577 new_tree (&root);
578 root = new_node ();
579 insert_node (root, &tmpnode, u, v, r, 1, p, p, 2);
580
581 /* lift to higher p^e */
582 rootsieve_run_multroot_lift ( root->firstchild, sa, f_ui, g_ui,
583 fuv_ui, rs->d, s2param, p,
584 e, 2, sub );
585
586 /* free, either root itself or root with a level 1 node.
587 Note, if e>1, freeing will be done in the lift function;
588 However, if e=1, we need to manaully free root->firstchild. */
589 free_node (&root);
590 tmpnode = NULL;
591
592 free (f_ui);
593 free (fuv_ui);
594 free (g_ui);
595 mpz_clear (tmpz);
596 }
597
598
599 #define DEBUG_ROOTSIEVE_UV_ONEBLOCK 0
600
601
602 /**
603 * Root sieve for f + u*x*g(x) + v*g(x)
604 */
605 static inline void
rootsieve_one_block(sievearray_t sa,ropt_poly_t poly,ropt_s2param_t s2param)606 rootsieve_one_block ( sievearray_t sa,
607 ropt_poly_t poly,
608 ropt_s2param_t s2param )
609 {
610 char *roottype_flag;
611 unsigned int p, r, u, v, k, np, nbb, max_e, totnbb, fr_ui, gr_ui, pe = 1;
612 int16_t *subsgl, *submul;
613 long i, bblock_size, i_idx, *j_idx, *j_idx_i0;
614 float subf;
615 mpz_t tmp;
616
617 mpz_init (tmp);
618 subsgl = (int16_t *) malloc (s2param->len_p_rs * sizeof(int16_t));
619 submul = (int16_t *) malloc (s2param->len_p_rs * sizeof(int16_t));
620 j_idx = (long *) malloc (primes[s2param->len_p_rs] * sizeof(long));
621 j_idx_i0 = (long *) malloc (primes[s2param->len_p_rs] * sizeof(long));
622 roottype_flag = (char *) malloc (primes[s2param->len_p_rs]
623 * sizeof(char));
624
625 if ( subsgl == NULL || submul == NULL || j_idx == NULL ||
626 j_idx_i0 == NULL || roottype_flag == NULL ) {
627 fprintf ( stderr, "Error, cannot allocate memory in "
628 "rootsieve_one_block().\n" );
629 exit (1);
630 }
631
632 bblock_size = L1_cachesize;
633
634 totnbb = (unsigned int) ((s2param->Bmax - s2param->Bmin + 1)
635 / bblock_size);
636
637 #if DEBUG_ROOTSIEVE_UV_ONEBLOCK
638 fprintf ( stderr,
639 "# Stat: totnbb: %u, bblock_size: %ld, total_size: %ld, "
640 "sa->len_i: %u, sa->len_j: %u\n",
641 totnbb, bblock_size, (long) totnbb * bblock_size,
642 sa->len_i, sa->len_j );
643 #endif
644
645 /* Init index array */
646 for ( np = 0; np < s2param->len_p_rs; np ++ ) {
647 p = primes[np];
648 subf = (float) p * log ( (float) p) / ((float) p * (float) p - 1.0);
649 subsgl[np] = (int16_t) ceil (subf * 1000.0);
650 subf = log ( (float) p) / ( (float) p + 1.0);
651 submul[np] = (int16_t) ceil (subf * 1000.0);
652 roottype_flag[np] = 0;
653 }
654
655 /* Idx holders for each r < B */
656 for ( r = 0; r < primes[s2param->len_p_rs]; r ++ ) {
657 j_idx[r] = 0;
658 j_idx_i0[r] = 0;
659 }
660
661 /* For each p < p_bound*/
662 for (np = 0; np < s2param->len_p_rs; np ++) {
663
664 p = primes[np];
665
666 /* e depends on p */
667 max_e = (unsigned int) (log (200.0) / log ((double) p));
668 pe = 1;
669 for (k = 0; k < max_e; k ++)
670 pe = pe * p;
671
672 /* pe | MOD in A + MOD*i = u (mod pe), don't need to sieve */
673 if (mpz_divisible_ui_p (s2param->MOD, pe) != 0)
674 continue;
675
676 /* For each u < p */
677 for (u = 0; u < p; u++) {
678
679 i = 0;
680
681 /* compute u->i in A + MOD*i = u (mod p) */
682 if (mpz_divisible_ui_p (s2param->MOD, p) != 0) {
683 if (mpz_fdiv_ui (s2param->A, p) != u)
684 continue;
685 else {
686 i = -1;
687 }
688 }
689 else {
690 /* i >= 0 */
691 i = uv2ij_mod (s2param->A, s2param->Amin, s2param->MOD, u, p);
692 }
693
694 /* i is kept the same, i_idx denotes the u-dim sieve */
695 i_idx = i;
696
697 /* For each i block */
698 while (i_idx < (long) sa->len_i) {
699
700 /* For each j block on a i-line */
701 for (nbb = 0; nbb < totnbb; nbb ++) {
702
703 /* For each r < p_bound */
704 for (r = 0; r < p; r++) {
705
706 /* skip */
707 if (mpz_divisible_ui_p(poly->gx[r], p) != 0)
708 continue;
709
710 /* u*g(r)^2 - f(r)g'(r) + f'(r)g(r) */
711 mpz_mul (tmp, poly->gx[r], poly->gx[r]);
712 mpz_mul_ui (tmp, tmp, u);
713 mpz_sub (tmp, tmp, poly->numerator[r]);
714
715 /* if nbb == 0:
716 -- if i_idx == i, compute indices;
717 -- elseif i_idx != 0, retrieve;
718 if nbb != 0:
719 -- next; */
720 if (nbb == 0) {
721 if (i_idx == i) {
722
723 /* reset for all r < p for the first block */
724 roottype_flag[r] = 0;
725 j_idx[r] = 0;
726 j_idx_i0[r] = 0;
727
728 /* compute v in f(r) + u*r*g(r) + v*g(r) = 0 (mod p) */
729 fr_ui = mpz_fdiv_ui (poly->fx[r], p);
730 gr_ui = mpz_fdiv_ui (poly->gx[r], p);
731 v = compute_v_ui (fr_ui, gr_ui, r, u, p);
732
733 /* compute v->j in B + MOD*j = v (mod p) */
734 if (mpz_divisible_ui_p (s2param->MOD, p) != 0) {
735
736 /* now A + MOD*i = u (mod p)
737 and B + MOD*j = v (mod p) */
738 if (mpz_fdiv_ui (s2param->B, p) == v) {
739
740 /* case where MOD has p-valuation smaller than e.
741 - If r is multiple, do the lift;
742 - If r is simple, all slots on the array
743 are equivalent - ignore. */
744 if (mpz_divisible_ui_p(tmp, p) == 0)
745 continue;
746 else
747 roottype_flag[r] = 2;
748 #if DEBUG_ROOTSIEVE_UV_ONEBLOCK
749 fprintf (stderr, "[mult] f(%u) + ( %u * %u + %u ) "
750 "* g(%u) = 0 (mod %u)\n",
751 r, u, r, v, r, p);
752 fprintf (stderr, " j_idx: %ld\n", i_idx, j_idx[r]);
753 #endif
754 }
755 else
756 continue;
757 }
758 else {
759 roottype_flag[r] = 1;
760
761 /* u -> i in A + MOD*i = u (mod p) has been computed. */
762 /* v -> j in B + MOD*j = v (mod p) */
763 j_idx_i0[r] = uv2ij_mod (s2param->B, s2param->Bmin,
764 s2param->MOD, v, p);
765 j_idx[r] = j_idx_i0[r];
766
767 #if DEBUG_ROOTSIEVE_UV_ONEBLOCK
768 fprintf ( stderr, "[simple] f(%u) + ( %u * %u + %u ) "
769 "* g(%u) = 0 (mod %u)\n",
770 r, u, r, v, r, p );
771 fprintf ( stderr, " i_idx: %ld, j_idx: %ld\n",
772 i_idx, j_idx[r] );
773 #endif
774 }
775 }
776 /* if nbb == 0, but i_idx != i, retrieve info
777 from nbb = 0 */
778 else {
779 /* don't change roottype_flag[np] */
780 j_idx[r] = j_idx_i0[r];
781 }
782 }
783 /* nbb != 0, need to find j_idx[r]'s j-position
784 with respect to a line */
785 else {
786
787 /* prevent overflow when len_i = 1 and j wind back,
788 this happends since the last j-block could too
789 short and the previous sieve jump out of this block
790 and hence % wind back to the beginning */
791 if ((j_idx[r] > (long) sa->len_j) && (nbb == (totnbb-1)))
792 j_idx[r] = sa->len_j;
793 else
794 j_idx[r] %= sa->len_j;
795
796 /*
797 if (r == 1)
798 printf ("(u=%u, p=%u, r=%u) nbb: %u, "
799 "idx: %u, j_idx: %u, len_i: %u\n",
800 u, p, r, nbb,
801 i_idx, j_idx[r], sa->len_i);
802 */
803 }
804
805 /* cases where we don't want to sieve at all */
806 if (roottype_flag[r] == 0)
807 continue;
808
809 /* r is a simple root for current (u, v, p). Then r
810 is simple root for (u, v+i*p, p) for all i. */
811 if (mpz_divisible_ui_p(tmp, p) == 0) {
812 if (nbb != (totnbb-1)) {
813 j_idx[r] = rootsieve_run_line (
814 sa->array,
815 i_idx * sa->len_j + bblock_size * (nbb + 1),
816 i_idx * sa->len_j + j_idx[r],
817 p,
818 subsgl[np] );
819 }
820 else {
821 j_idx[r] = rootsieve_run_line (
822 sa->array,
823 i_idx * sa->len_j + (s2param->Bmax - s2param->Bmin),
824 i_idx * sa->len_j + j_idx[r],
825 p,
826 subsgl[np] );
827 }
828 }
829 /* r is multiple root for current u, p */
830 else {
831 /* don't sieve in block, assume cm << cs */
832 if (nbb == 0 && i_idx == i) {
833
834 /* Two cases:
835 If roottype_flag = 1, then MOD != 0 (mod p),
836 everything is normal;
837 If MOD = 0 (mod p), then at nbb=0, roottype_flag=2;
838 Also must use u (mod pe) instead of u (mod p) */
839 rootsieve_run_multroot ( sa, poly, s2param, u, i,
840 j_idx[r], r, p, max_e,
841 submul[np] );
842 }
843 }
844 } // next r
845 } // next j-block
846 if (i_idx == -1)
847 break;
848 else {
849 i_idx += p;
850 }
851 } // next i-block
852 } // next u
853 } // next p
854
855 free (subsgl);
856 free (submul);
857 free (j_idx);
858 free (j_idx_i0);
859 free (roottype_flag);
860 mpz_clear (tmp);
861 }
862
863
864 /**
865 * Rootsieve for f + (u*x +v)*g for each sublattice
866 */
867 static inline void
rootsieve_one_sublattice(ropt_poly_t poly,ropt_s2param_t s2param,ropt_param_t param,ropt_info_t info,MurphyE_pq * global_E_pqueue)868 rootsieve_one_sublattice ( ropt_poly_t poly,
869 ropt_s2param_t s2param,
870 ropt_param_t param,
871 ropt_info_t info,
872 MurphyE_pq *global_E_pqueue)
873 {
874 int i;
875 unsigned long j, len_A, len_B, size_array_mem, size_B_block;
876 long tmpBmax, tmpBmin;
877 double MurphyE, best_MurphyE;
878 mpz_t tmpv, tmpu;
879 MurphyE_pq *local_E_pqueue;
880 sievescore_pq *sievescore;
881 sievearray_t sa; /* the rootsieve scores are recorded in sa->array[],
882 which is an array of int16_t,
883 cf https://gforge.inria.fr/tracker/?aid=21542 */
884
885 mpz_poly F, G;
886 F->coeff = s2param->f;
887 G->coeff = s2param->g;
888 F->deg = poly->d;
889 G->deg = 1;
890 F->alloc = F->deg + 1;
891 G->alloc = 2;
892
893 /* sieving length */
894 len_A = (unsigned long) (s2param->Amax - s2param->Amin + 1);
895 len_B = (unsigned long) (s2param->Bmax - s2param->Bmin + 1);
896
897 /* size of sieving array that fits in memory, may change later */
898 size_array_mem = SIZE_SIEVEARRAY;
899 if (len_B < (double) size_array_mem / len_A)
900 size_array_mem = len_B * len_A;
901 if (size_array_mem <= (unsigned long) len_A) {
902 fprintf (stderr, "Error: Amax - Amin + 1 = %lu is too long. "
903 "This happend when A is too large while B is "
904 "too small. Ropt doesn't support this yet.\n",
905 s2param->Amax - s2param->Amin + 1);
906 exit(1);
907 }
908
909 /* size of B block that fits in memory at one time. Note:
910 the siever is inefficient when len_A is large since
911 the 2d-sieve is in the short L1-cache blocks. */
912 size_B_block = (unsigned long) ((double)size_array_mem/(double)len_A);
913
914 /* repair */
915 size_array_mem = len_A * size_B_block;
916
917 /* init structs: if tune mode, look for less top candidates; */
918 if (info->mode == ROPT_MODE_TUNE) {
919 new_sievescore_pq (&sievescore, TUNE_NUM_TOPALPHA_SIEVEARRAY);
920 new_MurphyE_pq (&local_E_pqueue, TUNE_NUM_TOPE_SUBLATTICE);
921 }
922 else {
923 new_sievescore_pq (&sievescore, NUM_TOPALPHA_SIEVEARRAY);
924 new_MurphyE_pq (&local_E_pqueue, NUM_TOPE_SUBLATTICE);
925 }
926 sievearray_init (sa, len_A, size_B_block);
927 mpz_init (tmpv);
928 mpz_init (tmpu);
929 tmpBmax = s2param->Bmax;
930 tmpBmin = s2param->Bmin;
931
932 /* for each i -> each u = A + MOD * i */
933 unsigned long st = milliseconds ();
934 do {
935
936 /* fo reach block of size size_B_block */
937 s2param->Bmax = s2param->Bmin + size_B_block - 1;
938 if (s2param->Bmax > tmpBmax)
939 s2param->Bmax = tmpBmax;
940
941 #if 0
942 fprintf (stderr, "[%ld, %ld] ", s2param->Bmin, s2param->Bmax);
943 #endif
944
945
946 #ifdef TIMING_ROPT_STAGE2
947 unsigned long st = milliseconds_thread ();
948 #endif
949 /* root sieve for v. Note, s2param with updated Bmin
950 and Bmax will be used in rootsieve_v(). */
951 rootsieve_one_block (sa, poly, s2param);
952 #ifdef TIMING_ROPT_STAGE2
953 st = milliseconds_thread () - st;
954 printf ( "ropt_one : size %lu, [sieve %lu, ", size_array_mem, st);
955 st = milliseconds_thread ();
956 /* the following insertion took relatively short time compared to sieve */
957 #endif
958
959 /* compute the alpha of top slots */
960 for (j = 0; j < size_array_mem; j++) {
961
962 insert_sievescore_pq ( sievescore,
963 (j - j % (size_B_block)) / size_B_block,
964 j % (size_B_block),
965 sa->array[j] );
966
967 #if 0
968 ij2uv (s2param->A, s2param->MOD, s2param->Amin,
969 (j - j % (size_B_block)) / size_B_block,
970 tmpu);
971 ij2uv (s2param->B, s2param->MOD, s2param->Bmin,
972 j % (size_B_block),
973 tmpv);
974 gmp_fprintf (stderr, "MAT[]: %d, u: %Zd, v: %Zd, i: %lu, "
975 "j: %lu\n", sa->array[j], tmpu, tmpv,
976 (j - j % (size_B_block)) / size_B_block,
977 j % (size_B_block) );
978 #endif
979 }
980
981 #ifdef TIMING_ROPT_STAGE2
982 st = milliseconds_thread () - st;
983 printf ( "insert %lu", st);
984 st = milliseconds_thread ();
985 /* the following print_poly_fg ~ sieve_array 2^18 */
986 #endif
987
988 /* put sievescore into the MurphyE priority queue */
989 for (i = 1; i < sievescore->used; i ++) {
990
991 ij2uv (s2param->A, s2param->MOD, s2param->Amin, sievescore->i[i],
992 tmpu);
993 ij2uv (s2param->B, s2param->MOD, s2param->Bmin, sievescore->j[i],
994 tmpv);
995
996 #if 0
997 gmp_fprintf (stderr, "(i: %lu, j: %lu) -> (u: %Zd, v: %Zd), "
998 "score: %d\n", sievescore->i[i], sievescore->j[i],
999 tmpu, tmpv, sievescore->alpha[i]);
1000 #endif
1001
1002 compute_fuv_mp (s2param->f, poly->f, poly->g, poly->d, tmpu, tmpv);
1003
1004 /* translation-only optimize */
1005 mpz_set (s2param->g[0], poly->g[0]);
1006 mpz_set (s2param->g[1], poly->g[1]);
1007 sopt_local_descent (F, G, F, G, 1, -1, SOPT_DEFAULT_MAX_STEPS, 0);
1008 s2param->f = F->coeff;
1009 s2param->g = G->coeff;
1010
1011 #if 1
1012 /* use MurphyE for ranking in the priority queue (default) */
1013 MurphyE = print_poly_fg (F, s2param->g, poly->n, 0);
1014 insert_MurphyE_pq (local_E_pqueue, info->w, tmpu, tmpv,
1015 s2param->MOD, MurphyE);
1016 #else
1017 /* use E for ranking: takes slightly less time */
1018 double skew = L2_skewness (F, SKEWNESS_DEFAULT_PREC);
1019 double E = L2_lognorm (F, skew);
1020 double alpha = get_alpha (F, get_alpha_bound ());
1021 insert_MurphyE_pq (local_E_pqueue, info->w, tmpu, tmpv,
1022 s2param->MOD, -(E + alpha));
1023 #endif
1024 }
1025
1026 #ifdef TIMING_ROPT_STAGE2
1027 st = milliseconds_thread () - st;
1028 printf ( ", murphy %lu]\n", st);
1029 #endif
1030
1031 /* next j */
1032 s2param->Bmin = s2param->Bmax + 1;
1033
1034 /* reset sieve array and priority queue */
1035 sievearray_reset (sa);
1036 reset_sievescore_pq (sievescore);
1037
1038 } while (s2param->Bmin < tmpBmax);
1039
1040 /* recover */
1041 s2param->Bmax = tmpBmax;
1042 s2param->Bmin = tmpBmin;
1043
1044 /* output polynomials */
1045 MurphyE = 0.0;
1046 best_MurphyE = 0.0;
1047 for (i = 1; i < local_E_pqueue->used; i ++) {
1048
1049 if (param->verbose >= 3 && info->mode == ROPT_MODE_INIT) {
1050 gmp_fprintf ( stderr, "# Found (%3dth) E: %1.2e (w=%d, u=%Zd, v=%Zd)\n",
1051 i,
1052 local_E_pqueue->E[i],
1053 local_E_pqueue->w[i],
1054 local_E_pqueue->u[i],
1055 local_E_pqueue->v[i]
1056 );
1057
1058 if (param->verbose >= 4) {
1059
1060 compute_fuv_mp (s2param->f, poly->f, poly->g, poly->d,
1061 local_E_pqueue->u[i], local_E_pqueue->v[i]);
1062
1063 print_poly_fg (F, s2param->g, poly->n, 1);
1064
1065 fprintf (stderr, "\n");
1066 }
1067 }
1068
1069 /* insert E scores to a global queue (before optimization) */
1070 insert_MurphyE_pq ( global_E_pqueue,
1071 local_E_pqueue->w[i],
1072 local_E_pqueue->u[i],
1073 local_E_pqueue->v[i],
1074 local_E_pqueue->modulus[i],
1075 local_E_pqueue->E[i] );
1076
1077 MurphyE += local_E_pqueue->E[i];
1078 if (best_MurphyE < local_E_pqueue->E[i])
1079 best_MurphyE = local_E_pqueue->E[i];
1080 }
1081
1082 /* compute average E*/
1083 info->ave_MurphyE = MurphyE / (double) (local_E_pqueue->used - 1);
1084 info->best_MurphyE = best_MurphyE;
1085
1086 /* output stats */
1087 if (param->verbose >= 2 && info->mode == ROPT_MODE_INIT) {
1088 gmp_fprintf ( stderr,
1089 "# Stat: ave. E of top %ld polynomials: "
1090 "%1.2e (%Zd, %Zd)\n",
1091 local_E_pqueue->used - 1,
1092 info->ave_MurphyE,
1093 s2param->A, s2param->B );
1094 if (param->verbose >= 3) {
1095 fprintf ( stderr, "# Stat: root sieve took %lums\n",
1096 milliseconds () - st );
1097 }
1098 }
1099
1100 /* free */
1101 sievearray_free (sa);
1102 free_sievescore_pq (&sievescore);
1103 free_MurphyE_pq (&local_E_pqueue);
1104 mpz_clear (tmpv);
1105 mpz_clear (tmpu);
1106 }
1107
1108
1109 /**
1110 * Stage 2: root sieve on the sublattice points;
1111 * Call rootsieve_uv().
1112 */
1113 void
ropt_stage2(ropt_poly_t poly,ropt_s2param_t s2param,ropt_param_t param,ropt_info_t info,MurphyE_pq * global_E_pqueue,int w)1114 ropt_stage2 ( ropt_poly_t poly,
1115 ropt_s2param_t s2param,
1116 ropt_param_t param,
1117 ropt_info_t info,
1118 MurphyE_pq *global_E_pqueue,
1119 int w) // enforce w explicitly.
1120 {
1121 int i;
1122
1123 if (param->verbose >= 3 && info->mode == ROPT_MODE_INIT)
1124 ropt_s2param_print (s2param);
1125
1126 for (i = 0; i <= poly->d; i++)
1127 mpz_set (s2param->f[i], poly->f[i]);
1128
1129 info->w = w;
1130 compute_fuv_mp (s2param->f, poly->f, poly->g, poly->d,
1131 s2param->A, s2param->B);
1132
1133 /* sieve fuv */
1134 rootsieve_one_sublattice (poly, s2param, param, info,
1135 global_E_pqueue);
1136 }
1137