1 /**
2  * @file ropt_stage2.c
3  * Root sieve.
4  */
5 
6 
7 #include "cado.h" // IWYU pragma: keep
8 #include <stdio.h>      // fprintf stderr
9 #include <stdlib.h>
10 #include <math.h>
11 #include <gmp.h>
12 #include "ropt_stage2.h"
13 #include "mpz_poly.h"
14 #include "auxiliary.h"  // print_poly_fg
15 #include "ropt_arith.h" // compute_fuv_mp
16 #include "ropt_param.h"    // SUP_ALPHA
17 #include "ropt_tree.h"  // node ...
18 #include "ropt_str.h"    // ropt_s2param_t
19 #include "size_optimization.h"
20 #include "timing.h"             // for milliseconds
21 
22 //#define TIMING_ROPT_STAGE2
23 /**
24  * Print sieve array for debugging.
25  */
26 #if 0
27 void
28 print_sievearray ( double **A,
29                    long A0,
30                    long A1,
31                    long B0,
32                    long B1,
33                    unsigned long K_ST,
34                    unsigned long J_ST,
35                    unsigned long MOD )
36 {
37   long i, j;
38 
39   for (i = 0; i < B1 - B0 + 1; i ++) {
40     for (j = 0; j < A1 - A0 + 1; j ++) {
41       if (j % 16 == 0) {
42         printf ("\n");
43         printf ("[%3ld*%lu+%lu, %4ld*%lu+%lu]=[%4ld, %4ld]: ",
44                 i + B0, MOD, K_ST, j + A0, MOD,
45                 J_ST, (i + B0)*MOD+K_ST, (j + A0)*MOD+J_ST);
46       }
47       if (j % 4 == 0)
48         printf (" ");
49       printf ("%1.1f ", A[i][j]);
50     }
51     printf ("\n");
52   }
53   printf ("\n");
54 }
55 #endif
56 
57 
58 /**
59  * Init the sieve array.
60  */
61 static inline void
sievearray_init(sievearray_t sievearray,unsigned int i,unsigned int j)62 sievearray_init ( sievearray_t sievearray,
63                   unsigned int i,
64                   unsigned int j )
65 {
66   /* sage: sum ([p/(p^2-1)*log(p) for p in prime_range(200)])
67      4.2774597204802731 */
68   float tmpf = SUP_ALPHA;
69   int16_t tmpu = (int16_t) ceil (tmpf * 1000.0);
70   unsigned long len = i * j, k;
71 
72   /* allocate matrix A. */
73   sievearray->len_i = i;
74   sievearray->len_j = j;
75   sievearray->array = (int16_t *) malloc ( len * sizeof (int16_t) );
76 
77   if ( sievearray->array != NULL ) {
78     for (k = 0; k < len; k++)
79       sievearray->array[k] = tmpu;
80   }
81   else {
82     fprintf ( stderr, "Error, cannot allocate memory in "
83               "sievearray_init().\n" );
84     exit (1);
85   }
86 }
87 
88 
89 /**
90  * Re-set sieve array.
91  */
92 static inline void
sievearray_reset(sievearray_t sievearray)93 sievearray_reset ( sievearray_t sievearray )
94 {
95   float tmpf = SUP_ALPHA;
96   int16_t tmpu = (int16_t) ceil (tmpf * 1000.0);
97   unsigned long k, len = sievearray->len_i * sievearray->len_j;
98 
99   if (sievearray->array != NULL) {
100     for (k = 0; k < len; k++)
101       sievearray->array[k] = tmpu;
102   }
103   else {
104     fprintf (stderr, "Error, null memory in sievearray_reset().\n");
105     exit (1);
106   }
107 }
108 
109 
110 /**
111  * Free sieve array.
112  */
113 static inline void
sievearray_free(sievearray_t sievearray)114 sievearray_free ( sievearray_t sievearray )
115 {
116   free (sievearray->array);
117 }
118 
119 
120 /**
121  * Root sieve on ARRAY along the j-line.
122  */
123 static inline long
rootsieve_run_line(int16_t * sa,long j_bound,long j,unsigned int pe,int16_t sub)124 rootsieve_run_line ( int16_t *sa,
125                      long j_bound,
126                      long j,
127                      unsigned int pe,
128                      int16_t sub )
129 {
130   long pel = (long) pe;
131 
132   /* bounding j with B0 < tmp + p*j < B1 */
133   while ( j <= j_bound ) {
134     sa[j] = sa[j] - sub;
135     j += pel;
136   }
137   return j;
138 }
139 
140 
141 /**
142  * Compute the indices for some special cases
143  * Note that pl < pe
144  */
145 static inline long
rootsieve_run_multroot_lift_idx(unsigned int v,long Bmin,mpz_t B,mpz_t MOD,unsigned int pl,unsigned int pe)146 rootsieve_run_multroot_lift_idx ( unsigned int v,
147                                   long Bmin,
148                                   mpz_t B,
149                                   mpz_t MOD,
150                                   unsigned int pl,
151                                   unsigned int pe )
152 {
153   if (pl >= pe) {
154     fprintf ( stderr, "Non-invertible element in "
155               "rootsieve_run_multroot_lift_idx().\n" );
156     exit(1);
157   }
158   /* findthe val_p (MOD), which must be < e */
159   unsigned int a, b, c;
160   long j_idx;
161 
162   /* (B-v)/p^l + (MOD/p^l)*k = 0 (mod p^{e-l})
163      Note (B-v)/p^l is exact due to our constructions. */
164   a = (unsigned int) mpz_fdiv_ui (B, pe);
165   a = (a + pe - v) / pl; // +pe ensure it is positive
166   b = (unsigned int) mpz_fdiv_ui (MOD, pe);
167   b = b /pl;
168   pl = pe / pl;  // it is now p^{e-l}
169   c = (unsigned int) solve_lineq (a, b, 0, pl);
170   j_idx = (long) ceil (((double) Bmin - (double) c) / (double) pl)
171     * (long) pl + (long) c;
172   j_idx = ab2ij (Bmin, j_idx);
173   return j_idx;
174 }
175 
176 
177 #define DEBUG_MULTROOT_LIFT 0
178 /**
179  * rootsieve_run_multroot_lift for the multiple root. Since we are
180  * sure that level 1 node only contains one multiple root
181  * r, we don't need to (and can't, otherwise, may count
182  * repeatedly) consider single root at all.
183  */
184 static inline void
rootsieve_run_multroot_lift(node * currnode,sievearray_t sa,unsigned int * f_ui,unsigned int * g_ui,unsigned int * fuv_ui,int d,ropt_s2param_t s2param,unsigned int p,unsigned int max_e,unsigned int curr_e,int16_t sub)185 rootsieve_run_multroot_lift ( node *currnode,
186                               sievearray_t sa,
187                               unsigned int *f_ui,
188                               unsigned int *g_ui,
189                               unsigned int *fuv_ui,
190                               int d,
191                               ropt_s2param_t s2param,
192                               unsigned int p,
193                               unsigned int max_e,
194                               unsigned int curr_e,
195                               int16_t sub )
196 {
197   /* recursion end */
198   if (currnode == NULL || curr_e > max_e || sub == 0)
199     return;
200 
201   /* variables */
202   int16_t subtmp;
203   unsigned int i, j, k, l, nroots, pe, pl, pem1, fr, gr, step;
204   node *tmpnode = NULL, *tmpnode2 = NULL;
205   long i_idx, j_idx;
206 
207   /* compute p^e */
208   pe = pem1 = 1;
209   for (k = 0; k < curr_e - 1; k ++)
210     pem1 = pem1 * p;
211   pe = pem1 * p;
212 
213   /* val of p in MOD */
214   l = 0;
215   pl = p;
216   while (mpz_divisible_ui_p(s2param->MOD, pl) != 0) {
217     l ++;
218     pl *= p;
219   } // pl is p^{l+1}
220   pl /= p;
221 
222   /* loop until all siblings are checked. */
223   while ( currnode != NULL ) {
224 
225     /* loop all (lifted multiple) roots */
226     for (nroots = 0; nroots < currnode->nr; nroots++) {
227 
228       /* compute g(r) */
229       gr = eval_poly_ui_mod (g_ui, 1, currnode->r[nroots], pe);
230       if (gr % p == 0)
231         continue;
232 
233       /* compute f_uv(x) and then evaluate it at r. */
234       compute_fuv_ui ( fuv_ui, f_ui, g_ui, d,
235                        currnode->u, currnode->v, pe );
236       fr = eval_poly_ui_mod (fuv_ui, d, currnode->r[nroots], pe);
237       fr = fr / pem1;
238 
239       /* solve on fr + gr*x = 0 (mod p), where x = uu*r + vv. */
240       fr = (unsigned int) solve_lineq (fr, gr, 0, p); //fr = fr % p;
241 
242       /* solve (i, j) in  fr = i*r + j (mod p).
243          - if r is not invertible, loop i with fixed j;
244          - otherwise, loop j to solve i. */
245       if (currnode->r[nroots] % p == 0) {
246 
247         for (i = 0; i < p; i ++) {
248 
249           /* if p|MOD and current pe|MOD:
250              B + MOD*k = v (mod pe), so B = v (mod pe)
251              A + MOD*k = u (mod pe), so A = u (mod pe)
252              otherwise, no need to record this (u, v) at all. */
253           if (mpz_divisible_ui_p (s2param->MOD, p) != 0) {
254             if (mpz_divisible_ui_p (s2param->MOD, pe) != 0) {
255               if ( mpz_fdiv_ui (s2param->B, pe) !=
256                    (currnode->v + fr * pem1) ||
257                    mpz_fdiv_ui (s2param->A, pe) !=
258                    (currnode->u + i * pem1) )
259               {
260 #if DEBUG_MULTROOT_LIFT
261                 fprintf ( stderr, "skip found (%u, %u) in level %u.\n",
262                           currnode->u + i * pem1,
263                           currnode->v + fr * pem1,
264                           curr_e );
265 #endif
266                 continue;
267               }
268             }
269           }
270 
271           /* r is a multiple root, add r + k * p^{e-1}. */
272           for (k = 0; k < p; k ++) {
273 
274 #if DEBUG_MULTROOT_LIFT
275             fprintf ( stderr, "level %u, (%u, %u), r: %u\n",
276                       curr_e,
277                       currnode->u + i * pem1,
278                       currnode->v + fr * pem1,
279                       currnode->r[nroots] + k * pem1 );
280 #endif
281 
282             insert_node ( currnode, &tmpnode,
283                           currnode->u + i * pem1,
284                           currnode->v + fr * pem1,
285                           currnode->r[nroots] + k * pem1,
286                           curr_e, p, pe, 0 );
287           }
288 
289           /* r is a multiple root, add r + k * p^{e-1}. */
290           for (k = 0; k < p; k ++) {
291             insert_node ( currnode, &tmpnode, currnode->u + pem1 * i,
292                           currnode->v + pem1 * fr,
293                           currnode->r[nroots] + k * pem1,
294                           curr_e, p, pe, 2 );
295           }
296         }
297       }
298       /* p -|- (currnode->r[nroots]), loop j to solve i. */
299       else {
300         for (j = 0; j < p; j ++) {
301 
302           /* given j, solve i in  fr = i*r + j (mod p). */
303           i = solve_lineq (j, currnode->r[nroots], fr, p);
304 
305           /* if p|MOD and current pe|MOD:
306              B + MOD*k = v (mod pe), so B = v (mod pe)
307              A + MOD*k = u (mod pe), so A = u (mod pe)
308              otherwise, no need to record this (u, v) at all. */
309           if (mpz_divisible_ui_p (s2param->MOD, p) != 0) {
310             if (mpz_divisible_ui_p (s2param->MOD, pe) != 0) {
311               if ( mpz_fdiv_ui (s2param->B, pe) !=
312                    (currnode->v + j * pem1) ||
313                    mpz_fdiv_ui (s2param->A, pe) !=
314                    (currnode->u + i * pem1) )
315               {
316 #if DEBUG_MULTROOT_LIFT
317                 fprintf ( stderr, "skip found (%u, %u) in level %u.\n",
318                           currnode->u + i * pem1,
319                           currnode->v + fr * pem1,
320                           curr_e );
321 #endif
322                 continue;
323               }
324             }
325           }
326 
327           /* r is a multiple root, add r + k * p^{e-1}. */
328           for (k = 0; k < p; k ++) {
329 #if DEBUG_MULTROOT_LIFT
330             fprintf ( stderr, "level %u, (%u, %u), r: %u\n",
331                       curr_e, currnode->u + i * pem1,
332                       currnode->v + j * pem1,
333                       currnode->r[nroots] + k * pem1 );
334 #endif
335             insert_node ( currnode, &tmpnode, currnode->u + pem1 * i,
336                           currnode->v + pem1 * j,
337                           currnode->r[nroots] + k * pem1,
338                           curr_e, p, pe, 2 );
339           }
340         }
341       }
342     }  // next root of current (u, v)
343 
344     /* recursieve to next level, curr_e + 1 */
345     rootsieve_run_multroot_lift ( currnode->firstchild,
346                                   sa,
347                                   f_ui,
348                                   g_ui,
349                                   fuv_ui,
350                                   d,
351                                   s2param,
352                                   p,
353                                   max_e,
354                                   curr_e + 1,
355                                   sub );
356 
357     /* we are in the second level from bottom, consider all
358        children of this node (bottom nodes) and sieve */
359     if (curr_e == max_e) {
360 
361       tmpnode = currnode->firstchild;
362 
363       while (tmpnode != NULL) {
364 
365         /* p|MOD, the inverse doesn't exist */
366         if (mpz_divisible_ui_p (s2param->MOD, p) != 0) {
367 
368           /* This shouldn't be called since we didn't record them
369              during the lift. Only for safty purpose. */
370           if (mpz_divisible_ui_p (s2param->MOD, pe) != 0) {
371             tmpnode2 = tmpnode;
372             tmpnode = tmpnode->nextsibling;
373 #if DEBUG_MULTROOT_LIFT
374             fprintf (stderr, "deleting bottomnode (%u, %u) with %u "
375                      "roots in level %u.\n",
376                      tmpnode2->u, tmpnode2->v, tmpnode2->nr, curr_e);
377 #endif
378             free_node (&tmpnode2);
379             continue;
380           }
381           /* If current pe-|-MOD, then in B + MOD*x = v (mod pe),
382              it is still possible to solve such x. */
383           else {
384             j_idx = rootsieve_run_multroot_lift_idx
385               ( tmpnode->v, s2param->Bmin,
386                 s2param->B, s2param->MOD, pl, pe );
387 
388             i_idx = rootsieve_run_multroot_lift_idx
389               ( tmpnode->u, s2param->Amin,
390                 s2param->A, s2param->MOD, pl, pe );
391             step = pe / pl;
392           }
393         }
394         /* p-|-MOD, the inverse exists */
395         else {
396           j_idx = uv2ij_mod (s2param->B, s2param->Bmin,
397                              s2param->MOD, tmpnode->v, pe);
398           i_idx = uv2ij_mod (s2param->A, s2param->Amin,
399                              s2param->MOD, tmpnode->u, pe);
400           step = pe;
401         }
402 
403         /* be careful about this */
404         subtmp = (int16_t) ceil ( (double) sub / (double) pem1
405                                   * tmpnode->nr );
406 
407         /* For each i block */
408         long old_j = j_idx;
409         while (i_idx < (long) sa->len_i) {
410           j_idx = old_j;
411           rootsieve_run_line ( sa->array,
412                                i_idx * sa->len_j + sa->len_j - 1,
413                                i_idx * sa->len_j + j_idx,
414                                step,
415                                subtmp );
416           i_idx += step;
417         }
418 
419         tmpnode2 = tmpnode;
420         tmpnode = tmpnode->nextsibling;
421 
422 #if DEBUG_MULTROOT_LIFT
423         fprintf ( stderr, "deleting bottomnode (%u, %u) with %u "
424                   "roots in level %u AND ",
425                   tmpnode2->u, tmpnode2->v, tmpnode2->nr, curr_e );
426         fprintf ( stderr, "sieving bottomnode %d in steps %u.\n",
427                   subtmp, step ); // pe
428 #endif
429 
430         free_node (&tmpnode2);
431       }
432     }
433 
434     /* delete current node and move to next sibling. */
435     tmpnode = currnode;
436     currnode = currnode->nextsibling;
437     if (currnode != NULL)
438       (currnode->parent)->firstchild = currnode;
439 
440     /* p|MOD, the inverse doesn't exist */
441     if (mpz_divisible_ui_p (s2param->MOD, p) != 0) {
442 
443       /* This shouldn't be called since we didn't record them
444          during the lift. Only for safty purpose. */
445       if (mpz_divisible_ui_p (s2param->MOD, pem1) != 0) {
446 #if DEBUG_MULTROOT_LIFT
447         fprintf (stderr, "deleting (%u, %u) with %u roots in level %u.\n",
448                  tmpnode->u, tmpnode->v, tmpnode->nr, curr_e - 1);
449 #endif
450         free_node (&tmpnode);
451         continue;
452       }
453       /* If current pem1-|-MOD, then in B + MOD*x = v (mod pem1),
454          it is still possible to solve such x. */
455       else {
456         j_idx = rootsieve_run_multroot_lift_idx
457           ( tmpnode->v, s2param->Bmin,
458             s2param->B, s2param->MOD, pl, pem1 );
459         i_idx = rootsieve_run_multroot_lift_idx
460           ( tmpnode->u, s2param->Amin,
461             s2param->A, s2param->MOD, pl, pem1 );
462         step = pem1 / pl;
463       }
464     }
465     /* p-|-MOD, the inverse exists */
466     else {
467       j_idx = uv2ij_mod (s2param->B, s2param->Bmin,
468                          s2param->MOD, tmpnode->v, pem1);
469       i_idx = uv2ij_mod (s2param->A, s2param->Amin,
470                          s2param->MOD, tmpnode->u, pem1);
471       step = pem1;
472     }
473 
474     /* be careful about this */
475     subtmp = (int16_t) ceil ( (double) sub / (double) pem1 *
476                               (double) p  * tmpnode->nr );
477 
478     /* For each i block */
479     long old_j = j_idx;
480     while (i_idx < (long) sa->len_i) {
481 
482       j_idx = old_j;
483       rootsieve_run_line ( sa->array,
484                            i_idx * sa->len_j + sa->len_j - 1,
485                            i_idx * sa->len_j + j_idx,
486                            step, subtmp );
487       i_idx += step;
488     }
489 
490 #if DEBUG_MULTROOT_LIFT
491     fprintf (stderr, "deleting (%u, %u) with %u roots in level %u AND ",
492              tmpnode->u, tmpnode->v, tmpnode->nr, curr_e - 1);
493     fprintf (stderr, "sieving %d in steps %u.\n", subtmp, step); //pem1
494 #endif
495 
496     free_node (&tmpnode);
497   }
498 
499   return;
500 }
501 
502 
503 /**
504  * For multiple root r of the (i, j) (mod p).
505  */
506 static inline void
rootsieve_run_multroot(sievearray_t sa,ropt_poly_t rs,ropt_s2param_t s2param,unsigned int u,unsigned int i,long j,unsigned int r,unsigned int p,unsigned int e,int16_t sub)507 rootsieve_run_multroot ( sievearray_t sa,
508                          ropt_poly_t rs,
509                          ropt_s2param_t s2param,
510                          unsigned int u,
511                          unsigned int i,
512                          long j,
513                          unsigned int r,
514                          unsigned int p,
515                          unsigned int e,
516                          int16_t sub )
517 {
518   /* sieve f(r) + ((u+i*p)*r + (B + MOD * (j+k*p)))*g(r) for i, k,
519      we sieve (mod p) case in the beginning since r would
520      be a multiple root for all (j + k*p).
521      In the rootsieve_v(), it must be that
522      roottype_flag = 1, e = 1; We can sieve starting from j.
523      Note if roottype_flag = 2, e must > 1. We are not here. */
524   if (e <= 1) {
525 
526     /* i is kept the same, next_i denotes the u-dim sieve */
527     unsigned int next_i = i;
528     long old_j = j;
529 
530     /* For each i block */
531     while (next_i < sa->len_i) {
532 
533       j = old_j;
534       rootsieve_run_line ( sa->array,
535                            next_i * sa->len_j + sa->len_j - 1,
536                            next_i * sa->len_j + j, p, sub );
537       next_i += p;
538     }
539     return;
540   }
541 
542   /* some variables */
543   unsigned int v, pe, *f_ui, *g_ui, *fuv_ui;
544   mpz_t tmpz;
545 
546   mpz_init_set_ui (tmpz, 0UL);
547 
548   /* compute p^e */
549   pe = 1;
550   for (v = 0; v < e ; v ++)
551     pe = pe * p;
552 
553   /* use s.p instead of m.p */
554   f_ui = (unsigned int*) malloc ((rs->d + 1) * sizeof (unsigned int));
555   fuv_ui = (unsigned int*) malloc ((rs->d + 1) * sizeof (unsigned int));
556   g_ui = (unsigned int*) malloc ((2) * sizeof (unsigned int));
557   if ((f_ui == NULL) || (g_ui == NULL) || (fuv_ui == NULL)) {
558     fprintf (stderr, "Error, cannot allocate memory in "
559              "rootsieve_run_multroot(). \n");
560     exit (1);
561   }
562   reduce_poly_ul (f_ui, rs->f, rs->d, pe);
563   reduce_poly_ul (g_ui, rs->g, 1, pe);
564 
565   /* j -> v (mod p) */
566   ij2uv (s2param->B, s2param->MOD, s2param->Bmin, j, tmpz);
567   v = (unsigned int) mpz_fdiv_ui (tmpz, p);
568   // v = (unsigned int) ( (tmp >= 0) ? tmp : tmp + (int) p );
569 
570 #if DEBUG_MULTROOT_LIFT
571   fprintf (stderr, "\n f(%u) + ( %u * %u + %u ) * g(%u) = 0 (mod %u)\n",
572            r, u, r, v, r, p);
573 #endif
574 
575   /* we've already known that r is a multiple root for f_{u, v}. */
576   node *tmpnode, *root;
577   new_tree (&root);
578   root = new_node ();
579   insert_node (root, &tmpnode, u, v, r, 1, p, p, 2);
580 
581   /* lift to higher p^e */
582   rootsieve_run_multroot_lift ( root->firstchild, sa, f_ui, g_ui,
583                                 fuv_ui, rs->d, s2param, p,
584                                 e, 2, sub );
585 
586   /* free, either root itself or root with a level 1 node.
587      Note, if e>1, freeing will be done in the lift function;
588      However, if e=1, we need to manaully free root->firstchild. */
589   free_node (&root);
590   tmpnode = NULL;
591 
592   free (f_ui);
593   free (fuv_ui);
594   free (g_ui);
595   mpz_clear (tmpz);
596 }
597 
598 
599 #define DEBUG_ROOTSIEVE_UV_ONEBLOCK 0
600 
601 
602 /**
603  * Root sieve for f + u*x*g(x) + v*g(x)
604  */
605 static inline void
rootsieve_one_block(sievearray_t sa,ropt_poly_t poly,ropt_s2param_t s2param)606 rootsieve_one_block ( sievearray_t sa,
607                       ropt_poly_t poly,
608                       ropt_s2param_t s2param )
609 {
610   char *roottype_flag;
611   unsigned int p, r, u, v, k, np, nbb, max_e, totnbb, fr_ui, gr_ui, pe = 1;
612   int16_t *subsgl, *submul;
613   long i, bblock_size, i_idx, *j_idx, *j_idx_i0;
614   float subf;
615   mpz_t tmp;
616 
617   mpz_init (tmp);
618   subsgl = (int16_t *) malloc (s2param->len_p_rs * sizeof(int16_t));
619   submul = (int16_t *) malloc (s2param->len_p_rs * sizeof(int16_t));
620   j_idx = (long *) malloc (primes[s2param->len_p_rs] * sizeof(long));
621   j_idx_i0 = (long *) malloc (primes[s2param->len_p_rs] * sizeof(long));
622   roottype_flag = (char *) malloc (primes[s2param->len_p_rs]
623                                    * sizeof(char));
624 
625   if ( subsgl == NULL || submul == NULL || j_idx == NULL ||
626        j_idx_i0 == NULL || roottype_flag == NULL ) {
627     fprintf ( stderr, "Error, cannot allocate memory in "
628               "rootsieve_one_block().\n" );
629     exit (1);
630   }
631 
632   bblock_size = L1_cachesize;
633 
634   totnbb = (unsigned int) ((s2param->Bmax - s2param->Bmin + 1)
635                            / bblock_size);
636 
637 #if DEBUG_ROOTSIEVE_UV_ONEBLOCK
638   fprintf ( stderr,
639             "# Stat: totnbb: %u, bblock_size: %ld, total_size: %ld, "
640             "sa->len_i: %u, sa->len_j: %u\n",
641             totnbb, bblock_size, (long) totnbb * bblock_size,
642             sa->len_i, sa->len_j );
643 #endif
644 
645   /* Init index array */
646   for ( np = 0; np < s2param->len_p_rs; np ++ ) {
647     p = primes[np];
648     subf = (float) p * log ( (float) p) / ((float) p * (float) p - 1.0);
649     subsgl[np] = (int16_t) ceil (subf * 1000.0);
650     subf = log ( (float) p) / ( (float) p + 1.0);
651     submul[np] = (int16_t) ceil (subf * 1000.0);
652     roottype_flag[np] = 0;
653   }
654 
655   /* Idx holders for each r < B */
656   for ( r = 0; r < primes[s2param->len_p_rs]; r ++ ) {
657     j_idx[r] = 0;
658     j_idx_i0[r] = 0;
659   }
660 
661   /* For each p < p_bound*/
662   for (np = 0; np < s2param->len_p_rs; np ++) {
663 
664     p = primes[np];
665 
666     /* e depends on p */
667     max_e = (unsigned int) (log (200.0) / log ((double) p));
668     pe = 1;
669     for (k = 0; k < max_e; k ++)
670       pe = pe * p;
671 
672     /* pe | MOD in A + MOD*i = u (mod pe), don't need to sieve */
673     if (mpz_divisible_ui_p (s2param->MOD, pe) != 0)
674       continue;
675 
676     /* For each u < p */
677     for (u = 0; u < p; u++) {
678 
679       i = 0;
680 
681       /* compute u->i in A + MOD*i = u (mod p) */
682       if (mpz_divisible_ui_p (s2param->MOD, p) != 0) {
683         if (mpz_fdiv_ui (s2param->A, p) != u)
684           continue;
685         else {
686           i = -1;
687         }
688       }
689       else {
690         /* i >= 0 */
691         i = uv2ij_mod (s2param->A, s2param->Amin, s2param->MOD, u, p);
692       }
693 
694       /* i is kept the same, i_idx denotes the u-dim sieve */
695       i_idx = i;
696 
697       /* For each i block */
698       while (i_idx < (long) sa->len_i) {
699 
700         /* For each j block on a i-line */
701         for (nbb = 0; nbb < totnbb; nbb ++) {
702 
703           /* For each r < p_bound */
704           for (r = 0; r < p; r++) {
705 
706             /* skip */
707             if (mpz_divisible_ui_p(poly->gx[r], p) != 0)
708               continue;
709 
710             /* u*g(r)^2 - f(r)g'(r) + f'(r)g(r) */
711             mpz_mul (tmp, poly->gx[r], poly->gx[r]);
712             mpz_mul_ui (tmp, tmp, u);
713             mpz_sub (tmp, tmp, poly->numerator[r]);
714 
715             /* if nbb == 0:
716                -- if i_idx == i, compute indices;
717                -- elseif i_idx != 0, retrieve;
718                if nbb != 0:
719                -- next; */
720             if (nbb == 0) {
721               if (i_idx == i) {
722 
723                 /* reset for all r < p for the first block */
724                 roottype_flag[r] = 0;
725                 j_idx[r] = 0;
726                 j_idx_i0[r] = 0;
727 
728                 /* compute v in f(r) + u*r*g(r) + v*g(r) = 0 (mod p) */
729                 fr_ui = mpz_fdiv_ui (poly->fx[r], p);
730                 gr_ui = mpz_fdiv_ui (poly->gx[r], p);
731                 v = compute_v_ui (fr_ui, gr_ui, r, u, p);
732 
733                 /* compute v->j in B + MOD*j = v (mod p) */
734                 if (mpz_divisible_ui_p (s2param->MOD, p) != 0) {
735 
736                   /* now A + MOD*i = u (mod p)
737                      and B + MOD*j = v (mod p) */
738                   if (mpz_fdiv_ui (s2param->B, p) == v) {
739 
740                     /* case where MOD has p-valuation smaller than e.
741                        - If r is multiple, do the lift;
742                        - If r is simple, all slots on the array
743                        are equivalent - ignore. */
744                     if (mpz_divisible_ui_p(tmp, p) == 0)
745                       continue;
746                     else
747                       roottype_flag[r] = 2;
748 #if DEBUG_ROOTSIEVE_UV_ONEBLOCK
749                     fprintf (stderr, "[mult] f(%u) + ( %u * %u + %u ) "
750                              "* g(%u) = 0 (mod %u)\n",
751                              r, u, r, v, r, p);
752                     fprintf (stderr, " j_idx: %ld\n", i_idx, j_idx[r]);
753 #endif
754                   }
755                   else
756                     continue;
757                 }
758                 else {
759                   roottype_flag[r] = 1;
760 
761                   /* u -> i in A + MOD*i = u (mod p) has been computed. */
762                   /* v -> j in B + MOD*j = v (mod p) */
763                   j_idx_i0[r] = uv2ij_mod (s2param->B, s2param->Bmin,
764                                            s2param->MOD, v, p);
765                   j_idx[r] = j_idx_i0[r];
766 
767 #if DEBUG_ROOTSIEVE_UV_ONEBLOCK
768                   fprintf ( stderr, "[simple] f(%u) + ( %u * %u + %u ) "
769                             "* g(%u) = 0 (mod %u)\n",
770                             r, u, r, v, r, p );
771                   fprintf ( stderr, " i_idx: %ld, j_idx: %ld\n",
772                             i_idx, j_idx[r] );
773 #endif
774                 }
775               }
776               /* if nbb == 0, but i_idx != i, retrieve info
777                  from nbb = 0 */
778               else {
779                 /* don't change roottype_flag[np] */
780                 j_idx[r] = j_idx_i0[r];
781               }
782             }
783             /* nbb != 0, need to find j_idx[r]'s j-position
784                with respect to a line */
785             else {
786 
787               /* prevent overflow when len_i = 1 and j wind back,
788                  this happends since the last j-block could too
789                  short and the previous sieve jump out of this block
790                  and hence % wind back to the beginning */
791               if ((j_idx[r] > (long) sa->len_j) && (nbb == (totnbb-1)))
792                   j_idx[r] = sa->len_j;
793               else
794                 j_idx[r] %= sa->len_j;
795 
796               /*
797               if (r == 1)
798                 printf ("(u=%u, p=%u, r=%u) nbb: %u, "
799                         "idx: %u, j_idx: %u, len_i: %u\n",
800                         u, p, r, nbb,
801                         i_idx, j_idx[r], sa->len_i);
802               */
803             }
804 
805             /* cases where we don't want to sieve at all */
806             if (roottype_flag[r] == 0)
807               continue;
808 
809             /* r is a simple root for current (u, v, p). Then r
810                is simple root for (u, v+i*p, p) for all i. */
811             if (mpz_divisible_ui_p(tmp, p) == 0) {
812               if (nbb != (totnbb-1)) {
813                 j_idx[r] = rootsieve_run_line (
814                   sa->array,
815                   i_idx * sa->len_j + bblock_size * (nbb + 1),
816                   i_idx * sa->len_j + j_idx[r],
817                   p,
818                   subsgl[np] );
819               }
820               else {
821                 j_idx[r] = rootsieve_run_line (
822                   sa->array,
823                   i_idx * sa->len_j + (s2param->Bmax - s2param->Bmin),
824                   i_idx * sa->len_j + j_idx[r],
825                   p,
826                   subsgl[np] );
827               }
828             }
829             /* r is multiple root for current u, p */
830             else {
831               /* don't sieve in block, assume cm << cs */
832               if (nbb == 0 && i_idx == i) {
833 
834                 /* Two cases:
835                    If roottype_flag = 1, then MOD != 0 (mod p),
836                    everything is normal;
837                    If MOD = 0 (mod p), then at nbb=0, roottype_flag=2;
838                    Also must use u (mod pe) instead of u (mod p) */
839                 rootsieve_run_multroot ( sa, poly, s2param, u, i,
840                                          j_idx[r], r, p, max_e,
841                                          submul[np] );
842               }
843             }
844           } // next r
845         } // next j-block
846         if (i_idx == -1)
847           break;
848         else {
849           i_idx += p;
850         }
851       } // next i-block
852     } // next u
853   } // next p
854 
855   free (subsgl);
856   free (submul);
857   free (j_idx);
858   free (j_idx_i0);
859   free (roottype_flag);
860   mpz_clear (tmp);
861 }
862 
863 
864 /**
865  * Rootsieve for f + (u*x +v)*g for each sublattice
866  */
867 static inline void
rootsieve_one_sublattice(ropt_poly_t poly,ropt_s2param_t s2param,ropt_param_t param,ropt_info_t info,MurphyE_pq * global_E_pqueue)868 rootsieve_one_sublattice ( ropt_poly_t poly,
869                            ropt_s2param_t s2param,
870                            ropt_param_t param,
871                            ropt_info_t info,
872                            MurphyE_pq *global_E_pqueue)
873 {
874   int i;
875   unsigned long j, len_A, len_B, size_array_mem, size_B_block;
876   long tmpBmax, tmpBmin;
877   double MurphyE, best_MurphyE;
878   mpz_t tmpv, tmpu;
879   MurphyE_pq *local_E_pqueue;
880   sievescore_pq *sievescore;
881   sievearray_t sa; /* the rootsieve scores are recorded in sa->array[],
882                       which is an array of int16_t,
883                       cf https://gforge.inria.fr/tracker/?aid=21542 */
884 
885   mpz_poly F, G;
886   F->coeff = s2param->f;
887   G->coeff = s2param->g;
888   F->deg = poly->d;
889   G->deg = 1;
890   F->alloc = F->deg + 1;
891   G->alloc = 2;
892 
893   /* sieving length */
894   len_A = (unsigned long) (s2param->Amax - s2param->Amin + 1);
895   len_B = (unsigned long) (s2param->Bmax - s2param->Bmin + 1);
896 
897   /* size of sieving array that fits in memory, may change later */
898   size_array_mem = SIZE_SIEVEARRAY;
899   if (len_B < (double) size_array_mem / len_A)
900     size_array_mem = len_B * len_A;
901   if (size_array_mem <= (unsigned long) len_A) {
902     fprintf (stderr, "Error: Amax - Amin + 1 = %lu is too long. "
903              "This happend when A is too large while B is "
904              "too small. Ropt doesn't support this yet.\n",
905              s2param->Amax - s2param->Amin + 1);
906     exit(1);
907   }
908 
909   /* size of B block that fits in memory at one time. Note:
910      the siever is inefficient when len_A is large since
911      the 2d-sieve is in the short L1-cache blocks. */
912   size_B_block = (unsigned long) ((double)size_array_mem/(double)len_A);
913 
914   /* repair */
915   size_array_mem = len_A * size_B_block;
916 
917   /* init structs: if tune mode, look for less top candidates; */
918   if (info->mode == ROPT_MODE_TUNE) {
919     new_sievescore_pq (&sievescore, TUNE_NUM_TOPALPHA_SIEVEARRAY);
920     new_MurphyE_pq (&local_E_pqueue, TUNE_NUM_TOPE_SUBLATTICE);
921   }
922   else {
923     new_sievescore_pq (&sievescore, NUM_TOPALPHA_SIEVEARRAY);
924     new_MurphyE_pq (&local_E_pqueue, NUM_TOPE_SUBLATTICE);
925   }
926   sievearray_init (sa, len_A, size_B_block);
927   mpz_init (tmpv);
928   mpz_init (tmpu);
929   tmpBmax = s2param->Bmax;
930   tmpBmin = s2param->Bmin;
931 
932   /* for each i -> each u = A + MOD * i */
933   unsigned long st = milliseconds ();
934   do {
935 
936     /* fo reach block of size size_B_block */
937     s2param->Bmax = s2param->Bmin + size_B_block - 1;
938     if (s2param->Bmax > tmpBmax)
939       s2param->Bmax = tmpBmax;
940 
941 #if 0
942     fprintf (stderr, "[%ld, %ld] ", s2param->Bmin, s2param->Bmax);
943 #endif
944 
945 
946 #ifdef TIMING_ROPT_STAGE2
947     unsigned long st = milliseconds_thread ();
948 #endif
949     /* root sieve for v. Note, s2param with updated Bmin
950        and Bmax will be used in rootsieve_v(). */
951     rootsieve_one_block (sa, poly, s2param);
952 #ifdef TIMING_ROPT_STAGE2
953     st = milliseconds_thread () - st;
954     printf ( "ropt_one : size %lu, [sieve %lu, ", size_array_mem, st);
955     st = milliseconds_thread ();
956     /* the following insertion took relatively short time compared to sieve */
957 #endif
958 
959     /* compute the alpha of top slots */
960     for (j = 0; j < size_array_mem; j++) {
961 
962       insert_sievescore_pq ( sievescore,
963                              (j - j % (size_B_block)) / size_B_block,
964                              j % (size_B_block),
965                              sa->array[j] );
966 
967 #if 0
968       ij2uv (s2param->A, s2param->MOD, s2param->Amin,
969              (j - j % (size_B_block)) / size_B_block,
970              tmpu);
971       ij2uv (s2param->B, s2param->MOD, s2param->Bmin,
972              j % (size_B_block),
973              tmpv);
974       gmp_fprintf (stderr, "MAT[]: %d, u: %Zd, v: %Zd, i: %lu, "
975                    "j: %lu\n", sa->array[j], tmpu, tmpv,
976                    (j - j % (size_B_block)) / size_B_block,
977                    j % (size_B_block) );
978 #endif
979     }
980 
981 #ifdef TIMING_ROPT_STAGE2
982     st = milliseconds_thread () - st;
983     printf ( "insert %lu", st);
984     st = milliseconds_thread ();
985     /* the following print_poly_fg ~ sieve_array 2^18  */
986 #endif
987 
988     /* put sievescore into the MurphyE priority queue */
989     for (i = 1; i < sievescore->used; i ++) {
990 
991       ij2uv (s2param->A, s2param->MOD, s2param->Amin, sievescore->i[i],
992              tmpu);
993       ij2uv (s2param->B, s2param->MOD, s2param->Bmin, sievescore->j[i],
994              tmpv);
995 
996 #if 0
997       gmp_fprintf (stderr, "(i: %lu, j: %lu) -> (u: %Zd, v: %Zd), "
998                    "score: %d\n", sievescore->i[i], sievescore->j[i],
999                    tmpu, tmpv, sievescore->alpha[i]);
1000 #endif
1001 
1002       compute_fuv_mp (s2param->f, poly->f, poly->g, poly->d, tmpu, tmpv);
1003 
1004       /* translation-only optimize */
1005       mpz_set (s2param->g[0], poly->g[0]);
1006       mpz_set (s2param->g[1], poly->g[1]);
1007       sopt_local_descent (F, G, F, G, 1, -1, SOPT_DEFAULT_MAX_STEPS, 0);
1008       s2param->f = F->coeff;
1009       s2param->g = G->coeff;
1010 
1011 #if 1
1012       /* use MurphyE for ranking in the priority queue (default) */
1013       MurphyE = print_poly_fg (F, s2param->g, poly->n, 0);
1014       insert_MurphyE_pq (local_E_pqueue, info->w, tmpu, tmpv,
1015                          s2param->MOD, MurphyE);
1016 #else
1017       /* use E for ranking: takes slightly less time */
1018       double skew = L2_skewness (F, SKEWNESS_DEFAULT_PREC);
1019       double E = L2_lognorm (F, skew);
1020       double alpha = get_alpha (F, get_alpha_bound ());
1021       insert_MurphyE_pq (local_E_pqueue, info->w, tmpu, tmpv,
1022                          s2param->MOD, -(E + alpha));
1023 #endif
1024     }
1025 
1026 #ifdef TIMING_ROPT_STAGE2
1027     st = milliseconds_thread () - st;
1028     printf ( ", murphy %lu]\n", st);
1029 #endif
1030 
1031     /* next j */
1032     s2param->Bmin = s2param->Bmax + 1;
1033 
1034     /* reset sieve array and priority queue */
1035     sievearray_reset (sa);
1036     reset_sievescore_pq (sievescore);
1037 
1038   } while (s2param->Bmin < tmpBmax);
1039 
1040   /* recover */
1041   s2param->Bmax = tmpBmax;
1042   s2param->Bmin = tmpBmin;
1043 
1044   /* output polynomials */
1045   MurphyE = 0.0;
1046   best_MurphyE = 0.0;
1047   for (i = 1; i < local_E_pqueue->used; i ++) {
1048 
1049     if (param->verbose >= 3 && info->mode == ROPT_MODE_INIT) {
1050       gmp_fprintf ( stderr, "# Found (%3dth) E: %1.2e (w=%d, u=%Zd, v=%Zd)\n",
1051                     i,
1052                     local_E_pqueue->E[i],
1053                     local_E_pqueue->w[i],
1054                     local_E_pqueue->u[i],
1055                     local_E_pqueue->v[i]
1056                     );
1057 
1058       if (param->verbose >= 4) {
1059 
1060         compute_fuv_mp (s2param->f, poly->f, poly->g, poly->d,
1061                         local_E_pqueue->u[i], local_E_pqueue->v[i]);
1062 
1063         print_poly_fg (F, s2param->g, poly->n, 1);
1064 
1065         fprintf (stderr, "\n");
1066       }
1067     }
1068 
1069     /* insert E scores to a global queue (before optimization) */
1070     insert_MurphyE_pq ( global_E_pqueue,
1071                         local_E_pqueue->w[i],
1072                         local_E_pqueue->u[i],
1073                         local_E_pqueue->v[i],
1074                         local_E_pqueue->modulus[i],
1075                         local_E_pqueue->E[i] );
1076 
1077     MurphyE += local_E_pqueue->E[i];
1078     if (best_MurphyE < local_E_pqueue->E[i])
1079       best_MurphyE = local_E_pqueue->E[i];
1080   }
1081 
1082   /* compute average E*/
1083   info->ave_MurphyE = MurphyE / (double) (local_E_pqueue->used - 1);
1084   info->best_MurphyE = best_MurphyE;
1085 
1086   /* output stats */
1087   if (param->verbose >= 2 && info->mode == ROPT_MODE_INIT) {
1088     gmp_fprintf ( stderr,
1089                   "# Stat: ave. E of top %ld polynomials: "
1090                   "%1.2e (%Zd, %Zd)\n",
1091                   local_E_pqueue->used - 1,
1092                   info->ave_MurphyE,
1093                   s2param->A, s2param->B );
1094     if (param->verbose >= 3) {
1095       fprintf ( stderr, "# Stat: root sieve took %lums\n",
1096                 milliseconds () - st );
1097     }
1098   }
1099 
1100   /* free */
1101   sievearray_free (sa);
1102   free_sievescore_pq (&sievescore);
1103   free_MurphyE_pq (&local_E_pqueue);
1104   mpz_clear (tmpv);
1105   mpz_clear (tmpu);
1106 }
1107 
1108 
1109 /**
1110  * Stage 2: root sieve on the sublattice points;
1111  * Call rootsieve_uv().
1112  */
1113 void
ropt_stage2(ropt_poly_t poly,ropt_s2param_t s2param,ropt_param_t param,ropt_info_t info,MurphyE_pq * global_E_pqueue,int w)1114 ropt_stage2 ( ropt_poly_t poly,
1115               ropt_s2param_t s2param,
1116               ropt_param_t param,
1117               ropt_info_t info,
1118               MurphyE_pq *global_E_pqueue,
1119               int w) // enforce w explicitly.
1120 {
1121   int i;
1122 
1123   if (param->verbose >= 3 && info->mode == ROPT_MODE_INIT)
1124     ropt_s2param_print (s2param);
1125 
1126   for (i = 0; i <= poly->d; i++)
1127     mpz_set (s2param->f[i], poly->f[i]);
1128 
1129   info->w = w;
1130   compute_fuv_mp (s2param->f, poly->f, poly->g, poly->d,
1131                   s2param->A, s2param->B);
1132 
1133   /* sieve fuv */
1134   rootsieve_one_sublattice (poly, s2param, param, info,
1135                             global_E_pqueue);
1136 }
1137