#ifndef LAS_PLATTICE_H
#define LAS_PLATTICE_H

#include <cstdint>
#include <vector>

#include "macros.h"

#include "fb-types.h"
#include "las-config.h"  // for NOPROFILE_INLINE
#include "misc.h"       // for UMAX
#include "gcd.h"       // for gcd_ul

/*******************************************************************/
/********        Walking in p-lattices                    **********/

/*  p-lattice stuff */


// Compute the root r describing the lattice inside the q-lattice
// corresponding to the factor base prime (p,R).
// Formula: r = - (a1-R*b1)/(a0-R*b0) mod p
// Assumes p < 2^32

/* General version of the lattice transform function. Allows projective
   roots in input and output, and handles prime powers.
   In input, if the root is projective, say s/t (mod p) and t is
   non-invertible (mod p), then we expect R = p + (t/s mod p).
   On output, if the root is projective, say u/v (mod p) and v is
   non-invertible (mod p), then return value r = p + (v/u mod p).
   So projective roots are stored as their reciprocal, and have p added
   to signal the fact that it's a reciprocal value.
*/
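
/* As an illustration of the formula above, here is a naive reference
 * implementation of the affine case (R and r both affine, p an odd
 * prime). This is a sketch for exposition only: the function name is
 * ours, the modular inverse uses Fermat's little theorem, and none of
 * the projective or prime-power handling described above is included. */
#if 0
static inline uint32_t
ref_lattice_root (uint32_t p, uint32_t R,
                  int64_t a0, int64_t a1, int64_t b0, int64_t b1)
{
    /* r = - (a1 - R*b1) / (a0 - R*b0) mod p */
    int64_t num = (-(a1 - (int64_t) R * b1)) % (int64_t) p;
    int64_t den = (a0 - (int64_t) R * b0) % (int64_t) p;
    if (num < 0) num += p;
    if (den < 0) den += p;
    /* inverse as den^(p-2) mod p; assumes den != 0 mod p (affine case) */
    uint64_t inv = 1, t = den;
    for (uint64_t e = p - 2; e; e >>= 1, t = t * t % p)
        if (e & 1) inv = inv * t % p;
    return (uint32_t) ((uint64_t) num * inv % p);
}
#endif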


/*
 * Algorithm by Franke and Kleinjung for lattice sieving of largish
 * primes.
 */

// Proposition 1 of [FrKl05]:
// Compute a basis <(a0, a1), (b0, b1)> of the p-lattice
// inside the q-lattice, such that
//    a1, b1 > 0
//    -I < a0 <= 0 <= b0 < I
//    b0-a0 >= I
//
// Sizes:
//    p is less than 32 bits and I fits easily in 32 bits.
//    So, a0 and a1 fit easily in 32 bits, since they are less than I.
//    Now, b0 and b1 are also bounded by p, so 32 bits is enough.
//    However, a and c can be as large as p*I (though not both at once).
//    We still store them in 32 bits: if they are larger, then as soon
//    as they are added to the offset for S, the index will be out of
//    range for S and the loop stops. Hence it is safe to replace a and
//    c by a large value that still fits in 32 bits whenever they would
//    exceed 32 bits.
//    Except that the choice of this ``large value'' requires some
//    caution. We need a value which can be used either for a or c, or
//    both, so that adding a, or c, or both, to a value within [0,IJ[ is
//    guaranteed to exceed IJ, but can't wrap around. Up to logI=15, it's
//    rather easy. With the rescaling of J, at worst we may have IJ
//    within the range [2^29,2^30[. Thus if a and c are set to 2^30-1,
//    a.k.a. INT32_MAX/2, then adding either, or the sum of both, to a
//    valid value x is guaranteed to be at most 3*2^30, which fits within
//    32 bits.
//    For logI=16, it's much, much harder. Unless somebody comes up with
//    a nice idea, I see no way to avoid 64-bit arithmetic (which has
//    some cost, sure, but is not terribly expensive). For consistency,
//    we make all data types for x, a, and c 64-bit in this case, and
//    choose the overflow constants as UINT32_MAX.
//

// OBSOLETE: the large value is now UINT64_MAX>>2, so plattice_x_t must
// be 64 bits. This handles logI <= 31, with logJ = logI - 1.


// Return value:
// * non-zero if everything worked ok
// * zero when the algorithm failed. This can happen when p is a prime power,
//   and g = gcd(p,r) >= I, since then the subtractive Euclidean algorithm will
//   yield (a0=g, b0=0) at some point --- or the converse --- and the loop
//   while (|a0| >= I) a0 += b0 would loop forever.
//
// Note that on a c166 example, this code alone accounts for almost 20%
// of the computation time.
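
// Worked example (illustrative only): take p = 11, r = 3, I = 4. The
// p-lattice is generated by (11, 0) and (3, 1). One Euclidean step from
// (a0,a1) = (-11, 0), (b0,b1) = (3, 1) gives a = (-2, 3), b = (3, 1),
// which satisfies the conditions of Proposition 1:
//    -4 < -2 <= 0 <= 3 < 4,   b0 - a0 = 5 >= 4,   a1, b1 > 0,
// and both vectors are indeed in the lattice, since
//    -2 == 3*3 (mod 11)  and  3 == 3*1 (mod 11).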

typedef uint64_t plattice_x_t;

struct plattice_info_t;  // IWYU pragma: keep
static int reduce_plattice (plattice_info_t *, fbprime_t, fbroot_t, uint32_t);


/* Information on a p-lattice: lattice basis coordinates, and auxiliary
   values that can be derived directly from them, such as the constants
   used by the Franke-Kleinjung algorithm */
struct plattice_info_t {
#if MOD2_CLASSES_BS
  static const int shift = 1;
#else
  static const int shift = 0;
#endif

  int32_t a0;
  uint32_t a1;
  int32_t b0;
  uint32_t b1;

  plattice_info_t(const fbprime_t p, const fbroot_t r, const bool proj, const int logI) {
    /* placate compiler. Sure, we shouldn't need it really, but
     * admittedly the control flow through reduce_plattice is contrived.
     */
    a0 = a1 = b0 = b1 = 0;
    if (UNLIKELY(proj && r == 0)) {
      /* This lattice basis might work in principle, but it generates hits in
         all locations i=1, ..., I/2-1, j = 0, all of which are useless except
         i=1, j=0.  */
      // Note: if J>p, we are missing some hits, here.
      a1 = p;
      a0 = -((int32_t)1 << logI) + 1;
      b1 = 0;
      b0 = 1;
    } else if (UNLIKELY(!proj && r == 0)) {
      a1 = ((int32_t)1 << logI) + 1;
      a0 = -p; /* Thus bound0 = p > I, inc_c is always added */
      b1 = 1;
      b0 = 0; /* Thus bound1 = I - b0 = I, inc_a is never added */
    } else {
      ASSERT_ALWAYS(!proj);
      /* One essential condition of proposition 1 in FrKl05 is that p >= I */
      ASSERT(p >> logI);
      int rc = reduce_plattice (this, p, r, 1U << logI);
      /* These are the constraints which reduce_plattice should meet */
      ASSERT(-(INT32_C(1) << logI) < a0);
      ASSERT(a0 <= 0);
      ASSERT(b0 >= 0);
      ASSERT(b0 < (INT32_C(1) << logI));
      ASSERT((b0-a0) >= (INT32_C(1) << logI));
      ASSERT(a1 > 0);
      ASSERT(b1 > 0);
      if (UNLIKELY(rc == 0)) {
        /* gcd(r, p) > I. We currently can't handle this case. Set everything
           to zero to signal to calling code that this is not a valid basis.
           Is there a better way? Exception, maybe? */
        a0 = a1 = b0 = b1 = 0;
      }
    }
  }

  plattice_info_t(int32_t aa0, uint32_t aa1, int32_t bb0, uint32_t bb1) {
      a0 = aa0;
      a1 = aa1;
      b0 = bb0;
      b1 = bb1;
  }

  /* Return the four coordinates, but multiplied by 2 if we use mod 2 classes */
  int32_t get_a0() const {return a0 << shift;}
  uint32_t get_a1() const {return a1 << shift;}
  int32_t get_b0() const {return b0 << shift;}
  uint32_t get_b1() const {return b1 << shift;}

  uint32_t get_bound0(const int logI MAYBE_UNUSED) const {
      return -get_a0();
  }

  uint32_t get_bound1(const int logI) const {
      return (1U << logI) - get_b0();
  }

  plattice_x_t get_inc_a(const int logI) const {
      return ((uint64_t)get_a1() << logI) + (int64_t)get_a0();
  }

  plattice_x_t get_inc_c(const int logI) const {
    return ((uint64_t)get_b1() << logI) + (int64_t)get_b0();
  }

  uint32_t det() const {return (-a0)*b1 + a1*b0;};
};
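
/* A note on the encoding used by get_inc_a() and get_inc_c(): a sieve
 * location (i,j) with -I/2 <= i < I/2 is stored linearly as
 *     x = (i + I/2) + (j << logI).
 * Adding get_inc_a(logI) to x therefore moves (i,j) to (i+a0, j+a1):
 * the signed a0 is folded into the low logI bits and a1 goes into the
 * high part. Likewise for get_inc_c(logI) and the vector b. */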

/* We have several versions of reduce_plattice. Which one is best is
 * reportedly CPU-dependent, which I can imagine.
 *
 * I've run some benchmarks on a skylake i5 in 2017/05 with gcc-6.3.0.
 * The C version below is almost on par with the unrolled version further
 * down, but the latter still has a tiny edge nevertheless. We seem to be
 * well within the margin of error at this point, so more tests would be
 * useful.
 *
 * (which unrolling choice gets the best performance for the C code is
 * not measurable at this point).
 */
#if 0
// Original version of reduce_plattice, always with a division for each step.
// This version is pretty fast on new powerful processors, but slow on others.
// I keep it because in a few years, if the int32_t division gets faster, it
// is possible that this code will again be the fastest.

NOPROFILE_INLINE int
reduce_plattice (plattice_info_t *pli, const fbprime_t p, const fbroot_t r, uint32_t I)
{
  int32_t a0 = -((int32_t) p), b0 = (int32_t) r, a1 = 0, b1 = 1, k;
#if MOD2_CLASSES_BS
  const int32_t hI = (int32_t) (I>>1);
#else
  const int32_t hI = (int32_t) I;
#endif
  const int32_t mhI = -hI;
  while (LIKELY(b0 >= hI)) {
    k = a0 / b0; a0 %= b0; a1 -= k * b1;
    if (UNLIKELY(a0 > mhI)) break;
    k = b0 / a0; b0 %= a0; b1 -= k * a1;
    /* We may conceivably unroll a bit more, or a bit less, here. Just
     * tuck in as many copies of the following block as you wish. */
#if 1
    if (UNLIKELY(b0 < hI )) break;
    k = a0 / b0; a0 %= b0; a1 -= k * b1;
    if (UNLIKELY(a0 > mhI)) break;
    k = b0 / a0; b0 %= a0; b1 -= k * a1;
#endif
  }
  k = b0 - hI - a0;
  if (b0 > -a0) {
    if (UNLIKELY(!a0)) return 0;
    k /= a0; b0 -= k * a0; b1 -= k * a1;
  } else {
    if (UNLIKELY(!b0)) return 0;
    k /= b0; a0 += k * b0; a1 += k * b1;
  }
  pli->a0 = (int32_t) a0; pli->a1 = (uint32_t) a1; pli->b0 = (int32_t) b0; pli->b1 = (uint32_t) b1;
  return 1;
}
#else
// The fastest version of reduce_plattice on processors from Intel
// Nehalem / AMD Opteron onwards.
// The C version is for other architectures; on x86, with a GOOD C
// compiler (tested with gcc 4.6.X, 4.7.X, 4.8.X), it reaches almost
// (99% of) the speed of the asm version.
// Avoid modifying this code...
// This version has been tuned over several weeks; more than 50
// versions of this routine have been tested over and over. A. Filbois.

// Main idea: to avoid a division & multiplication, we test the sign of
// a0 + b0 * 4 (resp. b0 + a0 * 4). If this sign differs from the sign
// of a0 (resp. b0), the quotient of a0 / b0 is between 0 and 3, and
// needs at most (4-1)*2 conditional assignments.
// The key optimisation is the right choice of the number of
// conditional assignments. It's not exactly the same between Intel
// Nehalem & AMD Opteron, but this code is the best deal for both.
//
// NB: Be careful if you try to change the "4" value. You have to change
// the guard constants 0xE6666667 = -0x7FFFFFFF/(4+1) and 0x19999999
// = 0x7FFFFFFF / (4+1).
// These guards are needed because the conditional assignments rely on a
// sign change; this sign may change at most once, not twice (which
// would mean an overflow). In fact, these guards are 99.99999% useless.

NOPROFILE_INLINE int
reduce_plattice (plattice_info_t *pli, const fbprime_t p, const fbroot_t r, const uint32_t I)
{
#if MOD2_CLASSES_BS
  const int32_t hI = (int32_t) (I >> 1);
#else
  const int32_t hI = (int32_t) I;
#endif
  int64_t a0 = - (int64_t) p, b0 = (int64_t) r, a1, b1;
  /* Mac OS X 10.8 ships a version of llvm which crashes on the code
   * below (it could be that the constraints push the compiler's
   * behaviour too far).
   *
   * See tracker #16540
   */
#if 0 // def HAVE_GCC_STYLE_AMD64_INLINE_ASM
#if defined(__APPLE_CC__) && defined(__llvm__)
#if __APPLE_CC__ == 5621
#define AVOID_ASM_REDUCE_PLATTICE
#endif
#endif
#else
#define AVOID_ASM_REDUCE_PLATTICE
#endif

#ifndef AVOID_ASM_REDUCE_PLATTICE
#define RPA(LABEL)      \
          "addl %2, %0\n"						\
          "leal (%0,%2,4), %%edx\n"					\
          "addl %3, %1\n"						\
          "testl %%edx, %%edx\n"					\
          "movl %0, %%eax\n"						\
          "jng " LABEL "\n"						\
          "addl %2, %%eax\n"						\
          "leal (%1,%3,1), %%edx\n"					\
          "cmovngl %%eax, %0\n"						\
          "cmovngl %%edx, %1\n"						\
          "addl %3, %%edx\n"						\
          "addl %2, %%eax\n"						\
          "cmovngl %%edx, %1\n"						\
          "cmovngl %%eax, %0\n"						\
          "addl %3, %%edx\n"						\
          "addl %2, %%eax\n"						\
          "cmovngl %%edx, %1\n"						\
          "cmovngl %%eax, %0\n"
#define RPB(LABEL)      \
          "addl %0, %2\n"						\
          "leal (%2,%0,4), %%edx\n"					\
          "addl %1, %3\n"						\
          "testl %%edx, %%edx\n"					\
          "movl %2, %%eax\n"						\
          "jns " LABEL "\n"						\
          "addl %0, %%eax\n"						\
          "leal (%1,%3,1), %%edx\n"					\
          "cmovnsl %%eax, %2\n"						\
          "cmovnsl %%edx, %3\n"						\
          "addl %1, %%edx\n"						\
          "addl %0, %%eax\n"						\
          "cmovnsl %%edx, %3\n"						\
          "cmovnsl %%eax, %2\n"						\
          "addl %1, %%edx\n"						\
          "addl %0, %%eax\n"						\
          "cmovnsl %%edx, %3\n"						\
          "cmovnsl %%eax, %2\n"
#define RPC     \
          "cltd\n"							\
          "idivl %2\n"							\
          "imull %3, %%eax\n"						\
          "movl %%edx, %0\n"						\
          "subl %%eax, %1\n"
#define RPD "cltd\n"    \
          "idivl %0\n"							\
          "imull %1, %%eax\n"						\
          "movl %%edx, %2\n"						\
          "subl %%eax, %3\n"

  int32_t mhI;
  __asm__ __volatile__ (
            "xorl %1, %1\n"
            "cmpl %2, %5\n"
            "movl $0x1, %3\n"
            "jg 9f\n"
            "movl %5, %%eax\n"
            "negl %%eax\n"
            "movl %%eax, %4\n"
            "movl %0, %%eax\n"
            "cltd\n"
            "idivl %2\n"
            "subl %%eax, %1\n"
            "cmpl $0xe6666667, %%edx\n"
            "movl %%edx, %0\n"
            "jl 0f\n"
            ".balign 8\n"
            "1:\n"
            "cmpl %0, %4\n"
            "jl 9f\n"
            RPB("3f")
            "2:\n"
            "cmpl %2, %5\n"
            "jg 9f\n"
            RPA("4f")
            "jmp 1b\n"
            ".balign 8\n"
            "3:\n"
            RPD
            "jmp 2b\n"
            ".balign 8\n"
            "4:\n"
            RPC
            "jmp 1b\n"
            ".balign 8\n"
            "0:\n"
            "movl %2, %%eax\n"
            "cltd\n"
            "idivl %0\n"
            "imull %1, %%eax\n"
            "subl %%eax, %3\n"
            "cmpl $0x19999999, %%edx\n"
            "movl %%edx, %2\n"
            "jle 2b\n"
            "movl %0, %%eax\n"
            "cltd\n"
            "idivl %2\n"
            "imull %3, %%eax\n"
            "subl %%eax, %1\n"
            "cmpl $0xe6666667, %%edx\n"
            "movl %%edx, %0\n"
            "jge 1b\n"
            "jmp 0b\n"
            "9:\n"
           : "+&r"(a0), "=&r"(a1), "+&r"(b0), "=&r"(b1),
             "=&rm"(mhI) : "rm"(hI) : "%rax", "%rdx", "cc");
#else
#define RPA do {							\
    a0 += b0; a1 += b1;							\
    if (LIKELY(a0 + b0 * 4 > 0)) {					\
      int64_t c0 = a0, c1 = a1;						\
      c0 += b0; c1 += b1; if (LIKELY(c0 <= 0)) { a0 = c0; a1 = c1; }	\
      c0 += b0; c1 += b1; if (LIKELY(c0 <= 0)) { a0 = c0; a1 = c1; }	\
      c0 += b0; c1 += b1; if (LIKELY(c0 <= 0)) { a0 = c0; a1 = c1; }	\
    } else								\
      RPC;								\
  } while (0)
#define RPB do {							\
    b0 += a0; b1 += a1;							\
    if (LIKELY(b0 + a0 * 4 < 0)) {					\
      int64_t c0 = b0, c1 = b1;						\
      c0 += a0; c1 += a1; if (LIKELY(c0 >= 0)) { b0 = c0; b1 = c1; }	\
      c0 += a0; c1 += a1; if (LIKELY(c0 >= 0)) { b0 = c0; b1 = c1; }	\
      c0 += a0; c1 += a1; if (LIKELY(c0 >= 0)) { b0 = c0; b1 = c1; }	\
    } else								\
      RPD;								\
    } while (0)
#define RPC do {					\
    int64_t k = a0 / b0; a0 %= b0; a1 -= k * b1;	\
  } while (0)
#define RPD do {					\
    int64_t k = b0 / a0; b0 %= a0; b1 -= k * a1;	\
  } while (0)
  /* This code may look odd (the do-while loop is entered through the
     p15 label in its middle), but gcc generates the fastest asm with
     it... */
  a1 = 0; b1 = 1;
  if (LIKELY(b0 >= hI)) {
    const int32_t mhI = -hI;
    RPC;
    while (UNLIKELY(a0 < -0X7FFFFFFF / 5)) {
      RPD;
      if (UNLIKELY(b0 < 0X7FFFFFFF / 5)) goto p15;
      RPC;
    }
    if (LIKELY(a0 <= mhI))
      do {
	RPB;
      p15:
	if (UNLIKELY(b0 < hI)) break;
	RPA;
      } while (LIKELY(a0 <= mhI));
  }
#endif /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */

  int64_t k = b0 - hI - a0;
  if (b0 > -a0) {
    if (UNLIKELY(!a0)) return 0;
    k /= a0; b0 -= k * a0; b1 -= k * a1;
  } else {
    if (UNLIKELY(!b0)) return 0;
    k /= b0; a0 += k * b0; a1 += k * b1;
  }
  pli->a0 = (int32_t) a0; pli->a1 = (uint32_t) a1; pli->b0 = (int32_t) b0; pli->b1 = (uint32_t) b1;
  return 1;
}
#endif


/* Like plattice_info_t, but remembers the offset of the factor base entry
   that generated each lattice basis. This offset becomes the "prime hint"
   in the bucket updates. */
struct plattice_sieve_entry : public plattice_info_t {
  slice_offset_t hint;
  plattice_sieve_entry(const fbprime_t p, const fbroot_t r, const bool proj, const int logI, const slice_offset_t hint)
     : plattice_info_t(p, r, proj, logI), hint(hint) {};
};
// Compute 1/x mod m
// For m=2,3,6, this is trivial. Otherwise, it still has to be implemented.
static inline uint32_t invmod(uint32_t x, uint32_t m) {
    if (m==2 || m==3) {
        return x;
    } else if (m==6) {
        ASSERT((x==1) || (x==5));
        return x;
    } else {
        ASSERT_ALWAYS(0);
        return 0; /* not reached */
    }
}
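
// Why returning x is correct above: the units modulo 2, 3, and 6 all
// square to 1 (e.g. modulo 6 the units are 1 and 5, and 5*5 = 25 == 1
// (mod 6)), so every invertible x is its own inverse.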

/* This is (currently) common across all instantiations of
 * plattice_enumerator. However if someday plattice_x_t changes type to
 * vary depending on the level (which, in my reckoning, is now doable),
 * then we should probably just move this function to the
 * plattice_enumerator class anyway (or, equivalently, make it a template
 * based on plattice_x_t).
 */
struct plattice_enumerator_base {
static plattice_x_t starting_point(const plattice_info_t &pli,
        const int logI, const sublat_t &sublat) {
    int64_t I = int64_t(1)<<logI;
    uint32_t m  = sublat.m;
    uint32_t i0 = sublat.i0;
    uint32_t j0 = sublat.j0;

    // first FK vector a
    int64_t a0 = pli.get_a0();
    int64_t a1 = pli.get_a1();
    // second FK vector b
    int64_t b0 = pli.get_b0();
    int64_t b1 = pli.get_b1();

    // FIXME: we should develop a better understanding of those cases
    // (and not only in the sublat case, to be honest).
    // Right now, we deactivate them by putting the starting point
    // beyond the limit.
    if ((b0 == 0) || (b1 == 0)) {
        return plattice_x_t(UMAX(plattice_x_t));
    }

    // If a1 or b1 reaches 20 bits, then it was saturated during the
    // dense storage. Let's skip this prime, for which the FK basis is
    // very skewed. (Only a few hits are missed.)
    if ((a1 == ((1<<20)-1)) || (b1 == ((1<<20)-1))) {
        return plattice_x_t(UMAX(plattice_x_t));
    }

    // Look for alpha and beta such that
    //   alpha*a + beta*b == (i0,j0) mod m
    // This is a 2x2 system of determinant p, coprime to m.
    int64_t det = pli.det(); // det() returns the opposite of what we want
    det = (-det) % m;
    if (det < 0)
        det += m;
    det = invmod(det, m);
    int64_t al = ( b1*i0 - b0*j0) % m;
    int64_t be = (-a1*i0 + a0*j0) % m;
    al = (al*det) % m;
    be = (be*det) % m;
    if (al < 0)
        al += m;
    if (be < 0)
        be += m;

    // Now, compute this potential starting point:
    int64_t ii = (al*a0 + be*b0 - i0) / m; // exact divisions
    int64_t jj = (al*a1 + be*b1 - j0) / m;

    // But here, ii might be beyond the bounds. So, let's fix it.
    // It should be enough to subtract one of the FK vectors.
    // Note that a is the vector with negative abscissa.
    if (ii < -I/2) {
        ASSERT(ii - a0 >= 0);
        ii -= a0;
        jj -= a1;
    } else if (ii > I/2-1) {
        ASSERT(ii - b0 <= 0);
        ii -= b0;
        jj -= b1;
    }
    ASSERT ((ii >= -I/2) && (ii < I/2));

    // But now, jj might be negative! So let's run the FK walk until it
    // goes positive.
    while (jj < 0) {
        int64_t aux = ii;
        if (aux >= I/2 - b0) {
            ii += a0;
            jj += a1;
        }
        if (aux < -I/2 - a0) {
            ii += b0;
            jj += b1;
        }
    }

    // Now, (ii,jj) is the starting point we are looking for. Let's
    // convert it to the plattice_x_t type.
    plattice_x_t res = (ii+I/2) + (jj<<logI);
    return res;
}
};
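
/* The 2x2 solve in starting_point() is just Cramer's rule: writing
 * D = a0*b1 - a1*b0 (so that det() above returns -D, whence the sign
 * flip), the solution of alpha*a + beta*b == (i0,j0) (mod m) is
 *     alpha = (b1*i0 - b0*j0) / D,   beta = (-a1*i0 + a0*j0) / D,
 * and the division is well defined mod m because |D| = p is coprime
 * to m. */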

/* Class for enumerating lattice points with the Franke-Kleinjung algorithm */
template<int LEVEL>
class plattice_enumerator : public plattice_enumerator_base {
protected:
    // Maybe at some point, the plattice_x_t type could be templated in
    // order to have it 32 bits for non-top levels.
    plattice_x_t inc_a, inc_c;
    uint32_t bound0, bound1;
    slice_offset_t hint;
    plattice_x_t x;

public:

    struct fence {
        uint32_t maskI;
        plattice_x_t even_mask;
        plattice_x_t end;
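        /* maskI extracts the low logI bits of x, i.e. i + I/2;
         * even_mask selects the low bit of the i-part and of the j-part,
         * so that probably_coprime() can reject (i,j) both even;
         * end = J << logI is one past the last valid position x. */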
        fence(const int logI, int J) {
            maskI = (1U << logI) - 1U;
            even_mask = (plattice_x_t(1) << logI) | plattice_x_t(1);
            end = plattice_x_t(J) << logI;
        }
        fence(const int logI, int J, plattice_x_t cap) : fence(logI, J) {
            if (end >= cap) end = cap;
        }
    };

    plattice_enumerator(const plattice_info_t &basis,
            const slice_offset_t hint, const int logI, const sublat_t &sublat)
        : hint(hint)
    {
        inc_a = basis.get_inc_a(logI);
        inc_c = basis.get_inc_c(logI);
        bound0 = basis.get_bound0(logI);
        bound1 = basis.get_bound1(logI);
        if (!sublat.m)
            x = plattice_x_t(1) << (logI-1);
        else {
            x = plattice_enumerator_base::starting_point(basis, logI, sublat);
        }
    }

    plattice_enumerator(const plattice_info_t &basis,
            const slice_offset_t hint, const int logI)
        : hint(hint)
    {
        inc_a = basis.get_inc_a(logI);
        inc_c = basis.get_inc_c(logI);
        bound0 = basis.get_bound0(logI);
        bound1 = basis.get_bound1(logI);
        x = plattice_x_t(1) << (logI-1);
    }


    plattice_enumerator(const plattice_enumerator&) = default;

    /* This function is quite critical */
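    /* Franke-Kleinjung step, with i + I/2 in the low logI bits of x,
     * bound0 = -a0 and bound1 = I - b0 (note bound0 >= bound1, since
     * b0 - a0 >= I):
     *    i + I/2 <  bound1           : add vector b only (inc_c)
     *    bound1 <= i + I/2 < bound0  : add both a and b
     *    i + I/2 >= bound0           : add vector a only (inc_a)
     * The two tests below are deliberately not mutually exclusive. */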
    void next(fence const & F) {
      uint32_t i = x & F.maskI;
      if (i >= bound1)
        x += inc_a;
      if (i < bound0)
        x += inc_c;
    }

    /* Currently merely checks that not both are even */
    bool probably_coprime(fence const & F) const {return (x & F.even_mask) != 0;}

    inline bool done(fence const & F) { return x >= F.end; }
    void advance_to_next_area(fence const & F) { x -= F.end; }

    inline void advance_to_end_of_projective_first_line(fence const & F)
    {
        /* This function is not critical at all. We want the last
         * matching position on the line (i,0). This depends on the
         * (b0,b1) vector, and works **ONLY** in the projective case, and
         * **ONLY** while we're on the first line !
         *
         * for projective non-powers, we should have (b0,b1)=(1,0),
         * inc_c=1, and bound1=I-1. However we might have something
         * different for projective powers. Presently, powers are not
         * bucket-sieved anyway, so there's little point in bothering.
         * (see "Shall we enable bucket-sieving for powers" in
         * las-fill-in-buckets.cpp)
         */
        x = F.maskI;
        ASSERT(inc_c == 1);
        ASSERT(bound1 == F.maskI);
    }
    plattice_x_t get_x() const {return x;}
    void set_x(plattice_x_t xx) {x = xx;}
    plattice_x_t get_bound1() const {return bound1;}
    plattice_x_t get_inc_c() const {return inc_c;}
    slice_offset_t get_hint() const {return hint;}
};
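
/* Minimal usage sketch (illustrative only, not part of the sieve): walk
 * all hits of one p-lattice over a region of J lines, assuming a valid
 * non-sublat basis `pli`. The function name and loop-body variables are
 * ours. */
#if 0
static inline void plattice_walk_demo(const plattice_info_t & pli,
        const int logI, const int J)
{
    plattice_enumerator<1> e(pli, 0 /* hint */, logI);
    plattice_enumerator<1>::fence F(logI, J);
    for ( ; !e.done(F) ; e.next(F)) {
        if (!e.probably_coprime(F))
            continue;
        /* low logI bits hold i + I/2, the high bits hold j */
        uint32_t i_shifted = e.get_x() & F.maskI;
        uint32_t j = (uint32_t) (e.get_x() >> logI);
        /* ... record a hit at (i_shifted - I/2, j) ... */
        (void) i_shifted; (void) j;
    }
}
#endif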

/* Also enumerates lattice points, but probably_coprime() does a full gcd()
   to ensure that points are really coprime. Very slow. Not used, just there
   for experimenting.*/
template<int LEVEL>
class plattice_enumerator_coprime : public plattice_enumerator<LEVEL> {
  unsigned long u, v;
  typedef plattice_enumerator<LEVEL> super;
  typedef typename super::fence fence;
public:
  plattice_enumerator_coprime(const plattice_info_t &basis,
          const slice_offset_t hint, const int logI, const sublat_t &sublat)
    : plattice_enumerator<LEVEL>(basis, hint, logI, sublat), u(0), v(0) {}
  void next(fence const & F) {
    uint32_t i = super::x & F.maskI;
    if (i >= super::bound1) { super::x += super::inc_a; u++; }
    if (i < super::bound0) { super::x += super::inc_c; v++; }
  }
  bool probably_coprime(typename plattice_enumerator<LEVEL>::fence const & F) const {
    return (super::x & F.even_mask) != 0 && gcd_ul(u, v) == 1;
  }
};

template<int LEVEL>
class plattices_vector_t:
        public std::vector<plattice_enumerator<LEVEL>> {
    /* The index here is the global index, across all fb parts */
    slice_index_t index = -1;
    double weight = 0;
public:
    plattices_vector_t() = default;
    plattices_vector_t(slice_index_t index, double weight) : index(index), weight(weight) {}
    /* returns a global index */
    slice_index_t get_index() const {return index;};
    double get_weight() const {return weight;};
};

/* Dense version of plattice_info_t and friends for long-term storage in
 * sublat mode. */

/* This can now be a template. We don't use it, so far.
 */
template<int /* LEVEL */>
struct plattice_info_dense_t {
    uint32_t pack[3];
    // This pack of 96 bits is enough to contain
    //   minus_a0, b0, a1, b1
    // as 20-bit unsigned integers and
    //   hint
    // as a 16-bit integer.
    //
    // Note that minus_a0 and b0 are less than I, so this is ok, but
    // a1 and b1 could be larger. However, this is for very skewed
    // plattices, and we lose only a few hits by skipping those primes.
    // So we saturate them at 2^20-1 for later detection.
    //
    // uint16_t hint; // FIXME: this could be recovered for free...
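    //
    // Layout, as implemented by the constructor and unpack() below:
    //   pack[0]: minus_a0 in bits 0-19, low 12 bits of b0 in bits 20-31
    //   pack[1]: high 8 bits of b0 in bits 0-7, a1 in bits 8-27,
    //            low 4 bits of b1 in bits 28-31
    //   pack[2]: high 16 bits of b1 in bits 0-15, hint in bits 16-31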

    plattice_info_dense_t(const plattice_info_t & pli, uint16_t _hint) {
        uint32_t minus_a0;
        uint32_t b0;
        uint32_t a1;
        uint32_t b1;
        uint16_t hint;
        hint = _hint;
        // Handle orthogonal lattices (proj and r=0 cases)
        if (pli.b0 == 1 && pli.b1 == 0) {
            b0 = 1;
            b1 = 0;
            minus_a0 = UMAX(uint32_t);
            a1 = pli.a1;
        } else if (pli.b0 == 0 && pli.b1 == 1) {
            b0 = 0;
            b1 = 1;
            minus_a0 = UMAX(uint32_t);
            a1 = pli.a1;
        } else {
            // generic case: true FK-basis
            ASSERT(pli.b0 >= 0);
            ASSERT(pli.a0 <= 0);
            minus_a0 = -pli.a0;
            a1 = pli.a1;
            b0 = pli.b0;
            b1 = pli.b1;
        }
        uint32_t mask8  = (1<<8)-1;
        uint32_t mask16 = (1<<16)-1;
        uint32_t mask20 = (1<<20)-1;

        // Saturate skewed lattices, for later detection and skipping.
        if (a1 > mask20)
            a1 = mask20;
        if (b1 > mask20)
            b1 = mask20;

        pack[0] = (minus_a0 & mask20) | (b0 << 20);
        pack[1] = ((b0 >> 12) & mask8 ) | ((a1 & mask20) << 8) | (b1 << 28);
        pack[2] = ((b1 >> 4) & mask16) | (hint << 16);
    }

    plattice_info_t unpack(const int logI) const {
        uint32_t minus_a0;
        uint32_t b0;
        uint32_t a1;
        uint32_t b1;

        uint32_t mask8 = (1<<8)-1;
        uint32_t mask16 = (1<<16)-1;
        uint32_t mask20 = (1<<20)-1;
        minus_a0 = pack[0] & mask20;
        b0 = (pack[0] >> 20) | ((pack[1] & mask8) << 12);
        a1 = (pack[1] >> 8) & mask20;
        b1 = (pack[1] >> 28) | ((pack[2] & mask16) << 4);

        plattice_info_t pli(-int32_t(minus_a0), uint32_t(a1),
        int32_t(b0), uint32_t(b1));
        // Orthogonal bases
        if (pli.b0 == 1 && pli.b1 == 0) {
            pli.a0 = -((int32_t)1 << logI) + 1;
        } else if (pli.b0 == 0 && pli.b1 == 1) {
            pli.a1 = ((int32_t)1 << logI) + 1;
        }
        return pli;
    }

    uint16_t get_hint() const {
        return pack[2] >> 16;
    }

};
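
/* Round-trip sketch (illustrative only): for a generic, non-orthogonal,
 * non-saturated basis, unpack() must undo the packing exactly. The
 * function name is ours. */
#if 0
static inline bool plattice_pack_roundtrip_demo(const plattice_info_t & pli,
        const int logI)
{
    plattice_info_dense_t<1> d(pli, 42 /* arbitrary hint */);
    plattice_info_t back = d.unpack(logI);
    return back.a0 == pli.a0 && back.a1 == pli.a1
        && back.b0 == pli.b0 && back.b1 == pli.b1
        && d.get_hint() == 42;
}
#endif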

template<int LEVEL>
class plattices_dense_vector_t:
        public std::vector<plattice_info_dense_t<LEVEL>> {
public:
#if !GNUC_VERSION(4,7,2)
    /* Apparently there's a bug in gcc 4.7.2, which does not recognize
     * that a vector of plattices_dense_vector_t is ok with a move ctor
     * and no copy ctor. I'm not entirely sure why, and I don't really
     * want to investigate. Theoretically, defining ctors as below should
     * be fine.
     */
    plattices_dense_vector_t(plattices_dense_vector_t const&) = delete;
    plattices_dense_vector_t(plattices_dense_vector_t&&) = default;
    plattices_dense_vector_t& operator=(plattices_dense_vector_t&&) = default;
    plattices_dense_vector_t() = default;
#endif
};

// std::vector<plattices_dense_vector_t<LEVEL>> is for remembering the FK
// basis in sublat mode, between two different congruences of (i,j) mod
// m. For simplicity, we remember them only for the toplevel.
template<int LEVEL>
struct precomp_plattice_dense_t {
    typedef std::vector<plattices_dense_vector_t<LEVEL>> type;
};


#if 0
/* MOD2_CLASSES_BS was an attempt, at some point, to support bucket
 * sieving in several passes for congruence classes mod 2. It was never
 * finished, and has never been tested. The last traces of that code may
 * be found in the git history perhaps. To date, the only remaining bit
 * that can still be considered interesting is the comment fragment below
 * in plattice_starting_vector
 */

/* This is for working with congruence classes only */
NOPROFILE_INLINE
plattice_x_t plattice_starting_vector(const plattice_info_t * pli, sieve_info_srcptr si, unsigned int par )
{
    /* With MOD2_CLASSES_BS set up, we have computed by the function
     * above an adapted basis for the band of total width I/2 (thus from
     * -I/4 to I/4). This adapted basis is in the coefficients a0 a1 b0
     *  b1 of the pli data structure.
     *
     * Now as per Proposition 1 of FrKl05 applied to I/2, any vector
     * whose i-coordinates are within ]-I/2,I/2[ (<ugly>We would like a
     * closed interval on the left. Read further on for that case</ugly>)
     * can actually be written as a combination with positive integer
     * coefficients of these basis vectors a and b.
     *
     * We also know that the basis (a,b) has determinant p, thus odd. The
     * congruence class mod 2 that we want to reach is thus accessible.
     * It is even possible to find coefficients (k,l) in {0,1} such that
     * ka+lb is within this congruence class. This means that we're going
     * to consider either a,b,or a+b as a starting vector. The
     * i-coordinates of these, as per the properties of Proposition 1, are
     * within ]-I/2,I/2[. Now all other vectors with i-coordinates in
     * ]-I/2,I/2[ which also belong to the same congruence class, can be
     * written as (2k'+k)a+(2l'+l)b, with k' and l' necessarily
     * nonnegative.
     *
     * The last ingredient is that (2a, 2b) forms an adapted basis for
     * the band of width I with respect to the lattice 2p. It's just a
     * homothetic transformation.
     *
     * To find (k,l), we proceed like this. First look at the (a,b)
     * matrix mod 2:
     *                 a0&1    a1&1
     *                 b0&1    b1&1
     * Its determinant is odd, thus the inverse mod 2 is:
     *                 b1&1    a1&1
     *                 b0&1    a0&1
     * Now the congruence class is given by the parity argument. The
     * vector is:
     *                par&1,   par>>1
     * Multiplying this vector by the inverse matrix above, we obtain the
     * coordinates k,l, which are:
     *            k = (b1&par&1)^(b0&(par>>1));
     *            l = (a1&par&1)^(a0&(par>>1));
     * Now our starting vector is ka+lb. Instead of multiplying by k and
     * l with values in {0,1}, we mask with -k and -l, which both are
     * either all zeroes or all ones in binary
     *
     */
    /* Now for the extra nightmare. Above, we do not have the guarantee
     * that a vector whose i-coordinate is precisely -I/2 has positive
     * coefficients in our favorite basis. It's annoying, because it may
     * well be that if such a vector also has positive j-coordinate, then
     * it is in fact the first vector we will meet. An example is given
     * by the following data:
     *
            f:=Polynomial(StringToIntegerSequence("
                -1286837891385482936880099433527136908899552
                55685111236629140623629786639929578
                13214494134209131997428776417
                -319664171270205889372
                -17633182261156
                40500"));

            q:=165017009; rho:=112690811;
            a0:=52326198; b0:=-1; a1:=60364613; b1:=2;
            lI:=13; I:=8192; J:=5088;
            p:=75583; r0:=54375;
            > M;
            [-2241    19]
            [ 1855    18]
            > M[1]-M[2];
            (-4096     1)

    * Clearly, above, for the congruence class (0,1), we must start with
    * this vector, not with the sum.
    */
    int32_t a0 = pli->a0;
    int32_t a1 = pli->a1;
    int32_t b0 = pli->b0;
    int32_t b1 = pli->b1;

    int k = -((b1&par&1)^(b0&(par>>1)));
    int l = -((a1&par&1)^(a0&(par>>1)));
    int32_t v[2]= { (a0&k)+(b0&l), (a1&k)+(b1&l)};

    /* handle exceptional case as described above */
    if (k && l && a0-b0 == -(1 << (si->conf->logI-1)) && a1 > b1) {
        v[0] = a0-b0;
        v[1] = a1-b1;
    }
    return (v[1] << si->conf->logI) | (v[0] + (1 << (si->conf->logI-1)));
}
#endif

/* All of these are defined in las-plattice.cpp */
extern template class plattice_enumerator<1>;
extern template class plattice_enumerator_coprime<1>;
extern template class plattices_vector_t<1>;
extern template struct plattice_info_dense_t<1>;
extern template class plattices_dense_vector_t<1>;
extern template struct precomp_plattice_dense_t<1>;
extern template class plattice_enumerator<2>;
extern template class plattice_enumerator_coprime<2>;
extern template class plattices_vector_t<2>;
extern template struct plattice_info_dense_t<2>;
extern template class plattices_dense_vector_t<2>;
extern template struct precomp_plattice_dense_t<2>;
extern template class plattice_enumerator<3>;
extern template class plattice_enumerator_coprime<3>;
extern template class plattices_vector_t<3>;
extern template struct plattice_info_dense_t<3>;
extern template class plattices_dense_vector_t<3>;
extern template struct precomp_plattice_dense_t<3>;



#endif