1 #ifndef LAS_PLATTICE_H
2 #define LAS_PLATTICE_H
3
4 #include <cstdint>
5 #include <vector>
6
7 #include "macros.h"
8
9 #include "fb-types.h"
10 #include "las-config.h" // for NOPROFILE_INLINE
11 #include "misc.h" // for UMAX
12 #include "gcd.h" // for gcd_ul
13
14 /*******************************************************************/
15 /******** Walking in p-lattices **********/
16
17 /* p-lattice stuff */
18
19
20 // Compute the root r describing the lattice inside the q-lattice
21 // corresponding to the factor base prime (p,R).
22 // Formula: r = - (a1-R*b1)/(a0-R*b0) mod p
23 // Assumes p < 2^32
24
25 /* General version of the lattice transform function. Allows projective
26 roots in input and output, and handles prime powers.
27 In input, if the root is projective, say s/t (mod p) and t is
28 non-invertible (mod p), then we expect R = p + (t/s mod p).
29 On output, if the root is projective, say u/v (mod p) and v is
30 non-invertible (mod p), then return value r = p + (v/u mod p).
31 So projective roots are stored as their reciprocal, and have p added
32 to signal the fact that it's a reciprocal value.
33 */
34
35
36 /*
37 * Algorithm by Franke and Kleinjung for lattice sieving of largish
38 * primes.
39 */
40
41 // Proposition 1 of [FrKl05]:
42 // Compute a basis <(a0, a1), (b0, b1)> of the p-lattice
43 // inside the q-lattice, such that
44 // a1, b1 > 0
45 // -I < a0 <= 0 <= b0 < I
46 // b0-a0 >= I
47 //
48 // Sizes:
49 // p is less than 32 bits and I fits easily in 32 bits.
50 // So, a0 and a1 fit easily in 32 bits, since they are less than I
51 // Now, b0 and b1 are also bounded by p, so 32 bits is enough
52 // However: a and c can be as large as p*I (not both ?).
53 // We still store them in 32 bits, since if they are larger, it means
54 // that as soon as they are added to the offset for S, the index will
55 // be out of range for S and the loop stops. Hence, this is safe to
56 // replace a and c by a large value within 32 bits, when they are
57 // larger than 32 bits.
58 // Except that the choice of this ``large value'' requires some
59 // caution. We need a value which can be used either for a or c, or
60 // both, so that adding a, or c, or both, to a value within [0,IJ[ is
61 // guaranteed to exceed IJ, but can't wrap around. Up to I=15, it's
62 // rather easy. With the rescaling of J, at worst we may have IJ
63 // within the range [2^29,2^30[. Thus if a and c are set to 2^30-1,
64 // a.k.a INT32_MAX/2, then adding either, or the sum of both, to a
65 // valid value x is guaranteed to be at most 3*2^30, which fits within
66 // 32 bits.
67 // For I=16, it's much, much harder. Unless somebody comes up with a
68 // nice idea, I see no way to avoid 64-bit arithmetic (which has some
69 // cost, sure, but not terribly expensive). For consistency, we make
70 // all data types for x, a, and c 64-bit in this case, and choose the
71 // overflow constants as UINT32_MAX.
72 //
73
// OBSOLETE: the large value is now UINT64_MAX>>2. So, plattice_x_t must
// be 64 bits. We could cope with logI up to 31 as long as logJ = logI - 1.
76
77
78 // Return value:
79 // * non-zero if everything worked ok
80 // * zero when the algorithm failed. This can happen when p is a prime power,
81 // and g, gcd(p,r) >= I, since then the subtractive Euclidean algorithm will
82 // yield (a0=g, b0=0) at some point --- or the converse --- and the loop
83 // while (|a0| >= I) a0 += b0 will loop forever.
84 //
85 // Note that on a c166 example, this code alone accounts for almost 20%
86 // of the computation time.
87
88
89 typedef uint64_t plattice_x_t;
90
91 struct plattice_info_t; // IWYU pragma: keep
92 static int reduce_plattice (plattice_info_t *, fbprime_t, fbroot_t, uint32_t);
93
94
95 /* Information on a p-lattice: lattice basis coordinates, and auxiliary
96 values that can be derived directly from them, such as the constants
97 used by the Franke-Kleinjung algorithm */
struct plattice_info_t {
    /* When sieving by congruence classes mod 2, the public accessors
     * return coordinates scaled by 2 (homothetic basis for lattice 2p). */
#if MOD2_CLASSES_BS
    static const int shift = 1;
#else
    static const int shift = 0;
#endif

    /* Basis <(a0, a1), (b0, b1)> satisfying Proposition 1 of [FrKl05]:
     *   a1, b1 > 0;  -I < a0 <= 0 <= b0 < I;  b0 - a0 >= I
     * (except in the special r == 0 cases handled by the constructor,
     * and after a reduction failure, where all four are set to 0). */
    int32_t a0;
    uint32_t a1;
    int32_t b0;
    uint32_t b1;

    /* Build the basis for factor base prime p with root r inside the
     * q-lattice. proj tells whether r encodes a projective root (stored
     * as p plus the reciprocal root, see the comment at the top of this
     * file). On failure of reduce_plattice (gcd(r, p) >= I), all four
     * coordinates are zeroed to signal an invalid basis. */
    plattice_info_t(const fbprime_t p, const fbroot_t r, const bool proj, const int logI) {
        /* placate compiler. Sure, we shouldn't need it really, but
         * admittedly the control flow through reduce_plattice is contrived.
         */
        a0 = a1 = b0 = b1 = 0;
        if (UNLIKELY(proj && r == 0)) {
            /* This lattice basis might work in principle, but it generates hits in
               all locations i=1, ..., I/2-1, j = 0, all of which are useless except
               i=1, j=0. */
            // Note: if J>p, we are missing some hits, here.
            a1 = p;
            a0 = -((int32_t)1 << logI) + 1;
            b1 = 0;
            b0 = 1;
        } else if (UNLIKELY(!proj && r == 0)) {
            a1 = ((int32_t)1 << logI) + 1;
            a0 = -p; /* Thus bound0 = p > I, inc_c is always added */
            b1 = 1;
            b0 = 0; /* Thus bound1 = I - b0 = I, inc_a is never added */
        } else {
            ASSERT_ALWAYS(!proj);
            /* One essential condition of proposition 1 in FrKl05 is that p >= I */
            ASSERT(p >> logI);
            int rc = reduce_plattice (this, p, r, 1U << logI);
            /* These are the constraints which reduce_plattice should meet */
            ASSERT(-(INT32_C(1) << logI) < a0);
            ASSERT(a0 <= 0);
            ASSERT(b0 >= 0);
            ASSERT(b0 < (INT32_C(1) << logI));
            ASSERT((b0-a0) >= (INT32_C(1) << logI));
            ASSERT(a1 > 0);
            ASSERT(b1 > 0);
            if (UNLIKELY(rc == 0)) {
                /* gcd(r, p) > I. We currently can't handle this case. Set everything
                   to zero to signal to calling code that this is not a valid basis.
                   Is there a better way? Exception, maybe? */
                a0 = a1 = b0 = b1 = 0;
            }
        }
    }

    /* Direct construction from the four raw coordinates (used e.g. when
     * unpacking the dense storage format below). No reduction is done. */
    plattice_info_t(int32_t aa0, uint32_t aa1, int32_t bb0, uint32_t bb1) {
        a0 = aa0;
        a1 = aa1;
        b0 = bb0;
        b1 = bb1;
    }

    /* Return the four coordinates, but multiplied by 2 if we use mod 2 classes */
    int32_t get_a0() const {return a0 << shift;}
    uint32_t get_a1() const {return a1 << shift;}
    int32_t get_b0() const {return b0 << shift;}
    uint32_t get_b1() const {return b1 << shift;}

    /* Threshold on i below which vector b (inc_c) must be added during
     * the FK walk; equals -a0 (a0 <= 0, so this is non-negative). */
    uint32_t get_bound0(const int logI MAYBE_UNUSED) const {
        return -get_a0();
    }

    /* Threshold on i at or above which vector a (inc_a) must be added
     * during the FK walk; equals I - b0. */
    uint32_t get_bound1(const int logI) const {
        return (1U << logI) - get_b0();
    }

    /* Offset of basis vector a in linearized x = (j << logI) + i
     * coordinates (negative a0 wraps via two's complement). */
    plattice_x_t get_inc_a(const int logI) const {
        return ((uint64_t)get_a1() << logI) + (int64_t)get_a0();
    }

    /* Offset of basis vector b in linearized x coordinates. */
    plattice_x_t get_inc_c(const int logI) const {
        return ((uint64_t)get_b1() << logI) + (int64_t)get_b0();
    }

    /* Determinant of the basis up to sign, computed mod 2^32. NOTE: the
     * caller in starting_point() negates this value again ("det()
     * returns the opposite of what we want") — keep the sign
     * conventions in sync with that code. */
    uint32_t det() const {return (-a0)*b1 + a1*b0;};
};
182
183 /* We have several versions of reduce_plattice. Which one is best is
184 * reportedly CPU-dependent, which I can imagine.
185 *
186 * I've run some benchmarks on a skylake i5 in 2017/05 with gcc-6.3.0.
187 * The C version below is almost on par with the unrolled version further
188 * down, but the latter still has a tiny edge nevertheless. We seem to be
189 * well within the margin of error at this point, so more tests would be
190 * useful.
191 *
192 * (which unrolling choice for the C code get the best performance is not
193 * measurable at this point).
194 */
#if 0
// Original version of reduce_plattice, always with a division for each step.
// This version is pretty fast on new powerful processors, but slow on others.
// I keep it because in a few years, if the int32_t division is faster, it's
// possible this code might be the fastest.
// (Currently disabled: the tuned version below is compiled instead.)

NOPROFILE_INLINE int
reduce_plattice (plattice_info_t *pli, const fbprime_t p, const fbroot_t r, uint32_t I)
{
    /* Start from the basis <(-p, 0), (r, 1)> and run a division-based
     * Euclidean reduction on the first coordinates, carrying the second
     * coordinates along. */
    int32_t a0 = -((int32_t) p), b0 = (int32_t) r, a1 = 0, b1 = 1, k;
#if MOD2_CLASSES_BS
    const int32_t hI = (int32_t) (I>>1);
#else
    const int32_t hI = (int32_t) I;
#endif
    const int32_t mhI = -hI;
    /* Reduce until |a0| < hI or b0 < hI. */
    while (LIKELY(b0 >= hI)) {
        k = a0 / b0; a0 %= b0; a1 -= k * b1;
        if (UNLIKELY(a0 > mhI)) break;
        k = b0 / a0; b0 %= a0; b1 -= k * a1;
        /* We may conceivably unroll a bit more, or a bit less, here. Just
         * tuck in as many copies of the following block as you wish. */
#if 1
        if (UNLIKELY(b0 < hI )) break;
        k = a0 / b0; a0 %= b0; a1 -= k * b1;
        if (UNLIKELY(a0 > mhI)) break;
        k = b0 / a0; b0 %= a0; b1 -= k * a1;
#endif
    }
    /* Final adjustment so that b0 - a0 >= hI while keeping
     * -hI < a0 <= 0 <= b0 < hI. */
    k = b0 - hI - a0;
    if (b0 > -a0) {
        /* a0 == 0 here means gcd(r, p) >= I: report failure. */
        if (UNLIKELY(!a0)) return 0;
        k /= a0; b0 -= k * a0; b1 -= k * a1;
    } else {
        if (UNLIKELY(!b0)) return 0;
        k /= b0; a0 += k * b0; a1 += k * b1;
    }
    pli->a0 = (int32_t) a0; pli->a1 = (uint32_t) a1; pli->b0 = (int32_t) b0; pli->b1 = (uint32_t) b1;
    return 1;
}
235 #else
// The really fastest version of reduce_plattice on processors >= Intel
// Nehalem & AMD Opteron.
// The C version is for other architectures; it is almost as fast (99%)
// as the asm version on x86 with a GOOD C compiler (tested with gcc
// 4.6.X, 4.7.X, 4.8.X).
// Avoid modifying this code...
// This version has been tuned over several weeks; more than 50
// versions of this routine have been tested over and over. A. Filbois.
244
// Main idea: to avoid a division & multiplication, I tried to test the
// sign of a0 + b0 * 4 (resp. b0 + a0 * 4). If this sign differs from
// the sign of a0 (resp. b0), the quotient of a0 / b0 is between 0 and 3,
// and needs (4-1)*2 conditional assignments.
// The best optimisation is the right choice of the number of
// conditional assignments. It is not exactly the same between Intel
// Nehalem & AMD Opteron, but this code is the best deal for both.
252 //
// NB: Be careful if you try to change the "4" value. You have to
// change the constant guards 0xe6666667 = -0x7FFFFFFF/(4+1) & 0x19999999
// = 0x7fffffff / (4+1).
// These guards are needed because the conditional assignments rely on a
// sign change; this sign may change at most once, not twice (with an
// overflow). In fact, these guards are 99.99999% useless.
259
/* Compute the Franke-Kleinjung adapted basis for (p, r): starting from
 * <(-p, 0), (r, 1)>, reduce until
 *     -I < a0 <= 0 <= b0 < I   and   b0 - a0 >= I,
 * with a1, b1 > 0, and store the result in *pli.
 * Returns 1 on success, 0 when gcd(r, p) >= I (reduction impossible).
 * Heavily tuned code (see comments above) — avoid modifying. */
NOPROFILE_INLINE int
reduce_plattice (plattice_info_t *pli, const fbprime_t p, const fbroot_t r, const uint32_t I)
{
#if MOD2_CLASSES_BS
    /* With mod-2 congruence classes, the adapted basis is computed for a
     * band of total width I/2 only. */
    const int32_t hI = (int32_t) (I >> 1);
#else
    const int32_t hI = (int32_t) I;
#endif
    int64_t a0 = - (int64_t) p, b0 = (int64_t) r, a1, b1;

    /* Mac OS X 10.8 embarks a version of llvm which crashes on the code
     * below (could be that the constraints are exerting too much of the
     * compiler's behaviour).
     *
     * See tracker #16540
     */
#if 0 // def HAVE_GCC_STYLE_AMD64_INLINE_ASM
#if defined(__APPLE_CC__) && defined(__llvm__)
#if __APPLE_CC__ == 5621
#define AVOID_ASM_REDUCE_PLATTICE
#endif
#endif
#else
/* NOTE(review): the "#if 0" above currently disables the asm path
 * unconditionally; only the C fallback below is ever compiled. */
#define AVOID_ASM_REDUCE_PLATTICE
#endif

#ifndef AVOID_ASM_REDUCE_PLATTICE
/* RPA/RPB: one unrolled reduction step on (a0, a1) (resp. (b0, b1));
 * jumps to LABEL (RPC/RPD, full division) when the quotient exceeds 3. */
#define RPA(LABEL)      \
    "addl %2, %0\n"     \
    "leal (%0,%2,4), %%edx\n" \
    "addl %3, %1\n"     \
    "testl %%edx, %%edx\n" \
    "movl %0, %%eax\n"  \
    "jng " LABEL "\n"   \
    "addl %2, %%eax\n"  \
    "leal (%1,%3,1), %%edx\n" \
    "cmovngl %%eax, %0\n" \
    "cmovngl %%edx, %1\n" \
    "addl %3, %%edx\n"  \
    "addl %2, %%eax\n"  \
    "cmovngl %%edx, %1\n" \
    "cmovngl %%eax, %0\n" \
    "addl %3, %%edx\n"  \
    "addl %2, %%eax\n"  \
    "cmovngl %%edx, %1\n" \
    "cmovngl %%eax, %0\n"
#define RPB(LABEL)      \
    "addl %0, %2\n"     \
    "leal (%2,%0,4), %%edx\n" \
    "addl %1, %3\n"     \
    "testl %%edx, %%edx\n" \
    "movl %2, %%eax\n"  \
    "jns " LABEL "\n"   \
    "addl %0, %%eax\n"  \
    "leal (%1,%3,1), %%edx\n" \
    "cmovnsl %%eax, %2\n" \
    "cmovnsl %%edx, %3\n" \
    "addl %1, %%edx\n"  \
    "addl %0, %%eax\n"  \
    "cmovnsl %%edx, %3\n" \
    "cmovnsl %%eax, %2\n" \
    "addl %1, %%edx\n"  \
    "addl %0, %%eax\n"  \
    "cmovnsl %%edx, %3\n" \
    "cmovnsl %%eax, %2\n"
#define RPC             \
    "cltd\n"            \
    "idivl %2\n"        \
    "imull %3, %%eax\n" \
    "movl %%edx, %0\n"  \
    "subl %%eax, %1\n"
#define RPD "cltd\n"    \
    "idivl %0\n"        \
    "imull %1, %%eax\n" \
    "movl %%edx, %2\n"  \
    "subl %%eax, %3\n"

    int32_t mhI;
    __asm__ __volatile__ (
        "xorl %1, %1\n"
        "cmpl %2, %5\n"
        "movl $0x1, %3\n"
        "jg 9f\n"
        "movl %5, %%eax\n"
        "negl %%eax\n"
        "movl %%eax, %4\n"
        "movl %0, %%eax\n"
        "cltd\n"
        "idivl %2\n"
        "subl %%eax, %1\n"
        "cmpl $0xe6666667, %%edx\n"
        "movl %%edx, %0\n"
        "jl 0f\n"
        ".balign 8\n"
        "1:\n"
        "cmpl %0, %4\n"
        "jl 9f\n"
        RPB("3f")
        "2:\n"
        "cmpl %2, %5\n"
        "jg 9f\n"
        RPA("4f")
        "jmp 1b\n"
        ".balign 8\n"
        "3:\n"
        RPD
        "jmp 2b\n"
        ".balign 8\n"
        "4:\n"
        RPC
        "jmp 1b\n"
        ".balign 8\n"
        "0:\n"
        "movl %2, %%eax\n"
        "cltd\n"
        "idivl %0\n"
        "imull %1, %%eax\n"
        "subl %%eax, %3\n"
        "cmpl $0x19999999, %%edx\n"
        "movl %%edx, %2\n"
        "jle 2b\n"
        "movl %0, %%eax\n"
        "cltd\n"
        "idivl %2\n"
        "imull %3, %%eax\n"
        "subl %%eax, %1\n"
        "cmpl $0xe6666667, %%edx\n"
        "movl %%edx, %0\n"
        "jge 1b\n"
        "jmp 0b\n"
        "9:\n"
        : "+&r"(a0), "=&r"(a1), "+&r"(b0), "=&r"(b1),
          "=&rm"(mhI) : "rm"(hI) : "%rax", "%rdx", "cc");
#else
/* C fallback. RPA/RPB: one reduction step by conditional additions when
 * the quotient is at most 3 (detected via the sign of a0 + 4*b0, resp.
 * b0 + 4*a0), otherwise a full division via RPC/RPD. */
#define RPA do {            \
    a0 += b0; a1 += b1;     \
    if (LIKELY(a0 + b0 * 4 > 0)) { \
      int64_t c0 = a0, c1 = a1; \
      c0 += b0; c1 += b1; if (LIKELY(c0 <= 0)) { a0 = c0; a1 = c1; } \
      c0 += b0; c1 += b1; if (LIKELY(c0 <= 0)) { a0 = c0; a1 = c1; } \
      c0 += b0; c1 += b1; if (LIKELY(c0 <= 0)) { a0 = c0; a1 = c1; } \
    } else                  \
      RPC;                  \
  } while (0)
#define RPB do {            \
    b0 += a0; b1 += a1;     \
    if (LIKELY(b0 + a0 * 4 < 0)) { \
      int64_t c0 = b0, c1 = b1; \
      c0 += a0; c1 += a1; if (LIKELY(c0 >= 0)) { b0 = c0; b1 = c1; } \
      c0 += a0; c1 += a1; if (LIKELY(c0 >= 0)) { b0 = c0; b1 = c1; } \
      c0 += a0; c1 += a1; if (LIKELY(c0 >= 0)) { b0 = c0; b1 = c1; } \
    } else                  \
      RPD;                  \
  } while (0)
#define RPC do {            \
    int64_t k = a0 / b0; a0 %= b0; a1 -= k * b1; \
  } while (0)
#define RPD do {            \
    int64_t k = b0 / a0; b0 %= a0; b1 -= k * a1; \
  } while (0)

    /* This code seems odd (looks after the a0 <= mhI loop),
       but gcc generates the fastest asm with it... */
    a1 = 0; b1 = 1;
    if (LIKELY(b0 >= hI)) {
        const int32_t mhI = -hI;
        RPC;
        /* Guarded pre-loop: keep the values away from the overflow
         * thresholds discussed in the comment above the function. */
        while (UNLIKELY(a0 < -0X7FFFFFFF / 5)) {
            RPD;
            if (UNLIKELY(b0 < 0X7FFFFFFF / 5)) goto p15;
            RPC;
        }
        /* Main reduction loop: alternate RPB / RPA until b0 < hI. */
        if (LIKELY(a0 <= mhI))
            do {
                RPB;
p15:
                if (UNLIKELY(b0 < hI)) break;
                RPA;
            } while (LIKELY(a0 <= mhI));
    }
#endif /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */

    /* Final adjustment so that b0 - a0 >= hI while keeping
     * -hI < a0 <= 0 <= b0 < hI. */
    int64_t k = b0 - hI - a0;
    if (b0 > -a0) {
        /* a0 == 0 here means gcd(r, p) >= I: report failure. */
        if (UNLIKELY(!a0)) return 0;
        k /= a0; b0 -= k * a0; b1 -= k * a1;
    } else {
        if (UNLIKELY(!b0)) return 0;
        k /= b0; a0 += k * b0; a1 += k * b1;
    }
    pli->a0 = (int32_t) a0; pli->a1 = (uint32_t) a1; pli->b0 = (int32_t) b0; pli->b1 = (uint32_t) b1;
    return 1;
}
453 #endif
454
455
456 /* Like plattice_info_t, but remembers the offset of the factor base entry
457 that generated each lattice basis. This offset becomes the "prime hint"
458 in the bucket updates. */
459 struct plattice_sieve_entry : public plattice_info_t {
460 slice_offset_t hint;
plattice_sieve_entryplattice_sieve_entry461 plattice_sieve_entry(const fbprime_t p, const fbroot_t r, const bool proj, const int logI, const slice_offset_t hint)
462 : plattice_info_t(p, r, proj, logI), hint(hint) {};
463 };
464
// Compute 1/x mod m.
// For m = 2, 3 or 6 every unit is its own inverse (the unit groups have
// exponent <= 2), so we return x unchanged — this preserves the historic
// fast path. For any other modulus we now implement the general case
// (previously an ASSERT_ALWAYS(0) dead end, which also made the function
// fall off the end without a return) via the extended Euclidean
// algorithm.
// Precondition: gcd(x, m) == 1. If x is not invertible, returns 0 so the
// caller can detect the (programming) error.
static inline uint32_t invmod(uint32_t x, uint32_t m) {
    if (m == 2 || m == 3 || m == 6) {
        /* Units mod 2: {1}; mod 3: {1, 2}; mod 6: {1, 5} — each is its
         * own inverse, so returning x is correct for invertible x. */
        return x;
    }
    /* Extended Euclid, maintaining the invariant a == u * x (mod m). */
    int64_t a = (int64_t)(x % m), b = (int64_t)m, u = 1, w = 0;
    while (b != 0) {
        int64_t q = a / b;
        int64_t t = a - q * b; a = b; b = t;
        t = u - q * w; u = w; w = t;
    }
    /* Here a == gcd(x, m); a != 1 means x is not invertible. */
    if (a != 1)
        return 0;
    if (u < 0)
        u += (int64_t)m;
    return (uint32_t)u;
}
477
478 /* This is (currently) common across all instantiations of
479 * plattice_enumerator. However if someday plattice_x_t changes type to
480 * vary depending on the level (which, in my reckoning, is now doable),
481 * then we should probably just move this function to the
482 * plattice_enumerator class anyway (or, equivalently, make it a template
483 * based on plattice_x_t).
484 */
struct plattice_enumerator_base {
    /* Compute the first point of the p-lattice to visit when sieving
     * the sublattice (i, j) == (i0, j0) mod m, in linearized
     * x = (jj << logI) + (ii + I/2) coordinates. Returns
     * UMAX(plattice_x_t) to signal "skip this prime" (degenerate or
     * saturated basis). */
    static plattice_x_t starting_point(const plattice_info_t &pli,
            const int logI, const sublat_t &sublat) {
        int64_t I = int64_t(1)<<logI;
        uint32_t m = sublat.m;
        uint32_t i0 = sublat.i0;
        uint32_t j0 = sublat.j0;

        // first FK vector a
        int64_t a0 = pli.get_a0();
        int64_t a1 = pli.get_a1();
        // second FK vector b
        int64_t b0 = pli.get_b0();
        int64_t b1 = pli.get_b1();

        // FIXME: should have a better understanding of those cases
        // (and not only in the sublat case, to be honest)
        // Right now, we deactivate them, by putting a starting point
        // higher than the limit.
        if ((b0 == 0) || (b1 == 0)) {
            return plattice_x_t(UMAX(plattice_x_t));
        }

        // If a1 or b1 reaches 20 bits, then it was saturated during the
        // dense storage. Let's skip this prime for which the FK basis is
        // very skewed. (only a few hits are missed)
        if ((a1 == ((1<<20)-1)) || (b1 == ((1<<20)-1))) {
            return plattice_x_t(UMAX(plattice_x_t));
        }

        // Look for alpha and beta such that
        //   alpha*a + beta*b == (i0,j0) mod m
        // This is a 2x2 system of determinant p, coprime to m.
        int64_t det = pli.det(); // det() returns the opposite of what we want
        det = (-det) % m;
        if (det < 0)
            det += m;
        det = invmod(det, m);
        /* Cramer's rule mod m for (alpha, beta). */
        int64_t al = ( b1*i0 - b0*j0) % m;
        int64_t be = (-a1*i0 + a0*j0) % m;
        al = (al*det) % m;
        be = (be*det) % m;
        if (al < 0)
            al += m;
        if (be < 0)
            be += m;

        // Now, compute this potential starting point:
        int64_t ii = (al*a0 + be*b0 - i0) / m; // exact divisions
        int64_t jj = (al*a1 + be*b1 - j0) / m;

        // But here, ii might be beyond the bounds. So, let's fix.
        // It should be enough to subtract one of the FK vectors.
        // Note that a is the vector with negative abscissa.
        if (ii < -I/2) {
            ASSERT(ii - a0 >= 0);
            ii -= a0;
            jj -= a1;
        } else if (ii > I/2-1) {
            ASSERT(ii - b0 <= 0);
            ii -= b0;
            jj -= b1;
        }
        ASSERT ((ii >= -I/2) && (ii < I/2));

        // But now, jj might be negative! So let's start the FK walk until we
        // go positive. (Since b0 - a0 >= I, at least one of the two
        // conditions below holds at each step, so the loop progresses.)
        while (jj < 0) {
            int64_t aux = ii;
            if (aux >= I/2 - b0) {
                ii += a0;
                jj += a1;
            }
            if (aux < -I/2 - a0) {
                ii += b0;
                jj += b1;
            }
        }

        // Now, (ii,jj) is the starting point we are looking for. Let's
        // convert it to the plattice_x_t type.
        plattice_x_t res = (ii+I/2) + (jj<<logI);
        return res;
    }
};
570
571 /* Class for enumerating lattice points with the Franke-Kleinjung algorithm */
/* Class for enumerating lattice points with the Franke-Kleinjung algorithm */
template<int LEVEL>
class plattice_enumerator : public plattice_enumerator_base {
protected:
    // Maybe at some point, the plattice_x_t type could be templated in
    // order to have it 32 bits for non-top levels.
    plattice_x_t inc_a, inc_c;   // linearized offsets of FK vectors a and b
    uint32_t bound0, bound1;     // i-thresholds: add inc_c if i < bound0,
                                 // add inc_a if i >= bound1
    slice_offset_t hint;         // fb-slice offset ("prime hint") for buckets
    plattice_x_t x;              // current position, x = (j << logI) + i

public:

    /* Per-(logI, J) constants shared by all enumerators of one region:
     * mask to extract i from x, mask detecting (i, j) both even, and
     * the end-of-region sentinel. */
    struct fence {
        uint32_t maskI;
        plattice_x_t even_mask;
        plattice_x_t end;
        fence(const int logI, int J) {
            maskI = (1U << logI) - 1U;
            even_mask = (plattice_x_t(1) << logI) | plattice_x_t(1);
            end = plattice_x_t(J) << logI;
        }
        /* Same, but clamp the end position to an external cap. */
        fence(const int logI, int J, plattice_x_t cap) : fence(logI, J) {
            if (end >= cap) end = cap;
        }
    };

    /* Set up the walk for one lattice basis; sublat.m != 0 selects the
     * sublattice starting point, otherwise start at (i, j) = (0, 0),
     * i.e. x = I/2. */
    plattice_enumerator(const plattice_info_t &basis,
            const slice_offset_t hint, const int logI, const sublat_t &sublat)
        : hint(hint)
    {
        inc_a = basis.get_inc_a(logI);
        inc_c = basis.get_inc_c(logI);
        bound0 = basis.get_bound0(logI);
        bound1 = basis.get_bound1(logI);
        if (!sublat.m)
            x = plattice_x_t(1) << (logI-1);
        else {
            x = plattice_enumerator_base::starting_point(basis, logI, sublat);
        }
    }

    /* Same, without sublattices: always start at x = I/2. */
    plattice_enumerator(const plattice_info_t &basis,
            const slice_offset_t hint, const int logI)
        : hint(hint)
    {
        inc_a = basis.get_inc_a(logI);
        inc_c = basis.get_inc_c(logI);
        bound0 = basis.get_bound0(logI);
        bound1 = basis.get_bound1(logI);
        x = plattice_x_t(1) << (logI-1);
    }


    plattice_enumerator(const plattice_enumerator&) = default;

    /* This function is quite critical: one FK step. Depending on the
     * current i, add vector a, vector b, or both (Prop. 1 of FrKl05
     * guarantees the next point is one of these three). */
    void next(fence const & F) {
        uint32_t i = x & F.maskI;
        if (i >= bound1)
            x += inc_a;
        if (i < bound0)
            x += inc_c;
    }

    /* Currently merely checks that not both are even */
    bool probably_coprime(fence const & F) const {return (x & F.even_mask) != 0;}

    /* True once the walk has left the sieve region [0, end). */
    inline bool done(fence const & F) { return x >= F.end; }
    /* Rebase x for the next region of the same size. */
    void advance_to_next_area(fence const & F) { x -= F.end; }

    inline void advance_to_end_of_projective_first_line(fence const & F)
    {
        /* This function is not critical at all. We want the last
         * matching position on the line (i,0). This depends on the
         * (b0,b1) vector, and works **ONLY** in the projective case, and
         * **ONLY** while we're on the first line !
         *
         * for projective non-powers, we should have (b0,b1)=(1,0),
         * inc_c=1, and bound1=I-1. However we might have something
         * different for projective powers. Presently, powers are not
         * bucket-sieved anyway, so there's little point in bothering.
         * (see "Shall we enable bucket-sieving for powers" in
         * las-fill-in-buckets.cpp)
         */
        x = F.maskI;
        ASSERT(inc_c == 1);
        ASSERT(bound1 == F.maskI);
    }
    plattice_x_t get_x() const {return x;}
    void set_x(plattice_x_t xx) {x = xx;}
    /* NOTE(review): bound1/inc_c are narrower than the declared return
     * type; the widening is value-preserving. */
    plattice_x_t get_bound1() const {return bound1;}
    plattice_x_t get_inc_c() const {return inc_c;}
    slice_offset_t get_hint() const {return hint;}
};
666
667 /* Also enumerates lattice points, but probably_coprime() does a full gcd()
668 to ensure that points are really coprime. Very slow. Not used, just there
669 for experimenting.*/
670 template<int LEVEL>
671 class plattice_enumerator_coprime : public plattice_enumerator<LEVEL> {
672 unsigned long u, v;
673 typedef plattice_enumerator<LEVEL> super;
674 typedef typename super::fence fence;
675 public:
plattice_enumerator_coprime(const plattice_info_t & basis,const slice_offset_t hint,const int logI,const sublat_t & sublat)676 plattice_enumerator_coprime(const plattice_info_t &basis,
677 const slice_offset_t hint, const int logI, const sublat_t &sublat)
678 : plattice_enumerator<LEVEL>(basis, hint, logI, sublat), u(0), v(0) {}
next(fence const & F)679 void next(fence const & F) {
680 uint32_t i = super::x & F.maskI;
681 if (i >= super::bound1) { super::x += super::inc_a; u++; }
682 if (i < super::bound0) { super::x += super::inc_c; v++; }
683 }
probably_coprime(typename plattice_enumerator<LEVEL>::fence const & F) const684 bool probably_coprime(typename plattice_enumerator<LEVEL>::fence const & F) const {
685 return (super::x & F.even_mask) != 0 && gcd_ul(u, v) == 1;
686 }
687 };
688
689 template<int LEVEL>
690 class plattices_vector_t:
691 public std::vector<plattice_enumerator<LEVEL>> {
692 /* The index here is the global index, across all fb parts */
693 slice_index_t index = -1;
694 double weight = 0;
695 public:
696 plattices_vector_t() = default;
plattices_vector_t(slice_index_t index,double weight)697 plattices_vector_t(slice_index_t index, double weight) : index(index), weight(weight) {}
698 /* returns a global index */
get_index() const699 slice_index_t get_index() const {return index;};
get_weight() const700 slice_index_t get_weight() const {return weight;};
701 };
702
703 /* Dense version of plattice_info_t and friends for long-term storage in
704 * sublat mode. */
705
706 /* This can now be a template. We don't use it, so far.
707 */
/* This can now be a template. We don't use it, so far.
 */
template<int /* LEVEL */>
struct plattice_info_dense_t {
    uint32_t pack[3];
    // This pack of 96 bits is enough to contain
    //     minus_a0, b0, a1, b1
    // as 20-bit unsigned integers and
    //     hint
    // as a 16-bit integer.
    //
    // Bit layout:
    //   pack[0]: bits 0-19 minus_a0, bits 20-31 low 12 bits of b0
    //   pack[1]: bits 0-7 high 8 bits of b0, bits 8-27 a1,
    //            bits 28-31 low 4 bits of b1
    //   pack[2]: bits 0-15 high 16 bits of b1, bits 16-31 hint
    //
    // Note that minus_a0 and b0 are less than I, so this is ok, but
    // a1 and b1 could be larger. However, this is for very skewed
    // plattices, and we lose only a few hits by skipping those primes.
    // So we saturate them at 2^20-1 for later detection.
    //
    // uint16_t hint; // FIXME: this could be recovered for free...

    /* Pack a (reduced) basis and its hint into the 96-bit form. The two
     * degenerate r == 0 bases are stored with minus_a0 saturated so
     * that unpack() can recognize and rebuild them. */
    plattice_info_dense_t(const plattice_info_t & pli, uint16_t _hint) {
        uint32_t minus_a0;
        uint32_t b0;
        uint32_t a1;
        uint32_t b1;
        uint16_t hint;
        hint = _hint;
        // Handle orthogonal lattices (proj and r=0 cases)
        if (pli.b0 == 1 && pli.b1 == 0) {
            b0 = 1;
            b1 = 0;
            minus_a0 = UMAX(uint32_t);
            a1 = pli.a1;
        } else if (pli.b0 == 0 && pli.b1 == 1) {
            b0 = 0;
            b1 = 1;
            minus_a0 = UMAX(uint32_t);
            a1 = pli.a1;
        } else {
            // generic case: true FK-basis
            ASSERT(pli.b0 >= 0);
            ASSERT(pli.a0 <= 0);
            minus_a0 = -pli.a0;
            a1 = pli.a1;
            b0 = pli.b0;
            b1 = pli.b1;
        }
        uint32_t mask8 = (1<<8)-1;
        uint32_t mask16 = (1<<16)-1;
        uint32_t mask20 = (1<<20)-1;

        // Saturate skewed lattices, for later detection and skipping.
        if (a1 > mask20)
            a1 = mask20;
        if (b1 > mask20)
            b1 = mask20;

        pack[0] = (minus_a0 & mask20) | (b0 << 20);
        pack[1] = ((b0 >> 12) & mask8 ) | ((a1 & mask20) << 8) | (b1 << 28);
        pack[2] = ((b1 >> 4) & mask16) | (hint << 16);
    }

    /* Rebuild a plattice_info_t from the packed form (inverse of the
     * constructor above, up to the 20-bit saturation). The a0
     * coordinate of the two orthogonal bases is reconstructed from
     * logI, matching what plattice_info_t's constructor produces. */
    plattice_info_t unpack(const int logI) const {
        uint32_t minus_a0;
        uint32_t b0;
        uint32_t a1;
        uint32_t b1;

        uint32_t mask8 = (1<<8)-1;
        uint32_t mask16 = (1<<16)-1;
        uint32_t mask20 = (1<<20)-1;
        minus_a0 = pack[0] & mask20;
        b0 = (pack[0] >> 20) | ((pack[1] & mask8) << 12);
        a1 = (pack[1] >> 8) & mask20;
        b1 = (pack[1] >> 28) | ((pack[2] & mask16) << 4);

        plattice_info_t pli(-int32_t(minus_a0), uint32_t(a1),
                int32_t(b0), uint32_t(b1));
        // Orthogonal bases
        if (pli.b0 == 1 && pli.b1 == 0) {
            pli.a0 = -((int32_t)1 << logI) + 1;
        } else if (pli.b0 == 0 && pli.b1 == 1) {
            pli.a1 = ((int32_t)1 << logI) + 1;
        }
        return pli;
    }

    /* Extract the 16-bit hint from the top of pack[2]. */
    uint16_t get_hint() const {
        return pack[2] >> 16;
    }

};
796
/* A vector of densely-stored plattice bases (one fb slice's worth),
 * kept across sublat congruence classes. Move-only, see below. */
template<int LEVEL>
class plattices_dense_vector_t:
        public std::vector<plattice_info_dense_t<LEVEL>> {
public:
#if !GNUC_VERSION(4,7,2)
    /* Apparently there's a bug in gcc 4.7.2, which does not recognize
     * that a vector of plattices_dense_vector_t is ok with a move ctor
     * and no copy ctor. I'm not entirely sure why, and I don't really
     * want to investigate. Theoretically, defining ctors as below should
     * be fine.
     */
    plattices_dense_vector_t(plattices_dense_vector_t const&) = delete;
    plattices_dense_vector_t(plattices_dense_vector_t&&) = default;
    plattices_dense_vector_t& operator=(plattices_dense_vector_t&&) = default;
    plattices_dense_vector_t() = default;
#endif
};
814
815 // std::vector<plattices_dense_vector_t<LEVEL>> is for remembering the FK
816 // basis in sublat mode, between two different congruences of (i,j) mod
817 // m. For simplicity, we remember them only for the toplevel.
818 template<int LEVEL>
819 struct precomp_plattice_dense_t {
820 typedef std::vector<plattices_dense_vector_t<LEVEL>> type;
821 };
822
823
#if 0
/* MOD2_CLASSES_BS was an attempt, at some point, to support bucket
 * sieving in several passes for congruence classes mod 2. It was never
 * finished, and has never been tested. The last traces of that code may
 * be found in the git history perhaps. To date, the only remaining bit
 * that can still be considered interesting is the comment fragment below
 * in plattice_starting_vector.
 * (Dead code: this whole region is disabled by the "#if 0" above.)
 */

/* This is for working with congruence classes only */
NOPROFILE_INLINE
plattice_x_t plattice_starting_vector(const plattice_info_t * pli, sieve_info_srcptr si, unsigned int par )
{
    /* With MOD2_CLASSES_BS set up, we have computed by the function
     * above an adapted basis for the band of total width I/2 (thus from
     * -I/4 to I/4). This adapted basis is in the coefficients a0 a1 b0
     * b1 of the pli data structure.
     *
     * Now as per Proposition 1 of FrKl05 applied to I/2, any vector
     * whose i-coordinates are within ]-I/2,I/2[ (<ugly>We would like a
     * closed interval on the left. Read further on for that case</ugly>)
     * can actually be written as a combination with positive integer
     * coefficients of these basis vectors a and b.
     *
     * We also know that the basis (a,b) has determinant p, thus odd. The
     * congruence class mod 2 that we want to reach is thus accessible.
     * It is even possible to find coefficients (k,l) in {0,1} such that
     * ka+lb is within this congruence class. This means that we're going
     * to consider either a,b,or a+b as a starting vector. The
     * i-coordinates of these, as per the properties of Proposition 1, are
     * within ]-I/2,I/2[. Now all other vectors with i-coordinates in
     * ]-I/2,I/2[ which also belong to the same congruence class, can be
     * written as (2k'+k)a+(2l'+l)b, with k' and l' necessarily
     * nonnegative.
     *
     * The last ingredient is that (2a, 2b) forms an adapted basis for
     * the band of width I with respect to the lattice 2p. It's just a
     * homothetic transformation.
     *
     * To find (k,l), we proceed like this. First look at the (a,b)
     * matrix mod 2:
     *                 a0&1 a1&1
     *                 b0&1 b1&1
     * Its determinant is odd, thus the inverse mod 2 is:
     *                 b1&1 a1&1
     *                 b0&1 a0&1
     * Now the congruence class is given by the parity argument. The
     * vector is:
     *                par&1, par>>1
     * Multiplying this vector by the inverse matrix above, we obtain the
     * coordinates k,l, which are:
     *                k = (b1&par&1)^(b0&(par>>1));
     *                l = (a1&par&1)^(a0&(par>>1));
     * Now our starting vector is ka+lb. Instead of multiplying by k and
     * l with values in {0,1}, we mask with -k and -l, which both are
     * either all zeroes or all ones in binary
     *
     */
    /* Now for the extra nightmare. Above, we do not have the guarantee
     * that a vector whose i-coordinate is precisely -I/2 has positive
     * coefficients in our favorite basis. It's annoying, because it may
     * well be that if such a vector also has positive j-coordinate, then
     * it is in fact the first vector we will meet. An example is given
     * by the following data:
     *
            f:=Polynomial(StringToIntegerSequence("
                -1286837891385482936880099433527136908899552
                55685111236629140623629786639929578
                13214494134209131997428776417
                -319664171270205889372
                -17633182261156
                40500"));

            q:=165017009; rho:=112690811;
            a0:=52326198; b0:=-1; a1:=60364613; b1:=2;
            lI:=13; I:=8192; J:=5088;
            p:=75583; r0:=54375;

            > M;
            [-2241    19]
            [ 1855    18]
            > M[1]-M[2];
            (-4096     1)

     * Clearly, above, for the congruence class (0,1), we must start with
     * this vector, not with the sum.
     */
    int32_t a0 = pli->a0;
    int32_t a1 = pli->a1;
    int32_t b0 = pli->b0;
    int32_t b1 = pli->b1;

    /* (k, l) in {-1 (all-ones mask), 0} select which of a, b enter the
     * starting vector, as derived in the comment above. */
    int k = -((b1&par&1)^(b0&(par>>1)));
    int l = -((a1&par&1)^(a0&(par>>1)));
    int32_t v[2]= { (a0&k)+(b0&l), (a1&k)+(b1&l)};

    /* handle exceptional case as described above */
    if (k && l && a0-b0 == -(1 << (si->conf->logI-1)) && a1 > b1) {
        v[0] = a0-b0;
        v[1] = a1-b1;
    }
    /* Linearize: x = (j << logI) | (i + I/2). */
    return (v[1] << si->conf->logI) | (v[0] + (1 << (si->conf->logI-1)));
}
#endif
927
928 /* All of these are defined in las-plattice.cpp */
929 extern template class plattice_enumerator<1>;
930 extern template class plattice_enumerator_coprime<1>;
931 extern template class plattices_vector_t<1>;
932 extern template struct plattice_info_dense_t<1>;
933 extern template class plattices_dense_vector_t<1>;
934 extern template struct precomp_plattice_dense_t<1>;
935 extern template class plattice_enumerator<2>;
936 extern template class plattice_enumerator_coprime<2>;
937 extern template class plattices_vector_t<2>;
938 extern template struct plattice_info_dense_t<2>;
939 extern template class plattices_dense_vector_t<2>;
940 extern template struct precomp_plattice_dense_t<2>;
941 extern template class plattice_enumerator<3>;
942 extern template class plattice_enumerator_coprime<3>;
943 extern template class plattices_vector_t<3>;
944 extern template struct plattice_info_dense_t<3>;
945 extern template class plattices_dense_vector_t<3>;
946 extern template struct precomp_plattice_dense_t<3>;
947
948
949
950 #endif
951