#include "cado.h"
// IWYU pragma: no_include <ext/alloc_traits.h>
// IWYU pragma: no_include <sys/param.h>
#include <limits.h> // for UINT_MAX
#include <stdlib.h> // for exit, EXIT_FAILURE
#include <algorithm> // for max, sort
#include <stdexcept> // for runtime_error
#include <type_traits> // for integral_constant<>::value
#include <utility> // for move
#include <sys/types.h> // ssize_t
#include <sys/stat.h> // stat fstat
#ifdef SELECT_MPFQ_LAYER_u64k1
#include <cstring> // memset
#include <gmp.h> // mpn_lshift
#include "misc.h" // bit_reverse
#endif
#include "fmt/core.h"
#include "fmt/format.h" // IWYU pragma: keep
#include "lingen_average_matsize.hpp"
#include "lingen_bmstatus.hpp"
#include "lingen_io_matpoly.hpp"
#include "lingen_io_wrappers.hpp"
#include "macros.h" // MIN
#include "omp_proxy.h"
#include "timing.h"

#pragma GCC diagnostic ignored "-Wunused-parameter"

/* This one is currently in lingen_io_matpoly.cpp, but that file will go
 * away eventually */

extern unsigned int io_matpoly_block_size;

constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1;
constexpr const unsigned int splitwidth = matpoly::over_gf2 ? 64 : 1;

/* Return the rank of the calling process in MPI_COMM_WORLD. */
static int mpi_rank()
{
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    return rank;
}

/* A note on MPI usage of these routines.
 *
 * All data is expected to follow these rules
 *
 *  - the matpoly arguments to {read_to,write_from}_matpoly are always
 *    expected to be significant only at the root. What happens to this
 *    data on non-zero ranks is largely unspecified, beyond the fact that
 *    collective calls to the routines should work and not have divergent
 *    behaviour. In practice we strive to just never access these
 *    arguments at non-zero ranks. (the same holds for the caches that
 *    are embedded in the i/o wrappers, with the exception that their
 *    limits cache_k0 and cache_k1 are consistent at all ranks).
 *
 *  - the data embedded in the i/o wrappers is either meaningful only at
 *    the root (in which case the {read_to,write_from} calls will do
 *    close to nothing), or is of collective type, meaning that
 *    {read_to,write_from} will engage in collective operations.
 *
 *  - all calls are implicit barriers, and their returned value is
 *    identical at all ranks.
 */
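/* Purely as an illustration of the conventions above (this snippet is
 * not part of the lingen code path; `wrapper` stands for any
 * lingen_input_wrapper_base instance and `len` for the chosen window):
 * a collective read is entered at all ranks, the matpoly argument only
 * matters at the root, and the returned count is the same everywhere.
 *
 *      matpoly tmp(ab, nrows, ncols, 0);   // contents matter at rank 0 only
 *      if (!mpi_rank()) tmp.zero_pad(len);
 *      ssize_t nread = wrapper.read_to_matpoly(tmp, 0, len);
 *      // nread is identical at all ranks
 */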

/* XXX It seems likely that lingen_average_matsize.?pp will go away
 */
double lingen_io_wrapper_base::average_matsize() const
{
    return ::average_matsize(ab, nrows, ncols, false);
}

unsigned int lingen_io_wrapper_base::preferred_window() const
{
    unsigned int nmats = iceildiv(io_matpoly_block_size, average_matsize());
    unsigned int nmats_pad = simd * iceildiv(nmats, simd);
    return nmats_pad;
}

double lingen_file_input::average_matsize() const
{
    return ::average_matsize(ab, nrows, ncols, ascii);
}

unsigned int lingen_file_input::preferred_window() const
{
    unsigned int nmats = iceildiv(io_matpoly_block_size, average_matsize());
    unsigned int nmats_pad = simd * iceildiv(nmats, simd);
#ifdef HAVE_OPENMP
    /* This might be a bit large */
    nmats_pad *= omp_get_max_threads() * omp_get_max_threads() / 4;
#if ULONG_BITS == 32
    /* limit to 256MB */
    for( ; ((size_t) nmats_pad) * average_matsize() > (1UL << 28) ; nmats_pad /= 2);
#else
    /* limit to 4GB */
    for( ; ((size_t) nmats_pad) * average_matsize() > (1UL << 32) ; nmats_pad /= 2);
#endif
#endif
    return nmats_pad;
}
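/* Worked example for preferred_window(), with made-up numbers: if
 * io_matpoly_block_size is 1 MB and average_matsize() is 512 bytes,
 * then nmats = 2048, already a multiple of simd = 64 over GF(2), so
 * nmats_pad = 2048.  With 8 OpenMP threads the factor 8*8/4 = 16
 * raises this to 32768 matrices (~16 MB of cached data), well below
 * the 4GB (resp. 256MB on 32-bit) cap, so the final halving loop does
 * not trigger in this case.
 */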

void lingen_file_input::open_file()
{
    if (mpi_rank()) return;
    f = fopen(filename.c_str(), ascii ? "r" : "rb");
    DIE_ERRNO_DIAG(!f, "open(%s)", filename.c_str());
}

void lingen_file_input::close_file()
{
    if (mpi_rank()) return;
    fclose(f);
}

size_t lingen_file_input::guessed_length() const
{
    unsigned long guess;

    if (length_hint) return length_hint;

    if (mpi_rank() == 0) {
        struct stat sbuf[1];
        int rc = fstat(fileno(f), sbuf);
        if (rc < 0) {
            perror(filename.c_str());
            exit(EXIT_FAILURE);
        }

        size_t filesize = sbuf->st_size;

        size_t avg = average_matsize();

        if (!ascii) {
            if (filesize % avg) {
                fprintf(stderr, "File %s has %zu bytes, while its size"
                        " should be a multiple of %zu bytes "
                        "(assuming binary input; "
                        "perhaps --ascii is missing ?).\n",
                        filename.c_str(), filesize, avg);
                exit(EXIT_FAILURE);
            }
            guess = filesize / avg;
        } else {
            double expected_length = (double) filesize / avg;
            printf("# Expect roughly %.2f items in the sequence.\n", expected_length);

            /* First coefficient is always lighter, so we add a +1. */
            guess = 1 + expected_length;
        }
    }
    MPI_Bcast(&guess, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);
    return guess;
}

ssize_t lingen_file_input::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
{
    /* This reads k1-k0 fresh coefficients from f into dst */
    size_t ll;
    if (!mpi_rank())
        ll = matpoly_read(ab, f, dst, k0, k1, ascii, 0);
    MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
    return ll;
}

ssize_t lingen_random_input::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
{
    ssize_t nk = MIN(length, next_src_k + k1 - k0) - next_src_k;
    if (!mpi_rank())
        dst.fill_random(k0, k0 + nk, rstate);
    next_src_k += nk;
    return nk;
}

unsigned int lingen_random_input::preferred_window() const
{
    unsigned int nmats = iceildiv(io_matpoly_block_size, average_matsize());
    unsigned int nmats_pad = simd * iceildiv(nmats, simd);
#ifdef HAVE_OPENMP
    /* This might be a bit large */
    nmats_pad *= omp_get_max_threads() * omp_get_max_threads() / 4;
#if ULONG_BITS != 32
    /* limit to 32GB */
    for( ; ((size_t) nmats_pad) * average_matsize() > (1UL << 35) ; nmats_pad /= 2);
#endif
#endif
    return nmats_pad;
}


/* pivots is a vector of length r <= M.n ;
 * M is a square matrix ;
 * coefficient (pivots[j],j) of M is -1.
 *
 * return true (and extend pivots) if column r of M is independent of the
 * previous columns.
 *
 * Only the coefficient of degree 0 is considered.
 */
static bool
reduce_column_mod_previous(matpoly& M, std::vector<unsigned int>& pivots)
{
    abelt tmp MAYBE_UNUSED;
    abinit(M.ab, &tmp);
    unsigned int r = pivots.size();
    for (unsigned int v = 0; v < r; v++) {
        unsigned int u = pivots[v];
        /* the v-th column of M is known to
         * kill coefficient u (more exactly, to have a -1 as u-th
         * coefficient, and zeroes for the other coefficients
         * referenced in the pivots[0] to pivots[v-1] indices).
         */
        /* add M[u,r]*column v of M to column r of M */
#ifdef SELECT_MPFQ_LAYER_u64k1
        if (abis_zero(M.ab, M.coeff(u, r, 0)))
            continue;
#endif
        for (unsigned int i = 0; i < M.m; i++) {
#ifndef SELECT_MPFQ_LAYER_u64k1
            abmul(M.ab, tmp, M.coeff(i, v, 0), M.coeff(u, r, 0));
            abadd(M.ab, M.coeff(i, r, 0), M.coeff(i, r, 0), tmp);
            /* don't touch coeff u,r right now, since we're still using
             * it !
             */
            if (i == u)
                continue;
#else
            /* here, we know that coeff (u,r) is one */
            M.coeff_accessor(i, r, 0) += M.coeff(i, v);
#endif
        }
#ifndef SELECT_MPFQ_LAYER_u64k1
        abset_zero(M.ab, M.coeff(u, r, 0));
#endif
    }
    unsigned int u = 0;
    for (; u < M.m; u++) {
        if (!abis_zero(M.ab, M.coeff(u, r, 0)))
            break;
    }
    abclear(M.ab, &tmp);
    if (u != M.m) {
        pivots.push_back(u);
        return true;
    } else {
        return false;
    }
}
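/* Small worked example of the elimination above (prime field branch,
 * M of size 2x2, purely illustrative): suppose pivots = {0}, so column
 * 0 of M has a -1 at row 0.  For the candidate column r = 1, the loop
 * adds M[0,1] * column 0 to column 1, which cancels its row-0 entry.
 * If some row u of the updated column 1 is nonzero, u is recorded as
 * the new pivot and the function returns true; otherwise column 1 is
 * dependent and the function returns false.
 */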

#ifndef SELECT_MPFQ_LAYER_u64k1
static void
normalize_column(matpoly& M, std::vector<unsigned int> const& pivots)
{
    abelt tmp MAYBE_UNUSED;
    abinit(M.ab, &tmp);
    unsigned int r = pivots.size() - 1;
    unsigned int u = pivots.back();
    /* Multiply the column so that the pivot becomes -1 */
    int rc = abinv(M.ab, tmp, M.coeff(u, r, 0));
    if (!rc) {
        fprintf(stderr, "Error, found a factor of the modulus: ");
        abfprint(M.ab, stderr, tmp);
        fprintf(stderr, "\n");
        exit(EXIT_FAILURE);
    }
    abneg(M.ab, tmp, tmp);
    for (unsigned int i = 0; i < M.m; i++) {
        abmul(M.ab, M.coeff(i, r, 0), M.coeff(i, r, 0), tmp);
    }
    abclear(M.ab, &tmp);
}
#endif

void
lingen_F0::share(int root, MPI_Comm comm)
{
    /* share the bw_dimensions structure, but I don't think it's even
     * remotely possible that nodes disagree on it.
     */
    MPI_Bcast(&nrhs, 1, MPI_UNSIGNED, root, comm);
    MPI_Bcast(&m, 1, MPI_UNSIGNED, root, comm);
    MPI_Bcast(&n, 1, MPI_UNSIGNED, root, comm);
    MPI_Bcast(&t0, 1, MPI_UNSIGNED, root, comm);
    static_assert(std::is_same<typename decltype(fdesc)::value_type,
                               std::array<unsigned int, 2>>::value,
                  "want unsigned ints");
    unsigned int fsize = fdesc.size();
    MPI_Bcast(&fsize, 1, MPI_UNSIGNED, root, comm);
    if (mpi_rank() != root)
        fdesc.assign(fsize, {{ 0, 0 }});
    if (fsize)
        MPI_Bcast(fdesc.data(), 2 * fsize, MPI_UNSIGNED, root, comm);
}

/* It's a bit tricky. F0 is the _reversal_ of what we get here. (with
 * respect to t0).
 */
std::tuple<unsigned int, unsigned int> lingen_F0::column_data_from_Aprime(unsigned int jE) const
{
    unsigned int kA, jA;
    if (jE < n) {
        jA = jE;
        kA = t0;
    } else {
        jA = fdesc[jE-n][1];
        kA = fdesc[jE-n][0];
    }
    /* This is pedantic, but oldish g++ has the ctor from init-list
     * marked "explicit"
     */
    std::tuple<unsigned int, unsigned int> res { kA, jA };
    return res;
}

std::tuple<unsigned int, unsigned int> lingen_F0::column_data_from_A(unsigned int jE) const
{
    unsigned int kA, jA;
    std::tie(kA, jA) = column_data_from_Aprime(jE);
    kA += jA >= nrhs;
    /* This is pedantic, but oldish g++ has the ctor from init-list
     * marked "explicit"
     */
    std::tuple<unsigned int, unsigned int> res { kA, jA };
    return res;
}
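/* Example of the two mappings above (values purely illustrative): with
 * n = 4, nrhs = 2 and t0 = 3, column jE = 1 < n of E reads
 * (kA, jA) = (3, 1) from A', and since jA < nrhs the same degree is
 * used in A.  Column jE = 5 uses fdesc[1]; if fdesc[1] = {2, 3}, then
 * (kA, jA) = (2, 3) in A', and since jA >= nrhs the degree is shifted
 * by one in A, giving (3, 3).
 */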

void lingen_E_from_A::refresh_cache_upto(unsigned int k)
{
    unsigned int next_k1 = simd * iceildiv(k, simd);

    ASSERT_ALWAYS(next_k1 >= cache_k1);

    if (next_k1 != cache_k1) {
        if (!mpi_rank())
            cache.zero_pad(next_k1);
        ssize_t nk = A.read_to_matpoly(cache, cache_k1, next_k1);
        if (nk < (ssize_t) (k - cache_k1)) {
            fprintf(stderr, "short read from A\n");
            printf("This amount of data is insufficient. "
                   "Cannot find %u independent cols within A\n",
                   m);
            exit(EXIT_FAILURE);
        }
        cache_k1 = next_k1;
    }
}

void
lingen_E_from_A::initial_read()
{
    /* This is called collectively, because _all_
     * {read_to,write_from}_matpoly calls are collective calls. */

    /* Our goal is to determine t0 and F0, and ultimately ensure that the
     * constant coefficient of E at t=t0 is a full-rank matrix.
     *
     * The initial constant coefficient of E is in fact the coefficient
     * of degree t0 of A'*F, where column j of A' is column j of A,
     * divided by X if j >= bm.d.nrhs.
     *
     * Therefore, for j >= bm.d.nrhs, the contribution to the constant
     * coefficient of E comes from the following data in A:
     *      for cols of A with j < bm.d.rhs:  coeffs 0 <= deg <= t0-1
     *      for cols of A with j >= bm.d.rhs: coeffs 1 <= deg <= t0
     *
     * We need m independent columns.
     *
     * When we inspect the k coefficients of A, we are, in effect,
     * considering k*n-(n-rhs) = (k-1)*n+rhs columns. Therefore we expect
     * at the very least that we need to go as far as
     *          (k-1)*n+rhs >= m
     *          k >= ceiling((m+n-rhs)/n)
     *
     * Note that these k coefficients of A mean k' coefficients of A',
     * where k' is either k or k-1. Namely, k' is k - (rhs == n)
     */
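    /* Illustrative instance of the bound above (numbers made up): with
     * m = n = 64 and rhs = 0, we need k >= ceiling((64+64-0)/64) = 2,
     * i.e. at least two coefficients of A before m independent columns
     * can possibly show up; since rhs != n, these are also k' = 2
     * coefficients of A'.
     */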

    share(0, MPI_COMM_WORLD);

    if (!mpi_rank())
        printf("Computing t0\n");

    /* We want to create a full rank m*m matrix M, by extracting columns
     * from the first coefficients of A */

    matpoly M(bw_dimensions::ab, m, m, 1);
    M.zero_pad(1);

    /* For each integer i between 0 and m-1, we have a column, picked
     * from column fdesc[i][1] of coeff fdesc[i][0] of A' which, once
     * reduced modulo the other ones, has coefficient at row
     * pivots[i] unequal to zero.
     */
    std::vector<unsigned int> pivots;

    for (t0 = 1; pivots.size() < m; t0++) {
        /* We are going to access coefficient k from A (perhaps
         * coefficients k-1 and k) */
        unsigned int k = t0 - (nrhs == n);

        /* k + 1 because we're going to access coeff of degree k, of
         * course.
         * k + 2 is because of the leading identity block in F0. To
         * form coefficient of degree t0, we'll need degree t0 in A',
         * which means up to degree t0 + (nrhs < n) = k + 1 in A
         */
        refresh_cache_upto(k + 2);

        for (unsigned int j = 0; pivots.size() < m && j < n; j++) {
            /* Extract a full column into M (column j, degree t0 or
             * t0-1 in A) */
            /* adjust the coefficient degree to take into account the
             * fact that for SM columns, we might in fact be
             * interested by the _previous_ coefficient */

            int inc;
            if (!mpi_rank()) {
                M.extract_column(pivots.size(), 0, cache, j, t0 - (j < nrhs));
                inc = reduce_column_mod_previous(M, pivots);
            }
            MPI_Bcast(&inc, 1, MPI_INT, 0, MPI_COMM_WORLD);
            if (inc && mpi_rank()) {
                // we only need to have pivots.size() right
                pivots.push_back(0);
            }

            if (!inc) {
                if (!mpi_rank())
                    printf(
                      "[X^%d] A, col %d does not increase rank (still %zu)\n",
                      t0 - (j < nrhs),
                      j,
                      pivots.size());

                /* we need at least m columns to get a starting matrix
                 * with full rank. Given that we have n columns per
                 * coefficient, this means at least m/n matrices.
                 */

                if ((t0-1) * n > m + 40) {
                    if (!mpi_rank())
                        printf("The choice of starting vectors was bad. "
                               "Cannot find %u independent cols within A\n",
                               m);
                    exit(EXIT_FAILURE);
                }
                continue;
            }

            /* Bingo, it's a new independent col. */

            fdesc.push_back( {{ t0 - 1, j }} );
#ifndef SELECT_MPFQ_LAYER_u64k1
            if (!mpi_rank())
                normalize_column(M, pivots);
#endif

            /*
            if (!mpi_rank())
                printf("[X^%d] A, col %d increases rank to %zu%s\n",
                       t0 - (j < nrhs),
                       j,
                       pivots.size(),
                       (j < nrhs) ? " (column not shifted because of the RHS)"
                                  : "");
            */

        }
        /* Do this before we increase t0 */
        if (pivots.size() == m)
            break;
    }

    if (!mpi_rank())
        printf("Found satisfactory init data for t0=%d\n", t0);

    share(0, MPI_COMM_WORLD);
}

void lingen_E_from_A::share(int root, MPI_Comm comm)
{
    lingen_F0::share(root, comm);
    MPI_Bcast(&cache_k0, 1, MPI_UNSIGNED, root, comm);
    MPI_Bcast(&cache_k1, 1, MPI_UNSIGNED, root, comm);
    if (!mpi_rank())
        cache.zero_pad(cache_k1 - cache_k0);
}

/* read k1-k0 new coefficients from src, starting at coefficient k0,
 * write them to the destination (which is embedded in the struct as a
 * reference), starting at coefficient next_dst_k
 */
template<>
ssize_t lingen_scatter<matpoly>::write_from_matpoly(matpoly const & src, unsigned int k0, unsigned int k1)
{
    long long nk;
    ASSERT_ALWAYS(k0 % simd == 0);
    ASSERT_ALWAYS(next_dst_k % simd == 0);
    if (!mpi_rank()) {
        nk = MIN(k1, src.get_size()) - k0;
        ASSERT_ALWAYS(k1 <= src.get_size());
        E.zero_pad(next_dst_k + nk);
        for(unsigned int i = 0 ; i < nrows ; i++) {
            for(unsigned int j = 0; j < ncols ; j++) {
                abdst_vec to = E.part_head(i, j, next_dst_k);
                absrc_vec from = src.part_head(i, j, k0);
                abvec_set(ab, to, from, simd * iceildiv(nk, simd));
            }
        }
    }
    MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
    next_dst_k += nk;
    return nk;
}

/* read k1-k0 new coefficients from src, starting at coefficient k0,
 * write them to the destination (which is embedded in the struct as a
 * reference), starting at coefficient next_dst_k
 */
template<>
ssize_t lingen_scatter<bigmatpoly>::write_from_matpoly(matpoly const & src, unsigned int k0, unsigned int k1)
{
    /* This one ***IS*** a collective call */
    long long nk;
    ASSERT_ALWAYS(k0 % simd == 0);
    ASSERT_ALWAYS(next_dst_k % simd == 0);
    if (!mpi_rank()) {
        nk = MIN(k1, src.get_size()) - k0;
        ASSERT_ALWAYS(k1 <= src.get_size());
    }
    MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
    E.zero_pad(next_dst_k + nk);
    E.scatter_mat_partial(src, k0, next_dst_k, nk);
    next_dst_k += nk;
    return nk;
}

/* read k1-k0 new coefficients from the source (which is embedded in the
 * struct as a reference), starting at coefficient next_src_k, and write
 * them to the destination, starting at coefficient k0.
 */
template<>
ssize_t lingen_gather<matpoly>::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
{
    long long nk;
    ASSERT_ALWAYS(k0 % simd == 0);
    ASSERT_ALWAYS(k1 % simd == 0);
    ASSERT_ALWAYS(next_src_k % simd == 0);
    if (!mpi_rank()) {
        nk = MIN(k1, dst.get_size()) - k0;
        nk = MIN(nk, (ssize_t) (pi.get_size() - next_src_k));
        ASSERT_ALWAYS(k1 <= dst.get_size());
        for(unsigned int i = 0 ; i < nrows ; i++) {
            for(unsigned int j = 0; j < ncols ; j++) {
                abdst_vec to = dst.part_head(i, j, k0);
                absrc_vec from = pi.part_head(i, j, next_src_k);
                abvec_set(ab, to, from, nk);
            }
        }
    }
    MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
    next_src_k += nk;
    return nk;
}

/* read k1-k0 new coefficients from the source (which is embedded in the
 * struct as a reference), starting at coefficient next_src_k, and write
 * them to the destination, starting at coefficient k0.
 */
template<>
ssize_t lingen_gather<bigmatpoly>::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
{
    /* This one ***IS*** a collective call */
    long long nk;
    ASSERT_ALWAYS(k0 % simd == 0);
    ASSERT_ALWAYS(next_src_k % simd == 0);
    ASSERT_ALWAYS(k1 % simd == 0);
    if (!mpi_rank()) {
        nk = MIN(k1, dst.get_size()) - k0;
        nk = MIN(nk, (ssize_t) (pi.get_size() - next_src_k));
        ASSERT_ALWAYS(k1 <= dst.get_size());
    }
    MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
    pi.gather_mat_partial(dst, k0, next_src_k, nk);
    next_src_k += nk;
    return nk;
}

/* read k1-k0 new coefficients from pi, starting at coefficient
 * next_src_k, and write them to the destination, starting at coefficient
 * k0.
 */
ssize_t reverse_matpoly_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1, matpoly const & pi, unsigned int & next_src_k)
{
    ASSERT_ALWAYS(!mpi_rank());
    ssize_t nk;
    nk = MIN(k1, dst.get_size()) - k0;
    nk = MIN(nk, (ssize_t) (pi.get_size() - next_src_k));
    ASSERT_ALWAYS(k0 % simd == 0);
    ASSERT_ALWAYS(k1 % simd == 0);
    ASSERT_ALWAYS(next_src_k % simd == 0);
    unsigned int d = pi.get_size();
    abdst_field ab = pi.ab;
#ifndef SELECT_MPFQ_LAYER_u64k1
    for(unsigned int i = 0 ; i < pi.nrows() ; i++) {
        for(unsigned int j = 0; j < pi.ncols() ; j++) {
            /* We'll read coefficients of degrees [d-k1, d-k0[, and
             * it might well be that d-k1<0
             */
            for(unsigned int k = k0 ; k < k0 + nk ; k++) {
                if (next_src_k + k < d + k0)
                    abset(ab,
                          dst.coeff(i, j, k),
                          pi.coeff(i, j, d-1-next_src_k-(k-k0)));
                else
                    abset_zero(ab, dst.coeff(i, j, k));
            }
        }
    }
#else
    size_t ntemp = (k1-k0) / simd + 2;
    unsigned int rk0, rk1;
    rk1 = next_src_k;
    rk1 = d >= rk1 ? d - rk1 : 0;
    rk0 = rk1 >= (unsigned int) nk ? rk1 - nk : 0;
    unsigned int n_rk = rk1 - rk0;
    unsigned int q_rk = rk0 / simd;
    unsigned int r_rk = rk0 % simd;
    unsigned int nq = iceildiv(n_rk + r_rk, simd);
    if (nq) {
        unsigned long * temp = new unsigned long[ntemp];
        for(unsigned int i = 0 ; i < pi.nrows() ; i++) {
            for(unsigned int j = 0; j < pi.ncols() ; j++) {
                /* We'll read coefficients of degrees [rk0, rk1[, and
                 * it might well be that rk0<0
                 */
                memset(temp, 0, ntemp * sizeof(unsigned long));

                ASSERT_ALWAYS(nq <= ntemp);
#define R(x, b) ((b)*iceildiv((x),(b)))
                abvec_set(ab, temp, pi.part(i, j) + q_rk, R(n_rk + r_rk, simd));
                if (rk1 % simd) mpn_lshift(temp, temp, nq, simd - (rk1 % simd));
                /* Now we only have to bit-reverse temp */
                bit_reverse(temp, temp, nq);
                abvec_set(ab, dst.part_head(i, j, k0), temp, R(nk, simd));
#undef R
            }
        }
        delete[] temp;
    }
#endif
    next_src_k += nk;
    return nk;
}
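/* Illustration of the indexing above (non-GF(2) branch, made-up sizes):
 * if pi has d = 5 coefficients p0..p4 and next_src_k = 0, a call with
 * k0 = 0, k1 = 4 writes dst[0..3] = p4, p3, p2, p1 and advances
 * next_src_k to 4; a subsequent call picks up at p0 and returns only
 * the single remaining coefficient.  The GF(2) branch produces the
 * same stream via word-level shifts and bit_reverse.
 */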

#if 0
shared_or_common_size<matpoly>::shared_or_common_size(matpoly const & pi)
{
    unsigned long s = pi.get_size();
    MPI_Bcast(&s, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);
    shared = s;
};
#endif

/* read k1-k0 new coefficients from the source (which is embedded in the
 * struct as a reference), starting at coefficient next_src_k, and write
 * them to the destination, starting at coefficient k0.
 */
template<>
ssize_t lingen_gather_reverse<matpoly>::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
{
    long long ll;
    if (!mpi_rank())
        ll = reverse_matpoly_to_matpoly(dst, k0, k1, pi, next_src_k);
    MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
    return ll;
}

/* read k1-k0 new coefficients from the source (which is embedded in the
 * struct as a reference), starting at coefficient next_src_k, and write
 * them to the destination, starting at coefficient k0.
 */
template<>
ssize_t lingen_gather_reverse<bigmatpoly>::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
{
    /* This one ***IS*** a collective call */
    long long nk;
    unsigned int nq;
    unsigned int offset;
    long long dk;

    ASSERT_ALWAYS(k0 % simd == 0);
    ASSERT_ALWAYS(k1 % simd == 0);

    if (!mpi_rank()) {
        nk = MIN(k1, dst.get_size()) - k0;
        nk = MIN(nk, (ssize_t) (pi.get_size() - next_src_k));
        ASSERT_ALWAYS(k1 <= dst.get_size());
        unsigned int d = pi.get_size();
        unsigned int rk1 = next_src_k;
        rk1 = d >= rk1 ? d - rk1 : 0;
        unsigned int rk0 = rk1 >= (unsigned int) nk ? rk1 - nk : 0;
        unsigned int n_rk = rk1 - rk0;
        unsigned int q_rk = rk0 / simd;
        unsigned int r_rk = rk0 % simd;
        nq = iceildiv(n_rk + r_rk, simd);
        offset = q_rk * simd;
    }
    MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
    MPI_Bcast(&nq, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
    MPI_Bcast(&offset, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);

    matpoly tpi(pi.ab, pi.nrows(), pi.ncols(), nq * simd);
    tpi.zero_pad(nq * simd);
    unsigned int zero = 0;
    pi.gather_mat_partial(tpi, zero, offset, nk);

    if (!mpi_rank())
        dk = reverse_matpoly_to_matpoly(dst, k0, k0 + nk, tpi, zero);

    MPI_Bcast(&dk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
    ASSERT_ALWAYS(dk == nk);
    next_src_k += dk;
    return nk;
}

matpoly lingen_F_from_PI::recompute_rhs()
{
    abdst_field ab = bw_dimensions::ab;
    /*
     * first compute the rhscontribs. Use that to decide on a renumbering
     * of the columns, because we'd like to get the "primary" solutions
     * first, in a sense. Those are the ones with fewer zeroes in the
     * rhscontrib part. So we would want that matrix to have its columns
     * sorted in decreasing weight order
     *
     * An alternative, possibly easier, is to have a function which
     * decides the solution ordering precisely based on the inspection of
     * this rhscoeffs matrix (?). But how should we spell that info when
     * we give it to mksol ??
     */

    matpoly rhs(ab, nrhs, n, 1);
    rhs.zero_pad(1);

    /* adding the contributions to the rhs matrix. This is formed of the
     * LEADING COEFFICIENT of the corresponding columns of pi.
     */
    for (unsigned int jF = 0; jF < n; jF++) {
        unsigned int jpi = sols[jF].j;
        for (unsigned int ipi = 0; ipi < m + n; ipi++) {
            unsigned int iF;
            unsigned int s;
            std::tie(iF, s) = get_shift_ij(ipi, jF);
            if (iF >= nrhs)
                continue;
            s--;
            /* Coefficient (iF, ipi) of F0 is x^kF (with 0<=kF<=t0).
             * Multiplied by coefficient (ipi, jpi) of pi, whose length
             * is <=delta[jpi]+1, this induces a contribution to
             * coefficient
             * (iF, jF) of F. (actually since the length of column jF is
             * at most delta[jpi]+1, the length of column jpi of pi is at
             * most delta[jpi]+1-kF)
             *
             * The following formula gives the contribution C of this
             * coefficient (iF, ipi) of F0 to coefficient delta[j] + k of
             * A' * F:
             *
             * C = \sum_{s=0}^{delta[j]} A'_{iF,s+k} F_{iF,jF,delta[j]-s}
             * (we understand F_{iF,jF,...} as representing the
             * contribution of the coeff (iF, ipi) of F0 to this)
             *
             * C = \sum_{s=0}^{delta[j]} A'_{iF,s+k} Frev_{ipi,jpi,s}
             *     with Frev = rev_{delta[j]}(F).
             *
             * Let F0rev = rev_{t0}(F0) ; F0rev_{iF, ipi} is x^{t0-kF}
             * pirev = rev_{G-1}(pi) with G large enough. (>=pi.get_size())
             *
             * We have Frev{*,j} = (F0rev*pirev)_{*,j} div (t0+G-1-delta[j])
             * Frev{*,j,s} = (F0rev*pirev)_{*,j,s+t0+G-1-delta[j]}
             *             = pirev_{*,j,s+G-1-delta[j]+kF}
             *
             * Let shift = G - 1 - delta[j]
             *
             * C = \sum_{s=0}^{delta[j]} A'_{iF,s+k} * pirev_{ipi,jpi,s+shift+kF}
             *
             * x^T*M^k*\sum_{s=0}^{delta[j]-kF} a_s c_s
             *
             * with a_s = M^s * Y_{iF}
             * and  c_s = [x^{shift + s + kF}](rev_{G}(pi))
             *      (with shift = G - 1 - delta[j])
             *
             * and rev_G(pi) is precisely what we have in our source, and
             * cached in cache.
             *
             * Now take this with s==0: we get the contribution of the
             * rhs.
             */

            /* add coefficient (ipi, jpi, s) of the reversed pi to
             * coefficient (iF, jF, 0) of rhs */
#ifndef SELECT_MPFQ_LAYER_u64k1
            absrc_elt src = cache.coeff(ipi, jpi, s);
            abdst_elt dst = rhs.coeff(iF, jF, 0);
            abadd(ab, dst, dst, src);
#else
            rhs.part(iF,jF)[0] ^= !abis_zero(ab, cache.coeff(ipi, jpi, s));
#endif
        }
    }
    return rhs;
}

void lingen_F_from_PI::reorder_solutions()
{
    abdst_field ab = bw_dimensions::ab;
    /* compute the score per solution */
    std::vector<std::array<unsigned int, 2>> sol_score;
    for (unsigned int jF = 0; jF < n; jF++) {
        unsigned int s = 0;
        for (unsigned int iF = 0; iF < nrhs; iF++)
            s += abis_zero(ab, rhs.coeff(iF, jF, 0));
        sol_score.push_back({{ s, jF }});
    }
    std::sort(sol_score.begin(), sol_score.end());
    if (nrhs && sol_score.size() && !mpi_rank()) {
        printf("Reordered solutions:\n");
        for (unsigned int i = 0; i < n; i++) {
            printf(" %u (col %u in pi, weight %u on rhs vectors)\n",
                   sol_score[i][1],
                   sols[sol_score[i][1]].j,
                   nrhs-sol_score[i][0]);
        }
    }
    {
        std::vector<sol_desc> sols2;
        for(auto const & s : sol_score) {
            sols2.push_back(sols[s[1]]);
        }
        sols = std::move(sols2);
    }
}

std::tuple<unsigned int, unsigned int> lingen_F_from_PI::get_shift_ij(unsigned int ipi, unsigned int jF) const
{
    // unsigned int jpi = sols[jF].j;
    unsigned int shift = sols[jF].shift;
    unsigned int iF, kF;
    std::tie(kF, iF) = column_data_from_Aprime(ipi);
    kF = t0 - kF;
    unsigned int s0 = shift + kF + (iF < nrhs);

    /* This is pedantic, but oldish g++ has the ctor from init-list
     * marked "explicit"
     */
    std::tuple<unsigned int, unsigned int> res { iF, s0 };
    return res;
}

lingen_F_from_PI::lingen_F_from_PI(bmstatus const & bm,
                                   lingen_input_wrapper_base& pi, lingen_F0 const& F0)
    : lingen_F0(F0)
    , lingen_input_wrapper_base(pi.ab, n, n)
    , pi(pi)
    , cache(pi.ab, pi.nrows, pi.ncols, 0)
    , tail(pi.ab, nrows, ncols, 0)
    , rhs(pi.ab, nrhs, n, 1)
{
    /* The first n columns of F0 are the identity matrix, so that for (0
     * <= j < n),
     * column j of E is actually X^{-t0-e}*column j=jA of A. Notice that we
     * have in this case column_data_from_A(j) = { t0+e, j }. Here, e is
     * (j >= bm.d.nrhs)
     *
     * The last m columns, namely for (n <= j < m+n), are as follows: if
     *      std::tie(kA, jA) = column_data_from_A(j-n);
     * Then the entry at position (jA, j) is x^(t0-kA), so that
     * column j of E is actually X^{-kA}*column jA of A.
     *
     * As for F and pi, we have F = F0*pi so that
     *
     * A' * F0 * pi = G
     *
     * column j is such that max(1+deg((F0*pi)_j), G_j) <= delta[j]
     *
     * Each column therefore uses a combination of data from various
     * columns of A', which means in turn various columns of A.
     *
     * We want to separate the resulting expressions between the "rhs
     * contributions" (coefficients that appear as multipliers for the
     * right-hand-side columns, which are coefficients of degree 0 in A),
     * and the others (which all appear to some non-zero degree).
     *
     * Recall that each column of F0 has one single coefficient.
     * Therefore a coefficient of pi contributes to only one coefficient
     * of F0*pi. However, this is not 1-to-1: coefficients of F0*pi can
     * be the combination of several polynomials from pi.
     */

    /* We need to determine the spread between smallest and largest
     * degree columns among the solution columns.
     *
     * The length of column j of pi is at most 1+delta[j] (this might be
     * uneven depending on the rows).
     *
     * This length is also at least 1+delta[j]-t0, since otherwise we
     * can't have deg(F_j)=delta[j]
     */
    unsigned int lookback_needed = 0;

    for (unsigned int j = 0; j < m + n; j++) {
        if (bm.lucky[j] <= 0)
            continue;
        ASSERT_ALWAYS(bm.delta[j] >= t0);
        /* Really any large enough G should do. We must strive to change
         * in relevant places:
         *  - here
         *  - the comment in lingen_F_from_PI::recompute_rhs
         *  - in reverse_matpoly_to_matpoly
         */
        size_t G = pi.guessed_length();
        ASSERT_ALWAYS(bm.delta[j] <= G);
        unsigned int s = G - 1 - bm.delta[j];
        sols.push_back({ j, s });
        if (s + 1 >= lookback_needed)
            lookback_needed = s + 1;
    }

    lookback_needed += t0;

    if (sols.size() != n)
        throw std::runtime_error("Cannot compute F, too few solutions found\n");

    lookback_needed = iceildiv(lookback_needed, simd) * simd;
    if (!mpi_rank()) {
        cache.zero_pad(lookback_needed);
    }
    pi.read_to_matpoly(cache, 0, lookback_needed);
    cache_k0 = 0;
    cache_k1 = lookback_needed;

    if (!mpi_rank()) {
        rhs = recompute_rhs();
        reorder_solutions();
    }
    MPI_Bcast(sols.data(), 2 * n, MPI_UNSIGNED, 0, MPI_COMM_WORLD);

    /* recompute rhs. Same algorithm as above. */
    if (!mpi_rank())
        rhs = recompute_rhs();
}

void lingen_F_from_PI::write_rhs(lingen_output_wrapper_base & Srhs)
{
    if (nrhs)
        Srhs.write_from_matpoly(rhs, 0, 1);
}


/* read k1-k0 new coefficients from the source (which is embedded in the
 * struct as a reference), starting at coefficient next_src_k, and write
 * them to the destination, starting at coefficient k0.
 */
ssize_t lingen_E_from_A::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
{
    abdst_field ab = bw_dimensions::ab;

    /* The first n columns of F0 are the identity matrix, so that
     * for (0 <= j < n),
     * column j of E is actually X^{-t0-e}*column j=jA of A. Notice that we
     * have in this case column_data_from_A(j) = { t0+e, j }. Here, e is
     * (j >= bm.d.nrhs)
     *
     * The last m columns, namely for (n <= j < m+n), are as follows: if
     *      std::tie(kA, jA) = column_data_from_A(j);
     * Then the entry at position (jA, j) is x^(t0-kA), so that
     * column j of E is actually X^{-kA}*column jA of A.
     *
     *
     * the max value of kA is t0 + (n < rhs)
     */
    ASSERT_ALWAYS(k0 % simd == 0);
    ASSERT_ALWAYS(k1 % simd == 0);

    ssize_t produced = 0;

    if (cache_k1 != cache_k0) {
        unsigned int F0_lookback = t0 + (nrhs < n);
        unsigned int lookback = cache_k1 - cache_k0;
        ASSERT_ALWAYS(lookback >= F0_lookback);

        long long nk;
        if (!mpi_rank()) {
            nk = MIN(k1, dst.get_size()) - k0;
            ASSERT_ALWAYS(cache.get_size() == lookback);
            cache.zero_pad(lookback + nk);
        }
        MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
        ASSERT_ALWAYS(nk % simd == 0);

        /* This may be a collective call. At any rate, the cache variable is
         * meaningful only at the root. */
        ssize_t nread = A.read_to_matpoly(cache, lookback, lookback + nk);

        if (!mpi_rank())
            cache.set_size(lookback + nread);

        cache_k1 += nread;

        /* Do the bulk of the processing with an aligned count.
         * Note that if nread is not already aligned, it means that we had a
         * short read, since nk has to be aligned. And if we have a short
         * read, we're going to clear the cache on exit, so that it doesn't
         * matter if we lose the possibility to rotate the cache correctly.
         */
        nread -= nread % simd;

        if (!mpi_rank()) {
            /* process data at rank 0 */
            if (nread > 0) {
                for(unsigned int j = 0; j < m + n; j++) {
                    unsigned int jA;
                    unsigned int kA;
                    std::tie(kA, jA) = column_data_from_A(j);
                    /* column j of E is actually X^{-kA}*column jA of A. */
                    for(unsigned int i = 0 ; i < m ; i++) {
                        abdst_vec to = dst.part_head(i, j, k0);
                        absrc_vec from = cache.part_head(i, jA, kA);
#ifndef SELECT_MPFQ_LAYER_u64k1
                        abvec_set(ab, to, from, nread);
#else
                        unsigned int sr = kA % simd;
                        unsigned int nq = nread / simd;
                        if (sr) {
                            mpn_rshift(to, from, nq, sr);
                            to[nq-1] ^= from[nq] << (simd - sr);
                        } else {
                            abvec_set(ab, to, from, nread);
                        }
#endif
                    }
                }
            }
        }

        /* note that nread is agreed at all ranks */

        produced = nread;

        if (nread + k0 < k1) {
            /* We need to handle the end of the input stream, and stow
             * that to the (tail) field. This will be done at rank 0.
             */

            /* At this point we have a choice to make.
             *  - Either we strive to perform the operation that is
             *    mathematically unambiguous on our input, akin to
             *    polynomial multiplication.
             *  - Or we adapt to the true nature of the source we're
             *    dealing with in the first place: an infinite series,
             *    which we know only to some precision.
             * We do the latter, by using coefficients from the cache
             * only until the point where some noise creeps in.
             */
            unsigned int cache_avail = cache_k1 - cache_k0;
            unsigned int cache_access = nread + F0_lookback;

            if (!mpi_rank()) {
                /* This is the correct final size of the tail */
                tail.zero_pad(cache_avail - MIN(cache_avail, cache_access));
                for(unsigned int k = nread ; k + F0_lookback < cache_avail; k += simd) {
                    for(unsigned int j = 0; j < m + n; j++) {
                        unsigned int jA;
                        unsigned int kA;
                        std::tie(kA, jA) = column_data_from_A(j);
                        /* column j of E is actually X^{-kA}*column jA of A. */
                        if (k + kA >= cache_avail) continue;
                        for(unsigned int i = 0 ; i < m ; i++) {
                            abdst_vec to = tail.part_head(i, j, k - nread);
                            absrc_vec from = cache.part_head(i, jA, kA + k);
#ifndef SELECT_MPFQ_LAYER_u64k1
                            abvec_set(ab, to, from, 1);
#else
                            unsigned int sr = kA % simd;
                            if (sr) {
                                *to = *from >> sr;
                                if (((kA + k) / simd+1) * simd < cache_avail)
                                    *to ^= from[1] << (simd - sr);
                            } else {
                                *to = *from;
                            }
#endif
                        }
                    }
                }
            }
            cache.clear();
            cache_k1 = cache_k0 = 0;
        } else {
            if (!mpi_rank())
                cache.rshift(nread);
            cache_k0 += nread;
        }
    }

    if (!mpi_rank()) {
        size_t extra;
        for(extra = 0 ; extra < tail.get_size() ; ) {
            for(unsigned int j = 0; j < ncols ; j++) {
                for(unsigned int i = 0 ; i < nrows ; i++) {
                    abdst_vec to = dst.part_head(i, j, k0 + produced + extra);
                    absrc_vec from = tail.part_head(i, j, extra);
                    abvec_set(ab, to, from, simd);
                }
            }
            extra += MIN(simd, tail.get_size() - extra);
        }
        produced += MIN(extra, tail.get_size());
        tail.rshift(MIN(extra, tail.get_size()));
    }

    long long ll = produced;
    MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
    produced = ll;

    return produced;
}

/* read k1-k0 new coefficients from the source (which is embedded in the
 * struct as a reference), starting at coefficient next_src_k, and write
 * them to the destination, starting at coefficient k0.
 */
ssize_t lingen_F_from_PI::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
{
    abdst_field ab = bw_dimensions::ab;

    ASSERT_ALWAYS(k0 % simd == 0);
    ASSERT_ALWAYS(k1 % simd == 0);

    ssize_t produced = 0;

    if (cache_k1 != cache_k0) {
        unsigned int F0_lookback = t0 + (nrhs < n);
        unsigned int lookback = cache_k1 - cache_k0;
        ASSERT_ALWAYS(lookback >= F0_lookback - 1);

        long long nk;
        if (!mpi_rank()) {
            nk = MIN(k1, dst.get_size()) - k0;
            ASSERT_ALWAYS(nk % simd == 0);
            ASSERT_ALWAYS(cache.get_size() == lookback);
            cache.zero_pad(lookback + nk);
        }
        MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
        ASSERT_ALWAYS(nk % simd == 0);

        /* This may be a collective call. At any rate, the cache variable is
         * meaningful only at the root. */
        ssize_t nread = pi.read_to_matpoly(cache, lookback, lookback + nk);

        if (!mpi_rank())
            cache.set_size(lookback + nread);

        cache_k1 += nread;

        /* Do the bulk of the processing with an aligned count.
         * Note that if nread is not already aligned, it means that we had a
         * short read, since nk has to be aligned. And if we have a short
         * read, we're going to clear the cache on exit, so that it doesn't
         * matter if we lose the possibility to rotate the cache correctly.
         */
        nread -= nread % simd;

        if (!mpi_rank()) {
            if (nread > 0) {
                for(unsigned int jF = 0 ; jF < n ; jF++) {
                    unsigned int jpi = sols[jF].j;
                    for(unsigned int ipi = 0 ; ipi < m + n ; ipi++) {
                        unsigned int s;
                        unsigned int iF;
                        std::tie(iF, s) = get_shift_ij(ipi, jF);

                        /* The reversal of pi_{ipi, jpi} contributes to entry (iF,
                         * jF), once shifted right by s. */

                        abdst_vec to = dst.part_head(iF, jF, k0);
                        absrc_vec from = cache.part_head(ipi, jpi, s);
#ifndef SELECT_MPFQ_LAYER_u64k1
                        abvec_add(ab, to, to, from, nread);
#else
                        /* Add must at the same time do a right shift by sr */
                        unsigned int sr = s % simd;
                        if (sr) {
                            unsigned long cy = from[nread / simd] << (simd - sr);
                            for(unsigned int i = nread / simd ; i-- ; ) {
                                unsigned long t = (from[i] >> sr) ^ cy;
                                cy = from[i] << (simd - sr);
                                to[i] ^= t;
                            }
                        } else {
                            abvec_add(ab, to, to, from, nread);
                        }
#endif
                    }
                }
            }
        }

        /* note that nread is agreed at all ranks */

        produced = nread;

        if (nread + k0 < k1) {
            /* We've reached the end of our input stream. Now is time to drain
             * the cache. We'll first stow that in the (tail) field.
             */
            unsigned int cache_avail = cache_k1 - cache_k0;
            unsigned int max_tail = 0;

            for(unsigned int jF = 0 ; jF < n ; jF++) {
                for(unsigned int ipi = 0 ; ipi < m + n ; ipi++) {
                    unsigned int s;
                    unsigned int iF;
                    std::tie(iF, s) = get_shift_ij(ipi, jF);
                    if (nread + s < cache_avail) {
                        unsigned int pr = cache_avail - (nread + s);
                        if (pr >= max_tail)
                            max_tail = pr;
                    }
                }
            }

            if (!mpi_rank()) {
                tail.zero_pad(max_tail);

                for(unsigned int k = nread ; k < nread + max_tail ; k += simd) {
                    for(unsigned int jF = 0 ; jF < n ; jF++) {
                        unsigned int jpi = sols[jF].j;
                        for(unsigned int ipi = 0 ; ipi < m + n ; ipi++) {
                            unsigned int s;
                            unsigned int iF;
                            std::tie(iF, s) = get_shift_ij(ipi, jF);
                            abdst_vec to = tail.part_head(iF, jF, k - nread);
                            absrc_vec from = cache.part_head(ipi, jpi, k + s);
#ifndef SELECT_MPFQ_LAYER_u64k1
                            abvec_add(ab, to, to, from, 1);
#else
                            /* Add must at the same time do a right shift by sr */
                            unsigned int sr = s % simd;
                            if (sr) {
                                to[0] ^= from[0] >> sr;
                                if (((s + produced) / simd+1) * simd < (cache_k1 - cache_k0))
                                    to[0] ^= from[1] << (simd - sr);
                            } else {
                                to[0] ^= from[0];
                            }
#endif
                        }
                    }
                }
            }
            cache.clear();
            cache_k1 = cache_k0 = 0;
        } else {
            if (!mpi_rank())
                cache.rshift(nread);
            cache_k0 += nread;
        }
    }

    if (!mpi_rank()) {
        size_t extra;
        for(extra = 0 ; extra < tail.get_size() ; ) {
            for(unsigned int j = 0; j < ncols ; j++) {
                for(unsigned int i = 0 ; i < nrows ; i++) {
                    abdst_vec to = dst.part_head(i, j, k0 + produced + extra);
                    absrc_vec from = tail.part_head(i, j, extra);
                    abvec_set(ab, to, from, simd);
                }
            }
            extra += MIN(simd, tail.get_size() - extra);
        }
        produced += MIN(extra, tail.get_size());
        tail.rshift(MIN(extra, tail.get_size()));
    }

    long long ll = produced;
    MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
    produced = ll;

    return produced;
}

void lingen_output_to_singlefile::open_file()
{
    if (!mpi_rank()) {
        std::ios_base::openmode mode = std::ios_base::out;
        if (!ascii) mode |= std::ios_base::binary;
        os = std::unique_ptr<std::ofstream>(new std::ofstream(filename, mode));
    }
    done_open = true;
}

ssize_t lingen_output_to_singlefile::write_from_matpoly(matpoly const & src, unsigned int k0, unsigned int k1)
{
    /* This should be fixed. We seem to be used to writing this
     * transposed. That's slightly weird.
     */
    if (!done_open)
        open_file();
    ssize_t ll;
    if (!mpi_rank())
        ll = matpoly_write(ab, *os, src, k0, k1, ascii, 1);
    MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
    return ll;
}

ssize_t lingen_output_to_splitfile::write_from_matpoly(matpoly const & src, unsigned int k0, unsigned int k1)
{
    /* This should be fixed. We seem to be used to writing this
     * transposed. That's slightly weird.
     */
    if (!done_open)
        open_file();
    ssize_t ll;
    if (!mpi_rank())
        ll = matpoly_write_split(ab, fw, src, k0, k1, ascii);
    MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
    return ll;
}

void lingen_output_to_splitfile::open_file()
{
    if (!mpi_rank()) {
        ASSERT_ALWAYS(!done_open);
        std::ios_base::openmode mode = std::ios_base::out;
        if (!ascii) mode |= std::ios_base::binary;
        for(unsigned int i = 0 ; i < nrows ; i+=splitwidth) {
            for(unsigned int j = 0 ; j < ncols ; j+=splitwidth) {
                std::string s = fmt::format(pattern,
                                            i, i+splitwidth,
                                            j, j+splitwidth);
                fw.emplace_back(std::ofstream { s, mode });
                DIE_ERRNO_DIAG(!fw.back(), "open(%s)", s.c_str());
            }
        }
    }
    done_open = true;
}

lingen_output_to_splitfile::lingen_output_to_splitfile(abdst_field ab, unsigned int m, unsigned int n, std::string const & pattern, bool ascii)
    : lingen_output_wrapper_base(ab, m, n)
    , pattern(pattern)
    , ascii(ascii)
{
}

lingen_output_to_sha1sum::~lingen_output_to_sha1sum()
{
    if (!written || mpi_rank()) return;
    char out[41];
    f.checksum(out);
    printf("checksum(%s): %s\n", who.c_str(), out);
}

ssize_t
lingen_output_to_sha1sum::write_from_matpoly(matpoly const& src,
                                             unsigned int k0,
                                             unsigned int k1)
{
    ASSERT_ALWAYS(k0 % simd == 0);
#ifdef SELECT_MPFQ_LAYER_u64k1
    ASSERT_ALWAYS(src.high_word_is_clear());
    size_t nbytes = iceildiv(k1 - k0, ULONG_BITS) * sizeof(unsigned long);
#else
    size_t nbytes = abvec_elt_stride(src.ab, k1-k0);
#endif
    written += src.nrows() * src.ncols() * nbytes;
    if (!mpi_rank()) {
        for(unsigned int i = 0 ; i < src.nrows() ; i++)
            for(unsigned int j = 0; j < src.ncols() ; j++)
                f.write((const char *) src.part_head(i, j, k0), nbytes);
    }
    return k1 - k0;
}

void pipe(lingen_input_wrapper_base & in, lingen_output_wrapper_base & out, const char * action, bool skip_trailing_zeros)
{
    unsigned int window = std::max(in.preferred_window(), out.preferred_window());
    if (window == UINT_MAX) {
        window = 4096;
    }
    matpoly F(in.ab, in.nrows, in.ncols, 0);
    matpoly Z(in.ab, in.nrows, in.ncols, 0);
    if (!mpi_rank()) {
        F.zero_pad(window);
        Z.zero_pad(window);
    }

    double tt0 = wct_seconds();
    double next_report_t = tt0;
    size_t next_report_k = 0;
    size_t expected = in.guessed_length();
    size_t zq = 0;
    for(size_t done = 0 ; ; ) {
        F.set_size(0);
        F.zero_pad(window);
        ssize_t n = in.read_to_matpoly(F, 0, window);
        bool is_last = n < (ssize_t) window;
        if (n <= 0) break;
        ssize_t n1;
        long long ll;
        if (!mpi_rank()) {
            F.set_size(n);
            ll = skip_trailing_zeros ? F.get_true_nonzero_size() : n;
        }
        MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
        n1 = ll;
        if (n1 == 0) {
            zq += n;
            continue;
        }
        /* write the good number of zeros */
        for( ; zq ; ) {
            Z.set_size(0);
            unsigned int nz = MIN(zq, window);
            if (!mpi_rank())
                Z.zero_pad(nz);
            ssize_t nn = out.write_from_matpoly(Z, 0, nz);
            if (nn < (ssize_t) nz) {
                fprintf(stderr, "short write\n");
                exit(EXIT_FAILURE);
            }
            zq -= nz;
            done += nz;
        }
        ssize_t nn = out.write_from_matpoly(F, 0, n1);
        if (nn < n1) {
            fprintf(stderr, "short write\n");
            exit(EXIT_FAILURE);
        }
        zq = n - n1;
        done += n1;
        if (action && !mpi_rank() && (done >= next_report_k || is_last)) {
            double tt = wct_seconds();
            if (tt > next_report_t || is_last) {
                printf(
                  "%s %zu coefficients (%.1f%%)"
                  " in %.1f s (%.1f MB/s)\n",
                  action,
                  done, 100.0 * done / expected,
                  tt-tt0, done * in.average_matsize() / (tt-tt0)/1.0e6);
                next_report_t = tt + 10;
                next_report_k = done + expected / 100;
            }
        }
        if (is_last) break;
    }
}
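/* Sketch of how pipe() is typically used (constructor arguments are
 * illustrative placeholders, not the actual call sites): build an
 * input wrapper and an output wrapper, then stream one into the other
 * in windowed chunks.
 *
 *      lingen_gather<bigmatpoly> pi_in(...);     // collective source
 *      lingen_output_to_singlefile F_out(...);   // root-only sink
 *      pipe(pi_in, F_out, "written", true);
 *
 * All ranks must call pipe() together, since every
 * {read_to,write_from}_matpoly call is collective.
 */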

template class lingen_scatter<matpoly>;
template class lingen_scatter<bigmatpoly>;
template class lingen_gather<matpoly>;
template class lingen_gather<bigmatpoly>;
template class lingen_gather_reverse<matpoly>;
template class lingen_gather_reverse<bigmatpoly>;