1 #include "cado.h"
2 // IWYU pragma: no_include <ext/alloc_traits.h>
3 // IWYU pragma: no_include <sys/param.h>
4 #include <limits.h>                    // for UINT_MAX
5 #include <stdlib.h>                    // for exit, EXIT_FAILURE
6 #include <algorithm>                   // for max, sort
7 #include <stdexcept>                   // for runtime_error
8 #include <type_traits>                 // for integral_constant<>::value
9 #include <utility>                     // for move
10 #include <sys/types.h>  // ssize_t
11 #include <sys/stat.h>   // stat fstat
12 #ifdef SELECT_MPFQ_LAYER_u64k1
13 #include <cstring>        // memset
14 #include <gmp.h>        // mpn_lshift
15 #include "misc.h"       // bit_reverse
16 #endif
17 #include "fmt/core.h"
18 #include "fmt/format.h" // IWYU pragma: keep
19 #include "lingen_average_matsize.hpp"
20 #include "lingen_bmstatus.hpp"
21 #include "lingen_io_matpoly.hpp"
22 #include "lingen_io_wrappers.hpp"
23 #include "macros.h"     // MIN
24 #include "omp_proxy.h"
25 #include "timing.h"
26 
27 #pragma GCC diagnostic ignored "-Wunused-parameter"
28 
29 /* This one is currently in lingen_io_matpoly.cpp, but that file will go
30  * away eventually */
31 
32 extern unsigned int io_matpoly_block_size;
33 
34 constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1;
35 constexpr const unsigned int splitwidth = matpoly::over_gf2 ? 64 : 1;
36 
mpi_rank()37 static int mpi_rank()
38 {
39     int rank;
40     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
41     return rank;
42 }
43 
44 /* A note on MPI usage of these routines.
45  *
46  * All data is expected to follow these rules
47  *
48  *  - the matpoly arguments to {read_to,write_from}_matpoly are always
49  *    expected to be significant only at the root. What happens to this
50  *    data on non-zero ranks is largely unspecified, beyond the fact that
51  *    collective calls to the routines should work and not have divergent
52  *    behaviour. In practice we strive to just never access these
53  *    arguments at non-zero ranks. (the same holds for the caches that
54  *    are embedded in the i/o wrappers, with the exception that their
55  *    limits cache_k0 and cache_k1 are consistent at all ranks).
56  *
57  *  - the data embedded in the i/o wrappers is either meaningful only at
58  *    the root (in which case the {read_to,write_from} calls will do
59  *    close to nothing, or is of collective type, meaning that
60  *    {read_to,write_from} will engage in colective operations.
61  *
62  *  - all calls are implicit barriers, and their returned value is
63  *    identical at all ranks.
64  */
65 
66 /* XXX It seems likely that lingen_average_matsize.?pp will go away
67  */
average_matsize() const68 double lingen_io_wrapper_base::average_matsize() const
69 {
70     return ::average_matsize(ab, nrows, ncols, false);
71 }
72 
preferred_window() const73 unsigned int lingen_io_wrapper_base::preferred_window() const
74 {
75     unsigned int nmats = iceildiv(io_matpoly_block_size, average_matsize());
76     unsigned int nmats_pad = simd * iceildiv(nmats, simd);
77     return nmats_pad;
78 }
79 
average_matsize() const80 double lingen_file_input::average_matsize() const
81 {
82     return ::average_matsize(ab, nrows, ncols, ascii);
83 }
84 
preferred_window() const85 unsigned int lingen_file_input::preferred_window() const
86 {
87     unsigned int nmats = iceildiv(io_matpoly_block_size, average_matsize());
88     unsigned int nmats_pad = simd * iceildiv(nmats, simd);
89 #ifdef HAVE_OPENMP
90     /* This might be a bit large */
91     nmats_pad *= omp_get_max_threads() * omp_get_max_threads() / 4;
92 #if ULONG_BITS == 32
93     /* limit to 256MB */
94     for( ; ((size_t) nmats_pad) * average_matsize() > (1UL << 28) ; nmats_pad /= 2);
95 #else
96     /* limit to 4GB */
97     for( ; ((size_t) nmats_pad) * average_matsize() > (1UL << 32) ; nmats_pad /= 2);
98 #endif
99 #endif
100     return nmats_pad;
101 }
102 
open_file()103 void lingen_file_input::open_file()
104 {
105     if (mpi_rank()) return;
106     f = fopen(filename.c_str(), ascii ? "r" : "rb");
107     DIE_ERRNO_DIAG(!f, "open(%s)", filename.c_str());
108 }
109 
close_file()110 void lingen_file_input::close_file()
111 {
112     if (mpi_rank()) return;
113     fclose(f);
114 }
115 
guessed_length() const116 size_t lingen_file_input::guessed_length() const
117 {
118     unsigned long guess;
119 
120     if (length_hint) return length_hint;
121 
122     if (mpi_rank() == 0) {
123         struct stat sbuf[1];
124         int rc = fstat(fileno(f), sbuf);
125         if (rc < 0) {
126             perror(filename.c_str());
127             exit(EXIT_FAILURE);
128         }
129 
130         size_t filesize = sbuf->st_size;
131 
132         size_t avg = average_matsize();
133 
134         if (!ascii) {
135             if (filesize % avg) {
136                 fprintf(stderr, "File %s has %zu bytes, while its size"
137                         " should be a multiple of %zu bytes "
138                         "(assuming binary input; "
139                         "perhaps --ascii is missing ?).\n",
140                         filename.c_str(), filesize, avg);
141                 exit(EXIT_FAILURE);
142             }
143             guess = filesize / avg;
144         } else {
145             double expected_length = (double) filesize / avg;
146             printf("# Expect roughly %.2f items in the sequence.\n", expected_length);
147 
148             /* First coefficient is always lighter, so we add a +1. */
149             guess = 1 + expected_length;
150         }
151     }
152     MPI_Bcast(&guess, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);
153     return guess;
154 }
155 
read_to_matpoly(matpoly & dst,unsigned int k0,unsigned int k1)156 ssize_t lingen_file_input::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
157 {
158     /* This reads k1-k0 fresh coefficients from f into dst */
159     size_t ll;
160     if (!mpi_rank())
161         ll = matpoly_read(ab, f, dst, k0, k1, ascii, 0);
162     MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
163     return ll;
164 }
165 
read_to_matpoly(matpoly & dst,unsigned int k0,unsigned int k1)166 ssize_t lingen_random_input::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
167 {
168     ssize_t nk = MIN(length, next_src_k + k1 - k0) - next_src_k;
169     if (!mpi_rank())
170         dst.fill_random(k0, k0 + nk, rstate);
171     next_src_k += nk;
172     return nk;
173 }
preferred_window() const174 unsigned int lingen_random_input::preferred_window() const
175 {
176     unsigned int nmats = iceildiv(io_matpoly_block_size, average_matsize());
177     unsigned int nmats_pad = simd * iceildiv(nmats, simd);
178 #ifdef HAVE_OPENMP
179     /* This might be a bit large */
180     nmats_pad *= omp_get_max_threads() * omp_get_max_threads() / 4;
181 #if ULONG_BITS != 32
182     /* limit to 32GB */
183     for( ; ((size_t) nmats_pad) * average_matsize() > (1UL << 35) ; nmats_pad /= 2);
184 #endif
185 #endif
186     return nmats_pad;
187 }
188 
189 
190 /* pivots is a vector of length r <= M.n ;
191  * M is a square matrix ;
192  * coefficient (pivots[j],j) of M is -1.
193  *
194  * return true (and extend pivots) if column r of M is independent of the
195  * previous columns.
196  *
197  * Only the coefficient of degree 0 is considered.
198  */
199 static bool
reduce_column_mod_previous(matpoly & M,std::vector<unsigned int> & pivots)200 reduce_column_mod_previous(matpoly& M, std::vector<unsigned int>& pivots)
201 {
202     abelt tmp MAYBE_UNUSED;
203     abinit(M.ab, &tmp);
204     unsigned int r = pivots.size();
205     for (unsigned int v = 0; v < r; v++) {
206         unsigned int u = pivots[v];
207         /* the v-th column in the M is known to
208          * kill coefficient u (more exactly, to have a -1 as u-th
209          * coefficient, and zeroes for the other coefficients
210          * referenced in the pivots[0] to pivots[v-1] indices).
211          */
212         /* add M[u,r]*column v of M to column r of M */
213 #ifdef SELECT_MPFQ_LAYER_u64k1
214         if (abis_zero(M.ab, M.coeff(u, r, 0)))
215             continue;
216 #endif
217         for (unsigned int i = 0; i < M.m; i++) {
218 #ifndef SELECT_MPFQ_LAYER_u64k1
219             abmul(M.ab, tmp, M.coeff(i, v, 0), M.coeff(u, r, 0));
220             abadd(M.ab, M.coeff(i, r, 0), M.coeff(i, r, 0), tmp);
221             /* don't touch coeff u,r right now, since we're still using
222              * it !
223              */
224             if (i == u)
225                 continue;
226 #else
227             /* here, we know that coeff (u,r) is one */
228             M.coeff_accessor(i, r, 0) += M.coeff(i, v);
229 #endif
230         }
231 #ifndef SELECT_MPFQ_LAYER_u64k1
232         abset_zero(M.ab, M.coeff(u, r, 0));
233 #endif
234     }
235     unsigned int u = 0;
236     for (; u < M.m; u++) {
237         if (!abis_zero(M.ab, M.coeff(u, r, 0)))
238             break;
239     }
240     abclear(M.ab, &tmp);
241     if (u != M.m) {
242         pivots.push_back(u);
243         return true;
244     } else {
245         return false;
246     }
247 }
248 
249 #ifndef SELECT_MPFQ_LAYER_u64k1
250 static void
normalize_column(matpoly & M,std::vector<unsigned int> const & pivots)251 normalize_column(matpoly& M, std::vector<unsigned int> const& pivots)
252 {
253     abelt tmp MAYBE_UNUSED;
254     abinit(M.ab, &tmp);
255     unsigned int r = pivots.size() - 1;
256     unsigned int u = pivots.back();
257     /* Multiply the column so that the pivot becomes -1 */
258     int rc = abinv(M.ab, tmp, M.coeff(u, r, 0));
259     if (!rc) {
260         fprintf(stderr, "Error, found a factor of the modulus: ");
261         abfprint(M.ab, stderr, tmp);
262         fprintf(stderr, "\n");
263         exit(EXIT_FAILURE);
264     }
265     abneg(M.ab, tmp, tmp);
266     for (unsigned int i = 0; i < M.m; i++) {
267         abmul(M.ab, M.coeff(i, r, 0), M.coeff(i, r, 0), tmp);
268     }
269     abclear(M.ab, &tmp);
270 }
271 #endif
272 
273 void
share(int root,MPI_Comm comm)274 lingen_F0::share(int root, MPI_Comm comm)
275 {
276     /* share the bw_dimensions structure, but I don't think it's even
277      * remotely possible that nodes disagree on it.
278      */
279     MPI_Bcast(&nrhs, 1, MPI_UNSIGNED, root, comm);
280     MPI_Bcast(&m, 1, MPI_UNSIGNED, root, comm);
281     MPI_Bcast(&n, 1, MPI_UNSIGNED, root, comm);
282     MPI_Bcast(&t0, 1, MPI_UNSIGNED, root, comm);
283     static_assert(std::is_same<typename decltype(fdesc)::value_type,
284                                std::array<unsigned int, 2>>::value,
285                   "want unsigned ints");
286     unsigned int fsize = fdesc.size();
287     MPI_Bcast(&fsize, 1, MPI_UNSIGNED, root, comm);
288     if (mpi_rank() != root)
289         fdesc.assign(fsize, {{ 0, 0 }});
290     if (fsize)
291         MPI_Bcast(fdesc.data(), 2 * fsize, MPI_UNSIGNED, root, comm);
292 }
293 
294 /* It's a bit tricky. F0 is the _reversal_ of what we get here. (with
295  * respect to t0).
296  */
column_data_from_Aprime(unsigned int jE) const297 std::tuple<unsigned int, unsigned int> lingen_F0::column_data_from_Aprime(unsigned int jE) const
298 {
299     unsigned int kA, jA;
300     if (jE < n) {
301         jA = jE;
302         kA = t0;
303     } else {
304         jA = fdesc[jE-n][1];
305         kA = fdesc[jE-n][0];
306     }
307     /* This is pedantic, but oldish g++ has the ctor from init-list
308      * marked "explicit"
309      */
310     std::tuple<unsigned int, unsigned int> res { kA, jA };
311     return res;
312 }
column_data_from_A(unsigned int jE) const313 std::tuple<unsigned int, unsigned int> lingen_F0::column_data_from_A(unsigned int jE) const
314 {
315     unsigned int kA, jA;
316     std::tie(kA, jA) = column_data_from_Aprime(jE);
317     kA += jA >= nrhs;
318     /* This is pedantic, but oldish g++ has the ctor from init-list
319      * marked "explicit"
320      */
321     std::tuple<unsigned int, unsigned int> res { kA, jA };
322     return res;
323 }
324 
refresh_cache_upto(unsigned int k)325 void lingen_E_from_A::refresh_cache_upto(unsigned int k)
326 {
327     unsigned int next_k1 = simd * iceildiv(k, simd);
328 
329     ASSERT_ALWAYS(next_k1 >= cache_k1);
330 
331     if (next_k1 != cache_k1) {
332         if (!mpi_rank())
333             cache.zero_pad(next_k1);
334         ssize_t nk = A.read_to_matpoly(cache, cache_k1, next_k1);
335         if (nk < (ssize_t) (k - cache_k1)) {
336             fprintf(stderr, "short read from A\n");
337             printf("This amount of data is insufficient. "
338                     "Cannot find %u independent cols within A\n",
339                     m);
340             exit(EXIT_FAILURE);
341         }
342         cache_k1 = next_k1;
343     }
344 }
345 
346 void
initial_read()347 lingen_E_from_A::initial_read()
348 {
349     /* This is called collectively, because _all_
350      * {read_to,write_from}_matpoly call are collective calls. */
351 
352     /* Our goal is to determine t0 and F0, and ultimately ensure that the
353      * constant coefficient of E at t=t0 is a full-rank matrix.
354      *
355      * The initial constant coefficient of E is in fact the coefficient
356      * of degree t0 of A'*F, where column j of A' is column j of A,
357      * divided by X if j >= bm.d.nrhs.
358      *
359      * Therefore, for j >= bm.d.nrhs, the contribution to the constant
360      * coefficient of E comes the following data from A:
361      *  for cols of A with j < bm.d.rhs:  coeffs 0 <= deg <= t0-1
362      *  for cols of A with j >= bm.d.rhs: coeffs 1 <= deg <= t0
363      *
364      * We need m independent columns.
365      *
366      * When we inspect the k coefficients of A, we are, in effect,
367      * considering k*n-(n-rhs) = (k-1)*n+rhs columns. Therefore we expect
368      * at the very least that we need to go as fas as
369      *  (k-1)*n+rhs >= m
370      *  k >= ceiling((m+n-rhs)/n)
371      *
372      * Note that these k coefficients of A mean k' coefficients of A',
373      * where k' is either k or k-1. Namely, k' is k - (rhs == n)
374      */
375 
376     share(0, MPI_COMM_WORLD);
377 
378     if (!mpi_rank())
379         printf("Computing t0\n");
380 
381     /* We want to create a full rank m*m matrix M, by extracting columns
382      * from the first coefficients of A */
383 
384     matpoly M(bw_dimensions::ab, m, m, 1);
385     M.zero_pad(1);
386 
387     /* For each integer i between 0 and m-1, we have a column, picked
388      * from column fdesc[i][1] of coeff fdesc[i][0] of A' which, once
389      * reduced modulo the other ones, has coefficient at row
390      * pivots[i] unequal to zero.
391      */
392     std::vector<unsigned int> pivots;
393 
394     for (t0 = 1; pivots.size() < m; t0++) {
395         /* We are going to access coefficient k from A (perhaps
396          * coefficients k-1 and k) */
397         unsigned int k = t0 - (nrhs == n);
398 
399         /* k + 1 because we're going to access coeff of degree k, of
400          * course.
401          * k + 2 is because of the leading identity block in F0. To
402          * form coefficient of degree t0, we'll need degree t0 in A',
403          * which means up to degree t0 + (nrhs < n) = k + 1 in A
404          */
405         refresh_cache_upto(k + 2);
406 
407         for (unsigned int j = 0; pivots.size() < m && j < n; j++) {
408             /* Extract a full column into M (column j, degree t0 or
409              * t0-1 in A) */
410             /* adjust the coefficient degree to take into account the
411              * fact that for SM columns, we might in fact be
412              * interested by the _previous_ coefficient */
413 
414             int inc;
415             if (!mpi_rank()) {
416                 M.extract_column(pivots.size(), 0, cache, j, t0 - (j < nrhs));
417                 inc = reduce_column_mod_previous(M, pivots);
418             }
419             MPI_Bcast(&inc, 1, MPI_INT, 0, MPI_COMM_WORLD);
420             if (inc && mpi_rank()) {
421                 // we only need to have pivots.size() right
422                 pivots.push_back(0);
423             }
424 
425             if (!inc) {
426                 if (!mpi_rank())
427                     printf(
428                             "[X^%d] A, col %d does not increase rank (still %zu)\n",
429                             t0 - (j < nrhs),
430                             j,
431                             pivots.size());
432 
433                 /* we need at least m columns to get as starting matrix
434                  * with full rank. Given that we have n columns per
435                  * coefficient, this means at least m/n matrices.
436                  */
437 
438                 if ((t0-1) * n > m + 40) {
439                     if (!mpi_rank())
440                         printf("The choice of starting vectors was bad. "
441                                 "Cannot find %u independent cols within A\n",
442                                 m);
443                     exit(EXIT_FAILURE);
444                 }
445                 continue;
446             }
447 
448             /* Bingo, it's a new independent col. */
449 
450             fdesc.push_back( {{ t0 - 1, j }} );
451 #ifndef SELECT_MPFQ_LAYER_u64k1
452             if (!mpi_rank())
453                 normalize_column(M, pivots);
454 #endif
455 
456             /*
457             if (!mpi_rank())
458                 printf("[X^%d] A, col %d increases rank to %zu%s\n",
459                         t0 - (j < nrhs),
460                         j,
461                         pivots.size(),
462                         (j < nrhs) ? " (column not shifted because of the RHS)"
463                         : "");
464                         */
465 
466         }
467         /* Do this before we increase t0 */
468         if (pivots.size() == m)
469             break;
470     }
471 
472     if (!mpi_rank())
473         printf("Found satisfactory init data for t0=%d\n", t0);
474 
475     share(0, MPI_COMM_WORLD);
476 }
477 
share(int root,MPI_Comm comm)478 void lingen_E_from_A::share(int root, MPI_Comm comm)
479 {
480     lingen_F0::share(root, comm);
481     MPI_Bcast(&cache_k0, 1, MPI_UNSIGNED, root, comm);
482     MPI_Bcast(&cache_k1, 1, MPI_UNSIGNED, root, comm);
483     if (!mpi_rank())
484         cache.zero_pad(cache_k1 - cache_k0);
485 }
486 
487 /* read k1-k0 new coefficients from src, starting at coefficient k0,
488  * write them to the destination (which is embedded in the struct as a
489  * reference), starting at coefficient next_dst_k
490  */
491 template<>
write_from_matpoly(matpoly const & src,unsigned int k0,unsigned int k1)492 ssize_t lingen_scatter<matpoly>::write_from_matpoly(matpoly const & src, unsigned int k0, unsigned int k1)
493 {
494     long long nk;
495     ASSERT_ALWAYS(k0 % simd == 0);
496     ASSERT_ALWAYS(next_dst_k % simd == 0);
497     if (!mpi_rank()) {
498         nk = MIN(k1, src.get_size()) - k0;
499         ASSERT_ALWAYS(k1 <= src.get_size());
500         E.zero_pad(next_dst_k + nk);
501         for(unsigned int i = 0 ; i < nrows ; i++) {
502             for(unsigned int j = 0; j < ncols ; j++) {
503                 abdst_vec to = E.part_head(i, j, next_dst_k);
504                 absrc_vec from = src.part_head(i, j, k0);
505                 abvec_set(ab, to, from, simd * iceildiv(nk, simd));
506             }
507         }
508     }
509     MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
510     next_dst_k += nk;
511     return nk;
512 }
513 
514 /* read k1-k0 new coefficients from src, starting at coefficient k0,
515  * write them to the destination (which is embedded in the struct as a
516  * reference), starting at coefficient next_dst_k
517  */
518 template<>
write_from_matpoly(matpoly const & src,unsigned int k0,unsigned int k1)519 ssize_t lingen_scatter<bigmatpoly>::write_from_matpoly(matpoly const & src, unsigned int k0, unsigned int k1)
520 {
521     /* This one ***IS*** a collective call */
522     long long nk;
523     ASSERT_ALWAYS(k0 % simd == 0);
524     ASSERT_ALWAYS(next_dst_k % simd == 0);
525     if (!mpi_rank()) {
526         nk = MIN(k1, src.get_size()) - k0;
527         ASSERT_ALWAYS(k1 <= src.get_size());
528     }
529     MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
530     E.zero_pad(next_dst_k + nk);
531     E.scatter_mat_partial(src, k0, next_dst_k, nk);
532     next_dst_k += nk;
533     return nk;
534 }
535 
536 /* read k1-k0 new coefficients from the source (which is embedded in the
537  * struct as a reference), starting at coefficient next_src_k, and write
538  * them to the destination, starting at coefficient k0.
539  */
540 template<>
read_to_matpoly(matpoly & dst,unsigned int k0,unsigned int k1)541 ssize_t lingen_gather<matpoly>::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
542 {
543     long long nk;
544     ASSERT_ALWAYS(k0 % simd == 0);
545     ASSERT_ALWAYS(k1 % simd == 0);
546     ASSERT_ALWAYS(next_src_k % simd == 0);
547     if (!mpi_rank()) {
548         nk = MIN(k1, dst.get_size()) - k0;
549         nk = MIN(nk, (ssize_t) (pi.get_size() - next_src_k));
550         ASSERT_ALWAYS(k1 <= dst.get_size());
551         for(unsigned int i = 0 ; i < nrows ; i++) {
552             for(unsigned int j = 0; j < ncols ; j++) {
553                 abdst_vec to = dst.part_head(i, j, k0);
554                 absrc_vec from = pi.part_head(i, j, next_src_k);
555                 abvec_set(ab, to, from, nk);
556             }
557         }
558     }
559     MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
560     next_src_k += nk;
561     return nk;
562 }
563 
564 /* read k1-k0 new coefficients from the source (which is embedded in the
565  * struct as a reference), starting at coefficient next_src_k, and write
566  * them to the destination, starting at coefficient k0.
567  */
568 template<>
read_to_matpoly(matpoly & dst,unsigned int k0,unsigned int k1)569 ssize_t lingen_gather<bigmatpoly>::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
570 {
571     /* This one ***IS*** a collective call */
572     long long nk;
573     ASSERT_ALWAYS(k0 % simd == 0);
574     ASSERT_ALWAYS(next_src_k % simd == 0);
575     ASSERT_ALWAYS(k1 % simd == 0);
576     if (!mpi_rank()) {
577         nk = MIN(k1, dst.get_size()) - k0;
578         nk = MIN(nk, (ssize_t) (pi.get_size() - next_src_k));
579         ASSERT_ALWAYS(k1 <= dst.get_size());
580     }
581     MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
582     pi.gather_mat_partial(dst, k0, next_src_k, nk);
583     next_src_k += nk;
584     return nk;
585 }
586 
587 /* read k1-k0 new coefficients from pi, starting at coefficient
588  * next_src_k, and write them to the destination, starting at coefficient
589  * k0.
590  */
reverse_matpoly_to_matpoly(matpoly & dst,unsigned int k0,unsigned int k1,matpoly const & pi,unsigned int & next_src_k)591 ssize_t reverse_matpoly_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1, matpoly const & pi, unsigned int & next_src_k)
592 {
593     ASSERT_ALWAYS(!mpi_rank());
594     ssize_t nk;
595     nk = MIN(k1, dst.get_size()) - k0;
596     nk = MIN(nk, (ssize_t) (pi.get_size() - next_src_k));
597     ASSERT_ALWAYS(k0 % simd == 0);
598     ASSERT_ALWAYS(k1 % simd == 0);
599     ASSERT_ALWAYS(next_src_k % simd == 0);
600     unsigned int d = pi.get_size();
601     abdst_field ab = pi.ab;
602 #ifndef SELECT_MPFQ_LAYER_u64k1
603     for(unsigned int i = 0 ; i < pi.nrows() ; i++) {
604         for(unsigned int j = 0; j < pi.ncols() ; j++) {
605             /* We'll read coefficients of degrees [d-k1, d-k0[, and
606              * it might well be that d-k1<0
607              */
608             for(unsigned int k = k0 ; k < k0 + nk ; k++) {
609                 if (next_src_k + k < d + k0)
610                     abset(ab,
611                             dst.coeff(i, j, k),
612                             pi.coeff(i, j, d-1-next_src_k-(k-k0)));
613                 else
614                     abset_zero(ab, dst.coeff(i, j, k));
615             }
616         }
617     }
618 #else
619     size_t ntemp = (k1-k0) / simd + 2;
620     unsigned int rk0, rk1;
621     rk1 = next_src_k;
622     rk1 = d >= rk1 ? d - rk1 : 0;
623     rk0 = rk1 >= (unsigned int) nk ? rk1 - nk : 0;
624     unsigned int n_rk = rk1 - rk0;
625     unsigned int q_rk = rk0 / simd;
626     unsigned int r_rk = rk0 % simd;
627     unsigned int nq = iceildiv(n_rk + r_rk, simd);
628     if (nq) {
629         unsigned long * temp = new unsigned long[ntemp];
630         for(unsigned int i = 0 ; i < pi.nrows() ; i++) {
631             for(unsigned int j = 0; j < pi.ncols() ; j++) {
632                 /* We'll read coefficients of degrees [rk0, rk1[, and
633                  * it might well be that rk0<0
634                  */
635                 memset(temp, 0, ntemp * sizeof(unsigned long));
636 
637                 ASSERT_ALWAYS(nq <= ntemp);
638 #define R(x, b) ((b)*iceildiv((x),(b)))
639                 abvec_set(ab, temp, pi.part(i, j) + q_rk, R(n_rk + r_rk, simd));
640                 if (rk1 % simd) mpn_lshift(temp, temp, nq, simd - (rk1 % simd));
641                 /* Now we only have to bit-reverse temp */
642                 bit_reverse(temp, temp, nq);
643                 abvec_set(ab, dst.part_head(i, j, k0), temp, R(nk, simd));
644 #undef R
645             }
646         }
647         delete[] temp;
648     }
649 #endif
650     next_src_k += nk;
651     return nk;
652 }
653 
654 #if 0
655 shared_or_common_size<matpoly>::shared_or_common_size(matpoly const & pi)
656 {
657     unsigned long s = pi.get_size();
658     MPI_Bcast(&s, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD);
659     shared = s;
660 };
661 #endif
662 
663 /* read k1-k0 new coefficients from the source (which is embedded in the
664  * struct as a reference), starting at coefficient next_src_k, and write
665  * them to the destination, starting at coefficient k0.
666  */
667 template<>
read_to_matpoly(matpoly & dst,unsigned int k0,unsigned int k1)668 ssize_t lingen_gather_reverse<matpoly>::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
669 {
670     long long ll;
671     if (!mpi_rank())
672         ll = reverse_matpoly_to_matpoly(dst, k0, k1, pi, next_src_k);
673     MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
674     return ll;
675 }
676 
677 /* read k1-k0 new coefficients from the source (which is embedded in the
678  * struct as a reference), starting at coefficient next_src_k, and write
679  * them to the destination, starting at coefficient k0.
680  */
681 template<>
read_to_matpoly(matpoly & dst,unsigned int k0,unsigned int k1)682 ssize_t lingen_gather_reverse<bigmatpoly>::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
683 {
684     /* This one ***IS*** a collective call */
685     long long nk;
686     unsigned int nq;
687     unsigned int offset;
688     long long dk;
689 
690     ASSERT_ALWAYS(k0 % simd == 0);
691     ASSERT_ALWAYS(k1 % simd == 0);
692 
693     if (!mpi_rank()) {
694         nk = MIN(k1, dst.get_size()) - k0;
695         nk = MIN(nk, (ssize_t) (pi.get_size() - next_src_k));
696         ASSERT_ALWAYS(k1 <= dst.get_size());
697         unsigned int d = pi.get_size();
698         unsigned int rk1 = next_src_k;
699         rk1 = d >= rk1 ? d - rk1 : 0;
700         unsigned int rk0 = rk1 >= (unsigned int) nk ? rk1 - nk : 0;
701         unsigned int n_rk = rk1 - rk0;
702         unsigned int q_rk = rk0 / simd;
703         unsigned int r_rk = rk0 % simd;
704         nq = iceildiv(n_rk + r_rk, simd);
705         offset = q_rk * simd;
706     }
707     MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
708     MPI_Bcast(&nq, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
709     MPI_Bcast(&offset, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
710 
711     matpoly tpi(pi.ab, pi.nrows(), pi.ncols(), nq * simd);
712     tpi.zero_pad(nq * simd);
713     unsigned int zero = 0;
714     pi.gather_mat_partial(tpi, zero, offset, nk);
715 
716     if (!mpi_rank())
717         dk = reverse_matpoly_to_matpoly(dst, k0, k0 + nk, tpi, zero);
718 
719     MPI_Bcast(&dk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
720     ASSERT_ALWAYS(dk == nk);
721     next_src_k += dk;
722     return nk;
723 }
724 
recompute_rhs()725 matpoly lingen_F_from_PI::recompute_rhs()
726 {
727     abdst_field ab = bw_dimensions::ab;
728     /*
729      * first compute the rhscontribs. Use that to decide on a renumbering
730      * of the columns, because we'd like to get the "primary" solutions
731      * first, in a sense. Those are the ones with fewer zeroes in the
732      * rhscontrib part. So we would want that matrix to have its columns
733      * sorted in decreasing weight order
734      *
735      * An alternative, possibly easier, is to have a function which
736      * decides the solution ordering precisely based on the inspection of
737      * this rhscoeffs matrix (?). But how should we spell that info when
738      * we give it to mksol ??
739      */
740 
741     matpoly rhs(ab, nrhs, n, 1);
742     rhs.zero_pad(1);
743 
744     /* adding the contributions to the rhs matrix. This is formed of the
745      * LEADING COEFFICIENT of the corresponding columns of pi.
746      */
747     for (unsigned int jF = 0; jF < n; jF++) {
748         unsigned int jpi = sols[jF].j;
749         for (unsigned int ipi = 0; ipi < m + n; ipi++) {
750             unsigned int iF;
751             unsigned int s;
752             std::tie(iF, s) = get_shift_ij(ipi, jF);
753             if (iF >= nrhs)
754                 continue;
755             s--;
756             /* Coefficient (iF, ipi) of F0 is x^kF (with 0<=kF<=t0).
757              * Multiplied by coefficient (ipi, jpi) of pi, whose length
758              * is <=delta[jpi]+1, this induces a contribution to
759              * coefficient
760              * (iF, jF) of F. (actually since the length of column jF is
761              * at most delta[jpi]+1, the length of column jpi of pi is at
762              * most delta[jpi]+1-kF)
763              *
764              * The following formula gives the contribution C of this
765              * coefficient (iF, ipi) of F0 to coefficient delta[j] + k of
766              * A' * F:
767              *
768              * C = \sum_{s=0}^{delta[j]} A'_{iF,s+k} F_{iF,jF,delta[j]-s}
769              *     (we understand F_{iF,jF,...} as representing the
770              *     contribution of the coeff (iF, ipi) of F0 to this)
771              *
772              * C = \sum_{s=0}^{delta[j]} A'_{iF,s+k} Frev_{ipi,jpi,s}
773              *     with Frev = rev_{delta[j]}(F).
774              *
775              * Let F0rev = rev_{t0}(F0) ; F0rev_{iF, ipi} is x^{t0-kF}
776              *     pirev = rev_{G-1}(pi)  with G large enough. (>=pi.get_size())
777              *
778              * We have Frev{*,j} = (F0rev*pirev)_{*,j}  div (t0+G-1-delta[j])
779              *         Frev{*,j,s} = (F0rev*pirev)_{*,j,s+t0+G-1-delta[j]}
780              *                     = pirev_{*,j,s+G-1-delta[j]+kF}
781              *
782              * Let shift = G - 1 - delta[j]
783              *
784              * C = \sum_{s=0}^{delta[j]} A'_{iF,s+k} * pirev_{ipi,jpi,s+shift+kF}
785              *
786              * x^T*M^k*\sum_{s=0}^{delta[j]-kF} a_s c_s
787              *
788              * with a_s = M^s * Y_{iF}
789              * and  c_s = [x^{shift + s + kF}](rev_{G}(pi))
790              *            (with shift = G - 1 - delta[j])
791              *
792              * and rev_G(pi) is precisely what we have in our source, and
793              * cached in cache.
794              *
795              * Now take this with s==0: we get the contribution of the
796              * rhs.
797              */
798 
799             /* add coefficient (ipi, jpi, s) of the reversed pi to
800              * coefficient (iF, jF, 0) of rhs */
801 #ifndef SELECT_MPFQ_LAYER_u64k1
802             absrc_elt src = cache.coeff(ipi, jpi, s);
803             abdst_elt dst = rhs.coeff(iF, jF, 0);
804             abadd(ab, dst, dst, src);
805 #else
806             rhs.part(iF,jF)[0] ^= !abis_zero(ab, cache.coeff(ipi, jpi, s));
807 #endif
808         }
809     }
810     return rhs;
811 }
812 
reorder_solutions()813 void lingen_F_from_PI::reorder_solutions()
814 {
815     abdst_field ab = bw_dimensions::ab;
816     /* compute the score per solution */
817     std::vector<std::array<unsigned int, 2>> sol_score;
818     for (unsigned int jF = 0; jF < n; jF++) {
819         unsigned int s = 0;
820         for (unsigned int iF = 0; iF < nrhs; iF++)
821             s += abis_zero(ab, rhs.coeff(iF, jF, 0));
822         sol_score.push_back({{ s, jF }});
823     }
824     std::sort(sol_score.begin(), sol_score.end());
825     if (nrhs && sol_score.size() && !mpi_rank()) {
826         printf("Reordered solutions:\n");
827         for (unsigned int i = 0; i < n; i++) {
828             printf(" %u (col %u in pi, weight %u on rhs vectors)\n",
829                    sol_score[i][1],
830                    sols[sol_score[i][1]].j,
831                    nrhs-sol_score[i][0]);
832         }
833     }
834     {
835         std::vector<sol_desc> sols2;
836         for(auto const & s : sol_score) {
837             sols2.push_back(sols[s[1]]);
838         }
839         sols = std::move(sols2);
840     }
841 }
842 
get_shift_ij(unsigned int ipi,unsigned int jF) const843 std::tuple<unsigned int, unsigned int> lingen_F_from_PI::get_shift_ij(unsigned int ipi, unsigned int jF) const
844 {
845     // unsigned int jpi = sols[jF].j;
846     unsigned int shift = sols[jF].shift;
847     unsigned int iF, kF;
848     std::tie(kF, iF) = column_data_from_Aprime(ipi);
849     kF = t0 - kF;
850     unsigned int s0 = shift + kF + (iF < nrhs);
851 
852     /* This is pedantic, but oldish g++ has the ctor from init-list
853      * marked "explicit"
854      */
855     std::tuple<unsigned int, unsigned int> res { iF, s0 };
856     return res;
857 }
858 
lingen_F_from_PI(bmstatus const & bm,lingen_input_wrapper_base & pi,lingen_F0 const & F0)859 lingen_F_from_PI::lingen_F_from_PI(bmstatus const & bm,
860         lingen_input_wrapper_base& pi, lingen_F0 const& F0)
861     : lingen_F0(F0)
862     , lingen_input_wrapper_base(pi.ab, n, n)
863     , pi(pi)
864     , cache(pi.ab, pi.nrows, pi.ncols, 0)
865     , tail(pi.ab, nrows, ncols, 0)
866     , rhs(pi.ab, nrhs, n, 1)
867 {
868     /* The first n columns of F0 are the identity matrix, so that for (0
869      * <= j < n),
870      * column j of E is actually X^{-t0-e}*column j=jA of A. Notice that we
871      * have in this case column_data_from_A(j) = { t0+e, j }. Here, e is
872      * (j >= bm.d.nrhs)
873      *
874      * The last m columns, namely for (n <= j < m+n), are as follows: if
875      *      std::tie(kA, jA) = column_data_from_A(j-n);
876      * Then the entry at position (jA, j) is x^(t0-kA), so that
877      * column j of E is actually X^{-kA}*column jA of A.
878      *
879      * As for F and pi, we have F = F0*pi so that
880      *
881      *  A' * F0 * pi = G
882      *
883      * colum j is such that max(1+deg((F0*pi)_j), G_j) <= delta[j]
884      *
885      * Each column uses therefore a combination of data from various
886      * columns of A', which means in turn various columns of A.
887      *
888      * We want to separate the resulting expressions between the "rhs
889      * contributions" (coefficients that appear as multipliers for the
890      * right-hand-side columns, which are coefficients of degree 0 in A),
891      * and the others (which all appear to some non-zero degree).
892      *
893      * Recall that each column of F0 has one single coefficient.
894      * Therefore a coefficient of pi contributes to only one coefficient
895      * of F0*pi. However, this is not 1-to-1: coefficients of F0*pi can
896      * be the combination of several polynomials from pi.
897      */
898 
899     /* We need to determine the spread between smallest and largest
900      * degree columns among the solution columns.
901      *
902      * The length of column j of pi is at most 1+delta[j] (this might be
903      * uneven depending on the rows).
904      *
905      * This length is also at least 1+delta[j]-t0, since otherwise we
906      * can't have deg(F_j)=delta[j]
907      */
908     unsigned int lookback_needed = 0;
909 
910     for (unsigned int j = 0; j < m + n; j++) {
911         if (bm.lucky[j] <= 0)
912             continue;
913         ASSERT_ALWAYS(bm.delta[j] >= t0);
914         /* Really any large enough G should do. We must strive to change
915          * in relevant places:
916          * - here
917          * - the comment in lingen_F_from_PI::recompute_rhs
918          * - in reverse_matpoly_to_matpoly
919          */
920         size_t G = pi.guessed_length();
921         ASSERT_ALWAYS(bm.delta[j] <= G);
922         unsigned int s = G - 1 - bm.delta[j];
923         sols.push_back({ j, s });
924         if (s + 1 >= lookback_needed)
925             lookback_needed = s + 1;
926     }
927 
928     lookback_needed += t0;
929 
930     if (sols.size() != n)
931         throw std::runtime_error("Cannot compute F, too few solutions found\n");
932 
933     lookback_needed = iceildiv(lookback_needed, simd) * simd;
934     if (!mpi_rank()) {
935         cache.zero_pad(lookback_needed);
936     }
937     pi.read_to_matpoly(cache, 0, lookback_needed);
938     cache_k0 = 0;
939     cache_k1 = lookback_needed;
940 
941     if (!mpi_rank()) {
942         rhs = recompute_rhs();
943         reorder_solutions();
944     }
945     MPI_Bcast(sols.data(), 2 * n, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
946 
947     /* recompute rhs. Same algorithm as above. */
948     if (!mpi_rank())
949         rhs = recompute_rhs();
950 }
951 
write_rhs(lingen_output_wrapper_base & Srhs)952 void lingen_F_from_PI::write_rhs(lingen_output_wrapper_base & Srhs)
953 {
954     if (nrhs)
955         Srhs.write_from_matpoly(rhs, 0, 1);
956 }
957 
958 
959 
960 /* read k1-k0 new coefficients from the source (which is embedded in the
961  * struct as a reference), starting at coefficient next_src_k, and write
962  * them to the destination, starting at coefficient k0.
963  */
read_to_matpoly(matpoly & dst,unsigned int k0,unsigned int k1)964 ssize_t lingen_E_from_A::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
965 {
966     abdst_field ab = bw_dimensions::ab;
967 
968     /* The first n columns of F0 are the identity matrix, so that
969      * for (0 <= j < n),
970      * column j of E is actually X^{-t0-e}*column j=jA of A. Notice that we
971      * have in this case column_data_from_A(j) = { t0+e, j }. Here, e is
972      * (j >= bm.d.nrhs)
973      *
974      * The last m columns, namely for (n <= j < m+n), are as follows: if
975      *      std::tie(kA, jA) = column_data_from_A(j);
976      * Then the entry at position (jA, j) is x^(t0-kA), so that
977      * column j of E is actually X^{-kA}*column jA of A.
978      *
979      *
980      * the max value of kA is t0 + (n < rhs)
981      */
982     ASSERT_ALWAYS(k0 % simd == 0);
983     ASSERT_ALWAYS(k1 % simd == 0);
984 
985     ssize_t produced = 0;
986 
987     if (cache_k1 != cache_k0) {
988         unsigned int F0_lookback = t0 + (nrhs < n);
989         unsigned int lookback = cache_k1 - cache_k0;
990         ASSERT_ALWAYS(lookback >= F0_lookback);
991 
992         long long nk;
993         if (!mpi_rank()) {
994             nk = MIN(k1, dst.get_size()) - k0;
995             ASSERT_ALWAYS(cache.get_size() == lookback);
996             cache.zero_pad(lookback + nk);
997         }
998         MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
999         ASSERT_ALWAYS(nk % simd == 0);
1000 
1001         /* This may be a collective call. At any rate, the cache variable is
1002          * meaningful only at the root. */
1003         ssize_t nread = A.read_to_matpoly(cache, lookback, lookback + nk);
1004 
1005         if (!mpi_rank())
1006             cache.set_size(lookback + nread);
1007 
1008         cache_k1 += nread;
1009 
1010         /* Do the bulk of the processing with an aligned count.
1011          * Note that if nread is not already aligned, it means that we had a
1012          * short read, since nk has to be aligned. And if we have a short
1013          * read, we're going to clear the cache on exit, so that it doesn't
1014          * matter if we lose the possibility to rotate the cache correctly.
1015          */
1016         nread -= nread % simd;
1017 
1018         if (!mpi_rank()) {
1019             /* process data at rank 0 */
1020             if (nread > 0) {
1021                 for(unsigned int j = 0; j < m + n; j++) {
1022                     unsigned int jA;
1023                     unsigned int kA;
1024                     std::tie(kA, jA) = column_data_from_A(j);
1025                     /* column j of E is actually X^{-kA}*column jA of A. */
1026                     for(unsigned int i = 0 ; i < m ; i++) {
1027                         abdst_vec to = dst.part_head(i, j, k0);
1028                         absrc_vec from = cache.part_head(i, jA, kA);
1029 #ifndef SELECT_MPFQ_LAYER_u64k1
1030                         abvec_set(ab, to, from, nread);
1031 #else
1032                         unsigned int sr = kA % simd;
1033                         unsigned int nq = nread / simd;
1034                         if (sr) {
1035                             mpn_rshift(to, from, nq, sr);
1036                             to[nq-1] ^= from[nq] << (simd - sr);
1037                         } else {
1038                             abvec_set(ab, to, from, nread);
1039                         }
1040 #endif
1041                     }
1042                 }
1043             }
1044         }
1045 
1046         /* not that nread is agreed at all ranks */
1047 
1048         produced = nread;
1049 
1050         if (nread + k0 < k1) {
1051             /* We need to handle the end of the input stream, and stow
1052              * that to the (tail) field. This will be done at rank 0.
1053              */
1054 
1055             /* At this point we have a choice to make.
1056              * - Either we strive to perform the operation that is
1057              *   mathematically unambiguous on our input, akin to
1058              *   polynomial multiplication.
1059              * - Or we adapt to the true nature of the source we're
1060              *   dealing in the first place: an infinite series, which we
1061              *   know only to some precision.
1062              * We do the latter, by using coefficients from the cache
1063              * only until the point where some noise creeps in.
1064              */
1065             unsigned int cache_avail = cache_k1 - cache_k0;
1066             unsigned int cache_access = nread + F0_lookback;
1067 
1068             if (!mpi_rank()) {
1069                 /* This is the correct final size of the tail */
1070                 tail.zero_pad(cache_avail - MIN(cache_avail, cache_access));
1071                 for(unsigned int k = nread ; k + F0_lookback < cache_avail; k += simd) {
1072                     for(unsigned int j = 0; j < m + n; j++) {
1073                         unsigned int jA;
1074                         unsigned int kA;
1075                         std::tie(kA, jA) = column_data_from_A(j);
1076                         /* column j of E is actually X^{-kA}*column jA of A. */
1077                         if (k + kA >= cache_avail) continue;
1078                         for(unsigned int i = 0 ; i < m ; i++) {
1079                             abdst_vec to = tail.part_head(i, j, k - nread);
1080                             absrc_vec from = cache.part_head(i, jA, kA + k);
1081 #ifndef SELECT_MPFQ_LAYER_u64k1
1082                             abvec_set(ab, to, from, 1);
1083 #else
1084                             unsigned int sr = kA % simd;
1085                             if (sr) {
1086                                 *to = *from >> sr;
1087                                 if (((kA + k) / simd+1) * simd < cache_avail)
1088                                     *to ^= from[1] << (simd - sr);
1089                             } else {
1090                                 *to = *from;
1091                             }
1092 #endif
1093                         }
1094                     }
1095                 }
1096             }
1097             cache.clear();
1098             cache_k1 = cache_k0 = 0;
1099         } else {
1100             if (!mpi_rank())
1101                 cache.rshift(nread);
1102             cache_k0 += nread;
1103         }
1104     }
1105 
1106     if (!mpi_rank()) {
1107         size_t extra;
1108         for(extra = 0 ; extra < tail.get_size() ; ) {
1109             for(unsigned int j = 0; j < ncols ; j++) {
1110                 for(unsigned int i = 0 ; i < nrows ; i++) {
1111                     abdst_vec to = dst.part_head(i, j, k0 + produced + extra);
1112                     absrc_vec from = tail.part_head(i, j, extra);
1113                     abvec_set(ab, to, from, simd);
1114                 }
1115             }
1116             extra += MIN(simd, tail.get_size() - extra);
1117         }
1118         produced += MIN(extra, tail.get_size());
1119         tail.rshift(MIN(extra, tail.get_size()));
1120     }
1121 
1122     long long ll = produced;
1123     MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
1124     produced = ll;
1125 
1126     return produced;
1127 }
1128 
1129 /* read k1-k0 new coefficients from the source (which is embedded in the
1130  * struct as a reference), starting at coefficient next_src_k, and write
1131  * them to the destination, starting at coefficient k0.
1132  */
read_to_matpoly(matpoly & dst,unsigned int k0,unsigned int k1)1133 ssize_t lingen_F_from_PI::read_to_matpoly(matpoly & dst, unsigned int k0, unsigned int k1)
1134 {
1135     abdst_field ab = bw_dimensions::ab;
1136 
1137     ASSERT_ALWAYS(k0 % simd == 0);
1138     ASSERT_ALWAYS(k1 % simd == 0);
1139 
1140     ssize_t produced = 0;
1141 
1142     if (cache_k1 != cache_k0) {
1143         unsigned int F0_lookback = t0 + (nrhs < n);
1144         unsigned int lookback = cache_k1 - cache_k0;
1145         ASSERT_ALWAYS(lookback >= F0_lookback - 1);
1146 
1147         long long nk;
1148         if (!mpi_rank()) {
1149             nk = MIN(k1, dst.get_size()) - k0;
1150             ASSERT_ALWAYS(nk % simd == 0);
1151             ASSERT_ALWAYS(cache.get_size() == lookback);
1152             cache.zero_pad(lookback + nk);
1153         }
1154         MPI_Bcast(&nk, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
1155         ASSERT_ALWAYS(nk % simd == 0);
1156 
1157         /* This may be a collective call. At any rate, the cache variable is
1158          * meaningful only at the root. */
1159         ssize_t nread = pi.read_to_matpoly(cache, lookback, lookback + nk);
1160 
1161         if (!mpi_rank())
1162             cache.set_size(lookback + nread);
1163 
1164         cache_k1 += nread;
1165 
1166         /* Do the bulk of the processing with an aligned count.
1167          * Note that if nread is not already aligned, it means that we had a
1168          * short read, since nk has to be aligned. And if we have a short
1169          * read, we're going to clear the cache on exit, so that it doesn't
1170          * matter if we lose the possibility to rotate the cache correctly.
1171          */
1172         nread -= nread % simd;
1173 
1174         if (!mpi_rank()) {
1175             if (nread > 0) {
1176                 for(unsigned int jF = 0 ; jF < n ; jF++) {
1177                     unsigned int jpi = sols[jF].j;
1178                     for(unsigned int ipi = 0 ; ipi < m + n ; ipi++) {
1179                         unsigned int s;
1180                         unsigned int iF;
1181                         std::tie(iF, s) = get_shift_ij(ipi, jF);
1182 
1183                         /* The reversal of pi_{ipi, jpi} contributes to entry (iF,
1184                          * jF), once shifted right by s.  */
1185 
1186                         abdst_vec to = dst.part_head(iF, jF, k0);
1187                         absrc_vec from = cache.part_head(ipi, jpi, s);
1188 #ifndef SELECT_MPFQ_LAYER_u64k1
1189                         abvec_add(ab, to, to, from, nread);
1190 #else
1191                         /* Add must at the same time do a right shift by sr */
1192                         unsigned int sr = s % simd;
1193                         if (sr) {
1194                             unsigned long cy = from[nread / simd] << (simd - sr);
1195                             for(unsigned int i = nread / simd ; i-- ; ) {
1196                                 unsigned long t = (from[i] >> sr) ^ cy;
1197                                 cy = from[i] << (simd - sr);
1198                                 to[i] ^= t;
1199                             }
1200                         } else {
1201                             abvec_add(ab, to, to, from, nread);
1202                         }
1203 #endif
1204                     }
1205                 }
1206             }
1207         }
1208 
1209         /* not that nread is agreed at all ranks */
1210 
1211         produced = nread;
1212 
1213         if (nread + k0 < k1) {
1214             /* We've reached the end of our input stream. Now is time to drain
1215              * the cache. We'll first stow that in the (tail) field.
1216              */
1217             unsigned int cache_avail = cache_k1 - cache_k0;
1218             unsigned int max_tail = 0;
1219 
1220             for(unsigned int jF = 0 ; jF < n ; jF++) {
1221                 for(unsigned int ipi = 0 ; ipi < m + n ; ipi++) {
1222                     unsigned int s;
1223                     unsigned int iF;
1224                     std::tie(iF, s) = get_shift_ij(ipi, jF);
1225                     if (nread + s < cache_avail) {
1226                         unsigned int pr = cache_avail - (nread + s);
1227                         if (pr >= max_tail)
1228                             max_tail = pr;
1229                     }
1230                 }
1231             }
1232 
1233             if (!mpi_rank()) {
1234                 tail.zero_pad(max_tail);
1235 
1236                 for(unsigned int k = nread ; k < nread + max_tail ; k += simd) {
1237                     for(unsigned int jF = 0 ; jF < n ; jF++) {
1238                         unsigned int jpi = sols[jF].j;
1239                         for(unsigned int ipi = 0 ; ipi < m + n ; ipi++) {
1240                             unsigned int s;
1241                             unsigned int iF;
1242                             std::tie(iF, s) = get_shift_ij(ipi, jF);
1243                             abdst_vec to = tail.part_head(iF, jF, k - nread);
1244                             absrc_vec from = cache.part_head(ipi, jpi, k + s);
1245 #ifndef SELECT_MPFQ_LAYER_u64k1
1246                             abvec_add(ab, to, to, from, 1);
1247 #else
1248                             /* Add must at the same time do a right shift by sr */
1249                             unsigned int sr = s % simd;
1250                             if (sr) {
1251                                 to[0] ^= from[0] >> sr;
1252                                 if (((s + produced) / simd+1) * simd < (cache_k1 - cache_k0))
1253                                     to[0] ^= from[1] << (simd - sr);
1254                             } else {
1255                                 to[0] ^= from[0];
1256                             }
1257 #endif
1258                         }
1259                     }
1260                 }
1261             }
1262             cache.clear();
1263             cache_k1 = cache_k0 = 0;
1264         } else {
1265             if (!mpi_rank())
1266                 cache.rshift(nread);
1267             cache_k0 += nread;
1268         }
1269     }
1270 
1271     if (!mpi_rank()) {
1272         size_t extra;
1273         for(extra = 0 ; extra < tail.get_size() ; ) {
1274             for(unsigned int j = 0; j < ncols ; j++) {
1275                 for(unsigned int i = 0 ; i < nrows ; i++) {
1276                     abdst_vec to = dst.part_head(i, j, k0 + produced + extra);
1277                     absrc_vec from = tail.part_head(i, j, extra);
1278                     abvec_set(ab, to, from, simd);
1279                 }
1280             }
1281             extra += MIN(simd, tail.get_size() - extra);
1282         }
1283         produced += MIN(extra, tail.get_size());
1284         tail.rshift(MIN(extra, tail.get_size()));
1285     }
1286 
1287     long long ll = produced;
1288     MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
1289     produced = ll;
1290 
1291     return produced;
1292 }
1293 
open_file()1294 void lingen_output_to_singlefile::open_file()
1295 {
1296     if (!mpi_rank()) {
1297         std::ios_base::openmode mode = std::ios_base::out;
1298         if (!ascii) mode |= std::ios_base::binary;
1299         os = std::unique_ptr<std::ofstream>(new std::ofstream(filename, mode));
1300     }
1301     done_open = true;
1302 }
1303 
write_from_matpoly(matpoly const & src,unsigned int k0,unsigned int k1)1304 ssize_t lingen_output_to_singlefile::write_from_matpoly(matpoly const & src, unsigned int k0, unsigned int k1)
1305 {
1306     /* This should be fixed. We seem to be used to writing this
1307      * transposed. That's slightly weird.
1308      */
1309     if (!done_open)
1310         open_file();
1311     ssize_t ll;
1312     if (!mpi_rank())
1313         ll = matpoly_write(ab, *os, src, k0, k1, ascii, 1);
1314     MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
1315     return ll;
1316 }
1317 
write_from_matpoly(matpoly const & src,unsigned int k0,unsigned int k1)1318 ssize_t lingen_output_to_splitfile::write_from_matpoly(matpoly const & src, unsigned int k0, unsigned int k1)
1319 {
1320     /* This should be fixed. We seem to be used to writing this
1321      * transposed. That's slightly weird.
1322      */
1323     if (!done_open)
1324         open_file();
1325     ssize_t ll;
1326     if (!mpi_rank())
1327         ll = matpoly_write_split(ab, fw, src, k0, k1, ascii);
1328     MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
1329     return ll;
1330 }
1331 
open_file()1332 void lingen_output_to_splitfile::open_file()
1333 {
1334     if (!mpi_rank()) {
1335         ASSERT_ALWAYS(!done_open);
1336         std::ios_base::openmode mode = std::ios_base::out;
1337         if (!ascii) mode |= std::ios_base::binary;
1338         for(unsigned int i = 0 ; i < nrows ; i+=splitwidth) {
1339             for(unsigned int j = 0 ; j < ncols ; j+=splitwidth) {
1340                 std::string s = fmt::format(pattern,
1341                         i, i+splitwidth,
1342                         j, j+splitwidth);
1343                 fw.emplace_back(std::ofstream { s, mode });
1344                 DIE_ERRNO_DIAG(!fw.back(), "open(%s)", s.c_str());
1345             }
1346         }
1347     }
1348     done_open = true;
1349 }
1350 
lingen_output_to_splitfile(abdst_field ab,unsigned int m,unsigned int n,std::string const & pattern,bool ascii)1351 lingen_output_to_splitfile::lingen_output_to_splitfile(abdst_field ab, unsigned int m, unsigned int n, std::string const & pattern, bool ascii)
1352     : lingen_output_wrapper_base(ab, m, n)
1353     , pattern(pattern)
1354     , ascii(ascii)
1355 {
1356 }
1357 
~lingen_output_to_sha1sum()1358 lingen_output_to_sha1sum::~lingen_output_to_sha1sum()
1359 {
1360     if (!written || mpi_rank()) return;
1361     char out[41];
1362     f.checksum(out);
1363     printf("checksum(%s): %s\n", who.c_str(), out);
1364 }
1365 
1366 ssize_t
write_from_matpoly(matpoly const & src,unsigned int k0,unsigned int k1)1367 lingen_output_to_sha1sum::write_from_matpoly(matpoly const& src,
1368                                unsigned int k0,
1369                                unsigned int k1)
1370 {
1371     ASSERT_ALWAYS(k0 % simd == 0);
1372 #ifdef SELECT_MPFQ_LAYER_u64k1
1373     ASSERT_ALWAYS(src.high_word_is_clear());
1374     size_t nbytes = iceildiv(k1 - k0, ULONG_BITS) * sizeof(unsigned long);
1375 #else
1376     size_t nbytes = abvec_elt_stride(src.ab, k1-k0);
1377 #endif
1378     written += src.nrows() * src.ncols() * nbytes;
1379     if (!mpi_rank()) {
1380         for(unsigned int i = 0 ; i < src.nrows() ; i++)
1381             for(unsigned int j = 0; j < src.ncols() ; j++)
1382                 f.write((const char *) src.part_head(i, j, k0), nbytes);
1383     }
1384     return k1 - k0;
1385 }
1386 
pipe(lingen_input_wrapper_base & in,lingen_output_wrapper_base & out,const char * action,bool skip_trailing_zeros)1387 void pipe(lingen_input_wrapper_base & in, lingen_output_wrapper_base & out, const char * action, bool skip_trailing_zeros)
1388 {
1389     unsigned int window = std::max(in.preferred_window(), out.preferred_window());
1390     if (window == UINT_MAX) {
1391         window=4096;
1392     }
1393     matpoly F(in.ab, in.nrows, in.ncols, 0);
1394     matpoly Z(in.ab, in.nrows, in.ncols, 0);
1395     if (!mpi_rank()) {
1396         F.zero_pad(window);
1397         Z.zero_pad(window);
1398     }
1399 
1400     double tt0 = wct_seconds();
1401     double next_report_t = tt0;
1402     size_t next_report_k = 0;
1403     size_t expected = in.guessed_length();
1404     size_t zq = 0;
1405     for(size_t done = 0 ; ; ) {
1406         F.set_size(0);
1407         F.zero_pad(window);
1408         ssize_t n = in.read_to_matpoly(F, 0, window);
1409         bool is_last = n < (ssize_t) window;
1410         if (n <= 0) break;
1411         ssize_t n1;
1412         long long ll;
1413         if (!mpi_rank()) {
1414             F.set_size(n);
1415             ll = skip_trailing_zeros ? F.get_true_nonzero_size() : n;
1416         }
1417         MPI_Bcast(&ll, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
1418         n1 = ll;
1419         if (n1 == 0) {
1420             zq += n;
1421             continue;
1422         }
1423         /* write the good number of zeros */
1424         for( ; zq ; ) {
1425             Z.set_size(0);
1426             unsigned int nz = MIN(zq, window);
1427             if (!mpi_rank())
1428                 Z.zero_pad(nz);
1429             ssize_t nn = out.write_from_matpoly(Z, 0, nz);
1430             if (nn < (ssize_t) nz) {
1431                 fprintf(stderr, "short write\n");
1432                 exit(EXIT_FAILURE);
1433             }
1434             zq -= nz;
1435             done += nz;
1436         }
1437         ssize_t nn = out.write_from_matpoly(F, 0, n1);
1438         if (nn < n1) {
1439             fprintf(stderr, "short write\n");
1440             exit(EXIT_FAILURE);
1441         }
1442         zq = n - n1;
1443         done += n1;
1444         if (action && !mpi_rank() && (done >= next_report_k || is_last)) {
1445             double tt = wct_seconds();
1446             if (tt > next_report_t || is_last) {
1447                 printf(
1448                         "%s %zu coefficients (%.1f%%)"
1449                         " in %.1f s (%.1f MB/s)\n",
1450                         action,
1451                         done, 100.0 * done / expected,
1452                         tt-tt0, done * in.average_matsize() / (tt-tt0)/1.0e6);
1453                 next_report_t = tt + 10;
1454                 next_report_k = done + expected / 100;
1455             }
1456         }
1457         if (is_last) break;
1458     }
1459 }
1460 
1461 template class lingen_scatter<matpoly>;
1462 template class lingen_scatter<bigmatpoly>;
1463 template class lingen_gather<matpoly>;
1464 template class lingen_gather<bigmatpoly>;
1465 template class lingen_gather_reverse<matpoly>;
1466 template class lingen_gather_reverse<bigmatpoly>;
1467 
1468