1 /* Copyright (C) 1999--2007 Emmanuel Thom'e --- see LICENSE file */
2 #include "cado.h" // IWYU pragma: keep
3 // IWYU pragma: no_include <sys/param.h>
4 #include <cstdint>                        // for SIZE_MAX
5 #include <cmath>                          // for ceil
6 #include <cstdio>                         // for printf, fprintf, size_t
7 #include <cstdlib>                        // for exit, free, EXIT_FAILURE
8 #include <cstring>                        // for strcmp, memcpy, memset, strdup
9 
10 #include <algorithm>                      // for fill
11 #include <fstream>                        // for basic_ostream::write
12 #include <memory>                         // for unique_ptr
13 #include <stdexcept>                      // for runtime_error, overflow_error
14 #include <string>                         // for operator+, string
15 #include <vector>                         // for vector
16 
17 #include <sys/utsname.h>                  // for uname, utsname
18 #include <gmp.h>                          // for gmp_randclear, gmp_randinit...
19 
20 #include "bw-common.h"                    // for bw, bw_common_clear, bw_com...
21 #include "fmt/core.h"                     // for check_format_string
22 #include "fmt/format.h"                   // for basic_buffer::append, basic...
23 #include "fmt/printf.h" // IWYU pragma: keep
24 #ifdef SELECT_MPFQ_LAYER_u64k1
25 #include "gf2x-fft.h"                     // for gf2x_cantor_fft_info
26 #include "gf2x-ternary-fft.h"             // for gf2x_ternary_fft_info
27 #else
28 #include "flint-fft/transform_interface.h"          // fft_transform_info
29 #endif
30 #include "lingen_abfield.hpp"             // for abfield_clear, abfield_init
31 #include "lingen_bigmatpoly.hpp"          // for bigmatpoly, bigmatpoly_model
32 #include "lingen_bigmatpoly_ft.hpp"       // for bigmatpoly_ft
33 #include "lingen_bmstatus.hpp"            // for bmstatus
34 #include "lingen_bw_dimensions.hpp"       // for bw_dimensions
35 #include "lingen_call_companion.hpp"      // for lingen_call_companion, ling...
36 #include "lingen_checkpoints.hpp"         // for save_checkpoint_file, load_...
37 #include "lingen_expected_pi_length.hpp"  // for expected_pi_length, expecte...
38 #include "lingen_hints.hpp"               // for lingen_hints
39 #include "lingen_io_matpoly.hpp"          // for lingen_io_matpoly_decl_usage
40 #include "lingen_io_wrappers.hpp"         // for lingen_output_wrapper_base
41 #include "lingen_matpoly_ft.hpp"          // for matpoly_ft, matpoly_ft<>::m...
42 #include "lingen_matpoly_select.hpp"      // for matpoly, matpoly::memory_guard
43 #include "lingen_qcode_select.hpp"           // for bw_lingen_basecase
44 #include "lingen_substep_schedule.hpp"    // for lingen_substep_schedule
45 #include "lingen_tuning.hpp"              // for lingen_tuning, lingen_tunin...
46 #include "logline.h"                      // for logline_end, logline_init_t...
47 #include "macros.h"                       // for ASSERT_ALWAYS, iceildiv, MIN
48 #include "memusage.h"                     // for PeakMemusage
49 #include "misc.h"                         // for size_disp
50 #include "omp_proxy.h"                    // for omp_set_num_threads
51 #include "params.h"                       // for cxx_param_list, param_list_...
52 #include "select_mpi.h"                   // for MPI_Comm, MPI_Comm_rank
53 #include "sha1.h"                         // for sha1_checksumming_stream
54 #include "timing.h"                       // for seconds
55 #include "tree_stats.hpp"                 // for tree_stats, tree_stats::tra...
56 #include "portability.h" // strdup // IWYU pragma: keep
57 
58 
59 /* If non-zero, then reading from A is actually replaced by reading from
60  * a random generator */
61 static unsigned int random_input_length = 0;
62 static unsigned int input_length = 0;
63 
64 static int split_input_file = 0;  /* unsupported ; do acollect by ourselves */
65 static int split_output_file = 0; /* do split by ourselves */
66 
67 gmp_randstate_t rstate;
68 
69 static int allow_zero_on_rhs = 0;
70 
71 int rank0_exit_code = EXIT_SUCCESS;
72 
73 int global_flag_ascii = 0;
74 int global_flag_tune = 0;
75 
lingen_decl_usage(cxx_param_list & pl)76 void lingen_decl_usage(cxx_param_list & pl)/*{{{*/
77 {
78     param_list_decl_usage(pl, "ascii",
79             "read and write data in ascii");
80     param_list_decl_usage(pl, "timings",
81             "provide timings on all output lines");
82     param_list_decl_usage(pl, "tune",
83             "activate tuning mode");
84     param_list_decl_usage(pl, "allow_zero_on_rhs",
85             "do not cry if the generator corresponds to a zero contribution on the RHS vectors");
86 
87     /* we must be square ! */
88     param_list_decl_usage(pl, "mpi", "number of MPI nodes across which the execution will span, with mesh dimensions");
89     param_list_decl_usage(pl, "thr", "number of threads (on each node) for the program, with mesh dimensions");
90 
91     param_list_decl_usage(pl, "nrhs",
92             "number of columns that correspond to rhs vectors");
93     param_list_decl_usage(pl, "rhs",
94             "file with rhs vectors (only the header is read)");
95 
96     param_list_decl_usage(pl, "afile",
97             "input sequence file");
98     param_list_decl_usage(pl, "input_length",
99             "input sequence length (defaults to auto-detect)");
100     param_list_decl_usage(pl, "random-input-with-length",
101             "use surrogate for input");
102     param_list_decl_usage(pl, "split-input-file",
103             "work with split files on input");
104     param_list_decl_usage(pl, "split-output-file",
105             "work with split files on output");
106     param_list_decl_usage(pl, "random_seed",
107             "seed the random generator");
108     param_list_decl_usage(pl, "ffile",
109             "output generator file");
110 
111 #if 0
112     param_list_decl_usage(pl, "lingen_mpi_threshold",
113             "use MPI matrix operations above this size");
114     param_list_decl_usage(pl, "lingen_threshold",
115             "use recursive algorithm above this size");
116 #endif
117 
118     param_list_configure_switch(pl, "--tune", &global_flag_tune);
119     param_list_configure_switch(pl, "--ascii", &global_flag_ascii);
120     param_list_configure_alias(pl, "seed", "random_seed");
121 
122 }/*}}}*/
123 
124 /**********************************************************************/
125 
126 /*{{{ Main entry points and recursive algorithm (with and without MPI) */
127 
128 /* Forward declaration, it's used by the recursive version */
129 matpoly bw_lingen_single(bmstatus & bm, matpoly & E);
130 
131 bigmatpoly bw_biglingen_collective(bmstatus & bm, bigmatpoly & E);
132 
sha1sum(matpoly const & X)133 std::string sha1sum(matpoly const & X)
134 {
135     sha1_checksumming_stream S;
136     ASSERT_ALWAYS(X.is_tight());
137     S.write((const char *) X.data_area(), X.data_size_in_bytes());
138     char checksum[41];
139     S.checksum(checksum);
140     return std::string(checksum);
141 }
142 
143 template<typename matpoly_type, typename fft_type>
144 struct matching_ft_type {};
145 
146 template<typename matpoly_type>
147 struct matpoly_diverter {};
148 
149 template<typename fft_type>
150 struct matching_ft_type<matpoly, fft_type> {
151     typedef matpoly_ft<fft_type> type;
152 };
153 template<> struct matpoly_diverter<matpoly> {
154     static constexpr const char * prefix = "";
callbackmatpoly_diverter155     static matpoly callback(bmstatus & bm, matpoly & E) {
156         return bw_lingen_single(bm, E);
157     }
158 };
159 constexpr const char * matpoly_diverter<matpoly>::prefix;
160 
161 template<typename fft_type>
162 struct matching_ft_type<bigmatpoly, fft_type> {
163     typedef bigmatpoly_ft<fft_type> type;
164 };
165 template<> struct matpoly_diverter<bigmatpoly> {
166     static constexpr const char * prefix = "MPI-";
callbackmatpoly_diverter167     static bigmatpoly callback(bmstatus & bm, bigmatpoly & E) {
168         return bw_biglingen_collective(bm, E);
169     }
170 };
171 constexpr const char * matpoly_diverter<bigmatpoly>::prefix;
172 
173 
174 template<typename matpoly_type>
generic_mp(matpoly_type & E,matpoly_type & pi_left,bmstatus & bm,lingen_call_companion & C)175 matpoly_type generic_mp(matpoly_type & E, matpoly_type & pi_left, bmstatus & bm, lingen_call_companion & C)
176 {
177     switch (C.mp.S.fft_type) {
178         case lingen_substep_schedule::FFT_NONE:
179             return matpoly_type::mp(bm.stats, E, pi_left, &C.mp);
180         case lingen_substep_schedule::FFT_FLINT:
181 #ifndef SELECT_MPFQ_LAYER_u64k1
182             return matching_ft_type<matpoly_type,
183                     fft_transform_info>::type::mp_caching(
184                             bm.stats, E, pi_left, & C.mp);
185 #else
186             throw std::runtime_error("fft type \"flint\" does not make sense here");
187 #endif
188 #ifdef SELECT_MPFQ_LAYER_u64k1
189         case lingen_substep_schedule::FFT_CANTOR:
190             return matching_ft_type<matpoly_type,
191                     gf2x_cantor_fft_info>::type::mp_caching(
192                             bm.stats, E, pi_left, & C.mp);
193         case lingen_substep_schedule::FFT_TERNARY:
194             return matching_ft_type<matpoly_type,
195                     gf2x_ternary_fft_info>::type::mp_caching(
196                             bm.stats, E, pi_left, & C.mp);
197 #else
198         case lingen_substep_schedule::FFT_TERNARY:
199         case lingen_substep_schedule::FFT_CANTOR:
200             throw std::runtime_error("fft types over GF(2)[x] do not make sense here");
201 #endif
202     }
203     throw std::runtime_error("invalid fft_type");
204 }
205 
206 template<typename matpoly_type>
generic_mul(matpoly_type & pi_left,matpoly_type & pi_right,bmstatus & bm,lingen_call_companion & C)207 matpoly_type generic_mul(matpoly_type & pi_left, matpoly_type & pi_right, bmstatus & bm, lingen_call_companion & C)
208 {
209     switch (C.mul.S.fft_type) {
210         case lingen_substep_schedule::FFT_NONE:
211             return matpoly_type::mul(bm.stats, pi_left, pi_right, & C.mul);
212         case lingen_substep_schedule::FFT_FLINT:
213 #ifndef SELECT_MPFQ_LAYER_u64k1
214             return matching_ft_type<matpoly_type,
215                     fft_transform_info>::type::mul_caching(
216                             bm.stats, pi_left, pi_right, & C.mul);
217 #else
218             throw std::runtime_error("fft type \"flint\" does not make sense here");
219 #endif
220 #ifdef SELECT_MPFQ_LAYER_u64k1
221         case lingen_substep_schedule::FFT_CANTOR:
222             return matching_ft_type<matpoly_type,
223                     gf2x_cantor_fft_info>::type::mul_caching(
224                             bm.stats, pi_left, pi_right, & C.mul);
225         case lingen_substep_schedule::FFT_TERNARY:
226             return matching_ft_type<matpoly_type,
227                     gf2x_ternary_fft_info>::type::mul_caching(
228                             bm.stats, pi_left, pi_right, & C.mul);
229 #else
230         case lingen_substep_schedule::FFT_TERNARY:
231         case lingen_substep_schedule::FFT_CANTOR:
232             throw std::runtime_error("fft types over GF(2)[x] do not make sense here");
233 #endif
234     }
235     throw std::runtime_error("invalid fft_type");
236 }
237 
238 template<typename matpoly_type>
truncate_overflow(bmstatus & bm,matpoly_type & pi,unsigned int pi_expect)239 void truncate_overflow(bmstatus & bm, matpoly_type & pi, unsigned int pi_expect)
240 {
241     if (pi.get_size() > pi_expect) {
242         unsigned int nluck = 0;
243         unsigned int maxdelta_lucky = 0;
244         for(unsigned int j = 0 ; j < bm.d.m + bm.d.n ; j++) {
245             if (!bm.lucky[j]) continue;
246             nluck++;
247             if (bm.delta[j] >= maxdelta_lucky)
248                 maxdelta_lucky = bm.delta[j];
249         }
250         printf("truncating excess cols\n"
251                 "   pi has length %zu, we expected %u at most.\n"
252                 "   max delta on the %u lucky columns: %u\n",
253                 pi.get_size(),
254                 pi_expect,
255                 nluck,
256                 maxdelta_lucky);
257         bm.display_deltas();
258         pi.set_size(pi_expect);
259         pi.clear_high_word();
260         /* in fact, there is really no reason to reach here. The expected
261          * pi length should be large enough so that the "luck" condition
262          * and eventually the "done"/"finished" flags are triggered
263          * before we reach the point where we overflow. This means a
264          * constant offset. Therefore, reaching here is a bug.
265          *
266          * Furthermore, while the idea of truncating is appealing, it
267          * doesn't quote solve the problem when we wish to keep iterating
268          * a bit more: because of the degree constraints, truncating will
269          * make the next E matrix cancel, as we're almost knowingly
270          * killing its rank...
271          *
272          * one of the better things to do could be to keep track of a
273          * shift vector. but again, what for...
274          */
275         ASSERT_ALWAYS(0);
276     }
277 }
278 
279 
280 template<typename matpoly_type>
bw_lingen_recursive(bmstatus & bm,matpoly_type & E)281 matpoly_type bw_lingen_recursive(bmstatus & bm, matpoly_type & E) /*{{{*/
282 {
283     int depth = bm.depth();
284     size_t z = E.get_size();
285 
286     /* C0 is a copy. We won't use it for long anyway. We'll take a
287      * reference _later_ */
288     lingen_call_companion C0 = bm.companion(depth, z);
289 
290     tree_stats::sentinel dummy(bm.stats, fmt::sprintf("%srecursive", matpoly_diverter<matpoly_type>::prefix), z, C0.total_ncalls);
291 
292     bm.stats.plan_smallstep(C0.mp.step_name(), C0.mp.tt);
293     bm.stats.plan_smallstep(C0.mul.step_name(), C0.mul.tt);
294 
295     bw_dimensions & d = bm.d;
296 
297     /* we have to start with something large enough to get all
298      * coefficients of E_right correct */
299     size_t half = E.get_size() - (E.get_size() / 2);
300     // unsigned int pi_expect = expected_pi_length(d, bm.delta, E.get_size());
301     unsigned int pi_expect_lowerbound = expected_pi_length_lowerbound(d, E.get_size());
302     unsigned int pi_left_expect = expected_pi_length(d, bm.delta, half);
303     unsigned int pi_left_expect_lowerbound = expected_pi_length_lowerbound(d, half);
304     unsigned int pi_left_expect_used_for_shift = MIN(pi_left_expect, half + 1);
305 
306 
307     matpoly_type E_left = E.truncate_and_rshift(half, half + 1 - pi_left_expect_used_for_shift);
308 
309     // this consumes E_left entirely.
310     matpoly_type pi_left = matpoly_diverter<matpoly_type>::callback(bm, E_left);
311 
312     ASSERT_ALWAYS(pi_left.get_size());
313 
314     if (bm.done) {
315         return pi_left;
316     }
317 
318     truncate_overflow(bm, pi_left, pi_left_expect);
319 
320     ASSERT_ALWAYS(bm.done || pi_left.get_size() >= pi_left_expect_lowerbound);
321 
322     /* XXX I don't understand why I need to do this. It seems to me that
323      * MP(XA, B) and MP(A, B) should be identical whenever deg A > deg B.
324      */
325     ASSERT_ALWAYS(pi_left_expect_used_for_shift >= pi_left.get_size());
326     if (pi_left_expect_used_for_shift != pi_left.get_size()) {
327         E.rshift(E, pi_left_expect_used_for_shift - pi_left.get_size());
328         /* Don't shrink_to_fit at this point, because we've only made a
329          * minor adjustment. */
330     }
331 
332     matpoly_type E_right = E.similar_shell();
333 
334     if (!load_checkpoint_file(bm, LINGEN_CHECKPOINT_E, E_right, bm.t, bm.t+E.get_size()-pi_left.get_size()+1)) {
335         logline_begin(stdout, z, "t=%u %*s%sMP(%s, %zu, %zu) -> %zu",
336                 bm.t, depth,"", matpoly_diverter<matpoly_type>::prefix,
337                 C0.mp.fft_name(),
338                 E.get_size(), pi_left.get_size(), E.get_size() - pi_left.get_size() + 1);
339 
340         E_right = generic_mp(E, pi_left, bm, bm.companion(depth, z));
341         logline_end(&bm.t_mp, "");
342     }
343     E.clear();
344 
345 
346     unsigned int pi_right_expect = expected_pi_length(d, bm.delta, E_right.get_size());
347     unsigned int pi_right_expect_lowerbound = expected_pi_length_lowerbound(d, E_right.get_size());
348 
349     matpoly_type pi_right = matpoly_diverter<matpoly_type>::callback(bm, E_right);
350 
351     truncate_overflow(bm, pi_right, pi_right_expect);
352 
353     ASSERT_ALWAYS(bm.done || pi_right.get_size() >= pi_right_expect_lowerbound);
354 
355     logline_begin(stdout, z, "t=%u %*s%sMUL(%s, %zu, %zu) -> %zu",
356             bm.t, depth, "", matpoly_diverter<matpoly_type>::prefix,
357             C0.mul.fft_name(),
358             pi_left.get_size(), pi_right.get_size(), pi_left.get_size() + pi_right.get_size() - 1);
359 
360     matpoly_type pi = generic_mul(pi_left, pi_right, bm, bm.companion(depth, z));
361     pi_left.clear();
362     pi_right.clear();
363 
364     /* Note that the leading coefficients of pi_left and pi_right are not
365      * necessarily full-rank, so that we have to fix potential zeros. If
366      * we don't, the degree of pi artificially grows with the recursive
367      * level.
368      */
369     unsigned int pisize = pi.get_size();
370 #if 0
371     /* The asserts below are bogus. In fact, it's not entirely impossible
372      * that pi grows more than what we had expected on entry, e.g. if we
373      * have one early generator. So we can't just do this. Most of the
374      * time it will work, but we can't claim that it will always work.
375      *
376      * One possible sign is when the entry deltas are somewhat even, and
377      * the result deltas are unbalanced.
378      *
379      * It might also be worthwhile to do this check by column, and look
380      * at both degree and valuation.
381      */
382     for(; pisize > pi_expect ; pisize--) {
383         /* These coefficients really must be zero */
384         ASSERT_ALWAYS(pi.coeff_is_zero(pisize - 1));
385     }
386     ASSERT_ALWAYS(pisize <= pi_expect);
387 #endif
388     /* Now below pi_expect, it's not impossible to have a few
389      * cancellations as well.
390      */
391     for(; pisize ; pisize--) {
392         if (!pi.coeff_is_zero(pisize - 1)) break;
393     }
394     pi.set_size(pisize);
395     ASSERT_ALWAYS(bm.done || pisize >= pi_expect_lowerbound);
396 
397     logline_end(&bm.t_mul, "");
398 
399     return pi;
400 }/*}}}*/
401 
402 
bw_lingen_single_nocp(bmstatus & bm,matpoly & E)403 matpoly bw_lingen_single_nocp(bmstatus & bm, matpoly & E) /*{{{*/
404 {
405     int rank;
406     MPI_Comm_rank(bm.com[0], &rank);
407     ASSERT_ALWAYS(!rank);
408     matpoly pi;
409 
410     lingen_call_companion C = bm.companion(bm.depth(), E.get_size());
411 
412     // ASSERT_ALWAYS(E.size < bm.lingen_mpi_threshold);
413 
414     // fprintf(stderr, "Enter %s\n", __func__);
415     if (!bm.recurse(E.get_size())) {
416         tree_stats::transition_sentinel dummy(bm.stats, "recursive_threshold", E.get_size(), C.total_ncalls);
417         bm.t_basecase -= seconds();
418         E.clear_high_word();
419         pi = bw_lingen_basecase(bm, E);
420         bm.t_basecase += seconds();
421     } else {
422         pi = bw_lingen_recursive(bm, E);
423     }
424     // fprintf(stderr, "Leave %s\n", __func__);
425 
426     return pi;
427 }/*}}}*/
bw_lingen_single(bmstatus & bm,matpoly & E)428 matpoly bw_lingen_single(bmstatus & bm, matpoly & E) /*{{{*/
429 {
430     int rank;
431     MPI_Comm_rank(bm.com[0], &rank);
432     ASSERT_ALWAYS(!rank);
433     unsigned int t0 = bm.t;
434     unsigned int t1 = bm.t + E.get_size();
435     matpoly pi;
436 
437     save_checkpoint_file(bm, LINGEN_CHECKPOINT_E, E, t0, t1);
438 
439     if (load_checkpoint_file(bm, LINGEN_CHECKPOINT_PI, pi, t0, t1))
440         return pi;
441 
442     pi = bw_lingen_single_nocp(bm, E);
443 
444     save_checkpoint_file(bm, LINGEN_CHECKPOINT_PI, pi, t0, t1);
445 
446     return pi;
447 }/*}}}*/
448 
bw_biglingen_collective(bmstatus & bm,bigmatpoly & E)449 bigmatpoly bw_biglingen_collective(bmstatus & bm, bigmatpoly & E)/*{{{*/
450 {
451     /* as for bw_lingen_single, we're tempted to say that we're just a
452      * trampoline. In fact, it's not really satisfactory: we're really
453      * doing stuff here. In a sense though, it's not *that much* of a
454      * trouble, because the mpi threshold will be low enough that doing
455      * our full job here is not too much of a problem.
456      */
457     bw_dimensions & d = bm.d;
458     abdst_field ab = d.ab;
459     unsigned int m = d.m;
460     unsigned int n = d.n;
461     unsigned int b = m + n;
462     int rank;
463     int size;
464     MPI_Comm_rank(bm.com[0], &rank);
465     MPI_Comm_size(bm.com[0], &size);
466     unsigned int t0 = bm.t;
467     unsigned int t1 = bm.t + E.get_size();
468     bigmatpoly_model const& model(E);
469     int depth = bm.depth();
470     size_t z = E.get_size();
471 
472     lingen_call_companion C = bm.companion(depth, z);
473     bool go_mpi = C.go_mpi();
474     // bool go_mpi = E.get_size() >= bm.lingen_mpi_threshold;
475 
476     bigmatpoly pi(model);
477 
478     save_checkpoint_file(bm, LINGEN_CHECKPOINT_E, E, t0, t1);
479 
480     if (load_checkpoint_file(bm, LINGEN_CHECKPOINT_PI, pi, t0, t1))
481         return pi;
482 
483     // fprintf(stderr, "Enter %s\n", __func__);
484     if (go_mpi) {
485         pi = bw_lingen_recursive(bm, E);
486     } else {
487         /* Fall back to local code */
488         /* This entails gathering E locally, computing pi locally, and
489          * dispathing it back. */
490 
491         tree_stats::transition_sentinel dummy(bm.stats, "mpi_threshold", E.get_size(), C.total_ncalls);
492 
493         matpoly sE(ab, m, b, E.get_size());
494         matpoly spi;
495 
496         double expect0 = bm.hints.tt_gather_per_unit * E.get_size();
497         bm.stats.plan_smallstep("gather(L+R)", expect0);
498         bm.stats.begin_smallstep("gather(L+R)");
499         E.gather_mat(sE);
500         E = bigmatpoly(model);
501         bm.stats.end_smallstep();
502 
503         /* Only the master node does the local computation */
504         if (!rank)
505             spi = bw_lingen_single_nocp(bm, sE);
506 
507         double expect1 = bm.hints.tt_scatter_per_unit * z;
508         bm.stats.plan_smallstep("scatter(L+R)", expect1);
509         bm.stats.begin_smallstep("scatter(L+R)");
510         pi = bigmatpoly(ab, model, b, b, 0);
511         pi.scatter_mat(spi);
512         MPI_Bcast(&bm.done, 1, MPI_INT, 0, bm.com[0]);
513         MPI_Bcast(bm.delta.data(), b, MPI_UNSIGNED, 0, bm.com[0]);
514         MPI_Bcast(bm.lucky.data(), b, MPI_UNSIGNED, 0, bm.com[0]);
515         MPI_Bcast(&(bm.t), 1, MPI_UNSIGNED, 0, bm.com[0]);
516         /* Don't forget to broadcast delta from root node to others ! */
517         bm.stats.end_smallstep();
518     }
519     // fprintf(stderr, "Leave %s\n", __func__);
520 
521     save_checkpoint_file(bm, LINGEN_CHECKPOINT_PI, pi, t0, t1);
522 
523     MPI_Barrier(bm.com[0]);
524 
525     return pi;
526 }/*}}}*/
527 
528 /*}}}*/
529 
530 /**********************************************************************/
531 
532 /**********************************************************************/
count_lucky_columns(bmstatus & bm)533 unsigned int count_lucky_columns(bmstatus & bm)/*{{{*/
534 {
535     bw_dimensions & d = bm.d;
536     unsigned int m = d.m;
537     unsigned int n = d.n;
538     unsigned int b = m + n;
539     int luck_mini = expected_pi_length(d);
540     MPI_Bcast(bm.lucky.data(), b, MPI_UNSIGNED, 0, bm.com[0]);
541     unsigned int nlucky = 0;
542     for(unsigned int j = 0 ; j < b ; nlucky += bm.lucky[j++] >= luck_mini) ;
543     return nlucky;
544 }/*}}}*/
545 
check_luck_condition(bmstatus & bm)546 int check_luck_condition(bmstatus & bm)/*{{{*/
547 {
548     bw_dimensions & d = bm.d;
549     unsigned int m = d.m;
550     unsigned int n = d.n;
551     unsigned int nlucky = count_lucky_columns(bm);
552 
553     int rank;
554     MPI_Comm_rank(bm.com[0], &rank);
555 
556     if (!rank) {
557         printf("Number of lucky columns: %u (%u wanted)\n", nlucky, n);
558     }
559 
560     if (nlucky == n)
561         return 1;
562 
563     if (!rank) {
564         fprintf(stderr, "Could not find the required set of solutions (nlucky=%u)\n", nlucky);
565     }
566     if (random_input_length) {
567         static int once=0;
568         if (once++) {
569             if (!rank) {
570                 fprintf(stderr, "Solution-faking loop crashed\n");
571             }
572             MPI_Abort(bm.com[0], EXIT_FAILURE);
573         }
574         if (!rank) {
575             printf("Random input: faking successful computation\n");
576         }
577         for(unsigned int j = 0 ; j < n ; j++) {
578             unsigned int s = (j * 1009) % (m+n);
579             bm.lucky[s]  = expected_pi_length(d);
580             bm.delta[s] -= expected_pi_length(d);
581         }
582         return check_luck_condition(bm);
583     }
584 
585     return 0;
586 }/*}}}*/
587 
print_node_assignment(MPI_Comm comm)588 void print_node_assignment(MPI_Comm comm)/*{{{*/
589 {
590     int rank;
591     int size;
592     MPI_Comm_size(comm, &size);
593     MPI_Comm_rank(comm, &rank);
594 
595     struct utsname me[1];
596     int rc = uname(me);
597     if (rc < 0) { perror("uname"); MPI_Abort(comm, 1); }
598     size_t sz = 1 + sizeof(me->nodename);
599     char * global = (char*) malloc(size * sz);
600     memset(global, 0, size * sz);
601     memcpy(global + rank * sz, me->nodename, sizeof(me->nodename));
602 
603     MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
604             global, sz, MPI_BYTE, comm);
605     if (rank == 0) {
606         char name[80];
607         int len=80;
608         MPI_Comm_get_name(comm, name, &len);
609         name[79]=0;
610         for(int i = 0 ; i < size ; i++) {
611             printf("# %s rank %d: %s\n", name, i, global + i * sz);
612         }
613     }
614     free(global);
615 }/*}}}*/
616 
617 /* Counting memory usage in the recursive algorithm.
618  *
619  * The recursive algorithm is designed to allow the allocated memory for
620  * the input to be reused for placing the output. Some memory might have
621  * been saved by upper layers. We also have some local allocation.
622  *
623  * Notations: The algorithm starts at depth 0 with an
624  * input length L, and the notation \ell_i denotes L/2^(i+1). We have
625  * \ell_i=2\ell_{i+1}. The notation \alpha denotes m/(m+n). Note that the
626  * input has size \alpha*(1-\alpha)*L times (m+n)^2*\log_2(p) (divided by
627  * r^2 if relevant).
628  *
629  * We define five quantities. All are understood as multiples of
630  * (m+n)^2*\log_2(p).
631  *
632  * MP(i) is the extra storage needed for the MP operation at depth i.
633  *
634  * MUL(i) is the extra storage needed for the MUL operation at depth i.
635  *
636  * IO(i) is the common size of the input and output data of the call at
637  *       depth i. We have
638  *              IO(i) = 2\alpha\ell_i
639  *
640  * ST(i) is the storage *at all levels above the current one* (i.e. with
641  *    depth strictly less than i) for the data that is still live and
642  *    need to exist until after we return. This count is maximized in the
643  *    leftmost branch, where chopped E at all levels must be kept.
644  *    chopped E at depth i (not counted in ST(i) !) is:
645  *          \alpha(1+\alpha) \ell_i
646  *    (counted as the degree it takes to make the necessary data that
647  *    we want to use to compute E_right),
648  *    so the cumulated cost above is twice the depth 0 value, minus the
649  *    depth i value, i.e.
650  *              ST(i) = \alpha(1+\alpha)(L-2\ell_i).
651  * SP(i) is the "spike" at depth i: not counting allocation that is
652  *    already reserved for IO or ST, this is the amount of extra memory
653  *    that is required by the call at depth i. We have:
654  *      SP(i) = max {
655  *              \alpha\ell_i,
656  *              \alpha\ell_i-2\alpha\ell_i+\alpha(1+\alpha)\ell_i+SP(i+1),
657  *             2\alpha\ell_i-2\alpha\ell_i+\alpha(1+\alpha)\ell_i+MP(i),
658  *             2\alpha\ell_i-2\alpha\ell_i+SP(i+1)
659  *             4\alpha\ell_i-2\alpha\ell_i+MUL(i)
660  *             }
661  *            = max {
662  *              \alpha\ell_i-2\alpha\ell_i+\alpha(1+\alpha)\ell_i+SP(i+1),
663  *             2\alpha\ell_i-2\alpha\ell_i+\alpha(1+\alpha)\ell_i+MP(i),
664  *             4\alpha\ell_i-2\alpha\ell_i+MUL(i)
665  *                           }
666  *
667  * Combining this together, and using
668  * ST(i)+\alpha(1+\alpha)\ell_i=ST(i+1), we have:
669  *
670  * IO(i)+ST(i)+SP(i) = max {
671  *              IO(i+1)+ST(i+1)+SP(i+1),
672  *              ST(i) + 2\alpha\ell_i+\alpha(1+\alpha)\ell_i+MP(i),
673  *              ST(i) + 4\alpha\ell_i+MUL(i)
674  *                      }
675  *                   = max {
676  *              IO(i+1)+ST(i+1)+SP(i+1),
677  *              \alpha(1+\alpha)(L-\ell_i)  + 2\alpha\ell_i + MP(i),
678  *              \alpha(1+\alpha)(L-2\ell_i) + 4\alpha\ell_i + MUL(i)
679  *                      }
680  *                   = max {
681  *              IO(i+1)+ST(i+1)+SP(i+1),
682  *              \alpha((1+\alpha)L+(1-\alpha)\ell_i) + MP(i),
683  *              \alpha((1+\alpha)L+2(1-\alpha)\ell_i) + MUL(i),
684  *                      }
685  *
686  * Let RMP(i) be the amount of memory that is reserved while we are doing
687  * the MP operation, and define RMUL similarly. We have:
688  *      RMP(i)  = \alpha(1+\alpha)(L-\ell_i)  + 2\alpha\ell_i
689  *      RMUL(i) = \alpha(1+\alpha)(L-2\ell_i) + 4\alpha\ell_i
690  * whence:
691  *      RMP(i) = \alpha((1+\alpha)L+(1-\alpha)\ell_i)
692  *      RMUL(i) = \alpha((1+\alpha)L+2(1-\alpha)\ell_i)
693  *
694  * We have RMP(i) <= RMUL(i) <= RMP(0) <= RMUL(0) = 2\alpha*L. We'll use
695  * the un-simplified expression later.
696  *
697  * Furthermore IO(infinity)=SP(infinity)=0, and ST(infinity)=\alpha(1+\alpha)L
698  *
699  * So that eventually, the amount of reserved memory for the whole
700  * algorithm is RMUL(0)=2\alpha*L (which is 2/(1-\alpha)=2*(1+m/n) times
701  * the input size). On top of that we have the memory required
702  * for the transforms.
703  *
704  *
705  * When going MPI, matrices may be rounded with some inaccuracy.
706  * Splitting in two a 3x3 matrix leads to a 2x2 chunk, which is 1.77
707  * times more than the simplistic proportionality rule.
708  *
709  * Therefore it makes sense to distinguish between matrices of size
710  * m*(m+n) and (m+n)*(m+n). If we recompute RMUL(i) by taking this into
711  * account, we obtain:
712  *      [m/r][(m+n)/r][(1+\alpha)(L-2\ell_i)] + [(m+n)/r]^2*[4\alpha\ell_i]
713  * where we only paid attention to the rounding issues with dimensions,
714  * as those are more important than for degrees. Bottom line, the max is
715  * expected to be for i=0, and that will be made only of pi matrices.
716  */
717 
718 /* Some of the early reading must be done before we even start, since
719  * the code that we run depends on the input size.
720  */
721 
722 /* We don't have a header file for this one */
723 extern "C" void check_for_mpi_problems();
724 
wrapped_main(int argc,char * argv[])725 int wrapped_main(int argc, char *argv[])
726 {
727 
728     cxx_param_list pl;
729 
730     bw_common_decl_usage(pl);
731     lingen_decl_usage(pl);
732     logline_decl_usage(pl);
733     lingen_tuning_decl_usage(pl);
734     lingen_checkpoint::decl_usage(pl);
735     lingen_io_matpoly_decl_usage(pl);
736     tree_stats::declare_usage(pl);
737 
738     bw_common_parse_cmdline(bw, pl, &argc, &argv);
739 
740     bw_common_interpret_parameters(bw, pl);
741     /* {{{ interpret our parameters */
742     gmp_randinit_default(rstate);
743 
744     int rank;
745     int size;
746     MPI_Comm_size(MPI_COMM_WORLD, &size);
747     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
748 
749     param_list_parse_int(pl, "allow_zero_on_rhs", &allow_zero_on_rhs);
750     param_list_parse_uint(pl, "random-input-with-length", &random_input_length);
751     param_list_parse_uint(pl, "input-length", &input_length);
752     param_list_parse_int(pl, "split-output-file", &split_output_file);
753     param_list_parse_int(pl, "split-input-file", &split_input_file);
754 
755     const char * afile = param_list_lookup_string(pl, "afile");
756 
757     if (bw->m == -1) {
758 	fprintf(stderr, "no m value set\n");
759 	exit(EXIT_FAILURE);
760     }
761     if (bw->n == -1) {
762 	fprintf(stderr, "no n value set\n");
763 	exit(EXIT_FAILURE);
764     }
765     if (!global_flag_tune && !(afile || random_input_length)) {
766         fprintf(stderr, "No afile provided\n");
767         exit(EXIT_FAILURE);
768     }
769 
770     /* we allow ffile and ffile to be both NULL */
771     const char * tmp = param_list_lookup_string(pl, "ffile");
772     char * ffile = NULL;
773     if (tmp) {
774         ffile = strdup(tmp);
775     } else if (afile) {
776         int rc = asprintf(&ffile, "%s.gen", afile);
777         ASSERT_ALWAYS(rc >= 0);
778     }
779     ASSERT_ALWAYS((afile==NULL) == (ffile == NULL));
780 
781     bmstatus bm(bw->m, bw->n);
782     bw_dimensions & d = bm.d;
783 
784     const char * rhs_name = param_list_lookup_string(pl, "rhs");
785     if (!global_flag_tune && !random_input_length) {
786         if (!rhs_name) {
787             fprintf(stderr, "# When using lingen, you must either supply --random-input-with-length, or provide a rhs, or possibly provide rhs=none\n");
788         } else if (strcmp(rhs_name, "none") == 0) {
789             rhs_name = NULL;
790         }
791     }
792     if (param_list_parse_uint(pl, "nrhs", &(bm.d.nrhs)) && rhs_name) {
793         fprintf(stderr, "# the command line arguments rhs= and nrhs= are incompatible\n");
794         exit(EXIT_FAILURE);
795     }
796     if (rhs_name && strcmp(rhs_name, "none") != 0) {
797         if (!rank)
798             get_rhs_file_header(rhs_name, NULL, &(bm.d.nrhs), NULL);
799         MPI_Bcast(&bm.d.nrhs, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
800     }
801 
802     abfield_init(d.ab);
803     abfield_specify(d.ab, MPFQ_PRIME_MPZ, bw->p);
804 
805     gmp_randseed_ui(rstate, bw->seed);
806 
807 #if 0
808     bm.lingen_threshold = 10;
809     bm.lingen_mpi_threshold = 1000;
810     param_list_parse_uint(pl, "lingen_threshold", &(bm.lingen_threshold));
811     param_list_parse_uint(pl, "lingen_mpi_threshold", &(bm.lingen_mpi_threshold));
812     if (bm.lingen_mpi_threshold < bm.lingen_threshold) {
813         bm.lingen_mpi_threshold = bm.lingen_threshold;
814         fprintf(stderr, "Argument fixing: setting lingen_mpi_threshold=%u (because lingen_threshold=%u)\n",
815                 bm.lingen_mpi_threshold, bm.lingen_threshold);
816     }
817 
818 
819 #if defined(FAKEMPI_H_)
820     bm.lingen_mpi_threshold = UINT_MAX;
821 #endif
822 #endif
823 
824     /* }}} */
825 
826     /* TODO: we should rather use lingen_platform.
827      */
828     /* {{{ Parse MPI args. Make bm.com[0] a better mpi communicator */
829     bm.mpi_dims[0] = 1;
830     bm.mpi_dims[1] = 1;
831     param_list_parse_intxint(pl, "mpi", bm.mpi_dims);
832     {
833         /* Display node index wrt MPI_COMM_WORLD */
834         print_node_assignment(MPI_COMM_WORLD);
835 
836         /* Reorder all mpi nodes so that each node gets the given number
837          * of jobs, but close together.
838          */
839         int mpi[2] = { bm.mpi_dims[0], bm.mpi_dims[1], };
840         int thr[2] = {1,1};
841 #ifdef  HAVE_OPENMP
842         if (param_list_parse_intxint(pl, "thr", thr)) {
843             if (!rank)
844                 printf("# Limiting number of openmp threads to %d\n",
845                         thr[0] * thr[1]);
846             omp_set_num_threads(thr[0] * thr[1]);
847         }
848 #else
849         if (param_list_parse_intxint(pl, "thr", thr)) {
850             if (thr[0]*thr[1] != 1) {
851                 if (!rank) {
852                     fprintf(stderr, "This program only wants openmp for multithreading. Ignoring thr argument.\n");
853                 }
854                 param_list_add_key(pl, "thr", "1x1", PARAMETER_FROM_CMDLINE);
855             }
856         }
857 #endif
858 
859 #ifdef  FAKEMPI_H_
860         if (mpi[0]*mpi[1] > 1) {
861             fprintf(stderr, "non-trivial option mpi= can't be used with fakempi. Please do an MPI-enabled build (MPI=1)\n");
862             exit(EXIT_FAILURE);
863         }
864 #endif
865         if (!rank)
866             printf("# size=%d mpi=%dx%d thr=%dx%d\n", size, mpi[0], mpi[1], thr[0], thr[1]);
867         ASSERT_ALWAYS(size == mpi[0] * mpi[1]);
868         if (bm.mpi_dims[0] != bm.mpi_dims[1]) {
869             if (!rank)
870                 fprintf(stderr, "The current lingen code is limited to square splits ; here, we received a %d x %d split, which will not work\n",
871                     bm.mpi_dims[0], bm.mpi_dims[1]);
872             abort();
873         }
874         int irank = rank / mpi[1];
875         int jrank = rank % mpi[1];
876         bm.com[0] = MPI_COMM_WORLD;
877         /* MPI Api has some very deprecated prototypes */
878         MPI_Comm_set_name(bm.com[0], (char*) "world");
879 
880         char commname[32];
881         snprintf(commname, sizeof(commname), "row%d\n", irank);
882         MPI_Comm_split(MPI_COMM_WORLD, irank, jrank, &(bm.com[1]));
883         MPI_Comm_set_name(bm.com[1], commname);
884 
885         snprintf(commname, sizeof(commname), "col%d\n", jrank);
886         MPI_Comm_split(MPI_COMM_WORLD, jrank, irank, &(bm.com[2]));
887         MPI_Comm_set_name(bm.com[2], commname);
888 
889         print_node_assignment(bm.com[0]);
890 
891         constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1;
892         if ((bm.d.m + bm.d.n) / simd < (unsigned int) mpi[0]) {
893             printf("########################################################\n");
894             printf("# Warning: this run will leave some resources idle:\n"
895                    "# the matrices of size %u*%u and %u*%u can be split into\n"
896                    "# chunks of minimal size %u, whence an mpi split over %d*%d is useless\n",
897                    bm.d.m,
898                    bm.d.m + bm.d.n,
899                    bm.d.m + bm.d.n,
900                    bm.d.m + bm.d.n,
901                    simd,
902                    mpi[0],
903                    mpi[0]);
904             printf("########################################################\n");
905         }
906     }
907     /* }}} */
908 
909     /* lingen tuning accepts some arguments. We look them up so as to
910      * avoid failures down the line */
911     lingen_tuning_lookup_parameters(pl);
912 
913     tree_stats::interpret_parameters(pl);
914     logline_interpret_parameters(pl);
915     lingen_checkpoint::interpret_parameters(pl);
916     lingen_io_matpoly_interpret_parameters(pl);
917 
918     if (param_list_warn_unused(pl)) {
919         int rank;
920         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
921         if (!rank) param_list_print_usage(pl, bw->original_argv[0], stderr);
922         MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
923     }
924 
925     /* TODO: read the a files in scattered mode */
926 
927     /* Don't police memory right now, we don't care */
928     matpoly::memory_guard main_memory(SIZE_MAX);
929 
930     std::unique_ptr<lingen_input_wrapper_base> A_series;
931 
932     if (random_input_length) {
933         A_series = std::unique_ptr<lingen_input_wrapper_base>(new lingen_random_input(bm.d.ab, bm.d.m, bm.d.n, rstate, random_input_length));
934     } else {
935         A_series = std::unique_ptr<lingen_input_wrapper_base>(new lingen_file_input(bm.d.ab, bm.d.m, bm.d.n, afile, global_flag_ascii, input_length));
936     }
937 
938 #ifdef SELECT_MPFQ_LAYER_u64k1
939 #define K_elts_to_bytes(x)      (iceildiv((x),ULONG_BITS) * sizeof(unsigned long))
940 #else
941 #define K_elts_to_bytes(x)      (abvec_elt_stride(bm.d.ab, (x)))
942 #endif
943 
944     /* run the mpi problem detection only if we're certain that we're at
945      * least close to the ballpark where this sort of checks make sense.
946      */
947     if (K_elts_to_bytes((size_t) A_series->guessed_length() * (size_t) (bm.d.m + bm.d.n)) >= (1 << 28)) {
948         check_for_mpi_problems();
949     }
950     MPI_Barrier(MPI_COMM_WORLD);
951 
952     /* This will cause the initial read */
953     std::unique_ptr<lingen_E_from_A> E_series = std::unique_ptr<lingen_E_from_A>(new lingen_E_from_A(bm.d, *A_series));
954 
955     bm.t = E_series->t0;
956 
957     size_t L = E_series->guessed_length();
958 
959     {
960         matpoly::memory_guard blanket(SIZE_MAX);
961 #ifndef SELECT_MPFQ_LAYER_u64k1
962         typename matpoly_ft<fft_transform_info>::memory_guard blanket_ft(SIZE_MAX);
963 #else
964         typename matpoly_ft<gf2x_cantor_fft_info>::memory_guard blanket_ft(SIZE_MAX);
965         typename matpoly_ft<gf2x_ternary_fft_info>::memory_guard blanket_ft2(SIZE_MAX);
966 #endif
967         try {
968             bm.hints = lingen_tuning(bm.d, L - bm.t, bm.com[0], pl);
969         } catch (std::overflow_error const & e) {
970             fputs(e.what(), stderr);
971             MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
972         }
973     }
974 
975     if (global_flag_tune) {
976         MPI_Finalize();
977         exit(EXIT_SUCCESS);
978     }
979 
980     size_t safe_guess = global_flag_ascii ? ceil(1.05 * L) : L;
981 
982     /* c0 is (1+m/n) times the input size */
983     size_t c0 = K_elts_to_bytes(
984                 iceildiv(bm.d.m + bm.d.n, bm.mpi_dims[0]) *
985                 iceildiv(bm.d.m + bm.d.n, bm.mpi_dims[1]) *
986                 iceildiv(bm.d.m*safe_guess, bm.d.m+bm.d.n));
987     matpoly::memory_guard main(2*c0);
988 
989     if (!rank) {
990         char buf[20];
991         printf("# Estimated memory for JUST transforms (per node): %s\n",
992                 size_disp(2*c0, buf));
993         printf("# Estimated peak total memory (per node): max at depth %d: %s\n",
994                 bm.hints.ipeak,
995                 size_disp(bm.hints.peak, buf));
996     }
997 
998     int go_mpi = bm.companion(0, L).go_mpi();
999 
1000     if (go_mpi) {
1001         if (!rank) {
1002             if (size > 1) {
1003                 printf("Expected length %zu exceeds MPI threshold,"
1004                        " going MPI now.\n",
1005                        L);
1006             } else {
1007                 printf("Expected length %zu exceeds MPI threshold, "
1008                        "but the process is not running in an MPI context.\n",
1009                        L);
1010             }
1011         }
1012         MPI_Barrier(bm.com[0]);
1013     }
1014 
1015     std::unique_ptr<lingen_output_wrapper_base> Fdst;
1016     std::unique_ptr<lingen_output_wrapper_base> Fdst_rhs;
1017 
1018     if (random_input_length) {
1019         Fdst = std::unique_ptr<lingen_output_wrapper_base>(new lingen_output_to_sha1sum(bm.d.ab, bm.d.n, bm.d.n, "F"));
1020         Fdst_rhs = std::unique_ptr<lingen_output_wrapper_base>(new lingen_output_to_sha1sum(bm.d.ab, bm.d.nrhs, bm.d.n, "Frhs"));
1021     } else if (split_output_file) {
1022         std::string pattern = ffile;
1023         Fdst = std::unique_ptr<lingen_output_wrapper_base>(new lingen_output_to_splitfile(bm.d.ab, bm.d.n, bm.d.n, pattern + ".sols{2}-{3}.{0}-{1}", global_flag_ascii));
1024         Fdst_rhs = std::unique_ptr<lingen_output_wrapper_base>(new lingen_output_to_splitfile(bm.d.ab, bm.d.nrhs, bm.d.n, pattern + ".sols{2}-{3}.{0}-{1}.rhs", global_flag_ascii));
1025     } else {
1026         Fdst = std::unique_ptr<lingen_output_wrapper_base>(new lingen_output_to_singlefile(bm.d.ab, bm.d.n, bm.d.n, ffile, global_flag_ascii));
1027         Fdst_rhs = std::unique_ptr<lingen_output_wrapper_base>(new lingen_output_to_singlefile(bm.d.ab, bm.d.nrhs, bm.d.n, std::string(ffile) + ".rhs", global_flag_ascii));
1028     }
1029 
1030     if (go_mpi && size > 1) {
1031         bigmatpoly_model model(bm.com, bm.mpi_dims[0], bm.mpi_dims[1]);
1032         bigmatpoly E(bm.d.ab, model, bm.d.m, bm.d.m + bm.d.n, safe_guess);
1033         lingen_scatter<bigmatpoly> fill_E(E);
1034         lingen_F0 F0 = *E_series;
1035         pipe(*E_series, fill_E, "Read");
1036         bm.delta.assign(bm.d.m + bm.d.n, F0.t0);
1037         logline_init_timer();
1038         bigmatpoly pi = bw_biglingen_collective(bm, E);
1039         bm.stats.final_print();
1040         bm.display_deltas();
1041         if (!rank) printf("(pi.alloc = %zu)\n", pi.my_cell().capacity());
1042         constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1;
1043         pi.zero_pad(simd * iceildiv(pi.get_size(), simd));
1044         if (check_luck_condition(bm)) {
1045             lingen_gather_reverse<bigmatpoly> read_PI(pi);
1046             lingen_F_from_PI Fsrc(bm, read_PI, F0);
1047             pipe(Fsrc, *Fdst, "Written", true);
1048             Fsrc.write_rhs(*Fdst_rhs);
1049         }
1050     } else if (!rank) {
1051         /* We do this only in the rank==0 case, since we have really
1052          * nothing to do at the other ranks.
1053          */
1054 
1055         /* We don't want to bother with memory problems in the non-mpi
1056          * case when the tuning was done for MPI: this is because the
1057          * per-transform ram was computed in the perspective of an MPI
1058          * run, and not for a plain run.
1059          */
1060 #ifndef SELECT_MPFQ_LAYER_u64k1
1061         typename matpoly_ft<fft_transform_info>::memory_guard blanket_ft(SIZE_MAX);
1062 #else
1063         typename matpoly_ft<gf2x_cantor_fft_info>::memory_guard blanket_ft(SIZE_MAX);
1064         typename matpoly_ft<gf2x_ternary_fft_info>::memory_guard blanket_ft2(SIZE_MAX);
1065 #endif
1066         matpoly E(bm.d.ab, bm.d.m, bm.d.m + bm.d.n, safe_guess);
1067         lingen_scatter<matpoly> fill_E(E);
1068         lingen_F0 F0 = *E_series;
1069         pipe(*E_series, fill_E, "Read");
1070         bm.delta.assign(bm.d.m + bm.d.n, F0.t0);
1071         logline_init_timer();
1072         matpoly pi = bw_lingen_single(bm, E);
1073         bm.stats.final_print();
1074         bm.display_deltas();
1075         if (!rank) printf("(pi.alloc = %zu)\n", pi.capacity());
1076         constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1;
1077         pi.zero_pad(simd * iceildiv(pi.get_size(), simd));
1078         if (check_luck_condition(bm)) {
1079             lingen_gather_reverse<matpoly> read_PI(pi);
1080             lingen_F_from_PI Fsrc(bm, read_PI, F0);
1081             pipe(Fsrc, *Fdst, "Written", true);
1082             Fsrc.write_rhs(*Fdst_rhs);
1083         }
1084     }
1085 
1086     if (!rank && random_input_length) {
1087         printf("t_basecase = %.2f\n", bm.t_basecase);
1088         printf("t_mp = %.2f\n", bm.t_mp);
1089         printf("t_mul = %.2f\n", bm.t_mul);
1090         printf("t_cp_io = %.2f\n", bm.t_cp_io);
1091         long peakmem = PeakMemusage();
1092         if (peakmem > 0)
1093             printf("# PeakMemusage (MB) = %ld (VmPeak: can be misleading)\n", peakmem >> 10);
1094     }
1095 
1096     abfield_clear(d.ab);
1097     if (ffile) free(ffile);
1098 
1099     gmp_randclear(rstate);
1100 
1101     return 0;   // ignored.
1102 }
1103 
1104 /* We do this so that the dtors of the data that gets allocated within
1105  * main are allowed to use MPI_Comm_rank.
1106  */
1107 // coverity[root_function]
main(int argc,char * argv[])1108 int main(int argc, char *argv[])
1109 {
1110     bw_common_init(bw, &argc, &argv);
1111     wrapped_main(argc, argv);
1112     bw_common_clear(bw);
1113     return rank0_exit_code;
1114 }
1115 
1116 /* vim:set sw=4 sta et: */
1117