1 /* Copyright (C) 1999--2007 Emmanuel Thom'e --- see LICENSE file */
2 #include "cado.h" // IWYU pragma: keep
3 // IWYU pragma: no_include <sys/param.h>
4 #include <cstdint> // for SIZE_MAX
5 #include <cmath> // for ceil
6 #include <cstdio> // for printf, fprintf, size_t
7 #include <cstdlib> // for exit, free, EXIT_FAILURE
8 #include <cstring> // for strcmp, memcpy, memset, strdup
9
10 #include <algorithm> // for fill
11 #include <fstream> // for basic_ostream::write
12 #include <memory> // for unique_ptr
13 #include <stdexcept> // for runtime_error, overflow_error
14 #include <string> // for operator+, string
15 #include <vector> // for vector
16
17 #include <sys/utsname.h> // for uname, utsname
18 #include <gmp.h> // for gmp_randclear, gmp_randinit...
19
20 #include "bw-common.h" // for bw, bw_common_clear, bw_com...
21 #include "fmt/core.h" // for check_format_string
22 #include "fmt/format.h" // for basic_buffer::append, basic...
23 #include "fmt/printf.h" // IWYU pragma: keep
24 #ifdef SELECT_MPFQ_LAYER_u64k1
25 #include "gf2x-fft.h" // for gf2x_cantor_fft_info
26 #include "gf2x-ternary-fft.h" // for gf2x_ternary_fft_info
27 #else
28 #include "flint-fft/transform_interface.h" // fft_transform_info
29 #endif
30 #include "lingen_abfield.hpp" // for abfield_clear, abfield_init
31 #include "lingen_bigmatpoly.hpp" // for bigmatpoly, bigmatpoly_model
32 #include "lingen_bigmatpoly_ft.hpp" // for bigmatpoly_ft
33 #include "lingen_bmstatus.hpp" // for bmstatus
34 #include "lingen_bw_dimensions.hpp" // for bw_dimensions
35 #include "lingen_call_companion.hpp" // for lingen_call_companion, ling...
36 #include "lingen_checkpoints.hpp" // for save_checkpoint_file, load_...
37 #include "lingen_expected_pi_length.hpp" // for expected_pi_length, expecte...
38 #include "lingen_hints.hpp" // for lingen_hints
39 #include "lingen_io_matpoly.hpp" // for lingen_io_matpoly_decl_usage
40 #include "lingen_io_wrappers.hpp" // for lingen_output_wrapper_base
41 #include "lingen_matpoly_ft.hpp" // for matpoly_ft, matpoly_ft<>::m...
42 #include "lingen_matpoly_select.hpp" // for matpoly, matpoly::memory_guard
43 #include "lingen_qcode_select.hpp" // for bw_lingen_basecase
44 #include "lingen_substep_schedule.hpp" // for lingen_substep_schedule
45 #include "lingen_tuning.hpp" // for lingen_tuning, lingen_tunin...
46 #include "logline.h" // for logline_end, logline_init_t...
47 #include "macros.h" // for ASSERT_ALWAYS, iceildiv, MIN
48 #include "memusage.h" // for PeakMemusage
49 #include "misc.h" // for size_disp
50 #include "omp_proxy.h" // for omp_set_num_threads
51 #include "params.h" // for cxx_param_list, param_list_...
52 #include "select_mpi.h" // for MPI_Comm, MPI_Comm_rank
53 #include "sha1.h" // for sha1_checksumming_stream
54 #include "timing.h" // for seconds
55 #include "tree_stats.hpp" // for tree_stats, tree_stats::tra...
56 #include "portability.h" // strdup // IWYU pragma: keep
57
58
/* If non-zero, reading from A is replaced by reading from a random
 * generator of that length. */
static unsigned int random_input_length = 0;

/* Input sequence length; 0 means auto-detect from the input file. */
static unsigned int input_length = 0;

static int split_input_file = 0;  /* unsupported ; do acollect by ourselves */
static int split_output_file = 0; /* do split by ourselves */

/* Global random state, seeded from the random_seed / seed parameter. */
gmp_randstate_t rstate;

/* If set, do not cry if the generator corresponds to a zero
 * contribution on the RHS vectors. */
static int allow_zero_on_rhs = 0;

/* Exit code decided by the rank-0 node. */
int rank0_exit_code = EXIT_SUCCESS;

int global_flag_ascii = 0;
int global_flag_tune = 0;
75
lingen_decl_usage(cxx_param_list & pl)76 void lingen_decl_usage(cxx_param_list & pl)/*{{{*/
77 {
78 param_list_decl_usage(pl, "ascii",
79 "read and write data in ascii");
80 param_list_decl_usage(pl, "timings",
81 "provide timings on all output lines");
82 param_list_decl_usage(pl, "tune",
83 "activate tuning mode");
84 param_list_decl_usage(pl, "allow_zero_on_rhs",
85 "do not cry if the generator corresponds to a zero contribution on the RHS vectors");
86
87 /* we must be square ! */
88 param_list_decl_usage(pl, "mpi", "number of MPI nodes across which the execution will span, with mesh dimensions");
89 param_list_decl_usage(pl, "thr", "number of threads (on each node) for the program, with mesh dimensions");
90
91 param_list_decl_usage(pl, "nrhs",
92 "number of columns that correspond to rhs vectors");
93 param_list_decl_usage(pl, "rhs",
94 "file with rhs vectors (only the header is read)");
95
96 param_list_decl_usage(pl, "afile",
97 "input sequence file");
98 param_list_decl_usage(pl, "input_length",
99 "input sequence length (defaults to auto-detect)");
100 param_list_decl_usage(pl, "random-input-with-length",
101 "use surrogate for input");
102 param_list_decl_usage(pl, "split-input-file",
103 "work with split files on input");
104 param_list_decl_usage(pl, "split-output-file",
105 "work with split files on output");
106 param_list_decl_usage(pl, "random_seed",
107 "seed the random generator");
108 param_list_decl_usage(pl, "ffile",
109 "output generator file");
110
111 #if 0
112 param_list_decl_usage(pl, "lingen_mpi_threshold",
113 "use MPI matrix operations above this size");
114 param_list_decl_usage(pl, "lingen_threshold",
115 "use recursive algorithm above this size");
116 #endif
117
118 param_list_configure_switch(pl, "--tune", &global_flag_tune);
119 param_list_configure_switch(pl, "--ascii", &global_flag_ascii);
120 param_list_configure_alias(pl, "seed", "random_seed");
121
122 }/*}}}*/
123
124 /**********************************************************************/
125
126 /*{{{ Main entry points and recursive algorithm (with and without MPI) */
127
128 /* Forward declaration, it's used by the recursive version */
129 matpoly bw_lingen_single(bmstatus & bm, matpoly & E);
130
131 bigmatpoly bw_biglingen_collective(bmstatus & bm, bigmatpoly & E);
132
sha1sum(matpoly const & X)133 std::string sha1sum(matpoly const & X)
134 {
135 sha1_checksumming_stream S;
136 ASSERT_ALWAYS(X.is_tight());
137 S.write((const char *) X.data_area(), X.data_size_in_bytes());
138 char checksum[41];
139 S.checksum(checksum);
140 return std::string(checksum);
141 }
142
143 template<typename matpoly_type, typename fft_type>
144 struct matching_ft_type {};
145
146 template<typename matpoly_type>
147 struct matpoly_diverter {};
148
149 template<typename fft_type>
150 struct matching_ft_type<matpoly, fft_type> {
151 typedef matpoly_ft<fft_type> type;
152 };
153 template<> struct matpoly_diverter<matpoly> {
154 static constexpr const char * prefix = "";
callbackmatpoly_diverter155 static matpoly callback(bmstatus & bm, matpoly & E) {
156 return bw_lingen_single(bm, E);
157 }
158 };
159 constexpr const char * matpoly_diverter<matpoly>::prefix;
160
161 template<typename fft_type>
162 struct matching_ft_type<bigmatpoly, fft_type> {
163 typedef bigmatpoly_ft<fft_type> type;
164 };
165 template<> struct matpoly_diverter<bigmatpoly> {
166 static constexpr const char * prefix = "MPI-";
callbackmatpoly_diverter167 static bigmatpoly callback(bmstatus & bm, bigmatpoly & E) {
168 return bw_biglingen_collective(bm, E);
169 }
170 };
171 constexpr const char * matpoly_diverter<bigmatpoly>::prefix;
172
173
174 template<typename matpoly_type>
generic_mp(matpoly_type & E,matpoly_type & pi_left,bmstatus & bm,lingen_call_companion & C)175 matpoly_type generic_mp(matpoly_type & E, matpoly_type & pi_left, bmstatus & bm, lingen_call_companion & C)
176 {
177 switch (C.mp.S.fft_type) {
178 case lingen_substep_schedule::FFT_NONE:
179 return matpoly_type::mp(bm.stats, E, pi_left, &C.mp);
180 case lingen_substep_schedule::FFT_FLINT:
181 #ifndef SELECT_MPFQ_LAYER_u64k1
182 return matching_ft_type<matpoly_type,
183 fft_transform_info>::type::mp_caching(
184 bm.stats, E, pi_left, & C.mp);
185 #else
186 throw std::runtime_error("fft type \"flint\" does not make sense here");
187 #endif
188 #ifdef SELECT_MPFQ_LAYER_u64k1
189 case lingen_substep_schedule::FFT_CANTOR:
190 return matching_ft_type<matpoly_type,
191 gf2x_cantor_fft_info>::type::mp_caching(
192 bm.stats, E, pi_left, & C.mp);
193 case lingen_substep_schedule::FFT_TERNARY:
194 return matching_ft_type<matpoly_type,
195 gf2x_ternary_fft_info>::type::mp_caching(
196 bm.stats, E, pi_left, & C.mp);
197 #else
198 case lingen_substep_schedule::FFT_TERNARY:
199 case lingen_substep_schedule::FFT_CANTOR:
200 throw std::runtime_error("fft types over GF(2)[x] do not make sense here");
201 #endif
202 }
203 throw std::runtime_error("invalid fft_type");
204 }
205
206 template<typename matpoly_type>
generic_mul(matpoly_type & pi_left,matpoly_type & pi_right,bmstatus & bm,lingen_call_companion & C)207 matpoly_type generic_mul(matpoly_type & pi_left, matpoly_type & pi_right, bmstatus & bm, lingen_call_companion & C)
208 {
209 switch (C.mul.S.fft_type) {
210 case lingen_substep_schedule::FFT_NONE:
211 return matpoly_type::mul(bm.stats, pi_left, pi_right, & C.mul);
212 case lingen_substep_schedule::FFT_FLINT:
213 #ifndef SELECT_MPFQ_LAYER_u64k1
214 return matching_ft_type<matpoly_type,
215 fft_transform_info>::type::mul_caching(
216 bm.stats, pi_left, pi_right, & C.mul);
217 #else
218 throw std::runtime_error("fft type \"flint\" does not make sense here");
219 #endif
220 #ifdef SELECT_MPFQ_LAYER_u64k1
221 case lingen_substep_schedule::FFT_CANTOR:
222 return matching_ft_type<matpoly_type,
223 gf2x_cantor_fft_info>::type::mul_caching(
224 bm.stats, pi_left, pi_right, & C.mul);
225 case lingen_substep_schedule::FFT_TERNARY:
226 return matching_ft_type<matpoly_type,
227 gf2x_ternary_fft_info>::type::mul_caching(
228 bm.stats, pi_left, pi_right, & C.mul);
229 #else
230 case lingen_substep_schedule::FFT_TERNARY:
231 case lingen_substep_schedule::FFT_CANTOR:
232 throw std::runtime_error("fft types over GF(2)[x] do not make sense here");
233 #endif
234 }
235 throw std::runtime_error("invalid fft_type");
236 }
237
238 template<typename matpoly_type>
truncate_overflow(bmstatus & bm,matpoly_type & pi,unsigned int pi_expect)239 void truncate_overflow(bmstatus & bm, matpoly_type & pi, unsigned int pi_expect)
240 {
241 if (pi.get_size() > pi_expect) {
242 unsigned int nluck = 0;
243 unsigned int maxdelta_lucky = 0;
244 for(unsigned int j = 0 ; j < bm.d.m + bm.d.n ; j++) {
245 if (!bm.lucky[j]) continue;
246 nluck++;
247 if (bm.delta[j] >= maxdelta_lucky)
248 maxdelta_lucky = bm.delta[j];
249 }
250 printf("truncating excess cols\n"
251 " pi has length %zu, we expected %u at most.\n"
252 " max delta on the %u lucky columns: %u\n",
253 pi.get_size(),
254 pi_expect,
255 nluck,
256 maxdelta_lucky);
257 bm.display_deltas();
258 pi.set_size(pi_expect);
259 pi.clear_high_word();
260 /* in fact, there is really no reason to reach here. The expected
261 * pi length should be large enough so that the "luck" condition
262 * and eventually the "done"/"finished" flags are triggered
263 * before we reach the point where we overflow. This means a
264 * constant offset. Therefore, reaching here is a bug.
265 *
266 * Furthermore, while the idea of truncating is appealing, it
267 * doesn't quote solve the problem when we wish to keep iterating
268 * a bit more: because of the degree constraints, truncating will
269 * make the next E matrix cancel, as we're almost knowingly
270 * killing its rank...
271 *
272 * one of the better things to do could be to keep track of a
273 * shift vector. but again, what for...
274 */
275 ASSERT_ALWAYS(0);
276 }
277 }
278
279
280 template<typename matpoly_type>
bw_lingen_recursive(bmstatus & bm,matpoly_type & E)281 matpoly_type bw_lingen_recursive(bmstatus & bm, matpoly_type & E) /*{{{*/
282 {
283 int depth = bm.depth();
284 size_t z = E.get_size();
285
286 /* C0 is a copy. We won't use it for long anyway. We'll take a
287 * reference _later_ */
288 lingen_call_companion C0 = bm.companion(depth, z);
289
290 tree_stats::sentinel dummy(bm.stats, fmt::sprintf("%srecursive", matpoly_diverter<matpoly_type>::prefix), z, C0.total_ncalls);
291
292 bm.stats.plan_smallstep(C0.mp.step_name(), C0.mp.tt);
293 bm.stats.plan_smallstep(C0.mul.step_name(), C0.mul.tt);
294
295 bw_dimensions & d = bm.d;
296
297 /* we have to start with something large enough to get all
298 * coefficients of E_right correct */
299 size_t half = E.get_size() - (E.get_size() / 2);
300 // unsigned int pi_expect = expected_pi_length(d, bm.delta, E.get_size());
301 unsigned int pi_expect_lowerbound = expected_pi_length_lowerbound(d, E.get_size());
302 unsigned int pi_left_expect = expected_pi_length(d, bm.delta, half);
303 unsigned int pi_left_expect_lowerbound = expected_pi_length_lowerbound(d, half);
304 unsigned int pi_left_expect_used_for_shift = MIN(pi_left_expect, half + 1);
305
306
307 matpoly_type E_left = E.truncate_and_rshift(half, half + 1 - pi_left_expect_used_for_shift);
308
309 // this consumes E_left entirely.
310 matpoly_type pi_left = matpoly_diverter<matpoly_type>::callback(bm, E_left);
311
312 ASSERT_ALWAYS(pi_left.get_size());
313
314 if (bm.done) {
315 return pi_left;
316 }
317
318 truncate_overflow(bm, pi_left, pi_left_expect);
319
320 ASSERT_ALWAYS(bm.done || pi_left.get_size() >= pi_left_expect_lowerbound);
321
322 /* XXX I don't understand why I need to do this. It seems to me that
323 * MP(XA, B) and MP(A, B) should be identical whenever deg A > deg B.
324 */
325 ASSERT_ALWAYS(pi_left_expect_used_for_shift >= pi_left.get_size());
326 if (pi_left_expect_used_for_shift != pi_left.get_size()) {
327 E.rshift(E, pi_left_expect_used_for_shift - pi_left.get_size());
328 /* Don't shrink_to_fit at this point, because we've only made a
329 * minor adjustment. */
330 }
331
332 matpoly_type E_right = E.similar_shell();
333
334 if (!load_checkpoint_file(bm, LINGEN_CHECKPOINT_E, E_right, bm.t, bm.t+E.get_size()-pi_left.get_size()+1)) {
335 logline_begin(stdout, z, "t=%u %*s%sMP(%s, %zu, %zu) -> %zu",
336 bm.t, depth,"", matpoly_diverter<matpoly_type>::prefix,
337 C0.mp.fft_name(),
338 E.get_size(), pi_left.get_size(), E.get_size() - pi_left.get_size() + 1);
339
340 E_right = generic_mp(E, pi_left, bm, bm.companion(depth, z));
341 logline_end(&bm.t_mp, "");
342 }
343 E.clear();
344
345
346 unsigned int pi_right_expect = expected_pi_length(d, bm.delta, E_right.get_size());
347 unsigned int pi_right_expect_lowerbound = expected_pi_length_lowerbound(d, E_right.get_size());
348
349 matpoly_type pi_right = matpoly_diverter<matpoly_type>::callback(bm, E_right);
350
351 truncate_overflow(bm, pi_right, pi_right_expect);
352
353 ASSERT_ALWAYS(bm.done || pi_right.get_size() >= pi_right_expect_lowerbound);
354
355 logline_begin(stdout, z, "t=%u %*s%sMUL(%s, %zu, %zu) -> %zu",
356 bm.t, depth, "", matpoly_diverter<matpoly_type>::prefix,
357 C0.mul.fft_name(),
358 pi_left.get_size(), pi_right.get_size(), pi_left.get_size() + pi_right.get_size() - 1);
359
360 matpoly_type pi = generic_mul(pi_left, pi_right, bm, bm.companion(depth, z));
361 pi_left.clear();
362 pi_right.clear();
363
364 /* Note that the leading coefficients of pi_left and pi_right are not
365 * necessarily full-rank, so that we have to fix potential zeros. If
366 * we don't, the degree of pi artificially grows with the recursive
367 * level.
368 */
369 unsigned int pisize = pi.get_size();
370 #if 0
371 /* The asserts below are bogus. In fact, it's not entirely impossible
372 * that pi grows more than what we had expected on entry, e.g. if we
373 * have one early generator. So we can't just do this. Most of the
374 * time it will work, but we can't claim that it will always work.
375 *
376 * One possible sign is when the entry deltas are somewhat even, and
377 * the result deltas are unbalanced.
378 *
379 * It might also be worthwhile to do this check by column, and look
380 * at both degree and valuation.
381 */
382 for(; pisize > pi_expect ; pisize--) {
383 /* These coefficients really must be zero */
384 ASSERT_ALWAYS(pi.coeff_is_zero(pisize - 1));
385 }
386 ASSERT_ALWAYS(pisize <= pi_expect);
387 #endif
388 /* Now below pi_expect, it's not impossible to have a few
389 * cancellations as well.
390 */
391 for(; pisize ; pisize--) {
392 if (!pi.coeff_is_zero(pisize - 1)) break;
393 }
394 pi.set_size(pisize);
395 ASSERT_ALWAYS(bm.done || pisize >= pi_expect_lowerbound);
396
397 logline_end(&bm.t_mul, "");
398
399 return pi;
400 }/*}}}*/
401
402
bw_lingen_single_nocp(bmstatus & bm,matpoly & E)403 matpoly bw_lingen_single_nocp(bmstatus & bm, matpoly & E) /*{{{*/
404 {
405 int rank;
406 MPI_Comm_rank(bm.com[0], &rank);
407 ASSERT_ALWAYS(!rank);
408 matpoly pi;
409
410 lingen_call_companion C = bm.companion(bm.depth(), E.get_size());
411
412 // ASSERT_ALWAYS(E.size < bm.lingen_mpi_threshold);
413
414 // fprintf(stderr, "Enter %s\n", __func__);
415 if (!bm.recurse(E.get_size())) {
416 tree_stats::transition_sentinel dummy(bm.stats, "recursive_threshold", E.get_size(), C.total_ncalls);
417 bm.t_basecase -= seconds();
418 E.clear_high_word();
419 pi = bw_lingen_basecase(bm, E);
420 bm.t_basecase += seconds();
421 } else {
422 pi = bw_lingen_recursive(bm, E);
423 }
424 // fprintf(stderr, "Leave %s\n", __func__);
425
426 return pi;
427 }/*}}}*/
bw_lingen_single(bmstatus & bm,matpoly & E)428 matpoly bw_lingen_single(bmstatus & bm, matpoly & E) /*{{{*/
429 {
430 int rank;
431 MPI_Comm_rank(bm.com[0], &rank);
432 ASSERT_ALWAYS(!rank);
433 unsigned int t0 = bm.t;
434 unsigned int t1 = bm.t + E.get_size();
435 matpoly pi;
436
437 save_checkpoint_file(bm, LINGEN_CHECKPOINT_E, E, t0, t1);
438
439 if (load_checkpoint_file(bm, LINGEN_CHECKPOINT_PI, pi, t0, t1))
440 return pi;
441
442 pi = bw_lingen_single_nocp(bm, E);
443
444 save_checkpoint_file(bm, LINGEN_CHECKPOINT_PI, pi, t0, t1);
445
446 return pi;
447 }/*}}}*/
448
bw_biglingen_collective(bmstatus & bm,bigmatpoly & E)449 bigmatpoly bw_biglingen_collective(bmstatus & bm, bigmatpoly & E)/*{{{*/
450 {
451 /* as for bw_lingen_single, we're tempted to say that we're just a
452 * trampoline. In fact, it's not really satisfactory: we're really
453 * doing stuff here. In a sense though, it's not *that much* of a
454 * trouble, because the mpi threshold will be low enough that doing
455 * our full job here is not too much of a problem.
456 */
457 bw_dimensions & d = bm.d;
458 abdst_field ab = d.ab;
459 unsigned int m = d.m;
460 unsigned int n = d.n;
461 unsigned int b = m + n;
462 int rank;
463 int size;
464 MPI_Comm_rank(bm.com[0], &rank);
465 MPI_Comm_size(bm.com[0], &size);
466 unsigned int t0 = bm.t;
467 unsigned int t1 = bm.t + E.get_size();
468 bigmatpoly_model const& model(E);
469 int depth = bm.depth();
470 size_t z = E.get_size();
471
472 lingen_call_companion C = bm.companion(depth, z);
473 bool go_mpi = C.go_mpi();
474 // bool go_mpi = E.get_size() >= bm.lingen_mpi_threshold;
475
476 bigmatpoly pi(model);
477
478 save_checkpoint_file(bm, LINGEN_CHECKPOINT_E, E, t0, t1);
479
480 if (load_checkpoint_file(bm, LINGEN_CHECKPOINT_PI, pi, t0, t1))
481 return pi;
482
483 // fprintf(stderr, "Enter %s\n", __func__);
484 if (go_mpi) {
485 pi = bw_lingen_recursive(bm, E);
486 } else {
487 /* Fall back to local code */
488 /* This entails gathering E locally, computing pi locally, and
489 * dispathing it back. */
490
491 tree_stats::transition_sentinel dummy(bm.stats, "mpi_threshold", E.get_size(), C.total_ncalls);
492
493 matpoly sE(ab, m, b, E.get_size());
494 matpoly spi;
495
496 double expect0 = bm.hints.tt_gather_per_unit * E.get_size();
497 bm.stats.plan_smallstep("gather(L+R)", expect0);
498 bm.stats.begin_smallstep("gather(L+R)");
499 E.gather_mat(sE);
500 E = bigmatpoly(model);
501 bm.stats.end_smallstep();
502
503 /* Only the master node does the local computation */
504 if (!rank)
505 spi = bw_lingen_single_nocp(bm, sE);
506
507 double expect1 = bm.hints.tt_scatter_per_unit * z;
508 bm.stats.plan_smallstep("scatter(L+R)", expect1);
509 bm.stats.begin_smallstep("scatter(L+R)");
510 pi = bigmatpoly(ab, model, b, b, 0);
511 pi.scatter_mat(spi);
512 MPI_Bcast(&bm.done, 1, MPI_INT, 0, bm.com[0]);
513 MPI_Bcast(bm.delta.data(), b, MPI_UNSIGNED, 0, bm.com[0]);
514 MPI_Bcast(bm.lucky.data(), b, MPI_UNSIGNED, 0, bm.com[0]);
515 MPI_Bcast(&(bm.t), 1, MPI_UNSIGNED, 0, bm.com[0]);
516 /* Don't forget to broadcast delta from root node to others ! */
517 bm.stats.end_smallstep();
518 }
519 // fprintf(stderr, "Leave %s\n", __func__);
520
521 save_checkpoint_file(bm, LINGEN_CHECKPOINT_PI, pi, t0, t1);
522
523 MPI_Barrier(bm.com[0]);
524
525 return pi;
526 }/*}}}*/
527
528 /*}}}*/
529
530 /**********************************************************************/
531
532 /**********************************************************************/
count_lucky_columns(bmstatus & bm)533 unsigned int count_lucky_columns(bmstatus & bm)/*{{{*/
534 {
535 bw_dimensions & d = bm.d;
536 unsigned int m = d.m;
537 unsigned int n = d.n;
538 unsigned int b = m + n;
539 int luck_mini = expected_pi_length(d);
540 MPI_Bcast(bm.lucky.data(), b, MPI_UNSIGNED, 0, bm.com[0]);
541 unsigned int nlucky = 0;
542 for(unsigned int j = 0 ; j < b ; nlucky += bm.lucky[j++] >= luck_mini) ;
543 return nlucky;
544 }/*}}}*/
545
check_luck_condition(bmstatus & bm)546 int check_luck_condition(bmstatus & bm)/*{{{*/
547 {
548 bw_dimensions & d = bm.d;
549 unsigned int m = d.m;
550 unsigned int n = d.n;
551 unsigned int nlucky = count_lucky_columns(bm);
552
553 int rank;
554 MPI_Comm_rank(bm.com[0], &rank);
555
556 if (!rank) {
557 printf("Number of lucky columns: %u (%u wanted)\n", nlucky, n);
558 }
559
560 if (nlucky == n)
561 return 1;
562
563 if (!rank) {
564 fprintf(stderr, "Could not find the required set of solutions (nlucky=%u)\n", nlucky);
565 }
566 if (random_input_length) {
567 static int once=0;
568 if (once++) {
569 if (!rank) {
570 fprintf(stderr, "Solution-faking loop crashed\n");
571 }
572 MPI_Abort(bm.com[0], EXIT_FAILURE);
573 }
574 if (!rank) {
575 printf("Random input: faking successful computation\n");
576 }
577 for(unsigned int j = 0 ; j < n ; j++) {
578 unsigned int s = (j * 1009) % (m+n);
579 bm.lucky[s] = expected_pi_length(d);
580 bm.delta[s] -= expected_pi_length(d);
581 }
582 return check_luck_condition(bm);
583 }
584
585 return 0;
586 }/*}}}*/
587
print_node_assignment(MPI_Comm comm)588 void print_node_assignment(MPI_Comm comm)/*{{{*/
589 {
590 int rank;
591 int size;
592 MPI_Comm_size(comm, &size);
593 MPI_Comm_rank(comm, &rank);
594
595 struct utsname me[1];
596 int rc = uname(me);
597 if (rc < 0) { perror("uname"); MPI_Abort(comm, 1); }
598 size_t sz = 1 + sizeof(me->nodename);
599 char * global = (char*) malloc(size * sz);
600 memset(global, 0, size * sz);
601 memcpy(global + rank * sz, me->nodename, sizeof(me->nodename));
602
603 MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
604 global, sz, MPI_BYTE, comm);
605 if (rank == 0) {
606 char name[80];
607 int len=80;
608 MPI_Comm_get_name(comm, name, &len);
609 name[79]=0;
610 for(int i = 0 ; i < size ; i++) {
611 printf("# %s rank %d: %s\n", name, i, global + i * sz);
612 }
613 }
614 free(global);
615 }/*}}}*/
616
617 /* Counting memory usage in the recursive algorithm.
618 *
619 * The recursive algorithm is designed to allow the allocated memory for
620 * the input to be reused for placing the output. Some memory might have
621 * been saved by upper layers. We also have some local allocation.
622 *
623 * Notations: The algorithm starts at depth 0 with an
624 * input length L, and the notation \ell_i denotes L/2^(i+1). We have
625 * \ell_i=2\ell_{i+1}. The notation \alpha denotes m/(m+n). Note that the
626 * input has size \alpha*(1-\alpha)*L times (m+n)^2*\log_2(p) (divided by
627 * r^2 if relevant).
628 *
629 * We define five quantities. All are understood as multiples of
630 * (m+n)^2*\log_2(p).
631 *
632 * MP(i) is the extra storage needed for the MP operation at depth i.
633 *
634 * MUL(i) is the extra storage needed for the MUL operation at depth i.
635 *
636 * IO(i) is the common size of the input and output data of the call at
637 * depth i. We have
638 * IO(i) = 2\alpha\ell_i
639 *
640 * ST(i) is the storage *at all levels above the current one* (i.e. with
641 * depth strictly less than i) for the data that is still live and
642 * need to exist until after we return. This count is maximized in the
643 * leftmost branch, where chopped E at all levels must be kept.
644 * chopped E at depth i (not counted in ST(i) !) is:
645 * \alpha(1+\alpha) \ell_i
646 * (counted as the degree it takes to make the necessary data that
647 * we want to use to compute E_right),
648 * so the cumulated cost above is twice the depth 0 value, minus the
649 * depth i value, i.e.
650 * ST(i) = \alpha(1+\alpha)(L-2\ell_i).
651 * SP(i) is the "spike" at depth i: not counting allocation that is
652 * already reserved for IO or ST, this is the amount of extra memory
653 * that is required by the call at depth i. We have:
654 * SP(i) = max {
655 * \alpha\ell_i,
656 * \alpha\ell_i-2\alpha\ell_i+\alpha(1+\alpha)\ell_i+SP(i+1),
657 * 2\alpha\ell_i-2\alpha\ell_i+\alpha(1+\alpha)\ell_i+MP(i),
658 * 2\alpha\ell_i-2\alpha\ell_i+SP(i+1)
659 * 4\alpha\ell_i-2\alpha\ell_i+MUL(i)
660 * }
661 * = max {
662 * \alpha\ell_i-2\alpha\ell_i+\alpha(1+\alpha)\ell_i+SP(i+1),
663 * 2\alpha\ell_i-2\alpha\ell_i+\alpha(1+\alpha)\ell_i+MP(i),
664 * 4\alpha\ell_i-2\alpha\ell_i+MUL(i)
665 * }
666 *
667 * Combining this together, and using
668 * ST(i)+\alpha(1+\alpha)\ell_i=ST(i+1), we have:
669 *
670 * IO(i)+ST(i)+SP(i) = max {
671 * IO(i+1)+ST(i+1)+SP(i+1),
672 * ST(i) + 2\alpha\ell_i+\alpha(1+\alpha)\ell_i+MP(i),
673 * ST(i) + 4\alpha\ell_i+MUL(i)
674 * }
675 * = max {
676 * IO(i+1)+ST(i+1)+SP(i+1),
677 * \alpha(1+\alpha)(L-\ell_i) + 2\alpha\ell_i + MP(i),
678 * \alpha(1+\alpha)(L-2\ell_i) + 4\alpha\ell_i + MUL(i)
679 * }
680 * = max {
681 * IO(i+1)+ST(i+1)+SP(i+1),
682 * \alpha((1+\alpha)L+(1-\alpha)\ell_i) + MP(i),
683 * \alpha((1+\alpha)L+2(1-\alpha)\ell_i) + MUL(i),
684 * }
685 *
686 * Let RMP(i) be the amount of memory that is reserved while we are doing
687 * the MP operation, and define RMUL similarly. We have:
688 * RMP(i) = \alpha(1+\alpha)(L-\ell_i) + 2\alpha\ell_i
689 * RMUL(i) = \alpha(1+\alpha)(L-2\ell_i) + 4\alpha\ell_i
690 * whence:
691 * RMP(i) = \alpha((1+\alpha)L+(1-\alpha)\ell_i)
692 * RMUL(i) = \alpha((1+\alpha)L+2(1-\alpha)\ell_i)
693 *
694 * We have RMP(i) <= RMUL(i) <= RMP(0) <= RMUL(0) = 2\alpha*L. We'll use
695 * the un-simplified expression later.
696 *
697 * Furthermore IO(infinity)=SP(infinity)=0, and ST(infinity)=\alpha(1+\alpha)L
698 *
699 * So that eventually, the amount of reserved memory for the whole
700 * algorithm is RMUL(0)=2\alpha*L (which is 2/(1-\alpha)=2*(1+m/n) times
701 * the input size). On top of that we have the memory required
702 * for the transforms.
703 *
704 *
705 * When going MPI, matrices may be rounded with some inaccuracy.
706 * Splitting in two a 3x3 matrix leads to a 2x2 chunk, which is 1.77
707 * times more than the simplistic proportionality rule.
708 *
709 * Therefore it makes sense to distinguish between matrices of size
710 * m*(m+n) and (m+n)*(m+n). If we recompute RMUL(i) by taking this into
711 * account, we obtain:
712 * [m/r][(m+n)/r][(1+\alpha)(L-2\ell_i)] + [(m+n)/r]^2*[4\alpha\ell_i]
713 * where we only paid attention to the rounding issues with dimensions,
714 * as those are more important than for degrees. Bottom line, the max is
715 * expected to be for i=0, and that will be made only of pi matrices.
716 */
717
718 /* Some of the early reading must be done before we even start, since
719 * the code that we run depends on the input size.
720 */
721
722 /* We don't have a header file for this one */
723 extern "C" void check_for_mpi_problems();
724
wrapped_main(int argc,char * argv[])725 int wrapped_main(int argc, char *argv[])
726 {
727
728 cxx_param_list pl;
729
730 bw_common_decl_usage(pl);
731 lingen_decl_usage(pl);
732 logline_decl_usage(pl);
733 lingen_tuning_decl_usage(pl);
734 lingen_checkpoint::decl_usage(pl);
735 lingen_io_matpoly_decl_usage(pl);
736 tree_stats::declare_usage(pl);
737
738 bw_common_parse_cmdline(bw, pl, &argc, &argv);
739
740 bw_common_interpret_parameters(bw, pl);
741 /* {{{ interpret our parameters */
742 gmp_randinit_default(rstate);
743
744 int rank;
745 int size;
746 MPI_Comm_size(MPI_COMM_WORLD, &size);
747 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
748
749 param_list_parse_int(pl, "allow_zero_on_rhs", &allow_zero_on_rhs);
750 param_list_parse_uint(pl, "random-input-with-length", &random_input_length);
751 param_list_parse_uint(pl, "input-length", &input_length);
752 param_list_parse_int(pl, "split-output-file", &split_output_file);
753 param_list_parse_int(pl, "split-input-file", &split_input_file);
754
755 const char * afile = param_list_lookup_string(pl, "afile");
756
757 if (bw->m == -1) {
758 fprintf(stderr, "no m value set\n");
759 exit(EXIT_FAILURE);
760 }
761 if (bw->n == -1) {
762 fprintf(stderr, "no n value set\n");
763 exit(EXIT_FAILURE);
764 }
765 if (!global_flag_tune && !(afile || random_input_length)) {
766 fprintf(stderr, "No afile provided\n");
767 exit(EXIT_FAILURE);
768 }
769
770 /* we allow ffile and ffile to be both NULL */
771 const char * tmp = param_list_lookup_string(pl, "ffile");
772 char * ffile = NULL;
773 if (tmp) {
774 ffile = strdup(tmp);
775 } else if (afile) {
776 int rc = asprintf(&ffile, "%s.gen", afile);
777 ASSERT_ALWAYS(rc >= 0);
778 }
779 ASSERT_ALWAYS((afile==NULL) == (ffile == NULL));
780
781 bmstatus bm(bw->m, bw->n);
782 bw_dimensions & d = bm.d;
783
784 const char * rhs_name = param_list_lookup_string(pl, "rhs");
785 if (!global_flag_tune && !random_input_length) {
786 if (!rhs_name) {
787 fprintf(stderr, "# When using lingen, you must either supply --random-input-with-length, or provide a rhs, or possibly provide rhs=none\n");
788 } else if (strcmp(rhs_name, "none") == 0) {
789 rhs_name = NULL;
790 }
791 }
792 if (param_list_parse_uint(pl, "nrhs", &(bm.d.nrhs)) && rhs_name) {
793 fprintf(stderr, "# the command line arguments rhs= and nrhs= are incompatible\n");
794 exit(EXIT_FAILURE);
795 }
796 if (rhs_name && strcmp(rhs_name, "none") != 0) {
797 if (!rank)
798 get_rhs_file_header(rhs_name, NULL, &(bm.d.nrhs), NULL);
799 MPI_Bcast(&bm.d.nrhs, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
800 }
801
802 abfield_init(d.ab);
803 abfield_specify(d.ab, MPFQ_PRIME_MPZ, bw->p);
804
805 gmp_randseed_ui(rstate, bw->seed);
806
807 #if 0
808 bm.lingen_threshold = 10;
809 bm.lingen_mpi_threshold = 1000;
810 param_list_parse_uint(pl, "lingen_threshold", &(bm.lingen_threshold));
811 param_list_parse_uint(pl, "lingen_mpi_threshold", &(bm.lingen_mpi_threshold));
812 if (bm.lingen_mpi_threshold < bm.lingen_threshold) {
813 bm.lingen_mpi_threshold = bm.lingen_threshold;
814 fprintf(stderr, "Argument fixing: setting lingen_mpi_threshold=%u (because lingen_threshold=%u)\n",
815 bm.lingen_mpi_threshold, bm.lingen_threshold);
816 }
817
818
819 #if defined(FAKEMPI_H_)
820 bm.lingen_mpi_threshold = UINT_MAX;
821 #endif
822 #endif
823
824 /* }}} */
825
826 /* TODO: we should rather use lingen_platform.
827 */
828 /* {{{ Parse MPI args. Make bm.com[0] a better mpi communicator */
829 bm.mpi_dims[0] = 1;
830 bm.mpi_dims[1] = 1;
831 param_list_parse_intxint(pl, "mpi", bm.mpi_dims);
832 {
833 /* Display node index wrt MPI_COMM_WORLD */
834 print_node_assignment(MPI_COMM_WORLD);
835
836 /* Reorder all mpi nodes so that each node gets the given number
837 * of jobs, but close together.
838 */
839 int mpi[2] = { bm.mpi_dims[0], bm.mpi_dims[1], };
840 int thr[2] = {1,1};
841 #ifdef HAVE_OPENMP
842 if (param_list_parse_intxint(pl, "thr", thr)) {
843 if (!rank)
844 printf("# Limiting number of openmp threads to %d\n",
845 thr[0] * thr[1]);
846 omp_set_num_threads(thr[0] * thr[1]);
847 }
848 #else
849 if (param_list_parse_intxint(pl, "thr", thr)) {
850 if (thr[0]*thr[1] != 1) {
851 if (!rank) {
852 fprintf(stderr, "This program only wants openmp for multithreading. Ignoring thr argument.\n");
853 }
854 param_list_add_key(pl, "thr", "1x1", PARAMETER_FROM_CMDLINE);
855 }
856 }
857 #endif
858
859 #ifdef FAKEMPI_H_
860 if (mpi[0]*mpi[1] > 1) {
861 fprintf(stderr, "non-trivial option mpi= can't be used with fakempi. Please do an MPI-enabled build (MPI=1)\n");
862 exit(EXIT_FAILURE);
863 }
864 #endif
865 if (!rank)
866 printf("# size=%d mpi=%dx%d thr=%dx%d\n", size, mpi[0], mpi[1], thr[0], thr[1]);
867 ASSERT_ALWAYS(size == mpi[0] * mpi[1]);
868 if (bm.mpi_dims[0] != bm.mpi_dims[1]) {
869 if (!rank)
870 fprintf(stderr, "The current lingen code is limited to square splits ; here, we received a %d x %d split, which will not work\n",
871 bm.mpi_dims[0], bm.mpi_dims[1]);
872 abort();
873 }
874 int irank = rank / mpi[1];
875 int jrank = rank % mpi[1];
876 bm.com[0] = MPI_COMM_WORLD;
877 /* MPI Api has some very deprecated prototypes */
878 MPI_Comm_set_name(bm.com[0], (char*) "world");
879
880 char commname[32];
881 snprintf(commname, sizeof(commname), "row%d\n", irank);
882 MPI_Comm_split(MPI_COMM_WORLD, irank, jrank, &(bm.com[1]));
883 MPI_Comm_set_name(bm.com[1], commname);
884
885 snprintf(commname, sizeof(commname), "col%d\n", jrank);
886 MPI_Comm_split(MPI_COMM_WORLD, jrank, irank, &(bm.com[2]));
887 MPI_Comm_set_name(bm.com[2], commname);
888
889 print_node_assignment(bm.com[0]);
890
891 constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1;
892 if ((bm.d.m + bm.d.n) / simd < (unsigned int) mpi[0]) {
893 printf("########################################################\n");
894 printf("# Warning: this run will leave some resources idle:\n"
895 "# the matrices of size %u*%u and %u*%u can be split into\n"
896 "# chunks of minimal size %u, whence an mpi split over %d*%d is useless\n",
897 bm.d.m,
898 bm.d.m + bm.d.n,
899 bm.d.m + bm.d.n,
900 bm.d.m + bm.d.n,
901 simd,
902 mpi[0],
903 mpi[0]);
904 printf("########################################################\n");
905 }
906 }
907 /* }}} */
908
909 /* lingen tuning accepts some arguments. We look them up so as to
910 * avoid failures down the line */
911 lingen_tuning_lookup_parameters(pl);
912
913 tree_stats::interpret_parameters(pl);
914 logline_interpret_parameters(pl);
915 lingen_checkpoint::interpret_parameters(pl);
916 lingen_io_matpoly_interpret_parameters(pl);
917
918 if (param_list_warn_unused(pl)) {
919 int rank;
920 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
921 if (!rank) param_list_print_usage(pl, bw->original_argv[0], stderr);
922 MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
923 }
924
925 /* TODO: read the a files in scattered mode */
926
927 /* Don't police memory right now, we don't care */
928 matpoly::memory_guard main_memory(SIZE_MAX);
929
930 std::unique_ptr<lingen_input_wrapper_base> A_series;
931
932 if (random_input_length) {
933 A_series = std::unique_ptr<lingen_input_wrapper_base>(new lingen_random_input(bm.d.ab, bm.d.m, bm.d.n, rstate, random_input_length));
934 } else {
935 A_series = std::unique_ptr<lingen_input_wrapper_base>(new lingen_file_input(bm.d.ab, bm.d.m, bm.d.n, afile, global_flag_ascii, input_length));
936 }
937
938 #ifdef SELECT_MPFQ_LAYER_u64k1
939 #define K_elts_to_bytes(x) (iceildiv((x),ULONG_BITS) * sizeof(unsigned long))
940 #else
941 #define K_elts_to_bytes(x) (abvec_elt_stride(bm.d.ab, (x)))
942 #endif
943
944 /* run the mpi problem detection only if we're certain that we're at
945 * least close to the ballpark where this sort of checks make sense.
946 */
947 if (K_elts_to_bytes((size_t) A_series->guessed_length() * (size_t) (bm.d.m + bm.d.n)) >= (1 << 28)) {
948 check_for_mpi_problems();
949 }
950 MPI_Barrier(MPI_COMM_WORLD);
951
952 /* This will cause the initial read */
953 std::unique_ptr<lingen_E_from_A> E_series = std::unique_ptr<lingen_E_from_A>(new lingen_E_from_A(bm.d, *A_series));
954
955 bm.t = E_series->t0;
956
957 size_t L = E_series->guessed_length();
958
959 {
960 matpoly::memory_guard blanket(SIZE_MAX);
961 #ifndef SELECT_MPFQ_LAYER_u64k1
962 typename matpoly_ft<fft_transform_info>::memory_guard blanket_ft(SIZE_MAX);
963 #else
964 typename matpoly_ft<gf2x_cantor_fft_info>::memory_guard blanket_ft(SIZE_MAX);
965 typename matpoly_ft<gf2x_ternary_fft_info>::memory_guard blanket_ft2(SIZE_MAX);
966 #endif
967 try {
968 bm.hints = lingen_tuning(bm.d, L - bm.t, bm.com[0], pl);
969 } catch (std::overflow_error const & e) {
970 fputs(e.what(), stderr);
971 MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
972 }
973 }
974
975 if (global_flag_tune) {
976 MPI_Finalize();
977 exit(EXIT_SUCCESS);
978 }
979
980 size_t safe_guess = global_flag_ascii ? ceil(1.05 * L) : L;
981
982 /* c0 is (1+m/n) times the input size */
983 size_t c0 = K_elts_to_bytes(
984 iceildiv(bm.d.m + bm.d.n, bm.mpi_dims[0]) *
985 iceildiv(bm.d.m + bm.d.n, bm.mpi_dims[1]) *
986 iceildiv(bm.d.m*safe_guess, bm.d.m+bm.d.n));
987 matpoly::memory_guard main(2*c0);
988
989 if (!rank) {
990 char buf[20];
991 printf("# Estimated memory for JUST transforms (per node): %s\n",
992 size_disp(2*c0, buf));
993 printf("# Estimated peak total memory (per node): max at depth %d: %s\n",
994 bm.hints.ipeak,
995 size_disp(bm.hints.peak, buf));
996 }
997
998 int go_mpi = bm.companion(0, L).go_mpi();
999
1000 if (go_mpi) {
1001 if (!rank) {
1002 if (size > 1) {
1003 printf("Expected length %zu exceeds MPI threshold,"
1004 " going MPI now.\n",
1005 L);
1006 } else {
1007 printf("Expected length %zu exceeds MPI threshold, "
1008 "but the process is not running in an MPI context.\n",
1009 L);
1010 }
1011 }
1012 MPI_Barrier(bm.com[0]);
1013 }
1014
1015 std::unique_ptr<lingen_output_wrapper_base> Fdst;
1016 std::unique_ptr<lingen_output_wrapper_base> Fdst_rhs;
1017
1018 if (random_input_length) {
1019 Fdst = std::unique_ptr<lingen_output_wrapper_base>(new lingen_output_to_sha1sum(bm.d.ab, bm.d.n, bm.d.n, "F"));
1020 Fdst_rhs = std::unique_ptr<lingen_output_wrapper_base>(new lingen_output_to_sha1sum(bm.d.ab, bm.d.nrhs, bm.d.n, "Frhs"));
1021 } else if (split_output_file) {
1022 std::string pattern = ffile;
1023 Fdst = std::unique_ptr<lingen_output_wrapper_base>(new lingen_output_to_splitfile(bm.d.ab, bm.d.n, bm.d.n, pattern + ".sols{2}-{3}.{0}-{1}", global_flag_ascii));
1024 Fdst_rhs = std::unique_ptr<lingen_output_wrapper_base>(new lingen_output_to_splitfile(bm.d.ab, bm.d.nrhs, bm.d.n, pattern + ".sols{2}-{3}.{0}-{1}.rhs", global_flag_ascii));
1025 } else {
1026 Fdst = std::unique_ptr<lingen_output_wrapper_base>(new lingen_output_to_singlefile(bm.d.ab, bm.d.n, bm.d.n, ffile, global_flag_ascii));
1027 Fdst_rhs = std::unique_ptr<lingen_output_wrapper_base>(new lingen_output_to_singlefile(bm.d.ab, bm.d.nrhs, bm.d.n, std::string(ffile) + ".rhs", global_flag_ascii));
1028 }
1029
1030 if (go_mpi && size > 1) {
1031 bigmatpoly_model model(bm.com, bm.mpi_dims[0], bm.mpi_dims[1]);
1032 bigmatpoly E(bm.d.ab, model, bm.d.m, bm.d.m + bm.d.n, safe_guess);
1033 lingen_scatter<bigmatpoly> fill_E(E);
1034 lingen_F0 F0 = *E_series;
1035 pipe(*E_series, fill_E, "Read");
1036 bm.delta.assign(bm.d.m + bm.d.n, F0.t0);
1037 logline_init_timer();
1038 bigmatpoly pi = bw_biglingen_collective(bm, E);
1039 bm.stats.final_print();
1040 bm.display_deltas();
1041 if (!rank) printf("(pi.alloc = %zu)\n", pi.my_cell().capacity());
1042 constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1;
1043 pi.zero_pad(simd * iceildiv(pi.get_size(), simd));
1044 if (check_luck_condition(bm)) {
1045 lingen_gather_reverse<bigmatpoly> read_PI(pi);
1046 lingen_F_from_PI Fsrc(bm, read_PI, F0);
1047 pipe(Fsrc, *Fdst, "Written", true);
1048 Fsrc.write_rhs(*Fdst_rhs);
1049 }
1050 } else if (!rank) {
1051 /* We do this only in the rank==0 case, since we have really
1052 * nothing to do at the other ranks.
1053 */
1054
1055 /* We don't want to bother with memory problems in the non-mpi
1056 * case when the tuning was done for MPI: this is because the
1057 * per-transform ram was computed in the perspective of an MPI
1058 * run, and not for a plain run.
1059 */
1060 #ifndef SELECT_MPFQ_LAYER_u64k1
1061 typename matpoly_ft<fft_transform_info>::memory_guard blanket_ft(SIZE_MAX);
1062 #else
1063 typename matpoly_ft<gf2x_cantor_fft_info>::memory_guard blanket_ft(SIZE_MAX);
1064 typename matpoly_ft<gf2x_ternary_fft_info>::memory_guard blanket_ft2(SIZE_MAX);
1065 #endif
1066 matpoly E(bm.d.ab, bm.d.m, bm.d.m + bm.d.n, safe_guess);
1067 lingen_scatter<matpoly> fill_E(E);
1068 lingen_F0 F0 = *E_series;
1069 pipe(*E_series, fill_E, "Read");
1070 bm.delta.assign(bm.d.m + bm.d.n, F0.t0);
1071 logline_init_timer();
1072 matpoly pi = bw_lingen_single(bm, E);
1073 bm.stats.final_print();
1074 bm.display_deltas();
1075 if (!rank) printf("(pi.alloc = %zu)\n", pi.capacity());
1076 constexpr const unsigned int simd = matpoly::over_gf2 ? ULONG_BITS : 1;
1077 pi.zero_pad(simd * iceildiv(pi.get_size(), simd));
1078 if (check_luck_condition(bm)) {
1079 lingen_gather_reverse<matpoly> read_PI(pi);
1080 lingen_F_from_PI Fsrc(bm, read_PI, F0);
1081 pipe(Fsrc, *Fdst, "Written", true);
1082 Fsrc.write_rhs(*Fdst_rhs);
1083 }
1084 }
1085
1086 if (!rank && random_input_length) {
1087 printf("t_basecase = %.2f\n", bm.t_basecase);
1088 printf("t_mp = %.2f\n", bm.t_mp);
1089 printf("t_mul = %.2f\n", bm.t_mul);
1090 printf("t_cp_io = %.2f\n", bm.t_cp_io);
1091 long peakmem = PeakMemusage();
1092 if (peakmem > 0)
1093 printf("# PeakMemusage (MB) = %ld (VmPeak: can be misleading)\n", peakmem >> 10);
1094 }
1095
1096 abfield_clear(d.ab);
1097 if (ffile) free(ffile);
1098
1099 gmp_randclear(rstate);
1100
1101 return 0; // ignored.
1102 }
1103
/* The real work is delegated to wrapped_main() so that the destructors
 * of objects allocated there run before bw_common_clear(), and are
 * therefore still allowed to call MPI_Comm_rank.
 */
1107 // coverity[root_function]
main(int argc,char * argv[])1108 int main(int argc, char *argv[])
1109 {
1110 bw_common_init(bw, &argc, &argv);
1111 wrapped_main(argc, argv);
1112 bw_common_clear(bw);
1113 return rank0_exit_code;
1114 }
1115
1116 /* vim:set sw=4 sta et: */
1117