1 #include "cado.h"
2 // IWYU pragma: no_include "mpfq_fake.hpp"
3 // IWYU pragma: no_include "mpfq_layer.h"
4 // IWYU pragma: no_include <sys/param.h>
5 #include <cstdio>
6 #include <istream> // IWYU pragma: keep
7 #include <fstream> // IWYU pragma: keep
8 #include <gmp.h>                       // for operator<<, mpz_cmp
9 #include <stdint.h>                    // for SIZE_MAX
10 #include <stdlib.h>                    // for abort, free
11 #include <unistd.h>                    // for unlink, access, R_OK, X_OK
12 #include <map>                         // for operator!=, map
13 #include <utility>                     // for move, pair
14 #include <vector>                      // for vector
15 #include "fmt/core.h"                  // for check_format_string, char_t
16 #include "fmt/format.h"                // for basic_buffer::append, basic_pa...
17 #include "lingen_abfield.hpp"
18 #include "lingen_bw_dimensions.hpp"
19 #include "lingen_call_companion.hpp"   // for lingen_call_companion, lingen_...
20 #include "lingen_hints.hpp"            // for operator<<, lingen_hints, oper...
21 #include "lingen_matpoly_select.hpp"   // for matpoly
22 #include "macros.h"                    // for ASSERT_ALWAYS, MIN, iceildiv
23 #include "params.h"                    // for cxx_param_list, param_list_loo...
24 #include "select_mpi.h"                // for MPI_Allreduce, MPI_Bcast, MPI_...
25 #include "tree_stats.hpp"              // for operator<<, operator>>
26 #include "lingen_bmstatus.hpp"
27 #include "lingen_checkpoints.hpp"
28 #include "lingen_io_matpoly.hpp"
29 #include "lingen_average_matsize.hpp"
30 #include "logline.h"
31 #include "fmt/printf.h" // IWYU pragma: keep
32 #include "cxx_mpz.hpp"
33 
34 /* Checkpoints */
35 
36 const char * lingen_checkpoint::directory = NULL;
37 unsigned int lingen_checkpoint::threshold = 100;
38 int lingen_checkpoint::save_gathered = 0;
39 constexpr unsigned long lingen_checkpoint::format;
40 
41 /* There's much copy-paste here */
42 
43 
decl_usage(cxx_param_list & pl)44 void lingen_checkpoint::decl_usage(cxx_param_list & pl)
45 {
46     param_list_decl_usage(pl, "checkpoint-directory",
47             "where to save checkpoints");
48     param_list_decl_usage(pl, "checkpoint-threshold",
49             "threshold for saving checkpoints");
50     param_list_decl_usage(pl, "lingen_checkpoint_save_gathered",
51             "save global checkpoints files, instead of per-job files");
52 }
53 
lookup_parameters(cxx_param_list & pl)54 void lingen_checkpoint::lookup_parameters(cxx_param_list & pl)
55 {
56     param_list_lookup_string(pl, "checkpoint-directory");
57     param_list_lookup_string(pl, "checkpoint-threshold");
58     param_list_lookup_string(pl, "lingen_checkpoint_save_gathered");
59 }
60 
interpret_parameters(cxx_param_list & pl)61 void lingen_checkpoint::interpret_parameters(cxx_param_list & pl)
62 {
63     lingen_checkpoint::directory = param_list_lookup_string(pl, "checkpoint-directory");
64     param_list_parse_uint(pl, "checkpoint-threshold", &lingen_checkpoint::threshold);
65     param_list_parse_int(pl, "lingen_checkpoint_save_gathered", &lingen_checkpoint::save_gathered);
66 
67     if (!lingen_checkpoint::directory) return;
68 
69     int rank;
70     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
71 
72     if (!lingen_checkpoint::save_gathered) {
73         int ok = access(lingen_checkpoint::directory, X_OK) == 0;
74         MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
75         if (!ok) {
76             if (!rank)
77                 printf("# Checkpoint directory %s/ does not exist at all ranks, falling back to gathered checkpoints\n", lingen_checkpoint::directory);
78             lingen_checkpoint::save_gathered = 1;
79         }
80     }
81     if (lingen_checkpoint::save_gathered) {
82         int ok = access(lingen_checkpoint::directory, X_OK) == 0;
83         MPI_Bcast(&ok, 1, MPI_INT, 0, MPI_COMM_WORLD);
84         if (!ok) {
85             if (!rank)
86                 printf("# Checkpoint directory %s/ does not exist at rank zero, checkpoint settings ignored\n", lingen_checkpoint::directory);
87             lingen_checkpoint::directory = NULL;
88         }
89     }
90 }
91 
lingen_checkpoint(bmstatus & bm,unsigned int t0,unsigned int t1,int mpi,std::string base)92 lingen_checkpoint::lingen_checkpoint(bmstatus & bm, unsigned int t0, unsigned int t1, int mpi, std::string base)
93     : bm(bm), t0(t0), t1(t1), mpi(mpi)
94 {
95     level = bm.depth();
96     if (mpi)
97         MPI_Comm_rank(bm.com[0], &(rank));
98     else
99         rank = 0;
100     auxfile = base + ".aux";
101     gdatafile = base + ".single.data";
102     sdatafile = base + fmt::sprintf(".%d.data", rank);
103     if (directory) {
104         auxfile = fmt::sprintf("%s/%s", directory, auxfile);
105         gdatafile = fmt::sprintf("%s/%s", directory, gdatafile);
106         sdatafile = fmt::sprintf("%s/%s", directory, sdatafile);
107     }
108 
109     datafile = mpi ? sdatafile : gdatafile;
110 }
111 
get_cp_basename(bmstatus & bm,cp_which which,unsigned int t0,unsigned int t1)112 std::string lingen_checkpoint::get_cp_basename(bmstatus & bm, cp_which which, unsigned int t0, unsigned int t1)
113 {
114     int level = bm.depth();
115     std::string base;
116     switch(which) {
117         case LINGEN_CHECKPOINT_E:
118             base = fmt::sprintf("E.%d.%u", level, t0);
119             break;
120         case LINGEN_CHECKPOINT_PI:
121             base = fmt::sprintf("pi.%d.%u.%u", level, t0, t1);
122             break;
123     }
124     return base;
125 }
126 
127 
save_aux_file(size_t Xsize) const128 bool lingen_checkpoint::save_aux_file(size_t Xsize) const /*{{{*/
129 {
130     bw_dimensions & d = bm.d;
131     unsigned int m = d.m;
132     unsigned int n = d.n;
133     if (rank) return 1;
134     std::ofstream os(auxfile);
135     os << "format " << format << "\n";
136     os << m << " " << n << "\n";
137     os << level << " " << t0 << " " << t1 << " " << bm.t << "\n";
138     os << Xsize << "\n";
139     os << abfield_characteristic_srcptr(bm.d.ab) << "\n";
140     for(unsigned int i = 0 ; i < m + n ; i++) os << " " << bm.delta[i];
141     os << "\n";
142     for(unsigned int i = 0 ; i < m + n ; i++) os << " " << bm.lucky[i];
143     os << "\n";
144     os << bm.done;
145     os << "\n";
146     os << logline_serialize();
147     os << "\n";
148     try {
149         os << bm.hints;
150     } catch (std::runtime_error const & e) {
151         std::string s = e.what();
152         s += " when writing checkpoint";
153         throw std::runtime_error(s);
154     }
155     os << "\n";
156     os << bm.stats;
157     os << "\n";
158     bool ok = os.good();
159     if (!ok)
160         unlink(auxfile.c_str());
161     return ok;
162 }/*}}}*/
163 
164 /* See load_mpi_checkpoint_file_* for the way we reconcile the aux file
165  * that is read at rank 0 and the in-memory structure at all ranks.
166  */
167 
checkpoint_already_present() const168 bool lingen_checkpoint::checkpoint_already_present() const/*{{{*/
169 {
170     lingen_checkpoint test = *this;
171     size_t Xsize;
172     int ok=0;
173     int exc=0;
174 
175     /* Do we have an aux file at rank 0 ? */
176     try {
177         ok = test.load_aux_file(Xsize);     /* rank>0 always returns 1 */
178     } catch (lingen_checkpoint::invalid_aux_file const & inv) {
179         if (!rank)
180         fmt::fprintf(stderr, "Overwriting bogus checkpoint file %s [%s]\n",
181                 datafile, inv.what());
182         exc = 1;
183     }
184     MPI_Allreduce(MPI_IN_PLACE, &exc, 1, MPI_INT, MPI_MAX, bm.com[0]);
185     if (exc) return false;
186 
187     MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
188     if (!ok) return false;
189 
190     /* Do we have a scattered file set (preferred) ? */
191     ok = access(sdatafile.c_str(), R_OK) == 0;
192     MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
193     if (ok) return true;
194 
195     /* Do we have a gathered file ? */
196     ok = rank || access(gdatafile.c_str(), R_OK) == 0;
197     MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
198     if (ok) return true;
199 
200     return false;
201 }/*}}}*/
202 
load_aux_file(size_t & Xsize)203 bool lingen_checkpoint::load_aux_file(size_t & Xsize)/*{{{*/
204 {
205     bmstatus nbm = bm;
206     bw_dimensions & d = bm.d;
207     unsigned int m = d.m;
208     unsigned int n = d.n;
209     if (rank) return 1;
210     std::ifstream is(auxfile);
211     if (!is.good()) return false;
212     std::string hfstring;
213     unsigned long hformat;
214     is >> hfstring >> hformat;
215     if (hfstring != "format")
216         throw invalid_aux_file("checkpoint file cannot be used (version < 1)");
217     if (hformat != format)
218         throw invalid_aux_file(fmt::sprintf("checkpoint file cannot be used (version %lu < %lu)", hformat, format));
219     unsigned int xm,xn;
220     is >> xm >> xn;
221     if (xm != m || xn != n)
222         throw invalid_aux_file(fmt::sprintf("checkpoint file cannot be used (made for (m,n)=(%u,%u)", xm, xn));
223     int xlevel;
224     unsigned int xt0, xt1;
225     is >> xlevel >> xt0 >> xt1 >> target_t;
226     if (xlevel != level || t0 != xt0 || t1 != xt1)
227         throw invalid_aux_file(fmt::sprintf("checkpoint file cannot be used (made for depth=%d t0=%u t1=%u", xlevel, xt0, xt1));
228     ASSERT_ALWAYS(target_t <= t1);
229     is >> Xsize;
230     cxx_mpz xp;
231     is >> xp;
232     if (mpz_cmp(xp, abfield_characteristic_srcptr(bm.d.ab)) != 0)
233         throw invalid_aux_file("checkpoint file cannot be used (made for wrong p)");
234     for(unsigned int i = 0 ; i < m + n ; i++) {
235         is >> nbm.delta[i];
236     }
237     for(unsigned int i = 0 ; i < m + n ; i++) {
238         is >> nbm.lucky[i];
239     }
240     is >> nbm.done;
241     double tt;
242     is >> tt;
243     logline_unserialize(tt);
244     if (!(is >> nbm.hints)) {
245         throw invalid_aux_file("parse error while reading hints file from checkpoint");
246     }
247     for(auto const & x : nbm.hints) {
248         if (!x.second.check()) {
249             is.setstate(std::ios::failbit);
250             throw invalid_aux_file("checkpoint contains invalid schedule information");
251         }
252     }
253 
254     if (bm.hints != nbm.hints) {
255         is.setstate(std::ios::failbit);
256         std::stringstream os;
257         os << bm.hints;
258         throw invalid_aux_file(
259                 fmt::sprintf(
260                     "checkpoint file cannot be used since"
261                     " it was made for another set of schedules (stats"
262                     " would be incorrect)\n"
263                 ) + "textual description of the schedule set that we"
264                     " expect to find:\n"
265                 + os.str());
266     }
267 
268     if (!(is >> nbm.stats))
269         throw invalid_aux_file("stats in checkpoint file cannot be merged with existing stats");
270 
271     bm = std::move(nbm);
272 
273     return is.good();
274 }/*}}}*/
275 
276 /* TODO: adapt for GF(2) */
load_data_file(matpoly & X)277 int lingen_checkpoint::load_data_file(matpoly & X)/*{{{*/
278 {
279     bw_dimensions & d = bm.d;
280     abdst_field ab = d.ab;
281     FILE * data = fopen(datafile.c_str(), "rb");
282     int rc;
283     if (data == NULL) {
284         fmt::fprintf(stderr, "Warning: cannot open %s\n", datafile);
285         return 0;
286     }
287     rc = matpoly_read(ab, data, X, 0, X.get_size(), 0, 0);
288     if (rc != (int) X.get_size()) { fclose(data); return 0; }
289     rc = fclose(data);
290     return rc == 0;
291 }/*}}}*/
292 
293 /* TODO: adapt for GF(2) */
294 /* I think we always have Xsize == X.size, the only questionable
295  * situation is when we're saving part of a big matrix */
save_data_file(matpoly const & X)296 int lingen_checkpoint::save_data_file(matpoly const & X)/*{{{*/
297 {
298     abdst_field ab = bm.d.ab;
299     std::ofstream data(datafile, std::ios_base::out | std::ios_base::binary);
300     int rc;
301     if (!data) {
302         fmt::fprintf(stderr, "Warning: cannot open %s\n", datafile);
303         unlink(auxfile.c_str());
304         return 0;
305     }
306     rc = matpoly_write(ab, data, X, 0, X.get_size(), 0, 0);
307     if (rc != (int) X.get_size()) goto lingen_checkpoint_save_data_file_bailout;
308     if (data.good()) return 1;
309 lingen_checkpoint_save_data_file_bailout:
310     unlink(datafile.c_str());
311     unlink(auxfile.c_str());
312     return 0;
313 }/*}}}*/
314 
315 template<>
load_checkpoint_file(bmstatus & bm,cp_which which,matpoly & X,unsigned int t0,unsigned int t1)316 int load_checkpoint_file<matpoly>(bmstatus & bm, cp_which which, matpoly & X, unsigned int t0, unsigned int t1)/*{{{*/
317 {
318     bw_dimensions & d = bm.d;
319     abdst_field ab = d.ab;
320     unsigned int m = d.m;
321     unsigned int n = d.n;
322 
323     if (!lingen_checkpoint::directory) return 0;
324     if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
325 
326     lingen_checkpoint cp(bm, which, t0, t1, 0);
327 
328     ASSERT_ALWAYS(X.check_pre_init());
329     size_t Xsize;
330     /* Don't output a message just now, since after all it's not
331      * noteworthy if the checkpoint file does not exist. */
332     int ok = 1;
333     try {
334         ok = cp.load_aux_file(Xsize);
335         if (!ok) return false;
336     } catch (lingen_checkpoint::invalid_aux_file const & inv) {
337         fmt::fprintf(stderr, "Error with checkpoint file %s:\n%s\n", cp.datafile, inv.what());
338         abort();
339     }
340     logline_begin(stdout, SIZE_MAX, "Reading %s", cp.datafile.c_str());
341     switch (which) {
342         case LINGEN_CHECKPOINT_E:
343             X = matpoly(ab, m, m+n, Xsize);
344             break;
345         case LINGEN_CHECKPOINT_PI:
346             X = matpoly(ab, m+n, m+n, Xsize);
347             break;
348     }
349     X.set_size(Xsize);
350     ok = cp.load_data_file(X);
351     X.clear_high_word();
352     logline_end(&bm.t_cp_io,"");
353     if (!ok)
354         fmt::fprintf(stderr, "Warning: I/O error while reading %s\n", cp.datafile);
355     bm.t = cp.target_t;
356     return ok;
357 }/*}}}*/
358 
359 template<>
save_checkpoint_file(bmstatus & bm,cp_which which,matpoly const & X,unsigned int t0,unsigned int t1)360 int save_checkpoint_file<matpoly>(bmstatus & bm, cp_which which, matpoly const & X, unsigned int t0, unsigned int t1)/*{{{*/
361 {
362     /* corresponding t is bm.t - E.size ! */
363     if (!lingen_checkpoint::directory) return 0;
364     if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
365     lingen_checkpoint cp(bm, which, t0, t1, 0);
366     if (cp.checkpoint_already_present())
367         return 1;
368     logline_begin(stdout, SIZE_MAX, "Saving %s%s",
369             cp.datafile.c_str(),
370             cp.mpi ? " (MPI, scattered)" : "");
371     int ok = cp.save_aux_file(X.get_size());
372     if (ok) ok = cp.save_data_file(X);
373     logline_end(&bm.t_cp_io,"");
374     if (!ok && !cp.rank)
375         fmt::fprintf(stderr, "Warning: I/O error while saving %s\n", cp.datafile);
376     return ok;
377 }/*}}}*/
378 
379 /* At some point the aux file, which is read at rank 0 only, must bcast
380  * its data to other ranks. We list here the fields of interest.
381  *
382  * Maybe a function that is dedicated to sharing this structure info
383  * would be a good idea.
384     target_t
385     Xsize
386     bm.delta[0]@(m+n)
387     bm.lucky[0]@(m+n)
388     bm.done
389     bm.hints -- should match the expected stuff. no need to share !
390     bm.stats -- not needed at rank>0
391  */
392 
load_mpi_checkpoint_file_scattered(bmstatus & bm,cp_which which,bigmatpoly & X,unsigned int t0,unsigned int t1)393 int load_mpi_checkpoint_file_scattered(bmstatus & bm, cp_which which, bigmatpoly & X, unsigned int t0, unsigned int t1)/*{{{*/
394 {
395     int size;
396     MPI_Comm_size(MPI_COMM_WORLD, &size);
397     if (!lingen_checkpoint::directory) return 0;
398     if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
399     int rank;
400     MPI_Comm_rank(bm.com[0], &rank);
401     bw_dimensions & d = bm.d;
402     abdst_field ab = bm.d.ab;
403     unsigned int m = d.m;
404     unsigned int n = d.n;
405     lingen_checkpoint cp(bm, which, t0, t1, 1);
406     ASSERT_ALWAYS(X.check_pre_init());
407     size_t Xsize;
408     int ok = 1, error = 0;
409     try {
410         ok = cp.load_aux_file(Xsize);
411     } catch (lingen_checkpoint::invalid_aux_file const & inv) {
412         if (!rank)
413             fmt::fprintf(stderr, "Error with checkpoint file %s:\n%s\n", cp.datafile, inv.what());
414         error = 1;
415     }
416     MPI_Bcast(&error, 1, MPI_INT, 0, bm.com[0]);
417     ASSERT_ALWAYS(!error);
418     MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
419     if (!ok) {
420         return false;
421     }
422     MPI_Bcast(&Xsize, 1, CADO_MPI_SIZE_T, 0, bm.com[0]);
423     MPI_Bcast(bm.delta.data(), m + n, MPI_UNSIGNED, 0, bm.com[0]);
424     MPI_Bcast(bm.lucky.data(), m + n, MPI_INT, 0, bm.com[0]);
425     MPI_Bcast(&bm.done, 1, MPI_INT, 0, bm.com[0]);
426     int commsize;
427     MPI_Comm_size(bm.com[0], &commsize);
428     logline_begin(stdout, SIZE_MAX, "Reading %s (MPI, scattered)",
429             cp.datafile.c_str());
430     switch (which) {
431         case LINGEN_CHECKPOINT_E:
432             X.finish_init(ab, m, m+n, Xsize);
433             break;
434         case LINGEN_CHECKPOINT_PI:
435             X.finish_init(ab, m+n, m+n, Xsize);
436             break;
437     }
438     X.set_size(Xsize);
439     ok = cp.load_data_file(X.my_cell());
440     MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
441     logline_end(&bm.t_cp_io,"");
442     if (!ok && !rank) {
443         fmt::fprintf(stderr, "\nWarning: I/O error while reading %s\n",
444                 cp.datafile, Xsize);
445     }
446     bm.t = cp.target_t;
447     MPI_Bcast(&bm.t, 1, MPI_INT, 0, bm.com[0]);
448     return ok;
449 }/*}}}*/
450 
save_mpi_checkpoint_file_scattered(bmstatus & bm,cp_which which,bigmatpoly const & X,unsigned int t0,unsigned int t1)451 int save_mpi_checkpoint_file_scattered(bmstatus & bm, cp_which which, bigmatpoly const & X, unsigned int t0, unsigned int t1)/*{{{*/
452 {
453     /* corresponding t is bm.t - E.size ! */
454     if (!lingen_checkpoint::directory) return 0;
455     if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
456     int rank;
457     MPI_Comm_rank(bm.com[0], &rank);
458     lingen_checkpoint cp(bm, which, t0, t1, 1);
459     int ok;
460     ok = cp.checkpoint_already_present();
461     if (ok) return 1;
462 
463     ok = cp.save_aux_file(X.get_size());
464     MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
465     if (!ok && !rank) unlink(cp.auxfile.c_str());
466     if (ok) {
467         logline_begin(stdout, SIZE_MAX, "Saving %s (MPI, scattered)",
468                 cp.datafile.c_str());
469         ok = cp.save_data_file(X.my_cell());
470         logline_end(&bm.t_cp_io,"");
471         MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
472         if (!ok) {
473             if (!cp.datafile.empty()) unlink(cp.datafile.c_str());
474             if (!rank) unlink(cp.auxfile.c_str());
475         }
476     }
477     if (!ok && !rank) {
478         fmt::fprintf(stderr, "Warning: I/O error while saving %s\n", cp.datafile);
479     }
480     return ok;
481 }/*}}}*/
482 
load_mpi_checkpoint_file_gathered(bmstatus & bm,cp_which which,bigmatpoly & X,unsigned int t0,unsigned int t1)483 int load_mpi_checkpoint_file_gathered(bmstatus & bm, cp_which which, bigmatpoly & X, unsigned int t0, unsigned int t1)/*{{{*/
484 {
485     if (!lingen_checkpoint::directory) return 0;
486     if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
487     int rank;
488     MPI_Comm_rank(bm.com[0], &rank);
489     bw_dimensions & d = bm.d;
490     abdst_field ab = d.ab;
491     unsigned int m = d.m;
492     unsigned int n = d.n;
493     lingen_checkpoint cp(bm, which, t0, t1, 1);
494     cp.datafile = cp.gdatafile;
495     size_t Xsize;
496     int ok = 1, error = 0;
497     try {
498         ok = cp.load_aux_file(Xsize);
499     } catch (lingen_checkpoint::invalid_aux_file const & inv) {
500         error = 1;
501     }
502     MPI_Allreduce(MPI_IN_PLACE, &error, 1, MPI_INT, MPI_MAX, bm.com[0]);
503     ASSERT_ALWAYS(!error);
504     MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
505     if (!ok)
506         /* This can be a normal situation if the .aux file does not
507          * exist.
508          */
509         return false;
510     MPI_Bcast(&Xsize, 1, CADO_MPI_SIZE_T, 0, bm.com[0]);
511     MPI_Bcast(bm.delta.data(), m + n, MPI_UNSIGNED, 0, bm.com[0]);
512     MPI_Bcast(bm.lucky.data(), m + n, MPI_INT, 0, bm.com[0]);
513     MPI_Bcast(&bm.done, 1, MPI_INT, 0, bm.com[0]);
514     logline_begin(stdout, SIZE_MAX, "Reading %s (MPI, gathered)",
515             cp.datafile.c_str());
516     do {
517         FILE * data = NULL;
518         if (!rank) ok = (data = fopen(cp.datafile.c_str(), "rb")) != NULL;
519         MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
520         if (!ok) {
521             if (!rank)
522                 fprintf(stderr, "Warning: cannot open %s\n", cp.datafile.c_str());
523             if (data) free(data);
524             break;
525         }
526 
527         switch (which) {
528             case LINGEN_CHECKPOINT_E:
529                 X.finish_init(ab, m, m+n, Xsize);
530                 break;
531             case LINGEN_CHECKPOINT_PI:
532                 X.finish_init(ab, m+n, m+n, Xsize);
533                 break;
534         }
535         X.set_size(Xsize);
536 
537         double avg = average_matsize(ab, X.m, X.n, 0);
538         unsigned int B = iceildiv(io_matpoly_block_size, avg);
539 
540         /* This is only temp storage ! */
541         matpoly loc(ab, X.m, X.n, B);
542         loc.zero_pad(B);
543 
544         for(unsigned int k = 0 ; ok && k < X.get_size() ; k += B) {
545             unsigned int nc = MIN(B, X.get_size() - k);
546             if (!rank)
547                 ok = matpoly_read(ab, data, loc, 0, nc, 0, 0) == (int) nc;
548             MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
549             X.scatter_mat_partial(loc, 0, k, nc);
550         }
551 
552         if (!rank) {
553             int rc = fclose(data);
554             ok = ok && (rc == 0);
555         }
556     } while (0);
557     MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
558     logline_end(&bm.t_cp_io,"");
559     if (ok)
560         bm.t = cp.target_t;
561     else if (!rank)
562             fmt::fprintf(stderr, "Warning: I/O error while reading %s\n",
563                     cp.datafile);
564     MPI_Bcast(&bm.t, 1, MPI_INT, 0, bm.com[0]);
565 
566     return ok;
567 }/*}}}*/
568 
save_mpi_checkpoint_file_gathered(bmstatus & bm,cp_which which,bigmatpoly const & X,unsigned int t0,unsigned int t1)569 int save_mpi_checkpoint_file_gathered(bmstatus & bm, cp_which which, bigmatpoly const & X, unsigned int t0, unsigned int t1)/*{{{*/
570 {
571     if (!lingen_checkpoint::directory) return 0;
572     if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
573     int rank;
574     MPI_Comm_rank(bm.com[0], &rank);
575     bw_dimensions & d = bm.d;
576     abdst_field ab = d.ab;
577     lingen_checkpoint cp(bm, which, t0, t1, 1);
578     cp.datafile = cp.gdatafile;
579     int ok = cp.checkpoint_already_present();
580     if (ok) return 1;
581     logline_begin(stdout, SIZE_MAX, "Saving %s (MPI, gathered)",
582             cp.datafile.c_str());
583     ok = cp.save_aux_file(X.get_size());
584     MPI_Bcast(&ok, 1, MPI_INT, 0, bm.com[0]);
585     if (ok) {
586         do {
587             std::ofstream data;
588             if (!rank) {
589                 data.open(cp.datafile, std::ios_base::out | std::ios_base::binary);
590                 ok = (bool) data;
591             }
592             MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
593             if (!ok) {
594                 if (!rank)
595                     fmt::fprintf(stderr, "Warning: cannot open %s\n", cp.datafile);
596                 break;
597             }
598 
599             double avg = average_matsize(ab, X.m, X.n, 0);
600             unsigned int B = iceildiv(io_matpoly_block_size, avg);
601 
602             /* This is only temp storage ! */
603             matpoly loc(ab, X.m, X.n, B);
604             loc.zero_pad(B);
605 
606             for(unsigned int k = 0 ; ok && k < X.get_size() ; k += B) {
607                 unsigned int nc = MIN(B, X.get_size() - k);
608                 X.gather_mat_partial(loc, 0, k, nc);
609                 if (!rank)
610                     ok = matpoly_write(ab, data, loc, 0, nc, 0, 0) == (int) nc;
611                 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
612             }
613 
614             if (!rank) {
615                 data.close();
616                 ok = ok && (bool) data;
617             }
618         } while (0);
619         MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
620         if (!ok && !rank) {
621             if (!cp.datafile.empty()) unlink(cp.datafile.c_str());
622             unlink(cp.auxfile.c_str());
623         }
624     }
625     logline_end(&bm.t_cp_io,"");
626     if (!ok && !rank) {
627         fmt::fprintf(stderr, "Warning: I/O error while saving %s\n", cp.datafile);
628     }
629     return ok;
630 }/*}}}*/
631 
632 template<>
load_checkpoint_file(bmstatus & bm,cp_which which,bigmatpoly & X,unsigned int t0,unsigned int t1)633 int load_checkpoint_file<bigmatpoly>(bmstatus & bm, cp_which which, bigmatpoly & X, unsigned int t0, unsigned int t1)/*{{{*/
634 {
635     /* read scattered checkpoint with higher priority if available,
636      * because we like distributed I/O. Otherwise, read gathered
637      * checkpoint if we could find one.
638      */
639     if (!lingen_checkpoint::directory) return 0;
640     if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
641     int rank;
642     MPI_Comm_rank(bm.com[0], &rank);
643     lingen_checkpoint cp(bm, which, t0, t1, 1);
644     int ok = 0;
645     int aux_ok = rank || access(cp.auxfile.c_str(), R_OK) == 0;
646     int sdata_ok = access(cp.sdatafile.c_str(), R_OK) == 0;
647     int scattered_ok = aux_ok && sdata_ok;
648     MPI_Allreduce(MPI_IN_PLACE, &scattered_ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
649     if (scattered_ok) {
650         ok = load_mpi_checkpoint_file_scattered(bm, which, X, t0, t1);
651         if (ok) return ok;
652     }
653     int gdata_ok = rank || access(cp.gdatafile.c_str(), R_OK) == 0;
654     int gathered_ok = aux_ok && gdata_ok;
655     MPI_Bcast(&gathered_ok, 1, MPI_INT, 0, bm.com[0]);
656     if (gathered_ok) {
657         ok = load_mpi_checkpoint_file_gathered(bm, which, X, t0, t1);
658     }
659     return ok;
660 }/*}}}*/
661 
662 template<>
save_checkpoint_file(bmstatus & bm,cp_which which,bigmatpoly const & X,unsigned int t0,unsigned int t1)663 int save_checkpoint_file<bigmatpoly>(bmstatus & bm, cp_which which, bigmatpoly const & X, unsigned int t0, unsigned int t1)/*{{{*/
664 {
665     if (lingen_checkpoint::save_gathered) {
666         return save_mpi_checkpoint_file_gathered(bm, which, X, t0, t1);
667     } else {
668         return save_mpi_checkpoint_file_scattered(bm, which, X, t0, t1);
669     }
670 }/*}}}*/
671