1 #include "cado.h"
2 // IWYU pragma: no_include "mpfq_fake.hpp"
3 // IWYU pragma: no_include "mpfq_layer.h"
4 // IWYU pragma: no_include <sys/param.h>
5 #include <cstdio>
6 #include <istream> // IWYU pragma: keep
7 #include <fstream> // IWYU pragma: keep
8 #include <gmp.h> // for operator<<, mpz_cmp
9 #include <stdint.h> // for SIZE_MAX
10 #include <stdlib.h> // for abort, free
11 #include <unistd.h> // for unlink, access, R_OK, X_OK
12 #include <map> // for operator!=, map
13 #include <utility> // for move, pair
14 #include <vector> // for vector
15 #include "fmt/core.h" // for check_format_string, char_t
16 #include "fmt/format.h" // for basic_buffer::append, basic_pa...
17 #include "lingen_abfield.hpp"
18 #include "lingen_bw_dimensions.hpp"
19 #include "lingen_call_companion.hpp" // for lingen_call_companion, lingen_...
20 #include "lingen_hints.hpp" // for operator<<, lingen_hints, oper...
21 #include "lingen_matpoly_select.hpp" // for matpoly
22 #include "macros.h" // for ASSERT_ALWAYS, MIN, iceildiv
23 #include "params.h" // for cxx_param_list, param_list_loo...
24 #include "select_mpi.h" // for MPI_Allreduce, MPI_Bcast, MPI_...
25 #include "tree_stats.hpp" // for operator<<, operator>>
26 #include "lingen_bmstatus.hpp"
27 #include "lingen_checkpoints.hpp"
28 #include "lingen_io_matpoly.hpp"
29 #include "lingen_average_matsize.hpp"
30 #include "logline.h"
31 #include "fmt/printf.h" // IWYU pragma: keep
32 #include "cxx_mpz.hpp"
33
34 /* Checkpoints */
35
36 const char * lingen_checkpoint::directory = NULL;
37 unsigned int lingen_checkpoint::threshold = 100;
38 int lingen_checkpoint::save_gathered = 0;
39 constexpr unsigned long lingen_checkpoint::format;
40
41 /* There's much copy-paste here */
42
43
decl_usage(cxx_param_list & pl)44 void lingen_checkpoint::decl_usage(cxx_param_list & pl)
45 {
46 param_list_decl_usage(pl, "checkpoint-directory",
47 "where to save checkpoints");
48 param_list_decl_usage(pl, "checkpoint-threshold",
49 "threshold for saving checkpoints");
50 param_list_decl_usage(pl, "lingen_checkpoint_save_gathered",
51 "save global checkpoints files, instead of per-job files");
52 }
53
lookup_parameters(cxx_param_list & pl)54 void lingen_checkpoint::lookup_parameters(cxx_param_list & pl)
55 {
56 param_list_lookup_string(pl, "checkpoint-directory");
57 param_list_lookup_string(pl, "checkpoint-threshold");
58 param_list_lookup_string(pl, "lingen_checkpoint_save_gathered");
59 }
60
interpret_parameters(cxx_param_list & pl)61 void lingen_checkpoint::interpret_parameters(cxx_param_list & pl)
62 {
63 lingen_checkpoint::directory = param_list_lookup_string(pl, "checkpoint-directory");
64 param_list_parse_uint(pl, "checkpoint-threshold", &lingen_checkpoint::threshold);
65 param_list_parse_int(pl, "lingen_checkpoint_save_gathered", &lingen_checkpoint::save_gathered);
66
67 if (!lingen_checkpoint::directory) return;
68
69 int rank;
70 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
71
72 if (!lingen_checkpoint::save_gathered) {
73 int ok = access(lingen_checkpoint::directory, X_OK) == 0;
74 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
75 if (!ok) {
76 if (!rank)
77 printf("# Checkpoint directory %s/ does not exist at all ranks, falling back to gathered checkpoints\n", lingen_checkpoint::directory);
78 lingen_checkpoint::save_gathered = 1;
79 }
80 }
81 if (lingen_checkpoint::save_gathered) {
82 int ok = access(lingen_checkpoint::directory, X_OK) == 0;
83 MPI_Bcast(&ok, 1, MPI_INT, 0, MPI_COMM_WORLD);
84 if (!ok) {
85 if (!rank)
86 printf("# Checkpoint directory %s/ does not exist at rank zero, checkpoint settings ignored\n", lingen_checkpoint::directory);
87 lingen_checkpoint::directory = NULL;
88 }
89 }
90 }
91
lingen_checkpoint(bmstatus & bm,unsigned int t0,unsigned int t1,int mpi,std::string base)92 lingen_checkpoint::lingen_checkpoint(bmstatus & bm, unsigned int t0, unsigned int t1, int mpi, std::string base)
93 : bm(bm), t0(t0), t1(t1), mpi(mpi)
94 {
95 level = bm.depth();
96 if (mpi)
97 MPI_Comm_rank(bm.com[0], &(rank));
98 else
99 rank = 0;
100 auxfile = base + ".aux";
101 gdatafile = base + ".single.data";
102 sdatafile = base + fmt::sprintf(".%d.data", rank);
103 if (directory) {
104 auxfile = fmt::sprintf("%s/%s", directory, auxfile);
105 gdatafile = fmt::sprintf("%s/%s", directory, gdatafile);
106 sdatafile = fmt::sprintf("%s/%s", directory, sdatafile);
107 }
108
109 datafile = mpi ? sdatafile : gdatafile;
110 }
111
get_cp_basename(bmstatus & bm,cp_which which,unsigned int t0,unsigned int t1)112 std::string lingen_checkpoint::get_cp_basename(bmstatus & bm, cp_which which, unsigned int t0, unsigned int t1)
113 {
114 int level = bm.depth();
115 std::string base;
116 switch(which) {
117 case LINGEN_CHECKPOINT_E:
118 base = fmt::sprintf("E.%d.%u", level, t0);
119 break;
120 case LINGEN_CHECKPOINT_PI:
121 base = fmt::sprintf("pi.%d.%u.%u", level, t0, t1);
122 break;
123 }
124 return base;
125 }
126
127
save_aux_file(size_t Xsize) const128 bool lingen_checkpoint::save_aux_file(size_t Xsize) const /*{{{*/
129 {
130 bw_dimensions & d = bm.d;
131 unsigned int m = d.m;
132 unsigned int n = d.n;
133 if (rank) return 1;
134 std::ofstream os(auxfile);
135 os << "format " << format << "\n";
136 os << m << " " << n << "\n";
137 os << level << " " << t0 << " " << t1 << " " << bm.t << "\n";
138 os << Xsize << "\n";
139 os << abfield_characteristic_srcptr(bm.d.ab) << "\n";
140 for(unsigned int i = 0 ; i < m + n ; i++) os << " " << bm.delta[i];
141 os << "\n";
142 for(unsigned int i = 0 ; i < m + n ; i++) os << " " << bm.lucky[i];
143 os << "\n";
144 os << bm.done;
145 os << "\n";
146 os << logline_serialize();
147 os << "\n";
148 try {
149 os << bm.hints;
150 } catch (std::runtime_error const & e) {
151 std::string s = e.what();
152 s += " when writing checkpoint";
153 throw std::runtime_error(s);
154 }
155 os << "\n";
156 os << bm.stats;
157 os << "\n";
158 bool ok = os.good();
159 if (!ok)
160 unlink(auxfile.c_str());
161 return ok;
162 }/*}}}*/
163
164 /* See load_mpi_checkpoint_file_* for the way we reconcile the aux file
165 * that is read at rank 0 and the in-memory structure at all ranks.
166 */
167
checkpoint_already_present() const168 bool lingen_checkpoint::checkpoint_already_present() const/*{{{*/
169 {
170 lingen_checkpoint test = *this;
171 size_t Xsize;
172 int ok=0;
173 int exc=0;
174
175 /* Do we have an aux file at rank 0 ? */
176 try {
177 ok = test.load_aux_file(Xsize); /* rank>0 always returns 1 */
178 } catch (lingen_checkpoint::invalid_aux_file const & inv) {
179 if (!rank)
180 fmt::fprintf(stderr, "Overwriting bogus checkpoint file %s [%s]\n",
181 datafile, inv.what());
182 exc = 1;
183 }
184 MPI_Allreduce(MPI_IN_PLACE, &exc, 1, MPI_INT, MPI_MAX, bm.com[0]);
185 if (exc) return false;
186
187 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
188 if (!ok) return false;
189
190 /* Do we have a scattered file set (preferred) ? */
191 ok = access(sdatafile.c_str(), R_OK) == 0;
192 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
193 if (ok) return true;
194
195 /* Do we have a gathered file ? */
196 ok = rank || access(gdatafile.c_str(), R_OK) == 0;
197 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
198 if (ok) return true;
199
200 return false;
201 }/*}}}*/
202
load_aux_file(size_t & Xsize)203 bool lingen_checkpoint::load_aux_file(size_t & Xsize)/*{{{*/
204 {
205 bmstatus nbm = bm;
206 bw_dimensions & d = bm.d;
207 unsigned int m = d.m;
208 unsigned int n = d.n;
209 if (rank) return 1;
210 std::ifstream is(auxfile);
211 if (!is.good()) return false;
212 std::string hfstring;
213 unsigned long hformat;
214 is >> hfstring >> hformat;
215 if (hfstring != "format")
216 throw invalid_aux_file("checkpoint file cannot be used (version < 1)");
217 if (hformat != format)
218 throw invalid_aux_file(fmt::sprintf("checkpoint file cannot be used (version %lu < %lu)", hformat, format));
219 unsigned int xm,xn;
220 is >> xm >> xn;
221 if (xm != m || xn != n)
222 throw invalid_aux_file(fmt::sprintf("checkpoint file cannot be used (made for (m,n)=(%u,%u)", xm, xn));
223 int xlevel;
224 unsigned int xt0, xt1;
225 is >> xlevel >> xt0 >> xt1 >> target_t;
226 if (xlevel != level || t0 != xt0 || t1 != xt1)
227 throw invalid_aux_file(fmt::sprintf("checkpoint file cannot be used (made for depth=%d t0=%u t1=%u", xlevel, xt0, xt1));
228 ASSERT_ALWAYS(target_t <= t1);
229 is >> Xsize;
230 cxx_mpz xp;
231 is >> xp;
232 if (mpz_cmp(xp, abfield_characteristic_srcptr(bm.d.ab)) != 0)
233 throw invalid_aux_file("checkpoint file cannot be used (made for wrong p)");
234 for(unsigned int i = 0 ; i < m + n ; i++) {
235 is >> nbm.delta[i];
236 }
237 for(unsigned int i = 0 ; i < m + n ; i++) {
238 is >> nbm.lucky[i];
239 }
240 is >> nbm.done;
241 double tt;
242 is >> tt;
243 logline_unserialize(tt);
244 if (!(is >> nbm.hints)) {
245 throw invalid_aux_file("parse error while reading hints file from checkpoint");
246 }
247 for(auto const & x : nbm.hints) {
248 if (!x.second.check()) {
249 is.setstate(std::ios::failbit);
250 throw invalid_aux_file("checkpoint contains invalid schedule information");
251 }
252 }
253
254 if (bm.hints != nbm.hints) {
255 is.setstate(std::ios::failbit);
256 std::stringstream os;
257 os << bm.hints;
258 throw invalid_aux_file(
259 fmt::sprintf(
260 "checkpoint file cannot be used since"
261 " it was made for another set of schedules (stats"
262 " would be incorrect)\n"
263 ) + "textual description of the schedule set that we"
264 " expect to find:\n"
265 + os.str());
266 }
267
268 if (!(is >> nbm.stats))
269 throw invalid_aux_file("stats in checkpoint file cannot be merged with existing stats");
270
271 bm = std::move(nbm);
272
273 return is.good();
274 }/*}}}*/
275
276 /* TODO: adapt for GF(2) */
load_data_file(matpoly & X)277 int lingen_checkpoint::load_data_file(matpoly & X)/*{{{*/
278 {
279 bw_dimensions & d = bm.d;
280 abdst_field ab = d.ab;
281 FILE * data = fopen(datafile.c_str(), "rb");
282 int rc;
283 if (data == NULL) {
284 fmt::fprintf(stderr, "Warning: cannot open %s\n", datafile);
285 return 0;
286 }
287 rc = matpoly_read(ab, data, X, 0, X.get_size(), 0, 0);
288 if (rc != (int) X.get_size()) { fclose(data); return 0; }
289 rc = fclose(data);
290 return rc == 0;
291 }/*}}}*/
292
293 /* TODO: adapt for GF(2) */
294 /* I think we always have Xsize == X.size, the only questionable
295 * situation is when we're saving part of a big matrix */
save_data_file(matpoly const & X)296 int lingen_checkpoint::save_data_file(matpoly const & X)/*{{{*/
297 {
298 abdst_field ab = bm.d.ab;
299 std::ofstream data(datafile, std::ios_base::out | std::ios_base::binary);
300 int rc;
301 if (!data) {
302 fmt::fprintf(stderr, "Warning: cannot open %s\n", datafile);
303 unlink(auxfile.c_str());
304 return 0;
305 }
306 rc = matpoly_write(ab, data, X, 0, X.get_size(), 0, 0);
307 if (rc != (int) X.get_size()) goto lingen_checkpoint_save_data_file_bailout;
308 if (data.good()) return 1;
309 lingen_checkpoint_save_data_file_bailout:
310 unlink(datafile.c_str());
311 unlink(auxfile.c_str());
312 return 0;
313 }/*}}}*/
314
315 template<>
load_checkpoint_file(bmstatus & bm,cp_which which,matpoly & X,unsigned int t0,unsigned int t1)316 int load_checkpoint_file<matpoly>(bmstatus & bm, cp_which which, matpoly & X, unsigned int t0, unsigned int t1)/*{{{*/
317 {
318 bw_dimensions & d = bm.d;
319 abdst_field ab = d.ab;
320 unsigned int m = d.m;
321 unsigned int n = d.n;
322
323 if (!lingen_checkpoint::directory) return 0;
324 if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
325
326 lingen_checkpoint cp(bm, which, t0, t1, 0);
327
328 ASSERT_ALWAYS(X.check_pre_init());
329 size_t Xsize;
330 /* Don't output a message just now, since after all it's not
331 * noteworthy if the checkpoint file does not exist. */
332 int ok = 1;
333 try {
334 ok = cp.load_aux_file(Xsize);
335 if (!ok) return false;
336 } catch (lingen_checkpoint::invalid_aux_file const & inv) {
337 fmt::fprintf(stderr, "Error with checkpoint file %s:\n%s\n", cp.datafile, inv.what());
338 abort();
339 }
340 logline_begin(stdout, SIZE_MAX, "Reading %s", cp.datafile.c_str());
341 switch (which) {
342 case LINGEN_CHECKPOINT_E:
343 X = matpoly(ab, m, m+n, Xsize);
344 break;
345 case LINGEN_CHECKPOINT_PI:
346 X = matpoly(ab, m+n, m+n, Xsize);
347 break;
348 }
349 X.set_size(Xsize);
350 ok = cp.load_data_file(X);
351 X.clear_high_word();
352 logline_end(&bm.t_cp_io,"");
353 if (!ok)
354 fmt::fprintf(stderr, "Warning: I/O error while reading %s\n", cp.datafile);
355 bm.t = cp.target_t;
356 return ok;
357 }/*}}}*/
358
359 template<>
save_checkpoint_file(bmstatus & bm,cp_which which,matpoly const & X,unsigned int t0,unsigned int t1)360 int save_checkpoint_file<matpoly>(bmstatus & bm, cp_which which, matpoly const & X, unsigned int t0, unsigned int t1)/*{{{*/
361 {
362 /* corresponding t is bm.t - E.size ! */
363 if (!lingen_checkpoint::directory) return 0;
364 if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
365 lingen_checkpoint cp(bm, which, t0, t1, 0);
366 if (cp.checkpoint_already_present())
367 return 1;
368 logline_begin(stdout, SIZE_MAX, "Saving %s%s",
369 cp.datafile.c_str(),
370 cp.mpi ? " (MPI, scattered)" : "");
371 int ok = cp.save_aux_file(X.get_size());
372 if (ok) ok = cp.save_data_file(X);
373 logline_end(&bm.t_cp_io,"");
374 if (!ok && !cp.rank)
375 fmt::fprintf(stderr, "Warning: I/O error while saving %s\n", cp.datafile);
376 return ok;
377 }/*}}}*/
378
379 /* At some point the aux file, which is read at rank 0 only, must bcast
380 * its data to other ranks. We list here the fields of interest.
381 *
382 * Maybe a function that is dedicated to sharing this structure info
383 * would be a good idea.
384 target_t
385 Xsize
386 bm.delta[0]@(m+n)
387 bm.lucky[0]@(m+n)
388 bm.done
389 bm.hints -- should match the expected stuff. no need to share !
390 bm.stats -- not needed at rank>0
391 */
392
load_mpi_checkpoint_file_scattered(bmstatus & bm,cp_which which,bigmatpoly & X,unsigned int t0,unsigned int t1)393 int load_mpi_checkpoint_file_scattered(bmstatus & bm, cp_which which, bigmatpoly & X, unsigned int t0, unsigned int t1)/*{{{*/
394 {
395 int size;
396 MPI_Comm_size(MPI_COMM_WORLD, &size);
397 if (!lingen_checkpoint::directory) return 0;
398 if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
399 int rank;
400 MPI_Comm_rank(bm.com[0], &rank);
401 bw_dimensions & d = bm.d;
402 abdst_field ab = bm.d.ab;
403 unsigned int m = d.m;
404 unsigned int n = d.n;
405 lingen_checkpoint cp(bm, which, t0, t1, 1);
406 ASSERT_ALWAYS(X.check_pre_init());
407 size_t Xsize;
408 int ok = 1, error = 0;
409 try {
410 ok = cp.load_aux_file(Xsize);
411 } catch (lingen_checkpoint::invalid_aux_file const & inv) {
412 if (!rank)
413 fmt::fprintf(stderr, "Error with checkpoint file %s:\n%s\n", cp.datafile, inv.what());
414 error = 1;
415 }
416 MPI_Bcast(&error, 1, MPI_INT, 0, bm.com[0]);
417 ASSERT_ALWAYS(!error);
418 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
419 if (!ok) {
420 return false;
421 }
422 MPI_Bcast(&Xsize, 1, CADO_MPI_SIZE_T, 0, bm.com[0]);
423 MPI_Bcast(bm.delta.data(), m + n, MPI_UNSIGNED, 0, bm.com[0]);
424 MPI_Bcast(bm.lucky.data(), m + n, MPI_INT, 0, bm.com[0]);
425 MPI_Bcast(&bm.done, 1, MPI_INT, 0, bm.com[0]);
426 int commsize;
427 MPI_Comm_size(bm.com[0], &commsize);
428 logline_begin(stdout, SIZE_MAX, "Reading %s (MPI, scattered)",
429 cp.datafile.c_str());
430 switch (which) {
431 case LINGEN_CHECKPOINT_E:
432 X.finish_init(ab, m, m+n, Xsize);
433 break;
434 case LINGEN_CHECKPOINT_PI:
435 X.finish_init(ab, m+n, m+n, Xsize);
436 break;
437 }
438 X.set_size(Xsize);
439 ok = cp.load_data_file(X.my_cell());
440 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
441 logline_end(&bm.t_cp_io,"");
442 if (!ok && !rank) {
443 fmt::fprintf(stderr, "\nWarning: I/O error while reading %s\n",
444 cp.datafile, Xsize);
445 }
446 bm.t = cp.target_t;
447 MPI_Bcast(&bm.t, 1, MPI_INT, 0, bm.com[0]);
448 return ok;
449 }/*}}}*/
450
save_mpi_checkpoint_file_scattered(bmstatus & bm,cp_which which,bigmatpoly const & X,unsigned int t0,unsigned int t1)451 int save_mpi_checkpoint_file_scattered(bmstatus & bm, cp_which which, bigmatpoly const & X, unsigned int t0, unsigned int t1)/*{{{*/
452 {
453 /* corresponding t is bm.t - E.size ! */
454 if (!lingen_checkpoint::directory) return 0;
455 if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
456 int rank;
457 MPI_Comm_rank(bm.com[0], &rank);
458 lingen_checkpoint cp(bm, which, t0, t1, 1);
459 int ok;
460 ok = cp.checkpoint_already_present();
461 if (ok) return 1;
462
463 ok = cp.save_aux_file(X.get_size());
464 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
465 if (!ok && !rank) unlink(cp.auxfile.c_str());
466 if (ok) {
467 logline_begin(stdout, SIZE_MAX, "Saving %s (MPI, scattered)",
468 cp.datafile.c_str());
469 ok = cp.save_data_file(X.my_cell());
470 logline_end(&bm.t_cp_io,"");
471 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
472 if (!ok) {
473 if (!cp.datafile.empty()) unlink(cp.datafile.c_str());
474 if (!rank) unlink(cp.auxfile.c_str());
475 }
476 }
477 if (!ok && !rank) {
478 fmt::fprintf(stderr, "Warning: I/O error while saving %s\n", cp.datafile);
479 }
480 return ok;
481 }/*}}}*/
482
load_mpi_checkpoint_file_gathered(bmstatus & bm,cp_which which,bigmatpoly & X,unsigned int t0,unsigned int t1)483 int load_mpi_checkpoint_file_gathered(bmstatus & bm, cp_which which, bigmatpoly & X, unsigned int t0, unsigned int t1)/*{{{*/
484 {
485 if (!lingen_checkpoint::directory) return 0;
486 if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
487 int rank;
488 MPI_Comm_rank(bm.com[0], &rank);
489 bw_dimensions & d = bm.d;
490 abdst_field ab = d.ab;
491 unsigned int m = d.m;
492 unsigned int n = d.n;
493 lingen_checkpoint cp(bm, which, t0, t1, 1);
494 cp.datafile = cp.gdatafile;
495 size_t Xsize;
496 int ok = 1, error = 0;
497 try {
498 ok = cp.load_aux_file(Xsize);
499 } catch (lingen_checkpoint::invalid_aux_file const & inv) {
500 error = 1;
501 }
502 MPI_Allreduce(MPI_IN_PLACE, &error, 1, MPI_INT, MPI_MAX, bm.com[0]);
503 ASSERT_ALWAYS(!error);
504 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
505 if (!ok)
506 /* This can be a normal situation if the .aux file does not
507 * exist.
508 */
509 return false;
510 MPI_Bcast(&Xsize, 1, CADO_MPI_SIZE_T, 0, bm.com[0]);
511 MPI_Bcast(bm.delta.data(), m + n, MPI_UNSIGNED, 0, bm.com[0]);
512 MPI_Bcast(bm.lucky.data(), m + n, MPI_INT, 0, bm.com[0]);
513 MPI_Bcast(&bm.done, 1, MPI_INT, 0, bm.com[0]);
514 logline_begin(stdout, SIZE_MAX, "Reading %s (MPI, gathered)",
515 cp.datafile.c_str());
516 do {
517 FILE * data = NULL;
518 if (!rank) ok = (data = fopen(cp.datafile.c_str(), "rb")) != NULL;
519 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
520 if (!ok) {
521 if (!rank)
522 fprintf(stderr, "Warning: cannot open %s\n", cp.datafile.c_str());
523 if (data) free(data);
524 break;
525 }
526
527 switch (which) {
528 case LINGEN_CHECKPOINT_E:
529 X.finish_init(ab, m, m+n, Xsize);
530 break;
531 case LINGEN_CHECKPOINT_PI:
532 X.finish_init(ab, m+n, m+n, Xsize);
533 break;
534 }
535 X.set_size(Xsize);
536
537 double avg = average_matsize(ab, X.m, X.n, 0);
538 unsigned int B = iceildiv(io_matpoly_block_size, avg);
539
540 /* This is only temp storage ! */
541 matpoly loc(ab, X.m, X.n, B);
542 loc.zero_pad(B);
543
544 for(unsigned int k = 0 ; ok && k < X.get_size() ; k += B) {
545 unsigned int nc = MIN(B, X.get_size() - k);
546 if (!rank)
547 ok = matpoly_read(ab, data, loc, 0, nc, 0, 0) == (int) nc;
548 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
549 X.scatter_mat_partial(loc, 0, k, nc);
550 }
551
552 if (!rank) {
553 int rc = fclose(data);
554 ok = ok && (rc == 0);
555 }
556 } while (0);
557 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
558 logline_end(&bm.t_cp_io,"");
559 if (ok)
560 bm.t = cp.target_t;
561 else if (!rank)
562 fmt::fprintf(stderr, "Warning: I/O error while reading %s\n",
563 cp.datafile);
564 MPI_Bcast(&bm.t, 1, MPI_INT, 0, bm.com[0]);
565
566 return ok;
567 }/*}}}*/
568
save_mpi_checkpoint_file_gathered(bmstatus & bm,cp_which which,bigmatpoly const & X,unsigned int t0,unsigned int t1)569 int save_mpi_checkpoint_file_gathered(bmstatus & bm, cp_which which, bigmatpoly const & X, unsigned int t0, unsigned int t1)/*{{{*/
570 {
571 if (!lingen_checkpoint::directory) return 0;
572 if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
573 int rank;
574 MPI_Comm_rank(bm.com[0], &rank);
575 bw_dimensions & d = bm.d;
576 abdst_field ab = d.ab;
577 lingen_checkpoint cp(bm, which, t0, t1, 1);
578 cp.datafile = cp.gdatafile;
579 int ok = cp.checkpoint_already_present();
580 if (ok) return 1;
581 logline_begin(stdout, SIZE_MAX, "Saving %s (MPI, gathered)",
582 cp.datafile.c_str());
583 ok = cp.save_aux_file(X.get_size());
584 MPI_Bcast(&ok, 1, MPI_INT, 0, bm.com[0]);
585 if (ok) {
586 do {
587 std::ofstream data;
588 if (!rank) {
589 data.open(cp.datafile, std::ios_base::out | std::ios_base::binary);
590 ok = (bool) data;
591 }
592 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
593 if (!ok) {
594 if (!rank)
595 fmt::fprintf(stderr, "Warning: cannot open %s\n", cp.datafile);
596 break;
597 }
598
599 double avg = average_matsize(ab, X.m, X.n, 0);
600 unsigned int B = iceildiv(io_matpoly_block_size, avg);
601
602 /* This is only temp storage ! */
603 matpoly loc(ab, X.m, X.n, B);
604 loc.zero_pad(B);
605
606 for(unsigned int k = 0 ; ok && k < X.get_size() ; k += B) {
607 unsigned int nc = MIN(B, X.get_size() - k);
608 X.gather_mat_partial(loc, 0, k, nc);
609 if (!rank)
610 ok = matpoly_write(ab, data, loc, 0, nc, 0, 0) == (int) nc;
611 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
612 }
613
614 if (!rank) {
615 data.close();
616 ok = ok && (bool) data;
617 }
618 } while (0);
619 MPI_Allreduce(MPI_IN_PLACE, &ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
620 if (!ok && !rank) {
621 if (!cp.datafile.empty()) unlink(cp.datafile.c_str());
622 unlink(cp.auxfile.c_str());
623 }
624 }
625 logline_end(&bm.t_cp_io,"");
626 if (!ok && !rank) {
627 fmt::fprintf(stderr, "Warning: I/O error while saving %s\n", cp.datafile);
628 }
629 return ok;
630 }/*}}}*/
631
632 template<>
load_checkpoint_file(bmstatus & bm,cp_which which,bigmatpoly & X,unsigned int t0,unsigned int t1)633 int load_checkpoint_file<bigmatpoly>(bmstatus & bm, cp_which which, bigmatpoly & X, unsigned int t0, unsigned int t1)/*{{{*/
634 {
635 /* read scattered checkpoint with higher priority if available,
636 * because we like distributed I/O. Otherwise, read gathered
637 * checkpoint if we could find one.
638 */
639 if (!lingen_checkpoint::directory) return 0;
640 if ((t1 - t0) < lingen_checkpoint::threshold) return 0;
641 int rank;
642 MPI_Comm_rank(bm.com[0], &rank);
643 lingen_checkpoint cp(bm, which, t0, t1, 1);
644 int ok = 0;
645 int aux_ok = rank || access(cp.auxfile.c_str(), R_OK) == 0;
646 int sdata_ok = access(cp.sdatafile.c_str(), R_OK) == 0;
647 int scattered_ok = aux_ok && sdata_ok;
648 MPI_Allreduce(MPI_IN_PLACE, &scattered_ok, 1, MPI_INT, MPI_MIN, bm.com[0]);
649 if (scattered_ok) {
650 ok = load_mpi_checkpoint_file_scattered(bm, which, X, t0, t1);
651 if (ok) return ok;
652 }
653 int gdata_ok = rank || access(cp.gdatafile.c_str(), R_OK) == 0;
654 int gathered_ok = aux_ok && gdata_ok;
655 MPI_Bcast(&gathered_ok, 1, MPI_INT, 0, bm.com[0]);
656 if (gathered_ok) {
657 ok = load_mpi_checkpoint_file_gathered(bm, which, X, t0, t1);
658 }
659 return ok;
660 }/*}}}*/
661
662 template<>
save_checkpoint_file(bmstatus & bm,cp_which which,bigmatpoly const & X,unsigned int t0,unsigned int t1)663 int save_checkpoint_file<bigmatpoly>(bmstatus & bm, cp_which which, bigmatpoly const & X, unsigned int t0, unsigned int t1)/*{{{*/
664 {
665 if (lingen_checkpoint::save_gathered) {
666 return save_mpi_checkpoint_file_gathered(bm, which, X, t0, t1);
667 } else {
668 return save_mpi_checkpoint_file_scattered(bm, which, X, t0, t1);
669 }
670 }/*}}}*/
671