1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
2 /*
3 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2017 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2005 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2012 The University of Wisconsin-La Crosse. All rights
14 * reserved.
15 * $COPYRIGHT$
16 *
17 * Additional copyrights may follow
18 *
19 * $HEADER$
20 */
21
22 /** @file
23 *
24 * OMPI Layer Checkpoint/Restart Runtime functions
25 *
26 */
27
28 #include "ompi_config.h"
29
30 #include <errno.h>
31 #ifdef HAVE_UNISTD_H
32 #include <unistd.h>
33 #endif /* HAVE_UNISTD_H */
34 #ifdef HAVE_FCNTL_H
35 #include <fcntl.h>
36 #endif /* HAVE_FCNTL_H */
37 #ifdef HAVE_SYS_TYPES_H
38 #include <sys/types.h>
39 #endif /* HAVE_SYS_TYPES_H */
40 #ifdef HAVE_SYS_STAT_H
41 #include <sys/stat.h> /* for mkfifo */
42 #endif /* HAVE_SYS_STAT_H */
43
44 #include "opal/mca/event/event.h"
45 #include "opal/util/output.h"
46 #include "opal/mca/crs/crs.h"
47 #include "opal/mca/crs/base/base.h"
48 #include "opal/mca/installdirs/installdirs.h"
49 #include "opal/runtime/opal_cr.h"
50 #include "opal/mca/btl/base/base.h"
51
52 #if OPAL_ENABLE_FT_CR == 1
53 #include "orte/mca/snapc/snapc.h"
54 #include "orte/mca/snapc/base/base.h"
55 #endif
56
57 #include "ompi/constants.h"
58 #include "ompi/mca/pml/pml.h"
59 #include "ompi/mca/pml/base/base.h"
60 #include "ompi/mca/crcp/crcp.h"
61 #include "ompi/mca/crcp/base/base.h"
62 #include "ompi/communicator/communicator.h"
63 #include "ompi/runtime/ompi_cr.h"
64 #if OPAL_ENABLE_CRDEBUG == 1
65 #include "ompi/debuggers/debuggers.h"
66 #endif
67
/*
 * Symbols exported for C/R-enabled debugging; only defined when
 * OPAL_ENABLE_CRDEBUG is 1.  ompi_cr_init() fills them in when
 * MPIR_debug_with_checkpoint is set:
 *  - MPIR_checkpointable is set to 1,
 *  - the three command strings are built from opal_install_dirs.bindir,
 *  - MPIR_controller_hostname is derived from the HNP URI, or "localhost".
 */
68 #if OPAL_ENABLE_CRDEBUG == 1
69 OMPI_DECLSPEC int MPIR_checkpointable = 0;
70 OMPI_DECLSPEC char * MPIR_controller_hostname = NULL;
71 OMPI_DECLSPEC char * MPIR_checkpoint_command = NULL;
72 OMPI_DECLSPEC char * MPIR_restart_command = NULL;
73 OMPI_DECLSPEC char * MPIR_checkpoint_listing_command = NULL;
74 #endif
75
76 /*************
77 * Local functions
78 *************/
/*
 * Phase handlers called by ompi_cr_coord() BEFORE it invokes the previously
 * registered coordination callback.
 */
79 static int ompi_cr_coord_pre_ckpt(void);
80 static int ompi_cr_coord_pre_restart(void);
81 static int ompi_cr_coord_pre_continue(void);
82
/*
 * Phase handlers called by ompi_cr_coord() AFTER the previously registered
 * coordination callback has returned successfully.
 */
83 static int ompi_cr_coord_post_ckpt(void);
84 static int ompi_cr_coord_post_restart(void);
85 static int ompi_cr_coord_post_continue(void);
86
87 /*************
88 * Local vars
89 *************/
/*
 * Coordination callback invoked by ompi_cr_coord() between its pre- and
 * post-phase work.  Its address is handed to opal_cr_reg_coord_callback() in
 * ompi_cr_init(); presumably that call stores the previously registered
 * callback here -- confirm against opal_cr.h.
 */
90 static opal_cr_coord_callback_fn_t prev_coord_callback = NULL;
91
/*
 * Output stream id used by every opal_output_verbose() call in this file.
 * ompi_cr_init() sets it to a newly opened stream when ompi_cr_verbosity is
 * non-zero, and to opal_cr_output otherwise.
 */
92 int ompi_cr_output = -1;
/* Storage for the "verbose" MCA variable registered in ompi_cr_init(). */
93 int ompi_cr_verbosity = 0;
94
95 #define NUM_COLLECTIVES 16
96
97 #define SIGNAL(comm, modules, highest_module, msg, ret, func) \
98 do { \
99 bool found = false; \
100 int k; \
101 mca_coll_base_module_t *my_module = \
102 comm->c_coll->coll_ ## func ## _module; \
103 if (NULL != my_module) { \
104 for (k = 0 ; k < highest_module ; ++k) { \
105 if (my_module == modules[k]) found = true; \
106 } \
107 if (!found) { \
108 modules[highest_module++] = my_module; \
109 if (NULL != my_module->ft_event) { \
110 ret = my_module->ft_event(msg); \
111 if( OMPI_SUCCESS != ret ) { \
112 return ret; \
113 } \
114 } \
115 } \
116 } \
117 } while (0)
118
119
120 static int
notify_collectives(int msg)121 notify_collectives(int msg)
122 {
123 mca_coll_base_module_t *modules[NUM_COLLECTIVES];
124 int i, max, ret, highest_module = 0;
125
126 memset(&modules, 0, sizeof(mca_coll_base_module_t*) * NUM_COLLECTIVES);
127
128 max = opal_pointer_array_get_size(&ompi_mpi_communicators);
129 for (i = 0 ; i < max ; ++i) {
130 ompi_communicator_t *comm =
131 (ompi_communicator_t *)opal_pointer_array_get_item(&ompi_mpi_communicators, i);
132 if (NULL == comm) continue;
133
134 SIGNAL(comm, modules, highest_module, msg, ret, allgather);
135 SIGNAL(comm, modules, highest_module, msg, ret, allgatherv);
136 SIGNAL(comm, modules, highest_module, msg, ret, allreduce);
137 SIGNAL(comm, modules, highest_module, msg, ret, alltoall);
138 SIGNAL(comm, modules, highest_module, msg, ret, alltoallv);
139 SIGNAL(comm, modules, highest_module, msg, ret, alltoallw);
140 SIGNAL(comm, modules, highest_module, msg, ret, barrier);
141 SIGNAL(comm, modules, highest_module, msg, ret, bcast);
142 SIGNAL(comm, modules, highest_module, msg, ret, exscan);
143 SIGNAL(comm, modules, highest_module, msg, ret, gather);
144 SIGNAL(comm, modules, highest_module, msg, ret, gatherv);
145 SIGNAL(comm, modules, highest_module, msg, ret, reduce);
146 SIGNAL(comm, modules, highest_module, msg, ret, reduce_scatter);
147 SIGNAL(comm, modules, highest_module, msg, ret, scan);
148 SIGNAL(comm, modules, highest_module, msg, ret, scatter);
149 SIGNAL(comm, modules, highest_module, msg, ret, scatterv);
150 }
151
152 return OMPI_SUCCESS;
153 }
154
155
156 /*
157 * CR Init
158 */
ompi_cr_init(void)159 int ompi_cr_init(void)
160 {
161 /*
162 * Register some MCA variables
163 */
164 ompi_cr_verbosity = 0;
165 (void) mca_base_var_register("ompi", "ompi", "cr", "verbose",
166 "Verbose output for the OMPI Checkpoint/Restart functionality",
167 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
168 OPAL_INFO_LVL_9,
169 MCA_BASE_VAR_SCOPE_READONLY,
170 &ompi_cr_verbosity);
171 if(0 != ompi_cr_verbosity) {
172 ompi_cr_output = opal_output_open(NULL);
173 opal_output_set_verbosity(ompi_cr_output, ompi_cr_verbosity);
174 } else {
175 ompi_cr_output = opal_cr_output;
176 }
177
178 opal_output_verbose(10, ompi_cr_output,
179 "ompi_cr: init: ompi_cr_init()");
180
181 /* Register the OMPI interlevel coordination callback */
182 opal_cr_reg_coord_callback(ompi_cr_coord, &prev_coord_callback);
183
184 #if OPAL_ENABLE_CRDEBUG == 1
185 /* Check for C/R enabled debugging */
186 if( MPIR_debug_with_checkpoint ) {
187 char *uri = NULL;
188 char *sep = NULL;
189 char *hostname = NULL;
190
191 /* Mark as debuggable with C/R */
192 MPIR_checkpointable = 1;
193
194 /* Set the checkpoint and restart commands */
195 /* Add the full path to the binary */
196 asprintf(&MPIR_checkpoint_command,
197 "%s/ompi-checkpoint --crdebug --hnp-jobid %u",
198 opal_install_dirs.bindir,
199 ORTE_PROC_MY_HNP->jobid);
200 asprintf(&MPIR_restart_command,
201 "%s/ompi-restart --crdebug ",
202 opal_install_dirs.bindir);
203 asprintf(&MPIR_checkpoint_listing_command,
204 "%s/ompi-checkpoint -l --crdebug ",
205 opal_install_dirs.bindir);
206
207 /* Set contact information for HNP */
208 uri = strdup(ompi_process_info.my_hnp_uri);
209 hostname = strchr(uri, ';') + 1;
210 sep = strchr(hostname, ';');
211 if (sep) {
212 *sep = 0;
213 }
214 if (strncmp(hostname, "tcp://", 6) == 0) {
215 hostname += 6;
216 sep = strchr(hostname, ':');
217 *sep = 0;
218 MPIR_controller_hostname = strdup(hostname);
219 } else {
220 MPIR_controller_hostname = strdup("localhost");
221 }
222
223 /* Cleanup */
224 if( NULL != uri ) {
225 free(uri);
226 uri = NULL;
227 }
228 }
229 #endif
230
231 return OMPI_SUCCESS;
232 }
233
234 /*
235 * Finalize
236 */
ompi_cr_finalize(void)237 int ompi_cr_finalize(void)
238 {
239 opal_output_verbose(10, ompi_cr_output,
240 "ompi_cr: finalize: ompi_cr_finalize()");
241
242 return OMPI_SUCCESS;
243 }
244
245 /*
246 * Interlayer coordination callback
247 */
ompi_cr_coord(int state)248 int ompi_cr_coord(int state)
249 {
250 int ret, exit_status = OMPI_SUCCESS;
251
252 opal_output_verbose(10, ompi_cr_output,
253 "ompi_cr: coord: ompi_cr_coord(%s)\n",
254 opal_crs_base_state_str((opal_crs_state_type_t)state));
255
256 /*
257 * Before calling the previous callback, we have the opportunity to
258 * take action given the state.
259 */
260 if(OPAL_CRS_CHECKPOINT == state) {
261 /* Do Checkpoint Phase work */
262 ret = ompi_cr_coord_pre_ckpt();
263 if( ret == OMPI_EXISTS) {
264 return ret;
265 }
266 else if( ret != OMPI_SUCCESS) {
267 return ret;
268 }
269 }
270 else if (OPAL_CRS_CONTINUE == state ) {
271 /* Do Continue Phase work */
272 ompi_cr_coord_pre_continue();
273 }
274 else if (OPAL_CRS_RESTART == state ) {
275 /* Do Restart Phase work */
276 ompi_cr_coord_pre_restart();
277 }
278 else if (OPAL_CRS_TERM == state ) {
279 /* Do Continue Phase work in prep to terminate the application */
280 }
281 else {
282 /* We must have been in an error state from the checkpoint
283 * recreate everything, as in the Continue Phase
284 */
285 }
286
287 /*
288 * Call the previous callback, which should be ORTE [which will handle OPAL]
289 */
290 if(OMPI_SUCCESS != (ret = prev_coord_callback(state)) ) {
291 exit_status = ret;
292 goto cleanup;
293 }
294
295
296 /*
297 * After calling the previous callback, we have the opportunity to
298 * take action given the state to tidy up.
299 */
300 if(OPAL_CRS_CHECKPOINT == state) {
301 /* Do Checkpoint Phase work */
302 ompi_cr_coord_post_ckpt();
303 }
304 else if (OPAL_CRS_CONTINUE == state ) {
305 /* Do Continue Phase work */
306 ompi_cr_coord_post_continue();
307
308 #if OPAL_ENABLE_CRDEBUG == 1
309 /*
310 * If C/R enabled debugging,
311 * wait here for debugger to attach
312 */
313 if( MPIR_debug_with_checkpoint ) {
314 MPIR_checkpoint_debugger_breakpoint();
315 }
316 #endif
317 }
318 else if (OPAL_CRS_RESTART == state ) {
319 /* Do Restart Phase work */
320 ompi_cr_coord_post_restart();
321
322 #if OPAL_ENABLE_CRDEBUG == 1
323 /*
324 * If C/R enabled debugging,
325 * wait here for debugger to attach
326 */
327 if( MPIR_debug_with_checkpoint ) {
328 MPIR_checkpoint_debugger_breakpoint();
329 }
330 #endif
331 }
332 else if (OPAL_CRS_TERM == state ) {
333 /* Do Continue Phase work in prep to terminate the application */
334 }
335 else {
336 /* We must have been in an error state from the checkpoint
337 * recreate everything, as in the Continue Phase
338 */
339 }
340
341 cleanup:
342 return exit_status;
343 }
344
345 /*************
346 * Pre Lower Layer
347 *************/
ompi_cr_coord_pre_ckpt(void)348 static int ompi_cr_coord_pre_ckpt(void) {
349 int ret, exit_status = OMPI_SUCCESS;
350
351 /*
352 * All the checkpoint heavey lifting in here...
353 */
354 opal_output_verbose(10, ompi_cr_output,
355 "ompi_cr: coord_pre_ckpt: ompi_cr_coord_pre_ckpt()\n");
356
357 /*
358 * Notify Collectives
359 * - Need to do this on a per communicator basis
360 * Traverse all communicators...
361 */
362 if (OMPI_SUCCESS != (ret = notify_collectives(OPAL_CR_CHECKPOINT))) {
363 goto cleanup;
364 }
365
366 /*
367 * Notify PML
368 * - Will notify BML and BTL's
369 */
370 if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CHECKPOINT))) {
371 exit_status = ret;
372 goto cleanup;
373 }
374
375 cleanup:
376
377 return exit_status;
378 }
379
ompi_cr_coord_pre_restart(void)380 static int ompi_cr_coord_pre_restart(void) {
381 int ret, exit_status = OMPI_SUCCESS;
382
383 opal_output_verbose(10, ompi_cr_output,
384 "ompi_cr: coord_pre_restart: ompi_cr_coord_pre_restart()");
385
386 /*
387 * Notify PML
388 * - Will notify BML and BTL's
389 * - The intention here is to have the PML shutdown all the old components
390 * and handles. On the second pass (once ORTE is restarted) we can
391 * reconnect processes.
392 */
393 if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_RESTART_PRE))) {
394 exit_status = ret;
395 goto cleanup;
396 }
397
398 cleanup:
399 return exit_status;
400 }
401
ompi_cr_coord_pre_continue(void)402 static int ompi_cr_coord_pre_continue(void) {
403 #if OPAL_ENABLE_FT_CR == 1
404 int ret, exit_status = OMPI_SUCCESS;
405
406 /*
407 * Can not really do much until ORTE is up and running,
408 * so defer action until the post_continue function.
409 */
410 opal_output_verbose(10, ompi_cr_output,
411 "ompi_cr: coord_pre_continue: ompi_cr_coord_pre_continue()");
412
413 if (opal_cr_continue_like_restart) {
414 /* Mimic ompi_cr_coord_pre_restart(); */
415 if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) {
416 exit_status = ret;
417 goto cleanup;
418 }
419 }
420 else {
421 if( opal_cr_timing_barrier_enabled ) {
422 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
423 }
424 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
425 if( opal_cr_timing_barrier_enabled ) {
426 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
427 }
428 OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
429 }
430
431 cleanup:
432 return exit_status;
433 #else
434 return OMPI_SUCCESS;
435 #endif
436 }
437
438 /*************
439 * Post Lower Layer
440 *************/
ompi_cr_coord_post_ckpt(void)441 static int ompi_cr_coord_post_ckpt(void) {
442 /*
443 * Now that ORTE/OPAL are shutdown, we really can't do much
444 * so assume pre_ckpt took care of everything.
445 */
446 opal_output_verbose(10, ompi_cr_output,
447 "ompi_cr: coord_post_ckpt: ompi_cr_coord_post_ckpt()");
448
449 return OMPI_SUCCESS;
450 }
451
ompi_cr_coord_post_restart(void)452 static int ompi_cr_coord_post_restart(void) {
453 int ret, exit_status = OMPI_SUCCESS;
454
455 opal_output_verbose(10, ompi_cr_output,
456 "ompi_cr: coord_post_restart: ompi_cr_coord_post_restart()");
457
458 /*
459 * Notify PML
460 * - Will notify BML and BTL's
461 */
462 if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_RESTART))) {
463 exit_status = ret;
464 goto cleanup;
465 }
466
467 /*
468 * Notify Collectives
469 * - Need to do this on a per communicator basis
470 * Traverse all communicators...
471 */
472 if (OMPI_SUCCESS != (ret = notify_collectives(OPAL_CRS_RESTART))) {
473 goto cleanup;
474 }
475
476 cleanup:
477
478 return exit_status;
479 }
480
ompi_cr_coord_post_continue(void)481 static int ompi_cr_coord_post_continue(void) {
482 int ret, exit_status = OMPI_SUCCESS;
483
484 opal_output_verbose(10, ompi_cr_output,
485 "ompi_cr: coord_post_continue: ompi_cr_coord_post_continue()");
486
487 /*
488 * Notify PML
489 * - Will notify BML and BTL's
490 */
491 if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) {
492 exit_status = ret;
493 goto cleanup;
494 }
495
496 /*
497 * Notify Collectives
498 * - Need to do this on a per communicator basis
499 * Traverse all communicators...
500 */
501 if (OMPI_SUCCESS != (ret = notify_collectives(OPAL_CRS_CONTINUE))) {
502 goto cleanup;
503 }
504
505 cleanup:
506
507 return exit_status;
508 }
509