1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
2 /*
3  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2017 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2012      The University of Wisconsin-La Crosse. All rights
14  *                         reserved.
15  * $COPYRIGHT$
16  *
17  * Additional copyrights may follow
18  *
19  * $HEADER$
20  */
21 
22 /** @file
23  *
24  * OMPI Layer Checkpoint/Restart Runtime functions
25  *
26  */
27 
28 #include "ompi_config.h"
29 
30 #include <errno.h>
31 #ifdef HAVE_UNISTD_H
32 #include <unistd.h>
33 #endif  /* HAVE_UNISTD_H */
34 #ifdef HAVE_FCNTL_H
35 #include <fcntl.h>
36 #endif  /* HAVE_FCNTL_H */
37 #ifdef HAVE_SYS_TYPES_H
38 #include <sys/types.h>
39 #endif  /* HAVE_SYS_TYPES_H */
40 #ifdef HAVE_SYS_STAT_H
41 #include <sys/stat.h>  /* for mkfifo */
42 #endif  /* HAVE_SYS_STAT_H */
43 
44 #include "opal/mca/event/event.h"
45 #include "opal/util/output.h"
46 #include "opal/mca/crs/crs.h"
47 #include "opal/mca/crs/base/base.h"
48 #include "opal/mca/installdirs/installdirs.h"
49 #include "opal/runtime/opal_cr.h"
50 #include "opal/mca/btl/base/base.h"
51 
52 #if OPAL_ENABLE_FT_CR == 1
53 #include "orte/mca/snapc/snapc.h"
54 #include "orte/mca/snapc/base/base.h"
55 #endif
56 
57 #include "ompi/constants.h"
58 #include "ompi/mca/pml/pml.h"
59 #include "ompi/mca/pml/base/base.h"
60 #include "ompi/mca/crcp/crcp.h"
61 #include "ompi/mca/crcp/base/base.h"
62 #include "ompi/communicator/communicator.h"
63 #include "ompi/runtime/ompi_cr.h"
64 #if OPAL_ENABLE_CRDEBUG == 1
65 #include "ompi/debuggers/debuggers.h"
66 #endif
67 
#if OPAL_ENABLE_CRDEBUG == 1
/*
 * Symbols defined only in C/R-enabled debugging builds.
 * ompi_cr_init() assigns them when MPIR_debug_with_checkpoint is set.
 */

/* Set to 1 by ompi_cr_init() to mark the process as debuggable with C/R */
OMPI_DECLSPEC int   MPIR_checkpointable             = 0;
/* Host name extracted from the HNP contact URI (or "localhost") */
OMPI_DECLSPEC char *MPIR_controller_hostname        = NULL;
/* Command lines built from the installation's bindir */
OMPI_DECLSPEC char *MPIR_checkpoint_command         = NULL;
OMPI_DECLSPEC char *MPIR_restart_command            = NULL;
OMPI_DECLSPEC char *MPIR_checkpoint_listing_command = NULL;
#endif
75 
76 /*************
77  * Local functions
78  *************/
/* Hooks ompi_cr_coord() runs BEFORE handing the state to the previous callback */
static int ompi_cr_coord_pre_ckpt(void);
static int ompi_cr_coord_pre_continue(void);
static int ompi_cr_coord_pre_restart(void);

/* Hooks ompi_cr_coord() runs AFTER the previous callback has returned */
static int ompi_cr_coord_post_ckpt(void);
static int ompi_cr_coord_post_continue(void);
static int ompi_cr_coord_post_restart(void);
86 
87 /*************
88  * Local vars
89  *************/
90 static opal_cr_coord_callback_fn_t  prev_coord_callback = NULL;
91 
92 int ompi_cr_output = -1;
93 int ompi_cr_verbosity = 0;
94 
/* Number of slots in the table SIGNAL() uses to remember which collective
 * modules have already been notified. */
#define NUM_COLLECTIVES 16

/*
 * SIGNAL(comm, modules, highest_module, msg, ret, func)
 *
 * Deliver the fault-tolerance event `msg` to the module implementing
 * collective `func` on communicator `comm`, unless that module is already
 * recorded in modules[0 .. highest_module-1].
 *
 *   comm            communicator whose c_coll->coll_<func>_module is examined
 *   modules         array of NUM_COLLECTIVES module pointers (already-seen set)
 *   highest_module  int lvalue: number of used entries in `modules`;
 *                   incremented when a new module is recorded
 *   msg             value handed to the module's ft_event() hook
 *   ret             int lvalue receiving the ft_event() result
 *   func            collective name used to build the coll_<func>_module field
 *
 * A module without an ft_event hook is still recorded so it is not examined
 * again.
 *
 * WARNING: this macro expands to code that `return`s from the ENCLOSING
 * function, either with the non-success value produced by ft_event() or with
 * OMPI_ERR_OUT_OF_RESOURCE when the table is already full (instead of
 * writing past the end of `modules`).
 *
 * Locals carry a sig_ prefix so they cannot shadow the caller's variables
 * that are passed in as arguments.
 */
#define SIGNAL(comm, modules, highest_module, msg, ret, func)           \
    do {                                                                \
        mca_coll_base_module_t *sig_module =                            \
            (comm)->c_coll->coll_ ## func ## _module;                   \
        if (NULL != sig_module) {                                       \
            bool sig_seen = false;                                      \
            int sig_k;                                                  \
            for (sig_k = 0 ; sig_k < (highest_module) ; ++sig_k) {      \
                if (sig_module == (modules)[sig_k]) {                   \
                    sig_seen = true;                                    \
                    break;                                              \
                }                                                       \
            }                                                           \
            if (!sig_seen) {                                            \
                if ((highest_module) >= NUM_COLLECTIVES) {              \
                    (ret) = OMPI_ERR_OUT_OF_RESOURCE;                   \
                    return (ret);                                       \
                }                                                       \
                (modules)[(highest_module)++] = sig_module;             \
                if (NULL != sig_module->ft_event) {                     \
                    (ret) = sig_module->ft_event(msg);                  \
                    if (OMPI_SUCCESS != (ret)) {                        \
                        return (ret);                                   \
                    }                                                   \
                }                                                       \
            }                                                           \
        }                                                               \
    } while (0)
118 
119 
120 static int
notify_collectives(int msg)121 notify_collectives(int msg)
122 {
123     mca_coll_base_module_t *modules[NUM_COLLECTIVES];
124     int i, max, ret, highest_module = 0;
125 
126     memset(&modules, 0, sizeof(mca_coll_base_module_t*) * NUM_COLLECTIVES);
127 
128     max = opal_pointer_array_get_size(&ompi_mpi_communicators);
129     for (i = 0 ; i < max ; ++i) {
130         ompi_communicator_t *comm =
131             (ompi_communicator_t *)opal_pointer_array_get_item(&ompi_mpi_communicators, i);
132         if (NULL == comm) continue;
133 
134         SIGNAL(comm, modules, highest_module, msg, ret, allgather);
135         SIGNAL(comm, modules, highest_module, msg, ret, allgatherv);
136         SIGNAL(comm, modules, highest_module, msg, ret, allreduce);
137         SIGNAL(comm, modules, highest_module, msg, ret, alltoall);
138         SIGNAL(comm, modules, highest_module, msg, ret, alltoallv);
139         SIGNAL(comm, modules, highest_module, msg, ret, alltoallw);
140         SIGNAL(comm, modules, highest_module, msg, ret, barrier);
141         SIGNAL(comm, modules, highest_module, msg, ret, bcast);
142         SIGNAL(comm, modules, highest_module, msg, ret, exscan);
143         SIGNAL(comm, modules, highest_module, msg, ret, gather);
144         SIGNAL(comm, modules, highest_module, msg, ret, gatherv);
145         SIGNAL(comm, modules, highest_module, msg, ret, reduce);
146         SIGNAL(comm, modules, highest_module, msg, ret, reduce_scatter);
147         SIGNAL(comm, modules, highest_module, msg, ret, scan);
148         SIGNAL(comm, modules, highest_module, msg, ret, scatter);
149         SIGNAL(comm, modules, highest_module, msg, ret, scatterv);
150     }
151 
152     return OMPI_SUCCESS;
153 }
154 
155 
156 /*
157  * CR Init
158  */
ompi_cr_init(void)159 int ompi_cr_init(void)
160 {
161     /*
162      * Register some MCA variables
163      */
164     ompi_cr_verbosity = 0;
165     (void) mca_base_var_register("ompi", "ompi", "cr", "verbose",
166                                  "Verbose output for the OMPI Checkpoint/Restart functionality",
167                                  MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
168                                  OPAL_INFO_LVL_9,
169                                  MCA_BASE_VAR_SCOPE_READONLY,
170                                  &ompi_cr_verbosity);
171     if(0 != ompi_cr_verbosity) {
172         ompi_cr_output = opal_output_open(NULL);
173         opal_output_set_verbosity(ompi_cr_output, ompi_cr_verbosity);
174     } else {
175         ompi_cr_output = opal_cr_output;
176     }
177 
178     opal_output_verbose(10, ompi_cr_output,
179                         "ompi_cr: init: ompi_cr_init()");
180 
181     /* Register the OMPI interlevel coordination callback */
182     opal_cr_reg_coord_callback(ompi_cr_coord, &prev_coord_callback);
183 
184 #if OPAL_ENABLE_CRDEBUG == 1
185     /* Check for C/R enabled debugging */
186     if( MPIR_debug_with_checkpoint ) {
187         char *uri = NULL;
188         char *sep = NULL;
189         char *hostname = NULL;
190 
191         /* Mark as debuggable with C/R */
192         MPIR_checkpointable = 1;
193 
194         /* Set the checkpoint and restart commands */
195         /* Add the full path to the binary */
196         asprintf(&MPIR_checkpoint_command,
197                  "%s/ompi-checkpoint --crdebug --hnp-jobid %u",
198                  opal_install_dirs.bindir,
199                  ORTE_PROC_MY_HNP->jobid);
200         asprintf(&MPIR_restart_command,
201                  "%s/ompi-restart --crdebug ",
202                  opal_install_dirs.bindir);
203         asprintf(&MPIR_checkpoint_listing_command,
204                  "%s/ompi-checkpoint -l --crdebug ",
205                  opal_install_dirs.bindir);
206 
207         /* Set contact information for HNP */
208         uri = strdup(ompi_process_info.my_hnp_uri);
209         hostname = strchr(uri, ';') + 1;
210         sep = strchr(hostname, ';');
211         if (sep) {
212             *sep = 0;
213         }
214         if (strncmp(hostname, "tcp://", 6) == 0) {
215             hostname += 6;
216             sep = strchr(hostname, ':');
217             *sep = 0;
218             MPIR_controller_hostname = strdup(hostname);
219         } else {
220             MPIR_controller_hostname = strdup("localhost");
221         }
222 
223         /* Cleanup */
224         if( NULL != uri ) {
225             free(uri);
226             uri = NULL;
227         }
228     }
229 #endif
230 
231     return OMPI_SUCCESS;
232 }
233 
234 /*
235  * Finalize
236  */
ompi_cr_finalize(void)237 int ompi_cr_finalize(void)
238 {
239     opal_output_verbose(10, ompi_cr_output,
240                         "ompi_cr: finalize: ompi_cr_finalize()");
241 
242     return OMPI_SUCCESS;
243 }
244 
245 /*
246  * Interlayer coordination callback
247  */
ompi_cr_coord(int state)248 int ompi_cr_coord(int state)
249 {
250     int ret, exit_status = OMPI_SUCCESS;
251 
252     opal_output_verbose(10, ompi_cr_output,
253                         "ompi_cr: coord: ompi_cr_coord(%s)\n",
254                         opal_crs_base_state_str((opal_crs_state_type_t)state));
255 
256     /*
257      * Before calling the previous callback, we have the opportunity to
258      * take action given the state.
259      */
260     if(OPAL_CRS_CHECKPOINT == state) {
261         /* Do Checkpoint Phase work */
262         ret = ompi_cr_coord_pre_ckpt();
263         if( ret == OMPI_EXISTS) {
264             return ret;
265         }
266         else if( ret != OMPI_SUCCESS) {
267             return ret;
268         }
269     }
270     else if (OPAL_CRS_CONTINUE == state ) {
271         /* Do Continue Phase work */
272         ompi_cr_coord_pre_continue();
273     }
274     else if (OPAL_CRS_RESTART == state ) {
275         /* Do Restart Phase work */
276         ompi_cr_coord_pre_restart();
277     }
278     else if (OPAL_CRS_TERM == state ) {
279         /* Do Continue Phase work in prep to terminate the application */
280     }
281     else {
282         /* We must have been in an error state from the checkpoint
283          * recreate everything, as in the Continue Phase
284          */
285     }
286 
287     /*
288      * Call the previous callback, which should be ORTE [which will handle OPAL]
289      */
290     if(OMPI_SUCCESS != (ret = prev_coord_callback(state)) ) {
291         exit_status = ret;
292         goto cleanup;
293     }
294 
295 
296     /*
297      * After calling the previous callback, we have the opportunity to
298      * take action given the state to tidy up.
299      */
300     if(OPAL_CRS_CHECKPOINT == state) {
301         /* Do Checkpoint Phase work */
302         ompi_cr_coord_post_ckpt();
303     }
304     else if (OPAL_CRS_CONTINUE == state ) {
305         /* Do Continue Phase work */
306         ompi_cr_coord_post_continue();
307 
308 #if OPAL_ENABLE_CRDEBUG == 1
309         /*
310          * If C/R enabled debugging,
311          * wait here for debugger to attach
312          */
313         if( MPIR_debug_with_checkpoint ) {
314             MPIR_checkpoint_debugger_breakpoint();
315         }
316 #endif
317     }
318     else if (OPAL_CRS_RESTART == state ) {
319         /* Do Restart Phase work */
320         ompi_cr_coord_post_restart();
321 
322 #if OPAL_ENABLE_CRDEBUG == 1
323         /*
324          * If C/R enabled debugging,
325          * wait here for debugger to attach
326          */
327         if( MPIR_debug_with_checkpoint ) {
328             MPIR_checkpoint_debugger_breakpoint();
329         }
330 #endif
331     }
332     else if (OPAL_CRS_TERM == state ) {
333         /* Do Continue Phase work in prep to terminate the application */
334     }
335     else {
336         /* We must have been in an error state from the checkpoint
337          * recreate everything, as in the Continue Phase
338          */
339     }
340 
341  cleanup:
342     return exit_status;
343 }
344 
345 /*************
346  * Pre Lower Layer
347  *************/
ompi_cr_coord_pre_ckpt(void)348 static int ompi_cr_coord_pre_ckpt(void) {
349     int ret, exit_status = OMPI_SUCCESS;
350 
351     /*
352      * All the checkpoint heavey lifting in here...
353      */
354     opal_output_verbose(10, ompi_cr_output,
355                         "ompi_cr: coord_pre_ckpt: ompi_cr_coord_pre_ckpt()\n");
356 
357     /*
358      * Notify Collectives
359      * - Need to do this on a per communicator basis
360      *   Traverse all communicators...
361      */
362     if (OMPI_SUCCESS != (ret = notify_collectives(OPAL_CR_CHECKPOINT))) {
363         goto cleanup;
364     }
365 
366     /*
367      * Notify PML
368      *  - Will notify BML and BTL's
369      */
370     if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CHECKPOINT))) {
371         exit_status = ret;
372         goto cleanup;
373     }
374 
375  cleanup:
376 
377     return exit_status;
378 }
379 
ompi_cr_coord_pre_restart(void)380 static int ompi_cr_coord_pre_restart(void) {
381     int ret, exit_status = OMPI_SUCCESS;
382 
383     opal_output_verbose(10, ompi_cr_output,
384                         "ompi_cr: coord_pre_restart: ompi_cr_coord_pre_restart()");
385 
386     /*
387      * Notify PML
388      *  - Will notify BML and BTL's
389      *  - The intention here is to have the PML shutdown all the old components
390      *    and handles. On the second pass (once ORTE is restarted) we can
391      *    reconnect processes.
392      */
393     if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_RESTART_PRE))) {
394         exit_status = ret;
395         goto cleanup;
396     }
397 
398  cleanup:
399     return exit_status;
400 }
401 
ompi_cr_coord_pre_continue(void)402 static int ompi_cr_coord_pre_continue(void) {
403 #if OPAL_ENABLE_FT_CR == 1
404     int ret, exit_status = OMPI_SUCCESS;
405 
406     /*
407      * Can not really do much until ORTE is up and running,
408      * so defer action until the post_continue function.
409      */
410     opal_output_verbose(10, ompi_cr_output,
411                         "ompi_cr: coord_pre_continue: ompi_cr_coord_pre_continue()");
412 
413     if (opal_cr_continue_like_restart) {
414         /* Mimic ompi_cr_coord_pre_restart(); */
415         if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) {
416             exit_status = ret;
417             goto cleanup;
418         }
419     }
420     else {
421         if( opal_cr_timing_barrier_enabled ) {
422             OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
423         }
424         OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
425         if( opal_cr_timing_barrier_enabled ) {
426             OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
427         }
428         OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
429     }
430 
431  cleanup:
432     return exit_status;
433 #else
434     return OMPI_SUCCESS;
435 #endif
436 }
437 
438 /*************
439  * Post Lower Layer
440  *************/
ompi_cr_coord_post_ckpt(void)441 static int ompi_cr_coord_post_ckpt(void) {
442     /*
443      * Now that ORTE/OPAL are shutdown, we really can't do much
444      * so assume pre_ckpt took care of everything.
445      */
446     opal_output_verbose(10, ompi_cr_output,
447                         "ompi_cr: coord_post_ckpt: ompi_cr_coord_post_ckpt()");
448 
449     return OMPI_SUCCESS;
450 }
451 
ompi_cr_coord_post_restart(void)452 static int ompi_cr_coord_post_restart(void) {
453     int ret, exit_status = OMPI_SUCCESS;
454 
455     opal_output_verbose(10, ompi_cr_output,
456                         "ompi_cr: coord_post_restart: ompi_cr_coord_post_restart()");
457 
458     /*
459      * Notify PML
460      *  - Will notify BML and BTL's
461      */
462     if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_RESTART))) {
463         exit_status = ret;
464         goto cleanup;
465     }
466 
467     /*
468      * Notify Collectives
469      * - Need to do this on a per communicator basis
470      *   Traverse all communicators...
471      */
472     if (OMPI_SUCCESS != (ret = notify_collectives(OPAL_CRS_RESTART))) {
473         goto cleanup;
474     }
475 
476  cleanup:
477 
478     return exit_status;
479 }
480 
ompi_cr_coord_post_continue(void)481 static int ompi_cr_coord_post_continue(void) {
482     int ret, exit_status = OMPI_SUCCESS;
483 
484     opal_output_verbose(10, ompi_cr_output,
485                         "ompi_cr: coord_post_continue: ompi_cr_coord_post_continue()");
486 
487     /*
488      * Notify PML
489      *  - Will notify BML and BTL's
490      */
491     if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) {
492         exit_status = ret;
493         goto cleanup;
494     }
495 
496     /*
497      * Notify Collectives
498      * - Need to do this on a per communicator basis
499      *   Traverse all communicators...
500      */
501     if (OMPI_SUCCESS != (ret = notify_collectives(OPAL_CRS_CONTINUE))) {
502         goto cleanup;
503     }
504 
505  cleanup:
506 
507     return exit_status;
508 }
509