1 /*
2  * Copyright (c) 2004-2010 The Trustees of Indiana University.
3  *                         All rights reserved.
4  * Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
5  *                         All rights reserved.
6  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
7  *                         University of Stuttgart.  All rights reserved.
8  * Copyright (c) 2004-2005 The Regents of the University of California.
9  *                         All rights reserved.
10  * Copyright (c) 2007      Evergrid, Inc. All rights reserved.
11  * Copyright (c) 2013      Cisco Systems, Inc.  All rights reserved.
12  * Copyright (c) 2014-2016 Intel, Inc.  All rights reserved.
13  * $COPYRIGHT$
14  *
15  * Additional copyrights may follow
16  *
17  * $HEADER$
18  */
19 
20 #include "orte_config.h"
21 
22 #include <string.h>
23 #ifdef HAVE_SYS_TYPES_H
24 #include <sys/types.h>
25 #endif  /* HAVE_SYS_TYPES_H */
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif  /* HAVE_UNISTD_H */
29 #ifdef HAVE_SYS_TYPES_H
30 #include <sys/types.h>
31 #endif /* HAVE_SYS_TYPES_H */
32 #ifdef HAVE_SYS_STAT_H
33 #include <sys/stat.h>
34 #endif /* HAVE_SYS_STAT_H */
35 #ifdef HAVE_DIRENT_H
36 #include <dirent.h>
37 #endif /* HAVE_DIRENT_H */
38 #include <time.h>
39 
40 #include "orte/mca/mca.h"
41 #include "opal/mca/base/base.h"
42 
43 #include "opal/util/os_dirpath.h"
44 #include "opal/util/output.h"
45 #include "opal/util/show_help.h"
46 #include "opal/util/basename.h"
47 #include "opal/util/argv.h"
48 #include "opal/mca/crs/crs.h"
49 #include "opal/mca/crs/base/base.h"
50 #include "opal/dss/dss.h"
51 
52 #include "orte/mca/rml/rml.h"
53 #include "orte/mca/rml/rml_types.h"
54 #include "orte/mca/errmgr/errmgr.h"
55 #include "orte/runtime/orte_globals.h"
56 #include "orte/util/name_fns.h"
57 
58 #include "orte/mca/sstore/sstore.h"
59 #include "orte/mca/sstore/base/base.h"
60 
61 #include "orte/mca/snapc/snapc.h"
62 #include "orte/mca/snapc/base/base.h"
63 
/******************
 * Global Variables
 ******************/
67 size_t orte_snapc_base_snapshot_seq_number = 0;
68 
69 /******************
70  * Object stuff
71  ******************/
72 OBJ_CLASS_INSTANCE(orte_snapc_base_local_snapshot_t,
73                    opal_list_item_t,
74                    orte_snapc_base_local_snapshot_construct,
75                    orte_snapc_base_local_snapshot_destruct);
76 
orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t * snapshot)77 void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *snapshot)
78 {
79     snapshot->process_name.jobid  = 0;
80     snapshot->process_name.vpid   = 0;
81 
82     snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
83 
84     snapshot->ss_handle  = ORTE_SSTORE_HANDLE_INVALID;
85 }
86 
/**
 * Destructor for a local snapshot object.
 *
 * The object holds no allocated members in this function's view, so the
 * destructor only puts each field back to the value the constructor uses.
 *
 * @param snapshot  Object being destroyed
 */
void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *snapshot)
{
    snapshot->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
    snapshot->state     = ORTE_SNAPC_CKPT_STATE_NONE;

    snapshot->process_name.vpid  = 0;
    snapshot->process_name.jobid = 0;
}
96 
97 /****/
98 OBJ_CLASS_INSTANCE(orte_snapc_base_global_snapshot_t,
99                    opal_list_item_t,
100                    orte_snapc_base_global_snapshot_construct,
101                    orte_snapc_base_global_snapshot_destruct);
102 
orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t * snapshot)103 void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *snapshot)
104 {
105     OBJ_CONSTRUCT(&(snapshot->local_snapshots), opal_list_t);
106 
107     snapshot->options = OBJ_NEW(opal_crs_base_ckpt_options_t);
108 
109     snapshot->ss_handle  = ORTE_SSTORE_HANDLE_INVALID;
110 }
111 
/**
 * Destructor for a global snapshot object.
 *
 * Releases every entry still on the local snapshot list, destructs the
 * list, drops the reference on the options object (if any) and marks the
 * sstore handle invalid.
 *
 * @param snapshot  Object being destroyed
 */
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *snapshot)
{
    opal_list_item_t *entry;

    /* Drain the list, dropping one reference per removed entry */
    for (entry = opal_list_remove_first(&snapshot->local_snapshots);
         NULL != entry;
         entry = opal_list_remove_first(&snapshot->local_snapshots)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&(snapshot->local_snapshots));

    if (NULL != snapshot->options) {
        OBJ_RELEASE(snapshot->options);
        snapshot->options = NULL;
    }

    snapshot->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
}
128 
129 OBJ_CLASS_INSTANCE(orte_snapc_base_quiesce_t,
130                    opal_object_t,
131                    orte_snapc_base_quiesce_construct,
132                    orte_snapc_base_quiesce_destruct);
133 
orte_snapc_base_quiesce_construct(orte_snapc_base_quiesce_t * quiesce)134 void orte_snapc_base_quiesce_construct(orte_snapc_base_quiesce_t *quiesce)
135 {
136     quiesce->epoch         = -1;
137     quiesce->snapshot      = NULL;
138     quiesce->ss_handle     = ORTE_SSTORE_HANDLE_INVALID;
139     quiesce->ss_snapshot   = NULL;
140     quiesce->handle        = NULL;
141     quiesce->target_dir    = NULL;
142     quiesce->crs_name      = NULL;
143     quiesce->cmdline       = NULL;
144     quiesce->cr_state      = OPAL_CRS_NONE;
145     quiesce->checkpointing = false;
146     quiesce->restarting    = false;
147 
148     quiesce->migrating     = false;
149     quiesce->num_migrating = 0;
150     OBJ_CONSTRUCT(&(quiesce->migrating_procs), opal_pointer_array_t);
151     opal_pointer_array_init(&(quiesce->migrating_procs), 8, INT32_MAX, 8);
152 }
153 
/**
 * Destructor for a quiesce object.
 *
 * Drops the references held on the snapshot objects, frees the string
 * members, releases every non-NULL entry of the migrating procs array and
 * returns the scalar members to the values the constructor assigns.
 *
 * @param quiesce  Object being destroyed
 */
void orte_snapc_base_quiesce_destruct( orte_snapc_base_quiesce_t *quiesce)
{
    int idx;
    void *proc = NULL;

    quiesce->epoch     = -1;
    quiesce->ss_handle = ORTE_SSTORE_HANDLE_INVALID;

    /* Referenced objects: OBJ_RELEASE must not be handed a NULL pointer */
    if (NULL != quiesce->snapshot) {
        OBJ_RELEASE(quiesce->snapshot);
        quiesce->snapshot = NULL;
    }
    if (NULL != quiesce->ss_snapshot) {
        OBJ_RELEASE(quiesce->ss_snapshot);
        quiesce->ss_snapshot = NULL;
    }

    /* Strings: free() accepts NULL, so no guard is required */
    free(quiesce->handle);
    quiesce->handle = NULL;

    free(quiesce->target_dir);
    quiesce->target_dir = NULL;

    free(quiesce->crs_name);
    quiesce->crs_name = NULL;

    free(quiesce->cmdline);
    quiesce->cmdline = NULL;

    quiesce->cr_state      = OPAL_CRS_NONE;
    quiesce->checkpointing = false;
    quiesce->restarting    = false;

    /* Migration bookkeeping: release whatever is still stored in the array */
    quiesce->migrating     = false;
    quiesce->num_migrating = 0;
    for (idx = 0; idx < quiesce->migrating_procs.size; ++idx) {
        proc = opal_pointer_array_get_item(&(quiesce->migrating_procs), idx);
        if (NULL == proc) {
            continue;
        }
        OBJ_RELEASE(proc);
    }
    OBJ_DESTRUCT(&(quiesce->migrating_procs));
}
203 
204 OBJ_CLASS_INSTANCE(orte_snapc_base_request_op_t,
205                    opal_object_t,
206                    orte_snapc_base_request_op_construct,
207                    orte_snapc_base_request_op_destruct);
208 
orte_snapc_base_request_op_construct(orte_snapc_base_request_op_t * op)209 void orte_snapc_base_request_op_construct(orte_snapc_base_request_op_t *op)
210 {
211     op->event     = ORTE_SNAPC_OP_NONE;
212     op->is_active = false;
213     op->leader    = -1;
214 
215     op->seq_num       = -1;
216     op->global_handle = NULL;
217     op->ss_handle     = ORTE_SSTORE_HANDLE_INVALID;
218 
219     op->mig_num       = -1;
220     op->mig_vpids     = NULL;
221     /*op->mig_host_pref = NULL;*/
222     op->mig_vpid_pref = NULL;
223     op->mig_off_node  = NULL;
224 }
225 
/**
 * Destructor for a request operation object.
 *
 * Frees the global handle string and resets the scalar members to the
 * values the constructor assigns.
 *
 * @param op  Object being destroyed
 */
void orte_snapc_base_request_op_destruct( orte_snapc_base_request_op_t *op)
{
    op->event     = ORTE_SNAPC_OP_NONE;
    op->is_active = false;
    op->leader    = -1;

    op->seq_num = -1;

    /* free() accepts NULL, so no guard is required */
    free(op->global_handle);
    op->global_handle = NULL;

    op->ss_handle = ORTE_SSTORE_HANDLE_INVALID;

    op->mig_num = -1;

    /*
     * The migration arrays (mig_vpids, mig_host_pref, mig_vpid_pref and
     * mig_off_node) are neither freed nor reset here; the code that did so
     * was disabled in this file.
     * NOTE(review): confirm who owns these arrays before re-enabling it.
     */
}
263 
264 
265 /***********************
266  * None component stuff
267  ************************/
orte_snapc_base_none_open(void)268 int orte_snapc_base_none_open(void)
269 {
270     return ORTE_SUCCESS;
271 }
272 
orte_snapc_base_none_close(void)273 int orte_snapc_base_none_close(void)
274 {
275     return ORTE_SUCCESS;
276 }
277 
/**
 * Query hook of the "none" component.
 *
 * Reports a NULL module together with priority 0.
 *
 * @param module    [out] Set to NULL
 * @param priority  [out] Set to 0
 *
 * @return OPAL_SUCCESS always
 */
int orte_snapc_base_none_query(mca_base_module_t **module, int *priority)
{
    *priority = 0;
    *module   = NULL;

    return OPAL_SUCCESS;
}
285 
orte_snapc_base_module_init(bool seed,bool app)286 int orte_snapc_base_module_init(bool seed, bool app)
287 {
288     return ORTE_SUCCESS;
289 }
290 
orte_snapc_base_module_finalize(void)291 int orte_snapc_base_module_finalize(void)
292 {
293     return ORTE_SUCCESS;
294 }
295 
296 /* None RML command line response callback */
297 static void snapc_none_global_cmdline_request(int status,
298                                               orte_process_name_t* sender,
299                                               opal_buffer_t *buffer,
300                                               orte_rml_tag_t tag,
301                                               void* cbdata);
orte_snapc_base_none_setup_job(orte_jobid_t jobid)302 int orte_snapc_base_none_setup_job(orte_jobid_t jobid)
303 {
304 
305     /*
306      * Coordinator command listener
307      */
308     orte_snapc_base_snapshot_seq_number = -1;
309     orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
310                             ORTE_RML_TAG_CKPT,
311                             ORTE_RML_PERSISTENT,
312                             snapc_none_global_cmdline_request,
313                             NULL);
314 
315     return ORTE_SUCCESS;
316 }
317 
/**
 * Job release hook of the "none" component.
 *
 * Currently performs no work. In particular, the checkpoint request
 * receive posted by orte_snapc_base_none_setup_job() is not cancelled
 * here, although removing that callback is the stated purpose of this
 * hook.
 *
 * @param jobid  Unused
 *
 * @return ORTE_SUCCESS always
 */
int orte_snapc_base_none_release_job(orte_jobid_t jobid)
{
    (void)jobid;

    return ORTE_SUCCESS;
}
326 
orte_snapc_base_none_ft_event(int state)327 int orte_snapc_base_none_ft_event(int state)
328 {
329     return ORTE_SUCCESS;
330 }
331 
/**
 * Start-of-checkpoint hook of the "none" component: performs no work.
 *
 * @param datum  Unused
 *
 * @return ORTE_SUCCESS always
 */
int orte_snapc_base_none_start_ckpt(orte_snapc_base_quiesce_t *datum)
{
    (void)datum;

    return ORTE_SUCCESS;
}
336 
/**
 * End-of-checkpoint hook of the "none" component: performs no work.
 *
 * @param datum  Unused
 *
 * @return ORTE_SUCCESS always
 */
int orte_snapc_base_none_end_ckpt(orte_snapc_base_quiesce_t *datum)
{
    (void)datum;

    return ORTE_SUCCESS;
}
341 
342 
343 /********************
344  * Local Functions
345  ********************/
346 /* None RML response callback */
snapc_none_global_cmdline_request(int status,orte_process_name_t * sender,opal_buffer_t * buffer,orte_rml_tag_t tag,void * cbdata)347 static void snapc_none_global_cmdline_request(int status,
348                                               orte_process_name_t* sender,
349                                               opal_buffer_t *buffer,
350                                               orte_rml_tag_t tag,
351                                               void* cbdata)
352 {
353     int ret;
354     orte_snapc_cmd_flag_t command;
355     orte_std_cntr_t n = 1;
356     opal_crs_base_ckpt_options_t *options = NULL;
357     orte_jobid_t jobid;
358 
359     options = OBJ_NEW(opal_crs_base_ckpt_options_t);
360 
361     n = 1;
362     if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_SNAPC_CMD))) {
363         ORTE_ERROR_LOG(ret);
364         goto cleanup;
365     }
366 
367     /*
368      * orte_checkpoint has requested that a checkpoint be taken
369      * Respond that a checkpoint cannot be taken at this time
370      */
371     if (ORTE_SNAPC_GLOBAL_INIT_CMD == command) {
372         /*
373          * Do the basic handshake with the orte_checkpoint command
374          */
375         if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_init_cmd(sender, buffer, options, &jobid)) ) {
376             ORTE_ERROR_LOG(ret);
377             goto cleanup;
378         }
379 
380         /*
381          * Respond with an invalid response
382          */
383         if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, 0, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) {
384             ORTE_ERROR_LOG(ret);
385             goto cleanup;
386         }
387     }
388     /*
389      * Unknown command
390      */
391     else {
392         ORTE_ERROR_LOG(ret);
393         goto cleanup;
394     }
395 
396  cleanup:
397     if( NULL != options ) {
398         OBJ_RELEASE(options);
399         options = NULL;
400     }
401 
402     return;
403 }
404 
405 /********************
406  * Utility functions
407  ********************/
408 
409 /* Report the checkpoint status */
orte_snapc_ckpt_state_notify(int state)410 void orte_snapc_ckpt_state_notify(int state)
411 {
412     switch(state) {
413     case ORTE_SNAPC_CKPT_STATE_ESTABLISHED:
414         opal_output(0, "%d: Checkpoint established for process %s.",
415                     orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
416         break;
417     case ORTE_SNAPC_CKPT_STATE_NO_CKPT:
418         opal_output(0, "%d: Process %s is not checkpointable.",
419                     orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
420         break;
421     case ORTE_SNAPC_CKPT_STATE_ERROR:
422         opal_output(0, "%d: Failed to checkpoint process %s.",
423                     orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
424         break;
425     case ORTE_SNAPC_CKPT_STATE_RECOVERED:
426         opal_output(0, "%d: Successfully restarted process %s.",
427                     orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
428         break;
429     case ORTE_SNAPC_CKPT_STATE_NO_RESTART:
430         opal_output(0, "%d: Failed to restart process %s.",
431                     orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
432         break;
433     /* ADK: We currently do not notify for these states, but good to
434      * have them around anyways. */
435     case ORTE_SNAPC_CKPT_STATE_NONE:
436     case ORTE_SNAPC_CKPT_STATE_REQUEST:
437     case ORTE_SNAPC_CKPT_STATE_PENDING:
438     case ORTE_SNAPC_CKPT_STATE_RUNNING:
439     case ORTE_SNAPC_CKPT_STATE_STOPPED:
440     case ORTE_SNAPC_CKPT_STATE_MIGRATING:
441     case ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL:
442     default:
443         break;
444     }
445 }
446 
/**
 * Receive the remainder of a command line checkpoint request.
 *
 * The command itself has already been unpacked by the caller; this
 * routine unpacks what follows it in the buffer: the checkpoint options
 * and then the jobid. Nothing is unpacked when the peer is this HNP.
 *
 * @param peer     Process that sent the request
 * @param buffer   Buffer positioned just after the command
 * @param options  [out] Filled with the unpacked checkpoint options
 * @param jobid    [out] Filled with the unpacked jobid
 *
 * @return ORTE_SUCCESS on success (including the "peer is self" case),
 *         otherwise the error code of the failing unpack
 */
int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
                                               opal_buffer_t* buffer,
                                               opal_crs_base_ckpt_options_t *options,
                                               orte_jobid_t *jobid)
{
    int rc;
    orte_std_cntr_t num_vals;

    /*
     * Do not send to self, as that is silly.
     */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_HNP)) {
        OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                             "%s) base:ckpt_init_cmd: Error: Do not send to self!\n",
                             ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                         "%s) base:ckpt_init_cmd: Receiving commands\n",
                         ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));

    /* Payload, part 1: the checkpoint options */
    rc = orte_snapc_base_unpack_options(buffer, options);
    if (ORTE_SUCCESS != rc) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "%s) base:ckpt_init_cmd: Error: Unpack (options) Failure (ret = %d)\n",
                    ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), rc );
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* Payload, part 2: the jobid */
    num_vals = 1;
    rc = opal_dss.unpack(buffer, jobid, &num_vals, ORTE_JOBID);
    if (ORTE_SUCCESS != rc) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "%s) base:ckpt_init_cmd: Error: DSS Unpack (jobid) Failure (ret = %d) (LINE = %d)\n",
                    ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                    rc, __LINE__);
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                         "%s) base:ckpt_init_cmd: Received [%d, %d, %s]\n",
                         ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                         (int)(options->term),
                         (int)(options->stop),
                         ORTE_JOBID_PRINT(*jobid)));

    return ORTE_SUCCESS;
}
509 
/**
 * Unpack the checkpoint options from a buffer.
 *
 * The flags are read as OPAL_BOOL values in this order: term, stop,
 * inc_prep_only, inc_recover_only and, when OPAL_ENABLE_CRDEBUG is 1,
 * attach_debugger and detach_debugger. Unpacking stops at the first
 * failure.
 *
 * @param buffer   Buffer to read from
 * @param options  [out] Options object receiving the flags
 *
 * @return ORTE_SUCCESS, or the error code of the first failing unpack
 */
int orte_snapc_base_unpack_options(opal_buffer_t* buffer,
                                   opal_crs_base_ckpt_options_t *options)
{
    /* One entry per flag, in wire order, with the message to emit when
     * unpacking that flag fails. */
    const struct {
        void       *dest;
        const char *errfmt;
    } fields[] = {
        { &(options->term),
          "snapc:base:unpack_options: Error: Unpack (term) Failure (ret = %d)\n" },
        { &(options->stop),
          "snapc:base:unpack_options: Error: Unpack (stop) Failure (ret = %d)\n" },
        { &(options->inc_prep_only),
          "snapc:base:unpack_options: Error: Unpack (inc_prep_only) Failure (ret = %d)\n" },
        { &(options->inc_recover_only),
          "snapc:base:unpack_options: Error: Unpack (inc_recover_only) Failure (ret = %d)\n" },
#if OPAL_ENABLE_CRDEBUG == 1
        { &(options->attach_debugger),
          "snapc:base:unpack_options: Error: Unpack (attach_debugger) Failure (ret = %d)\n" },
        { &(options->detach_debugger),
          "snapc:base:unpack_options: Error: Unpack (detach_debugger) Failure (ret = %d)\n" },
#endif
    };
    size_t idx;

    for (idx = 0; idx < sizeof(fields) / sizeof(fields[0]); ++idx) {
        orte_std_cntr_t num_vals = 1;
        int rc = opal_dss.unpack(buffer, fields[idx].dest, &num_vals, OPAL_BOOL);

        if (ORTE_SUCCESS != rc) {
            opal_output(orte_snapc_base_framework.framework_output,
                        fields[idx].errfmt,
                        rc);
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    return ORTE_SUCCESS;
}
581 
/**
 * Pack the checkpoint options into a buffer.
 *
 * The flags are written as OPAL_BOOL values in this order: term, stop,
 * inc_prep_only, inc_recover_only and, when OPAL_ENABLE_CRDEBUG is 1,
 * attach_debugger and detach_debugger. Packing stops at the first
 * failure.
 *
 * @param buffer   Buffer to append to
 * @param options  Options object providing the flags
 *
 * @return ORTE_SUCCESS, or the error code of the first failing pack
 */
int orte_snapc_base_pack_options(opal_buffer_t* buffer,
                                 opal_crs_base_ckpt_options_t *options)
{
    /* The flags, in wire order */
    void *fields[] = {
        &(options->term),
        &(options->stop),
        &(options->inc_prep_only),
        &(options->inc_recover_only),
#if OPAL_ENABLE_CRDEBUG == 1
        &(options->attach_debugger),
        &(options->detach_debugger),
#endif
    };
    size_t idx;

    for (idx = 0; idx < sizeof(fields) / sizeof(fields[0]); ++idx) {
        int rc = opal_dss.pack(buffer, fields[idx], 1, OPAL_BOOL);

        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    return ORTE_SUCCESS;
}
628 
/**
 * Send a checkpoint status update to a peer (e.g. the command line tool).
 *
 * The message always carries the update command and the checkpoint
 * status. For the RECOVERED, ESTABLISHED, STOPPED and ERROR states it also
 * carries the global snapshot reference and the sequence number. Both are
 * looked up in the stable storage handle, except for the ERROR state,
 * where a NULL reference and sequence number -1 are sent.
 *
 * Nothing is sent when the peer is NULL, is ORTE_NAME_INVALID or is this
 * HNP; those cases return ORTE_SUCCESS.
 *
 * @param peer         Process to notify
 * @param ss_handle    Stable storage handle to read the snapshot
 *                     reference and sequence number from
 * @param ckpt_status  One of the ORTE_SNAPC_CKPT_STATE_* values
 *
 * @return ORTE_SUCCESS, ORTE_ERROR if the buffer cannot be allocated, or
 *         the error code of the failing pack/send operation
 */
int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
                                                 orte_sstore_base_handle_t ss_handle,
                                                 int ckpt_status)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_buffer_t *loc_buffer = NULL;
    orte_snapc_cmd_flag_t command = ORTE_SNAPC_GLOBAL_UPDATE_CMD;
    char *global_snapshot_handle = NULL;
    char *tmp_str = NULL;
    int seq_num = -1;

    /*
     * Noop if invalid peer, or peer not specified (JJH Double check this)
     */
    if( NULL == peer ||
        OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer) ) {
        /*return ORTE_ERR_BAD_PARAM;*/
        return ORTE_SUCCESS;
    }

    /*
     * Do not send to self, as that is silly.
     */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_HNP)) {
        OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                             "%s) base:ckpt_update_cmd: Error: Do not send to self!\n",
                             ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
        return ORTE_SUCCESS;
    }

    /*
     * Pass on the checkpoint state.
     */
    orte_snapc_ckpt_state_notify(ckpt_status);

    OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                         "%s) base:ckpt_update_cmd: Sending update command <status %d>\n",
                         ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                         ckpt_status));

    /********************
     * Send over the status of the checkpoint
     * - ckpt_state
     * - global snapshot handle (upon finish only)
     * - sequence number        (upon finish only)
     ********************/
    if (NULL == (loc_buffer = OBJ_NEW(opal_buffer_t))) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        OBJ_RELEASE(loc_buffer);
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &ckpt_status, 1, OPAL_INT))) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "%s) base:ckpt_update_cmd: Error: DSS Pack (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
                    ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                    ret, __LINE__);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        OBJ_RELEASE(loc_buffer);
        goto cleanup;
    }

    if( ORTE_SNAPC_CKPT_STATE_RECOVERED == ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ESTABLISHED  == ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_STOPPED   == ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ERROR     == ckpt_status ) {

        /*
         * In the checkpoint error case nothing is looked up: the
         * reference stays NULL and the sequence number stays -1.
         */
        if( ORTE_SNAPC_CKPT_STATE_ERROR != ckpt_status ) {
            if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
                                                            SSTORE_METADATA_GLOBAL_SNAP_REF,
                                                            &global_snapshot_handle)) ) {
                opal_output(orte_snapc_base_framework.framework_output,
                            "%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
                            ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
                ORTE_ERROR_LOG(ret);
                /* Do not exit here, continue so that we can inform the tool
                 * that the checkpoint has failed
                 */
            }

            if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
                                                            SSTORE_METADATA_GLOBAL_SNAP_SEQ,
                                                            &tmp_str)) ) {
                opal_output(orte_snapc_base_framework.framework_output,
                            "%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
                            ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
                ORTE_ERROR_LOG(ret);
                /* Do not exit here, continue so that we can inform the tool
                 * that the checkpoint has failed
                 */
            }

            /* Without a sequence string the number stays at -1 */
            if( NULL != tmp_str ) {
                seq_num = atoi(tmp_str);
            }
        }

        /* The reference is NULL in the error case and may be NULL after a
         * failed lookup; never hand a NULL pointer to a %s conversion. */
        OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                             "%s) base:ckpt_update_cmd: Sending update command <status %d> + <ref %s> <seq %d>\n",
                             ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                             ckpt_status,
                             (NULL == global_snapshot_handle) ? "(null)" : global_snapshot_handle,
                             seq_num));

        if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &global_snapshot_handle, 1, OPAL_STRING))) {
            opal_output(orte_snapc_base_framework.framework_output,
                        "%s) base:ckpt_update_cmd: Error: DSS Pack (snapshot handle) Failure (ret = %d) (LINE = %d)\n",
                        ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                        ret, __LINE__);
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            OBJ_RELEASE(loc_buffer);
            goto cleanup;
        }

        if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &seq_num, 1, OPAL_INT))) {
            opal_output(orte_snapc_base_framework.framework_output,
                        "%s) base:ckpt_update_cmd: Error: DSS Pack (seq number) Failure (ret = %d) (LINE = %d)\n",
                        ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                        ret, __LINE__);
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            OBJ_RELEASE(loc_buffer);
            goto cleanup;
        }
    }

    /*
     * The buffer is only released here when the send could not be posted;
     * otherwise it is presumably released by orte_rml_send_callback once
     * the send completes -- NOTE(review): confirm against the RML docs.
     */
    if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                           peer, loc_buffer,
                                           ORTE_RML_TAG_CKPT,
                                           orte_rml_send_callback, NULL))) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "%s) base:ckpt_update_cmd: Error: Send (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
                    ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                    ret, __LINE__);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        OBJ_RELEASE(loc_buffer);
        goto cleanup;
    }

 cleanup:
    /* free() accepts NULL, so no guards are required */
    free(global_snapshot_handle);
    global_snapshot_handle = NULL;

    free(tmp_str);
    tmp_str = NULL;

    return exit_status;
}
796 
797 /****************************
798  * Command line tool request functions
799  ****************************/
800 /* JJH TODO - Move the command line functions here ? */
801 
802 /*****************************
803  * Snapshot metadata functions
804  *****************************/
orte_snapc_ckpt_state_str(char ** state_str,int state)805 int orte_snapc_ckpt_state_str(char ** state_str, int state)
806 {
807     switch(state) {
808     case ORTE_SNAPC_CKPT_STATE_NONE:
809         *state_str = strdup(" -- ");
810         break;
811     case ORTE_SNAPC_CKPT_STATE_REQUEST:
812         *state_str = strdup("Requested");
813         break;
814     case ORTE_SNAPC_CKPT_STATE_PENDING:
815         *state_str = strdup("Pending");
816         break;
817     case ORTE_SNAPC_CKPT_STATE_RUNNING:
818         *state_str = strdup("Running");
819         break;
820     case ORTE_SNAPC_CKPT_STATE_STOPPED:
821         *state_str = strdup("Stopped");
822         break;
823     case ORTE_SNAPC_CKPT_STATE_MIGRATING:
824         *state_str = strdup("Migrating");
825         break;
826     case ORTE_SNAPC_CKPT_STATE_ESTABLISHED:
827         *state_str = strdup("Checkpoint Established");
828         break;
829     case ORTE_SNAPC_CKPT_STATE_RECOVERED:
830         *state_str = strdup("Continuing/Recovered");
831         break;
832     case ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL:
833         *state_str = strdup("Locally Finished");
834         break;
835     case ORTE_SNAPC_CKPT_STATE_ERROR:
836         *state_str = strdup("Error");
837         break;
838     default:
839         asprintf(state_str, "Unknown %d", state);
840         break;
841     }
842 
843     return ORTE_SUCCESS;
844 }
845