1 /*
2 * Copyright (c) 2004-2010 The Trustees of Indiana University.
3 * All rights reserved.
4 * Copyright (c) 2004-2011 The Trustees of the University of Tennessee.
5 * All rights reserved.
6 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
7 * University of Stuttgart. All rights reserved.
8 * Copyright (c) 2004-2005 The Regents of the University of California.
9 * All rights reserved.
10 * Copyright (c) 2007 Evergrid, Inc. All rights reserved.
11 * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
12 * Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
13 * $COPYRIGHT$
14 *
15 * Additional copyrights may follow
16 *
17 * $HEADER$
18 */
19
20 #include "orte_config.h"
21
22 #include <string.h>
23 #ifdef HAVE_SYS_TYPES_H
24 #include <sys/types.h>
25 #endif /* HAVE_SYS_TYPES_H */
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif /* HAVE_UNISTD_H */
29 #ifdef HAVE_SYS_TYPES_H
30 #include <sys/types.h>
31 #endif /* HAVE_SYS_TYPES_H */
32 #ifdef HAVE_SYS_STAT_H
33 #include <sys/stat.h>
34 #endif /* HAVE_SYS_STAT_H */
35 #ifdef HAVE_DIRENT_H
36 #include <dirent.h>
37 #endif /* HAVE_DIRENT_H */
38 #include <time.h>
39
40 #include "orte/mca/mca.h"
41 #include "opal/mca/base/base.h"
42
43 #include "opal/util/os_dirpath.h"
44 #include "opal/util/output.h"
45 #include "opal/util/show_help.h"
46 #include "opal/util/basename.h"
47 #include "opal/util/argv.h"
48 #include "opal/mca/crs/crs.h"
49 #include "opal/mca/crs/base/base.h"
50 #include "opal/dss/dss.h"
51
52 #include "orte/mca/rml/rml.h"
53 #include "orte/mca/rml/rml_types.h"
54 #include "orte/mca/errmgr/errmgr.h"
55 #include "orte/runtime/orte_globals.h"
56 #include "orte/util/name_fns.h"
57
58 #include "orte/mca/sstore/sstore.h"
59 #include "orte/mca/sstore/base/base.h"
60
61 #include "orte/mca/snapc/snapc.h"
62 #include "orte/mca/snapc/base/base.h"
63
/******************
 * Global Variables
 ******************/
67 size_t orte_snapc_base_snapshot_seq_number = 0;
68
69 /******************
70 * Object stuff
71 ******************/
72 OBJ_CLASS_INSTANCE(orte_snapc_base_local_snapshot_t,
73 opal_list_item_t,
74 orte_snapc_base_local_snapshot_construct,
75 orte_snapc_base_local_snapshot_destruct);
76
orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t * snapshot)77 void orte_snapc_base_local_snapshot_construct(orte_snapc_base_local_snapshot_t *snapshot)
78 {
79 snapshot->process_name.jobid = 0;
80 snapshot->process_name.vpid = 0;
81
82 snapshot->state = ORTE_SNAPC_CKPT_STATE_NONE;
83
84 snapshot->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
85 }
86
/*
 * Destructor: the local snapshot holds no resources that are released
 * here, so the fields are simply returned to the same values the
 * constructor assigns.
 */
void orte_snapc_base_local_snapshot_destruct( orte_snapc_base_local_snapshot_t *snapshot)
{
    snapshot->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
    snapshot->state     = ORTE_SNAPC_CKPT_STATE_NONE;

    snapshot->process_name.vpid  = 0;
    snapshot->process_name.jobid = 0;
}
96
97 /****/
98 OBJ_CLASS_INSTANCE(orte_snapc_base_global_snapshot_t,
99 opal_list_item_t,
100 orte_snapc_base_global_snapshot_construct,
101 orte_snapc_base_global_snapshot_destruct);
102
orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t * snapshot)103 void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t *snapshot)
104 {
105 OBJ_CONSTRUCT(&(snapshot->local_snapshots), opal_list_t);
106
107 snapshot->options = OBJ_NEW(opal_crs_base_ckpt_options_t);
108
109 snapshot->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
110 }
111
/*
 * Destructor: release every entry still on the local_snapshots list,
 * tear the list down, drop the options object and invalidate the
 * storage handle.
 */
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *snapshot)
{
    opal_list_item_t *entry;

    /* Pop entries off the front until the list reports it is empty */
    for (entry = opal_list_remove_first(&snapshot->local_snapshots);
         NULL != entry;
         entry = opal_list_remove_first(&snapshot->local_snapshots)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&(snapshot->local_snapshots));

    if (NULL != snapshot->options) {
        OBJ_RELEASE(snapshot->options);
        snapshot->options = NULL;
    }

    snapshot->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
}
128
129 OBJ_CLASS_INSTANCE(orte_snapc_base_quiesce_t,
130 opal_object_t,
131 orte_snapc_base_quiesce_construct,
132 orte_snapc_base_quiesce_destruct);
133
orte_snapc_base_quiesce_construct(orte_snapc_base_quiesce_t * quiesce)134 void orte_snapc_base_quiesce_construct(orte_snapc_base_quiesce_t *quiesce)
135 {
136 quiesce->epoch = -1;
137 quiesce->snapshot = NULL;
138 quiesce->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
139 quiesce->ss_snapshot = NULL;
140 quiesce->handle = NULL;
141 quiesce->target_dir = NULL;
142 quiesce->crs_name = NULL;
143 quiesce->cmdline = NULL;
144 quiesce->cr_state = OPAL_CRS_NONE;
145 quiesce->checkpointing = false;
146 quiesce->restarting = false;
147
148 quiesce->migrating = false;
149 quiesce->num_migrating = 0;
150 OBJ_CONSTRUCT(&(quiesce->migrating_procs), opal_pointer_array_t);
151 opal_pointer_array_init(&(quiesce->migrating_procs), 8, INT32_MAX, 8);
152 }
153
/*
 * Destructor: release the objects and strings referenced by the
 * quiesce object, release every non-NULL entry of migrating_procs,
 * and reset the scalar fields to the values the constructor assigns.
 */
void orte_snapc_base_quiesce_destruct( orte_snapc_base_quiesce_t *quiesce)
{
    int idx;
    void *proc = NULL;

    quiesce->epoch = -1;

    /* Reference-counted members */
    if (NULL != quiesce->snapshot) {
        OBJ_RELEASE(quiesce->snapshot);
        quiesce->snapshot = NULL;
    }

    quiesce->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
    if (NULL != quiesce->ss_snapshot) {
        OBJ_RELEASE(quiesce->ss_snapshot);
        quiesce->ss_snapshot = NULL;
    }

    /* Heap strings; free(NULL) is a no-op so no guards are needed */
    free(quiesce->handle);
    quiesce->handle = NULL;

    free(quiesce->target_dir);
    quiesce->target_dir = NULL;

    free(quiesce->crs_name);
    quiesce->crs_name = NULL;

    free(quiesce->cmdline);
    quiesce->cmdline = NULL;

    quiesce->cr_state      = OPAL_CRS_NONE;
    quiesce->checkpointing = false;
    quiesce->restarting    = false;

    quiesce->migrating     = false;
    quiesce->num_migrating = 0;

    /* Walk the whole array: slots may be empty, so skip NULL entries */
    idx = 0;
    while (idx < quiesce->migrating_procs.size) {
        proc = opal_pointer_array_get_item(&(quiesce->migrating_procs), idx);
        if (NULL != proc) {
            OBJ_RELEASE(proc);
        }
        ++idx;
    }
    OBJ_DESTRUCT(&(quiesce->migrating_procs));
}
203
204 OBJ_CLASS_INSTANCE(orte_snapc_base_request_op_t,
205 opal_object_t,
206 orte_snapc_base_request_op_construct,
207 orte_snapc_base_request_op_destruct);
208
orte_snapc_base_request_op_construct(orte_snapc_base_request_op_t * op)209 void orte_snapc_base_request_op_construct(orte_snapc_base_request_op_t *op)
210 {
211 op->event = ORTE_SNAPC_OP_NONE;
212 op->is_active = false;
213 op->leader = -1;
214
215 op->seq_num = -1;
216 op->global_handle = NULL;
217 op->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
218
219 op->mig_num = -1;
220 op->mig_vpids = NULL;
221 /*op->mig_host_pref = NULL;*/
222 op->mig_vpid_pref = NULL;
223 op->mig_off_node = NULL;
224 }
225
/*
 * Destructor: free the global handle string and reset the scalar
 * fields to the values the constructor assigns.
 *
 * The mig_vpids, mig_vpid_pref and mig_off_node pointers are left
 * untouched; this destructor does not free them.
 */
void orte_snapc_base_request_op_destruct( orte_snapc_base_request_op_t *op)
{
    /* free(NULL) is a no-op, so the handle needs no guard */
    free(op->global_handle);
    op->global_handle = NULL;

    op->ss_handle = ORTE_SSTORE_HANDLE_INVALID;
    op->seq_num   = -1;
    op->mig_num   = -1;

    op->leader    = -1;
    op->is_active = false;
    op->event     = ORTE_SNAPC_OP_NONE;
}
263
264
265 /***********************
266 * None component stuff
267 ************************/
orte_snapc_base_none_open(void)268 int orte_snapc_base_none_open(void)
269 {
270 return ORTE_SUCCESS;
271 }
272
orte_snapc_base_none_close(void)273 int orte_snapc_base_none_close(void)
274 {
275 return ORTE_SUCCESS;
276 }
277
/*
 * "None" component query hook.
 *
 * @param module   (OUT) always set to NULL
 * @param priority (OUT) always set to 0
 * @return OPAL_SUCCESS
 */
int orte_snapc_base_none_query(mca_base_module_t **module, int *priority)
{
    *priority = 0;
    *module   = NULL;

    return OPAL_SUCCESS;
}
285
orte_snapc_base_module_init(bool seed,bool app)286 int orte_snapc_base_module_init(bool seed, bool app)
287 {
288 return ORTE_SUCCESS;
289 }
290
orte_snapc_base_module_finalize(void)291 int orte_snapc_base_module_finalize(void)
292 {
293 return ORTE_SUCCESS;
294 }
295
296 /* None RML command line response callback */
297 static void snapc_none_global_cmdline_request(int status,
298 orte_process_name_t* sender,
299 opal_buffer_t *buffer,
300 orte_rml_tag_t tag,
301 void* cbdata);
orte_snapc_base_none_setup_job(orte_jobid_t jobid)302 int orte_snapc_base_none_setup_job(orte_jobid_t jobid)
303 {
304
305 /*
306 * Coordinator command listener
307 */
308 orte_snapc_base_snapshot_seq_number = -1;
309 orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
310 ORTE_RML_TAG_CKPT,
311 ORTE_RML_PERSISTENT,
312 snapc_none_global_cmdline_request,
313 NULL);
314
315 return ORTE_SUCCESS;
316 }
317
/*
 * "None" component job release. Nothing is torn down here: the
 * receive posted by the setup routine is not cancelled by this
 * function.
 *
 * @param jobid  unused
 * @return ORTE_SUCCESS
 */
int orte_snapc_base_none_release_job(orte_jobid_t jobid)
{
    (void)jobid;

    return ORTE_SUCCESS;
}
326
orte_snapc_base_none_ft_event(int state)327 int orte_snapc_base_none_ft_event(int state)
328 {
329 return ORTE_SUCCESS;
330 }
331
/*
 * "None" component start-checkpoint hook: the datum is ignored and
 * ORTE_SUCCESS is always returned.
 */
int orte_snapc_base_none_start_ckpt(orte_snapc_base_quiesce_t *datum)
{
    (void)datum;

    return ORTE_SUCCESS;
}
336
/*
 * "None" component end-checkpoint hook: the datum is ignored and
 * ORTE_SUCCESS is always returned.
 */
int orte_snapc_base_none_end_ckpt(orte_snapc_base_quiesce_t *datum)
{
    (void)datum;

    return ORTE_SUCCESS;
}
341
342
343 /********************
344 * Local Functions
345 ********************/
346 /* None RML response callback */
/*
 * RML receive callback for the "none" component.
 *
 * Unpacks the SnapC command from the incoming buffer. For
 * ORTE_SNAPC_GLOBAL_INIT_CMD it completes the request handshake with
 * the sender and then replies with ORTE_SNAPC_CKPT_STATE_NO_CKPT.
 * Any other command is logged as ORTE_ERR_BAD_PARAM.
 *
 * @param status  unused
 * @param sender  process that sent the request; the reply goes to it
 * @param buffer  incoming message (command, then options and jobid)
 * @param tag     unused
 * @param cbdata  unused
 */
static void snapc_none_global_cmdline_request(int status,
                                              orte_process_name_t* sender,
                                              opal_buffer_t *buffer,
                                              orte_rml_tag_t tag,
                                              void* cbdata)
{
    int ret;
    orte_snapc_cmd_flag_t command;
    orte_std_cntr_t n = 1;
    opal_crs_base_ckpt_options_t *options = NULL;
    orte_jobid_t jobid;

    (void)status;
    (void)tag;
    (void)cbdata;

    /* The options object is written through during the handshake,
     * so an allocation failure must be caught before going further. */
    options = OBJ_NEW(opal_crs_base_ckpt_options_t);
    if (NULL == options) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_SNAPC_CMD))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    /*
     * orte_checkpoint has requested that a checkpoint be taken
     * Respond that a checkpoint cannot be taken at this time
     */
    if (ORTE_SNAPC_GLOBAL_INIT_CMD == command) {
        /*
         * Do the basic handshake with the orte_checkpoint command
         */
        if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_init_cmd(sender, buffer, options, &jobid)) ) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

        /*
         * Respond with an invalid response
         */
        if( ORTE_SUCCESS != (ret = orte_snapc_base_global_coord_ckpt_update_cmd(sender, 0, ORTE_SNAPC_CKPT_STATE_NO_CKPT)) ) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
    }
    /*
     * Unknown command: at this point ret still holds ORTE_SUCCESS from
     * the unpack above, so log an explicit error code instead.
     */
    else {
        ret = ORTE_ERR_BAD_PARAM;
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

 cleanup:
    OBJ_RELEASE(options);
    options = NULL;

    return;
}
404
405 /********************
406 * Utility functions
407 ********************/
408
409 /* Report the checkpoint status */
orte_snapc_ckpt_state_notify(int state)410 void orte_snapc_ckpt_state_notify(int state)
411 {
412 switch(state) {
413 case ORTE_SNAPC_CKPT_STATE_ESTABLISHED:
414 opal_output(0, "%d: Checkpoint established for process %s.",
415 orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
416 break;
417 case ORTE_SNAPC_CKPT_STATE_NO_CKPT:
418 opal_output(0, "%d: Process %s is not checkpointable.",
419 orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
420 break;
421 case ORTE_SNAPC_CKPT_STATE_ERROR:
422 opal_output(0, "%d: Failed to checkpoint process %s.",
423 orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
424 break;
425 case ORTE_SNAPC_CKPT_STATE_RECOVERED:
426 opal_output(0, "%d: Successfully restarted process %s.",
427 orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
428 break;
429 case ORTE_SNAPC_CKPT_STATE_NO_RESTART:
430 opal_output(0, "%d: Failed to restart process %s.",
431 orte_process_info.pid, ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
432 break;
433 /* ADK: We currently do not notify for these states, but good to
434 * have them around anyways. */
435 case ORTE_SNAPC_CKPT_STATE_NONE:
436 case ORTE_SNAPC_CKPT_STATE_REQUEST:
437 case ORTE_SNAPC_CKPT_STATE_PENDING:
438 case ORTE_SNAPC_CKPT_STATE_RUNNING:
439 case ORTE_SNAPC_CKPT_STATE_STOPPED:
440 case ORTE_SNAPC_CKPT_STATE_MIGRATING:
441 case ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL:
442 default:
443 break;
444 }
445 }
446
/*
 * Receive the remainder of a command-line checkpoint request. The
 * command flag itself has already been unpacked by the caller; this
 * routine unpacks the checkpoint options and then the target jobid.
 *
 * @param peer     requester; if it is this process's HNP name the
 *                 call is a no-op that returns ORTE_SUCCESS
 * @param buffer   message to unpack from
 * @param options  (OUT) filled in by orte_snapc_base_unpack_options()
 * @param jobid    (OUT) jobid unpacked from the message
 * @return ORTE_SUCCESS, or the error code of the failing unpack
 */
int orte_snapc_base_global_coord_ckpt_init_cmd(orte_process_name_t* peer,
                                               opal_buffer_t* buffer,
                                               opal_crs_base_ckpt_options_t *options,
                                               orte_jobid_t *jobid)
{
    int rc;
    orte_std_cntr_t num_items;

    /*
     * Do not send to self, as that is silly.
     */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_HNP)) {
        OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                             "%s) base:ckpt_init_cmd: Error: Do not send to self!\n",
                             ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                         "%s) base:ckpt_init_cmd: Receiving commands\n",
                         ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));

    /* First payload item: the checkpoint options */
    rc = orte_snapc_base_unpack_options(buffer, options);
    if (ORTE_SUCCESS != rc) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "%s) base:ckpt_init_cmd: Error: Unpack (options) Failure (ret = %d)\n",
                    ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), rc );
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* Second payload item: the jobid */
    num_items = 1;
    rc = opal_dss.unpack(buffer, jobid, &num_items, ORTE_JOBID);
    if (ORTE_SUCCESS != rc) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "%s) base:ckpt_init_cmd: Error: DSS Unpack (jobid) Failure (ret = %d) (LINE = %d)\n",
                    ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                    rc, __LINE__);
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                         "%s) base:ckpt_init_cmd: Received [%d, %d, %s]\n",
                         ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                         (int)(options->term),
                         (int)(options->stop),
                         ORTE_JOBID_PRINT(*jobid)));

    return ORTE_SUCCESS;
}
509
/*
 * Unpack the checkpoint option flags from a buffer, in this order:
 * term, stop, inc_prep_only, inc_recover_only and, when
 * OPAL_ENABLE_CRDEBUG is 1, attach_debugger and detach_debugger.
 * Each flag is unpacked as a single OPAL_BOOL.
 *
 * @param buffer   buffer to read from
 * @param options  (OUT) option object receiving the flags
 * @return ORTE_SUCCESS, or the code of the first unpack that failed
 *         (later flags are then left untouched)
 */
int orte_snapc_base_unpack_options(opal_buffer_t* buffer,
                                   opal_crs_base_ckpt_options_t *options)
{
    int rc;
    orte_std_cntr_t num_items;

    num_items = 1;
    rc = opal_dss.unpack(buffer, &(options->term), &num_items, OPAL_BOOL);
    if (ORTE_SUCCESS != rc) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "snapc:base:unpack_options: Error: Unpack (term) Failure (ret = %d)\n",
                    rc);
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    num_items = 1;
    rc = opal_dss.unpack(buffer, &(options->stop), &num_items, OPAL_BOOL);
    if (ORTE_SUCCESS != rc) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "snapc:base:unpack_options: Error: Unpack (stop) Failure (ret = %d)\n",
                    rc);
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    num_items = 1;
    rc = opal_dss.unpack(buffer, &(options->inc_prep_only), &num_items, OPAL_BOOL);
    if (ORTE_SUCCESS != rc) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "snapc:base:unpack_options: Error: Unpack (inc_prep_only) Failure (ret = %d)\n",
                    rc);
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    num_items = 1;
    rc = opal_dss.unpack(buffer, &(options->inc_recover_only), &num_items, OPAL_BOOL);
    if (ORTE_SUCCESS != rc) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "snapc:base:unpack_options: Error: Unpack (inc_recover_only) Failure (ret = %d)\n",
                    rc);
        ORTE_ERROR_LOG(rc);
        return rc;
    }

#if OPAL_ENABLE_CRDEBUG == 1
    num_items = 1;
    rc = opal_dss.unpack(buffer, &(options->attach_debugger), &num_items, OPAL_BOOL);
    if (ORTE_SUCCESS != rc) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "snapc:base:unpack_options: Error: Unpack (attach_debugger) Failure (ret = %d)\n",
                    rc);
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    num_items = 1;
    rc = opal_dss.unpack(buffer, &(options->detach_debugger), &num_items, OPAL_BOOL);
    if (ORTE_SUCCESS != rc) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "snapc:base:unpack_options: Error: Unpack (detach_debugger) Failure (ret = %d)\n",
                    rc);
        ORTE_ERROR_LOG(rc);
        return rc;
    }
#endif

    return ORTE_SUCCESS;
}
581
/*
 * Pack the checkpoint option flags into a buffer, in this order:
 * term, stop, inc_prep_only, inc_recover_only and, when
 * OPAL_ENABLE_CRDEBUG is 1, attach_debugger and detach_debugger.
 * Each flag is packed as a single OPAL_BOOL, mirroring the order
 * used by orte_snapc_base_unpack_options().
 *
 * @param buffer   buffer to append to
 * @param options  option object supplying the flags
 * @return ORTE_SUCCESS, or the code of the first pack that failed
 */
int orte_snapc_base_pack_options(opal_buffer_t* buffer,
                                 opal_crs_base_ckpt_options_t *options)
{
    int rc;

    rc = opal_dss.pack(buffer, &(options->term), 1, OPAL_BOOL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    rc = opal_dss.pack(buffer, &(options->stop), 1, OPAL_BOOL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    rc = opal_dss.pack(buffer, &(options->inc_prep_only), 1, OPAL_BOOL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    rc = opal_dss.pack(buffer, &(options->inc_recover_only), 1, OPAL_BOOL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

#if OPAL_ENABLE_CRDEBUG == 1
    rc = opal_dss.pack(buffer, &(options->attach_debugger), 1, OPAL_BOOL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    rc = opal_dss.pack(buffer, &(options->detach_debugger), 1, OPAL_BOOL);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
#endif

    return ORTE_SUCCESS;
}
628
/*
 * Send a checkpoint status update to a requesting peer.
 *
 * The message always carries the update command and ckpt_status.
 * For the RECOVERED, ESTABLISHED, STOPPED and ERROR states it also
 * carries the global snapshot reference and its sequence number; in
 * the ERROR state (or when the sstore lookups fail) these are a NULL
 * string and -1 respectively.
 *
 * The call is a no-op returning ORTE_SUCCESS when peer is NULL, is
 * the invalid name, or is this process's HNP name.
 *
 * @param peer         process to notify
 * @param ss_handle    sstore handle used to look up the snapshot
 *                     reference and sequence number
 * @param ckpt_status  checkpoint state to report
 * @return ORTE_SUCCESS, ORTE_ERROR if the buffer cannot be allocated,
 *         or the error code of the failing pack/send
 */
int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
                                                 orte_sstore_base_handle_t ss_handle,
                                                 int ckpt_status)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_buffer_t *loc_buffer = NULL;
    orte_snapc_cmd_flag_t command = ORTE_SNAPC_GLOBAL_UPDATE_CMD;
    char *global_snapshot_handle = NULL;
    char *tmp_str = NULL;
    int seq_num = -1;

    /*
     * Noop if invalid peer, or peer not specified (JJH Double check this)
     */
    if( NULL == peer ||
        OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer) ) {
        /*return ORTE_ERR_BAD_PARAM;*/
        return ORTE_SUCCESS;
    }

    /*
     * Do not send to self, as that is silly.
     */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_HNP)) {
        OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                             "%s) base:ckpt_update_cmd: Error: Do not send to self!\n",
                             ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type)));
        return ORTE_SUCCESS;
    }

    /*
     * Pass on the checkpoint state.
     */
    orte_snapc_ckpt_state_notify(ckpt_status);

    OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                         "%s) base:ckpt_update_cmd: Sending update command <status %d>\n",
                         ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                         ckpt_status));

    /********************
     * Send over the status of the checkpoint
     * - ckpt_state
     * - global snapshot handle (upon finish only)
     * - sequence number        (upon finish only)
     ********************/
    if (NULL == (loc_buffer = OBJ_NEW(opal_buffer_t))) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &command, 1, ORTE_SNAPC_CMD)) ) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &ckpt_status, 1, OPAL_INT))) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "%s) base:ckpt_update_cmd: Error: DSS Pack (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
                    ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                    ret, __LINE__);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    if( ORTE_SNAPC_CKPT_STATE_RECOVERED   == ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ESTABLISHED == ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_STOPPED     == ckpt_status ||
        ORTE_SNAPC_CKPT_STATE_ERROR       == ckpt_status ) {

        if( ORTE_SNAPC_CKPT_STATE_ERROR != ckpt_status ) {
            if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
                                                            SSTORE_METADATA_GLOBAL_SNAP_REF,
                                                            &global_snapshot_handle)) ) {
                opal_output(orte_snapc_base_framework.framework_output,
                            "%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
                            ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
                ORTE_ERROR_LOG(ret);
                /* Do not exit here, continue so that we can inform the tool
                 * that the checkpoint has failed
                 */
            }

            if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
                                                            SSTORE_METADATA_GLOBAL_SNAP_SEQ,
                                                            &tmp_str)) ) {
                opal_output(orte_snapc_base_framework.framework_output,
                            "%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
                            ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
                ORTE_ERROR_LOG(ret);
                /* Do not exit here, continue so that we can inform the tool
                 * that the checkpoint has failed
                 */
            }

            /* No sequence string means "unknown", reported as -1 */
            seq_num = (NULL != tmp_str) ? atoi(tmp_str) : -1;
        } else {
            /* Checkpoint Error Case */
            global_snapshot_handle = NULL;
            seq_num = -1;
        }

        /*
         * The handle is NULL in the error case and after a failed
         * get_attr; passing NULL to "%s" is undefined behavior, so
         * substitute a printable placeholder.
         */
        OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_framework.framework_output,
                             "%s) base:ckpt_update_cmd: Sending update command <status %d> + <ref %s> <seq %d>\n",
                             ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                             ckpt_status,
                             (NULL != global_snapshot_handle) ? global_snapshot_handle : "(null)",
                             seq_num));

        if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &global_snapshot_handle, 1, OPAL_STRING))) {
            opal_output(orte_snapc_base_framework.framework_output,
                        "%s) base:ckpt_update_cmd: Error: DSS Pack (snapshot handle) Failure (ret = %d) (LINE = %d)\n",
                        ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                        ret, __LINE__);
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        if (ORTE_SUCCESS != (ret = opal_dss.pack(loc_buffer, &seq_num, 1, OPAL_INT))) {
            opal_output(orte_snapc_base_framework.framework_output,
                        "%s) base:ckpt_update_cmd: Error: DSS Pack (seq number) Failure (ret = %d) (LINE = %d)\n",
                        ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                        ret, __LINE__);
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }
    }

    if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                           peer, loc_buffer,
                                           ORTE_RML_TAG_CKPT,
                                           orte_rml_send_callback, NULL))) {
        opal_output(orte_snapc_base_framework.framework_output,
                    "%s) base:ckpt_update_cmd: Error: Send (ckpt_status) Failure (ret = %d) (LINE = %d)\n",
                    ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type),
                    ret, __LINE__);
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * The send was accepted: the buffer must not be released here.
     * NOTE(review): presumably orte_rml_send_callback releases it once
     * the send completes - confirm against the RML implementation.
     */
    loc_buffer = NULL;

 cleanup:
    /* Still non-NULL only when a failure occurred before hand-off */
    if( NULL != loc_buffer ) {
        OBJ_RELEASE(loc_buffer);
        loc_buffer = NULL;
    }

    free(global_snapshot_handle);
    global_snapshot_handle = NULL;

    free(tmp_str);
    tmp_str = NULL;

    return exit_status;
}
796
797 /****************************
798 * Command line tool request functions
799 ****************************/
800 /* JJH TODO - Move the command line functions here ? */
801
802 /*****************************
803 * Snapshot metadata functions
804 *****************************/
orte_snapc_ckpt_state_str(char ** state_str,int state)805 int orte_snapc_ckpt_state_str(char ** state_str, int state)
806 {
807 switch(state) {
808 case ORTE_SNAPC_CKPT_STATE_NONE:
809 *state_str = strdup(" -- ");
810 break;
811 case ORTE_SNAPC_CKPT_STATE_REQUEST:
812 *state_str = strdup("Requested");
813 break;
814 case ORTE_SNAPC_CKPT_STATE_PENDING:
815 *state_str = strdup("Pending");
816 break;
817 case ORTE_SNAPC_CKPT_STATE_RUNNING:
818 *state_str = strdup("Running");
819 break;
820 case ORTE_SNAPC_CKPT_STATE_STOPPED:
821 *state_str = strdup("Stopped");
822 break;
823 case ORTE_SNAPC_CKPT_STATE_MIGRATING:
824 *state_str = strdup("Migrating");
825 break;
826 case ORTE_SNAPC_CKPT_STATE_ESTABLISHED:
827 *state_str = strdup("Checkpoint Established");
828 break;
829 case ORTE_SNAPC_CKPT_STATE_RECOVERED:
830 *state_str = strdup("Continuing/Recovered");
831 break;
832 case ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL:
833 *state_str = strdup("Locally Finished");
834 break;
835 case ORTE_SNAPC_CKPT_STATE_ERROR:
836 *state_str = strdup("Error");
837 break;
838 default:
839 asprintf(state_str, "Unknown %d", state);
840 break;
841 }
842
843 return ORTE_SUCCESS;
844 }
845