1 /*
2 * Copyright (c) 2009-2011 The Trustees of Indiana University.
3 * All rights reserved.
4 *
5 * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
6 *
7 * Copyright (c) 2004-2006 The University of Tennessee and The University
8 * of Tennessee Research Foundation. All rights
9 * reserved.
10 * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
11 * All rights reserved.
12 * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
13 * $COPYRIGHT$
14 *
15 * Additional copyrights may follow
16 *
17 * $HEADER$
18 */
19
20 #include "orte_config.h"
21
22 #include <sys/types.h>
23 #ifdef HAVE_UNISTD_H
24 #include <unistd.h>
25 #endif /* HAVE_UNISTD_H */
26 #include <string.h>
27
28 #include "opal/util/output.h"
29 #include "opal/dss/dss.h"
30
31 #include "orte/util/error_strings.h"
32 #include "orte/util/name_fns.h"
33 #include "orte/util/show_help.h"
34 #include "orte/util/threads.h"
35 #include "orte/runtime/orte_globals.h"
36 #include "orte/mca/rml/rml.h"
37 #include "orte/mca/odls/odls_types.h"
38 #include "orte/mca/state/state.h"
39
40 #include "orte/mca/errmgr/base/base.h"
41 #include "orte/mca/errmgr/base/errmgr_private.h"
42 #include "errmgr_default_tool.h"
43
44 /*
45 * Module functions: Global
46 */
47 static int init(void);
48 static int finalize(void);
49
50 static int abort_peers(orte_process_name_t *procs,
51 orte_std_cntr_t num_procs,
52 int error_code);
53
/******************
 * Tool module
 ******************/
57 orte_errmgr_base_module_t orte_errmgr_default_tool_module = {
58 .init= init,
59 .finalize = finalize,
60 .logfn = orte_errmgr_base_log,
61 .abort = orte_errmgr_base_abort,
62 .abort_peers = abort_peers
63 };
64
/* State-machine callback for process error states; registered by init(). */
static void proc_errors(int fd, short args, void *cbdata);
66
67 /************************
68 * API Definitions
69 ************************/
init(void)70 static int init(void)
71 {
72 /* setup state machine to trap proc errors */
73 orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
74
75 return ORTE_SUCCESS;
76 }
77
finalize(void)78 static int finalize(void)
79 {
80 return ORTE_SUCCESS;
81 }
82
proc_errors(int fd,short args,void * cbdata)83 static void proc_errors(int fd, short args, void *cbdata)
84 {
85 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
86
87 ORTE_ACQUIRE_OBJECT(caddy);
88
89 OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
90 "%s errmgr:default_tool: proc %s state %s",
91 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
92 ORTE_NAME_PRINT(&caddy->name),
93 orte_proc_state_to_str(caddy->proc_state)));
94
95 /*
96 * if orte is trying to shutdown, just let it
97 */
98 if (orte_finalizing) {
99 OBJ_RELEASE(caddy);
100 return;
101 }
102
103 /* if we lost our lifeline, then just stop the event loop
104 * so the main program can cleanly terminate */
105 if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
106 ORTE_POST_OBJECT(caddy);
107 orte_event_base_active = false;
108 } else {
109 /* all other errors require abort */
110 orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
111 }
112
113 OBJ_RELEASE(caddy);
114 }
115
/*
 * abort_peers entry of the tool module: simply aborts this process via
 * orte_errmgr_base_abort().
 *
 * procs, num_procs - not consulted by this implementation.
 * error_code       - passed through to orte_errmgr_base_abort().
 *
 * When the errmgr framework output stream has a verbosity above zero,
 * an explanatory message naming this process is supplied to the abort
 * call; otherwise no message is passed.
 *
 * Returns ORTE_SUCCESS if control comes back from the abort call.
 */
static int abort_peers(orte_process_name_t *procs,
                       orte_std_cntr_t num_procs,
                       int error_code)
{
    const int verbosity =
        opal_output_get_verbosity(orte_errmgr_base_framework.framework_output);

    if (verbosity <= 0) {
        /* quiet mode: abort without a message */
        orte_errmgr_base_abort(error_code, NULL);
        return ORTE_SUCCESS;
    }

    orte_errmgr_base_abort(error_code, "%s called abort_peers",
                           ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    return ORTE_SUCCESS;
}
129