1 /*
2  * Copyright (c) 2009-2011 The Trustees of Indiana University.
3  *                         All rights reserved.
4  *
5  * Copyright (c) 2010      Cisco Systems, Inc.  All rights reserved.
6  *
7  * Copyright (c) 2004-2006 The University of Tennessee and The University
8  *                         of Tennessee Research Foundation.  All rights
9  *                         reserved.
10  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
11  *                         All rights reserved.
12  * Copyright (c) 2013-2017 Intel, Inc.  All rights reserved.
13  * $COPYRIGHT$
14  *
15  * Additional copyrights may follow
16  *
17  * $HEADER$
18  */
19 
20 #include "orte_config.h"
21 
22 #include <sys/types.h>
23 #ifdef HAVE_UNISTD_H
24 #include <unistd.h>
25 #endif  /* HAVE_UNISTD_H */
26 #include <string.h>
27 
28 #include "opal/util/output.h"
29 #include "opal/dss/dss.h"
30 
31 #include "orte/util/error_strings.h"
32 #include "orte/util/name_fns.h"
33 #include "orte/util/show_help.h"
34 #include "orte/util/threads.h"
35 #include "orte/runtime/orte_globals.h"
36 #include "orte/mca/rml/rml.h"
37 #include "orte/mca/odls/odls_types.h"
38 #include "orte/mca/state/state.h"
39 
40 #include "orte/mca/errmgr/base/base.h"
41 #include "orte/mca/errmgr/base/errmgr_private.h"
42 #include "errmgr_default_tool.h"
43 
44 /*
45  * Module functions: Global
46  */
47 static int init(void);
48 static int finalize(void);
49 
50 static int abort_peers(orte_process_name_t *procs,
51                        orte_std_cntr_t num_procs,
52                        int error_code);
53 
/******************
 * Tool module
 ******************/
57 orte_errmgr_base_module_t orte_errmgr_default_tool_module = {
58     .init= init,
59     .finalize = finalize,
60     .logfn = orte_errmgr_base_log,
61     .abort = orte_errmgr_base_abort,
62     .abort_peers = abort_peers
63 };
64 
/* State-machine callback registered by init() for ORTE_PROC_STATE_ERROR. */
static void proc_errors(int fd, short args, void *cbdata);
66 
67 /************************
68  * API Definitions
69  ************************/
init(void)70 static int init(void)
71 {
72     /* setup state machine to trap proc errors */
73     orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
74 
75     return ORTE_SUCCESS;
76 }
77 
finalize(void)78 static int finalize(void)
79 {
80     return ORTE_SUCCESS;
81 }
82 
proc_errors(int fd,short args,void * cbdata)83 static void proc_errors(int fd, short args, void *cbdata)
84 {
85     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
86 
87     ORTE_ACQUIRE_OBJECT(caddy);
88 
89     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
90                          "%s errmgr:default_tool: proc %s state %s",
91                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
92                          ORTE_NAME_PRINT(&caddy->name),
93                          orte_proc_state_to_str(caddy->proc_state)));
94 
95     /*
96      * if orte is trying to shutdown, just let it
97      */
98     if (orte_finalizing) {
99         OBJ_RELEASE(caddy);
100         return;
101     }
102 
103     /* if we lost our lifeline, then just stop the event loop
104      * so the main program can cleanly terminate */
105     if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
106         ORTE_POST_OBJECT(caddy);
107         orte_event_base_active = false;
108     } else {
109         /* all other errors require abort */
110         orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
111     }
112 
113     OBJ_RELEASE(caddy);
114 }
115 
/*
 * abort_peers hook for tools: rather than acting on the peer list,
 * simply abort this process through orte_errmgr_base_abort().
 *
 * procs, num_procs - not used
 * error_code       - passed through to orte_errmgr_base_abort()
 *
 * When the errmgr framework output has a verbosity above zero, the abort
 * call carries a message naming this process; otherwise no message is
 * supplied. Returns ORTE_SUCCESS if the abort call returns.
 */
static int abort_peers(orte_process_name_t *procs,
                       orte_std_cntr_t num_procs,
                       int error_code)
{
    int verbosity;

    /* the peer list is deliberately ignored */
    (void) procs;
    (void) num_procs;

    verbosity = opal_output_get_verbosity(orte_errmgr_base_framework.framework_output);

    if (verbosity <= 0) {
        /* quiet: just abort */
        orte_errmgr_base_abort(error_code, NULL);
        return ORTE_SUCCESS;
    }

    /* verbose: say who asked for the abort */
    orte_errmgr_base_abort(error_code, "%s called abort_peers",
                           ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    return ORTE_SUCCESS;
}
129