1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2011 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
14  * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
15  * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
16  *                         reserved.
17  * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
18  * Copyright (c) 2014      NVIDIA Corporation.  All rights reserved.
19  * $COPYRIGHT$
20  *
21  * Additional copyrights may follow
22  *
23  * $HEADER$
24  */
25 /** @file:
26  *
27  * The Open RTE Error and Recovery Manager (ErrMgr)
28  *
29  * This framework is the logically central clearing house for process/daemon
30  * state updates. In particular when a process fails and another process detects
31  * it, then that information is reported through this framework. This framework
32  * then (depending on the active component) decides how to handle the failure.
33  *
34  * For example, if a process fails this may activate an automatic recovery
35  * of the process from a previous checkpoint, or initial state. Conversely,
36  * the active component could decide not to continue the job, and request that
37  * it be terminated. The error and recovery policy is determined by individual
38  * components within this framework.
39  *
40  */
41 
42 #ifndef ORTE_MCA_ERRMGR_H
43 #define ORTE_MCA_ERRMGR_H
44 
45 /*
46  * includes
47  */
48 
49 #include "orte_config.h"
50 #include "orte/constants.h"
51 #include "orte/types.h"
52 
53 #include "orte/mca/mca.h"
54 #include "opal/mca/base/base.h"
55 
56 #include "opal/class/opal_object.h"
57 #include "opal/class/opal_pointer_array.h"
58 #include "opal/util/output.h"
59 #include "opal/util/error.h"
60 
61 #include "orte/runtime/orte_globals.h"
62 #include "orte/mca/plm/plm_types.h"
63 
64 BEGIN_C_DECLS
65 
66 /*
67  * Macro definitions
68  */
/*
 * These macros and the associated error name array are used to output
 * intelligible error messages.
 */

/* Expands to opal_strerror(n) for the given error code n. */
#define ORTE_ERROR_NAME(n)  opal_strerror(n)

/*
 * Pass error code n to the logfn entry of the global orte_errmgr module,
 * together with the source file and line of the call site.
 *
 * The expansion is a single expression with no trailing semicolon; the
 * call site supplies its own terminator:
 *
 *     ORTE_ERROR_LOG(rc);
 *
 * A semicolon baked into the macro would leave a stray empty statement
 * behind, which detaches the else branch (a compile error) when the macro
 * is used as the body of an unbraced if/else.
 */
#define ORTE_ERROR_LOG(n)                       \
        orte_errmgr.logfn((n), __FILE__, __LINE__)
77 
78 /*
79  * Framework Interfaces
80  */
/**
 * Signature of a module's initialization entry point.
 *
 * Takes no arguments and reports its outcome as an int status.
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_module_init_fn_t)(void);
88 
/**
 * Signature of a module's finalization entry point.
 *
 * Takes no arguments and reports its outcome as an int status.
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_module_finalize_fn_t)(void);
96 
/**
 * Signature of the error logging hook.
 *
 * This is not part of any module so it can be used at any time!
 * ORTE_ERROR_LOG() reaches it through orte_errmgr.logfn, passing the
 * error code along with __FILE__ and __LINE__ of the call site.
 *
 * @param error_code  The error code being reported
 * @param filename    Source file name of the reporting site
 * @param line        Source line number of the reporting site
 */
typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code,
                                                 char *filename,
                                                 int line);
101 
102 /**
103  * Alert - self aborting
104  * This function is called when a process is aborting due to some internal error.
105  * It will finalize the process
106  * itself, and then exit - it takes no other actions. The intent here is to provide
107  * a last-ditch exit procedure that attempts to clean up a little.
108  */
109 typedef void (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
110 __opal_attribute_format_funcptr__(__printf__, 2, 3);
111 
112 /**
113  * Alert - abort peers
114  *  This function is called when a process wants to abort one or more peer processes.
115  *  For example, MPI_Abort(comm) will use this function to terminate peers in the
116  *  communicator group before aborting itself.
117  */
118 typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs,
119                                                         orte_std_cntr_t num_procs,
120                                                         int error_code);
121 
122 /*
123  * Module Structure
124  */
125 struct orte_errmgr_base_module_2_3_0_t {
126     /** Initialization Function */
127     orte_errmgr_base_module_init_fn_t                       init;
128     /** Finalization Function */
129     orte_errmgr_base_module_finalize_fn_t                   finalize;
130 
131     orte_errmgr_base_module_log_fn_t                        logfn;
132     orte_errmgr_base_module_abort_fn_t                      abort;
133     orte_errmgr_base_module_abort_peers_fn_t                abort_peers;
134 };
135 typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
136 typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
137 ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr;
138 
139 /*
140  * ErrMgr Component
141  */
142 struct orte_errmgr_base_component_3_0_0_t {
143     /** MCA base component */
144     mca_base_component_t base_version;
145     /** MCA base data */
146     mca_base_component_data_t base_data;
147 
148     /** Verbosity Level */
149     int verbose;
150     /** Output Handle for opal_output */
151     int output_handle;
152     /** Default Priority */
153     int priority;
154 };
155 typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t;
156 typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t;
157 
/*
 * Version macro for components of type errmgr: expands to
 * ORTE_MCA_BASE_VERSION_2_1_0 with the framework name "errmgr" and
 * framework version 3.0.0.
 */
#define ORTE_ERRMGR_BASE_VERSION_3_0_0 ORTE_MCA_BASE_VERSION_2_1_0("errmgr", 3, 0, 0)
163 
164 END_C_DECLS
165 
166 #endif
167