/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2011 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
 * Copyright (c) 2011-2015 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2013-2017 Intel, Inc.  All rights reserved.
 * Copyright (c) 2014      NVIDIA Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/** @file:
 *
 * The Open RTE Error and Recovery Manager (ErrMgr)
 *
 * This framework is the logically central clearing house for process/daemon
 * state updates. In particular when a process fails and another process detects
 * it, then that information is reported through this framework. This framework
 * then (depending on the active component) decides how to handle the failure.
 *
 * For example, if a process fails this may activate an automatic recovery
 * of the process from a previous checkpoint, or initial state. Conversely,
 * the active component could decide not to continue the job, and request that
 * it be terminated. The error and recovery policy is determined by individual
 * components within this framework.
 *
 * This header declares the framework's public interface: the error-logging
 * convenience macros, the function-pointer types a module implements, the
 * module and component structures, and the global ::orte_errmgr module
 * instance.
 */

#ifndef ORTE_MCA_ERRMGR_H
#define ORTE_MCA_ERRMGR_H

/*
 * includes
 */

#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"

#include "orte/mca/mca.h"
#include "opal/mca/base/base.h"

#include "opal/class/opal_object.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/util/output.h"
#include "opal/util/error.h"

#include "orte/runtime/orte_globals.h"
#include "orte/mca/plm/plm_types.h"

BEGIN_C_DECLS

/*
 * Macro definitions
 */
/*
 * These macros and associated error name array are used to output intelligible error
 * messages.
 *
 * ORTE_ERROR_NAME(n) expands to opal_strerror(n).
 *
 * ORTE_ERROR_LOG(n) calls the logfn entry of the global orte_errmgr module,
 * passing the error code together with the __FILE__ and __LINE__ of the call
 * site.
 *
 * NOTE(review): the expansion of ORTE_ERROR_LOG already ends in ';'. Writing
 * "ORTE_ERROR_LOG(rc);" therefore yields an extra empty statement, and an
 * unbraced "if (cond) ORTE_ERROR_LOG(rc); else ..." does not compile. Always
 * brace the enclosing if/else. Dropping the semicolon from the macro would
 * first require confirming that no call site omits its own ';'.
 */

#define ORTE_ERROR_NAME(n)  opal_strerror(n)
#define ORTE_ERROR_LOG(n) \
        orte_errmgr.logfn(n, __FILE__, __LINE__);

/*
 * Framework Interfaces
 */
/**
 * Module initialization function.
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_module_init_fn_t)(void);

/**
 * Module finalization function.
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_errmgr_base_module_finalize_fn_t)(void);

/**
 * Error logging function.
 *
 * This is not part of any module so it can be used at any time!
 *
 * It is the function invoked by the ORTE_ERROR_LOG macro.
 *
 * @param error_code The error code to report
 * @param filename   Source file reporting the error (ORTE_ERROR_LOG passes __FILE__)
 * @param line       Source line reporting the error (ORTE_ERROR_LOG passes __LINE__)
 */
typedef void (*orte_errmgr_base_module_log_fn_t)(int error_code, char *filename, int line);

/**
 * Alert - self aborting
 * This function is called when a process is aborting due to some internal error.
 * It will finalize the process
 * itself, and then exit - it takes no other actions. The intent here is to provide
 * a last-ditch exit procedure that attempts to clean up a little.
 *
 * @param error_code The error code associated with the abort
 * @param fmt        printf-style format string, followed by its arguments
 *
 * The trailing attribute macro marks argument 2 as the format string and
 * argument 3 as the first variadic argument, presumably so that compilers
 * which support it can check calls made through this pointer type.
 */
typedef void (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...)
__opal_attribute_format_funcptr__(__printf__, 2, 3);

/**
 * Alert - abort peers
 *  This function is called when a process wants to abort one or more peer processes.
 *  For example, MPI_Abort(comm) will use this function to terminate peers in the
 *  communicator group before aborting itself.
 *
 * @param procs      Names of the peer processes to abort
 * @param num_procs  Number of entries in @p procs
 * @param error_code The error code associated with the abort
 *
 * @return An int status; presumably an ORTE_* constant as for the other entry
 *         points (TODO confirm against the implementing modules).
 */
typedef int (*orte_errmgr_base_module_abort_peers_fn_t)(orte_process_name_t *procs,
                                                        orte_std_cntr_t num_procs,
                                                        int error_code);

/*
 * Module Structure
 *
 * Table of entry points that makes up an ErrMgr module. The global instance
 * of this structure is orte_errmgr, declared below.
 */
struct orte_errmgr_base_module_2_3_0_t {
    /** Initialization Function */
    orte_errmgr_base_module_init_fn_t                       init;
    /** Finalization Function */
    orte_errmgr_base_module_finalize_fn_t                   finalize;

    /** Error logging function; this is what ORTE_ERROR_LOG calls */
    orte_errmgr_base_module_log_fn_t                        logfn;
    /** Self-abort function (see orte_errmgr_base_module_abort_fn_t) */
    orte_errmgr_base_module_abort_fn_t                      abort;
    /** Peer-abort function (see orte_errmgr_base_module_abort_peers_fn_t) */
    orte_errmgr_base_module_abort_peers_fn_t                abort_peers;
};
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
/** Unversioned alias for the current module structure */
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
/** Global ErrMgr module instance; defined elsewhere, used by ORTE_ERROR_LOG */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr;

/*
 * ErrMgr Component
 */
struct orte_errmgr_base_component_3_0_0_t {
    /** MCA base component */
    mca_base_component_t base_version;
    /** MCA base data */
    mca_base_component_data_t base_data;

    /** Verbosity Level */
    int verbose;
    /** Output Handle for opal_output */
    int output_handle;
    /** Default Priority */
    int priority;
};
typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t;
/** Unversioned alias for the current component structure */
typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t;

/*
 * Macro for use in components that are of type errmgr
 *
 * Expands to ORTE_MCA_BASE_VERSION_2_1_0 with the framework name "errmgr"
 * and the version triple 3, 0, 0.
 */
#define ORTE_ERRMGR_BASE_VERSION_3_0_0 \
    ORTE_MCA_BASE_VERSION_2_1_0("errmgr", 3, 0, 0)

END_C_DECLS

#endif