1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ 2 /* 3 * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights reserved. 4 * Copyright (c) 2013 Mellanox Technologies, Inc. 5 * All rights reserved. 6 * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. 7 * 8 * $COPYRIGHT$ 9 * 10 * Additional copyrights may follow 11 */ 12 13 /* This is the public RTE interface to the OMPI layer. Any RTE can be 14 * connected to the OMPI layer by creating a new static component in 15 * this framework, assigning it a priority and including a configure.m4 16 * to define when it should be built. 17 * 18 * Each component must provide a number of types and functions that mimic 19 * those provided by ORTE. These include (where flexibility exists, the 20 * ORTE data type is shown, but any compatible type is allowed. For example, 21 * the jobid field in ompi_process_name_t could be any type of integer, but 22 * cannot be a string): 23 * 24 * (a) Process name objects and operations 25 * 1. Definitions for integral types ompi_jobid_t and ompi_vpid_t. 26 * The jobid must be unique for a given MPI_COMM_WORLD capable of 27 * connecting to another OMPI_COMM_WORLD and the vpid will be the 28 * process's rank in MPI_COMM_WORLD. 29 * 2. ompi_process_name_t - a struct that must contain at least two integer-typed fields: 30 * a. ompi_jobid_t jobid 31 * b. ompi_vpid_t vpid 32 * Note that the structure can contain any number of fields beyond these 33 * two, so the process name struct for any particular RTE can be whatever 34 * is desired. 35 * 3. OMPI_NAME_PRINT - a macro that prints a process name when given 36 * a pointer to ompi_process_name_t. The output format is to be 37 * a single string representing the name. This function should 38 * be thread-safe for multiple threads to call simultaneously. 39 * 4. OMPI_PROC_MY_NAME - a pointer to a global variable containing 40 * the ompi_process_name_t for this process. Typically, this is 41 * stored as a field in the ompi_process_info_t struct, but that 42 * is not a requirement. 43 * 5. OMPI_NAME_WIlDCARD - a wildcard name. 44 * 6. ompi_rte_compare_name_fields - a function used to compare fields 45 * in the ompi_process_name_t struct. The function prototype must be 46 * of the form: 47 * int ompi_rte_compare_name_fields(ompi_rte_cmp_bitmask_t mask, 48 * ompi_process_name_t *name1, 49 * ompi_process_name_t *name2); 50 * The bitmask must be defined to indicate the fields to be used 51 * in the comparison. Fields not included in the mask must be ignored. 52 * Supported bitmask values must include: 53 * b. OMPI_RTE_CMP_JOBID 54 * c. OMPI_RTE_CMP_VPID 55 * d. OMPI_RTE_CMP_ALL 56 * 7. uint64_t ompi_rte_hash_name(name) - return a string hash uniquely 57 * representing the ompi_process_name passed in. 58 * 8. OMPI_NAME - an Opal DSS constant for a handler already registered 59 * to serialize/deserialize an ompi_process_name_t structure. 60 * 61 * (b) Collective objects and operations 62 * 1. ompi_rte_collective_t - an OPAL object used during RTE collective operations 63 * such as modex and barrier. It must be an opal_list_item_t and contain the 64 * following fields: 65 * a. id (ORTE type: int32_t) 66 * b. bool active 67 * flag that user can poll on to know when collective 68 * has completed - set to false just prior to 69 * calling user callback function, if provided 70 * 2. ompi_rte_modex - a function that performs an exchange of endpoint information 71 * to wireup the MPI transports. The function prototype must be of the form: 72 * int ompi_rte_modex(ompi_rte_collective_t *coll); 73 * At the completion of the modex operation, the coll->active flag must be set 74 * to false, and the endpoint information must be stored in the modex database. 75 * This function must have barrier semantics across the MPI_COMM_WORLD of the 76 * calling process. 77 * 3. ompi_rte_barrier - a function that performs a barrier operation within the 78 * RTE. The function prototype must be of the form: 79 * int ompi_rte_barrier(ompi_rte_collective_t *coll); 80 * At the completion of the barrier operation, the coll->active flag must be set 81 * to false 82 * 83 * (c) Process info struct 84 * 1. ompi_process_info_t - a struct containing info about the current process. 85 * The struct must contain at least the following fields: 86 * a. app_num - 87 * b. pid - this process's pid. Should be same as getpid(). 88 * c. num_procs - Number of processes in this job (ie, MCW) 89 * d. my_node_rank - relative rank on local node to other peers this run-time 90 * instance knows about. If doing dynamics, this may be something 91 * different than my_local_rank, but will be my_local_rank in a 92 * static job. 93 * d. my_local_rank - relative rank on local node with other peers in this job (ie, MCW) 94 * e. num_local_peers - Number of local peers (peers in MCW on your node) 95 * f. my_hnp_uri - 96 * g. peer_modex - a collective id for the modex operation 97 * h. peer_init_barrier - a collective id for the barrier during MPI_Init 98 * i. peer_fini_barrier - a collective id for the barrier during MPI_Finalize 99 * j. job_session_dir - 100 * k. proc_session_dir - 101 * l. nodename - a string representation for the name of the node this 102 * process is on 103 * m. cpuset - 104 * 2. ompi_process_info - a global instance of the ompi_process_t structure. 105 * 3. ompi_rte_proc_is_bound - global boolean that will be true if the runtime bound 106 * the process to a particular core or set of cores and is false otherwise. 107 * 108 * (d) Error handling objects and operations 109 * 1. void ompi_rte_abort(int err_code, char *fmt, ...) - Abort the current 110 * process with the specified error code and message. 111 * 2. int ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs) - 112 * Abort the specified list of peers 113 * 3. OMPI_ERROR_LOG(rc) - print error message regarding the given return code 114 * 115 * (e) Init and finalize objects and operations 116 * 1. ompi_rte_init - a function to initialize the RTE. The function 117 * prototype must be of the form: 118 * int ompi_rte_init(int *argc, char ***argv); 119 * 2. ompi_rte_finalize - a function to finalize the RTE. The function 120 * prototype must be of the form: 121 * int ompi_rte_finalize(void); 122 * 3. void ompi_rte_wait_for_debugger(void) - Called during MPI_Init, this 123 * function is used to wait for debuggers to do their pre-MPI attach. 124 * If there is no attached debugger, this function will not block. 125 * 126 * (f) Database operations 127 * 1. ompi_rte_db_store - a function to store modex and other data in 128 * a local database. The function is primarily used for storing modex 129 * data, but can be used for general purposes. The prototype must be 130 * of the form: 131 * int ompi_rte_db_store(const ompi_process_name_t *proc, 132 * const char *key, const void *data, 133 * opal_data_type_t type); 134 * The implementation of this function must store a COPY of the data 135 * provided - the data is NOT guaranteed to be valid after return 136 * from the call. 137 * 3. ompi_rte_db_fetch - 138 * NOTE: Fetch accepts an 'ompi_proc_t'. 139 * int ompi_rte_db_fetch(const struct ompi_proc_t *proc, 140 * const char *key, 141 * void **data, 142 * opal_data_type_t type); 143 * 4. ompi_rte_db_fetch_pointer - 144 * NOTE: Fetch accepts an 'ompi_proc_t'. 145 * int ompi_rte_db_fetch_pointer(const struct ompi_proc_t *proc, 146 * const char *key, 147 * void **data, 148 * opal_data_type_t type); 149 * 5. Pre-defined db keys (with associated values after rte_init) 150 * a. OMPI_DB_HOSTNAME 151 * b. OMPI_DB_LOCALITY 152 * 153 * (g) Communication support 154 * 155 */ 156 157 #ifndef OMPI_MCA_RTE_H 158 #define OMPI_MCA_RTE_H 159 160 #include "ompi_config.h" 161 162 #include "opal/dss/dss_types.h" 163 #include "ompi/mca/mca.h" 164 #include "opal/mca/base/base.h" 165 166 BEGIN_C_DECLS 167 168 /** 169 * Structure for rte components. 170 */ 171 struct ompi_rte_base_component_1_0_0_t { 172 /** MCA base component */ 173 mca_base_component_t base_version; 174 /** MCA base data */ 175 mca_base_component_data_t base_data; 176 }; 177 178 /** 179 * Convenience typedef 180 */ 181 typedef struct ompi_rte_base_component_1_0_0_t ompi_rte_base_component_1_0_0_t; 182 typedef struct ompi_rte_base_component_1_0_0_t ompi_rte_component_t; 183 184 /** 185 * Macro for use in components that are of type rte 186 */ 187 #define OMPI_RTE_BASE_VERSION_1_0_0 \ 188 OMPI_MCA_BASE_VERSION_2_1_0("rte", 2, 0, 0) 189 190 END_C_DECLS 191 192 /* include implementation to call */ 193 #include MCA_rte_IMPLEMENTATION_HEADER 194 195 BEGIN_C_DECLS 196 197 /* 198 * MCA Framework 199 */ 200 OMPI_DECLSPEC extern mca_base_framework_t ompi_rte_base_framework; 201 202 /* In a few places, we need to barrier until something happens 203 * that changes a flag to indicate we can release - e.g., waiting 204 * for a specific RTE message to arrive. We don't want to block MPI 205 * progress while waiting, so we loop over opal_progress, letting 206 * the RTE progress thread move the RTE along 207 */ 208 #define OMPI_WAIT_FOR_COMPLETION(flg) \ 209 do { \ 210 opal_output_verbose(1, ompi_rte_base_framework.framework_output, \ 211 "%s waiting on RTE event at %s:%d", \ 212 OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \ 213 __FILE__, __LINE__); \ 214 while ((flg)) { \ 215 opal_progress(); \ 216 } \ 217 }while(0); 218 219 #define OMPI_LAZY_WAIT_FOR_COMPLETION(flg) \ 220 do { \ 221 opal_output_verbose(1, ompi_rte_base_framework.framework_output, \ 222 "%s lazy waiting on RTE event at %s:%d", \ 223 OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \ 224 __FILE__, __LINE__); \ 225 while ((flg)) { \ 226 opal_progress(); \ 227 usleep(100); \ 228 } \ 229 }while(0); 230 231 typedef struct { 232 opal_list_item_t super; 233 ompi_process_name_t name; 234 } ompi_namelist_t; 235 OBJ_CLASS_DECLARATION(ompi_namelist_t); 236 237 END_C_DECLS 238 239 #endif /* OMPI_RTE_H_ */ 240