1 /* begin_generated_IBM_copyright_prolog */ 2 /* */ 3 /* This is an automatically generated copyright prolog. */ 4 /* After initializing, DO NOT MODIFY OR MOVE */ 5 /* --------------------------------------------------------------- */ 6 /* Licensed Materials - Property of IBM */ 7 /* Blue Gene/Q 5765-PER 5765-PRP */ 8 /* */ 9 /* (C) Copyright IBM Corp. 2011, 2012 All Rights Reserved */ 10 /* US Government Users Restricted Rights - */ 11 /* Use, duplication, or disclosure restricted */ 12 /* by GSA ADP Schedule Contract with IBM Corp. */ 13 /* */ 14 /* --------------------------------------------------------------- */ 15 /* */ 16 /* end_generated_IBM_copyright_prolog */ 17 /* (C)Copyright IBM Corp. 2007, 2011 */ 18 /** 19 * \file include/mpidi_datatypes.h 20 * \brief ??? 21 */ 22 /* 23 * (C) 2001 by Argonne National Laboratory. 24 * See COPYRIGHT in top-level directory. 25 */ 26 27 28 #ifndef __include_mpidi_datatypes_h__ 29 #define __include_mpidi_datatypes_h__ 30 31 #ifdef MPIDI_STATISTICS 32 #include <pami_ext_pe.h> 33 #endif 34 35 #include "mpidi_constants.h" 36 #include "mpidi_platform.h" 37 #include "pami.h" 38 39 #if (MPIU_HANDLE_ALLOCATION_METHOD == MPIU_HANDLE_ALLOCATION_THREAD_LOCAL) && defined(__BGQ__) 40 struct MPID_Request; 41 typedef struct 42 { 43 struct MPID_Request * head; 44 size_t count; 45 } MPIDI_RequestHandle_t; 46 #endif 47 48 49 /** 50 * \brief MPI Process descriptor 51 * 52 * This structure contains global configuration flags. 53 */ 54 typedef struct 55 { 56 unsigned avail_contexts; 57 unsigned short_limit; 58 unsigned eager_limit; 59 unsigned eager_limit_local; 60 #if (MPIDI_STATISTICS || MPIDI_PRINTENV) 61 unsigned mp_infolevel; 62 unsigned mp_statistics; /* print pamid statistcs data */ 63 unsigned mp_printenv; ; /* print env data */ 64 #endif 65 #ifdef RDMA_FAILOVER 66 unsigned mp_s_use_pami_get; /* force the PAMI_Get path instead of PAMI_Rget */ 67 #endif 68 69 #if (MPIU_HANDLE_ALLOCATION_METHOD == MPIU_HANDLE_ALLOCATION_THREAD_LOCAL) && defined(__BGQ__) 70 MPIDI_RequestHandle_t request_handles[MPIDI_MAX_THREADS]; 71 #endif 72 73 unsigned verbose; /**< The current level of verbosity for end-of-job stats. */ 74 unsigned statistics; /**< The current level of stats collection. */ 75 unsigned rma_pending; /**< The max num outstanding requests during an RMA op */ 76 unsigned shmem_pt2pt; /**< Enable optimized shared memory point-to-point functions. */ 77 78 pami_geometry_t world_geometry; 79 80 struct 81 { 82 unsigned collectives; /**< Enable optimized collective functions. */ 83 unsigned subcomms; 84 unsigned select_colls; /**< Enable collective selection */ 85 } 86 optimized; 87 88 struct 89 { 90 volatile unsigned active; /**< Number of contexts with active async progress */ 91 unsigned mode; /**< 0 == 'disabled', 1 == 'locked', 2 == 'trigger' */ 92 } 93 async_progress; 94 95 struct 96 { 97 struct 98 { 99 unsigned requested; /**< 1 == application requests context post */ 100 unsigned active; /**< 1 == context post is currently required */ 101 } context_post; 102 } perobj; /**< This structure is only used in the 'perobj' mpich lock mode. */ 103 104 } MPIDI_Process_t; 105 106 107 enum 108 { 109 MPIDI_Protocols_Short, 110 MPIDI_Protocols_ShortSync, 111 MPIDI_Protocols_Eager, 112 MPIDI_Protocols_RVZ, 113 MPIDI_Protocols_Cancel, 114 MPIDI_Protocols_Control, 115 MPIDI_Protocols_WinCtrl, 116 MPIDI_Protocols_WinAccum, 117 MPIDI_Protocols_RVZ_zerobyte, 118 MPIDI_Protocols_COUNT, 119 }; 120 121 122 /** 123 * \brief This defines the type of message being sent/received 124 * mpid_startall() invokes the correct start based on the type of the request 125 */ 126 typedef enum 127 { 128 MPIDI_REQUEST_PTYPE_RECV, 129 MPIDI_REQUEST_PTYPE_SEND, 130 MPIDI_REQUEST_PTYPE_BSEND, 131 MPIDI_REQUEST_PTYPE_SSEND, 132 } MPIDI_REQUEST_PTYPE; 133 134 135 typedef enum 136 { 137 MPIDI_CONTROL_SSEND_ACKNOWLEDGE, 138 MPIDI_CONTROL_CANCEL_REQUEST, 139 MPIDI_CONTROL_CANCEL_ACKNOWLEDGE, 140 MPIDI_CONTROL_CANCEL_NOT_ACKNOWLEDGE, 141 MPIDI_CONTROL_RENDEZVOUS_ACKNOWLEDGE, 142 } MPIDI_CONTROL; 143 144 145 /** \brief Request completion actions */ 146 typedef enum 147 { 148 MPIDI_CA_COMPLETE, 149 MPIDI_CA_UNPACK_UEBUF_AND_COMPLETE, /**< Unpack uebuf, then complete. */ 150 } MPIDI_CA; 151 152 153 /** 154 * \brief MPIDI_Message_match contains enough information to match an 155 * MPI message. 156 */ 157 typedef struct 158 { 159 int tag; /**< match tag */ 160 int rank; /**< match rank */ 161 int context_id; /**< match context */ 162 #ifdef OUT_OF_ORDER_HANDLING 163 int seqno; /**< match seqno */ 164 #endif 165 } MPIDI_Message_match; 166 167 168 /** 169 * \brief MPID pt2pt message header 170 */ 171 typedef struct 172 { 173 MPI_Request req; /**< peer's request handle */ 174 unsigned MPItag; /**< match tag */ 175 unsigned MPIrank; /**< match rank */ 176 uint16_t MPIctxt; /**< match context */ 177 178 union { 179 uint16_t flags; 180 struct { 181 unsigned control:3; /**< message type for control protocols */ 182 unsigned isSync:1; /**< set for sync sends */ 183 unsigned isRzv :1; /**< use pt2pt rendezvous */ 184 } __attribute__ ((__packed__)); 185 }; 186 187 #ifdef OUT_OF_ORDER_HANDLING 188 unsigned MPIseqno; /**< match seqno */ 189 #endif 190 } MPIDI_MsgInfo; 191 192 /** \brief Full Rendezvous msg info to be set as two quads of unexpected data. */ 193 typedef struct 194 { 195 MPIDI_MsgInfo msginfo; 196 pami_memregion_t memregion; 197 #ifdef RDMA_FAILOVER 198 uint32_t memregion_used; 199 #endif 200 void * data; 201 size_t length; 202 } MPIDI_MsgEnvelope; 203 204 /** \brief This defines the portion of MPID_Request that is specific to the Device */ 205 struct MPIDI_Request 206 { 207 struct MPID_Request *next; /**< Link to next req. in queue */ 208 struct MPID_Datatype *datatype_ptr; /**< Info about the datatype */ 209 pami_work_t post_request; /**< */ 210 211 MPIDI_MsgEnvelope envelope; 212 213 void *userbuf; /**< User buffer */ 214 unsigned userbufcount; /**< Userbuf data count */ 215 MPI_Datatype datatype; /**< Data type of message */ 216 pami_task_t peer_pami; /**< The other guy's rank (in PAMI) */ 217 unsigned peer_comm; /**< The other guy's rank (in the orig communicator) */ 218 unsigned cancel_pending:16; /**< Cancel status */ 219 unsigned uebuf_malloc:16; /**< does uebuf require free() */ 220 221 unsigned uebuflen; /**< Length (bytes) of uebuf */ 222 void *uebuf; /**< Unexpected buffer */ 223 224 MPIDI_REQUEST_PTYPE ptype; /**< The persistent msg type */ 225 MPIDI_CA ca; /**< Completion action */ 226 pami_memregion_t memregion; /**< Rendezvous recv memregion */ 227 #ifdef OUT_OF_ORDER_HANDLING 228 struct MPID_Request *prev; /**< Link to prev req. in queue */ 229 void *nextR; /** < pointer to next recv for the out-of-order list, the out-of-order list is a list per source */ 230 void *prevR; /** < pointer to prev recv for the out-of-order list, the out-of-order list is a list per source */ 231 struct MPID_Request *oo_peer; /** < pointer to the matched post recv request to complete in the out-of-order case */ 232 #endif 233 #ifdef RDMA_FAILOVER 234 uint32_t memregion_used:16; 235 uint32_t shm:16; 236 #endif 237 #ifdef MPIDI_TRACE 238 int cur_nMsgs; 239 int partner_id; 240 int idx; 241 int PR_idx; 242 #endif 243 }; 244 245 246 /** \brief This defines the portion of MPID_Comm that is specific to the Device */ 247 struct MPIDI_Comm 248 { 249 pami_geometry_t geometry; /**< Geometry component for collectives */ 250 pami_geometry_t parent; /**< The parent geometry this communicator came from */ 251 pami_algorithm_t *coll_algorithm[PAMI_XFER_COUNT][2]; 252 pami_metadata_t *coll_metadata[PAMI_XFER_COUNT][2]; 253 char coll_count[PAMI_XFER_COUNT][2]; 254 pami_algorithm_t user_selected[PAMI_XFER_COUNT]; 255 /* no way to tell if user_selected[] is NULL */ 256 /* could probably union these two though? */ 257 char user_selected_type[PAMI_XFER_COUNT]; 258 pami_metadata_t user_metadata[PAMI_XFER_COUNT]; 259 char last_algorithm[100]; 260 char preallreduces[MPID_NUM_PREALLREDUCES]; 261 /* \todo Need to figure out how to deal with algorithms above the pami level */ 262 char allgathers[4]; 263 char allgathervs[4]; 264 char scattervs[2]; 265 char optgather, optscatter; 266 267 /* These need to be freed at geom destroy, so we need to store them 268 * inside the communicator struct until destroy time rather than 269 * allocating pointers on the stack 270 */ 271 /* For create_taskrange */ 272 pami_geometry_range_t *ranges; 273 /* For create_tasklist/endpoints if we ever use it */ 274 pami_task_t *tasks; 275 pami_endpoint_t *endpoints; 276 /* There are some protocols where the optimized protocol always works and 277 * is the best performance */ 278 /* Assume we have small vs large cutoffs vs medium for some protocols */ 279 pami_algorithm_t opt_protocol[PAMI_XFER_COUNT][2]; 280 int must_query[PAMI_XFER_COUNT][2]; 281 pami_metadata_t opt_protocol_md[PAMI_XFER_COUNT][2]; 282 int cutoff_size[PAMI_XFER_COUNT][2]; 283 /* Our best allreduce double protocol only works on 284 * doubles and sum/min/max. Since that is a common 285 * occurance let's cache that protocol and call 286 * it without checking */ 287 pami_algorithm_t cached_allred_dsmm; /*dsmm = double, sum/min/max */ 288 pami_metadata_t cached_allred_dsmm_md; 289 int query_allred_dsmm; 290 291 /* We have some integer optimized protocols that only work on 292 * sum/min/max but also have datasize/ppn <= 8k limitations */ 293 /* Using Amith's protocol, these work on int/min/max/sum of SMALL messages */ 294 pami_algorithm_t cached_allred_ismm; 295 pami_metadata_t cached_allred_ismm_md; 296 /* Because this only works at select message sizes, this will have to be 297 * nonzero */ 298 int query_allred_ismm; 299 300 union tasks_descrip_t { 301 /* For create_taskrange */ 302 pami_geometry_range_t *ranges; 303 /* For create_tasklist/endpoints if we ever use it */ 304 pami_task_t *tasks; 305 pami_endpoint_t *endpoints; 306 } tasks_descriptor; 307 }; 308 309 310 typedef struct 311 { 312 pami_work_t state; 313 pami_xfer_t *coll_struct; 314 } MPIDI_Post_coll_t; 315 316 317 /** \brief Forward declaration of the MPID_Comm structure */ 318 struct MPID_Comm; 319 /** \brief Forward declaration of the MPID_Win structure */ 320 struct MPID_Win; 321 /** \brief Forward declaration of the MPID_Group structure */ 322 struct MPID_Group; 323 324 325 struct MPIDI_Win_lock 326 { 327 struct MPIDI_Win_lock *next; 328 unsigned rank; 329 int type; 330 }; 331 struct MPIDI_Win_queue 332 { 333 struct MPIDI_Win_lock *head; 334 struct MPIDI_Win_lock *tail; 335 }; 336 /** 337 * \brief Collective information related to a window 338 * 339 * This structure is used to share information about a local window with 340 * all nodes in the window communicator. Part of that information includes 341 * statistics about RMA operations during access/exposure epochs. 342 * 343 * The structure is allocated as an array sized for the window communicator. 344 * Each entry in the array corresponds directly to the node of the same rank. 345 */ 346 struct MPIDI_Win_info 347 { 348 void * base_addr; /**< Node's exposure window base address */ 349 struct MPID_Win * win; 350 uint32_t disp_unit; /**< Node's exposure window displacement units */ 351 pami_memregion_t memregion; /**< Memory region descriptor for each node */ 352 #ifdef RDMA_FAILOVER 353 uint32_t memregion_used; 354 #endif 355 }; 356 /** 357 * \brief Structure of PAMI extensions to MPID_Win structure 358 */ 359 struct MPIDI_Win 360 { 361 struct MPIDI_Win_info * info; /**< allocated array of collective info */ 362 struct MPIDI_Win_sync 363 { 364 #if 0 365 /** \todo optimize some of the synchronization assertion */ 366 uint32_t assert; /**< MPI_MODE_* bits asserted at epoch start */ 367 #endif 368 369 /* These fields are reset by the sync functions */ 370 uint32_t total; /**< The number of PAMI requests that we know about (updated only by calling thread) */ 371 volatile uint32_t started; /**< The number of PAMI requests made (updated only in the context_post callback) */ 372 volatile uint32_t complete; /**< The number of completed PAMI requests (only updated by the done callbacks) */ 373 374 struct MPIDI_Win_sync_pscw 375 { 376 struct MPID_Group * group; 377 volatile unsigned count; 378 } sc, pw; 379 struct MPIDI_Win_sync_lock 380 { 381 struct 382 { 383 volatile unsigned locked; 384 } remote; 385 struct 386 { 387 struct MPIDI_Win_queue requested; 388 int type; 389 unsigned count; 390 } local; 391 } lock; 392 } sync; 393 }; 394 395 396 #endif 397