1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 */
5
6 #ifndef MPIR_COMM_H_INCLUDED
7 #define MPIR_COMM_H_INCLUDED
8
9 #if defined HAVE_LIBHCOLL
10 #include "../mpid/common/hcoll/hcollpre.h"
11 #endif
12
13 /*E
14 MPIR_Comm_kind_t - Name the two types of communicators
15 E*/
typedef enum MPIR_Comm_kind_t {
    /* intracommunicator: the local and remote groups are the same */
    MPIR_COMM_KIND__INTRACOMM = 0,
    /* intercommunicator: contains separate local and remote groups */
    MPIR_COMM_KIND__INTERCOMM = 1
} MPIR_Comm_kind_t;
20
/* Position of a communicator in the node-aware hierarchy (stored in the
 * 'hierarchy_kind' field of struct MPIR_Comm).
 *
 * Ideally we could add these to MPIR_Comm_kind_t, but there's too much existing
 * code that assumes that the only valid values are INTRACOMM or INTERCOMM. */
typedef enum MPIR_Comm_hierarchy_kind_t {
    MPIR_COMM_HIERARCHY_KIND__FLAT = 0, /* no hierarchy */
    MPIR_COMM_HIERARCHY_KIND__PARENT = 1,       /* has subcommunicators */
    MPIR_COMM_HIERARCHY_KIND__NODE_ROOTS = 2,   /* is the subcomm for node roots */
    MPIR_COMM_HIERARCHY_KIND__NODE = 3, /* is the subcomm for a node */
    /* cardinality of this enum; deliberately left implicit so that it stays
     * one past the last real kind */
    MPIR_COMM_HIERARCHY_KIND__SIZE
} MPIR_Comm_hierarchy_kind_t;
30
/* Kind of a mapper entry (the 'type' field of MPIR_Comm_map_t). */
typedef enum {
    /* no explicit rank array; the src_mapping fields of the entry are unused */
    MPIR_COMM_MAP_TYPE__DUP = 0,
    /* an explicit rank array (src_mapping / src_mapping_size) is supplied */
    MPIR_COMM_MAP_TYPE__IRREGULAR = 1
} MPIR_Comm_map_type_t;
35
/* Direction of a mapping.  Each name reads <from>2<to>, where L stands for
 * the local group and R for the remote group. */
typedef enum {
    MPIR_COMM_MAP_DIR__L2L = 0, /* local to local */
    MPIR_COMM_MAP_DIR__L2R = 1, /* local to remote */
    MPIR_COMM_MAP_DIR__R2L = 2, /* remote to local */
    MPIR_COMM_MAP_DIR__R2R = 3  /* remote to remote */
} MPIR_Comm_map_dir_t;
44
/* One entry of a communicator's mapper list.  Entries are chained through
 * 'next'; struct MPIR_Comm keeps the list in mapper_head / mapper_tail. */
typedef struct MPIR_Comm_map {
    MPIR_Comm_map_type_t type;  /* DUP or IRREGULAR */

    struct MPIR_Comm *src_comm; /* source communicator of this mapping */

    /* mapping direction for intercomms, which contain local and
     * remote groups */
    MPIR_Comm_map_dir_t dir;

    /* only valid for irregular map type */
    int src_mapping_size;       /* presumably the number of entries in src_mapping -- confirm */
    int *src_mapping;
    int free_mapping;           /* we allocated the mapping */

    struct MPIR_Comm_map *next; /* next entry in the list */
} MPIR_Comm_map_t;
61
/* Mapper-list helpers.
 * NOTE(review): the implementations are not part of this header; the
 * summaries below are inferred from the names, the signatures and the
 * MPIR_Comm_map_t fields -- confirm against the definitions. */

/* Presumably adds an IRREGULAR entry for src_comm to newcomm's mapper list,
 * using the given rank array and direction; 'map' looks like an optional
 * output for the created entry -- TODO confirm. */
int MPIR_Comm_map_irregular(struct MPIR_Comm *newcomm, struct MPIR_Comm *src_comm,
                            int *src_mapping, int src_mapping_size,
                            MPIR_Comm_map_dir_t dir, MPIR_Comm_map_t ** map);
/* Presumably adds a DUP entry (no rank array) for src_comm to newcomm's
 * mapper list -- TODO confirm. */
int MPIR_Comm_map_dup(struct MPIR_Comm *newcomm, struct MPIR_Comm *src_comm,
                      MPIR_Comm_map_dir_t dir);
/* Presumably releases comm's mapper list -- TODO confirm. */
int MPIR_Comm_map_free(struct MPIR_Comm *comm);
68
/* Communicator info hint value types (the 'type' argument of
 * MPIR_Comm_register_hint) */
#define MPIR_COMM_HINT_TYPE_BOOL 0
#define MPIR_COMM_HINT_TYPE_INT  1

/* Communicator hint attr (bitmask)
 * If the local bit is set, the hint is local.  The default (0) requires the
 * hint value to be the same across the communicator.
 */
#define MPIR_COMM_HINT_ATTR_LOCAL 0x1

/* Number of slots in the per-communicator hints[] array of struct MPIR_Comm */
#define MPIR_COMM_HINT_MAX 100
80
/* Identifiers of the predefined communicator hints.
 * NOTE(review): presumably these are indices into MPIR_Comm.hints[] and the
 * 'index' values passed to MPIR_Comm_register_hint -- confirm.
 * The enumerators are numbered implicitly, so inserting a new hint before
 * MPIR_COMM_HINT_PREDEFINED_COUNT renumbers everything after it. */
enum MPIR_COMM_HINT_PREDEFINED_t {
    MPIR_COMM_HINT_INVALID = 0,
    MPIR_COMM_HINT_NO_ANY_TAG,
    MPIR_COMM_HINT_NO_ANY_SOURCE,
    MPIR_COMM_HINT_EXACT_LENGTH,
    MPIR_COMM_HINT_ALLOW_OVERTAKING,
    /* device specific hints.
     * Potentially, we can use macros and configure to hide them */
    MPIR_COMM_HINT_EAGER_THRESH,        /* ch3 */
    MPIR_COMM_HINT_EAGAIN,      /* ch4:ofi */
    /* dynamic hints start here */
    MPIR_COMM_HINT_PREDEFINED_COUNT
};
94
95 /*S
96 MPIR_Comm - Description of the Communicator data structure
97
98 Notes:
99 Note that the size and rank duplicate data in the groups that
100 make up this communicator. These are used often enough that this
101 optimization is valuable.
102
103 This definition provides only a 16-bit integer for context id''s .
104 This should be sufficient for most applications. However, extending
105 this to a 32-bit (or longer) integer should be easy.
106
107 There are two context ids. One is used for sending and one for
108 receiving. In the case of an Intracommunicator, they are the same
109 context id. They differ in the case of intercommunicators, where
110 they may come from processes in different comm worlds (in the
111 case of MPI-2 dynamic process intercomms).
112
113 The virtual connection table is an explicit member of this structure.
114 This contains the information used to contact a particular process,
115 indexed by the rank relative to this communicator.
116
117 Groups are allocated lazily. That is, the group pointers may be
118 null, created only when needed by a routine such as 'MPI_Comm_group'.
119 The local process ids needed to form the group are available within
120 the virtual connection table.
121 For intercommunicators, we may want to always have the groups. If not,
122 we either need the 'local_group' or we need a virtual connection table
123 corresponding to the 'local_group' (we may want this anyway to simplify
124 the implementation of the intercommunicator collective routines).
125
126 The pointer to the structure 'MPIR_Collops' containing pointers to the
collective routines allows an implementation to replace each routine on a
129 routine-by-routine basis. By default, this pointer is null, as are the
130 pointers within the structure. If either pointer is null, the implementation
131 uses the generic provided implementation. This choice, rather than
132 initializing the table with pointers to all of the collective routines,
133 is made to reduce the space used in the communicators and to eliminate the
134 need to include the implementation of all collective routines in all MPI
135 executables, even if the routines are not used.
136
137 Please note that the local_size and remote_size fields can be confusing. For
138 intracommunicators both fields are always equal to the size of the
139 communicator. For intercommunicators local_size is equal to the size of
140 local_group while remote_size is equal to the size of remote_group.
141
142 Module:
143 Communicator-DS
144
145 Question:
146 For fault tolerance, do we want to have a standard field for communicator
147 health? For example, ok, failure detected, all (live) members of failed
148 communicator have acked.
149 S*/
struct MPIR_Comm {
    MPIR_OBJECT_HEADER;         /* adds handle and ref_count fields */
    MPID_Thread_mutex_t mutex;
    MPIR_Context_id_t context_id;       /* Send context id.  See notes */
    MPIR_Context_id_t recvcontext_id;   /* Receive context id.  See notes */
    int remote_size;            /* Value of MPI_Comm_(remote)_size */
    int rank;                   /* Value of MPI_Comm_rank */
    MPIR_Attribute *attributes; /* List of attributes */
    int local_size;             /* Value of MPI_Comm_size for local group */
    MPIR_Group *local_group,    /* Groups in communicator. */
    *remote_group;              /* The local and remote groups are the
                                 * same for intra communicators */
    MPIR_Comm_kind_t comm_kind; /* MPIR_COMM_KIND__INTRACOMM or MPIR_COMM_KIND__INTERCOMM */
    char name[MPI_MAX_OBJECT_NAME];     /* Required for MPI-2 */
    MPIR_Errhandler *errhandler;        /* Pointer to the error handler structure */
    struct MPIR_Comm *local_comm;       /* Defined only for intercomms, holds
                                         * an intracomm for the local group */

    MPIR_Comm_hierarchy_kind_t hierarchy_kind;  /* flat, parent, node, or node_roots */
    struct MPIR_Comm *node_comm;        /* Comm of processes in this comm that are on
                                         * the same node as this process. */
    struct MPIR_Comm *node_roots_comm;  /* Comm of root processes for other nodes. */
    int *intranode_table;       /* intranode_table[i] gives the rank in
                                 * node_comm of rank i in this comm or -1 if i
                                 * is not in this process' node_comm.
                                 * It is of size 'local_size'. */
    int *internode_table;       /* internode_table[i] gives the rank in
                                 * node_roots_comm of rank i in this comm.
                                 * It is of size 'local_size'. */
    int node_count;             /* number of nodes this comm is spread over */

    int is_low_group;           /* For intercomms only, this boolean is
                                 * set for all members of one of the
                                 * two groups of processes and clear for
                                 * the other.  It enables certain
                                 * intercommunicator collective operations
                                 * that wish to use half-duplex operations
                                 * to implement a full-duplex operation */

    struct MPIR_Comm *comm_next;        /* Provides a chain through all active
                                         * communicators */
    struct MPII_Topo_ops *topo_fns;     /* Pointer to a table of functions
                                         * implementing the topology routines */
    int next_sched_tag;         /* used by the NBC schedule code to allocate tags */

    int revoked;                /* Flag to track whether the communicator
                                 * has been revoked */
    /* A sequence number used for e.g. vci hashing. We can't directly use context_id
     * because context_id is non-sequential and can't be used to identify user-level
     * communicators (due to sub-comms). */
    int seq;
    /* Certain comms and their descendants should be restricted to sequence 0 due to
     * various restrictions. E.g. multiple-vci doesn't support dynamic process,
     * nor intercomms (even after its merge).
     */
    int tainted;


    int hints[MPIR_COMM_HINT_MAX];      /* Hints to the communicator
                                         * use int array for fast access */

    struct {
        int pof2;               /* Nearest (smaller than or equal to) power of 2
                                 * to the number of ranks in the communicator.
                                 * To be used during collective communication */
    } coll;

    void *csel_comm;            /* collective selector handle */
#if defined HAVE_LIBHCOLL
    hcoll_comm_priv_t hcoll_priv;
#endif                          /* HAVE_LIBHCOLL */

    /* the mapper is temporarily filled out in order to allow the
     * device to setup its network addresses.  it will be freed after
     * the device has initialized the comm. */
    MPIR_Comm_map_t *mapper_head;
    MPIR_Comm_map_t *mapper_tail;

    /* Other, device-specific information */
#ifdef MPID_DEV_COMM_DECL
     MPID_DEV_COMM_DECL
#endif
};
/* Object-allocation state for MPIR_Comm objects; defined elsewhere.
 * NOTE(review): exact role inferred from the type name -- confirm. */
extern MPIR_Object_alloc_t MPIR_Comm_mem;

/* this function should not be called by normal code!
 * MPIR_Comm_release invokes it once the last reference to comm_ptr has been
 * dropped and returns its result to the caller. */
int MPIR_Comm_delete_internal(MPIR_Comm * comm_ptr);
237
/* Reference-count helpers for communicators: thin statement-like wrappers
 * around the generic MPIR_Object_add_ref / MPIR_Object_release_ref.
 * For the release form, 'inuse_' is a pointer to an int; MPIR_Comm_release
 * treats a zero result as "no references left". */
#define MPIR_Comm_add_ref(comm_p_) \
    do { MPIR_Object_add_ref((comm_p_)); } while (0)
#define MPIR_Comm_release_ref(comm_p_, inuse_) \
    do { MPIR_Object_release_ref(comm_p_, inuse_); } while (0)
242
243
244 /* Release a reference to a communicator. If there are no pending
245 references, delete the communicator and recover all storage and
246 context ids.
247
248 This routine has been inlined because keeping it as a separate routine
249 results in a >5% performance hit for the SQMR benchmark.
250 */
static inline int MPIR_Comm_release(MPIR_Comm * comm_ptr)
{
    int still_referenced;

    /* Give up our reference and learn whether anyone else still holds one. */
    MPIR_Comm_release_ref(comm_ptr, &still_referenced);

    if (unlikely(!still_referenced)) {
        /* That was the last reference.  The delete routine is meant to be
         * called only from here and from the "_always" variant.  Its result
         * is handed back untouched (no ERR_POP) so this function stays easy
         * to inline; the caller still sees the error from the comm_delete
         * level. */
        return MPIR_Comm_delete_internal(comm_ptr);
    }

    return MPI_SUCCESS;
}
267
268
/* MPIR_Comm_release_always is the same as MPIR_Comm_release except it uses
   MPIR_Comm_release_ref_always instead.
*/
int MPIR_Comm_release_always(MPIR_Comm * comm_ptr);

/* Communicator construction.
 * NOTE(review): only the declarations are visible here; behavior beyond what
 * the existing comments state should be checked against the definitions. */
int MPIR_Comm_create(MPIR_Comm **);
int MPIR_Comm_create_group(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, int tag,
                           MPIR_Comm ** newcomm);

/* implements the logic for MPI_Comm_create for intracommunicators only */
int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Comm ** newcomm_ptr);


/* presumably builds the node_comm / node_roots_comm subcommunicators of
 * struct MPIR_Comm -- confirm */
int MPIR_Comm_create_subcomms(MPIR_Comm * comm);
int MPIR_Comm_commit(MPIR_Comm *);

/* presumably tests hierarchy_kind against MPIR_COMM_HIERARCHY_KIND__PARENT -- confirm */
int MPIR_Comm_is_parent_comm(MPIR_Comm *);

int MPIR_Comm_idup_impl(MPIR_Comm * comm_ptr, MPIR_Comm ** newcomm, MPIR_Request ** reqp);

int MPIR_Comm_shrink(MPIR_Comm * comm_ptr, MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_agree(MPIR_Comm * comm_ptr, int *flag);

/* only declared when ROMIO is built in; note that this one takes MPI_Comm
 * handles rather than MPIR_Comm pointers */
#if defined(HAVE_ROMIO)
int MPIR_Comm_split_filesystem(MPI_Comm comm, int key, const char *dirname, MPI_Comm * newcomm);
#endif
295
/* Field accessors.  MPIR_Comm_size reads 'local_size', i.e. the size of the
 * local group; for an intercommunicator the remote group's size is in
 * 'remote_size' instead. */
#define MPIR_Comm_rank(comm_ptr) ((comm_ptr)->rank)
#define MPIR_Comm_size(comm_ptr) ((comm_ptr)->local_size)
298
/* Comm hint registration.
 *
 * The hint function is optional.  If it is NULL, the MPIR layer sets the
 * corresponding entry of the hints array directly.  If it is supplied, the MPIR
 * layer will *NOT* set the hints array; the hint function is responsible for
 * setting it, as well as for validating the value and applying any side effects.
 *
 * The currently supported types are boolean and int, and the value is parsed
 * accordingly.
 *
 * If attr is 0, the hint value is required to be consistent across the
 * communicator.  If the LOCAL bit is set, the hint value is treated as local.
 * Additional attributes may be added in the future.
 */
void MPIR_Comm_hint_init(void);
typedef int (*MPIR_Comm_hint_fn_t) (MPIR_Comm *, int, int);     /* comm, key, val */
int MPIR_Comm_register_hint(int index, const char *hint_key, MPIR_Comm_hint_fn_t fn,
                            int type, int attr);
316
/* Internal ("_impl") entry points for communicator operations.  Most take
 * MPIR_* object pointers rather than MPI handles.
 * NOTE(review): the definitions are not visible from this header.  The
 * int-returning functions presumably return an MPI error code, as
 * MPIR_Comm_release does -- confirm. */

/* attributes and keyvals */
int MPIR_Comm_delete_attr_impl(MPIR_Comm * comm_ptr, MPII_Keyval * keyval_ptr);
int MPIR_Comm_create_keyval_impl(MPI_Comm_copy_attr_function * comm_copy_attr_fn,
                                 MPI_Comm_delete_attr_function * comm_delete_attr_fn,
                                 int *comm_keyval, void *extra_state);
/* dynamic processes */
int MPIR_Comm_accept_impl(const char *port_name, MPIR_Info * info_ptr, int root,
                          MPIR_Comm * comm_ptr, MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_connect_impl(const char *port_name, MPIR_Info * info_ptr, int root,
                           MPIR_Comm * comm_ptr, MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_create_errhandler_impl(MPI_Comm_errhandler_function * function,
                                     MPI_Errhandler * errhandler);
/* duplication, info and lifetime */
int MPIR_Comm_dup_impl(MPIR_Comm * comm_ptr, MPIR_Info * info, MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_dup_with_info_impl(MPIR_Comm * comm_ptr, MPIR_Info * info_ptr,
                                 MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_get_info_impl(MPIR_Comm * comm_ptr, MPIR_Info ** info_ptr);
int MPIR_Comm_set_info_impl(MPIR_Comm * comm_ptr, MPIR_Info * info_ptr);
int MPIR_Comm_free_impl(MPIR_Comm * comm_ptr);
/* the following return void, so they have no way to report an error code */
void MPIR_Comm_free_keyval_impl(int keyval);
void MPIR_Comm_get_errhandler_impl(MPIR_Comm * comm_ptr, MPIR_Errhandler ** errhandler_ptr);
void MPIR_Comm_set_errhandler_impl(MPIR_Comm * comm_ptr, MPIR_Errhandler * errhandler_ptr);
void MPIR_Comm_get_name_impl(MPIR_Comm * comm, char *comm_name, int *resultlen);
/* intercommunicators */
int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_intracomm_ptr);
int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader,
                               MPIR_Comm * peer_comm_ptr, int remote_leader, int tag,
                               MPIR_Comm ** new_intercomm_ptr);
/* groups */
int MPIR_Comm_group_impl(MPIR_Comm * comm_ptr, MPIR_Group ** group_ptr);
int MPIR_Comm_remote_group_impl(MPIR_Comm * comm_ptr, MPIR_Group ** group_ptr);
int MPIR_Comm_group_failed_impl(MPIR_Comm * comm, MPIR_Group ** failed_group_ptr);
int MPIR_Comm_remote_group_failed_impl(MPIR_Comm * comm, MPIR_Group ** failed_group_ptr);
/* splitting */
int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_split_type_self(MPIR_Comm * comm_ptr, int split_type, int key,
                              MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_split_type_by_node(MPIR_Comm * comm_ptr, int split_type, int key,
                                 MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_split_type_node_topo(MPIR_Comm * comm_ptr, int split_type, int key,
                                   MPIR_Info * info_ptr, MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_split_type(MPIR_Comm * comm_ptr, int split_type, int key, MPIR_Info * info_ptr,
                         MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_split_type_impl(MPIR_Comm * comm_ptr, int split_type, int key, MPIR_Info * info_ptr,
                              MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_set_attr_impl(MPIR_Comm * comm_ptr, int comm_keyval, void *attribute_val,
                            MPIR_Attr_type attrType);

int MPIR_Comm_split_type_neighborhood(MPIR_Comm * comm_ptr, int split_type, int key,
                                      MPIR_Info * info_ptr, MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_split_type_nbhd_common_dir(MPIR_Comm * user_comm_ptr, int key, const char *hintval,
                                         MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_split_type_network_topo(MPIR_Comm * user_comm_ptr, int key, const char *hintval,
                                      MPIR_Comm ** newcomm_ptr);
int MPIR_Comm_compare_impl(MPIR_Comm * comm_ptr1, MPIR_Comm * comm_ptr2, int *result);
366
/* Preallocated comm objects.  There are 3: comm_world, comm_self, and
   a private (non-user accessible) dup of comm world that is provided
   if needed in MPI_Finalize.  Having a separate version of comm_world
   avoids possible interference with user code */
#define MPIR_COMM_N_BUILTIN 3
extern MPIR_Comm MPIR_Comm_builtin[MPIR_COMM_N_BUILTIN];
/* NOTE(review): size and purpose of MPIR_Comm_direct are not visible from
   this header -- see its definition */
extern MPIR_Comm MPIR_Comm_direct[];
/* This is the handle for the internal MPI_COMM_WORLD .  The "2" at the end
   of the handle is 3-1, i.e. MPIR_COMM_N_BUILTIN - 1 (the index in the
   builtin array) */
#define MPIR_ICOMM_WORLD  ((MPI_Comm)0x44000002)
377
/* Table of overridable communicator-creation functions. */
typedef struct MPIR_Commops {
    /* Same parameter types as MPIR_Comm_split_type:
     * (comm_ptr, split_type, key, info_ptr, newcomm_ptr) -- presumably in
     * that order; confirm against the callers. */
    int (*split_type) (MPIR_Comm *, int, int, MPIR_Info *, MPIR_Comm **);
} MPIR_Commops;
extern struct MPIR_Commops *MPIR_Comm_fns;      /* Communicator creation functions */
382
383
/* internal functions
 * NOTE(review): only the declarations are visible here; anything beyond the
 * existing comments should be checked against the definitions. */

int MPII_Comm_init(MPIR_Comm *);

int MPII_Comm_is_node_consecutive(MPIR_Comm *);

int MPII_Comm_copy(MPIR_Comm * comm_ptr, int size, MPIR_Info * info, MPIR_Comm ** outcomm_ptr);
int MPII_Comm_copy_data(MPIR_Comm * comm_ptr, MPIR_Comm ** outcomm_ptr);

/* presumably populates the 'local_comm' field, which struct MPIR_Comm
 * documents as defined only for intercomms -- confirm */
int MPII_Setup_intercomm_localcomm(MPIR_Comm *);

/* comm_create helper functions, used by both comm_create and comm_create_group */
int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr,
                                       MPIR_Comm * comm_ptr,
                                       int **mapping_out, MPIR_Comm ** mapping_comm);

int MPII_Comm_create_map(int local_n,
                         int remote_n,
                         int *local_mapping,
                         int *remote_mapping, MPIR_Comm * mapping_comm, MPIR_Comm * newcomm);

/* hint helpers; presumably they operate on the hints[] array of struct
 * MPIR_Comm -- confirm */
int MPII_Comm_set_hints(MPIR_Comm * comm_ptr, MPIR_Info * info);
int MPII_Comm_get_hints(MPIR_Comm * comm_ptr, MPIR_Info * info);
int MPII_Comm_check_hints(MPIR_Comm * comm_ptr);
408
409 #endif /* MPIR_COMM_H_INCLUDED */
410