1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #ifndef MPIR_COMM_H_INCLUDED
7 #define MPIR_COMM_H_INCLUDED
8 
9 #if defined HAVE_LIBHCOLL
10 #include "../mpid/common/hcoll/hcollpre.h"
11 #endif
12 
/*E
  MPIR_Comm_kind_t - Distinguishes the two kinds of communicators
  E*/
typedef enum MPIR_Comm_kind_t {
    MPIR_COMM_KIND__INTRACOMM = 0,      /* intracommunicator */
    MPIR_COMM_KIND__INTERCOMM = 1       /* intercommunicator */
} MPIR_Comm_kind_t;
20 
/* Hierarchy role of a communicator.  Ideally these values would be part of
 * MPIR_Comm_kind_t, but too much existing code assumes that INTRACOMM and
 * INTERCOMM are the only valid values of that enum. */
typedef enum MPIR_Comm_hierarchy_kind_t {
    /* no hierarchy */
    MPIR_COMM_HIERARCHY_KIND__FLAT = 0,
    /* has subcommunicators */
    MPIR_COMM_HIERARCHY_KIND__PARENT = 1,
    /* is the subcomm for node roots */
    MPIR_COMM_HIERARCHY_KIND__NODE_ROOTS = 2,
    /* is the subcomm for a node */
    MPIR_COMM_HIERARCHY_KIND__NODE = 3,
    /* cardinality of this enum; must stay last */
    MPIR_COMM_HIERARCHY_KIND__SIZE
} MPIR_Comm_hierarchy_kind_t;
30 
/* Kind of mapping recorded in an MPIR_Comm_map_t entry */
typedef enum {
    MPIR_COMM_MAP_TYPE__DUP = 0,
    /* explicit mapping array; the src_mapping fields of MPIR_Comm_map_t are
     * only valid for this type */
    MPIR_COMM_MAP_TYPE__IRREGULAR = 1
} MPIR_Comm_map_type_t;
35 
/* Direction of a mapping between the local (L) and remote (R) sides */
typedef enum {
    MPIR_COMM_MAP_DIR__L2L = 0, /* local to local */
    MPIR_COMM_MAP_DIR__L2R = 1, /* local to remote */
    MPIR_COMM_MAP_DIR__R2L = 2, /* remote to local */
    MPIR_COMM_MAP_DIR__R2R = 3  /* remote to remote */
} MPIR_Comm_map_dir_t;
44 
45 typedef struct MPIR_Comm_map {
46     MPIR_Comm_map_type_t type;
47 
48     struct MPIR_Comm *src_comm;
49 
50     /* mapping direction for intercomms, which contain local and
51      * remote groups */
52     MPIR_Comm_map_dir_t dir;
53 
54     /* only valid for irregular map type */
55     int src_mapping_size;
56     int *src_mapping;
57     int free_mapping;           /* we allocated the mapping */
58 
59     struct MPIR_Comm_map *next;
60 } MPIR_Comm_map_t;
61 
62 int MPIR_Comm_map_irregular(struct MPIR_Comm *newcomm, struct MPIR_Comm *src_comm,
63                             int *src_mapping, int src_mapping_size,
64                             MPIR_Comm_map_dir_t dir, MPIR_Comm_map_t ** map);
65 int MPIR_Comm_map_dup(struct MPIR_Comm *newcomm, struct MPIR_Comm *src_comm,
66                       MPIR_Comm_map_dir_t dir);
67 int MPIR_Comm_map_free(struct MPIR_Comm *comm);
68 
/* Value types of a communicator info hint */
#define MPIR_COMM_HINT_TYPE_BOOL 0
#define MPIR_COMM_HINT_TYPE_INT  1

/* Communicator hint attribute (bitmask).
 * A hint whose LOCAL bit is set is local.  Without it (the default, 0) the
 * hint value is required to be the same across the communicator.
 */
#define MPIR_COMM_HINT_ATTR_LOCAL 0x1

/* Number of slots in the hints array of MPIR_Comm */
#define MPIR_COMM_HINT_MAX 100
80 
/* Predefined communicator hints.
 * NOTE(review): presumably these are indices into the hints array of
 * MPIR_Comm -- confirm against MPIR_Comm_register_hint. */
enum MPIR_COMM_HINT_PREDEFINED_t {
    MPIR_COMM_HINT_INVALID = 0,

    MPIR_COMM_HINT_NO_ANY_TAG,
    MPIR_COMM_HINT_NO_ANY_SOURCE,
    MPIR_COMM_HINT_EXACT_LENGTH,
    MPIR_COMM_HINT_ALLOW_OVERTAKING,

    /* Device specific hints.  Potentially, macros and configure could be
     * used to hide them. */
    MPIR_COMM_HINT_EAGER_THRESH,        /* ch3 */
    MPIR_COMM_HINT_EAGAIN,              /* ch4:ofi */

    /* Dynamic hints start here. */
    MPIR_COMM_HINT_PREDEFINED_COUNT
};
94 
95 /*S
96   MPIR_Comm - Description of the Communicator data structure
97 
98   Notes:
99   Note that the size and rank duplicate data in the groups that
100   make up this communicator.  These are used often enough that this
101   optimization is valuable.
102 
  This definition provides only a 16-bit integer for context ids.
104   This should be sufficient for most applications.  However, extending
105   this to a 32-bit (or longer) integer should be easy.
106 
107   There are two context ids.  One is used for sending and one for
108   receiving.  In the case of an Intracommunicator, they are the same
109   context id.  They differ in the case of intercommunicators, where
110   they may come from processes in different comm worlds (in the
111   case of MPI-2 dynamic process intercomms).
112 
113   The virtual connection table is an explicit member of this structure.
114   This contains the information used to contact a particular process,
115   indexed by the rank relative to this communicator.
116 
117   Groups are allocated lazily.  That is, the group pointers may be
118   null, created only when needed by a routine such as 'MPI_Comm_group'.
119   The local process ids needed to form the group are available within
120   the virtual connection table.
121   For intercommunicators, we may want to always have the groups.  If not,
122   we either need the 'local_group' or we need a virtual connection table
123   corresponding to the 'local_group' (we may want this anyway to simplify
124   the implementation of the intercommunicator collective routines).
125 
  The pointer to the structure 'MPIR_Collops' containing pointers to the
  collective routines allows an implementation to replace each routine on a
  routine-by-routine basis.  By default, this pointer is null, as are the
130   pointers within the structure.  If either pointer is null, the implementation
131   uses the generic provided implementation.  This choice, rather than
132   initializing the table with pointers to all of the collective routines,
133   is made to reduce the space used in the communicators and to eliminate the
134   need to include the implementation of all collective routines in all MPI
135   executables, even if the routines are not used.
136 
137   Please note that the local_size and remote_size fields can be confusing.  For
138   intracommunicators both fields are always equal to the size of the
139   communicator.  For intercommunicators local_size is equal to the size of
140   local_group while remote_size is equal to the size of remote_group.
141 
142   Module:
143   Communicator-DS
144 
145   Question:
146   For fault tolerance, do we want to have a standard field for communicator
147   health?  For example, ok, failure detected, all (live) members of failed
148   communicator have acked.
149   S*/
150 struct MPIR_Comm {
151     MPIR_OBJECT_HEADER;         /* adds handle and ref_count fields */
152     MPID_Thread_mutex_t mutex;
153     MPIR_Context_id_t context_id;       /* Send context id.  See notes */
154     MPIR_Context_id_t recvcontext_id;   /* Send context id.  See notes */
155     int remote_size;            /* Value of MPI_Comm_(remote)_size */
156     int rank;                   /* Value of MPI_Comm_rank */
157     MPIR_Attribute *attributes; /* List of attributes */
158     int local_size;             /* Value of MPI_Comm_size for local group */
159     MPIR_Group *local_group,    /* Groups in communicator. */
160     *remote_group;              /* The local and remote groups are the
161                                  * same for intra communicators */
162     MPIR_Comm_kind_t comm_kind; /* MPIR_COMM_KIND__INTRACOMM or MPIR_COMM_KIND__INTERCOMM */
163     char name[MPI_MAX_OBJECT_NAME];     /* Required for MPI-2 */
164     MPIR_Errhandler *errhandler;        /* Pointer to the error handler structure */
165     struct MPIR_Comm *local_comm;       /* Defined only for intercomms, holds
166                                          * an intracomm for the local group */
167 
168     MPIR_Comm_hierarchy_kind_t hierarchy_kind;  /* flat, parent, node, or node_roots */
169     struct MPIR_Comm *node_comm;        /* Comm of processes in this comm that are on
170                                          * the same node as this process. */
171     struct MPIR_Comm *node_roots_comm;  /* Comm of root processes for other nodes. */
172     int *intranode_table;       /* intranode_table[i] gives the rank in
173                                  * node_comm of rank i in this comm or -1 if i
174                                  * is not in this process' node_comm.
175                                  * It is of size 'local_size'. */
176     int *internode_table;       /* internode_table[i] gives the rank in
177                                  * node_roots_comm of rank i in this comm.
178                                  * It is of size 'local_size'. */
179     int node_count;             /* number of nodes this comm is spread over */
180 
181     int is_low_group;           /* For intercomms only, this boolean is
182                                  * set for all members of one of the
183                                  * two groups of processes and clear for
184                                  * the other.  It enables certain
185                                  * intercommunicator collective operations
186                                  * that wish to use half-duplex operations
187                                  * to implement a full-duplex operation */
188 
189     struct MPIR_Comm *comm_next;        /* Provides a chain through all active
190                                          * communicators */
191     struct MPII_Topo_ops *topo_fns;     /* Pointer to a table of functions
192                                          * implementting the topology routines */
193     int next_sched_tag;         /* used by the NBC schedule code to allocate tags */
194 
195     int revoked;                /* Flag to track whether the communicator
196                                  * has been revoked */
197     /* A sequence number used for e.g. vci hashing. We can't directly use context_id
198      * because context_id is non-sequential and can't be used to identify user-level
199      * communicators (due to sub-comms). */
200     int seq;
201     /* Certain comm and its offsprings should be restricted to sequence 0 due to
202      * various restrictions. E.g. multiple-vci doesn't support dynamic process,
203      * nor intercomms (even after its merge).
204      */
205     int tainted;
206 
207 
208     int hints[MPIR_COMM_HINT_MAX];      /* Hints to the communicator
209                                          * use int array for fast access */
210 
211     struct {
212         int pof2;               /* Nearest (smaller than or equal to) power of 2
213                                  * to the number of ranks in the communicator.
214                                  * To be used during collective communication */
215     } coll;
216 
217     void *csel_comm;            /* collective selector handle */
218 #if defined HAVE_LIBHCOLL
219     hcoll_comm_priv_t hcoll_priv;
220 #endif                          /* HAVE_LIBHCOLL */
221 
222     /* the mapper is temporarily filled out in order to allow the
223      * device to setup its network addresses.  it will be freed after
224      * the device has initialized the comm. */
225     MPIR_Comm_map_t *mapper_head;
226     MPIR_Comm_map_t *mapper_tail;
227 
228     /* Other, device-specific information */
229 #ifdef MPID_DEV_COMM_DECL
230      MPID_DEV_COMM_DECL
231 #endif
232 };
233 extern MPIR_Object_alloc_t MPIR_Comm_mem;
234 
235 /* this function should not be called by normal code! */
236 int MPIR_Comm_delete_internal(MPIR_Comm * comm_ptr);
237 
238 #define MPIR_Comm_add_ref(comm_p_) \
239     do { MPIR_Object_add_ref((comm_p_)); } while (0)
240 #define MPIR_Comm_release_ref(comm_p_, inuse_) \
241     do { MPIR_Object_release_ref(comm_p_, inuse_); } while (0)
242 
243 
244 /* Release a reference to a communicator.  If there are no pending
245    references, delete the communicator and recover all storage and
246    context ids.
247 
248    This routine has been inlined because keeping it as a separate routine
249    results in a >5% performance hit for the SQMR benchmark.
250 */
MPIR_Comm_release(MPIR_Comm * comm_ptr)251 static inline int MPIR_Comm_release(MPIR_Comm * comm_ptr)
252 {
253     int mpi_errno = MPI_SUCCESS;
254     int in_use;
255 
256     MPIR_Comm_release_ref(comm_ptr, &in_use);
257     if (unlikely(!in_use)) {
258         /* the following routine should only be called by this function and its
259          * "_always" variant. */
260         mpi_errno = MPIR_Comm_delete_internal(comm_ptr);
261         /* not ERR_POPing here to permit simpler inlining.  Our caller will
262          * still report the error from the comm_delete level. */
263     }
264 
265     return mpi_errno;
266 }
267 
268 
269 /* MPIR_Comm_release_always is the same as MPIR_Comm_release except it uses
270    MPIR_Comm_release_ref_always instead.
271 */
272 int MPIR_Comm_release_always(MPIR_Comm * comm_ptr);
273 
274 int MPIR_Comm_create(MPIR_Comm **);
275 int MPIR_Comm_create_group(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, int tag,
276                            MPIR_Comm ** newcomm);
277 
278 /* implements the logic for MPI_Comm_create for intracommunicators only */
279 int MPIR_Comm_create_intra(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, MPIR_Comm ** newcomm_ptr);
280 
281 
282 int MPIR_Comm_create_subcomms(MPIR_Comm * comm);
283 int MPIR_Comm_commit(MPIR_Comm *);
284 
285 int MPIR_Comm_is_parent_comm(MPIR_Comm *);
286 
287 int MPIR_Comm_idup_impl(MPIR_Comm * comm_ptr, MPIR_Comm ** newcomm, MPIR_Request ** reqp);
288 
289 int MPIR_Comm_shrink(MPIR_Comm * comm_ptr, MPIR_Comm ** newcomm_ptr);
290 int MPIR_Comm_agree(MPIR_Comm * comm_ptr, int *flag);
291 
292 #if defined(HAVE_ROMIO)
293 int MPIR_Comm_split_filesystem(MPI_Comm comm, int key, const char *dirname, MPI_Comm * newcomm);
294 #endif
295 
296 #define MPIR_Comm_rank(comm_ptr) ((comm_ptr)->rank)
297 #define MPIR_Comm_size(comm_ptr) ((comm_ptr)->local_size)
298 
299 /* Comm hint registration.
300  *
301  * Hint function is optional. If it is NULL, MPIR_layer will set corresponding
302  * hints array directly. If it is supplied, MPIR_layer will *NOT* set hints array.
303  * The hint function is responsible for setting it, as well as validating it and
304  * update whatever side-effects.
305  *
306  * Current supported type is boolean and int and the value parsed accordingly.
307  *
308  * If the attr is 0, it is requires the hint value to be consistent across the
309  * communicator. If the LOCAL bit is set, the hint values is treated as local.
310  * Additional attributes may be added in the future.
311  */
312 void MPIR_Comm_hint_init(void);
313 typedef int (*MPIR_Comm_hint_fn_t) (MPIR_Comm *, int, int);     /* comm, key, val */
314 int MPIR_Comm_register_hint(int index, const char *hint_key, MPIR_Comm_hint_fn_t fn,
315                             int type, int attr);
316 
317 int MPIR_Comm_delete_attr_impl(MPIR_Comm * comm_ptr, MPII_Keyval * keyval_ptr);
318 int MPIR_Comm_create_keyval_impl(MPI_Comm_copy_attr_function * comm_copy_attr_fn,
319                                  MPI_Comm_delete_attr_function * comm_delete_attr_fn,
320                                  int *comm_keyval, void *extra_state);
321 int MPIR_Comm_accept_impl(const char *port_name, MPIR_Info * info_ptr, int root,
322                           MPIR_Comm * comm_ptr, MPIR_Comm ** newcomm_ptr);
323 int MPIR_Comm_connect_impl(const char *port_name, MPIR_Info * info_ptr, int root,
324                            MPIR_Comm * comm_ptr, MPIR_Comm ** newcomm_ptr);
325 int MPIR_Comm_create_errhandler_impl(MPI_Comm_errhandler_function * function,
326                                      MPI_Errhandler * errhandler);
327 int MPIR_Comm_dup_impl(MPIR_Comm * comm_ptr, MPIR_Info * info, MPIR_Comm ** newcomm_ptr);
328 int MPIR_Comm_dup_with_info_impl(MPIR_Comm * comm_ptr, MPIR_Info * info_ptr,
329                                  MPIR_Comm ** newcomm_ptr);
330 int MPIR_Comm_get_info_impl(MPIR_Comm * comm_ptr, MPIR_Info ** info_ptr);
331 int MPIR_Comm_set_info_impl(MPIR_Comm * comm_ptr, MPIR_Info * info_ptr);
332 int MPIR_Comm_free_impl(MPIR_Comm * comm_ptr);
333 void MPIR_Comm_free_keyval_impl(int keyval);
334 void MPIR_Comm_get_errhandler_impl(MPIR_Comm * comm_ptr, MPIR_Errhandler ** errhandler_ptr);
335 void MPIR_Comm_set_errhandler_impl(MPIR_Comm * comm_ptr, MPIR_Errhandler * errhandler_ptr);
336 void MPIR_Comm_get_name_impl(MPIR_Comm * comm, char *comm_name, int *resultlen);
337 int MPIR_Intercomm_merge_impl(MPIR_Comm * comm_ptr, int high, MPIR_Comm ** new_intracomm_ptr);
338 int MPIR_Intercomm_create_impl(MPIR_Comm * local_comm_ptr, int local_leader,
339                                MPIR_Comm * peer_comm_ptr, int remote_leader, int tag,
340                                MPIR_Comm ** new_intercomm_ptr);
341 int MPIR_Comm_group_impl(MPIR_Comm * comm_ptr, MPIR_Group ** group_ptr);
342 int MPIR_Comm_remote_group_impl(MPIR_Comm * comm_ptr, MPIR_Group ** group_ptr);
343 int MPIR_Comm_group_failed_impl(MPIR_Comm * comm, MPIR_Group ** failed_group_ptr);
344 int MPIR_Comm_remote_group_failed_impl(MPIR_Comm * comm, MPIR_Group ** failed_group_ptr);
345 int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** newcomm_ptr);
346 int MPIR_Comm_split_type_self(MPIR_Comm * comm_ptr, int split_type, int key,
347                               MPIR_Comm ** newcomm_ptr);
348 int MPIR_Comm_split_type_by_node(MPIR_Comm * comm_ptr, int split_type, int key,
349                                  MPIR_Comm ** newcomm_ptr);
350 int MPIR_Comm_split_type_node_topo(MPIR_Comm * comm_ptr, int split_type, int key,
351                                    MPIR_Info * info_ptr, MPIR_Comm ** newcomm_ptr);
352 int MPIR_Comm_split_type(MPIR_Comm * comm_ptr, int split_type, int key, MPIR_Info * info_ptr,
353                          MPIR_Comm ** newcomm_ptr);
354 int MPIR_Comm_split_type_impl(MPIR_Comm * comm_ptr, int split_type, int key, MPIR_Info * info_ptr,
355                               MPIR_Comm ** newcomm_ptr);
356 int MPIR_Comm_set_attr_impl(MPIR_Comm * comm_ptr, int comm_keyval, void *attribute_val,
357                             MPIR_Attr_type attrType);
358 
359 int MPIR_Comm_split_type_neighborhood(MPIR_Comm * comm_ptr, int split_type, int key,
360                                       MPIR_Info * info_ptr, MPIR_Comm ** newcomm_ptr);
361 int MPIR_Comm_split_type_nbhd_common_dir(MPIR_Comm * user_comm_ptr, int key, const char *hintval,
362                                          MPIR_Comm ** newcomm_ptr);
363 int MPIR_Comm_split_type_network_topo(MPIR_Comm * user_comm_ptr, int key, const char *hintval,
364                                       MPIR_Comm ** newcomm_ptr);
365 int MPIR_Comm_compare_impl(MPIR_Comm * comm_ptr1, MPIR_Comm * comm_ptr2, int *result);
366 
367 /* Preallocated comm objects.  There are 3: comm_world, comm_self, and
368    a private (non-user accessible) dup of comm world that is provided
369    if needed in MPI_Finalize.  Having a separate version of comm_world
370    avoids possible interference with User code */
371 #define MPIR_COMM_N_BUILTIN 3
372 extern MPIR_Comm MPIR_Comm_builtin[MPIR_COMM_N_BUILTIN];
373 extern MPIR_Comm MPIR_Comm_direct[];
374 /* This is the handle for the internal MPI_COMM_WORLD .  The "2" at the end
375    of the handle is 3-1 (e.g., the index in the builtin array) */
376 #define MPIR_ICOMM_WORLD  ((MPI_Comm)0x44000002)
377 
378 typedef struct MPIR_Commops {
379     int (*split_type) (MPIR_Comm *, int, int, MPIR_Info *, MPIR_Comm **);
380 } MPIR_Commops;
381 extern struct MPIR_Commops *MPIR_Comm_fns;      /* Communicator creation functions */
382 
383 
384 /* internal functions */
385 
386 int MPII_Comm_init(MPIR_Comm *);
387 
388 int MPII_Comm_is_node_consecutive(MPIR_Comm *);
389 
390 int MPII_Comm_copy(MPIR_Comm * comm_ptr, int size, MPIR_Info * info, MPIR_Comm ** outcomm_ptr);
391 int MPII_Comm_copy_data(MPIR_Comm * comm_ptr, MPIR_Comm ** outcomm_ptr);
392 
393 int MPII_Setup_intercomm_localcomm(MPIR_Comm *);
394 
395 /* comm_create helper functions, used by both comm_create and comm_create_group */
396 int MPII_Comm_create_calculate_mapping(MPIR_Group * group_ptr,
397                                        MPIR_Comm * comm_ptr,
398                                        int **mapping_out, MPIR_Comm ** mapping_comm);
399 
400 int MPII_Comm_create_map(int local_n,
401                          int remote_n,
402                          int *local_mapping,
403                          int *remote_mapping, MPIR_Comm * mapping_comm, MPIR_Comm * newcomm);
404 
405 int MPII_Comm_set_hints(MPIR_Comm * comm_ptr, MPIR_Info * info);
406 int MPII_Comm_get_hints(MPIR_Comm * comm_ptr, MPIR_Info * info);
407 int MPII_Comm_check_hints(MPIR_Comm * comm_ptr);
408 
409 #endif /* MPIR_COMM_H_INCLUDED */
410