1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
2 /*
3  *  (C) 2001 by Argonne National Laboratory.
4  *      See COPYRIGHT in top-level directory.
5  */
6 
7 #include "mpiimpl.h"
8 #include "mpicomm.h"
9 
10 /* This is the utility file for comm that contains the basic comm items
11    and storage management */
12 #ifndef MPID_COMM_PREALLOC
13 #define MPID_COMM_PREALLOC 8
14 #endif
15 
/* Preallocated comm objects */
/* initialized in initthread.c */
MPID_Comm MPID_Comm_builtin[MPID_COMM_N_BUILTIN] = { {0} };
MPID_Comm MPID_Comm_direct[MPID_COMM_PREALLOC]   = { {0} };
/* Allocator state for MPID_Comm handles: objects are served from the
   preallocated MPID_Comm_direct pool first, then from the heap.  The
   leading zero fields are filled in lazily by the handle allocator. */
MPIU_Object_alloc_t MPID_Comm_mem = { 0, 0, 0, 0, MPID_COMM,
				      sizeof(MPID_Comm), MPID_Comm_direct,
                                      MPID_COMM_PREALLOC};

/* Communicator creation functions (device override hook; NULL means use
   the default code paths) */
struct MPID_CommOps  *MPID_Comm_fns = NULL;
26 
27 /* utility function to pretty print a context ID for debugging purposes, see
28  * mpiimpl.h for more info on the various fields */
29 #ifdef USE_DBG_LOGGING
/* Format a decoded view of 'context_id' into out_str (at most 'len' bytes),
   showing each bit-field of the ID.  Debug-logging helper only. */
static void MPIR_Comm_dump_context_id(MPIR_Context_id_t context_id, char *out_str, int len)
{
    const char *subcomm_type_name = NULL;
    int subcomm_type = MPID_CONTEXT_READ_FIELD(SUBCOMM,context_id);

    /* translate the 2-bit SUBCOMM field into a human-readable label */
    if (subcomm_type == 0)
        subcomm_type_name = "parent";
    else if (subcomm_type == 1)
        subcomm_type_name = "intranode";
    else if (subcomm_type == 2)
        subcomm_type_name = "internode";
    else
        MPIU_Assert(FALSE);

    MPIU_Snprintf(out_str, len,
                  "context_id=%d (%#x): DYNAMIC_PROC=%d PREFIX=%#x IS_LOCALCOMM=%d SUBCOMM=%s SUFFIX=%s",
                  context_id,
                  context_id,
                  MPID_CONTEXT_READ_FIELD(DYNAMIC_PROC,context_id),
                  MPID_CONTEXT_READ_FIELD(PREFIX,context_id),
                  MPID_CONTEXT_READ_FIELD(IS_LOCALCOMM,context_id),
                  subcomm_type_name,
                  (MPID_CONTEXT_READ_FIELD(SUFFIX,context_id) ? "coll" : "pt2pt"));
}
51 #endif
52 
53 /* FIXME :
54    Reusing context ids can lead to a race condition if (as is desirable)
55    MPI_Comm_free does not include a barrier.  Consider the following:
56    Process A frees the communicator.
57    Process A creates a new communicator, reusing the just released id
58    Process B sends a message to A on the old communicator.
59    Process A receives the message, and believes that it belongs to the
60    new communicator.
61    Process B then cancels the message, and frees the communicator.
62 
63    The likelihood of this happening can be reduced by introducing a gap
64    between when a context id is released and when it is reused.  An alternative
65    is to use an explicit message (in the implementation of MPI_Comm_free)
66    to indicate that a communicator is being freed; this will often require
67    less communication than a barrier in MPI_Comm_free, and will ensure that
68    no messages are later sent to the same communicator (we may also want to
69    have a similar check when building fault-tolerant versions of MPI).
70  */
71 
72 /* Zeroes most non-handle fields in a communicator, as well as initializing any
73  * other special fields, such as a per-object mutex.  Also defaults the
74  * reference count to 1, under the assumption that the caller holds a reference
75  * to it.
76  *
77  * !!! The resulting struct is _not_ ready for communication !!! */
int MPIR_Comm_init(MPID_Comm *comm_p)
{
    int mpi_errno = MPI_SUCCESS;

    /* the caller is assumed to hold the one-and-only reference */
    MPIU_Object_set_ref(comm_p, 1);

    /* per-object mutex / thread support initialization */
    MPIU_THREAD_MPI_OBJ_INIT(comm_p);

    /* Reset the optional, pointer-valued members; NULL/empty means "use the
       default".  Upper-level constructors may override any of these. */
    comm_p->name[0]      = '\0';
    comm_p->errhandler   = NULL;
    comm_p->attributes   = NULL;
    comm_p->local_group  = NULL;
    comm_p->remote_group = NULL;
    comm_p->coll_fns     = NULL;
    comm_p->topo_fns     = NULL;

    /* no SMP-aware hierarchy has been computed for this comm yet */
    comm_p->hierarchy_kind  = MPID_HIERARCHY_FLAT;
    comm_p->node_comm       = NULL;
    comm_p->node_roots_comm = NULL;
    comm_p->intranode_table = NULL;
    comm_p->internode_table = NULL;

    /* abstractions bleed a bit here... :( */
    comm_p->next_sched_tag = MPIR_FIRST_NBC_TAG;

    /* context_id, local/remote size, and kind are deliberately left unset;
       each communicator-construction routine assigns them differently */
fn_fail:
    return mpi_errno;
}
111 
112 
113 /*
114     Create a communicator structure and perform basic initialization
115     (mostly clearing fields and updating the reference count).
116  */
117 #undef FUNCNAME
118 #define FUNCNAME MPIR_Comm_create
119 #undef FCNAME
120 #define FCNAME "MPIR_Comm_create"
int MPIR_Comm_create( MPID_Comm **newcomm_ptr )
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm *newptr;
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_CREATE);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_CREATE);

    /* Grab a handle/object pair from the comm allocator (may come from the
       preallocated MPID_Comm_direct pool). */
    newptr = (MPID_Comm *)MPIU_Handle_obj_alloc( &MPID_Comm_mem );
    MPIU_ERR_CHKANDJUMP(!newptr, mpi_errno, MPI_ERR_OTHER, "**nomem");

    /* hand the object to the caller before initializing it, so *newcomm_ptr
       is valid even if MPIR_Comm_init fails below */
    *newcomm_ptr = newptr;

    /* default-initialize most fields and set ref_count to 1; the result is
       NOT yet ready for communication (see MPIR_Comm_init) */
    mpi_errno = MPIR_Comm_init(newptr);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* Insert this new communicator into the list of known communicators.
       Make this conditional on debugger support to match the test in
       MPIR_Comm_release . */
    MPIR_COMML_REMEMBER( newptr );

 fn_fail:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_COMM_CREATE);

    return mpi_errno;
}
147 
148 /* Create a local intra communicator from the local group of the
149    specified intercomm. */
150 /* FIXME this is an alternative constructor that doesn't use MPIR_Comm_create! */
151 #undef FUNCNAME
152 #define FUNCNAME MPIR_Setup_intercomm_localcomm
153 #undef FCNAME
154 #define FCNAME "MPIR_Setup_intercomm_localcomm"
int MPIR_Setup_intercomm_localcomm( MPID_Comm *intercomm_ptr )
{
    MPID_Comm *localcomm_ptr;
    int mpi_errno = MPI_SUCCESS;
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_SETUP_INTERCOMM_LOCALCOMM);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_SETUP_INTERCOMM_LOCALCOMM);

    /* allocate directly from the comm allocator rather than through
       MPIR_Comm_create (see the FIXME above this function) */
    localcomm_ptr = (MPID_Comm *)MPIU_Handle_obj_alloc( &MPID_Comm_mem );
    MPIU_ERR_CHKANDJUMP(!localcomm_ptr,mpi_errno,MPI_ERR_OTHER,"**nomem");

    /* get sensible default values for most fields (usually zeros) */
    mpi_errno = MPIR_Comm_init(localcomm_ptr);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* use the parent intercomm's recv ctx as the basis for our ctx;
       the IS_LOCALCOMM bit distinguishes the localcomm's traffic from the
       intercomm's */
    localcomm_ptr->recvcontext_id = MPID_CONTEXT_SET_FIELD(IS_LOCALCOMM, intercomm_ptr->recvcontext_id, 1);
    localcomm_ptr->context_id = localcomm_ptr->recvcontext_id;

    MPIU_DBG_MSG_FMT(COMM,TYPICAL,(MPIU_DBG_FDEST, "setup_intercomm_localcomm ic=%p ic->context_id=%d ic->recvcontext_id=%d lc->recvcontext_id=%d", intercomm_ptr, intercomm_ptr->context_id, intercomm_ptr->recvcontext_id, localcomm_ptr->recvcontext_id));

    /* Duplicate the VCRT references: the localcomm shares the intercomm's
       local virtual-connection table rather than building its own */
    MPID_VCRT_Add_ref( intercomm_ptr->local_vcrt );
    localcomm_ptr->vcrt = intercomm_ptr->local_vcrt;
    localcomm_ptr->vcr  = intercomm_ptr->local_vcr;

    /* Save the kind of the communicator */
    localcomm_ptr->comm_kind   = MPID_INTRACOMM;

    /* Set the sizes and ranks: for an intracomm, remote_size == local_size */
    localcomm_ptr->remote_size = intercomm_ptr->local_size;
    localcomm_ptr->local_size  = intercomm_ptr->local_size;
    localcomm_ptr->rank        = intercomm_ptr->rank;

    /* TODO More advanced version: if the group is available, dup it by
       increasing the reference count instead of recreating it later */
    /* FIXME  : No coll_fns functions for the collectives */
    /* FIXME  : No local functions for the topology routines */

    /* cache the localcomm on the intercomm for later reuse */
    intercomm_ptr->local_comm = localcomm_ptr;

    /* sets up the SMP-aware sub-communicators and tables */
    mpi_errno = MPIR_Comm_commit(localcomm_ptr);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

 fn_fail:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_SETUP_INTERCOMM_LOCALCOMM);

    return mpi_errno;
}
205 
/* holds default collop "vtables" for _intracomms_, where
 * default[hierarchy_kind] is the pointer to the collop struct for that
 * hierarchy kind.  Populated lazily by init_default_collops() and released
 * at finalize time by cleanup_default_collops(). */
static struct MPID_Collops *default_collops[MPID_HIERARCHY_SIZE] = {NULL};
/* default for intercomms (same lifecycle as default_collops) */
static struct MPID_Collops *ic_default_collops = NULL;
212 
213 #undef FUNCNAME
214 #define FUNCNAME cleanup_default_collops
215 #undef FCNAME
216 #define FCNAME MPIU_QUOTE(FUNCNAME)
cleanup_default_collops(void * unused)217 static int cleanup_default_collops(void *unused) {
218     int i;
219     for (i = 0; i < MPID_HIERARCHY_SIZE; ++i) {
220         if (default_collops[i]) {
221             MPIU_Assert(default_collops[i]->ref_count >= 1);
222             if (--default_collops[i]->ref_count == 0)
223                 MPIU_Free(default_collops[i]);
224             default_collops[i] = NULL;
225         }
226     }
227     if (ic_default_collops) {
228         MPIU_Assert(ic_default_collops->ref_count >= 1);
229         if (--ic_default_collops->ref_count == 0)
230             MPIU_Free(ic_default_collops);
231     }
232     return MPI_SUCCESS;
233 }
234 
235 #undef FUNCNAME
236 #define FUNCNAME init_default_collops
237 #undef FCNAME
238 #define FCNAME MPIU_QUOTE(FUNCNAME)
/* One-time allocation and population of the default nonblocking-collective
 * "vtables": one per intracomm hierarchy kind plus one for intercomms.
 * On failure, all partial allocations are reaped via the CHKPMEM machinery.
 * Returns MPI_SUCCESS or an MPI error code. */
static int init_default_collops(void)
{
    int mpi_errno = MPI_SUCCESS;
    int i;
    struct MPID_Collops *ops = NULL;
    /* one CHKPMEM slot per intracomm table plus one for the intercomm table */
    MPIU_CHKPMEM_DECL(MPID_HIERARCHY_SIZE+1);

    /* first initialize the intracomms */
    for (i = 0; i < MPID_HIERARCHY_SIZE; ++i) {
        MPIU_CHKPMEM_CALLOC(ops, struct MPID_Collops *, sizeof(struct MPID_Collops), mpi_errno, "default intracomm collops");
        ops->ref_count = 1; /* force existence until finalize time */

        /* intracomm default defaults... */
        ops->Ibcast = &MPIR_Ibcast_intra;
        ops->Ibarrier = &MPIR_Ibarrier_intra;
        ops->Ireduce = &MPIR_Ireduce_intra;
        ops->Ialltoall = &MPIR_Ialltoall_intra;
        ops->Ialltoallv = &MPIR_Ialltoallv_intra;
        ops->Ialltoallw = &MPIR_Ialltoallw_intra;
        ops->Iallreduce = &MPIR_Iallreduce_intra;
        ops->Igather = &MPIR_Igather_intra;
        ops->Igatherv = &MPIR_Igatherv;
        ops->Iscatter = &MPIR_Iscatter_intra;
        ops->Iscatterv = &MPIR_Iscatterv;
        ops->Ireduce_scatter = &MPIR_Ireduce_scatter_intra;
        ops->Ireduce_scatter_block = &MPIR_Ireduce_scatter_block_intra;
        ops->Iallgather = &MPIR_Iallgather_intra;
        ops->Iallgatherv = &MPIR_Iallgatherv_intra;
        ops->Iscan = &MPIR_Iscan_rec_dbl;
        ops->Iexscan = &MPIR_Iexscan;
        ops->Neighbor_allgather   = &MPIR_Neighbor_allgather_default;
        ops->Neighbor_allgatherv  = &MPIR_Neighbor_allgatherv_default;
        ops->Neighbor_alltoall    = &MPIR_Neighbor_alltoall_default;
        ops->Neighbor_alltoallv   = &MPIR_Neighbor_alltoallv_default;
        ops->Neighbor_alltoallw   = &MPIR_Neighbor_alltoallw_default;
        ops->Ineighbor_allgather  = &MPIR_Ineighbor_allgather_default;
        ops->Ineighbor_allgatherv = &MPIR_Ineighbor_allgatherv_default;
        ops->Ineighbor_alltoall   = &MPIR_Ineighbor_alltoall_default;
        ops->Ineighbor_alltoallv  = &MPIR_Ineighbor_alltoallv_default;
        ops->Ineighbor_alltoallw  = &MPIR_Ineighbor_alltoallw_default;

        /* override defaults, such as for SMP */
        switch (i) {
            case MPID_HIERARCHY_FLAT:
                break;
            case MPID_HIERARCHY_PARENT:
                /* the parent comm of an SMP hierarchy uses node-aware
                   variants of a few operations */
                ops->Ibcast = &MPIR_Ibcast_SMP;
                ops->Iscan = &MPIR_Iscan_SMP;
                ops->Iallreduce = &MPIR_Iallreduce_SMP;
                ops->Ireduce = &MPIR_Ireduce_SMP;
                break;
            case MPID_HIERARCHY_NODE:
                break;
            case MPID_HIERARCHY_NODE_ROOTS:
                break;

                /* --BEGIN ERROR HANDLING-- */
            default:
                MPIU_Assertp(FALSE);
                break;
                /* --END ERROR HANDLING-- */
        }

        /* this is a default table, it's not overriding another table */
        ops->prev_coll_fns = NULL;

        default_collops[i] = ops;
    }

    /* now the intercomm table */
    {
        MPIU_CHKPMEM_CALLOC(ops, struct MPID_Collops *, sizeof(struct MPID_Collops), mpi_errno, "default intercomm collops");
        ops->ref_count = 1; /* force existence until finalize time */

        /* intercomm defaults */
        ops->Ibcast = &MPIR_Ibcast_inter;
        ops->Ibarrier = &MPIR_Ibarrier_inter;
        ops->Ireduce = &MPIR_Ireduce_inter;
        ops->Ialltoall = &MPIR_Ialltoall_inter;
        ops->Ialltoallv = &MPIR_Ialltoallv_inter;
        ops->Ialltoallw = &MPIR_Ialltoallw_inter;
        ops->Iallreduce = &MPIR_Iallreduce_inter;
        ops->Igather = &MPIR_Igather_inter;
        ops->Igatherv = &MPIR_Igatherv;
        ops->Iscatter = &MPIR_Iscatter_inter;
        ops->Iscatterv = &MPIR_Iscatterv;
        ops->Ireduce_scatter = &MPIR_Ireduce_scatter_inter;
        ops->Ireduce_scatter_block = &MPIR_Ireduce_scatter_block_inter;
        ops->Iallgather = &MPIR_Iallgather_inter;
        ops->Iallgatherv = &MPIR_Iallgatherv_inter;
        /* scan and exscan are not valid for intercommunicators, leave them NULL */
        /* Ineighbor_all* routines are not valid for intercommunicators, leave
         * them NULL */

        /* this is a default table, it's not overriding another table */
        ops->prev_coll_fns = NULL;

        ic_default_collops = ops;
    }


    /* run after MPID_Finalize to permit collective usage during finalize */
    MPIR_Add_finalize(cleanup_default_collops, NULL, MPIR_FINALIZE_CALLBACK_PRIO - 1);

    MPIU_CHKPMEM_COMMIT();
fn_exit:
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
352 
353 /* Initializes the coll_fns field of comm to a sensible default.  It may re-use
354  * an existing structure, so any override by a lower level should _not_ change
355  * any of the fields but replace the coll_fns object instead.
356  *
357  * NOTE: for now we only initialize nonblocking collective routines, since the
358  * blocking collectives all contain fallback logic that correctly handles NULL
359  * override functions. */
360 #undef FUNCNAME
361 #define FUNCNAME set_collops
362 #undef FCNAME
363 #define FCNAME MPIU_QUOTE(FUNCNAME)
static int set_collops(MPID_Comm *comm)
{
    int mpi_errno = MPI_SUCCESS;
    /* lazy one-time init of the shared default tables.
       NOTE(review): this flag is not protected against concurrent first
       calls — presumably callers are serialized at this point; confirm
       against the threading model (see also the FIXME MT below). */
    static int initialized = FALSE;

    /* a lower layer already installed a table; leave it alone */
    if (comm->coll_fns != NULL)
        goto fn_exit;

    if (unlikely(!initialized)) {
        mpi_errno = init_default_collops();
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        initialized = TRUE;
    }

    if (comm->comm_kind == MPID_INTRACOMM) {
        /* FIXME MT what protects access to this structure and ic_default_collops? */
        comm->coll_fns = default_collops[comm->hierarchy_kind];
    }
    else { /* intercomm */
        comm->coll_fns = ic_default_collops;
    }

    /* the comm now holds a reference on the shared table; dropped when the
       comm is destroyed */
    comm->coll_fns->ref_count++;

fn_exit:
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
394 
395 /* Provides a hook for the top level functions to perform some manipulation on a
396    communicator just before it is given to the application level.
397 
398    For example, we create sub-communicators for SMP-aware collectives at this
399    step. */
400 #undef FUNCNAME
401 #define FUNCNAME MPIR_Comm_commit
402 #undef FCNAME
403 #define FCNAME MPIU_QUOTE(FUNCNAME)
int MPIR_Comm_commit(MPID_Comm *comm)
{
    int mpi_errno = MPI_SUCCESS;
    int i;
    int num_local = -1, num_external = -1;
    int local_rank = -1, external_rank = -1;
    int *local_procs = NULL, *external_procs = NULL;
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_COMMIT);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_COMMIT);

    /* It's OK to relax these assertions, but we should do so very
       intentionally.  For now this function is the only place that we create
       our hierarchy of communicators */
    MPIU_Assert(comm->node_comm == NULL);
    MPIU_Assert(comm->node_roots_comm == NULL);

    /* only intracomms get the SMP hierarchy; intercomms fall straight
       through to the common commit steps at fn_exit */
    if (comm->comm_kind == MPID_INTRACOMM) {

        /* discover which ranks share this node ("local") and one
           representative rank per node ("external") */
        mpi_errno = MPIU_Find_local_and_external(comm,
                                                 &num_local,    &local_rank,    &local_procs,
                                                 &num_external, &external_rank, &external_procs,
                                                 &comm->intranode_table, &comm->internode_table);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno) {
            if (MPIR_Err_is_fatal(mpi_errno)) MPIU_ERR_POP(mpi_errno);

            /* Non-fatal errors simply mean that this communicator will not have
               any node awareness.  Node-aware collectives are an optimization. */
            MPIU_DBG_MSG_P(COMM,VERBOSE,"MPIU_Find_local_and_external failed for comm_ptr=%p", comm);
            if (comm->intranode_table)
                MPIU_Free(comm->intranode_table);
            if (comm->internode_table)
                MPIU_Free(comm->internode_table);

            mpi_errno = MPI_SUCCESS;
            goto fn_exit;
        }
        /* --END ERROR HANDLING-- */

        /* defensive checks */
        MPIU_Assert(num_local > 0);
        MPIU_Assert(num_local > 1 || external_rank >= 0);
        MPIU_Assert(external_rank < 0 || external_procs != NULL);

        /* if the node_roots_comm and comm would be the same size, then creating
           the second communicator is useless and wasteful. */
        if (num_external == comm->remote_size) {
            MPIU_Assert(num_local == 1);
            goto fn_exit;
        }

        /* we don't need a local comm if this process is the only one on this node */
        if (num_local > 1) {
            mpi_errno = MPIR_Comm_create(&comm->node_comm);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);

            /* derive the node comm's context id from the parent's at a fixed
               offset rather than running context-id allocation again */
            comm->node_comm->context_id = comm->context_id + MPID_CONTEXT_INTRANODE_OFFSET;
            comm->node_comm->recvcontext_id = comm->node_comm->context_id;
            comm->node_comm->rank = local_rank;
            comm->node_comm->comm_kind = MPID_INTRACOMM;
            comm->node_comm->hierarchy_kind = MPID_HIERARCHY_NODE;
            comm->node_comm->local_comm = NULL;

            comm->node_comm->local_size  = num_local;
            comm->node_comm->remote_size = num_local;

            /* build a fresh VC table containing only the on-node processes */
            MPID_VCRT_Create( num_local, &comm->node_comm->vcrt );
            MPID_VCRT_Get_ptr( comm->node_comm->vcrt, &comm->node_comm->vcr );
            for (i = 0; i < num_local; ++i) {
                /* For rank i in the new communicator, find the corresponding
                   rank in the input communicator */
                MPID_VCR_Dup( comm->vcr[local_procs[i]],
                              &comm->node_comm->vcr[i] );
            }

            mpi_errno = set_collops(comm->node_comm);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);

            /* Notify device of communicator creation */
            mpi_errno = MPID_Dev_comm_create_hook( comm->node_comm );
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            /* don't call MPIR_Comm_commit here */
        }


        /* this process may not be a member of the node_roots_comm */
        if (local_rank == 0) {
            mpi_errno = MPIR_Comm_create(&comm->node_roots_comm);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);

            /* same fixed-offset context-id derivation as for node_comm */
            comm->node_roots_comm->context_id = comm->context_id + MPID_CONTEXT_INTERNODE_OFFSET;
            comm->node_roots_comm->recvcontext_id = comm->node_roots_comm->context_id;
            comm->node_roots_comm->rank = external_rank;
            comm->node_roots_comm->comm_kind = MPID_INTRACOMM;
            comm->node_roots_comm->hierarchy_kind = MPID_HIERARCHY_NODE_ROOTS;
            comm->node_roots_comm->local_comm = NULL;

            comm->node_roots_comm->local_size  = num_external;
            comm->node_roots_comm->remote_size = num_external;

            /* VC table over one representative process per node */
            MPID_VCRT_Create( num_external, &comm->node_roots_comm->vcrt );
            MPID_VCRT_Get_ptr( comm->node_roots_comm->vcrt, &comm->node_roots_comm->vcr );
            for (i = 0; i < num_external; ++i) {
                /* For rank i in the new communicator, find the corresponding
                   rank in the input communicator */
                MPID_VCR_Dup( comm->vcr[external_procs[i]],
                              &comm->node_roots_comm->vcr[i] );
            }

            mpi_errno = set_collops(comm->node_roots_comm);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);

            /* Notify device of communicator creation */
            mpi_errno = MPID_Dev_comm_create_hook( comm->node_roots_comm );
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            /* don't call MPIR_Comm_commit here */
        }

        /* mark this comm as the parent of an SMP hierarchy */
        comm->hierarchy_kind = MPID_HIERARCHY_PARENT;
    }

fn_exit:
    if (!mpi_errno) {
        /* catch all of the early-bail, non-error cases */

        mpi_errno = set_collops(comm);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        /* Notify device of communicator creation */
        mpi_errno = MPID_Dev_comm_create_hook(comm);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

    /* scratch arrays from MPIU_Find_local_and_external */
    if (external_procs != NULL)
        MPIU_Free(external_procs);
    if (local_procs != NULL)
        MPIU_Free(local_procs);

    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_COMM_COMMIT);
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
548 
549 /* Returns true if the given communicator is aware of node topology information,
550    false otherwise.  Such information could be used to implement more efficient
551    collective communication, for example. */
/* Nonzero iff 'comm' carries node-topology information, i.e. it was
   committed as the parent of an SMP hierarchy. */
int MPIR_Comm_is_node_aware(MPID_Comm * comm)
{
    if (comm->hierarchy_kind == MPID_HIERARCHY_PARENT)
        return 1;
    return 0;
}
556 
557 /* Returns true if the communicator is node-aware and processes in all the nodes
558    are consecutive. For example, if node 0 contains "0, 1, 2, 3", node 1
559    contains "4, 5, 6", and node 2 contains "7", we shall return true. */
int MPIR_Comm_is_node_consecutive(MPID_Comm * comm)
{
    int rank;
    int node = 0;
    const int *internode_table = comm->internode_table;

    /* without node awareness there is no table to inspect */
    if (!MPIR_Comm_is_node_aware(comm))
        return 0;

    /* walk the ranks in order; node indices must form a non-decreasing
       sequence that only ever steps up by exactly one */
    for (rank = 0; rank < comm->local_size; rank++) {
        int entry = internode_table[rank];
        if (entry == node + 1) {
            ++node;
        }
        else if (entry != node) {
            return 0;
        }
    }

    return 1;
}
578 
/*
 * Here are the routines to find a new context id.  The algorithm is discussed
 * in detail in the mpich2 coding document.  There are versions for
 * single threaded and multithreaded MPI.
 *
 * Both the threaded and non-threaded routines use the same mask of
 * available context id values.
 */
/* One bit per context-id prefix; a SET bit means the id is still available
   (see MPIR_Locate_context_bit / MPIR_Allocate_context_bit). */
static uint32_t context_mask[MPIR_MAX_CONTEXT_MASK];
/* lazy-init flag, cleared by MPIR_Init_contextid() on first use */
static int initialize_context_mask = 1;
589 
590 #ifdef USE_DBG_LOGGING
591 /* Create a string that contains the context mask.  This is
592    used only with the logging interface, and must be used by one thread at
593    a time (should this be enforced by the logging interface?).
594    Converts the mask to hex and returns a pointer to that string */
MPIR_ContextMaskToStr(void)595 static char *MPIR_ContextMaskToStr( void )
596 {
597     static char bufstr[MPIR_MAX_CONTEXT_MASK*8+1];
598     int i;
599     int maxset=0;
600 
601     for (maxset=MPIR_MAX_CONTEXT_MASK-1; maxset>=0; maxset--) {
602 	if (context_mask[maxset] != 0) break;
603     }
604 
605     for (i=0; i<maxset; i++) {
606 	MPIU_Snprintf( &bufstr[i*8], 9, "%.8x", context_mask[i] );
607     }
608     return bufstr;
609 }
610 #endif
611 
612 #ifdef MPICH_DEBUG_HANDLEALLOC
MPIU_CheckContextIDsOnFinalize(void * context_mask_ptr)613 static int MPIU_CheckContextIDsOnFinalize(void *context_mask_ptr)
614 {
615     int i;
616     uint32_t *mask = context_mask_ptr;
617     /* the predefined communicators should be freed by this point, so we don't
618      * need to special case bits 0,1, and 2 */
619     for (i = 0; i < MPIR_MAX_CONTEXT_MASK; ++i) {
620         if (~mask[i]) {
621             /* some bits were still cleared */
622             printf("leaked context IDs detected: mask=%p mask[%d]=%#x\n", mask, i, (int)mask[i]);
623         }
624     }
625     return MPI_SUCCESS;
626 }
627 #endif
628 
MPIR_Init_contextid(void)629 static void MPIR_Init_contextid(void)
630 {
631     int i;
632 
633     for (i=1; i<MPIR_MAX_CONTEXT_MASK; i++) {
634 	context_mask[i] = 0xFFFFFFFF;
635     }
636     /* the first three values are already used (comm_world, comm_self,
637        and the internal-only copy of comm_world) */
638     context_mask[0] = 0xFFFFFFF8;
639     initialize_context_mask = 0;
640 
641 #ifdef MPICH_DEBUG_HANDLEALLOC
642     /* check for context ID leaks in MPI_Finalize.  Use (_PRIO-1) to make sure
643      * that we run after MPID_Finalize. */
644     MPIR_Add_finalize(MPIU_CheckContextIDsOnFinalize, context_mask,
645                       MPIR_FINALIZE_CALLBACK_PRIO - 1);
646 #endif
647 }
648 
649 /* Return the context id corresponding to the first set bit in the mask.
650    Return 0 if no bit found.  This function does _not_ alter local_mask. */
static int MPIR_Locate_context_bit(uint32_t local_mask[])
{
    int i, j, context_id = 0;
    /* scan words from the low end; the first nonzero word holds the id */
    for (i=0; i<MPIR_MAX_CONTEXT_MASK; i++) {
	if (local_mask[i]) {
	    /* There is a bit set in this word. */
	    register uint32_t     val, nval;
	    /* The following code finds the highest set bit by recursively
	       checking the top half of a subword for a bit, and incrementing
	       the bit location by the number of bit of the lower sub word if
	       the high subword contains a set bit.  The assumption is that
	       full-word bitwise operations and compares against zero are
	       fast */
	    val = local_mask[i];
	    j   = 0;
	    nval = val & 0xFFFF0000;
	    if (nval) {
		j += 16;
		val = nval;
	    }
	    nval = val & 0xFF00FF00;
	    if (nval) {
		j += 8;
		val = nval;
	    }
	    nval = val & 0xF0F0F0F0;
	    if (nval) {
		j += 4;
		val = nval;
	    }
	    nval = val & 0xCCCCCCCC;
	    if (nval) {
		j += 2;
		val = nval;
	    }
	    if (val & 0xAAAAAAAA) {
		j += 1;
	    }
	    /* convert (word index, bit index) into a raw prefix, then shift
	       it into the PREFIX field position of a context id */
	    context_id = (MPIR_CONTEXT_INT_BITS * i + j) << MPID_CONTEXT_PREFIX_SHIFT;
	    return context_id;
	}
    }
    /* no free bit anywhere in the mask */
    return 0;
}
695 
696 /* Allocates a context ID from the given mask by clearing the bit
697  * corresponding to the given id.  Returns 0 on failure, id on
698  * success. */
/* Allocates a context ID from the given mask by clearing the bit
 * corresponding to the given id.  Returns id (nonzero) on success; asserts
 * if the bit was already allocated. */
static int MPIR_Allocate_context_bit(uint32_t mask[], MPIR_Context_id_t id)
{
    int raw_prefix, idx, bitpos;
    raw_prefix = MPID_CONTEXT_READ_FIELD(PREFIX,id);
    idx    = raw_prefix / MPIR_CONTEXT_INT_BITS;
    bitpos = raw_prefix % MPIR_CONTEXT_INT_BITS;

    /* Use an unsigned literal for the shift: bitpos can legitimately be 31,
       and "1 << 31" on a signed 32-bit int is undefined behavior (C99
       6.5.7). */
    /* the bit should not already be cleared (allocated) */
    MPIU_Assert(mask[idx] & (1u << bitpos));

    /* clear the bit */
    mask[idx] &= ~(1u << bitpos);

    MPIU_DBG_MSG_FMT(COMM,VERBOSE,(MPIU_DBG_FDEST,
            "allocating contextid = %d, (mask=%p, mask[%d], bit %d)",
            id, mask, idx, bitpos));
    return id;
}
717 
718 /* Allocates the first available context ID from context_mask based on the available
719  * bits given in local_mask.  This function will clear the corresponding bit in
720  * context_mask if allocation was successful.
721  *
722  * Returns 0 on failure.  Returns the allocated context ID on success. */
MPIR_Find_and_allocate_context_id(uint32_t local_mask[])723 static int MPIR_Find_and_allocate_context_id(uint32_t local_mask[])
724 {
725     MPIR_Context_id_t context_id;
726     context_id = MPIR_Locate_context_bit(local_mask);
727     if (context_id != 0) {
728         context_id = MPIR_Allocate_context_bit(context_mask, context_id);
729     }
730     return context_id;
731 }
732 
733 /* Older, simpler interface.  Allocates a context ID collectively over the given
734  * communicator. */
735 #undef FUNCNAME
736 #define FUNCNAME MPIR_Get_contextid
737 #undef FCNAME
738 #define FCNAME MPIU_QUOTE(FUNCNAME)
int MPIR_Get_contextid(MPID_Comm *comm_ptr, MPIR_Context_id_t *context_id)
{
    int mpi_errno = MPI_SUCCESS;
    /* delegate to the sparse allocator with ignore_id == FALSE, i.e. this
       process does participate in the resulting communicator */
    mpi_errno = MPIR_Get_contextid_sparse(comm_ptr, context_id, FALSE);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    /* on success the allocator must have produced a usable id */
    MPIU_Assert(*context_id != MPIR_INVALID_CONTEXT_ID);
fn_fail:
    return mpi_errno;
}
748 
749 
750 #ifndef MPICH_IS_THREADED
751 /* Unthreaded (only one MPI call active at any time) */
752 
753 #undef FUNCNAME
754 #define FUNCNAME MPIR_Get_contextid_sparse
755 #undef FCNAME
756 #define FCNAME MPIU_QUOTE(FUNCNAME)
/* Thin wrapper: allocate a context id collectively over comm_ptr with no
 * group restriction, using the tag-space upper bound as the collective tag. */
int MPIR_Get_contextid_sparse(MPID_Comm *comm_ptr, MPIR_Context_id_t *context_id, int ignore_id)
{
    int mpi_errno;
    mpi_errno = MPIR_Get_contextid_sparse_group(comm_ptr,
                                                NULL /* group_ptr */,
                                                MPIR_Process.attrs.tag_ub /* tag */,
                                                context_id,
                                                ignore_id);
    return mpi_errno;
}
761 
762 #undef FUNCNAME
763 #define FUNCNAME MPIR_Get_contextid_sparse_group
764 #undef FCNAME
765 #define FCNAME MPIU_QUOTE(FUNCNAME)
int MPIR_Get_contextid_sparse_group(MPID_Comm *comm_ptr, MPID_Group *group_ptr, int tag, MPIR_Context_id_t *context_id, int ignore_id)
{
    int mpi_errno = MPI_SUCCESS;
    uint32_t     local_mask[MPIR_MAX_CONTEXT_MASK];
    int errflag = FALSE;
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_GET_CONTEXTID);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_GET_CONTEXTID);

    *context_id = 0;

    /* lazy one-time setup of the global availability mask */
    if (initialize_context_mask) {
	MPIR_Init_contextid();
    }

    if (ignore_id) {
        /* We are not participating in the resulting communicator, so our
         * context ID space doesn't matter.  Set the mask to "all available". */
        /* NOTE(review): sizes use sizeof(int) while the mask is uint32_t —
           equal on supported platforms, presumably intentional to match the
           MPI_INT allreduce below; confirm */
        memset(local_mask, 0xff, MPIR_MAX_CONTEXT_MASK * sizeof(int));
    }
    else {
        MPIU_Memcpy( local_mask, context_mask, MPIR_MAX_CONTEXT_MASK * sizeof(int) );
    }

    /* Note that this is the unthreaded version */
    /* Comm must be an intracommunicator */
    /* Bitwise-AND the masks across all participants: a bit survives only if
       the corresponding id is free everywhere */
    if (group_ptr != NULL) {
        int coll_tag = tag | MPIR_Process.tagged_coll_mask; /* Shift tag into the tagged coll space */
        mpi_errno = MPIR_Allreduce_group( MPI_IN_PLACE, local_mask, MPIR_MAX_CONTEXT_MASK,
                                            MPI_INT, MPI_BAND, comm_ptr, group_ptr, coll_tag, &errflag );
    } else {
        mpi_errno = MPIR_Allreduce_impl( MPI_IN_PLACE, local_mask, MPIR_MAX_CONTEXT_MASK,
                                            MPI_INT, MPI_BAND, comm_ptr, &errflag );
    }

    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    if (ignore_id) {
        /* participate in the collective for error detection only; don't
           actually claim a bit from the global mask */
        *context_id = MPIR_Locate_context_bit(local_mask);
        MPIU_ERR_CHKANDJUMP(!(*context_id), mpi_errno, MPIR_ERR_RECOVERABLE, "**toomanycomm");
    }
    else {
        *context_id = MPIR_Find_and_allocate_context_id(local_mask);
        MPIU_ERR_CHKANDJUMP(!(*context_id), mpi_errno, MPIR_ERR_RECOVERABLE, "**toomanycomm");
    }

fn_exit:
    /* a non-participant always reports the invalid id, even on success */
    if (ignore_id)
        *context_id = MPIR_INVALID_CONTEXT_ID;
    MPIU_DBG_MSG_S(COMM,VERBOSE,"Context mask = %s",MPIR_ContextMaskToStr());

    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_GET_CONTEXTID);
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
823 
824 #else /* MPICH_IS_THREADED is set and true */
825 
/* EAGER CONTEXT ID ALLOCATION: Attempt to allocate the context ID during the
 * initial synchronization step.  If eager protocol fails, threads fall back to
 * the base algorithm.
 */
/* number of low mask words reserved for the eager protocol; -1 means "not yet
 * initialized from MPIR_PARAM_CTXID_EAGER_SIZE" */
static volatile int eager_nelem     = -1;
/* nonzero while some thread holds the eager segment of the mask */
static volatile int eager_in_use    = 0;

/* Additional values needed to maintain thread safety */
/* nonzero while some thread holds the non-eager (base protocol) mask */
static volatile int mask_in_use     = 0;

/* lowestContextId is used to prioritize access when multiple threads
 * are contending for the mask.  lowestTag is used to break ties when
 * MPI_Comm_create_group is invoked by multiple threads on the same parent
 * communicator.
 */
#define MPIR_MAXID (1 << 30)
static volatile int lowestContextId = MPIR_MAXID;
static volatile int lowestTag       = -1;
844 
845 #undef FUNCNAME
846 #define FUNCNAME MPIR_Get_contextid_sparse
847 #undef FCNAME
848 #define FCNAME MPIU_QUOTE(FUNCNAME)
MPIR_Get_contextid_sparse(MPID_Comm * comm_ptr,MPIR_Context_id_t * context_id,int ignore_id)849 int MPIR_Get_contextid_sparse(MPID_Comm *comm_ptr, MPIR_Context_id_t *context_id, int ignore_id)
850 {
851     return MPIR_Get_contextid_sparse_group(comm_ptr, NULL /*group_ptr*/,
852                                            MPIR_Process.attrs.tag_ub /*tag*/,
853                                            context_id, ignore_id);
854 }
855 
856 /* Allocates a new context ID collectively over the given communicator.  This
857  * routine is "sparse" in the sense that while it is collective, some processes
 * may not care about the value of the selected context ID.
859  *
860  * One example of this case is processes who pass MPI_UNDEFINED as the color
861  * value to MPI_Comm_split.  Such processes should pass ignore_id==TRUE to
862  * obtain the best performance and utilization of the context ID space.
863  *
864  * Processes that pass ignore_id==TRUE will receive
865  * (*context_id==MPIR_INVALID_CONTEXT_ID) and should not attempt to use it.
866  *
867  * If a group pointer is given, the call is _not_ sparse, and only processes
868  * in the group should call this routine.  That is, it is collective only over
869  * the given group.
870  */
871 /* NOTE-C1: We need a special test in this loop for the case where some process
872  * has exhausted its supply of context ids.  In the single threaded case, this
873  * is simple, because the algorithm is deterministic (see above).  In the
874  * multithreaded case, it is more complicated, because we may get a zero for the
875  * context mask because some other thread holds the mask.  In addition, we can't
876  * check for the case where this process did not select MPI_THREAD_MULTIPLE,
877  * because one of the other processes may have selected MPI_THREAD_MULTIPLE.  To
878  * handle this case, after a fixed number of failures, we test to see if some
879  * process has exhausted its supply of context ids.  If so, all processes can
880  * invoke the out-of-context-id error.  That fixed number of tests is in
881  * testCount */
882 #undef FUNCNAME
883 #define FUNCNAME MPIR_Get_contextid_sparse_group
884 #undef FCNAME
885 #define FCNAME MPIU_QUOTE(FUNCNAME)
MPIR_Get_contextid_sparse_group(MPID_Comm * comm_ptr,MPID_Group * group_ptr,int tag,MPIR_Context_id_t * context_id,int ignore_id)886 int MPIR_Get_contextid_sparse_group(MPID_Comm *comm_ptr, MPID_Group *group_ptr, int tag, MPIR_Context_id_t *context_id, int ignore_id)
887 {
888     int mpi_errno = MPI_SUCCESS;
889     uint32_t local_mask[MPIR_MAX_CONTEXT_MASK];
890     int own_mask = 0;
891     int own_eager_mask = 0;
892     static const int NUM_CTX_TESTS = 10;
893     int testCount = NUM_CTX_TESTS;
894     int errflag = FALSE;
895     int first_iter = 1;
896     MPID_MPI_STATE_DECL(MPID_STATE_MPIR_GET_CONTEXTID);
897 
898     MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_GET_CONTEXTID);
899 
900     /* Group-collective and ignore_id should never be combined */
901     MPIU_Assert(! (group_ptr != NULL && ignore_id) );
902 
903     *context_id = 0;
904 
905     MPIU_DBG_MSG_FMT(COMM, VERBOSE, (MPIU_DBG_FDEST,
906          "Entering; shared state is %d:%d:%d, my ctx id is %d, tag=%d",
907          mask_in_use, lowestContextId, lowestTag, comm_ptr->context_id, tag));
908 
909 
910     /* see NOTE-C1 for info about the "exhausted IDs" test */
911     while (*context_id == 0) {
912         /* We lock only around access to the mask (except in the global locking
913          * case).  If another thread is using the mask, we take a mask of zero. */
914         MPIU_THREAD_CS_ENTER(CONTEXTID,);
915 
916         if (initialize_context_mask) {
917             MPIR_Init_contextid();
918         }
919 
920         if (eager_nelem < 0) {
921             /* Ensure that at least one word of deadlock-free context IDs is
922                always set aside for the base protocol */
923             MPIU_Assert( MPIR_PARAM_CTXID_EAGER_SIZE >= 0 && MPIR_PARAM_CTXID_EAGER_SIZE < MPIR_MAX_CONTEXT_MASK-1 );
924             eager_nelem = MPIR_PARAM_CTXID_EAGER_SIZE;
925         }
926 
927         if (ignore_id) {
928             /* We are not participating in the resulting communicator, so our
929              * context ID space doesn't matter.  Set the mask to "all available". */
930             memset(local_mask, 0xff, MPIR_MAX_CONTEXT_MASK * sizeof(int));
931             own_mask = 0;
932             /* don't need to touch mask_in_use/lowestContextId b/c our thread
933              * doesn't ever need to "win" the mask */
934         }
935 
936         /* Deadlock avoidance: Only participate in context id loop when all
937          * processes have called this routine.  On the first iteration, use the
938          * "eager" allocation protocol.
939          */
940         else if (first_iter) {
941             memset(local_mask, 0, MPIR_MAX_CONTEXT_MASK * sizeof(int));
942             own_eager_mask = 0;
943 
944             /* Attempt to reserve the eager mask segment */
945             if (!eager_in_use && eager_nelem > 0) {
946                 int i;
947                 for (i = 0; i < eager_nelem; i++)
948                     local_mask[i] = context_mask[i];
949 
950                 eager_in_use   = 1;
951                 own_eager_mask = 1;
952             }
953         }
954 
955         else {
956             /* lowestTag breaks ties when contextIds are the same (happens only
957                in calls to MPI_Comm_create_group. */
958             if (comm_ptr->context_id < lowestContextId ||
959                     (comm_ptr->context_id == lowestContextId && tag < lowestTag)) {
960                 lowestContextId = comm_ptr->context_id;
961                 lowestTag       = tag;
962             }
963 
964             if (mask_in_use || ! (comm_ptr->context_id == lowestContextId && tag == lowestTag)) {
965                 memset(local_mask, 0, MPIR_MAX_CONTEXT_MASK * sizeof(int));
966                 own_mask = 0;
967                 MPIU_DBG_MSG_D(COMM, VERBOSE, "In in-use, set lowestContextId to %d", lowestContextId);
968             }
969             else {
970                 int i;
971 
972                 /* Copy safe mask segment to local_mask */
973                 for (i = 0; i < eager_nelem; i++)
974                     local_mask[i] = 0;
975                 for (i = eager_nelem; i < MPIR_MAX_CONTEXT_MASK; i++)
976                     local_mask[i] = context_mask[i];
977 
978                 mask_in_use     = 1;
979                 own_mask        = 1;
980                 MPIU_DBG_MSG(COMM, VERBOSE, "Copied local_mask");
981             }
982         }
983         MPIU_THREAD_CS_EXIT(CONTEXTID,);
984 
985         /* Now, try to get a context id */
986         MPIU_Assert(comm_ptr->comm_kind == MPID_INTRACOMM);
987         /* In the global and brief-global cases, note that this routine will
988            release that global lock when it needs to wait.  That will allow
989            other processes to enter the global or brief global critical section.
990          */
991         if (group_ptr != NULL) {
992             int coll_tag = tag | MPIR_Process.tagged_coll_mask; /* Shift tag into the tagged coll space */
993             mpi_errno = MPIR_Allreduce_group(MPI_IN_PLACE, local_mask, MPIR_MAX_CONTEXT_MASK,
994                                              MPI_INT, MPI_BAND, comm_ptr, group_ptr, coll_tag, &errflag);
995         } else {
996             mpi_errno = MPIR_Allreduce_impl(MPI_IN_PLACE, local_mask, MPIR_MAX_CONTEXT_MASK,
997                                             MPI_INT, MPI_BAND, comm_ptr, &errflag);
998         }
999         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
1000         MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
1001 
1002         /* MT FIXME 2/3 cases don't seem to need the CONTEXTID CS, check and
1003          * narrow this region */
1004         MPIU_THREAD_CS_ENTER(CONTEXTID,);
1005         if (ignore_id) {
1006             /* we don't care what the value was, but make sure that everyone
1007              * who did care agreed on a value */
1008             *context_id = MPIR_Locate_context_bit(local_mask);
1009             /* used later in out-of-context ids check and outer while loop condition */
1010         }
1011         else if (own_eager_mask) {
1012             /* There is a chance that we've found a context id */
1013             /* Find_and_allocate_context_id updates the context_mask if it finds a match */
1014             *context_id = MPIR_Find_and_allocate_context_id(local_mask);
1015             MPIU_DBG_MSG_D(COMM, VERBOSE, "Context id is now %hd", *context_id);
1016 
1017             own_eager_mask = 0;
1018             eager_in_use   = 0;
1019 
1020             if (*context_id <= 0) {
1021                 /* else we did not find a context id. Give up the mask in case
1022                  * there is another thread (with a lower input context id)
1023                  * waiting for it.  We need to ensure that any other threads
1024                  * have the opportunity to run, hence yielding */
1025                 MPIU_THREAD_CS_YIELD(CONTEXTID,);
1026             }
1027         }
1028         else if (own_mask) {
1029             /* There is a chance that we've found a context id */
1030             /* Find_and_allocate_context_id updates the context_mask if it finds a match */
1031             *context_id = MPIR_Find_and_allocate_context_id(local_mask);
1032             MPIU_DBG_MSG_D(COMM, VERBOSE, "Context id is now %hd", *context_id);
1033 
1034             mask_in_use = 0;
1035 
1036             if (*context_id > 0) {
1037                 /* If we were the lowest context id, reset the value to
1038                    allow the other threads to compete for the mask */
1039                 if (lowestContextId == comm_ptr->context_id && lowestTag == tag) {
1040                     lowestContextId = MPIR_MAXID;
1041                     lowestTag       = -1;
1042                     /* Else leave it alone; there is another thread waiting */
1043                 }
1044             }
1045             else {
1046                 /* else we did not find a context id. Give up the mask in case
1047                  * there is another thread (with a lower input context id)
1048                  * waiting for it.  We need to ensure that any other threads
1049                  * have the opportunity to run, hence yielding */
1050                 MPIU_THREAD_CS_YIELD(CONTEXTID,);
1051             }
1052         }
1053         else {
1054             /* As above, force this thread to yield */
1055             MPIU_THREAD_CS_YIELD(CONTEXTID,);
1056         }
1057         MPIU_THREAD_CS_EXIT(CONTEXTID,);
1058 
1059         /* Here is the test for out-of-context ids */
1060         /* FIXME we may be able to "rotate" this a half iteration up to the top
1061          * where we already have to grab the lock */
1062         if ((testCount-- == 0) && (*context_id == 0)) {
1063             int hasNoId, totalHasNoId;
1064             int i;
1065             MPIU_THREAD_CS_ENTER(CONTEXTID,);
1066 
1067             /* Copy safe mask segment to local_mask */
1068             for (i = 0; i < eager_nelem; i++)
1069                 local_mask[i] = 0;
1070             for (i = eager_nelem; i < MPIR_MAX_CONTEXT_MASK; i++)
1071                 local_mask[i] = context_mask[i];
1072 
1073             hasNoId = MPIR_Locate_context_bit(local_mask) == 0;
1074             MPIU_THREAD_CS_EXIT(CONTEXTID,);
1075 
1076             /* we _must_ release the lock above in order to avoid deadlocking on
1077              * this blocking allreduce operation */
1078             if (group_ptr != NULL) {
1079                 int coll_tag = tag | MPIR_Process.tagged_coll_mask; /* Shift tag into the tagged coll space */
1080                 mpi_errno = MPIR_Allreduce_group(&hasNoId, &totalHasNoId, 1, MPI_INT,
1081                                                  MPI_MAX, comm_ptr, group_ptr, coll_tag, &errflag);
1082             } else {
1083                 mpi_errno = MPIR_Allreduce_impl(&hasNoId, &totalHasNoId, 1, MPI_INT,
1084                                                 MPI_MAX, comm_ptr, &errflag);
1085             }
1086             if (mpi_errno) MPIU_ERR_POP(mpi_errno);
1087             MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
1088             if (totalHasNoId == 1) {
1089                 /* --BEGIN ERROR HANDLING-- */
1090                 /* Release the mask for use by other threads */
1091                 if (own_mask) {
1092                     MPIU_THREAD_CS_ENTER(CONTEXTID,);
1093                     mask_in_use = 0;
1094                     MPIU_THREAD_CS_EXIT(CONTEXTID,);
1095                 }
1096                 MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**toomanycomm");
1097                 /* --END ERROR HANDLING-- */
1098             }
1099             else { /* reinitialize testCount */
1100                 testCount = NUM_CTX_TESTS;
1101                 MPIU_DBG_MSG_D(COMM, VERBOSE, "reinitialized testCount to %d", testCount);
1102             }
1103         }
1104 
1105         first_iter = 0;
1106     }
1107 
1108 
1109 fn_exit:
1110     if (ignore_id)
1111         *context_id = MPIR_INVALID_CONTEXT_ID;
1112     MPIU_DBG_MSG_S(COMM,VERBOSE,"Context mask = %s",MPIR_ContextMaskToStr());
1113     MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_GET_CONTEXTID);
1114     return mpi_errno;
1115 
1116     /* --BEGIN ERROR HANDLING-- */
1117 fn_fail:
1118     /* Release the masks */
1119     if (own_mask) {
1120         /* is it safe to access this without holding the CS? */
1121         mask_in_use = 0;
1122     }
1123     goto fn_exit;
1124     /* --END ERROR HANDLING-- */
1125 }
1126 
1127 #endif
1128 
/* State carried from gcn_sch's nonblocking allreduce to the gcn_helper
 * callback; allocated in gcn_sch and freed by a scheduled
 * MPIR_Sched_cb_free_buf callback. */
struct gcn_state {
    MPIR_Context_id_t *ctx0;   /* output slot for the new ID (may be NULL) */
    MPIR_Context_id_t *ctx1;   /* second output slot for the same ID (may be NULL) */
    /* this process's copy of the free-ID bitmask, reduced with MPI_BAND */
    uint32_t local_mask[MPIR_MAX_CONTEXT_MASK];
};
1134 
1135 #undef FUNCNAME
1136 #define FUNCNAME gcn_helper
1137 #undef FCNAME
1138 #define FCNAME MPIU_QUOTE(FUNCNAME)
gcn_helper(MPID_Comm * comm,int tag,void * state)1139 static int gcn_helper(MPID_Comm *comm, int tag, void *state)
1140 {
1141     int mpi_errno = MPI_SUCCESS;
1142     struct gcn_state *st = state;
1143     MPIR_Context_id_t newctxid;
1144 
1145     newctxid = MPIR_Find_and_allocate_context_id(st->local_mask);
1146     MPIU_ERR_CHKANDJUMP(!newctxid, mpi_errno, MPIR_ERR_RECOVERABLE, "**toomanycomm");
1147 
1148     if (st->ctx0)
1149         *st->ctx0 = newctxid;
1150     if (st->ctx1)
1151         *st->ctx1 = newctxid;
1152 
1153 fn_fail:
1154     return mpi_errno;
1155 }
1156 
1157 
/* Does the meat of the algorithm, adds the relevant entries to the schedule.
 * Assigns the resulting value to *ctx0 and *ctx1, as long as those respective
 * pointers are non-NULL. */
/* FIXME this version only works for single-threaded code, it will totally fail
 * for any multithreaded communicator creation */
#undef FUNCNAME
#define FUNCNAME gcn_sch
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
static int gcn_sch(MPID_Comm *comm_ptr, MPIR_Context_id_t *ctx0, MPIR_Context_id_t *ctx1, MPID_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;
    struct gcn_state *st = NULL;
    MPIU_CHKPMEM_DECL(1);

    /* intercomm IDs are obtained via the *_intercomm_* routines; this helper
     * only handles intracommunicators */
    MPIU_Assert(comm_ptr->comm_kind == MPID_INTRACOMM);

    /* refuse to run multithreaded: context_mask is copied below without any
     * mask_in_use-style reservation, so concurrent callers could race */
    MPIU_ERR_CHKANDJUMP(MPIU_ISTHREADED, mpi_errno, MPI_ERR_INTERN, "**notsuppmultithread");

    /* first do as much local setup as we can */
    if (initialize_context_mask) {
        MPIR_Init_contextid();
    }

    /* st is freed later by the MPIR_Sched_cb_free_buf callback scheduled below */
    MPIU_CHKPMEM_MALLOC(st, struct gcn_state *, sizeof(struct gcn_state), mpi_errno, "gcn_state");
    st->ctx0 = ctx0;
    st->ctx1 = ctx1;
    MPIU_Memcpy(st->local_mask, context_mask, MPIR_MAX_CONTEXT_MASK * sizeof(uint32_t));

    /* nonblocking AND-reduction of the free-ID masks across the communicator */
    mpi_errno = comm_ptr->coll_fns->Iallreduce(MPI_IN_PLACE, st->local_mask, MPIR_MAX_CONTEXT_MASK,
                                               MPI_UINT32_T, MPI_BAND, comm_ptr, s);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    MPID_SCHED_BARRIER(s);

    /* once the reduced mask is available, pick the ID and store it in
     * *ctx0/*ctx1 (see gcn_helper) */
    mpi_errno = MPID_Sched_cb(&gcn_helper, st, s);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    MPID_SCHED_BARRIER(s);

    mpi_errno = MPID_Sched_cb(&MPIR_Sched_cb_free_buf, st, s);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    MPIU_CHKPMEM_COMMIT();
fn_exit:
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
1210 
1211 
#undef FUNCNAME
#define FUNCNAME MPIR_Get_contextid_nonblock
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
/* Nonblocking context-id allocation over an intracommunicator.  Builds and
 * starts a schedule that, on completion, has stored the new ID into both
 * newcommp->context_id and newcommp->recvcontext_id (identical for an
 * intracomm; see gcn_sch).  The schedule's request is returned in *req; the
 * IDs are not valid until that request completes. */
int MPIR_Get_contextid_nonblock(MPID_Comm *comm_ptr, MPID_Comm *newcommp, MPID_Request **req)
{
    int mpi_errno = MPI_SUCCESS;
    int tag;
    MPID_Sched_t s;

    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_GET_CONTEXTID_NONBLOCK);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_GET_CONTEXTID_NONBLOCK);

    /* now create a schedule */
    mpi_errno = MPID_Sched_next_tag(comm_ptr, &tag);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    mpi_errno = MPID_Sched_create(&s);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* add some entries to it */
    mpi_errno = gcn_sch(comm_ptr, &newcommp->context_id, &newcommp->recvcontext_id, s);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* finally, kick off the schedule and give the caller a request */
    mpi_errno = MPID_Sched_start(&s, comm_ptr, tag, req);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

fn_exit:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_GET_CONTEXTID_NONBLOCK);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
1248 
1249 #undef FUNCNAME
1250 #define FUNCNAME MPIR_Get_intercomm_contextid_nonblock
1251 #undef FCNAME
1252 #define FCNAME MPIU_QUOTE(FUNCNAME)
MPIR_Get_intercomm_contextid_nonblock(MPID_Comm * comm_ptr,MPID_Comm * newcommp,MPID_Request ** req)1253 int MPIR_Get_intercomm_contextid_nonblock(MPID_Comm *comm_ptr, MPID_Comm *newcommp, MPID_Request **req)
1254 {
1255     int mpi_errno = MPI_SUCCESS;
1256     int tag;
1257     MPID_Sched_t s;
1258     MPID_Comm *lcomm = NULL;
1259     MPID_MPI_STATE_DECL(MPID_STATE_MPIR_GET_INTERCOMM_CONTEXTID_NONBLOCK);
1260 
1261     MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_GET_INTERCOMM_CONTEXTID_NONBLOCK);
1262 
1263     /* do as much local setup as possible */
1264     if (!comm_ptr->local_comm) {
1265         mpi_errno = MPIR_Setup_intercomm_localcomm(comm_ptr);
1266         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
1267     }
1268     lcomm = comm_ptr->local_comm;
1269 
1270     /* now create a schedule */
1271     mpi_errno = MPID_Sched_next_tag(comm_ptr, &tag);
1272     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
1273     mpi_errno = MPID_Sched_create(&s);
1274     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
1275 
1276     /* add some entries to it */
1277 
1278     /* first get a context ID over the local comm */
1279     mpi_errno = gcn_sch(lcomm, &newcommp->context_id, NULL, s);
1280     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
1281 
1282     MPID_SCHED_BARRIER(s);
1283 
1284     if (comm_ptr->rank == 0) {
1285         newcommp->recvcontext_id = -1;
1286         mpi_errno = MPID_Sched_recv(&newcommp->recvcontext_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0, comm_ptr, s);
1287         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
1288         mpi_errno = MPID_Sched_send(&newcommp->context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0, comm_ptr, s);
1289         if (mpi_errno) MPIU_ERR_POP(mpi_errno);
1290         MPID_SCHED_BARRIER(s);
1291     }
1292 
1293     mpi_errno = lcomm->coll_fns->Ibcast(&newcommp->recvcontext_id, 1,
1294                                         MPIR_CONTEXT_ID_T_DATATYPE, 0, lcomm, s);
1295     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
1296 
1297     /* finally, kick off the schedule and give the caller a request */
1298     mpi_errno = MPID_Sched_start(&s, comm_ptr, tag, req);
1299     if (mpi_errno) MPIU_ERR_POP(mpi_errno);
1300 
1301 fn_fail:
1302     MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_GET_INTERCOMM_CONTEXTID_NONBLOCK);
1303     return mpi_errno;
1304 }
1305 
1306 
1307 /* Get a context for a new intercomm.  There are two approaches
1308    here (for MPI-1 codes only)
1309    (a) Each local group gets a context; the groups exchange, and
1310        the low value is accepted and the high one returned.  This
1311        works because the context ids are taken from the same pool.
1312    (b) Form a temporary intracomm over all processes and use that
1313        with the regular algorithm.
1314 
1315    In some ways, (a) is the better approach because it is the one that
1316    extends to MPI-2 (where the last step, returning the context, is
1317    not used and instead separate send and receive context id value
1318    are kept).  For this reason, we'll use (a).
1319 
1320    Even better is to separate the local and remote context ids.  Then
1321    each group of processes can manage their context ids separately.
1322 */
/*
 * This uses the thread-safe (if necessary) routine to get a context id
 * and does not need its own thread-safe version.
 */
#undef FUNCNAME
#define FUNCNAME MPIR_Get_intercomm_contextid
#undef FCNAME
#define FCNAME "MPIR_Get_intercomm_contextid"
/* Blocking context-id allocation for an intercommunicator (approach (a) from
 * the comment above).  Each side allocates an ID over its local group; the
 * two local leaders (rank 0 of each group) swap IDs, and the remote value is
 * broadcast within the local group.  On return, *context_id is the send-side
 * ID (allocated by the remote group) and *recvcontext_id is the receive-side
 * ID (allocated by the local group). */
int MPIR_Get_intercomm_contextid( MPID_Comm *comm_ptr, MPIR_Context_id_t *context_id,
				  MPIR_Context_id_t *recvcontext_id )
{
    MPIR_Context_id_t mycontext_id, remote_context_id;
    int mpi_errno = MPI_SUCCESS;
    int tag = 31567; /* FIXME  - we need an internal tag or
		        communication channel.  Can we use a different
		        context instead?.  Or can we use the tag
		        provided in the intercomm routine? (not on a dup,
			but in that case it can use the collective context) */
    int errflag = FALSE;
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_GET_INTERCOMM_CONTEXTID);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_GET_INTERCOMM_CONTEXTID);

    if (!comm_ptr->local_comm) {
        /* Manufacture the local communicator */
        mpi_errno = MPIR_Setup_intercomm_localcomm( comm_ptr );
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

    /* collective over the local group only */
    mpi_errno = MPIR_Get_contextid( comm_ptr->local_comm, &mycontext_id );
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    MPIU_Assert(mycontext_id != 0);

    /* MPIC routine uses an internal context id.  The local leads (process 0)
       exchange data */
    remote_context_id = -1;
    if (comm_ptr->rank == 0) {
        /* rank 0 of each side talks to rank 0 of the other side over the
           intercomm itself */
        mpi_errno = MPIC_Sendrecv( &mycontext_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0, tag,
                                   &remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0, tag,
                                   comm_ptr->handle, MPI_STATUS_IGNORE );
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

    /* Make sure that all of the local processes now have this
       id */
    mpi_errno = MPIR_Bcast_impl( &remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE,
                                 0, comm_ptr->local_comm, &errflag );
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
    /* The recvcontext_id must be the one that was allocated out of the local
     * group, not the remote group.  Otherwise we could end up posting two
     * MPI_ANY_SOURCE,MPI_ANY_TAG recvs on the same context IDs even though we
     * are attempting to post them for two separate communicators. */
    *context_id     = remote_context_id;
    *recvcontext_id = mycontext_id;
 fn_fail:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_GET_INTERCOMM_CONTEXTID);
    return mpi_errno;
}
1382 
#undef FUNCNAME
#define FUNCNAME MPIR_Free_contextid
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
/* Returns a context id to the free pool by setting its bit in context_mask
 * (a set bit means "free").  Dynamic-process IDs, localcomm IDs, and subcomm
 * (intranode/internode) IDs are deliberately not freed: they either have no
 * mask bit of their own or share their bit with a parent communicator that
 * frees it. */
void MPIR_Free_contextid( MPIR_Context_id_t context_id )
{
    int idx, bitpos, raw_prefix;
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_FREE_CONTEXTID);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_FREE_CONTEXTID);

    /* Convert the context id to the bit position */
    raw_prefix = MPID_CONTEXT_READ_FIELD(PREFIX,context_id);
    idx    = raw_prefix / MPIR_CONTEXT_INT_BITS;   /* word index into context_mask */
    bitpos = raw_prefix % MPIR_CONTEXT_INT_BITS;   /* bit within that word */

    /* --BEGIN ERROR HANDLING-- */
    if (idx < 0 || idx >= MPIR_MAX_CONTEXT_MASK) {
	MPID_Abort( 0, MPI_ERR_INTERN, 1,
		    "In MPIR_Free_contextid, idx is out of range" );
    }
    /* --END ERROR HANDLING-- */

    /* The low order bits for dynamic context IDs don't have meaning the
     * same way that low bits of non-dynamic ctx IDs do.  So we have to
     * check the dynamic case first. */
    if (MPID_CONTEXT_READ_FIELD(DYNAMIC_PROC, context_id)) {
        MPIU_DBG_MSG_D(COMM,VERBOSE,"skipping dynamic process ctx id, context_id=%d", context_id);
        goto fn_exit;
    }
    else { /* non-dynamic context ID */
        /* In terms of the context ID bit vector, intercomms and their constituent
         * localcomms have the same value.  To avoid a double-free situation we just
         * don't free the context ID for localcomms and assume it will be cleaned up
         * when the parent intercomm is itself completely freed. */
        if (MPID_CONTEXT_READ_FIELD(IS_LOCALCOMM, context_id)) {
#ifdef USE_DBG_LOGGING
            char dump_str[1024];
            MPIR_Comm_dump_context_id(context_id, dump_str, sizeof(dump_str));
            MPIU_DBG_MSG_S(COMM,VERBOSE,"skipping localcomm id: %s", dump_str);
#endif
            goto fn_exit;
        }
        else if (MPID_CONTEXT_READ_FIELD(SUBCOMM, context_id)) {
            MPIU_DBG_MSG_D(COMM,VERBOSE,"skipping non-parent communicator ctx id, context_id=%d", context_id);
            goto fn_exit;
        }
    }

    /* --BEGIN ERROR HANDLING-- */
    /* Check that this context id has been allocated: a set bit here means the
     * id is already free, i.e. a double free or a bogus id.
     * NOTE(review): this read happens outside the CONTEXTID critical section
     * entered below -- presumably callers serialize frees; confirm. */
    if ( (context_mask[idx] & (0x1 << bitpos)) != 0 ) {
#ifdef USE_DBG_LOGGING
        char dump_str[1024];
        MPIR_Comm_dump_context_id(context_id, dump_str, sizeof(dump_str));
        MPIU_DBG_MSG_S(COMM,VERBOSE,"context dump: %s", dump_str);
        MPIU_DBG_MSG_S(COMM,VERBOSE,"context mask = %s",MPIR_ContextMaskToStr());
#endif
	MPID_Abort( 0, MPI_ERR_INTERN, 1,
		    "In MPIR_Free_contextid, the context id is not in use" );
    }
    /* --END ERROR HANDLING-- */

    MPIU_THREAD_CS_ENTER(CONTEXTID,);
    /* MT: Note that this update must be done atomically in the multithreaded
       case.  In the "one, single lock" implementation, that lock is indeed
       held when this operation is called. */
    context_mask[idx] |= (0x1 << bitpos);
    MPIU_THREAD_CS_EXIT(CONTEXTID,);

    MPIU_DBG_MSG_FMT(COMM,VERBOSE,
                     (MPIU_DBG_FDEST,
                      "Freed context %d, mask[%d] bit %d (prefix=%#x)",
                      context_id, idx, bitpos, raw_prefix));
fn_exit:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_FREE_CONTEXTID);
}
1460 
1461 /*
1462  * Copy a communicator, including creating a new context and copying the
1463  * virtual connection tables and clearing the various fields.
1464  * Does *not* copy attributes.  If size is < the size of the local group
1465  * in the input communicator, copy only the first size elements.
1466  * If this process is not a member, return a null pointer in outcomm_ptr.
1467  * This is only supported in the case where the communicator is in
1468  * Intracomm (not an Intercomm).  Note that this is all that is required
1469  * for cart_create and graph_create.
1470  *
1471  * Used by cart_create, graph_create, and dup_create
1472  */
1473 #undef FUNCNAME
1474 #define FUNCNAME MPIR_Comm_copy
1475 #undef FCNAME
1476 #define FCNAME "MPIR_Comm_copy"
int MPIR_Comm_copy( MPID_Comm *comm_ptr, int size, MPID_Comm **outcomm_ptr )
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Context_id_t new_context_id, new_recvcontext_id;
    MPID_Comm *newcomm_ptr = NULL;
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_COPY);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_COPY);

    /* Context-id allocation is collective over the input communicator, so
       it must come first, before any rank-dependent early exit below.
       Intercomms need distinct send/recv context ids. */
    if (comm_ptr->comm_kind != MPID_INTERCOMM) {
        mpi_errno = MPIR_Get_contextid( comm_ptr, &new_context_id );
        new_recvcontext_id = new_context_id;
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        MPIU_Assert(new_context_id != 0);
    }
    else {
        mpi_errno = MPIR_Get_intercomm_contextid( comm_ptr, &new_context_id,
                                                  &new_recvcontext_id );
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }
    /* --BEGIN ERROR HANDLING-- */
    if (new_context_id == 0) {
        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**toomanycomm" );
    }
    /* --END ERROR HANDLING-- */

    /* Ranks beyond the requested size get no communicator.  (This compares
       against the local size, not the remote size, for an intercomm.) */
    if (comm_ptr->rank >= size) {
        *outcomm_ptr = 0;
        /* always free the recvcontext ID, never the "send" ID */
        MPIR_Free_contextid(new_recvcontext_id);
        goto fn_exit;
    }

    /* Remaining processes build the new communicator object. */
    mpi_errno = MPIR_Comm_create( &newcomm_ptr );
    if (mpi_errno) goto fn_fail;
    /* NOTE(review): if creation or commit below fails, the context ids
       acquired above are not released on the error path -- confirm whether
       that leak is acceptable here. */

    newcomm_ptr->context_id     = new_context_id;
    newcomm_ptr->recvcontext_id = new_recvcontext_id;

    /* Carry over the communicator kind; no local comm yet */
    newcomm_ptr->comm_kind  = comm_ptr->comm_kind;
    newcomm_ptr->local_comm = 0;

    /* Either share the old VCRT (same size) or build a truncated one.
       This size test matches the rank test above. */
    if (size != comm_ptr->local_size) {
        int i;
        /* The "remote" vcr gets the shortened vcrt */
        MPID_VCRT_Create( size, &newcomm_ptr->vcrt );
        MPID_VCRT_Get_ptr( newcomm_ptr->vcrt,
                           &newcomm_ptr->vcr );
        for (i = 0; i < size; i++) {
            /* Rank i of the new communicator maps to rank i of the input */
            MPID_VCR_Dup( comm_ptr->vcr[i], &newcomm_ptr->vcr[i] );
        }
    }
    else {
        /* Same size: just add a reference to the existing VCRT */
        MPID_VCRT_Add_ref( comm_ptr->vcrt );
        newcomm_ptr->vcrt = comm_ptr->vcrt;
        newcomm_ptr->vcr  = comm_ptr->vcr;
    }

    /* Intercomms additionally carry a local VCRT; share it by reference */
    if (comm_ptr->comm_kind == MPID_INTERCOMM) {
        MPID_VCRT_Add_ref( comm_ptr->local_vcrt );
        newcomm_ptr->local_vcrt = comm_ptr->local_vcrt;
        newcomm_ptr->local_vcr  = comm_ptr->local_vcr;
    }

    /* Sizes and ranks */
    newcomm_ptr->rank = comm_ptr->rank;
    if (comm_ptr->comm_kind == MPID_INTERCOMM) {
        newcomm_ptr->local_size   = comm_ptr->local_size;
        newcomm_ptr->remote_size  = comm_ptr->remote_size;
        newcomm_ptr->is_low_group = comm_ptr->is_low_group;
    }
    else {
        newcomm_ptr->local_size  = size;
        newcomm_ptr->remote_size = size;
    }

    /* Inherit the error handler (if any) */
    MPIU_THREAD_CS_ENTER(MPI_OBJ, comm_ptr);
    newcomm_ptr->errhandler = comm_ptr->errhandler;
    if (comm_ptr->errhandler) {
        MPIR_Errhandler_add_ref( comm_ptr->errhandler );
    }
    MPIU_THREAD_CS_EXIT(MPI_OBJ, comm_ptr);

    /* FIXME do we want to copy coll_fns here? */

    mpi_errno = MPIR_Comm_commit(newcomm_ptr);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* Start with no attributes on this communicator */
    newcomm_ptr->attributes = 0;
    *outcomm_ptr = newcomm_ptr;

 fn_fail:
 fn_exit:

    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_COMM_COPY);

    return mpi_errno;
}
1597 
1598 /* Copy a communicator, including copying the virtual connection tables and
1599  * clearing the various fields.  Does *not* allocate a context ID or commit the
1600  * communicator.  Does *not* copy attributes.
1601  *
1602  * Used by comm_idup.
1603  */
1604 #undef FUNCNAME
1605 #define FUNCNAME MPIR_Comm_copy_data
1606 #undef FCNAME
1607 #define FCNAME MPIU_QUOTE(FUNCNAME)
/* Copy the data (kind, VC tables, sizes/ranks, error handler) of comm_ptr
 * into a newly created communicator.  Per the documented contract, the new
 * communicator is *not* committed and is given deliberately invalid context
 * IDs -- the caller (comm_idup) must install real context IDs and then call
 * MPIR_Comm_commit itself. */
int MPIR_Comm_copy_data(MPID_Comm *comm_ptr, MPID_Comm **outcomm_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm *newcomm_ptr = NULL;
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_COPY_DATA);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_COPY_DATA);

    mpi_errno = MPIR_Comm_create(&newcomm_ptr);
    if (mpi_errno) goto fn_fail;

    /* use a large garbage value to ensure errors are caught more easily */
    newcomm_ptr->context_id     = 32767;
    newcomm_ptr->recvcontext_id = 32767;

    /* Save the kind of the communicator */
    newcomm_ptr->comm_kind  = comm_ptr->comm_kind;
    newcomm_ptr->local_comm = 0;

    /* Duplicate the VCRT references */
    MPID_VCRT_Add_ref(comm_ptr->vcrt);
    newcomm_ptr->vcrt = comm_ptr->vcrt;
    newcomm_ptr->vcr  = comm_ptr->vcr;

    /* If it is an intercomm, duplicate the local vcrt references */
    if (comm_ptr->comm_kind == MPID_INTERCOMM) {
        MPID_VCRT_Add_ref(comm_ptr->local_vcrt);
        newcomm_ptr->local_vcrt = comm_ptr->local_vcrt;
        newcomm_ptr->local_vcr  = comm_ptr->local_vcr;
    }

    /* Set the sizes and ranks */
    newcomm_ptr->rank         = comm_ptr->rank;
    newcomm_ptr->local_size   = comm_ptr->local_size;
    newcomm_ptr->remote_size  = comm_ptr->remote_size;
    newcomm_ptr->is_low_group = comm_ptr->is_low_group; /* only relevant for intercomms */

    /* Inherit the error handler (if any) */
    MPIU_THREAD_CS_ENTER(MPI_OBJ, comm_ptr);
    newcomm_ptr->errhandler = comm_ptr->errhandler;
    if (comm_ptr->errhandler) {
        MPIR_Errhandler_add_ref(comm_ptr->errhandler);
    }
    MPIU_THREAD_CS_EXIT(MPI_OBJ, comm_ptr);

    /* FIXME do we want to copy coll_fns here? */

    /* BUGFIX: previously this function called MPIR_Comm_commit() here,
     * which committed the communicator with the garbage context IDs set
     * above and violated the stated contract ("Does *not* allocate a
     * context ID or commit the communicator").  Committing must be
     * deferred until the caller has installed valid context IDs. */

    /* Start with no attributes on this communicator */
    newcomm_ptr->attributes = 0;
    *outcomm_ptr = newcomm_ptr;

fn_fail:
fn_exit:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_COMM_COPY_DATA);
    return mpi_errno;
}
1667 /* Common body between MPIR_Comm_release and MPIR_comm_release_always.  This
1668  * helper function frees the actual MPID_Comm structure and any associated
 * storage.  It also releases any references to other objects, such as the VCRT.
1670  * This function should only be called when the communicator's reference count
1671  * has dropped to 0.
1672  *
1673  * !!! This routine should *never* be called outside of MPIR_Comm_release{,_always} !!!
1674  */
1675 #undef FUNCNAME
1676 #define FUNCNAME MPIR_Comm_delete_internal
1677 #undef FCNAME
1678 #define FCNAME MPIU_QUOTE(FUNCNAME)
/* Tear down comm_ptr and everything hanging off it: attributes, device
 * state, collops, VCRTs, subcommunicators, groups, context id, error
 * handler, and finally the MPID_Comm object itself.  isDisconnect is
 * forwarded to the VCRT release calls and to the recursive
 * MPIR_Comm_release calls on subcommunicators.  The reference count must
 * already be zero on entry (asserted below). */
int MPIR_Comm_delete_internal(MPID_Comm * comm_ptr, int isDisconnect)
{
    int in_use;    /* out-param for the temporary release_ref below */
    int mpi_errno = MPI_SUCCESS;
    MPID_MPI_STATE_DECL(MPID_STATE_COMM_DELETE_INTERNAL);

    MPID_MPI_FUNC_ENTER(MPID_STATE_COMM_DELETE_INTERNAL);

    MPIU_Assert(MPIU_Object_get_ref(comm_ptr) == 0); /* sanity check */

    /* Remove the attributes, executing the attribute delete routine.
       Do this only if the attribute functions are defined.
       This must be done first, because if freeing the attributes
       returns an error, the communicator is not freed */
    if (MPIR_Process.attr_free && comm_ptr->attributes) {
        /* Temporarily add a reference to this communicator because
           the attr_free code requires a valid communicator */
        MPIU_Object_add_ref( comm_ptr );
        mpi_errno = MPIR_Process.attr_free( comm_ptr->handle,
                                            &comm_ptr->attributes );
        /* Release the temporary reference added before the call to
           attr_free */
        MPIU_Object_release_ref( comm_ptr, &in_use);
    }

    /* If the attribute delete functions return failure, the
       communicator must not be freed.  That is the reason for the
       test on mpi_errno here. */
    if (mpi_errno == MPI_SUCCESS) {
        /* If this communicator is our parent, and we're disconnecting
           from the parent, mark that fact */
        if (MPIR_Process.comm_parent == comm_ptr)
            MPIR_Process.comm_parent = NULL;

        /* Notify the device that the communicator is about to be
           destroyed */
        mpi_errno = MPID_Dev_comm_destroy_hook(comm_ptr);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        /* release our reference to the collops structure, comes after the
         * destroy_hook to allow the device to manage these vtables in a custom
         * fashion */
        if (comm_ptr->coll_fns && --comm_ptr->coll_fns->ref_count == 0) {
            MPIU_Free(comm_ptr->coll_fns);
            comm_ptr->coll_fns = NULL;
        }

        /* Free the VCRT */
        mpi_errno = MPID_VCRT_Release(comm_ptr->vcrt, isDisconnect);
        if (mpi_errno != MPI_SUCCESS) {
            MPIU_ERR_POP(mpi_errno);
        }
        /* Intercomms also hold a local VCRT and possibly a local comm;
           release those too */
        if (comm_ptr->comm_kind == MPID_INTERCOMM) {
            mpi_errno = MPID_VCRT_Release(
                                          comm_ptr->local_vcrt, isDisconnect);
            if (mpi_errno != MPI_SUCCESS) {
                MPIU_ERR_POP(mpi_errno);
            }
            if (comm_ptr->local_comm)
                MPIR_Comm_release(comm_ptr->local_comm, isDisconnect );
        }

        /* Free the local and remote groups, if they exist */
        if (comm_ptr->local_group)
            MPIR_Group_release(comm_ptr->local_group);
        if (comm_ptr->remote_group)
            MPIR_Group_release(comm_ptr->remote_group);

        /* free the intra/inter-node communicators, if they exist */
        if (comm_ptr->node_comm)
            MPIR_Comm_release(comm_ptr->node_comm, isDisconnect);
        if (comm_ptr->node_roots_comm)
            MPIR_Comm_release(comm_ptr->node_roots_comm, isDisconnect);
        if (comm_ptr->intranode_table != NULL)
            MPIU_Free(comm_ptr->intranode_table);
        if (comm_ptr->internode_table != NULL)
            MPIU_Free(comm_ptr->internode_table);

        /* Free the context value.  This should come after freeing the
         * intra/inter-node communicators since those free calls won't
         * release this context ID and releasing this before then could lead
         * to races once we make threading finer grained. */
        /* This must be the recvcontext_id (i.e. not the (send)context_id)
         * because in the case of intercommunicators the send context ID is
         * allocated out of the remote group's bit vector, not ours. */
        MPIR_Free_contextid( comm_ptr->recvcontext_id );

        /* We need to release the error handler */
        /* no MPI_OBJ CS needed */
        if (comm_ptr->errhandler &&
            ! (HANDLE_GET_KIND(comm_ptr->errhandler->handle) ==
               HANDLE_KIND_BUILTIN) ) {
            int errhInuse;
            MPIR_Errhandler_release_ref( comm_ptr->errhandler,&errhInuse);
            if (!errhInuse) {
                MPIU_Handle_obj_free( &MPID_Errhandler_mem,
                                      comm_ptr->errhandler );
            }
        }

        /* Remove from the list of active communicators if
           we are supporting message-queue debugging.  We make this
           conditional on having debugger support since the
           operation is not constant-time */
        MPIR_COMML_FORGET( comm_ptr );

        /* Check for predefined communicators - these should not
           be freed */
        if (! (HANDLE_GET_KIND(comm_ptr->handle) == HANDLE_KIND_BUILTIN) )
            MPIU_Handle_obj_free( &MPID_Comm_mem, comm_ptr );
    }
    else {
        /* If the user attribute free function returns an error,
           then do not free the communicator */
        /* Restore the reference dropped by the caller so the object
           stays alive */
        MPIR_Comm_add_ref( comm_ptr );
    }

 fn_exit:
    MPID_MPI_FUNC_EXIT(MPID_STATE_COMM_DELETE_INTERNAL);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
1802 
1803 /* Release a reference to a communicator.  If there are no pending
1804    references, delete the communicator and recover all storage and
1805    context ids.  This version of the function always manipulates the reference
1806    counts, even for predefined objects. */
1807 #undef FUNCNAME
1808 #define FUNCNAME MPIR_Comm_release_always
1809 #undef FCNAME
1810 #define FCNAME MPIU_QUOTE(FUNCNAME)
/* Drop one reference to comm_ptr, counting even for predefined
 * communicators, and delete it when the count reaches zero. */
int MPIR_Comm_release_always(MPID_Comm *comm_ptr, int isDisconnect)
{
    int mpi_errno = MPI_SUCCESS;
    int still_referenced;
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_RELEASE_ALWAYS);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_RELEASE_ALWAYS);

    /* Deliberately bypass any optimization that skips reference counting
     * on predefined communicators such as MPI_COMM_WORLD or MPI_COMM_SELF */
    MPIU_Object_release_ref_always(comm_ptr, &still_referenced);
    if (still_referenced == 0) {
        mpi_errno = MPIR_Comm_delete_internal(comm_ptr, isDisconnect);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }

 fn_exit:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_COMM_RELEASE_ALWAYS);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
1833 
1834