/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

#include "mpiimpl.h"

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_IALLGATHER_RECEXCH_KVAL
      category    : COLLECTIVE
      type        : int
      default     : 2
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        k value for recursive exchange based iallgather

    - name        : MPIR_CVAR_IALLGATHER_BRUCKS_KVAL
      category    : COLLECTIVE
      type        : int
      default     : 2
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        k value for radix in brucks based iallgather

    - name        : MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM
      category    : COLLECTIVE
      type        : enum
      default     : auto
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : |-
        Variable to select iallgather algorithm
        auto - Internal algorithm selection (can be overridden with MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE)
        sched_auto - Internal algorithm selection for sched-based algorithms
        sched_ring               - Force ring algorithm
        sched_brucks             - Force brucks algorithm
        sched_recursive_doubling - Force recursive doubling algorithm
        gentran_ring       - Force generic transport ring algorithm
        gentran_brucks     - Force generic transport based brucks algorithm
        gentran_recexch_doubling - Force generic transport recursive exchange with neighbours doubling in distance in each phase
        gentran_recexch_halving  - Force generic transport recursive exchange with neighbours halving in distance in each phase

    - name        : MPIR_CVAR_IALLGATHER_INTER_ALGORITHM
      category    : COLLECTIVE
      type        : enum
      default     : auto
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : |-
        Variable to select iallgather algorithm
        auto - Internal algorithm selection (can be overridden with MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE)
        sched_auto - Internal algorithm selection for sched-based algorithms
        sched_local_gather_remote_bcast - Force local-gather-remote-bcast algorithm

    - name        : MPIR_CVAR_IALLGATHER_DEVICE_COLLECTIVE
      category    : COLLECTIVE
      type        : boolean
      default     : true
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        This CVAR is only used when MPIR_CVAR_DEVICE_COLLECTIVES
        is set to "percoll".  If set to true, MPI_Iallgather will
        allow the device to override the MPIR-level collective
        algorithms.  The device might still call the MPIR-level
        algorithms manually.  If set to false, the device-override
        will be disabled.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/
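
/* Usage note (illustrative only, not part of the CVAR block above): MPICH
 * control variables can normally be set through environment variables of the
 * same name.  For example, to force the generic-transport brucks algorithm
 * with radix 4 for iallgather, one might run (launcher syntax is
 * installation-dependent):
 *
 *   MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM=gentran_brucks \
 *   MPIR_CVAR_IALLGATHER_BRUCKS_KVAL=4 \
 *   mpiexec -n 8 ./my_app
 */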

/* -- Begin Profiling Symbol Block for routine MPI_Iallgather */
#if defined(HAVE_PRAGMA_WEAK)
#pragma weak MPI_Iallgather = PMPI_Iallgather
#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
#pragma _HP_SECONDARY_DEF PMPI_Iallgather  MPI_Iallgather
#elif defined(HAVE_PRAGMA_CRI_DUP)
#pragma _CRI duplicate MPI_Iallgather as PMPI_Iallgather
#elif defined(HAVE_WEAK_ATTRIBUTE)
int MPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
                   int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request * request)
    __attribute__ ((weak, alias("PMPI_Iallgather")));
#endif
/* -- End Profiling Symbol Block */
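
/* Illustrative sketch (not built as part of this file): the profiling symbols
 * above exist so that a tool can interpose on MPI_Iallgather and forward to
 * the real implementation through PMPI_Iallgather, e.g.:
 *
 *   int MPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
 *                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
 *                      MPI_Comm comm, MPI_Request *request)
 *   {
 *       // tool-side instrumentation goes here
 *       return PMPI_Iallgather(sendbuf, sendcount, sendtype,
 *                              recvbuf, recvcount, recvtype, comm, request);
 *   }
 */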

/* Define MPICH_MPI_FROM_PMPI if weak symbols are not supported to build
   the MPI routines */
#ifndef MPICH_MPI_FROM_PMPI
#undef MPI_Iallgather
#define MPI_Iallgather PMPI_Iallgather

/* This is the machine-independent implementation of allgather. The algorithm is:

   Algorithm: MPI_Allgather

   For short messages and a non-power-of-two number of processes, we use
   the algorithm from the Jehoshua Bruck et al. IEEE TPDS Nov. 1997
   paper. It is a variant of the dissemination algorithm for
   barrier. It takes ceiling(lg p) steps.

   Cost = lgp.alpha + n.((p-1)/p).beta
   where n is the total size of the data gathered on each process.

   For short or medium-size messages and a power-of-two number of
   processes, we use the recursive doubling algorithm.

   Cost = lgp.alpha + n.((p-1)/p).beta

   TODO: On TCP, we may want to use recursive doubling instead of the Bruck
   algorithm in all cases because of the pairwise-exchange property of
   recursive doubling (see the Benson et al. paper in Euro PVM/MPI
   2003).

   It is interesting to note that either of the above algorithms for
   MPI_Allgather has the same cost as the tree algorithm for MPI_Gather!

   For long messages, or medium-size messages with a non-power-of-two
   number of processes, we use a ring algorithm. In the first step, each
   process i sends its contribution to process i+1 and receives
   the contribution from process i-1 (with wrap-around). From the
   second step onwards, each process i forwards to process i+1 the
   data it received from process i-1 in the previous step. This takes
   a total of p-1 steps.

   Cost = (p-1).alpha + n.((p-1)/p).beta

   We use this algorithm instead of recursive doubling for long
   messages because we find that this communication pattern (nearest
   neighbor) performs twice as fast as recursive doubling for long
   messages (on Myrinet and IBM SP).

   Possible improvements:

   End Algorithm: MPI_Allgather
*/
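
/* Illustrative sketch of the ring step indexing described above (block
 * bookkeeping only, assuming contiguous equal-size blocks; the actual schedule
 * construction lives in MPIR_Iallgather_intra_sched_ring):
 *
 *   int right = (rank + 1) % p;
 *   int left  = (rank - 1 + p) % p;
 *   for (int i = 0; i < p - 1; i++) {
 *       int send_block = (rank - i + p) % p;      // block forwarded this step
 *       int recv_block = (rank - i - 1 + p) % p;  // block arriving from 'left'
 *       // send block 'send_block' to 'right', receive block 'recv_block' from 'left'
 *   }
 */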

int MPIR_Iallgather_allcomm_auto(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                                 void *recvbuf, int recvcount, MPI_Datatype recvtype,
                                 MPIR_Comm * comm_ptr, MPIR_Request ** request)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_Csel_coll_sig_s coll_sig = {
        .coll_type = MPIR_CSEL_COLL_TYPE__IALLGATHER,
        .comm_ptr = comm_ptr,

        .u.iallgather.sendbuf = sendbuf,
        .u.iallgather.sendcount = sendcount,
        .u.iallgather.sendtype = sendtype,
        .u.iallgather.recvbuf = recvbuf,
        .u.iallgather.recvcount = recvcount,
        .u.iallgather.recvtype = recvtype,
    };

    MPII_Csel_container_s *cnt = MPIR_Csel_search(comm_ptr->csel_comm, coll_sig);
    MPIR_Assert(cnt);

    switch (cnt->id) {
        case MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Iallgather_intra_gentran_brucks:
            mpi_errno =
                MPIR_Iallgather_intra_gentran_brucks(sendbuf, sendcount, sendtype, recvbuf,
                                                     recvcount, recvtype, comm_ptr,
                                                     cnt->u.iallgather.intra_gentran_brucks.k,
                                                     request);
            break;

        case MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Iallgather_intra_sched_auto:
            MPII_SCHED_WRAPPER(MPIR_Iallgather_intra_sched_auto, comm_ptr, request, sendbuf,
                               sendcount, sendtype, recvbuf, recvcount, recvtype);
            break;

        case MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Iallgather_intra_sched_brucks:
            MPII_SCHED_WRAPPER(MPIR_Iallgather_intra_sched_brucks, comm_ptr, request, sendbuf,
                               sendcount, sendtype, recvbuf, recvcount, recvtype);
            break;

        case MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Iallgather_intra_sched_recursive_doubling:
            MPII_SCHED_WRAPPER(MPIR_Iallgather_intra_sched_recursive_doubling, comm_ptr, request,
                               sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype);
            break;

        case MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Iallgather_intra_sched_ring:
            MPII_SCHED_WRAPPER(MPIR_Iallgather_intra_sched_ring, comm_ptr, request, sendbuf,
                               sendcount, sendtype, recvbuf, recvcount, recvtype);
            break;

        case MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Iallgather_intra_gentran_recexch_doubling:
            mpi_errno =
                MPIR_Iallgather_intra_gentran_recexch_doubling(sendbuf, sendcount, sendtype,
                                                               recvbuf, recvcount, recvtype,
                                                               comm_ptr,
                                                               cnt->u.iallgather.intra_gentran_recexch_doubling.k,
                                                               request);
            break;

        case MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Iallgather_intra_gentran_recexch_halving:
            mpi_errno =
                MPIR_Iallgather_intra_gentran_recexch_halving(sendbuf, sendcount, sendtype, recvbuf,
                                                              recvcount, recvtype, comm_ptr,
                                                              cnt->u.iallgather.intra_gentran_recexch_halving.k,
                                                              request);
            break;

        case MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Iallgather_intra_gentran_ring:
            mpi_errno =
                MPIR_Iallgather_intra_gentran_ring(sendbuf, sendcount, sendtype, recvbuf, recvcount,
                                                   recvtype, comm_ptr, request);
            break;

        case MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Iallgather_inter_sched_auto:
            MPII_SCHED_WRAPPER(MPIR_Iallgather_inter_sched_auto, comm_ptr, request, sendbuf,
                               sendcount, sendtype, recvbuf, recvcount, recvtype);
            break;

        case MPII_CSEL_CONTAINER_TYPE__ALGORITHM__MPIR_Iallgather_inter_sched_local_gather_remote_bcast:
            MPII_SCHED_WRAPPER(MPIR_Iallgather_inter_sched_local_gather_remote_bcast, comm_ptr,
                               request, sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype);
            break;

        default:
            MPIR_Assert(0);
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

int MPIR_Iallgather_intra_sched_auto(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
                                     MPIR_Comm * comm_ptr, MPIR_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size, recvtype_size;
    MPI_Aint tot_bytes;

    if (((sendcount == 0) && (sendbuf != MPI_IN_PLACE)) || (recvcount == 0))
        return MPI_SUCCESS;

    comm_size = comm_ptr->local_size;

    MPIR_Datatype_get_size_macro(recvtype, recvtype_size);
    tot_bytes = (MPI_Aint) recvcount * comm_size * recvtype_size;

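    /* comm_size is a power of two exactly when (comm_size & (comm_size - 1)) == 0,
     * so the selection below is: recursive doubling for short/medium messages on
     * power-of-two communicators, brucks for other short messages, and the ring
     * algorithm for everything else. */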
    if ((tot_bytes < MPIR_CVAR_ALLGATHER_LONG_MSG_SIZE) && !(comm_size & (comm_size - 1))) {
        mpi_errno =
            MPIR_Iallgather_intra_sched_recursive_doubling(sendbuf, sendcount, sendtype, recvbuf,
                                                           recvcount, recvtype, comm_ptr, s);
    } else if (tot_bytes < MPIR_CVAR_ALLGATHER_SHORT_MSG_SIZE) {
        mpi_errno =
            MPIR_Iallgather_intra_sched_brucks(sendbuf, sendcount, sendtype, recvbuf, recvcount,
                                               recvtype, comm_ptr, s);
    } else {
        mpi_errno =
            MPIR_Iallgather_intra_sched_ring(sendbuf, sendcount, sendtype, recvbuf, recvcount,
                                             recvtype, comm_ptr, s);
    }
    MPIR_ERR_CHECK(mpi_errno);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

int MPIR_Iallgather_inter_sched_auto(const void *sendbuf, int sendcount,
                                     MPI_Datatype sendtype, void *recvbuf, int recvcount,
                                     MPI_Datatype recvtype, MPIR_Comm * comm_ptr, MPIR_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;

    mpi_errno = MPIR_Iallgather_inter_sched_local_gather_remote_bcast(sendbuf, sendcount,
                                                                      sendtype, recvbuf, recvcount,
                                                                      recvtype, comm_ptr, s);

    return mpi_errno;
}

int MPIR_Iallgather_sched_auto(const void *sendbuf, int sendcount,
                               MPI_Datatype sendtype, void *recvbuf,
                               int recvcount, MPI_Datatype recvtype,
                               MPIR_Comm * comm_ptr, MPIR_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;

    if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) {
        mpi_errno = MPIR_Iallgather_intra_sched_auto(sendbuf, sendcount, sendtype, recvbuf,
                                                     recvcount, recvtype, comm_ptr, s);
    } else {
        mpi_errno = MPIR_Iallgather_inter_sched_auto(sendbuf, sendcount, sendtype,
                                                     recvbuf, recvcount, recvtype, comm_ptr, s);
    }

    return mpi_errno;
}

int MPIR_Iallgather_impl(const void *sendbuf, int sendcount,
                         MPI_Datatype sendtype, void *recvbuf, int recvcount,
                         MPI_Datatype recvtype, MPIR_Comm * comm_ptr, MPIR_Request ** request)
{
    int mpi_errno = MPI_SUCCESS;

    *request = NULL;

    /* If the user picks one of the transport-enabled algorithms, branch there
     * before going down to the MPIR_Sched-based algorithms. */
    /* TODO - Eventually the intention is to replace all of the
     * MPIR_Sched-based algorithms with transport-enabled algorithms, but that
     * will require sufficient performance testing and replacement algorithms. */
    if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) {
        /* intracommunicator */
        switch (MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM) {
            case MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM_gentran_recexch_doubling:
                mpi_errno =
                    MPIR_Iallgather_intra_gentran_recexch_doubling(sendbuf, sendcount, sendtype,
                                                                   recvbuf, recvcount, recvtype,
                                                                   comm_ptr,
                                                                   MPIR_CVAR_IALLGATHER_RECEXCH_KVAL,
                                                                   request);
                break;

            case MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM_gentran_recexch_halving:
                mpi_errno =
                    MPIR_Iallgather_intra_gentran_recexch_halving(sendbuf, sendcount, sendtype,
                                                                  recvbuf, recvcount, recvtype,
                                                                  comm_ptr,
                                                                  MPIR_CVAR_IALLGATHER_RECEXCH_KVAL,
                                                                  request);
                break;

            case MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM_gentran_brucks:
                mpi_errno =
                    MPIR_Iallgather_intra_gentran_brucks(sendbuf, sendcount, sendtype, recvbuf,
                                                         recvcount, recvtype, comm_ptr,
                                                         MPIR_CVAR_IALLGATHER_BRUCKS_KVAL, request);
                break;

            case MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM_gentran_ring:
                mpi_errno =
                    MPIR_Iallgather_intra_gentran_ring(sendbuf, sendcount, sendtype,
                                                       recvbuf, recvcount, recvtype, comm_ptr,
                                                       request);
                break;

            case MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM_sched_brucks:
                MPII_SCHED_WRAPPER(MPIR_Iallgather_intra_sched_brucks, comm_ptr, request, sendbuf,
                                   sendcount, sendtype, recvbuf, recvcount, recvtype);
                break;

            case MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM_sched_recursive_doubling:
                MPII_SCHED_WRAPPER(MPIR_Iallgather_intra_sched_recursive_doubling, comm_ptr,
                                   request, sendbuf, sendcount, sendtype, recvbuf, recvcount,
                                   recvtype);
                break;

            case MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM_sched_ring:
                MPII_SCHED_WRAPPER(MPIR_Iallgather_intra_sched_ring, comm_ptr, request, sendbuf,
                                   sendcount, sendtype, recvbuf, recvcount, recvtype);
                break;

            case MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM_sched_auto:
                MPII_SCHED_WRAPPER(MPIR_Iallgather_intra_sched_auto, comm_ptr, request, sendbuf,
                                   sendcount, sendtype, recvbuf, recvcount, recvtype);
                break;

            case MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM_auto:
                mpi_errno =
                    MPIR_Iallgather_allcomm_auto(sendbuf, sendcount, sendtype, recvbuf, recvcount,
                                                 recvtype, comm_ptr, request);
                break;

            default:
                MPIR_Assert(0);
        }
    } else {
        switch (MPIR_CVAR_IALLGATHER_INTER_ALGORITHM) {
            case MPIR_CVAR_IALLGATHER_INTER_ALGORITHM_sched_local_gather_remote_bcast:
                MPII_SCHED_WRAPPER(MPIR_Iallgather_inter_sched_local_gather_remote_bcast, comm_ptr,
                                   request, sendbuf, sendcount, sendtype, recvbuf, recvcount,
                                   recvtype);
                break;

            case MPIR_CVAR_IALLGATHER_INTER_ALGORITHM_sched_auto:
                MPII_SCHED_WRAPPER(MPIR_Iallgather_inter_sched_auto, comm_ptr, request, sendbuf,
                                   sendcount, sendtype, recvbuf, recvcount, recvtype);
                break;

            case MPIR_CVAR_IALLGATHER_INTER_ALGORITHM_auto:
                mpi_errno =
                    MPIR_Iallgather_allcomm_auto(sendbuf, sendcount, sendtype, recvbuf, recvcount,
                                                 recvtype, comm_ptr, request);
                break;

            default:
                MPIR_Assert(0);
        }
    }

    MPIR_ERR_CHECK(mpi_errno);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

int MPIR_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
                    MPIR_Comm * comm_ptr, MPIR_Request ** request)
{
    int mpi_errno = MPI_SUCCESS;

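    /* Hand the operation to the device (MPID) layer when device collectives are
     * enabled for all collectives, or enabled per-collective and allowed for
     * iallgather; otherwise call the MPIR-level implementation directly. */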
    if ((MPIR_CVAR_DEVICE_COLLECTIVES == MPIR_CVAR_DEVICE_COLLECTIVES_all) ||
        ((MPIR_CVAR_DEVICE_COLLECTIVES == MPIR_CVAR_DEVICE_COLLECTIVES_percoll) &&
         MPIR_CVAR_IALLGATHER_DEVICE_COLLECTIVE)) {
        mpi_errno =
            MPID_Iallgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm_ptr,
                            request);
    } else {
        mpi_errno = MPIR_Iallgather_impl(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
                                         comm_ptr, request);
    }

    return mpi_errno;
}

#endif /* MPICH_MPI_FROM_PMPI */

/*@
MPI_Iallgather - Gathers data from all tasks and distributes the combined data
                 to all tasks in a nonblocking way

Input Parameters:
+ sendbuf - starting address of the send buffer (choice)
. sendcount - number of elements in the send buffer (non-negative integer)
. sendtype - data type of send buffer elements (handle)
. recvcount - number of elements received from any process (non-negative integer)
. recvtype - data type of receive buffer elements (handle)
- comm - communicator (handle)

Output Parameters:
+ recvbuf - starting address of the receive buffer (choice)
- request - communication request (handle)

.N ThreadSafe

.N Fortran

.N Errors
@*/
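
/* Illustrative usage (assumption: a typical application pattern, not taken
 * from this file): each rank contributes one int and collects one int from
 * every rank, overlapping the collective with independent work before waiting.
 *
 *   #include <stdlib.h>
 *   #include <mpi.h>
 *
 *   void example(MPI_Comm comm)
 *   {
 *       int rank, size;
 *       MPI_Comm_rank(comm, &rank);
 *       MPI_Comm_size(comm, &size);
 *
 *       int mine = rank;
 *       int *all = malloc(size * sizeof(int));
 *       MPI_Request req;
 *
 *       MPI_Iallgather(&mine, 1, MPI_INT, all, 1, MPI_INT, comm, &req);
 *       // ... independent computation can overlap with the collective ...
 *       MPI_Wait(&req, MPI_STATUS_IGNORE);
 *       free(all);
 *   }
 */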
int MPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
                   MPI_Comm comm, MPI_Request * request)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Comm *comm_ptr = NULL;
    MPIR_Request *request_ptr = NULL;
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_IALLGATHER);

    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_IALLGATHER);

    /* Validate parameters, especially handles needing to be converted */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            if (sendbuf != MPI_IN_PLACE) {
                MPIR_ERRTEST_DATATYPE(sendtype, "sendtype", mpi_errno);
                MPIR_ERRTEST_COUNT(sendcount, mpi_errno);
            }
            MPIR_ERRTEST_DATATYPE(recvtype, "recvtype", mpi_errno);
            MPIR_ERRTEST_COMM(comm, mpi_errno);

            /* TODO more checks may be appropriate */
        }
        MPID_END_ERROR_CHECKS;
    }
#endif /* HAVE_ERROR_CHECKING */

    /* Convert MPI object handles to object pointers */
    MPIR_Comm_get_ptr(comm, comm_ptr);
    MPIR_Assert(comm_ptr != NULL);

    /* Validate parameters and objects (post conversion) */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            MPIR_Comm_valid_ptr(comm_ptr, mpi_errno, FALSE);
            if (sendbuf != MPI_IN_PLACE && !HANDLE_IS_BUILTIN(sendtype)) {
                MPIR_Datatype *sendtype_ptr = NULL;
                MPIR_Datatype_get_ptr(sendtype, sendtype_ptr);
                MPIR_Datatype_valid_ptr(sendtype_ptr, mpi_errno);
                if (mpi_errno != MPI_SUCCESS)
                    goto fn_fail;
                MPIR_Datatype_committed_ptr(sendtype_ptr, mpi_errno);
                if (mpi_errno != MPI_SUCCESS)
                    goto fn_fail;
            }

            if (!HANDLE_IS_BUILTIN(recvtype)) {
                MPIR_Datatype *recvtype_ptr = NULL;
                MPIR_Datatype_get_ptr(recvtype, recvtype_ptr);
                MPIR_Datatype_valid_ptr(recvtype_ptr, mpi_errno);
                if (mpi_errno != MPI_SUCCESS)
                    goto fn_fail;
                MPIR_Datatype_committed_ptr(recvtype_ptr, mpi_errno);
                if (mpi_errno != MPI_SUCCESS)
                    goto fn_fail;
            }

            MPIR_ERRTEST_ARGNULL(request, "request", mpi_errno);

            /* catch common aliasing cases */
            if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM && recvbuf != MPI_IN_PLACE &&
                sendtype == recvtype && sendcount == recvcount && sendcount != 0) {
                int recvtype_size;
                MPIR_Datatype_get_size_macro(recvtype, recvtype_size);
                MPIR_ERRTEST_ALIAS_COLL(sendbuf,
                                        (char *) recvbuf +
                                        comm_ptr->rank * recvcount * recvtype_size, mpi_errno);
            }

            /* TODO more checks may be appropriate (counts, in_place, etc) */
        }
        MPID_END_ERROR_CHECKS;
    }
#endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ...  */

    mpi_errno = MPIR_Iallgather(sendbuf, sendcount, sendtype, recvbuf, recvcount,
                                recvtype, comm_ptr, &request_ptr);
    MPIR_ERR_CHECK(mpi_errno);

    /* create a complete request, if needed */
    if (!request_ptr)
        request_ptr = MPIR_Request_create_complete(MPIR_REQUEST_KIND__COLL);
    /* return the handle of the request to the user */
    *request = request_ptr->handle;

    /* ... end of body of routine ... */

  fn_exit:
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_IALLGATHER);
    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
#ifdef HAVE_ERROR_CHECKING
    {
        mpi_errno =
            MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER,
                                 "**mpi_iallgather", "**mpi_iallgather %p %d %D %p %d %D %C %p",
                                 sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm,
                                 request);
    }
#endif
    mpi_errno = MPIR_Err_return_comm(comm_ptr, __func__, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}