1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #include "mpiimpl.h"
7 
8 /*
9 === BEGIN_MPI_T_CVAR_INFO_BLOCK ===
10 
11 cvars:
12     - name        : MPIR_CVAR_GATHERV_INTER_SSEND_MIN_PROCS
13       category    : COLLECTIVE
14       type        : int
15       default     : 32
16       class       : none
17       verbosity   : MPI_T_VERBOSITY_USER_BASIC
18       scope       : MPI_T_SCOPE_ALL_EQ
19       description : >-
20         Use Ssend (synchronous send) for intercommunicator MPI_Gatherv if the
21         "group B" size is >= this value.  Specifying "-1" always avoids using
22         Ssend.  For backwards compatibility, specifying "0" uses the default
23         value.
24 
25 === END_MPI_T_CVAR_INFO_BLOCK ===
26 */
27 
28 /* Algorithm: MPI_Gatherv
29  *
30  * Since the array of recvcounts is valid only on the root, we cannot do a tree
31  * algorithm without first communicating the recvcounts to other processes.
32  * Therefore, we simply use a linear algorithm for the gather, which takes
33  * (p-1) steps versus lgp steps for the tree algorithm. The bandwidth
34  * requirement is the same for both algorithms.
35  *
36  * Cost = (p-1).alpha + n.((p-1)/p).beta
37 */
MPIR_Gatherv_allcomm_linear(const void * sendbuf,int sendcount,MPI_Datatype sendtype,void * recvbuf,const int * recvcounts,const int * displs,MPI_Datatype recvtype,int root,MPIR_Comm * comm_ptr,MPIR_Errflag_t * errflag)38 int MPIR_Gatherv_allcomm_linear(const void *sendbuf,
39                                 int sendcount,
40                                 MPI_Datatype sendtype,
41                                 void *recvbuf,
42                                 const int *recvcounts,
43                                 const int *displs,
44                                 MPI_Datatype recvtype,
45                                 int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
46 {
47     int comm_size, rank;
48     int mpi_errno = MPI_SUCCESS;
49     int mpi_errno_ret = MPI_SUCCESS;
50     MPI_Aint extent;
51     int i, reqs;
52     int min_procs;
53     MPIR_Request **reqarray;
54     MPI_Status *starray;
55     MPIR_CHKLMEM_DECL(2);
56 
57     rank = comm_ptr->rank;
58 
59     /* If rank == root, then I recv lots, otherwise I send */
60     if (((comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) && (root == rank)) ||
61         ((comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) && (root == MPI_ROOT))) {
62         if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM)
63             comm_size = comm_ptr->local_size;
64         else
65             comm_size = comm_ptr->remote_size;
66 
67         MPIR_Datatype_get_extent_macro(recvtype, extent);
68 
69         MPIR_CHKLMEM_MALLOC(reqarray, MPIR_Request **, comm_size * sizeof(MPIR_Request *),
70                             mpi_errno, "reqarray", MPL_MEM_BUFFER);
71         MPIR_CHKLMEM_MALLOC(starray, MPI_Status *, comm_size * sizeof(MPI_Status), mpi_errno,
72                             "starray", MPL_MEM_BUFFER);
73 
74         reqs = 0;
75         for (i = 0; i < comm_size; i++) {
76             if (recvcounts[i]) {
77                 if ((comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) && (i == rank)) {
78                     if (sendbuf != MPI_IN_PLACE) {
79                         mpi_errno = MPIR_Localcopy(sendbuf, sendcount, sendtype,
80                                                    ((char *) recvbuf + displs[rank] * extent),
81                                                    recvcounts[rank], recvtype);
82                         MPIR_ERR_CHECK(mpi_errno);
83                     }
84                 } else {
85                     mpi_errno = MPIC_Irecv(((char *) recvbuf + displs[i] * extent),
86                                            recvcounts[i], recvtype, i,
87                                            MPIR_GATHERV_TAG, comm_ptr, &reqarray[reqs++]);
88                     MPIR_ERR_CHECK(mpi_errno);
89                 }
90             }
91         }
92         /* ... then wait for *all* of them to finish: */
93         mpi_errno = MPIC_Waitall(reqs, reqarray, starray, errflag);
94         if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS)
95             MPIR_ERR_POP(mpi_errno);
96 
97         /* --BEGIN ERROR HANDLING-- */
98         if (mpi_errno == MPI_ERR_IN_STATUS) {
99             for (i = 0; i < reqs; i++) {
100                 if (starray[i].MPI_ERROR != MPI_SUCCESS) {
101                     mpi_errno = starray[i].MPI_ERROR;
102                     if (mpi_errno) {
103                         /* for communication errors, just record the error but continue */
104                         *errflag =
105                             MPIX_ERR_PROC_FAILED ==
106                             MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
107                         MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
108                         MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
109                     }
110                 }
111             }
112         }
113         /* --END ERROR HANDLING-- */
114     }
115 
116     else if (root != MPI_PROC_NULL) {   /* non-root nodes, and in the intercomm. case, non-root nodes on remote side */
117         if (sendcount) {
118             /* we want local size in both the intracomm and intercomm cases
119              * because the size of the root's group (group A in the standard) is
120              * irrelevant here. */
121             comm_size = comm_ptr->local_size;
122 
123             min_procs = MPIR_CVAR_GATHERV_INTER_SSEND_MIN_PROCS;
124             if (min_procs == -1)
125                 min_procs = comm_size + 1;      /* Disable ssend */
126             else if (min_procs == 0)    /* backwards compatibility, use default value */
127                 MPIR_CVAR_GET_DEFAULT_INT(MPIR_CVAR_GATHERV_INTER_SSEND_MIN_PROCS, &min_procs);
128 
129             if (comm_size >= min_procs) {
130                 mpi_errno = MPIC_Ssend(sendbuf, sendcount, sendtype, root,
131                                        MPIR_GATHERV_TAG, comm_ptr, errflag);
132                 if (mpi_errno) {
133                     /* for communication errors, just record the error but continue */
134                     *errflag =
135                         MPIX_ERR_PROC_FAILED ==
136                         MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
137                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
138                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
139                 }
140             } else {
141                 mpi_errno = MPIC_Send(sendbuf, sendcount, sendtype, root,
142                                       MPIR_GATHERV_TAG, comm_ptr, errflag);
143                 if (mpi_errno) {
144                     /* for communication errors, just record the error but continue */
145                     *errflag =
146                         MPIX_ERR_PROC_FAILED ==
147                         MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
148                     MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
149                     MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
150                 }
151             }
152         }
153     }
154 
155 
156   fn_exit:
157     MPIR_CHKLMEM_FREEALL();
158     if (mpi_errno_ret)
159         mpi_errno = mpi_errno_ret;
160     else if (*errflag != MPIR_ERR_NONE)
161         MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");
162     return mpi_errno;
163   fn_fail:
164     goto fn_exit;
165 }
166