1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 */
5
6 #include "mpiimpl.h"
7
8 /*
9 === BEGIN_MPI_T_CVAR_INFO_BLOCK ===
10
11 cvars:
12 - name : MPIR_CVAR_GATHERV_INTER_SSEND_MIN_PROCS
13 category : COLLECTIVE
14 type : int
15 default : 32
16 class : none
17 verbosity : MPI_T_VERBOSITY_USER_BASIC
18 scope : MPI_T_SCOPE_ALL_EQ
19 description : >-
20 Use Ssend (synchronous send) for intercommunicator MPI_Gatherv if the
21 "group B" size is >= this value. Specifying "-1" always avoids using
22 Ssend. For backwards compatibility, specifying "0" uses the default
23 value.
24
25 === END_MPI_T_CVAR_INFO_BLOCK ===
26 */
27
28 /* Algorithm: MPI_Gatherv
29 *
30 * Since the array of recvcounts is valid only on the root, we cannot do a tree
31 * algorithm without first communicating the recvcounts to other processes.
32 * Therefore, we simply use a linear algorithm for the gather, which takes
33 * (p-1) steps versus lgp steps for the tree algorithm. The bandwidth
34 * requirement is the same for both algorithms.
35 *
36 * Cost = (p-1).alpha + n.((p-1)/p).beta
37 */
MPIR_Gatherv_allcomm_linear(const void * sendbuf,int sendcount,MPI_Datatype sendtype,void * recvbuf,const int * recvcounts,const int * displs,MPI_Datatype recvtype,int root,MPIR_Comm * comm_ptr,MPIR_Errflag_t * errflag)38 int MPIR_Gatherv_allcomm_linear(const void *sendbuf,
39 int sendcount,
40 MPI_Datatype sendtype,
41 void *recvbuf,
42 const int *recvcounts,
43 const int *displs,
44 MPI_Datatype recvtype,
45 int root, MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
46 {
47 int comm_size, rank;
48 int mpi_errno = MPI_SUCCESS;
49 int mpi_errno_ret = MPI_SUCCESS;
50 MPI_Aint extent;
51 int i, reqs;
52 int min_procs;
53 MPIR_Request **reqarray;
54 MPI_Status *starray;
55 MPIR_CHKLMEM_DECL(2);
56
57 rank = comm_ptr->rank;
58
59 /* If rank == root, then I recv lots, otherwise I send */
60 if (((comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) && (root == rank)) ||
61 ((comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) && (root == MPI_ROOT))) {
62 if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM)
63 comm_size = comm_ptr->local_size;
64 else
65 comm_size = comm_ptr->remote_size;
66
67 MPIR_Datatype_get_extent_macro(recvtype, extent);
68
69 MPIR_CHKLMEM_MALLOC(reqarray, MPIR_Request **, comm_size * sizeof(MPIR_Request *),
70 mpi_errno, "reqarray", MPL_MEM_BUFFER);
71 MPIR_CHKLMEM_MALLOC(starray, MPI_Status *, comm_size * sizeof(MPI_Status), mpi_errno,
72 "starray", MPL_MEM_BUFFER);
73
74 reqs = 0;
75 for (i = 0; i < comm_size; i++) {
76 if (recvcounts[i]) {
77 if ((comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) && (i == rank)) {
78 if (sendbuf != MPI_IN_PLACE) {
79 mpi_errno = MPIR_Localcopy(sendbuf, sendcount, sendtype,
80 ((char *) recvbuf + displs[rank] * extent),
81 recvcounts[rank], recvtype);
82 MPIR_ERR_CHECK(mpi_errno);
83 }
84 } else {
85 mpi_errno = MPIC_Irecv(((char *) recvbuf + displs[i] * extent),
86 recvcounts[i], recvtype, i,
87 MPIR_GATHERV_TAG, comm_ptr, &reqarray[reqs++]);
88 MPIR_ERR_CHECK(mpi_errno);
89 }
90 }
91 }
92 /* ... then wait for *all* of them to finish: */
93 mpi_errno = MPIC_Waitall(reqs, reqarray, starray, errflag);
94 if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS)
95 MPIR_ERR_POP(mpi_errno);
96
97 /* --BEGIN ERROR HANDLING-- */
98 if (mpi_errno == MPI_ERR_IN_STATUS) {
99 for (i = 0; i < reqs; i++) {
100 if (starray[i].MPI_ERROR != MPI_SUCCESS) {
101 mpi_errno = starray[i].MPI_ERROR;
102 if (mpi_errno) {
103 /* for communication errors, just record the error but continue */
104 *errflag =
105 MPIX_ERR_PROC_FAILED ==
106 MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
107 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
108 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
109 }
110 }
111 }
112 }
113 /* --END ERROR HANDLING-- */
114 }
115
116 else if (root != MPI_PROC_NULL) { /* non-root nodes, and in the intercomm. case, non-root nodes on remote side */
117 if (sendcount) {
118 /* we want local size in both the intracomm and intercomm cases
119 * because the size of the root's group (group A in the standard) is
120 * irrelevant here. */
121 comm_size = comm_ptr->local_size;
122
123 min_procs = MPIR_CVAR_GATHERV_INTER_SSEND_MIN_PROCS;
124 if (min_procs == -1)
125 min_procs = comm_size + 1; /* Disable ssend */
126 else if (min_procs == 0) /* backwards compatibility, use default value */
127 MPIR_CVAR_GET_DEFAULT_INT(MPIR_CVAR_GATHERV_INTER_SSEND_MIN_PROCS, &min_procs);
128
129 if (comm_size >= min_procs) {
130 mpi_errno = MPIC_Ssend(sendbuf, sendcount, sendtype, root,
131 MPIR_GATHERV_TAG, comm_ptr, errflag);
132 if (mpi_errno) {
133 /* for communication errors, just record the error but continue */
134 *errflag =
135 MPIX_ERR_PROC_FAILED ==
136 MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
137 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
138 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
139 }
140 } else {
141 mpi_errno = MPIC_Send(sendbuf, sendcount, sendtype, root,
142 MPIR_GATHERV_TAG, comm_ptr, errflag);
143 if (mpi_errno) {
144 /* for communication errors, just record the error but continue */
145 *errflag =
146 MPIX_ERR_PROC_FAILED ==
147 MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
148 MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
149 MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
150 }
151 }
152 }
153 }
154
155
156 fn_exit:
157 MPIR_CHKLMEM_FREEALL();
158 if (mpi_errno_ret)
159 mpi_errno = mpi_errno_ret;
160 else if (*errflag != MPIR_ERR_NONE)
161 MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");
162 return mpi_errno;
163 fn_fail:
164 goto fn_exit;
165 }
166