#include <petscsys.h> /*I "petscsys.h" I*/
#include <petsc/private/petscimpl.h>

PetscLogEvent PETSC_BuildTwoSided;
PetscLogEvent PETSC_BuildTwoSidedF;

const char *const PetscBuildTwoSidedTypes[] = {
  "ALLREDUCE",
  "IBARRIER",
  "REDSCATTER",
  "PetscBuildTwoSidedType",
  "PETSC_BUILDTWOSIDED_",
  NULL
};

static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;

/*@
   PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication

   Logically Collective

   Input Arguments:
+  comm - communicator (the setting is currently global, so this is typically PETSC_COMM_WORLD)
-  twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()

   Level: developer

   Note:
   This option is currently global, but could be made per-communicator.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedGetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm,PetscBuildTwoSidedType twosided)
{
  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) { /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
    PetscMPIInt ierr;
    PetscMPIInt b1[2],b2[2];
    b1[0] = -(PetscMPIInt)twosided;
    b1[1] = (PetscMPIInt)twosided;
    ierr  = MPIU_Allreduce(b1,b2,2,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (-b2[0] != b2[1]) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"Enum value must be same on all processes");
  }
  _twosided_type = twosided;
  PetscFunctionReturn(0);
}

/*@
   PetscCommBuildTwoSidedGetType - get algorithm to use when building two-sided communication

   Logically Collective

   Input Argument:
.  comm - communicator on which to query algorithm

   Output Argument:
.  twosided - algorithm to use for PetscCommBuildTwoSided()

   Level: developer

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedSetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm,PetscBuildTwoSidedType *twosided)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  *twosided = PETSC_BUILDTWOSIDED_NOTSET;
  if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
    ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE; /* default for small comms, see https://gitlab.com/petsc/petsc/-/merge_requests/2611 */
#if defined(PETSC_HAVE_MPI_IBARRIER)
    if (size > 1024) _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
#endif
    ierr = PetscOptionsGetEnum(NULL,NULL,"-build_twosided",PetscBuildTwoSidedTypes,(PetscEnum*)&_twosided_type,NULL);CHKERRQ(ierr);
  }
  *twosided = _twosided_type;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)

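/* Nonblocking consensus (NBX), following the Hoefler, Siebert and Lumsdaine reference cited in the
 * PetscCommBuildTwoSided() manual page: post synchronous sends of the rendezvous data, poll with
 * MPI_Iprobe() for incoming messages, and once all local synchronous sends have completed, enter a
 * nonblocking barrier; when the barrier completes on every rank, all messages have been delivered,
 * so the receive loop can terminate. */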
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,barrier;
  PetscSegBuffer segrank,segdata;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr  = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases. This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      ierr      = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr      = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr      = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        barrier_started = PETSC_TRUE;
        ierr = PetscFree(sendreqs);CHKERRQ(ierr);
      }
    } else {
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr   = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr   = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  ierr   = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif

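/* Dense rendezvous: each rank marks its destinations in a full-length flag array, an allreduce
 * sums the flags so that entry r becomes the number of messages rank r will receive, and the data
 * itself then moves with ordinary point-to-point sends/receives. Simple and fast on small
 * communicators, but the flag array makes it O(size) in memory per rank. */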
static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode   ierr;
  PetscMPIInt      size,rank,*iflags,nrecvs,tag,*franks,i,flg;
  MPI_Aint         lb,unitbytes;
  char             *tdata,*fdata;
  MPI_Request      *reqs,*sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Comm_get_attr(comm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Inner PETSc communicator does not have its tag/name counter attribute set");
  if (!counter->iflags) {
    ierr   = PetscCalloc1(size,&counter->iflags);CHKERRQ(ierr);
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    ierr   = PetscArrayzero(iflags,size);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  ierr   = MPIU_Allreduce(MPI_IN_PLACE,iflags,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nrecvs = iflags[rank];
  ierr   = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr  = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata = (char*)todata;
  ierr  = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
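/* Same flag-array rendezvous as the Allreduce variant above, but MPI_Reduce_scatter_block()
 * delivers each rank only its own receive count, avoiding the fully reduced array on every rank. */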
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode   ierr;
  PetscMPIInt      size,*iflags,nrecvs,tag,*franks,i,flg;
  MPI_Aint         lb,unitbytes;
  char             *tdata,*fdata;
  MPI_Request      *reqs,*sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Comm_get_attr(comm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Inner PETSc communicator does not have its tag/name counter attribute set");
  if (!counter->iflags) {
    ierr   = PetscCalloc1(size,&counter->iflags);CHKERRQ(ierr);
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    ierr   = PetscArrayzero(iflags,size);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  ierr = MPI_Reduce_scatter_block(iflags,&nrecvs,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr  = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata = (char*)todata;
  ierr  = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}
#endif

/*@C
   PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)

   Collective

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
-  todata - data to send to each rank (packed)

   Output Arguments:
+  nfrom - number of ranks from which messages are received
.  fromranks - ranks from which messages are received (array of length nfrom; caller should PetscFree())
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom; caller responsible for PetscFree())

   Level: developer

   Options Database Keys:
.  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication. Default is allreduce for communicators with <= 1024 ranks, otherwise ibarrier.

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

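   Example Usage:
   A minimal sketch (the neighbor choice is purely illustrative); each rank sends one MPI_INT to
   the next rank in comm and discovers which ranks sent to it:
.vb
   PetscErrorCode ierr;
   PetscMPIInt    rank,size,toranks[1],todata[1],nfrom,*fromranks,*fromdata;

   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
   toranks[0] = (rank+1)%size;  /* destination rank */
   todata[0]  = rank;           /* payload delivered during the rendezvous */
   ierr = PetscCommBuildTwoSided(comm,1,MPI_INT,1,toranks,todata,&nfrom,&fromranks,&fromdata);CHKERRQ(ierr);
   /* here nfrom is 1 and fromdata[i] is the entry sent by fromranks[i] */
   ierr = PetscFree(fromranks);CHKERRQ(ierr);
   ierr = PetscFree(fromdata);CHKERRQ(ierr);
.ve
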
   The MPI_Ibarrier()-based implementation uses the algorithm in the reference below.

   References:
.  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode         ierr;
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

  PetscFunctionBegin;
  ierr = PetscSysInitializePackage();CHKERRQ(ierr);
  ierr = PetscLogEventSync(PETSC_BuildTwoSided,comm);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
    ierr = PetscCommBuildTwoSided_Ibarrier(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
    break;
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
    ierr = PetscCommBuildTwoSided_Allreduce(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
    break;
  case PETSC_BUILDTWOSIDED_REDSCATTER:
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
    ierr = PetscCommBuildTwoSided_RedScatter(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
#endif
    break;
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  ierr = PetscLogEventEnd(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

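/* Reference implementation of the callback-based variant: perform the complete rendezvous with
 * PetscCommBuildTwoSided() first, then invoke the user send/recv callbacks for the (typically
 * larger) payload messages on freshly obtained tags. */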
static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt    i,*tag;
  MPI_Aint       lb,unitbytes;
  MPI_Request    *sendreq,*recvreq;

  PetscFunctionBegin;
  ierr = PetscMalloc1(ntags,&tag);CHKERRQ(ierr);
  if (ntags > 0) {
    ierr = PetscCommDuplicate(comm,&comm,&tag[0]);CHKERRQ(ierr);
  }
  for (i=1; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tag[i]);CHKERRQ(ierr);
  }

  /* Perform complete initial rendezvous */
  ierr = PetscCommBuildTwoSided(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);

  ierr = PetscMalloc1(nto*ntags,&sendreq);CHKERRQ(ierr);
  ierr = PetscMalloc1(*nfrom*ntags,&recvreq);CHKERRQ(ierr);

  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) sendreq[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tag,i,toranks[i],((char*)todata)+count*unitbytes*i,sendreq+i*ntags,ctx);CHKERRQ(ierr);
  }
  for (i=0; i<*nfrom; i++) {
    void *header = (*(char**)fromdata) + count*unitbytes*i;
    PetscMPIInt k;
    for (k=0; k<ntags; k++) recvreq[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*recv)(comm,tag,(*fromranks)[i],header,recvreq+i*ntags,ctx);CHKERRQ(ierr);
  }
  ierr = PetscFree(tag);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  *toreqs   = sendreq;
  *fromreqs = recvreq;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)

static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,*tags,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,*usendreqs,*req,barrier;
  PetscSegBuffer segrank,segdata,segreq;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = PetscMalloc1(ntags,&tags);CHKERRQ(ierr);
  for (i=0; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tags[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr  = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  ierr  = PetscMalloc1(nto*ntags,&usendreqs);CHKERRQ(ierr);
  /* Post synchronous sends */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  /* Post actual payloads. These are typically larger messages. Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) usendreqs[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tags,i,toranks[i],tdata+count*unitbytes*i,usendreqs+i*ntags,ctx);CHKERRQ(ierr);
  }

  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(sizeof(MPI_Request),4,&segreq);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases. This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank,k;
      void        *buf;
      ierr      = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr      = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr      = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      ierr      = PetscSegBufferGet(segreq,ntags,&req);CHKERRQ(ierr);
      for (k=0; k<ntags; k++) req[k] = MPI_REQUEST_NULL;
      ierr = (*recv)(comm,tags,status.MPI_SOURCE,buf,req,ctx);CHKERRQ(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        barrier_started = PETSC_TRUE;
      }
    } else {
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr   = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr   = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  *toreqs = usendreqs;
  ierr    = PetscSegBufferExtractAlloc(segreq,fromreqs);CHKERRQ(ierr);
  ierr    = PetscSegBufferDestroy(&segreq);CHKERRQ(ierr);
  ierr    = PetscFree(sendreqs);CHKERRQ(ierr);
  ierr    = PetscFree(tags);CHKERRQ(ierr);
  ierr    = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif

/*@C
   PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous

   Collective

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Arguments:
+  nfrom - number of ranks from which messages are received
.  fromranks - ranks from which messages are received (array of length nfrom; caller should PetscFree())
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom; caller responsible for PetscFree())

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

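   Example Usage:
   A sketch of hypothetical callbacks (the names and context layout are illustrative, not part of
   PETSc); the rendezvous message (count=1, dtype=MPI_INT) carries the length of a variable-sized
   PetscScalar payload that the callbacks transfer on the first of ntags=1 tags. The caller is
   assumed to preallocate inbuf with room for every possible sender and to free the received
   buffers afterwards:
.vb
   typedef struct {
     PetscScalar **outbuf;   /* outbuf[i] holds todata[i] entries destined for toranks[i] */
     PetscScalar **inbuf;    /* filled with freshly allocated incoming buffers */
     PetscMPIInt   nrecv;
   } MyCtx;

   static PetscErrorCode MySend(MPI_Comm comm,const PetscMPIInt tags[],PetscMPIInt i,PetscMPIInt rank,void *hdr,MPI_Request req[],void *ctx)
   {
     MyCtx          *c = (MyCtx*)ctx;
     PetscErrorCode ierr;

     PetscFunctionBegin;
     ierr = MPI_Isend(c->outbuf[i],*(PetscMPIInt*)hdr,MPIU_SCALAR,rank,tags[0],comm,&req[0]);CHKERRQ(ierr);
     PetscFunctionReturn(0);
   }

   static PetscErrorCode MyRecv(MPI_Comm comm,const PetscMPIInt tags[],PetscMPIInt rank,void *hdr,MPI_Request req[],void *ctx)
   {
     MyCtx          *c = (MyCtx*)ctx;
     PetscScalar    *buf;
     PetscErrorCode ierr;

     PetscFunctionBegin;
     ierr = PetscMalloc1(*(PetscMPIInt*)hdr,&buf);CHKERRQ(ierr);
     ierr = MPI_Irecv(buf,*(PetscMPIInt*)hdr,MPIU_SCALAR,rank,tags[0],comm,&req[0]);CHKERRQ(ierr);
     c->inbuf[c->nrecv++] = buf;
     PetscFunctionReturn(0);
   }
.ve
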
   The MPI_Ibarrier()-based implementation uses the algorithm in the reference below.

   References:
.  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedFReq(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,
                                       PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                       PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  MPI_Request    *toreqs,*fromreqs;

  PetscFunctionBegin;
  ierr = PetscCommBuildTwoSidedFReq(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,&toreqs,&fromreqs,send,recv,ctx);CHKERRQ(ierr);
  ierr = MPI_Waitall(nto*ntags,toreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = MPI_Waitall(*nfrom*ntags,fromreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = PetscFree(toreqs);CHKERRQ(ierr);
  ierr = PetscFree(fromreqs);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
   PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests

   Collective

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Arguments:
+  nfrom - number of ranks from which messages are received
.  fromranks - ranks from which messages are received (array of length nfrom; caller should PetscFree())
.  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom; caller responsible for PetscFree())
.  toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
-  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   The MPI_Ibarrier()-based implementation uses the algorithm in the reference below.

   References:
.  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedF(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr,(*f)(MPI_Comm,PetscMPIInt,MPI_Datatype,PetscMPIInt,const PetscMPIInt[],const void*,
                           PetscMPIInt*,PetscMPIInt**,void*,PetscMPIInt,MPI_Request**,MPI_Request**,
                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx);
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
  PetscMPIInt            i,size;

  PetscFunctionBegin;
  ierr = PetscSysInitializePackage();CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    if (toranks[i] < 0 || size <= toranks[i]) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"toranks[%d] %d not in comm size %d",i,toranks[i],size);
  }
  ierr = PetscLogEventSync(PETSC_BuildTwoSidedF,comm);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
    f = PetscCommBuildTwoSidedFReq_Ibarrier;
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
    break;
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
  case PETSC_BUILDTWOSIDED_REDSCATTER:
    f = PetscCommBuildTwoSidedFReq_Reference;
    break;
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  ierr = (*f)(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,toreqs,fromreqs,send,recv,ctx);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}