#include <petscsys.h>        /*I  "petscsys.h"  I*/
#include <petsc/private/petscimpl.h>

PetscLogEvent PETSC_BuildTwoSided;
PetscLogEvent PETSC_BuildTwoSidedF;

const char *const PetscBuildTwoSidedTypes[] = {
  "ALLREDUCE",
  "IBARRIER",
  "REDSCATTER",
  "PetscBuildTwoSidedType",
  "PETSC_BUILDTWOSIDED_",
  NULL
};

static PetscBuildTwoSidedType _twosided_type = PETSC_BUILDTWOSIDED_NOTSET;

/*@
   PetscCommBuildTwoSidedSetType - set algorithm to use when building two-sided communication

   Logically Collective

   Input Arguments:
+  comm - communicator on which to set the algorithm (the setting is currently global, see Note)
-  twosided - algorithm to use in subsequent calls to PetscCommBuildTwoSided()

   Level: developer

   Note:
   This option is currently global, but could be made per-communicator.

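   Example Usage:
   A minimal sketch that forces the IBARRIER algorithm on PETSC_COMM_WORLD (ierr and the usual
   CHKERRQ error handling are assumed to be in scope):
.vb
   ierr = PetscCommBuildTwoSidedSetType(PETSC_COMM_WORLD,PETSC_BUILDTWOSIDED_IBARRIER);CHKERRQ(ierr);
.ve
   The same choice can be made at runtime with -build_twosided ibarrier, which is read in PetscCommBuildTwoSidedGetType().
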
.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedGetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedSetType(MPI_Comm comm,PetscBuildTwoSidedType twosided)
{
  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {                             /* We don't have a PetscObject so can't use PetscValidLogicalCollectiveEnum */
    PetscMPIInt ierr;
    PetscMPIInt b1[2],b2[2];
    b1[0] = -(PetscMPIInt)twosided;
    b1[1] = (PetscMPIInt)twosided;
    ierr  = MPIU_Allreduce(b1,b2,2,MPI_INT,MPI_MAX,comm);CHKERRQ(ierr);
    if (-b2[0] != b2[1]) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"Enum value must be same on all processes");
  }
  _twosided_type = twosided;
  PetscFunctionReturn(0);
}

/*@
   PetscCommBuildTwoSidedGetType - get algorithm to use when building two-sided communication

   Logically Collective

   Input Arguments:
.  comm - communicator on which to query the algorithm

   Output Arguments:
.  twosided - algorithm that will be used by PetscCommBuildTwoSided()

   Level: developer

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedSetType()
@*/
PetscErrorCode PetscCommBuildTwoSidedGetType(MPI_Comm comm,PetscBuildTwoSidedType *twosided)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  *twosided = PETSC_BUILDTWOSIDED_NOTSET;
  if (_twosided_type == PETSC_BUILDTWOSIDED_NOTSET) {
    ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
    _twosided_type = PETSC_BUILDTWOSIDED_ALLREDUCE; /* default for small comms, see https://gitlab.com/petsc/petsc/-/merge_requests/2611 */
#if defined(PETSC_HAVE_MPI_IBARRIER)
    if (size > 1024) _twosided_type = PETSC_BUILDTWOSIDED_IBARRIER;
#endif
    ierr = PetscOptionsGetEnum(NULL,NULL,"-build_twosided",PetscBuildTwoSidedTypes,(PetscEnum*)&_twosided_type,NULL);CHKERRQ(ierr);
  }
  *twosided = _twosided_type;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)

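/* Nonblocking consensus (NBX) rendezvous, following Hoefler, Siebert and Lumsdaine (2010):
 * post synchronous sends of the fixed-size data, then loop probing for and receiving incoming
 * messages; once all local sends have been matched, start a nonblocking barrier.  When the
 * barrier completes on every rank, all messages have been delivered, so the set of sources is
 * discovered without allocating an array of length equal to the communicator size. */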
static PetscErrorCode PetscCommBuildTwoSided_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,barrier;
  PetscSegBuffer segrank,segdata;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr  = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank;
      void        *buf;
      ierr      = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr      = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr      = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        barrier_started = PETSC_TRUE;
        ierr = PetscFree(sendreqs);CHKERRQ(ierr);
      }
    } else {
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr   = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr   = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr   = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  ierr   = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif

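/* Dense-flag rendezvous: each rank marks its destinations in an integer array of length equal
 * to the communicator size; MPI_Allreduce of these arrays tells every rank how many messages
 * it will receive, after which the sends and receives can be posted.  Simple and fast for
 * small communicators, but the flag array costs O(size) memory per rank. */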
static PetscErrorCode PetscCommBuildTwoSided_Allreduce(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode   ierr;
  PetscMPIInt      size,rank,*iflags,nrecvs,tag,*franks,i,flg;
  MPI_Aint         lb,unitbytes;
  char             *tdata,*fdata;
  MPI_Request      *reqs,*sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Comm_get_attr(comm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Inner PETSc communicator does not have its tag/name counter attribute set");
  if (!counter->iflags) {
    ierr   = PetscCalloc1(size,&counter->iflags);CHKERRQ(ierr);
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    ierr   = PetscArrayzero(iflags,size);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  ierr     = MPIU_Allreduce(MPI_IN_PLACE,iflags,size,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  nrecvs   = iflags[rank];
  ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata    = (char*)todata;
  ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
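/* Variant of the dense-flag rendezvous above: MPI_Reduce_scatter_block sums the flag arrays
 * and delivers each rank only its own receive count rather than the full summed array. */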
static PetscErrorCode PetscCommBuildTwoSided_RedScatter(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode   ierr;
  PetscMPIInt      size,*iflags,nrecvs,tag,*franks,i,flg;
  MPI_Aint         lb,unitbytes;
  char             *tdata,*fdata;
  MPI_Request      *reqs,*sendreqs;
  MPI_Status       *statuses;
  PetscCommCounter *counter;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = MPI_Comm_get_attr(comm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Inner PETSc communicator does not have its tag/name counter attribute set");
  if (!counter->iflags) {
    ierr   = PetscCalloc1(size,&counter->iflags);CHKERRQ(ierr);
    iflags = counter->iflags;
  } else {
    iflags = counter->iflags;
    ierr   = PetscArrayzero(iflags,size);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) iflags[toranks[i]] = 1;
  ierr     = MPI_Reduce_scatter_block(iflags,&nrecvs,1,MPI_INT,MPI_SUM,comm);CHKERRQ(ierr);
  ierr     = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  ierr     = PetscMalloc(nrecvs*count*unitbytes,&fdata);CHKERRQ(ierr);
  tdata    = (char*)todata;
  ierr     = PetscMalloc2(nto+nrecvs,&reqs,nto+nrecvs,&statuses);CHKERRQ(ierr);
  sendreqs = reqs + nrecvs;
  for (i=0; i<nrecvs; i++) {
    ierr = MPI_Irecv((void*)(fdata+count*unitbytes*i),count,dtype,MPI_ANY_SOURCE,tag,comm,reqs+i);CHKERRQ(ierr);
  }
  for (i=0; i<nto; i++) {
    ierr = MPI_Isend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  ierr = MPI_Waitall(nto+nrecvs,reqs,statuses);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvs,&franks);CHKERRQ(ierr);
  for (i=0; i<nrecvs; i++) franks[i] = statuses[i].MPI_SOURCE;
  ierr = PetscFree2(reqs,statuses);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);

  *nfrom            = nrecvs;
  *fromranks        = franks;
  *(void**)fromdata = fdata;
  PetscFunctionReturn(0);
}
#endif

/*@C
   PetscCommBuildTwoSided - discovers communicating ranks given one-sided information, moving constant-sized data in the process (often message lengths)

   Collective

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
-  todata - data to send to each rank (packed)

   Output Arguments:
+  nfrom - number of ranks from which messages will be received
.  fromranks - ranks from which messages will be received (length nfrom; caller should PetscFree())
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())

   Level: developer

   Options Database Keys:
.  -build_twosided <allreduce|ibarrier|redscatter> - algorithm to set up two-sided communication. Default is allreduce for communicators with <= 1024 ranks, otherwise ibarrier (when available).

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other constant-size data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   The MPI_Ibarrier implementation uses the algorithm in [1].

   References:
.  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.
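
   Example Usage:
   A minimal sketch of exchanging one PetscMPIInt (for example, a message length) with two
   destination ranks; the specific ranks and values are illustrative only, and ierr and comm
   are assumed to be in scope:
.vb
   PetscMPIInt nto = 2,toranks[2],todata[2],nfrom,*fromranks,*fromdata;

   toranks[0] = 3; todata[0] = 10;   /* send the value 10 to rank 3 */
   toranks[1] = 7; todata[1] = 20;   /* send the value 20 to rank 7 */
   ierr = PetscCommBuildTwoSided(comm,1,MPI_INT,nto,toranks,todata,&nfrom,&fromranks,&fromdata);CHKERRQ(ierr);
   /* fromdata[i] is the value sent to this rank by fromranks[i]; both arrays have length nfrom */
   ierr = PetscFree(fromranks);CHKERRQ(ierr);
   ierr = PetscFree(fromdata);CHKERRQ(ierr);
.ve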

.seealso: PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSided(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata)
{
  PetscErrorCode         ierr;
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;

  PetscFunctionBegin;
  ierr = PetscSysInitializePackage();CHKERRQ(ierr);
  ierr = PetscLogEventSync(PETSC_BuildTwoSided,comm);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
    ierr = PetscCommBuildTwoSided_Ibarrier(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
    break;
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
    ierr = PetscCommBuildTwoSided_Allreduce(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
    break;
  case PETSC_BUILDTWOSIDED_REDSCATTER:
#if defined(PETSC_HAVE_MPI_REDUCE_SCATTER_BLOCK)
    ierr = PetscCommBuildTwoSided_RedScatter(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Reduce_scatter_block (part of MPI-2.2)");
#endif
    break;
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  ierr = PetscLogEventEnd(PETSC_BuildTwoSided,0,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

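/* Reference implementation of the callback (FReq) interface: run a complete rendezvous with
 * PetscCommBuildTwoSided() first, then invoke the user send/recv callbacks for the payload
 * messages.  Works with any rendezvous algorithm but does not overlap the payload messages
 * with the rendezvous. */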
static PetscErrorCode PetscCommBuildTwoSidedFReq_Reference(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                           PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                           PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                           PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt i,*tag;
  MPI_Aint    lb,unitbytes;
  MPI_Request *sendreq,*recvreq;

  PetscFunctionBegin;
  ierr = PetscMalloc1(ntags,&tag);CHKERRQ(ierr);
  if (ntags > 0) {
    ierr = PetscCommDuplicate(comm,&comm,&tag[0]);CHKERRQ(ierr);
  }
  for (i=1; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tag[i]);CHKERRQ(ierr);
  }

  /* Perform complete initial rendezvous */
  ierr = PetscCommBuildTwoSided(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata);CHKERRQ(ierr);

  ierr = PetscMalloc1(nto*ntags,&sendreq);CHKERRQ(ierr);
  ierr = PetscMalloc1(*nfrom*ntags,&recvreq);CHKERRQ(ierr);

  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) sendreq[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tag,i,toranks[i],((char*)todata)+count*unitbytes*i,sendreq+i*ntags,ctx);CHKERRQ(ierr);
  }
  for (i=0; i<*nfrom; i++) {
    void *header = (*(char**)fromdata) + count*unitbytes*i;
    PetscMPIInt k;
    for (k=0; k<ntags; k++) recvreq[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*recv)(comm,tag,(*fromranks)[i],header,recvreq+i*ntags,ctx);CHKERRQ(ierr);
  }
  ierr = PetscFree(tag);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  *toreqs = sendreq;
  *fromreqs = recvreq;
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)

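/* NBX rendezvous as in PetscCommBuildTwoSided_Ibarrier(), extended to also post the user payload
 * sends and receives (via the callbacks) as destinations are issued and sources are discovered,
 * overlapping the payload messages with the rendezvous itself. */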
static PetscErrorCode PetscCommBuildTwoSidedFReq_Ibarrier(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  PetscMPIInt    nrecvs,tag,*tags,done,i;
  MPI_Aint       lb,unitbytes;
  char           *tdata;
  MPI_Request    *sendreqs,*usendreqs,*req,barrier;
  PetscSegBuffer segrank,segdata,segreq;
  PetscBool      barrier_started;

  PetscFunctionBegin;
  ierr = PetscCommDuplicate(comm,&comm,&tag);CHKERRQ(ierr);
  ierr = PetscMalloc1(ntags,&tags);CHKERRQ(ierr);
  for (i=0; i<ntags; i++) {
    ierr = PetscCommGetNewTag(comm,&tags[i]);CHKERRQ(ierr);
  }
  ierr = MPI_Type_get_extent(dtype,&lb,&unitbytes);CHKERRQ(ierr);
  if (lb != 0) SETERRQ1(comm,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
  tdata = (char*)todata;
  ierr = PetscMalloc1(nto,&sendreqs);CHKERRQ(ierr);
  ierr = PetscMalloc1(nto*ntags,&usendreqs);CHKERRQ(ierr);
  /* Post synchronous sends */
  for (i=0; i<nto; i++) {
    ierr = MPI_Issend((void*)(tdata+count*unitbytes*i),count,dtype,toranks[i],tag,comm,sendreqs+i);CHKERRQ(ierr);
  }
  /* Post actual payloads.  These are typically larger messages.  Hopefully sending these later does not slow down the
   * synchronous messages above. */
  for (i=0; i<nto; i++) {
    PetscMPIInt k;
    for (k=0; k<ntags; k++) usendreqs[i*ntags+k] = MPI_REQUEST_NULL;
    ierr = (*send)(comm,tags,i,toranks[i],tdata+count*unitbytes*i,usendreqs+i*ntags,ctx);CHKERRQ(ierr);
  }

  ierr = PetscSegBufferCreate(sizeof(PetscMPIInt),4,&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(unitbytes,4*count,&segdata);CHKERRQ(ierr);
  ierr = PetscSegBufferCreate(sizeof(MPI_Request),4,&segreq);CHKERRQ(ierr);

  nrecvs  = 0;
  barrier = MPI_REQUEST_NULL;
  /* MPICH-3.2 sometimes does not create a request in some "optimized" cases.  This is arguably a standard violation,
   * but we need to work around it. */
  barrier_started = PETSC_FALSE;
  for (done=0; !done;) {
    PetscMPIInt flag;
    MPI_Status  status;
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag,comm,&flag,&status);CHKERRQ(ierr);
    if (flag) {                 /* incoming message */
      PetscMPIInt *recvrank,k;
      void        *buf;
      ierr = PetscSegBufferGet(segrank,1,&recvrank);CHKERRQ(ierr);
      ierr = PetscSegBufferGet(segdata,count,&buf);CHKERRQ(ierr);
      *recvrank = status.MPI_SOURCE;
      ierr = MPI_Recv(buf,count,dtype,status.MPI_SOURCE,tag,comm,MPI_STATUS_IGNORE);CHKERRQ(ierr);
      ierr = PetscSegBufferGet(segreq,ntags,&req);CHKERRQ(ierr);
      for (k=0; k<ntags; k++) req[k] = MPI_REQUEST_NULL;
      ierr = (*recv)(comm,tags,status.MPI_SOURCE,buf,req,ctx);CHKERRQ(ierr);
      nrecvs++;
    }
    if (!barrier_started) {
      PetscMPIInt sent,nsends;
      ierr = PetscMPIIntCast(nto,&nsends);CHKERRQ(ierr);
      ierr = MPI_Testall(nsends,sendreqs,&sent,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
      if (sent) {
#if defined(PETSC_HAVE_MPI_IBARRIER)
        ierr = MPI_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#elif defined(PETSC_HAVE_MPIX_IBARRIER)
        ierr = MPIX_Ibarrier(comm,&barrier);CHKERRQ(ierr);
#endif
        barrier_started = PETSC_TRUE;
      }
    } else {
      ierr = MPI_Test(&barrier,&done,MPI_STATUS_IGNORE);CHKERRQ(ierr);
    }
  }
  *nfrom = nrecvs;
  ierr = PetscSegBufferExtractAlloc(segrank,fromranks);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segrank);CHKERRQ(ierr);
  ierr = PetscSegBufferExtractAlloc(segdata,fromdata);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segdata);CHKERRQ(ierr);
  *toreqs = usendreqs;
  ierr = PetscSegBufferExtractAlloc(segreq,fromreqs);CHKERRQ(ierr);
  ierr = PetscSegBufferDestroy(&segreq);CHKERRQ(ierr);
  ierr = PetscFree(sendreqs);CHKERRQ(ierr);
  ierr = PetscFree(tags);CHKERRQ(ierr);
  ierr = PetscCommDestroy(&comm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif

/*@C
   PetscCommBuildTwoSidedF - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous

   Collective

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Arguments:
+  nfrom - number of ranks from which messages will be received
.  fromranks - ranks from which messages will be received (length nfrom; caller should PetscFree())
-  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   The MPI_Ibarrier implementation uses the algorithm in [1].

   References:
.  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.

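   Example Usage:
   A hedged sketch (not taken from the PETSc sources) of sending variable-length payloads: the
   rendezvous datum is the payload length, and each callback posts one nonblocking message with
   the single provided tag (ntags = 1).  MyCtx, MySend, MyRecv, and the buffers are hypothetical
   names; nto, toranks, lengths, fromlengths, and user are assumed to be set up by the caller,
   and receive buffers are indexed by source rank only to keep the sketch short.
.vb
   typedef struct {
     double **outbuf;   /* outbuf[i] holds the payload for toranks[i]    */
     double **inbuf;    /* inbuf[rank] will hold the payload from 'rank' */
   } MyCtx;

   static PetscErrorCode MySend(MPI_Comm comm,const PetscMPIInt tags[],PetscMPIInt i,PetscMPIInt rank,void *len,MPI_Request req[],void *ctx)
   {
     MyCtx          *c = (MyCtx*)ctx;
     PetscErrorCode ierr;

     PetscFunctionBegin;
     ierr = MPI_Isend(c->outbuf[i],*(PetscMPIInt*)len,MPI_DOUBLE,rank,tags[0],comm,&req[0]);CHKERRQ(ierr);
     PetscFunctionReturn(0);
   }

   static PetscErrorCode MyRecv(MPI_Comm comm,const PetscMPIInt tags[],PetscMPIInt rank,void *len,MPI_Request req[],void *ctx)
   {
     MyCtx          *c = (MyCtx*)ctx;
     PetscErrorCode ierr;

     PetscFunctionBegin;
     ierr = PetscMalloc1(*(PetscMPIInt*)len,&c->inbuf[rank]);CHKERRQ(ierr);
     ierr = MPI_Irecv(c->inbuf[rank],*(PetscMPIInt*)len,MPI_DOUBLE,rank,tags[0],comm,&req[0]);CHKERRQ(ierr);
     PetscFunctionReturn(0);
   }

   /* rendezvous datum = payload length (one MPI_INT per destination), one payload tag */
   ierr = PetscCommBuildTwoSidedF(comm,1,MPI_INT,nto,toranks,lengths,&nfrom,&fromranks,&fromlengths,1,MySend,MyRecv,&user);CHKERRQ(ierr);
.ve
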
.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedFReq(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSidedF(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,
                                       PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                       PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode ierr;
  MPI_Request    *toreqs,*fromreqs;

  PetscFunctionBegin;
  ierr = PetscCommBuildTwoSidedFReq(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,&toreqs,&fromreqs,send,recv,ctx);CHKERRQ(ierr);
  ierr = MPI_Waitall(nto*ntags,toreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = MPI_Waitall(*nfrom*ntags,fromreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
  ierr = PetscFree(toreqs);CHKERRQ(ierr);
  ierr = PetscFree(fromreqs);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
   PetscCommBuildTwoSidedFReq - discovers communicating ranks given one-sided information, calling user-defined functions during rendezvous, returns requests

   Collective

   Input Arguments:
+  comm - communicator
.  count - number of entries to send/receive in initial rendezvous (must match on all ranks)
.  dtype - datatype to send/receive from each rank (must match on all ranks)
.  nto - number of ranks to send data to
.  toranks - ranks to send to (array of length nto)
.  todata - data to send to each rank (packed)
.  ntags - number of tags needed by send/recv callbacks
.  send - callback invoked on sending process when ready to send primary payload
.  recv - callback invoked on receiving process after delivery of rendezvous message
-  ctx - context for callbacks

   Output Arguments:
+  nfrom - number of ranks from which messages will be received
.  fromranks - ranks from which messages will be received (length nfrom; caller should PetscFree())
.  fromdata - packed data from each rank, each with count entries of type dtype (length nfrom, caller responsible for PetscFree())
.  toreqs - array of nto*ntags sender requests (caller must wait on these, then PetscFree())
-  fromreqs - array of nfrom*ntags receiver requests (caller must wait on these, then PetscFree())

   Level: developer

   Notes:
   This memory-scalable interface is an alternative to calling PetscGatherNumberOfMessages() and
   PetscGatherMessageLengths(), possibly with a subsequent round of communication to send other data.

   Basic data types as well as contiguous types are supported, but non-contiguous (e.g., strided) types are not.

   The MPI_Ibarrier implementation uses the algorithm in [1].

   References:
.  1. - Hoefler, Siebert and Lumsdaine, Scalable communication protocols for dynamic sparse data exchange, 2010.

.seealso: PetscCommBuildTwoSided(), PetscCommBuildTwoSidedF(), PetscGatherNumberOfMessages(), PetscGatherMessageLengths()
@*/
PetscErrorCode PetscCommBuildTwoSidedFReq(MPI_Comm comm,PetscMPIInt count,MPI_Datatype dtype,PetscMPIInt nto,const PetscMPIInt *toranks,const void *todata,
                                          PetscMPIInt *nfrom,PetscMPIInt **fromranks,void *fromdata,PetscMPIInt ntags,MPI_Request **toreqs,MPI_Request **fromreqs,
                                          PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                          PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx)
{
  PetscErrorCode         ierr,(*f)(MPI_Comm,PetscMPIInt,MPI_Datatype,PetscMPIInt,const PetscMPIInt[],const void*,
                                   PetscMPIInt*,PetscMPIInt**,void*,PetscMPIInt,MPI_Request**,MPI_Request**,
                                   PetscErrorCode (*send)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,PetscMPIInt,void*,MPI_Request[],void*),
                                   PetscErrorCode (*recv)(MPI_Comm,const PetscMPIInt[],PetscMPIInt,void*,MPI_Request[],void*),void *ctx);
  PetscBuildTwoSidedType buildtype = PETSC_BUILDTWOSIDED_NOTSET;
  PetscMPIInt i,size;

  PetscFunctionBegin;
  ierr = PetscSysInitializePackage();CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    if (toranks[i] < 0 || size <= toranks[i]) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"toranks[%d] %d not in comm size %d",i,toranks[i],size);
  }
  ierr = PetscLogEventSync(PETSC_BuildTwoSidedF,comm);CHKERRQ(ierr);
  ierr = PetscLogEventBegin(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSidedGetType(comm,&buildtype);CHKERRQ(ierr);
  switch (buildtype) {
  case PETSC_BUILDTWOSIDED_IBARRIER:
#if defined(PETSC_HAVE_MPI_IBARRIER) || defined(PETSC_HAVE_MPIX_IBARRIER)
    f = PetscCommBuildTwoSidedFReq_Ibarrier;
#else
    SETERRQ(comm,PETSC_ERR_PLIB,"MPI implementation does not provide MPI_Ibarrier (part of MPI-3)");
#endif
    break;
  case PETSC_BUILDTWOSIDED_ALLREDUCE:
  case PETSC_BUILDTWOSIDED_REDSCATTER:
    f = PetscCommBuildTwoSidedFReq_Reference;
    break;
  default: SETERRQ(comm,PETSC_ERR_PLIB,"Unknown method for building two-sided communication");
  }
  ierr = (*f)(comm,count,dtype,nto,toranks,todata,nfrom,fromranks,fromdata,ntags,toreqs,fromreqs,send,recv,ctx);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSC_BuildTwoSidedF,0,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}