1 // Copyright (c) 2017-2021, Lawrence Livermore National Security, LLC and
2 // other Axom Project Developers. See the top-level LICENSE file for details.
3 //
4 // SPDX-License-Identifier: (BSD-3-Clause)
5 
6 #include "lulesh.h"
7 
8 // If no MPI, then this whole file is stubbed out
9 #ifdef AXOM_USE_MPI
10 
11 #include <mpi.h>
12 #include <string.h>
13 
14 /* Comm Routines */
15 
16 #define ALLOW_UNPACKED_PLANE false
17 #define ALLOW_UNPACKED_ROW   false
18 #define ALLOW_UNPACKED_COL   false
19 
20 /*
21    There are coherence issues for packing and unpacking message
22    buffers.  Ideally, you would like a lot of threads to
23    cooperate in the assembly/dissassembly of each message.
24    To do that, each thread should really be operating in a
25    different coherence zone.
26 
27    Let's assume we have three fields, f1 through f3, defined on
28    a 61x61x61 cube.  If we want to send the block boundary
29    information for each field to each neighbor processor across
30    each cube face, then we have three cases for the
31    memory layout/coherence of data on each of the six cube
32    boundaries:
33 
34       (a) Two of the faces will be in contiguous memory blocks
35       (b) Two of the faces will be comprised of pencils of
36           contiguous memory.
37       (c) Two of the faces will have large strides between
38           every value living on the face.
39 
40    How do you pack and unpack this data in buffers to
41    simultaneous achieve the best memory efficiency and
42    the most thread independence?
43 
44    Do do you pack field f1 through f3 tighly to reduce message
45    size?  Do you align each field on a cache coherence boundary
46    within the message so that threads can pack and unpack each
47    field independently?  For case (b), do you align each
48    boundary pencil of each field separately?  This increases
49    the message size, but could improve cache coherence so
50    each pencil could be processed independently by a separate
51    thread with no conflicts.
52 
53    Also, memory access for case (c) would best be done without
54    going through the cache (the stride is so large it just causes
55    a lot of useless cache evictions).  Is it worth creating
56    a special case version of the packing algorithm that uses
57    non-coherent load/store opcodes?
58 */
59 
60 /******************************************/
61 
62 
63 /* doRecv flag only works with regular block structure */
CommRecv(Domain & domain,int msgType,Index_t xferFields,Index_t dx,Index_t dy,Index_t dz,bool doRecv,bool planeOnly)64 void CommRecv(Domain& domain, int msgType, Index_t xferFields,
65               Index_t dx, Index_t dy, Index_t dz, bool doRecv, bool planeOnly) {
66 
67    if (domain.numRanks() == 1)
68       return ;
69 
70    /* post recieve buffers for all incoming messages */
71    int myRank ;
72    Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
73    Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
74    Index_t pmsg = 0 ; /* plane comm msg */
75    Index_t emsg = 0 ; /* edge comm msg */
76    Index_t cmsg = 0 ; /* corner comm msg */
77    MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE) ;
78    bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
79 
80    /* assume communication to 6 neighbors by default */
81    rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
82 
83    if (domain.rowLoc() == 0) {
84       rowMin = false ;
85    }
86    if (domain.rowLoc() == (domain.tp()-1)) {
87       rowMax = false ;
88    }
89    if (domain.colLoc() == 0) {
90       colMin = false ;
91    }
92    if (domain.colLoc() == (domain.tp()-1)) {
93       colMax = false ;
94    }
95    if (domain.planeLoc() == 0) {
96       planeMin = false ;
97    }
98    if (domain.planeLoc() == (domain.tp()-1)) {
99       planeMax = false ;
100    }
101 
102    for (Index_t i=0; i<26; ++i) {
103       domain.recvRequest[i] = MPI_REQUEST_NULL ;
104    }
105 
106    MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
107 
108    /* post receives */
109 
110    /* receive data from neighboring domain faces */
111    if (planeMin && doRecv) {
112       /* contiguous memory */
113       int fromRank = myRank - domain.tp()*domain.tp() ;
114       int recvCount = dx * dy * xferFields ;
115       MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
116                 recvCount, baseType, fromRank, msgType,
117                 MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
118       ++pmsg ;
119    }
120    if (planeMax) {
121       /* contiguous memory */
122       int fromRank = myRank + domain.tp()*domain.tp() ;
123       int recvCount = dx * dy * xferFields ;
124       MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
125                 recvCount, baseType, fromRank, msgType,
126                 MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
127       ++pmsg ;
128    }
129    if (rowMin && doRecv) {
130       /* semi-contiguous memory */
131       int fromRank = myRank - domain.tp() ;
132       int recvCount = dx * dz * xferFields ;
133       MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
134                 recvCount, baseType, fromRank, msgType,
135                 MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
136       ++pmsg ;
137    }
138    if (rowMax) {
139       /* semi-contiguous memory */
140       int fromRank = myRank + domain.tp() ;
141       int recvCount = dx * dz * xferFields ;
142       MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
143                 recvCount, baseType, fromRank, msgType,
144                 MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
145       ++pmsg ;
146    }
147    if (colMin && doRecv) {
148       /* scattered memory */
149       int fromRank = myRank - 1 ;
150       int recvCount = dy * dz * xferFields ;
151       MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
152                 recvCount, baseType, fromRank, msgType,
153                 MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
154       ++pmsg ;
155    }
156    if (colMax) {
157       /* scattered memory */
158       int fromRank = myRank + 1 ;
159       int recvCount = dy * dz * xferFields ;
160       MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm],
161                 recvCount, baseType, fromRank, msgType,
162                 MPI_COMM_WORLD, &domain.recvRequest[pmsg]) ;
163       ++pmsg ;
164    }
165 
166    if (!planeOnly) {
167       /* receive data from domains connected only by an edge */
168       if (rowMin && colMin && doRecv) {
169          int fromRank = myRank - domain.tp() - 1 ;
170          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
171                                          emsg * maxEdgeComm],
172                    dz * xferFields, baseType, fromRank, msgType,
173                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
174          ++emsg ;
175       }
176 
177       if (rowMin && planeMin && doRecv) {
178          int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() ;
179          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
180                                          emsg * maxEdgeComm],
181                    dx * xferFields, baseType, fromRank, msgType,
182                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
183          ++emsg ;
184       }
185 
186       if (colMin && planeMin && doRecv) {
187          int fromRank = myRank - domain.tp()*domain.tp() - 1 ;
188          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
189                                          emsg * maxEdgeComm],
190                    dy * xferFields, baseType, fromRank, msgType,
191                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
192          ++emsg ;
193       }
194 
195       if (rowMax && colMax) {
196          int fromRank = myRank + domain.tp() + 1 ;
197          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
198                                          emsg * maxEdgeComm],
199                    dz * xferFields, baseType, fromRank, msgType,
200                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
201          ++emsg ;
202       }
203 
204       if (rowMax && planeMax) {
205          int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() ;
206          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
207                                          emsg * maxEdgeComm],
208                    dx * xferFields, baseType, fromRank, msgType,
209                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
210          ++emsg ;
211       }
212 
213       if (colMax && planeMax) {
214          int fromRank = myRank + domain.tp()*domain.tp() + 1 ;
215          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
216                                          emsg * maxEdgeComm],
217                    dy * xferFields, baseType, fromRank, msgType,
218                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
219          ++emsg ;
220       }
221 
222       if (rowMax && colMin) {
223          int fromRank = myRank + domain.tp() - 1 ;
224          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
225                                          emsg * maxEdgeComm],
226                    dz * xferFields, baseType, fromRank, msgType,
227                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
228          ++emsg ;
229       }
230 
231       if (rowMin && planeMax) {
232          int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() ;
233          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
234                                          emsg * maxEdgeComm],
235                    dx * xferFields, baseType, fromRank, msgType,
236                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
237          ++emsg ;
238       }
239 
240       if (colMin && planeMax) {
241          int fromRank = myRank + domain.tp()*domain.tp() - 1 ;
242          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
243                                          emsg * maxEdgeComm],
244                    dy * xferFields, baseType, fromRank, msgType,
245                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
246          ++emsg ;
247       }
248 
249       if (rowMin && colMax && doRecv) {
250          int fromRank = myRank - domain.tp() + 1 ;
251          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
252                                          emsg * maxEdgeComm],
253                    dz * xferFields, baseType, fromRank, msgType,
254                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
255          ++emsg ;
256       }
257 
258       if (rowMax && planeMin && doRecv) {
259          int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() ;
260          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
261                                          emsg * maxEdgeComm],
262                    dx * xferFields, baseType, fromRank, msgType,
263                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
264          ++emsg ;
265       }
266 
267       if (colMax && planeMin && doRecv) {
268          int fromRank = myRank - domain.tp()*domain.tp() + 1 ;
269          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
270                                          emsg * maxEdgeComm],
271                    dy * xferFields, baseType, fromRank, msgType,
272                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg]) ;
273          ++emsg ;
274       }
275 
276       /* receive data from domains connected only by a corner */
277       if (rowMin && colMin && planeMin && doRecv) {
278          /* corner at domain logical coord (0, 0, 0) */
279          int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ;
280          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
281                                          emsg * maxEdgeComm +
282                                          cmsg * CACHE_COHERENCE_PAD_REAL],
283                    xferFields, baseType, fromRank, msgType,
284                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
285          ++cmsg ;
286       }
287       if (rowMin && colMin && planeMax) {
288          /* corner at domain logical coord (0, 0, 1) */
289          int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ;
290          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
291                                          emsg * maxEdgeComm +
292                                          cmsg * CACHE_COHERENCE_PAD_REAL],
293                    xferFields, baseType, fromRank, msgType,
294                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
295          ++cmsg ;
296       }
297       if (rowMin && colMax && planeMin && doRecv) {
298          /* corner at domain logical coord (1, 0, 0) */
299          int fromRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ;
300          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
301                                          emsg * maxEdgeComm +
302                                          cmsg * CACHE_COHERENCE_PAD_REAL],
303                    xferFields, baseType, fromRank, msgType,
304                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
305          ++cmsg ;
306       }
307       if (rowMin && colMax && planeMax) {
308          /* corner at domain logical coord (1, 0, 1) */
309          int fromRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ;
310          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
311                                          emsg * maxEdgeComm +
312                                          cmsg * CACHE_COHERENCE_PAD_REAL],
313                    xferFields, baseType, fromRank, msgType,
314                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
315          ++cmsg ;
316       }
317       if (rowMax && colMin && planeMin && doRecv) {
318          /* corner at domain logical coord (0, 1, 0) */
319          int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ;
320          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
321                                          emsg * maxEdgeComm +
322                                          cmsg * CACHE_COHERENCE_PAD_REAL],
323                    xferFields, baseType, fromRank, msgType,
324                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
325          ++cmsg ;
326       }
327       if (rowMax && colMin && planeMax) {
328          /* corner at domain logical coord (0, 1, 1) */
329          int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ;
330          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
331                                          emsg * maxEdgeComm +
332                                          cmsg * CACHE_COHERENCE_PAD_REAL],
333                    xferFields, baseType, fromRank, msgType,
334                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
335          ++cmsg ;
336       }
337       if (rowMax && colMax && planeMin && doRecv) {
338          /* corner at domain logical coord (1, 1, 0) */
339          int fromRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ;
340          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
341                                          emsg * maxEdgeComm +
342                                          cmsg * CACHE_COHERENCE_PAD_REAL],
343                    xferFields, baseType, fromRank, msgType,
344                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
345          ++cmsg ;
346       }
347       if (rowMax && colMax && planeMax) {
348          /* corner at domain logical coord (1, 1, 1) */
349          int fromRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ;
350          MPI_Irecv(&domain.commDataRecv[pmsg * maxPlaneComm +
351                                          emsg * maxEdgeComm +
352                                          cmsg * CACHE_COHERENCE_PAD_REAL],
353                    xferFields, baseType, fromRank, msgType,
354                    MPI_COMM_WORLD, &domain.recvRequest[pmsg+emsg+cmsg]) ;
355          ++cmsg ;
356       }
357    }
358 }
359 
360 /******************************************/
361 
CommSend(Domain & domain,int msgType,Index_t xferFields,Domain_member * fieldData,Index_t dx,Index_t dy,Index_t dz,bool doSend,bool planeOnly)362 void CommSend(Domain& domain, int msgType,
363               Index_t xferFields, Domain_member *fieldData,
364               Index_t dx, Index_t dy, Index_t dz, bool doSend, bool planeOnly)
365 {
366 
367    if (domain.numRanks() == 1)
368       return ;
369 
370    /* post recieve buffers for all incoming messages */
371    int myRank ;
372    Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
373    Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
374    Index_t pmsg = 0 ; /* plane comm msg */
375    Index_t emsg = 0 ; /* edge comm msg */
376    Index_t cmsg = 0 ; /* corner comm msg */
377    MPI_Datatype baseType = ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE) ;
378    MPI_Status status[26] ;
379    Real_t *destAddr ;
380    bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
381    /* assume communication to 6 neighbors by default */
382    rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
383    if (domain.rowLoc() == 0) {
384       rowMin = false ;
385    }
386    if (domain.rowLoc() == (domain.tp()-1)) {
387       rowMax = false ;
388    }
389    if (domain.colLoc() == 0) {
390       colMin = false ;
391    }
392    if (domain.colLoc() == (domain.tp()-1)) {
393       colMax = false ;
394    }
395    if (domain.planeLoc() == 0) {
396       planeMin = false ;
397    }
398    if (domain.planeLoc() == (domain.tp()-1)) {
399       planeMax = false ;
400    }
401 
402    for (Index_t i=0; i<26; ++i) {
403       domain.sendRequest[i] = MPI_REQUEST_NULL ;
404    }
405 
406    MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
407 
408    /* post sends */
409 
410    if (planeMin | planeMax) {
411       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
412       int sendCount = dx * dy ;
413 
414       if (planeMin) {
415          destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
416          for (Index_t fi=0 ; fi<xferFields; ++fi) {
417             Domain_member src = fieldData[fi] ;
418             for (Index_t i=0; i<sendCount; ++i) {
419                destAddr[i] = (domain.*src)(i) ;
420             }
421             destAddr += sendCount ;
422          }
423          destAddr -= xferFields*sendCount ;
424 
425          MPI_Isend(destAddr, xferFields*sendCount, baseType,
426                    myRank - domain.tp()*domain.tp(), msgType,
427                    MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
428          ++pmsg ;
429       }
430       if (planeMax && doSend) {
431          destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
432          for (Index_t fi=0 ; fi<xferFields; ++fi) {
433             Domain_member src = fieldData[fi] ;
434             for (Index_t i=0; i<sendCount; ++i) {
435                destAddr[i] = (domain.*src)(dx*dy*(dz - 1) + i) ;
436             }
437             destAddr += sendCount ;
438          }
439          destAddr -= xferFields*sendCount ;
440 
441          MPI_Isend(destAddr, xferFields*sendCount, baseType,
442                    myRank + domain.tp()*domain.tp(), msgType,
443                    MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
444          ++pmsg ;
445       }
446    }
447    if (rowMin | rowMax) {
448       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
449       int sendCount = dx * dz ;
450 
451       if (rowMin) {
452          destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
453          for (Index_t fi=0; fi<xferFields; ++fi) {
454             Domain_member src = fieldData[fi] ;
455             for (Index_t i=0; i<dz; ++i) {
456                for (Index_t j=0; j<dx; ++j) {
457                   destAddr[i*dx+j] = (domain.*src)(i*dx*dy + j) ;
458                }
459             }
460             destAddr += sendCount ;
461          }
462          destAddr -= xferFields*sendCount ;
463 
464          MPI_Isend(destAddr, xferFields*sendCount, baseType,
465                    myRank - domain.tp(), msgType,
466                    MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
467          ++pmsg ;
468       }
469       if (rowMax && doSend) {
470          destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
471          for (Index_t fi=0; fi<xferFields; ++fi) {
472             Domain_member src = fieldData[fi] ;
473             for (Index_t i=0; i<dz; ++i) {
474                for (Index_t j=0; j<dx; ++j) {
475                   destAddr[i*dx+j] = (domain.*src)(dx*(dy - 1) + i*dx*dy + j) ;
476                }
477             }
478             destAddr += sendCount ;
479          }
480          destAddr -= xferFields*sendCount ;
481 
482          MPI_Isend(destAddr, xferFields*sendCount, baseType,
483                    myRank + domain.tp(), msgType,
484                    MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
485          ++pmsg ;
486       }
487    }
488    if (colMin | colMax) {
489       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
490       int sendCount = dy * dz ;
491 
492       if (colMin) {
493          destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
494          for (Index_t fi=0; fi<xferFields; ++fi) {
495             Domain_member src = fieldData[fi] ;
496             for (Index_t i=0; i<dz; ++i) {
497                for (Index_t j=0; j<dy; ++j) {
498                   destAddr[i*dy + j] = (domain.*src)(i*dx*dy + j*dx) ;
499                }
500             }
501             destAddr += sendCount ;
502          }
503          destAddr -= xferFields*sendCount ;
504 
505          MPI_Isend(destAddr, xferFields*sendCount, baseType,
506                    myRank - 1, msgType,
507                    MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
508          ++pmsg ;
509       }
510       if (colMax && doSend) {
511          destAddr = &domain.commDataSend[pmsg * maxPlaneComm] ;
512          for (Index_t fi=0; fi<xferFields; ++fi) {
513             Domain_member src = fieldData[fi] ;
514             for (Index_t i=0; i<dz; ++i) {
515                for (Index_t j=0; j<dy; ++j) {
516                   destAddr[i*dy + j] = (domain.*src)(dx - 1 + i*dx*dy + j*dx) ;
517                }
518             }
519             destAddr += sendCount ;
520          }
521          destAddr -= xferFields*sendCount ;
522 
523          MPI_Isend(destAddr, xferFields*sendCount, baseType,
524                    myRank + 1, msgType,
525                    MPI_COMM_WORLD, &domain.sendRequest[pmsg]) ;
526          ++pmsg ;
527       }
528    }
529 
530    if (!planeOnly) {
531       if (rowMin && colMin) {
532          int toRank = myRank - domain.tp() - 1 ;
533          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
534                                           emsg * maxEdgeComm] ;
535          for (Index_t fi=0; fi<xferFields; ++fi) {
536             Domain_member src = fieldData[fi] ;
537             for (Index_t i=0; i<dz; ++i) {
538                destAddr[i] = (domain.*src)(i*dx*dy) ;
539             }
540             destAddr += dz ;
541          }
542          destAddr -= xferFields*dz ;
543          MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
544                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
545          ++emsg ;
546       }
547 
548       if (rowMin && planeMin) {
549          int toRank = myRank - domain.tp()*domain.tp() - domain.tp() ;
550          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
551                                           emsg * maxEdgeComm] ;
552          for (Index_t fi=0; fi<xferFields; ++fi) {
553             Domain_member src = fieldData[fi] ;
554             for (Index_t i=0; i<dx; ++i) {
555                destAddr[i] = (domain.*src)(i) ;
556             }
557             destAddr += dx ;
558          }
559          destAddr -= xferFields*dx ;
560          MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
561                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
562          ++emsg ;
563       }
564 
565       if (colMin && planeMin) {
566          int toRank = myRank - domain.tp()*domain.tp() - 1 ;
567          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
568                                           emsg * maxEdgeComm] ;
569          for (Index_t fi=0; fi<xferFields; ++fi) {
570             Domain_member src = fieldData[fi] ;
571             for (Index_t i=0; i<dy; ++i) {
572                destAddr[i] = (domain.*src)(i*dx) ;
573             }
574             destAddr += dy ;
575          }
576          destAddr -= xferFields*dy ;
577          MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
578                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
579          ++emsg ;
580       }
581 
582       if (rowMax && colMax && doSend) {
583          int toRank = myRank + domain.tp() + 1 ;
584          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
585                                           emsg * maxEdgeComm] ;
586          for (Index_t fi=0; fi<xferFields; ++fi) {
587             Domain_member src = fieldData[fi] ;
588             for (Index_t i=0; i<dz; ++i) {
589                destAddr[i] = (domain.*src)(dx*dy - 1 + i*dx*dy) ;
590             }
591             destAddr += dz ;
592          }
593          destAddr -= xferFields*dz ;
594          MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
595                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
596          ++emsg ;
597       }
598 
599       if (rowMax && planeMax && doSend) {
600          int toRank = myRank + domain.tp()*domain.tp() + domain.tp() ;
601          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
602                                           emsg * maxEdgeComm] ;
603          for (Index_t fi=0; fi<xferFields; ++fi) {
604             Domain_member src = fieldData[fi] ;
605             for (Index_t i=0; i<dx; ++i) {
606               destAddr[i] = (domain.*src)(dx*(dy-1) + dx*dy*(dz-1) + i) ;
607             }
608             destAddr += dx ;
609          }
610          destAddr -= xferFields*dx ;
611          MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
612                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
613          ++emsg ;
614       }
615 
616       if (colMax && planeMax && doSend) {
617          int toRank = myRank + domain.tp()*domain.tp() + 1 ;
618          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
619                                           emsg * maxEdgeComm] ;
620          for (Index_t fi=0; fi<xferFields; ++fi) {
621             Domain_member src = fieldData[fi] ;
622             for (Index_t i=0; i<dy; ++i) {
623                destAddr[i] = (domain.*src)(dx*dy*(dz-1) + dx - 1 + i*dx) ;
624             }
625             destAddr += dy ;
626          }
627          destAddr -= xferFields*dy ;
628          MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
629                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
630          ++emsg ;
631       }
632 
633       if (rowMax && colMin && doSend) {
634          int toRank = myRank + domain.tp() - 1 ;
635          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
636                                           emsg * maxEdgeComm] ;
637          for (Index_t fi=0; fi<xferFields; ++fi) {
638             Domain_member src = fieldData[fi] ;
639             for (Index_t i=0; i<dz; ++i) {
640                destAddr[i] = (domain.*src)(dx*(dy-1) + i*dx*dy) ;
641             }
642             destAddr += dz ;
643          }
644          destAddr -= xferFields*dz ;
645          MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
646                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
647          ++emsg ;
648       }
649 
650       if (rowMin && planeMax && doSend) {
651          int toRank = myRank + domain.tp()*domain.tp() - domain.tp() ;
652          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
653                                           emsg * maxEdgeComm] ;
654          for (Index_t fi=0; fi<xferFields; ++fi) {
655             Domain_member src = fieldData[fi] ;
656             for (Index_t i=0; i<dx; ++i) {
657                destAddr[i] = (domain.*src)(dx*dy*(dz-1) + i) ;
658             }
659             destAddr += dx ;
660          }
661          destAddr -= xferFields*dx ;
662          MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
663                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
664          ++emsg ;
665       }
666 
667       if (colMin && planeMax && doSend) {
668          int toRank = myRank + domain.tp()*domain.tp() - 1 ;
669          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
670                                           emsg * maxEdgeComm] ;
671          for (Index_t fi=0; fi<xferFields; ++fi) {
672             Domain_member src = fieldData[fi] ;
673             for (Index_t i=0; i<dy; ++i) {
674                destAddr[i] = (domain.*src)(dx*dy*(dz-1) + i*dx) ;
675             }
676             destAddr += dy ;
677          }
678          destAddr -= xferFields*dy ;
679          MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
680                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
681          ++emsg ;
682       }
683 
684       if (rowMin && colMax) {
685          int toRank = myRank - domain.tp() + 1 ;
686          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
687                                           emsg * maxEdgeComm] ;
688          for (Index_t fi=0; fi<xferFields; ++fi) {
689             Domain_member src = fieldData[fi] ;
690             for (Index_t i=0; i<dz; ++i) {
691                destAddr[i] = (domain.*src)(dx - 1 + i*dx*dy) ;
692             }
693             destAddr += dz ;
694          }
695          destAddr -= xferFields*dz ;
696          MPI_Isend(destAddr, xferFields*dz, baseType, toRank, msgType,
697                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
698          ++emsg ;
699       }
700 
701       if (rowMax && planeMin) {
702          int toRank = myRank - domain.tp()*domain.tp() + domain.tp() ;
703          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
704                                           emsg * maxEdgeComm] ;
705          for (Index_t fi=0; fi<xferFields; ++fi) {
706             Domain_member src = fieldData[fi] ;
707             for (Index_t i=0; i<dx; ++i) {
708                destAddr[i] = (domain.*src)(dx*(dy - 1) + i) ;
709             }
710             destAddr += dx ;
711          }
712          destAddr -= xferFields*dx ;
713          MPI_Isend(destAddr, xferFields*dx, baseType, toRank, msgType,
714                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
715          ++emsg ;
716       }
717 
718       if (colMax && planeMin) {
719          int toRank = myRank - domain.tp()*domain.tp() + 1 ;
720          destAddr = &domain.commDataSend[pmsg * maxPlaneComm +
721                                           emsg * maxEdgeComm] ;
722          for (Index_t fi=0; fi<xferFields; ++fi) {
723             Domain_member src = fieldData[fi] ;
724             for (Index_t i=0; i<dy; ++i) {
725                destAddr[i] = (domain.*src)(dx - 1 + i*dx) ;
726             }
727             destAddr += dy ;
728          }
729          destAddr -= xferFields*dy ;
730          MPI_Isend(destAddr, xferFields*dy, baseType, toRank, msgType,
731                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg]) ;
732          ++emsg ;
733       }
734 
735       if (rowMin && colMin && planeMin) {
736          /* corner at domain logical coord (0, 0, 0) */
737          int toRank = myRank - domain.tp()*domain.tp() - domain.tp() - 1 ;
738          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
739                                                 emsg * maxEdgeComm +
740                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
741          for (Index_t fi=0; fi<xferFields; ++fi) {
742             comBuf[fi] = (domain.*fieldData[fi])(0) ;
743          }
744          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
745                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
746          ++cmsg ;
747       }
748       if (rowMin && colMin && planeMax && doSend) {
749          /* corner at domain logical coord (0, 0, 1) */
750          int toRank = myRank + domain.tp()*domain.tp() - domain.tp() - 1 ;
751          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
752                                                 emsg * maxEdgeComm +
753                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
754          Index_t idx = dx*dy*(dz - 1) ;
755          for (Index_t fi=0; fi<xferFields; ++fi) {
756             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
757          }
758          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
759                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
760          ++cmsg ;
761       }
762       if (rowMin && colMax && planeMin) {
763          /* corner at domain logical coord (1, 0, 0) */
764          int toRank = myRank - domain.tp()*domain.tp() - domain.tp() + 1 ;
765          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
766                                                 emsg * maxEdgeComm +
767                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
768          Index_t idx = dx - 1 ;
769          for (Index_t fi=0; fi<xferFields; ++fi) {
770             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
771          }
772          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
773                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
774          ++cmsg ;
775       }
776       if (rowMin && colMax && planeMax && doSend) {
777          /* corner at domain logical coord (1, 0, 1) */
778          int toRank = myRank + domain.tp()*domain.tp() - domain.tp() + 1 ;
779          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
780                                                 emsg * maxEdgeComm +
781                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
782          Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
783          for (Index_t fi=0; fi<xferFields; ++fi) {
784             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
785          }
786          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
787                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
788          ++cmsg ;
789       }
790       if (rowMax && colMin && planeMin) {
791          /* corner at domain logical coord (0, 1, 0) */
792          int toRank = myRank - domain.tp()*domain.tp() + domain.tp() - 1 ;
793          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
794                                                 emsg * maxEdgeComm +
795                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
796          Index_t idx = dx*(dy - 1) ;
797          for (Index_t fi=0; fi<xferFields; ++fi) {
798             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
799          }
800          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
801                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
802          ++cmsg ;
803       }
804       if (rowMax && colMin && planeMax && doSend) {
805          /* corner at domain logical coord (0, 1, 1) */
806          int toRank = myRank + domain.tp()*domain.tp() + domain.tp() - 1 ;
807          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
808                                                 emsg * maxEdgeComm +
809                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
810          Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
811          for (Index_t fi=0; fi<xferFields; ++fi) {
812             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
813          }
814          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
815                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
816          ++cmsg ;
817       }
818       if (rowMax && colMax && planeMin) {
819          /* corner at domain logical coord (1, 1, 0) */
820          int toRank = myRank - domain.tp()*domain.tp() + domain.tp() + 1 ;
821          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
822                                                 emsg * maxEdgeComm +
823                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
824          Index_t idx = dx*dy - 1 ;
825          for (Index_t fi=0; fi<xferFields; ++fi) {
826             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
827          }
828          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
829                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
830          ++cmsg ;
831       }
832       if (rowMax && colMax && planeMax && doSend) {
833          /* corner at domain logical coord (1, 1, 1) */
834          int toRank = myRank + domain.tp()*domain.tp() + domain.tp() + 1 ;
835          Real_t *comBuf = &domain.commDataSend[pmsg * maxPlaneComm +
836                                                 emsg * maxEdgeComm +
837                                          cmsg * CACHE_COHERENCE_PAD_REAL] ;
838          Index_t idx = dx*dy*dz - 1 ;
839          for (Index_t fi=0; fi<xferFields; ++fi) {
840             comBuf[fi] = (domain.*fieldData[fi])(idx) ;
841          }
842          MPI_Isend(comBuf, xferFields, baseType, toRank, msgType,
843                    MPI_COMM_WORLD, &domain.sendRequest[pmsg+emsg+cmsg]) ;
844          ++cmsg ;
845       }
846    }
847 
848    MPI_Waitall(26, domain.sendRequest, status) ;
849 }
850 
851 /******************************************/
852 
CommSBN(Domain & domain,int xferFields,Domain_member * fieldData)853 void CommSBN(Domain& domain, int xferFields, Domain_member *fieldData) {
854 
855    if (domain.numRanks() == 1)
856       return ;
857 
858    /* summation order should be from smallest value to largest */
859    /* or we could try out kahan summation! */
860 
861    int myRank ;
862    Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
863    Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
864    Index_t pmsg = 0 ; /* plane comm msg */
865    Index_t emsg = 0 ; /* edge comm msg */
866    Index_t cmsg = 0 ; /* corner comm msg */
867    Index_t dx = domain.sizeX() + 1 ;
868    Index_t dy = domain.sizeY() + 1 ;
869    Index_t dz = domain.sizeZ() + 1 ;
870    MPI_Status status ;
871    Real_t *srcAddr ;
872    Index_t rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
873    /* assume communication to 6 neighbors by default */
874    rowMin = rowMax = colMin = colMax = planeMin = planeMax = 1 ;
875    if (domain.rowLoc() == 0) {
876       rowMin = 0 ;
877    }
878    if (domain.rowLoc() == (domain.tp()-1)) {
879       rowMax = 0 ;
880    }
881    if (domain.colLoc() == 0) {
882       colMin = 0 ;
883    }
884    if (domain.colLoc() == (domain.tp()-1)) {
885       colMax = 0 ;
886    }
887    if (domain.planeLoc() == 0) {
888       planeMin = 0 ;
889    }
890    if (domain.planeLoc() == (domain.tp()-1)) {
891       planeMax = 0 ;
892    }
893 
894    MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
895 
896    if (planeMin | planeMax) {
897       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
898       Index_t opCount = dx * dy ;
899 
900       if (planeMin) {
901          /* contiguous memory */
902          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
903          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
904          for (Index_t fi=0 ; fi<xferFields; ++fi) {
905             Domain_member dest = fieldData[fi] ;
906             for (Index_t i=0; i<opCount; ++i) {
907                (domain.*dest)(i) += srcAddr[i] ;
908             }
909             srcAddr += opCount ;
910          }
911          ++pmsg ;
912       }
913       if (planeMax) {
914          /* contiguous memory */
915          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
916          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
917          for (Index_t fi=0 ; fi<xferFields; ++fi) {
918             Domain_member dest = fieldData[fi] ;
919             for (Index_t i=0; i<opCount; ++i) {
920                (domain.*dest)(dx*dy*(dz - 1) + i) += srcAddr[i] ;
921             }
922             srcAddr += opCount ;
923          }
924          ++pmsg ;
925       }
926    }
927 
928    if (rowMin | rowMax) {
929       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
930       Index_t opCount = dx * dz ;
931 
932       if (rowMin) {
933          /* contiguous memory */
934          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
935          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
936          for (Index_t fi=0 ; fi<xferFields; ++fi) {
937             Domain_member dest = fieldData[fi] ;
938             for (Index_t i=0; i<dz; ++i) {
939                for (Index_t j=0; j<dx; ++j) {
940                   (domain.*dest)(i*dx*dy + j) += srcAddr[i*dx + j] ;
941                }
942             }
943             srcAddr += opCount ;
944          }
945          ++pmsg ;
946       }
947       if (rowMax) {
948          /* contiguous memory */
949          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
950          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
951          for (Index_t fi=0 ; fi<xferFields; ++fi) {
952             Domain_member dest = fieldData[fi] ;
953             for (Index_t i=0; i<dz; ++i) {
954                for (Index_t j=0; j<dx; ++j) {
955                   (domain.*dest)(dx*(dy - 1) + i*dx*dy + j) += srcAddr[i*dx + j] ;
956                }
957             }
958             srcAddr += opCount ;
959          }
960          ++pmsg ;
961       }
962    }
963    if (colMin | colMax) {
964       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
965       Index_t opCount = dy * dz ;
966 
967       if (colMin) {
968          /* contiguous memory */
969          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
970          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
971          for (Index_t fi=0 ; fi<xferFields; ++fi) {
972             Domain_member dest = fieldData[fi] ;
973             for (Index_t i=0; i<dz; ++i) {
974                for (Index_t j=0; j<dy; ++j) {
975                   (domain.*dest)(i*dx*dy + j*dx) += srcAddr[i*dy + j] ;
976                }
977             }
978             srcAddr += opCount ;
979          }
980          ++pmsg ;
981       }
982       if (colMax) {
983          /* contiguous memory */
984          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
985          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
986          for (Index_t fi=0 ; fi<xferFields; ++fi) {
987             Domain_member dest = fieldData[fi] ;
988             for (Index_t i=0; i<dz; ++i) {
989                for (Index_t j=0; j<dy; ++j) {
990                   (domain.*dest)(dx - 1 + i*dx*dy + j*dx) += srcAddr[i*dy + j] ;
991                }
992             }
993             srcAddr += opCount ;
994          }
995          ++pmsg ;
996       }
997    }
998 
999    if (rowMin & colMin) {
1000       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1001                                        emsg * maxEdgeComm] ;
1002       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1003       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1004          Domain_member dest = fieldData[fi] ;
1005          for (Index_t i=0; i<dz; ++i) {
1006             (domain.*dest)(i*dx*dy) += srcAddr[i] ;
1007          }
1008          srcAddr += dz ;
1009       }
1010       ++emsg ;
1011    }
1012 
1013    if (rowMin & planeMin) {
1014       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1015                                        emsg * maxEdgeComm] ;
1016       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1017       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1018          Domain_member dest = fieldData[fi] ;
1019          for (Index_t i=0; i<dx; ++i) {
1020             (domain.*dest)(i) += srcAddr[i] ;
1021          }
1022          srcAddr += dx ;
1023       }
1024       ++emsg ;
1025    }
1026 
1027    if (colMin & planeMin) {
1028       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1029                                        emsg * maxEdgeComm] ;
1030       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1031       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1032          Domain_member dest = fieldData[fi] ;
1033          for (Index_t i=0; i<dy; ++i) {
1034             (domain.*dest)(i*dx) += srcAddr[i] ;
1035          }
1036          srcAddr += dy ;
1037       }
1038       ++emsg ;
1039    }
1040 
1041    if (rowMax & colMax) {
1042       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1043                                        emsg * maxEdgeComm] ;
1044       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1045       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1046          Domain_member dest = fieldData[fi] ;
1047          for (Index_t i=0; i<dz; ++i) {
1048             (domain.*dest)(dx*dy - 1 + i*dx*dy) += srcAddr[i] ;
1049          }
1050          srcAddr += dz ;
1051       }
1052       ++emsg ;
1053    }
1054 
1055    if (rowMax & planeMax) {
1056       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1057                                        emsg * maxEdgeComm] ;
1058       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1059       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1060          Domain_member dest = fieldData[fi] ;
1061          for (Index_t i=0; i<dx; ++i) {
1062             (domain.*dest)(dx*(dy-1) + dx*dy*(dz-1) + i) += srcAddr[i] ;
1063          }
1064          srcAddr += dx ;
1065       }
1066       ++emsg ;
1067    }
1068 
1069    if (colMax & planeMax) {
1070       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1071                                        emsg * maxEdgeComm] ;
1072       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1073       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1074          Domain_member dest = fieldData[fi] ;
1075          for (Index_t i=0; i<dy; ++i) {
1076             (domain.*dest)(dx*dy*(dz-1) + dx - 1 + i*dx) += srcAddr[i] ;
1077          }
1078          srcAddr += dy ;
1079       }
1080       ++emsg ;
1081    }
1082 
1083    if (rowMax & colMin) {
1084       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1085                                        emsg * maxEdgeComm] ;
1086       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1087       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1088          Domain_member dest = fieldData[fi] ;
1089          for (Index_t i=0; i<dz; ++i) {
1090             (domain.*dest)(dx*(dy-1) + i*dx*dy) += srcAddr[i] ;
1091          }
1092          srcAddr += dz ;
1093       }
1094       ++emsg ;
1095    }
1096 
1097    if (rowMin & planeMax) {
1098       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1099                                        emsg * maxEdgeComm] ;
1100       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1101       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1102          Domain_member dest = fieldData[fi] ;
1103          for (Index_t i=0; i<dx; ++i) {
1104             (domain.*dest)(dx*dy*(dz-1) + i) += srcAddr[i] ;
1105          }
1106          srcAddr += dx ;
1107       }
1108       ++emsg ;
1109    }
1110 
1111    if (colMin & planeMax) {
1112       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1113                                        emsg * maxEdgeComm] ;
1114       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1115       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1116          Domain_member dest = fieldData[fi] ;
1117          for (Index_t i=0; i<dy; ++i) {
1118             (domain.*dest)(dx*dy*(dz-1) + i*dx) += srcAddr[i] ;
1119          }
1120          srcAddr += dy ;
1121       }
1122       ++emsg ;
1123    }
1124 
1125    if (rowMin & colMax) {
1126       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1127                                        emsg * maxEdgeComm] ;
1128       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1129       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1130          Domain_member dest = fieldData[fi] ;
1131          for (Index_t i=0; i<dz; ++i) {
1132             (domain.*dest)(dx - 1 + i*dx*dy) += srcAddr[i] ;
1133          }
1134          srcAddr += dz ;
1135       }
1136       ++emsg ;
1137    }
1138 
1139    if (rowMax & planeMin) {
1140       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1141                                        emsg * maxEdgeComm] ;
1142       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1143       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1144          Domain_member dest = fieldData[fi] ;
1145          for (Index_t i=0; i<dx; ++i) {
1146             (domain.*dest)(dx*(dy - 1) + i) += srcAddr[i] ;
1147          }
1148          srcAddr += dx ;
1149       }
1150       ++emsg ;
1151    }
1152 
1153    if (colMax & planeMin) {
1154       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1155                                        emsg * maxEdgeComm] ;
1156       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1157       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1158          Domain_member dest = fieldData[fi] ;
1159          for (Index_t i=0; i<dy; ++i) {
1160             (domain.*dest)(dx - 1 + i*dx) += srcAddr[i] ;
1161          }
1162          srcAddr += dy ;
1163       }
1164       ++emsg ;
1165    }
1166 
1167    if (rowMin & colMin & planeMin) {
1168       /* corner at domain logical coord (0, 0, 0) */
1169       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1170                                              emsg * maxEdgeComm +
1171                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1172       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1173       for (Index_t fi=0; fi<xferFields; ++fi) {
1174          (domain.*fieldData[fi])(0) += comBuf[fi] ;
1175       }
1176       ++cmsg ;
1177    }
1178    if (rowMin & colMin & planeMax) {
1179       /* corner at domain logical coord (0, 0, 1) */
1180       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1181                                              emsg * maxEdgeComm +
1182                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1183       Index_t idx = dx*dy*(dz - 1) ;
1184       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1185       for (Index_t fi=0; fi<xferFields; ++fi) {
1186          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1187       }
1188       ++cmsg ;
1189    }
1190    if (rowMin & colMax & planeMin) {
1191       /* corner at domain logical coord (1, 0, 0) */
1192       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1193                                              emsg * maxEdgeComm +
1194                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1195       Index_t idx = dx - 1 ;
1196       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1197       for (Index_t fi=0; fi<xferFields; ++fi) {
1198          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1199       }
1200       ++cmsg ;
1201    }
1202    if (rowMin & colMax & planeMax) {
1203       /* corner at domain logical coord (1, 0, 1) */
1204       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1205                                              emsg * maxEdgeComm +
1206                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1207       Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
1208       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1209       for (Index_t fi=0; fi<xferFields; ++fi) {
1210          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1211       }
1212       ++cmsg ;
1213    }
1214    if (rowMax & colMin & planeMin) {
1215       /* corner at domain logical coord (0, 1, 0) */
1216       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1217                                              emsg * maxEdgeComm +
1218                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1219       Index_t idx = dx*(dy - 1) ;
1220       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1221       for (Index_t fi=0; fi<xferFields; ++fi) {
1222          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1223       }
1224       ++cmsg ;
1225    }
1226    if (rowMax & colMin & planeMax) {
1227       /* corner at domain logical coord (0, 1, 1) */
1228       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1229                                              emsg * maxEdgeComm +
1230                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1231       Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
1232       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1233       for (Index_t fi=0; fi<xferFields; ++fi) {
1234          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1235       }
1236       ++cmsg ;
1237    }
1238    if (rowMax & colMax & planeMin) {
1239       /* corner at domain logical coord (1, 1, 0) */
1240       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1241                                              emsg * maxEdgeComm +
1242                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1243       Index_t idx = dx*dy - 1 ;
1244       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1245       for (Index_t fi=0; fi<xferFields; ++fi) {
1246          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1247       }
1248       ++cmsg ;
1249    }
1250    if (rowMax & colMax & planeMax) {
1251       /* corner at domain logical coord (1, 1, 1) */
1252       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1253                                              emsg * maxEdgeComm +
1254                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1255       Index_t idx = dx*dy*dz - 1 ;
1256       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1257       for (Index_t fi=0; fi<xferFields; ++fi) {
1258          (domain.*fieldData[fi])(idx) += comBuf[fi] ;
1259       }
1260       ++cmsg ;
1261    }
1262 }
1263 
1264 /******************************************/
1265 
CommSyncPosVel(Domain & domain)1266 void CommSyncPosVel(Domain& domain) {
1267 
1268    if (domain.numRanks() == 1)
1269       return ;
1270 
1271    int myRank ;
1272    bool doRecv = false ;
1273    Index_t xferFields = 6 ; /* x, y, z, xd, yd, zd */
1274    Domain_member fieldData[6] ;
1275    Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
1276    Index_t maxEdgeComm  = xferFields * domain.maxEdgeSize() ;
1277    Index_t pmsg = 0 ; /* plane comm msg */
1278    Index_t emsg = 0 ; /* edge comm msg */
1279    Index_t cmsg = 0 ; /* corner comm msg */
1280    Index_t dx = domain.sizeX() + 1 ;
1281    Index_t dy = domain.sizeY() + 1 ;
1282    Index_t dz = domain.sizeZ() + 1 ;
1283    MPI_Status status ;
1284    Real_t *srcAddr ;
1285    bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
1286 
1287    /* assume communication to 6 neighbors by default */
1288    rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
1289    if (domain.rowLoc() == 0) {
1290       rowMin = false ;
1291    }
1292    if (domain.rowLoc() == (domain.tp()-1)) {
1293       rowMax = false ;
1294    }
1295    if (domain.colLoc() == 0) {
1296       colMin = false ;
1297    }
1298    if (domain.colLoc() == (domain.tp()-1)) {
1299       colMax = false ;
1300    }
1301    if (domain.planeLoc() == 0) {
1302       planeMin = false ;
1303    }
1304    if (domain.planeLoc() == (domain.tp()-1)) {
1305       planeMax = false ;
1306    }
1307 
1308    fieldData[0] = &Domain::x ;
1309    fieldData[1] = &Domain::y ;
1310    fieldData[2] = &Domain::z ;
1311    fieldData[3] = &Domain::xd ;
1312    fieldData[4] = &Domain::yd ;
1313    fieldData[5] = &Domain::zd ;
1314 
1315    MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
1316 
1317    if (planeMin | planeMax) {
1318       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
1319       Index_t opCount = dx * dy ;
1320 
1321       if (planeMin && doRecv) {
1322          /* contiguous memory */
1323          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1324          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1325          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1326             Domain_member dest = fieldData[fi] ;
1327             for (Index_t i=0; i<opCount; ++i) {
1328                (domain.*dest)(i) = srcAddr[i] ;
1329             }
1330             srcAddr += opCount ;
1331          }
1332          ++pmsg ;
1333       }
1334       if (planeMax) {
1335          /* contiguous memory */
1336          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1337          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1338          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1339             Domain_member dest = fieldData[fi] ;
1340             for (Index_t i=0; i<opCount; ++i) {
1341                (domain.*dest)(dx*dy*(dz - 1) + i) = srcAddr[i] ;
1342             }
1343             srcAddr += opCount ;
1344          }
1345          ++pmsg ;
1346       }
1347    }
1348 
1349    if (rowMin | rowMax) {
1350       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
1351       Index_t opCount = dx * dz ;
1352 
1353       if (rowMin && doRecv) {
1354          /* contiguous memory */
1355          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1356          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1357          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1358             Domain_member dest = fieldData[fi] ;
1359             for (Index_t i=0; i<dz; ++i) {
1360                for (Index_t j=0; j<dx; ++j) {
1361                   (domain.*dest)(i*dx*dy + j) = srcAddr[i*dx + j] ;
1362                }
1363             }
1364             srcAddr += opCount ;
1365          }
1366          ++pmsg ;
1367       }
1368       if (rowMax) {
1369          /* contiguous memory */
1370          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1371          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1372          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1373             Domain_member dest = fieldData[fi] ;
1374             for (Index_t i=0; i<dz; ++i) {
1375                for (Index_t j=0; j<dx; ++j) {
1376                   (domain.*dest)(dx*(dy - 1) + i*dx*dy + j) = srcAddr[i*dx + j] ;
1377                }
1378             }
1379             srcAddr += opCount ;
1380          }
1381          ++pmsg ;
1382       }
1383    }
1384 
1385    if (colMin | colMax) {
1386       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
1387       Index_t opCount = dy * dz ;
1388 
1389       if (colMin && doRecv) {
1390          /* contiguous memory */
1391          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1392          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1393          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1394             Domain_member dest = fieldData[fi] ;
1395             for (Index_t i=0; i<dz; ++i) {
1396                for (Index_t j=0; j<dy; ++j) {
1397                   (domain.*dest)(i*dx*dy + j*dx) = srcAddr[i*dy + j] ;
1398                }
1399             }
1400             srcAddr += opCount ;
1401          }
1402          ++pmsg ;
1403       }
1404       if (colMax) {
1405          /* contiguous memory */
1406          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1407          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1408          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1409             Domain_member dest = fieldData[fi] ;
1410             for (Index_t i=0; i<dz; ++i) {
1411                for (Index_t j=0; j<dy; ++j) {
1412                   (domain.*dest)(dx - 1 + i*dx*dy + j*dx) = srcAddr[i*dy + j] ;
1413                }
1414             }
1415             srcAddr += opCount ;
1416          }
1417          ++pmsg ;
1418       }
1419    }
1420 
1421    if (rowMin && colMin && doRecv) {
1422       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1423                                        emsg * maxEdgeComm] ;
1424       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1425       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1426          Domain_member dest = fieldData[fi] ;
1427          for (Index_t i=0; i<dz; ++i) {
1428             (domain.*dest)(i*dx*dy) = srcAddr[i] ;
1429          }
1430          srcAddr += dz ;
1431       }
1432       ++emsg ;
1433    }
1434 
1435    if (rowMin && planeMin && doRecv) {
1436       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1437                                        emsg * maxEdgeComm] ;
1438       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1439       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1440          Domain_member dest = fieldData[fi] ;
1441          for (Index_t i=0; i<dx; ++i) {
1442             (domain.*dest)(i) = srcAddr[i] ;
1443          }
1444          srcAddr += dx ;
1445       }
1446       ++emsg ;
1447    }
1448 
1449    if (colMin && planeMin && doRecv) {
1450       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1451                                        emsg * maxEdgeComm] ;
1452       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1453       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1454          Domain_member dest = fieldData[fi] ;
1455          for (Index_t i=0; i<dy; ++i) {
1456             (domain.*dest)(i*dx) = srcAddr[i] ;
1457          }
1458          srcAddr += dy ;
1459       }
1460       ++emsg ;
1461    }
1462 
1463    if (rowMax && colMax) {
1464       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1465                                        emsg * maxEdgeComm] ;
1466       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1467       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1468          Domain_member dest = fieldData[fi] ;
1469          for (Index_t i=0; i<dz; ++i) {
1470             (domain.*dest)(dx*dy - 1 + i*dx*dy) = srcAddr[i] ;
1471          }
1472          srcAddr += dz ;
1473       }
1474       ++emsg ;
1475    }
1476 
1477    if (rowMax && planeMax) {
1478       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1479                                        emsg * maxEdgeComm] ;
1480       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1481       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1482          Domain_member dest = fieldData[fi] ;
1483          for (Index_t i=0; i<dx; ++i) {
1484             (domain.*dest)(dx*(dy-1) + dx*dy*(dz-1) + i) = srcAddr[i] ;
1485          }
1486          srcAddr += dx ;
1487       }
1488       ++emsg ;
1489    }
1490 
1491    if (colMax && planeMax) {
1492       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1493                                        emsg * maxEdgeComm] ;
1494       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1495       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1496          Domain_member dest = fieldData[fi] ;
1497          for (Index_t i=0; i<dy; ++i) {
1498             (domain.*dest)(dx*dy*(dz-1) + dx - 1 + i*dx) = srcAddr[i] ;
1499          }
1500          srcAddr += dy ;
1501       }
1502       ++emsg ;
1503    }
1504 
1505    if (rowMax && colMin) {
1506       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1507                                        emsg * maxEdgeComm] ;
1508       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1509       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1510          Domain_member dest = fieldData[fi] ;
1511          for (Index_t i=0; i<dz; ++i) {
1512             (domain.*dest)(dx*(dy-1) + i*dx*dy) = srcAddr[i] ;
1513          }
1514          srcAddr += dz ;
1515       }
1516       ++emsg ;
1517    }
1518 
1519    if (rowMin && planeMax) {
1520       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1521                                        emsg * maxEdgeComm] ;
1522       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1523       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1524          Domain_member dest = fieldData[fi] ;
1525          for (Index_t i=0; i<dx; ++i) {
1526             (domain.*dest)(dx*dy*(dz-1) + i) = srcAddr[i] ;
1527          }
1528          srcAddr += dx ;
1529       }
1530       ++emsg ;
1531    }
1532 
1533    if (colMin && planeMax) {
1534       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1535                                        emsg * maxEdgeComm] ;
1536       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1537       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1538          Domain_member dest = fieldData[fi] ;
1539          for (Index_t i=0; i<dy; ++i) {
1540             (domain.*dest)(dx*dy*(dz-1) + i*dx) = srcAddr[i] ;
1541          }
1542          srcAddr += dy ;
1543       }
1544       ++emsg ;
1545    }
1546 
1547    if (rowMin && colMax && doRecv) {
1548       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1549                                        emsg * maxEdgeComm] ;
1550       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1551       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1552          Domain_member dest = fieldData[fi] ;
1553          for (Index_t i=0; i<dz; ++i) {
1554             (domain.*dest)(dx - 1 + i*dx*dy) = srcAddr[i] ;
1555          }
1556          srcAddr += dz ;
1557       }
1558       ++emsg ;
1559    }
1560 
1561    if (rowMax && planeMin && doRecv) {
1562       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1563                                        emsg * maxEdgeComm] ;
1564       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1565       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1566          Domain_member dest = fieldData[fi] ;
1567          for (Index_t i=0; i<dx; ++i) {
1568             (domain.*dest)(dx*(dy - 1) + i) = srcAddr[i] ;
1569          }
1570          srcAddr += dx ;
1571       }
1572       ++emsg ;
1573    }
1574 
1575    if (colMax && planeMin && doRecv) {
1576       srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm +
1577                                        emsg * maxEdgeComm] ;
1578       MPI_Wait(&domain.recvRequest[pmsg+emsg], &status) ;
1579       for (Index_t fi=0 ; fi<xferFields; ++fi) {
1580          Domain_member dest = fieldData[fi] ;
1581          for (Index_t i=0; i<dy; ++i) {
1582             (domain.*dest)(dx - 1 + i*dx) = srcAddr[i] ;
1583          }
1584          srcAddr += dy ;
1585       }
1586       ++emsg ;
1587    }
1588 
1589 
1590    if (rowMin && colMin && planeMin && doRecv) {
1591       /* corner at domain logical coord (0, 0, 0) */
1592       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1593                                              emsg * maxEdgeComm +
1594                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1595       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1596       for (Index_t fi=0; fi<xferFields; ++fi) {
1597          (domain.*fieldData[fi])(0) = comBuf[fi] ;
1598       }
1599       ++cmsg ;
1600    }
1601    if (rowMin && colMin && planeMax) {
1602       /* corner at domain logical coord (0, 0, 1) */
1603       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1604                                              emsg * maxEdgeComm +
1605                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1606       Index_t idx = dx*dy*(dz - 1) ;
1607       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1608       for (Index_t fi=0; fi<xferFields; ++fi) {
1609          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1610       }
1611       ++cmsg ;
1612    }
1613    if (rowMin && colMax && planeMin && doRecv) {
1614       /* corner at domain logical coord (1, 0, 0) */
1615       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1616                                              emsg * maxEdgeComm +
1617                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1618       Index_t idx = dx - 1 ;
1619       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1620       for (Index_t fi=0; fi<xferFields; ++fi) {
1621          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1622       }
1623       ++cmsg ;
1624    }
1625    if (rowMin && colMax && planeMax) {
1626       /* corner at domain logical coord (1, 0, 1) */
1627       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1628                                              emsg * maxEdgeComm +
1629                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1630       Index_t idx = dx*dy*(dz - 1) + (dx - 1) ;
1631       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1632       for (Index_t fi=0; fi<xferFields; ++fi) {
1633          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1634       }
1635       ++cmsg ;
1636    }
1637    if (rowMax && colMin && planeMin && doRecv) {
1638       /* corner at domain logical coord (0, 1, 0) */
1639       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1640                                              emsg * maxEdgeComm +
1641                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1642       Index_t idx = dx*(dy - 1) ;
1643       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1644       for (Index_t fi=0; fi<xferFields; ++fi) {
1645          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1646       }
1647       ++cmsg ;
1648    }
1649    if (rowMax && colMin && planeMax) {
1650       /* corner at domain logical coord (0, 1, 1) */
1651       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1652                                              emsg * maxEdgeComm +
1653                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1654       Index_t idx = dx*dy*(dz - 1) + dx*(dy - 1) ;
1655       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1656       for (Index_t fi=0; fi<xferFields; ++fi) {
1657          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1658       }
1659       ++cmsg ;
1660    }
1661    if (rowMax && colMax && planeMin && doRecv) {
1662       /* corner at domain logical coord (1, 1, 0) */
1663       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1664                                              emsg * maxEdgeComm +
1665                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1666       Index_t idx = dx*dy - 1 ;
1667       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1668       for (Index_t fi=0; fi<xferFields; ++fi) {
1669          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1670       }
1671       ++cmsg ;
1672    }
1673    if (rowMax && colMax && planeMax) {
1674       /* corner at domain logical coord (1, 1, 1) */
1675       Real_t *comBuf = &domain.commDataRecv[pmsg * maxPlaneComm +
1676                                              emsg * maxEdgeComm +
1677                                       cmsg * CACHE_COHERENCE_PAD_REAL] ;
1678       Index_t idx = dx*dy*dz - 1 ;
1679       MPI_Wait(&domain.recvRequest[pmsg+emsg+cmsg], &status) ;
1680       for (Index_t fi=0; fi<xferFields; ++fi) {
1681          (domain.*fieldData[fi])(idx) = comBuf[fi] ;
1682       }
1683       ++cmsg ;
1684    }
1685 }
1686 
1687 /******************************************/
1688 
CommMonoQ(Domain & domain)1689 void CommMonoQ(Domain& domain)
1690 {
1691    if (domain.numRanks() == 1)
1692       return ;
1693 
1694    int myRank ;
1695    Index_t xferFields = 3 ; /* delv_xi, delv_eta, delv_zeta */
1696    Domain_member fieldData[3] ;
1697    Index_t fieldOffset[3] ;
1698    Index_t maxPlaneComm = xferFields * domain.maxPlaneSize() ;
1699    Index_t pmsg = 0 ; /* plane comm msg */
1700    Index_t dx = domain.sizeX() ;
1701    Index_t dy = domain.sizeY() ;
1702    Index_t dz = domain.sizeZ() ;
1703    MPI_Status status ;
1704    Real_t *srcAddr ;
1705    bool rowMin, rowMax, colMin, colMax, planeMin, planeMax ;
1706    /* assume communication to 6 neighbors by default */
1707    rowMin = rowMax = colMin = colMax = planeMin = planeMax = true ;
1708    if (domain.rowLoc() == 0) {
1709       rowMin = false ;
1710    }
1711    if (domain.rowLoc() == (domain.tp()-1)) {
1712       rowMax = false ;
1713    }
1714    if (domain.colLoc() == 0) {
1715       colMin = false ;
1716    }
1717    if (domain.colLoc() == (domain.tp()-1)) {
1718       colMax = false ;
1719    }
1720    if (domain.planeLoc() == 0) {
1721       planeMin = false ;
1722    }
1723    if (domain.planeLoc() == (domain.tp()-1)) {
1724       planeMax = false ;
1725    }
1726 
1727    /* point into ghost data area */
1728    // fieldData[0] = &(domain.delv_xi(domain.numElem())) ;
1729    // fieldData[1] = &(domain.delv_eta(domain.numElem())) ;
1730    // fieldData[2] = &(domain.delv_zeta(domain.numElem())) ;
1731    fieldData[0] = &Domain::delv_xi ;
1732    fieldData[1] = &Domain::delv_eta ;
1733    fieldData[2] = &Domain::delv_zeta ;
1734    fieldOffset[0] = domain.numElem() ;
1735    fieldOffset[1] = domain.numElem() ;
1736    fieldOffset[2] = domain.numElem() ;
1737 
1738 
1739    MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
1740 
1741    if (planeMin | planeMax) {
1742       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
1743       Index_t opCount = dx * dy ;
1744 
1745       if (planeMin) {
1746          /* contiguous memory */
1747          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1748          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1749          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1750             Domain_member dest = fieldData[fi] ;
1751             for (Index_t i=0; i<opCount; ++i) {
1752                (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
1753             }
1754             srcAddr += opCount ;
1755             fieldOffset[fi] += opCount ;
1756          }
1757          ++pmsg ;
1758       }
1759       if (planeMax) {
1760          /* contiguous memory */
1761          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1762          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1763          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1764             Domain_member dest = fieldData[fi] ;
1765             for (Index_t i=0; i<opCount; ++i) {
1766                (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
1767             }
1768             srcAddr += opCount ;
1769             fieldOffset[fi] += opCount ;
1770          }
1771          ++pmsg ;
1772       }
1773    }
1774 
1775    if (rowMin | rowMax) {
1776       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
1777       Index_t opCount = dx * dz ;
1778 
1779       if (rowMin) {
1780          /* contiguous memory */
1781          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1782          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1783          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1784             Domain_member dest = fieldData[fi] ;
1785             for (Index_t i=0; i<opCount; ++i) {
1786                (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
1787             }
1788             srcAddr += opCount ;
1789             fieldOffset[fi] += opCount ;
1790          }
1791          ++pmsg ;
1792       }
1793       if (rowMax) {
1794          /* contiguous memory */
1795          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1796          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1797          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1798             Domain_member dest = fieldData[fi] ;
1799             for (Index_t i=0; i<opCount; ++i) {
1800                (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
1801             }
1802             srcAddr += opCount ;
1803             fieldOffset[fi] += opCount ;
1804          }
1805          ++pmsg ;
1806       }
1807    }
1808    if (colMin | colMax) {
1809       /* ASSUMING ONE DOMAIN PER RANK, CONSTANT BLOCK SIZE HERE */
1810       Index_t opCount = dy * dz ;
1811 
1812       if (colMin) {
1813          /* contiguous memory */
1814          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1815          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1816          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1817             Domain_member dest = fieldData[fi] ;
1818             for (Index_t i=0; i<opCount; ++i) {
1819                (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
1820             }
1821             srcAddr += opCount ;
1822             fieldOffset[fi] += opCount ;
1823          }
1824          ++pmsg ;
1825       }
1826       if (colMax) {
1827          /* contiguous memory */
1828          srcAddr = &domain.commDataRecv[pmsg * maxPlaneComm] ;
1829          MPI_Wait(&domain.recvRequest[pmsg], &status) ;
1830          for (Index_t fi=0 ; fi<xferFields; ++fi) {
1831             Domain_member dest = fieldData[fi] ;
1832             for (Index_t i=0; i<opCount; ++i) {
1833                (domain.*dest)(fieldOffset[fi] + i) = srcAddr[i] ;
1834             }
1835             srcAddr += opCount ;
1836          }
1837          ++pmsg ;
1838       }
1839    }
1840 }
1841 
1842 #endif
1843