1 // clang-format off
2 /* ----------------------------------------------------------------------
3    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
4    https://www.lammps.org/, Sandia National Laboratories
5    Steve Plimpton, sjplimp@sandia.gov
6 
7    Copyright (2003) Sandia Corporation.  Under the terms of Contract
8    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
9    certain rights in this software.  This software is distributed under
10    the GNU General Public License.
11 
12    See the README file in the top-level LAMMPS directory.
13 ------------------------------------------------------------------------- */
14 
15 #include "rcb.h"
16 
17 #include "irregular.h"
18 #include "memory.h"
19 
20 #include <cstring>
21 
22 using namespace LAMMPS_NS;
23 
24 #define MYHUGE 1.0e30
25 #define TINY 1.0e-6
26 #define DELTA 16384
27 
28 // prototypes for non-class functions
29 
30 void box_merge(void *, void *, int *, MPI_Datatype *);
31 void median_merge(void *, void *, int *, MPI_Datatype *);
32 
33 // NOTE: if want to have reuse flag, need to sum Tree across procs
34 
35 /* ---------------------------------------------------------------------- */
36 
RCB(LAMMPS * lmp)37 RCB::RCB(LAMMPS *lmp) : Pointers(lmp)
38 {
39   MPI_Comm_rank(world,&me);
40   MPI_Comm_size(world,&nprocs);
41 
42   ndot = maxdot = 0;
43   dots = nullptr;
44 
45   nlist = maxlist = 0;
46   dotlist = dotmark = dotmark_select = nullptr;
47 
48   maxbuf = 0;
49   buf = nullptr;
50 
51   maxrecv = maxsend = 0;
52   recvproc = recvindex = sendproc = sendindex = nullptr;
53 
54   tree = nullptr;
55   irregular = nullptr;
56 
57   // create MPI data and function types for box and median AllReduce ops
58 
59   MPI_Type_contiguous(6,MPI_DOUBLE,&box_type);
60   MPI_Type_commit(&box_type);
61   MPI_Type_contiguous(sizeof(Median),MPI_CHAR,&med_type);
62   MPI_Type_commit(&med_type);
63 
64   MPI_Op_create(box_merge,1,&box_op);
65   MPI_Op_create(median_merge,1,&med_op);
66 
67   reuse = 0;
68 }
69 
70 /* ---------------------------------------------------------------------- */
71 
~RCB()72 RCB::~RCB()
73 {
74   memory->sfree(dots);
75   memory->destroy(dotlist);
76   memory->destroy(dotmark);
77   memory->destroy(dotmark_select);
78   memory->sfree(buf);
79 
80   memory->destroy(recvproc);
81   memory->destroy(recvindex);
82   memory->destroy(sendproc);
83   memory->destroy(sendindex);
84 
85   memory->sfree(tree);
86   delete irregular;
87 
88   MPI_Type_free(&med_type);
89   MPI_Type_free(&box_type);
90   MPI_Op_free(&box_op);
91   MPI_Op_free(&med_op);
92 }
93 
94 /* ----------------------------------------------------------------------
95    perform RCB balancing of N particles at coords X in bounding box LO/HI
96    NEW version: each RCB cut is tested in all dimensions
97      dimeension that produces 2 boxes with largest min size is selected
98      this is to prevent very narrow boxes from being produced
99    if wt = nullptr, ignore per-particle weights
100    if wt defined, per-particle weights > 0.0
101    dimension = 2 or 3
102    as documented in rcb.h:
103      sets noriginal,nfinal,nkeep,recvproc,recvindex,lo,hi
104    all proc particles will be inside or on surface of 3-d box
105      defined by final lo/hi
106    // NOTE: worry about re-use of data structs for fix balance?
107 ------------------------------------------------------------------------- */
108 
compute(int dimension,int n,double ** x,double * wt,double * bboxlo,double * bboxhi)109 void RCB::compute(int dimension, int n, double **x, double *wt,
110                   double *bboxlo, double *bboxhi)
111 {
112   int i,j,k;
113   int keep,outgoing,incoming,incoming2;
114   int dim,markactive;
115   int indexlo,indexhi;
116   int first_iteration,breakflag;
117   double wttot,wtlo,wthi,wtsum,wtok,wtupto,wtmax;
118   double targetlo,targethi;
119   double valuemin,valuemax,valuehalf,valuehalf_select,smaller;
120   double tolerance;
121   MPI_Comm comm,comm_half;
122   MPI_Request request,request2;
123   Median med,medme;
124 
125   // create list of my Dots
126 
127   ndot = nkeep = noriginal = n;
128 
129   if (ndot > maxdot) {
130     maxdot = ndot;
131     memory->sfree(dots);
132     dots = (Dot *) memory->smalloc(ndot*sizeof(Dot),"RCB:dots");
133   }
134 
135   for (i = 0; i < ndot; i++) {
136     dots[i].x[0] = x[i][0];
137     dots[i].x[1] = x[i][1];
138     dots[i].x[2] = x[i][2];
139     dots[i].proc = me;
140     dots[i].index = i;
141   }
142 
143   if (wt)
144     for (i = 0; i < ndot; i++) dots[i].wt = wt[i];
145   else
146     for (i = 0; i < ndot; i++) dots[i].wt = 1.0;
147 
148   // initial bounding box = simulation box
149   // includes periodic or shrink-wrapped boundaries
150 
151   lo = bbox.lo;
152   hi = bbox.hi;
153 
154   lo[0] = bboxlo[0];
155   lo[1] = bboxlo[1];
156   lo[2] = bboxlo[2];
157   hi[0] = bboxhi[0];
158   hi[1] = bboxhi[1];
159   hi[2] = bboxhi[2];
160 
161   cut = 0.0;
162   cutdim = -1;
163 
164   // initialize counters
165 
166   counters[0] = 0;
167   counters[1] = 0;
168   counters[2] = 0;
169   counters[3] = ndot;
170   counters[4] = maxdot;
171   counters[5] = 0;
172   counters[6] = 0;
173 
174   // create communicator for use in recursion
175 
176   MPI_Comm_dup(world,&comm);
177 
178   // recurse until partition is a single proc = me
179   // proclower,procupper = lower,upper procs in partition
180   // procmid = 1st proc in upper half of partition
181 
182   int procpartner,procpartner2;
183 
184   int procmid;
185   int proclower = 0;
186   int procupper = nprocs - 1;
187 
188   while (proclower != procupper) {
189 
190     // if odd # of procs, lower partition gets extra one
191 
192     procmid = proclower + (procupper - proclower) / 2 + 1;
193 
194     // determine communication partner(s)
195     // readnumber = # of proc partners to read from
196 
197     if (me < procmid)
198       procpartner = me + (procmid - proclower);
199     else
200       procpartner = me - (procmid - proclower);
201 
202     int readnumber = 1;
203     if (procpartner > procupper) {
204       readnumber = 0;
205       procpartner--;
206     }
207     if (me == procupper && procpartner != procmid - 1) {
208       readnumber = 2;
209       procpartner2 = procpartner + 1;
210     }
211 
212     // wttot = summed weight of entire partition
213     // search tolerance = largest single weight (plus epsilon)
214     // targetlo = desired weight in lower half of partition
215     // targethi = desired weight in upper half of partition
216 
217     wtmax = wtsum = 0.0;
218 
219     if (wt) {
220       for (i = 0; i < ndot; i++) {
221         wtsum += dots[i].wt;
222         if (dots[i].wt > wtmax) wtmax = dots[i].wt;
223       }
224     } else {
225       for (i = 0; i < ndot; i++) wtsum += dots[i].wt;
226       wtmax = 1.0;
227     }
228 
229     MPI_Allreduce(&wtsum,&wttot,1,MPI_DOUBLE,MPI_SUM,comm);
230     if (wt) MPI_Allreduce(&wtmax,&tolerance,1,MPI_DOUBLE,MPI_MAX,comm);
231     else tolerance = 1.0;
232 
233     tolerance *= 1.0 + TINY;
234     targetlo = wttot * (procmid - proclower) / (procupper + 1 - proclower);
235     targethi = wttot - targetlo;
236 
237     // attempt a cut in each dimension
238     // each cut produces 2 boxes, each with a reduced box length in that dim
239     // smaller = the smaller of the 2 reduced box lengths in that dimension
240     // choose to cut in dimension which produces largest smaller value
241     // this should induce final proc sub-boxes to be as cube-ish as possible
242     // dim_select = selected cut dimension
243     // valuehalf_select = valuehalf in that dimension
244     // dotmark_select = dot markings in that dimension
245     // initialize largest = -1.0 to insure a cut in some dim is accepted
246     //   e.g. if current recursed box is size 0 in all dims
247 
248     int dim_select = -1;
249     double largest = -1.0;
250 
251     for (dim = 0; dim < dimension; dim++) {
252 
253       // create active list and mark array for dots
254       // initialize active list to all dots
255 
256       if (ndot > maxlist) {
257         memory->destroy(dotlist);
258         memory->destroy(dotmark);
259         memory->destroy(dotmark_select);
260         maxlist = maxdot;
261         memory->create(dotlist,maxlist,"RCB:dotlist");
262         memory->create(dotmark,maxlist,"RCB:dotmark");
263         memory->create(dotmark_select,maxlist,"RCB:dotmark_select");
264       }
265 
266       nlist = ndot;
267       for (i = 0; i < nlist; i++) dotlist[i] = i;
268 
269       // median iteration
270       // zoom in on bisector until correct # of dots in each half of partition
271       // as each iteration of median-loop begins, require:
272       //   all non-active dots are marked with 0/1 in dotmark
273       //   valuemin <= every active dot <= valuemax
274       //   wtlo, wthi = total wt of non-active dots
275       // when leave median-loop, require only:
276       //   valuehalf = correct cut position
277       //   all dots <= valuehalf are marked with 0 in dotmark
278       //   all dots >= valuehalf are marked with 1 in dotmark
279       // markactive = which side of cut is active = 0/1
280       // indexlo,indexhi = indices of dot closest to median
281 
282       wtlo = wthi = 0.0;
283       valuemin = lo[dim];
284       valuemax = hi[dim];
285       first_iteration = 1;
286       indexlo = indexhi = 0;
287 
288       while (1) {
289 
290         // choose bisector value
291         // use old value on 1st iteration if old cut dimension is the same
292         // on 2nd option: could push valuehalf towards geometric center
293         //   with "1.0-factor" to force overshoot
294 
295         if (first_iteration && reuse && dim == tree[procmid].dim) {
296           counters[5]++;
297           valuehalf = tree[procmid].cut;
298           if (valuehalf < valuemin || valuehalf > valuemax)
299             valuehalf = 0.5 * (valuemin + valuemax);
300         } else if (wt)
301           valuehalf = valuemin + (targetlo - wtlo) /
302             (wttot - wtlo - wthi) * (valuemax - valuemin);
303         else
304           valuehalf = 0.5 * (valuemin + valuemax);
305 
306         first_iteration = 0;
307 
308         // initialize local median data structure
309 
310         medme.totallo = medme.totalhi = 0.0;
311         medme.valuelo = -MYHUGE;
312         medme.valuehi = MYHUGE;
313         medme.wtlo = medme.wthi = 0.0;
314         medme.countlo = medme.counthi = 0;
315         medme.proclo = medme.prochi = me;
316 
317         // mark all active dots on one side or other of bisector
318         // also set all fields in median data struct
319         // save indices of closest dots on either side
320 
321         for (j = 0; j < nlist; j++) {
322           i = dotlist[j];
323           if (dots[i].x[dim] <= valuehalf) {            // in lower part
324             medme.totallo += dots[i].wt;
325             dotmark[i] = 0;
326             if (dots[i].x[dim] > medme.valuelo) {       // my closest dot
327               medme.valuelo = dots[i].x[dim];
328               medme.wtlo = dots[i].wt;
329               medme.countlo = 1;
330               indexlo = i;
331             } else if (dots[i].x[dim] == medme.valuelo) {   // tied for closest
332               medme.wtlo += dots[i].wt;
333               medme.countlo++;
334             }
335           }
336           else {                                        // in upper part
337             medme.totalhi += dots[i].wt;
338             dotmark[i] = 1;
339             if (dots[i].x[dim] < medme.valuehi) {       // my closest dot
340               medme.valuehi = dots[i].x[dim];
341               medme.wthi = dots[i].wt;
342               medme.counthi = 1;
343               indexhi = i;
344             } else if (dots[i].x[dim] == medme.valuehi) {   // tied for closest
345               medme.wthi += dots[i].wt;
346               medme.counthi++;
347             }
348           }
349         }
350 
351         // combine median data struct across current subset of procs
352 
353         counters[0]++;
354         MPI_Allreduce(&medme,&med,1,med_type,med_op,comm);
355 
356         // test median guess for convergence
357         // move additional dots that are next to cut across it
358 
359         if (wtlo + med.totallo < targetlo) {    // lower half TOO SMALL
360 
361           wtlo += med.totallo;
362           valuehalf = med.valuehi;
363 
364           if (med.counthi == 1) {                  // only one dot to move
365             if (wtlo + med.wthi < targetlo) {  // move it, keep iterating
366               if (me == med.prochi) dotmark[indexhi] = 0;
367             }
368             else {                                 // only move if beneficial
369               if (wtlo + med.wthi - targetlo < targetlo - wtlo)
370                 if (me == med.prochi) dotmark[indexhi] = 0;
371               break;                               // all done
372             }
373           }
374           else {                                   // multiple dots to move
375             breakflag = 0;
376             wtok = 0.0;
377             if (medme.valuehi == med.valuehi) wtok = medme.wthi;
378             if (wtlo + med.wthi >= targetlo) {                // all done
379               MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm);
380               wtmax = targetlo - wtlo;
381               if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
382               breakflag = 1;
383             }                                      // wtok = most I can move
384             for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) {
385               i = dotlist[j];
386               if (dots[i].x[dim] == med.valuehi) { // only move if better
387                 if (wtsum + dots[i].wt - wtok < wtok - wtsum)
388                   dotmark[i] = 0;
389                 wtsum += dots[i].wt;
390               }
391             }
392             if (breakflag) break;                   // done if moved enough
393           }
394 
395           wtlo += med.wthi;
396           if (targetlo-wtlo <= tolerance) break;  // close enough
397 
398           valuemin = med.valuehi;                   // iterate again
399           markactive = 1;
400         }
401 
402         else if (wthi + med.totalhi < targethi) {  // upper half TOO SMALL
403 
404           wthi += med.totalhi;
405           valuehalf = med.valuelo;
406 
407           if (med.countlo == 1) {                  // only one dot to move
408             if (wthi + med.wtlo < targethi) {  // move it, keep iterating
409               if (me == med.proclo) dotmark[indexlo] = 1;
410             }
411             else {                                 // only move if beneficial
412               if (wthi + med.wtlo - targethi < targethi - wthi)
413                 if (me == med.proclo) dotmark[indexlo] = 1;
414               break;                               // all done
415             }
416           }
417           else {                                   // multiple dots to move
418             breakflag = 0;
419             wtok = 0.0;
420             if (medme.valuelo == med.valuelo) wtok = medme.wtlo;
421             if (wthi + med.wtlo >= targethi) {                // all done
422               MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm);
423               wtmax = targethi - wthi;
424               if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
425               breakflag = 1;
426             }                                      // wtok = most I can move
427             for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) {
428               i = dotlist[j];
429               if (dots[i].x[dim] == med.valuelo) { // only move if better
430                 if (wtsum + dots[i].wt - wtok < wtok - wtsum)
431                   dotmark[i] = 1;
432                 wtsum += dots[i].wt;
433               }
434             }
435             if (breakflag) break;                   // done if moved enough
436           }
437 
438           wthi += med.wtlo;
439           if (targethi-wthi <= tolerance) break;  // close enough
440 
441           valuemax = med.valuelo;                   // iterate again
442           markactive = 0;
443         }
444 
445         else                  // Goldilocks result: both partitions just right
446           break;
447 
448         // shrink the active list
449 
450         k = 0;
451         for (j = 0; j < nlist; j++) {
452           i = dotlist[j];
453           if (dotmark[i] == markactive) dotlist[k++] = i;
454         }
455         nlist = k;
456       }
457 
458       // cut produces 2 sub-boxes with reduced size in dim
459       // compare smaller of the 2 sizes to previous dims
460       // keep dim that has the largest smaller
461 
462       smaller = MIN(valuehalf-lo[dim],hi[dim]-valuehalf);
463       if (smaller > largest) {
464         largest = smaller;
465         dim_select = dim;
466         valuehalf_select = valuehalf;
467         if (ndot > 0) memcpy(dotmark_select,dotmark,ndot*sizeof(int));
468       }
469     }
470 
471     // copy results for best dim cut into dim,valuehalf,dotmark
472 
473     dim = dim_select;
474     valuehalf = valuehalf_select;
475     if (ndot > 0) memcpy(dotmark,dotmark_select,ndot*sizeof(int));
476 
477     // special case for zero box width
478     // can occur when all dots are on corner vertices of this sub-box
479     // split box on longest dimension
480     // reset dotmark for that cut
481 
482     if (largest == 0.0) {
483       dim = 0;
484       if (hi[1]-lo[1] > hi[0]-lo[0]) dim = 1;
485       if (dimension == 3 && hi[2]-lo[2] > hi[dim]-lo[dim]) dim = 2;
486       valuehalf = 0.5* (lo[dim] + hi[dim]);
487 
488       for (j = 0; j < nlist; j++) {
489         i = dotlist[j];
490         if (dots[i].x[dim] <= valuehalf) dotmark[i] = 0;
491         else dotmark[i] = 1;
492       }
493     }
494 
495     // found median
496     // store cut info only if I am procmid
497 
498     if (me == procmid) {
499       cut = valuehalf;
500       cutdim = dim;
501     }
502 
503     // use cut to shrink my RCB bounding box
504 
505     if (me < procmid) hi[dim] = valuehalf;
506     else lo[dim] = valuehalf;
507 
508     // outgoing = number of dots to ship to partner
509     // nkeep = number of dots that have never migrated
510 
511     markactive = (me < procpartner);
512     for (i = 0, keep = 0, outgoing = 0; i < ndot; i++)
513       if (dotmark[i] == markactive) outgoing++;
514       else if (i < nkeep) keep++;
515     nkeep = keep;
516 
517     // alert partner how many dots I'll send, read how many I'll recv
518 
519     MPI_Send(&outgoing,1,MPI_INT,procpartner,0,world);
520     incoming = 0;
521     if (readnumber) {
522       MPI_Recv(&incoming,1,MPI_INT,procpartner,0,world,MPI_STATUS_IGNORE);
523       if (readnumber == 2) {
524         MPI_Recv(&incoming2,1,MPI_INT,procpartner2,0,world,MPI_STATUS_IGNORE);
525         incoming += incoming2;
526       }
527     }
528 
529     // check if need to alloc more space
530 
531     int ndotnew = ndot - outgoing + incoming;
532     if (ndotnew > maxdot) {
533       while (maxdot < ndotnew) maxdot += DELTA;
534       dots = (Dot *) memory->srealloc(dots,maxdot*sizeof(Dot),"RCB::dots");
535       counters[6]++;
536     }
537 
538     counters[1] += outgoing;
539     counters[2] += incoming;
540     if (ndotnew > counters[3]) counters[3] = ndotnew;
541     if (maxdot > counters[4]) counters[4] = maxdot;
542 
543     // malloc comm send buffer
544 
545     if (outgoing > maxbuf) {
546       memory->sfree(buf);
547       maxbuf = outgoing;
548       buf = (Dot *) memory->smalloc(maxbuf*sizeof(Dot),"RCB:buf");
549     }
550 
551     // fill buffer with dots that are marked for sending
552     // pack down the unmarked ones
553 
554     keep = outgoing = 0;
555     for (i = 0; i < ndot; i++) {
556       if (dotmark[i] == markactive)
557         memcpy(&buf[outgoing++],&dots[i],sizeof(Dot));
558       else
559         memmove(&dots[keep++],&dots[i],sizeof(Dot));
560     }
561 
562     // post receives for dots
563 
564     if (readnumber > 0) {
565       MPI_Irecv(&dots[keep],incoming*sizeof(Dot),MPI_CHAR,
566                 procpartner,1,world,&request);
567       if (readnumber == 2) {
568         keep += incoming - incoming2;
569         MPI_Irecv(&dots[keep],incoming2*sizeof(Dot),MPI_CHAR,
570                   procpartner2,1,world,&request2);
571       }
572     }
573 
574     // handshake before sending dots to insure recvs have been posted
575 
576     if (readnumber > 0) {
577       MPI_Send(nullptr,0,MPI_INT,procpartner,0,world);
578       if (readnumber == 2) MPI_Send(nullptr,0,MPI_INT,procpartner2,0,world);
579     }
580     MPI_Recv(nullptr,0,MPI_INT,procpartner,0,world,MPI_STATUS_IGNORE);
581 
582     // send dots to partner
583 
584     MPI_Rsend(buf,outgoing*sizeof(Dot),MPI_CHAR,procpartner,1,world);
585 
586     // wait until all dots are received
587 
588     if (readnumber > 0) {
589       MPI_Wait(&request,MPI_STATUS_IGNORE);
590       if (readnumber == 2) MPI_Wait(&request2,MPI_STATUS_IGNORE);
591     }
592 
593     ndot = ndotnew;
594 
595     // cut partition in half, create new communicators of 1/2 size
596 
597     int split;
598     if (me < procmid) {
599       procupper = procmid - 1;
600       split = 0;
601     } else {
602       proclower = procmid;
603       split = 1;
604     }
605 
606     MPI_Comm_split(comm,split,me,&comm_half);
607     MPI_Comm_free(&comm);
608     comm = comm_half;
609   }
610 
611   // clean up
612 
613   MPI_Comm_free(&comm);
614 
615   // set public variables with results of rebalance
616 
617   nfinal = ndot;
618 
619   if (nfinal > maxrecv) {
620     memory->destroy(recvproc);
621     memory->destroy(recvindex);
622     maxrecv = nfinal;
623     memory->create(recvproc,maxrecv,"RCB:recvproc");
624     memory->create(recvindex,maxrecv,"RCB:recvindex");
625   }
626 
627   for (i = 0; i < nfinal; i++) {
628     recvproc[i] = dots[i].proc;
629     recvindex[i] = dots[i].index;
630   }
631 }
632 
633 /* ----------------------------------------------------------------------
634    perform RCB balancing of N particles at coords X in bounding box LO/HI
635    OLD version: each RCB cut is made in longest dimension of sub-box
636    if wt = nullptr, ignore per-particle weights
637    if wt defined, per-particle weights > 0.0
638    dimension = 2 or 3
639    as documented in rcb.h:
640      sets noriginal,nfinal,nkeep,recvproc,recvindex,lo,hi
641    all proc particles will be inside or on surface of 3-d box
642      defined by final lo/hi
643    // NOTE: worry about re-use of data structs for fix balance?
644 ------------------------------------------------------------------------- */
645 
compute_old(int dimension,int n,double ** x,double * wt,double * bboxlo,double * bboxhi)646 void RCB::compute_old(int dimension, int n, double **x, double *wt,
647                       double *bboxlo, double *bboxhi)
648 {
649   int i,j,k;
650   int keep,outgoing,incoming,incoming2;
651   int dim,markactive;
652   int indexlo,indexhi;
653   int first_iteration,breakflag;
654   double wttot,wtlo,wthi,wtsum,wtok,wtupto,wtmax;
655   double targetlo,targethi;
656   double valuemin,valuemax,valuehalf;
657   double tolerance;
658   MPI_Comm comm,comm_half;
659   MPI_Request request,request2;
660   Median med,medme;
661 
662   // create list of my Dots
663 
664   ndot = nkeep = noriginal = n;
665 
666   if (ndot > maxdot) {
667     maxdot = ndot;
668     memory->sfree(dots);
669     dots = (Dot *) memory->smalloc(ndot*sizeof(Dot),"RCB:dots");
670   }
671 
672   for (i = 0; i < ndot; i++) {
673     dots[i].x[0] = x[i][0];
674     dots[i].x[1] = x[i][1];
675     dots[i].x[2] = x[i][2];
676     dots[i].proc = me;
677     dots[i].index = i;
678   }
679 
680   if (wt)
681     for (i = 0; i < ndot; i++) dots[i].wt = wt[i];
682   else
683     for (i = 0; i < ndot; i++) dots[i].wt = 1.0;
684 
685   // initial bounding box = simulation box
686   // includes periodic or shrink-wrapped boundaries
687 
688   lo = bbox.lo;
689   hi = bbox.hi;
690 
691   lo[0] = bboxlo[0];
692   lo[1] = bboxlo[1];
693   lo[2] = bboxlo[2];
694   hi[0] = bboxhi[0];
695   hi[1] = bboxhi[1];
696   hi[2] = bboxhi[2];
697 
698   cut = 0.0;
699   cutdim = -1;
700 
701   // initialize counters
702 
703   counters[0] = 0;
704   counters[1] = 0;
705   counters[2] = 0;
706   counters[3] = ndot;
707   counters[4] = maxdot;
708   counters[5] = 0;
709   counters[6] = 0;
710 
711   // create communicator for use in recursion
712 
713   MPI_Comm_dup(world,&comm);
714 
715   // recurse until partition is a single proc = me
716   // proclower,procupper = lower,upper procs in partition
717   // procmid = 1st proc in upper half of partition
718 
719   int procpartner,procpartner2;
720 
721   int procmid;
722   int proclower = 0;
723   int procupper = nprocs - 1;
724 
725   while (proclower != procupper) {
726 
727     // if odd # of procs, lower partition gets extra one
728 
729     procmid = proclower + (procupper - proclower) / 2 + 1;
730 
731     // determine communication partner(s)
732     // readnumber = # of proc partners to read from
733 
734     if (me < procmid)
735       procpartner = me + (procmid - proclower);
736     else
737       procpartner = me - (procmid - proclower);
738 
739     int readnumber = 1;
740     if (procpartner > procupper) {
741       readnumber = 0;
742       procpartner--;
743     }
744     if (me == procupper && procpartner != procmid - 1) {
745       readnumber = 2;
746       procpartner2 = procpartner + 1;
747     }
748 
749     // wttot = summed weight of entire partition
750     // search tolerance = largest single weight (plus epsilon)
751     // targetlo = desired weight in lower half of partition
752     // targethi = desired weight in upper half of partition
753 
754     wtmax = wtsum = 0.0;
755 
756     if (wt) {
757       for (i = 0; i < ndot; i++) {
758         wtsum += dots[i].wt;
759         if (dots[i].wt > wtmax) wtmax = dots[i].wt;
760       }
761     } else {
762       for (i = 0; i < ndot; i++) wtsum += dots[i].wt;
763       wtmax = 1.0;
764     }
765 
766     MPI_Allreduce(&wtsum,&wttot,1,MPI_DOUBLE,MPI_SUM,comm);
767     if (wt) MPI_Allreduce(&wtmax,&tolerance,1,MPI_DOUBLE,MPI_MAX,comm);
768     else tolerance = 1.0;
769 
770     tolerance *= 1.0 + TINY;
771     targetlo = wttot * (procmid - proclower) / (procupper + 1 - proclower);
772     targethi = wttot - targetlo;
773 
774     // dim = dimension to bisect on
775     // do not allow choice of z dimension for 2d system
776 
777     dim = 0;
778     if (hi[1]-lo[1] > hi[0]-lo[0]) dim = 1;
779     if (dimension == 3) {
780       if (dim == 0 && hi[2]-lo[2] > hi[0]-lo[0]) dim = 2;
781       if (dim == 1 && hi[2]-lo[2] > hi[1]-lo[1]) dim = 2;
782     }
783 
784     // create active list and mark array for dots
785     // initialize active list to all dots
786 
787     if (ndot > maxlist) {
788       memory->destroy(dotlist);
789       memory->destroy(dotmark);
790       maxlist = maxdot;
791       memory->create(dotlist,maxlist,"RCB:dotlist");
792       memory->create(dotmark,maxlist,"RCB:dotmark");
793     }
794 
795     nlist = ndot;
796     for (i = 0; i < nlist; i++) dotlist[i] = i;
797 
798     // median iteration
799     // zoom in on bisector until correct # of dots in each half of partition
800     // as each iteration of median-loop begins, require:
801     //   all non-active dots are marked with 0/1 in dotmark
802     //   valuemin <= every active dot <= valuemax
803     //   wtlo, wthi = total wt of non-active dots
804     // when leave median-loop, require only:
805     //   valuehalf = correct cut position
806     //   all dots <= valuehalf are marked with 0 in dotmark
807     //   all dots >= valuehalf are marked with 1 in dotmark
808     // markactive = which side of cut is active = 0/1
809     // indexlo,indexhi = indices of dot closest to median
810 
811     wtlo = wthi = 0.0;
812     valuemin = lo[dim];
813     valuemax = hi[dim];
814     first_iteration = 1;
815     indexlo = indexhi = 0;
816 
817     while (1) {
818 
819       // choose bisector value
820       // use old value on 1st iteration if old cut dimension is the same
821       // on 2nd option: could push valuehalf towards geometric center
822       //   with "1.0-factor" to force overshoot
823 
824       if (first_iteration && reuse && dim == tree[procmid].dim) {
825         counters[5]++;
826         valuehalf = tree[procmid].cut;
827         if (valuehalf < valuemin || valuehalf > valuemax)
828           valuehalf = 0.5 * (valuemin + valuemax);
829       } else if (wt)
830         valuehalf = valuemin + (targetlo - wtlo) /
831           (wttot - wtlo - wthi) * (valuemax - valuemin);
832       else
833         valuehalf = 0.5 * (valuemin + valuemax);
834 
835       first_iteration = 0;
836 
837       // initialize local median data structure
838 
839       medme.totallo = medme.totalhi = 0.0;
840       medme.valuelo = -MYHUGE;
841       medme.valuehi = MYHUGE;
842       medme.wtlo = medme.wthi = 0.0;
843       medme.countlo = medme.counthi = 0;
844       medme.proclo = medme.prochi = me;
845 
846       // mark all active dots on one side or other of bisector
847       // also set all fields in median data struct
848       // save indices of closest dots on either side
849 
850       for (j = 0; j < nlist; j++) {
851         i = dotlist[j];
852         if (dots[i].x[dim] <= valuehalf) {            // in lower part
853           medme.totallo += dots[i].wt;
854           dotmark[i] = 0;
855           if (dots[i].x[dim] > medme.valuelo) {       // my closest dot
856             medme.valuelo = dots[i].x[dim];
857             medme.wtlo = dots[i].wt;
858             medme.countlo = 1;
859             indexlo = i;
860           } else if (dots[i].x[dim] == medme.valuelo) {   // tied for closest
861             medme.wtlo += dots[i].wt;
862             medme.countlo++;
863           }
864         }
865         else {                                        // in upper part
866           medme.totalhi += dots[i].wt;
867           dotmark[i] = 1;
868           if (dots[i].x[dim] < medme.valuehi) {       // my closest dot
869             medme.valuehi = dots[i].x[dim];
870             medme.wthi = dots[i].wt;
871             medme.counthi = 1;
872             indexhi = i;
873           } else if (dots[i].x[dim] == medme.valuehi) {   // tied for closest
874             medme.wthi += dots[i].wt;
875             medme.counthi++;
876           }
877         }
878       }
879 
880       // combine median data struct across current subset of procs
881 
882       counters[0]++;
883       MPI_Allreduce(&medme,&med,1,med_type,med_op,comm);
884 
885       // test median guess for convergence
886       // move additional dots that are next to cut across it
887 
888       if (wtlo + med.totallo < targetlo) {    // lower half TOO SMALL
889 
890         wtlo += med.totallo;
891         valuehalf = med.valuehi;
892 
893         if (med.counthi == 1) {                  // only one dot to move
894           if (wtlo + med.wthi < targetlo) {  // move it, keep iterating
895             if (me == med.prochi) dotmark[indexhi] = 0;
896           }
897           else {                                 // only move if beneficial
898             if (wtlo + med.wthi - targetlo < targetlo - wtlo)
899               if (me == med.prochi) dotmark[indexhi] = 0;
900             break;                               // all done
901           }
902         }
903         else {                                   // multiple dots to move
904           breakflag = 0;
905           wtok = 0.0;
906           if (medme.valuehi == med.valuehi) wtok = medme.wthi;
907           if (wtlo + med.wthi >= targetlo) {                // all done
908             MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm);
909             wtmax = targetlo - wtlo;
910             if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
911             breakflag = 1;
912           }                                      // wtok = most I can move
913           for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) {
914             i = dotlist[j];
915             if (dots[i].x[dim] == med.valuehi) { // only move if better
916               if (wtsum + dots[i].wt - wtok < wtok - wtsum)
917                 dotmark[i] = 0;
918               wtsum += dots[i].wt;
919             }
920           }
921           if (breakflag) break;                   // done if moved enough
922         }
923 
924         wtlo += med.wthi;
925         if (targetlo-wtlo <= tolerance) break;  // close enough
926 
927         valuemin = med.valuehi;                   // iterate again
928         markactive = 1;
929       }
930 
931       else if (wthi + med.totalhi < targethi) {  // upper half TOO SMALL
932 
933         wthi += med.totalhi;
934         valuehalf = med.valuelo;
935 
936         if (med.countlo == 1) {                  // only one dot to move
937           if (wthi + med.wtlo < targethi) {  // move it, keep iterating
938             if (me == med.proclo) dotmark[indexlo] = 1;
939           }
940           else {                                 // only move if beneficial
941             if (wthi + med.wtlo - targethi < targethi - wthi)
942               if (me == med.proclo) dotmark[indexlo] = 1;
943             break;                               // all done
944           }
945         }
946         else {                                   // multiple dots to move
947           breakflag = 0;
948           wtok = 0.0;
949           if (medme.valuelo == med.valuelo) wtok = medme.wtlo;
950           if (wthi + med.wtlo >= targethi) {                // all done
951             MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm);
952             wtmax = targethi - wthi;
953             if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
954             breakflag = 1;
955           }                                      // wtok = most I can move
956           for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) {
957             i = dotlist[j];
958             if (dots[i].x[dim] == med.valuelo) { // only move if better
959               if (wtsum + dots[i].wt - wtok < wtok - wtsum)
960                 dotmark[i] = 1;
961               wtsum += dots[i].wt;
962             }
963           }
964           if (breakflag) break;                   // done if moved enough
965         }
966 
967         wthi += med.wtlo;
968         if (targethi-wthi <= tolerance) break;  // close enough
969 
970         valuemax = med.valuelo;                   // iterate again
971         markactive = 0;
972       }
973 
974       else                  // Goldilocks result: both partitions just right
975         break;
976 
977       // shrink the active list
978 
979       k = 0;
980       for (j = 0; j < nlist; j++) {
981         i = dotlist[j];
982         if (dotmark[i] == markactive) dotlist[k++] = i;
983       }
984       nlist = k;
985     }
986 
987     // found median
988     // store cut info only if I am procmid
989 
990     if (me == procmid) {
991       cut = valuehalf;
992       cutdim = dim;
993     }
994 
995     // use cut to shrink my RCB bounding box
996 
997     if (me < procmid) hi[dim] = valuehalf;
998     else lo[dim] = valuehalf;
999 
1000     // outgoing = number of dots to ship to partner
1001     // nkeep = number of dots that have never migrated
1002 
1003     markactive = (me < procpartner);
1004     for (i = 0, keep = 0, outgoing = 0; i < ndot; i++)
1005       if (dotmark[i] == markactive) outgoing++;
1006       else if (i < nkeep) keep++;
1007     nkeep = keep;
1008 
1009     // alert partner how many dots I'll send, read how many I'll recv
1010 
1011     MPI_Send(&outgoing,1,MPI_INT,procpartner,0,world);
1012     incoming = 0;
1013     if (readnumber) {
1014       MPI_Recv(&incoming,1,MPI_INT,procpartner,0,world,MPI_STATUS_IGNORE);
1015       if (readnumber == 2) {
1016         MPI_Recv(&incoming2,1,MPI_INT,procpartner2,0,world,MPI_STATUS_IGNORE);
1017         incoming += incoming2;
1018       }
1019     }
1020 
1021     // check if need to alloc more space
1022 
1023     int ndotnew = ndot - outgoing + incoming;
1024     if (ndotnew > maxdot) {
1025       while (maxdot < ndotnew) maxdot += DELTA;
1026       dots = (Dot *) memory->srealloc(dots,maxdot*sizeof(Dot),"RCB::dots");
1027       counters[6]++;
1028     }
1029 
1030     counters[1] += outgoing;
1031     counters[2] += incoming;
1032     if (ndotnew > counters[3]) counters[3] = ndotnew;
1033     if (maxdot > counters[4]) counters[4] = maxdot;
1034 
1035     // malloc comm send buffer
1036 
1037     if (outgoing > maxbuf) {
1038       memory->sfree(buf);
1039       maxbuf = outgoing;
1040       buf = (Dot *) memory->smalloc(maxbuf*sizeof(Dot),"RCB:buf");
1041     }
1042 
1043     // fill buffer with dots that are marked for sending
1044     // pack down the unmarked ones
1045 
1046     keep = outgoing = 0;
1047     for (i = 0; i < ndot; i++) {
1048       if (dotmark[i] == markactive)
1049         memcpy(&buf[outgoing++],&dots[i],sizeof(Dot));
1050       else
1051         memmove(&dots[keep++],&dots[i],sizeof(Dot));
1052     }
1053 
1054     // post receives for dots
1055 
1056     if (readnumber > 0) {
1057       MPI_Irecv(&dots[keep],incoming*sizeof(Dot),MPI_CHAR,
1058                 procpartner,1,world,&request);
1059       if (readnumber == 2) {
1060         keep += incoming - incoming2;
1061         MPI_Irecv(&dots[keep],incoming2*sizeof(Dot),MPI_CHAR,
1062                   procpartner2,1,world,&request2);
1063       }
1064     }
1065 
1066     // handshake before sending dots to insure recvs have been posted
1067 
1068     if (readnumber > 0) {
1069       MPI_Send(nullptr,0,MPI_INT,procpartner,0,world);
1070       if (readnumber == 2) MPI_Send(nullptr,0,MPI_INT,procpartner2,0,world);
1071     }
1072     MPI_Recv(nullptr,0,MPI_INT,procpartner,0,world,MPI_STATUS_IGNORE);
1073 
1074     // send dots to partner
1075 
1076     MPI_Rsend(buf,outgoing*sizeof(Dot),MPI_CHAR,procpartner,1,world);
1077 
1078     // wait until all dots are received
1079 
1080     if (readnumber > 0) {
1081       MPI_Wait(&request,MPI_STATUS_IGNORE);
1082       if (readnumber == 2) MPI_Wait(&request2,MPI_STATUS_IGNORE);
1083     }
1084 
1085     ndot = ndotnew;
1086 
1087     // cut partition in half, create new communicators of 1/2 size
1088 
1089     int split;
1090     if (me < procmid) {
1091       procupper = procmid - 1;
1092       split = 0;
1093     } else {
1094       proclower = procmid;
1095       split = 1;
1096     }
1097 
1098     MPI_Comm_split(comm,split,me,&comm_half);
1099     MPI_Comm_free(&comm);
1100     comm = comm_half;
1101   }
1102 
1103   // clean up
1104 
1105   MPI_Comm_free(&comm);
1106 
1107   // set public variables with results of rebalance
1108 
1109   nfinal = ndot;
1110 
1111   if (nfinal > maxrecv) {
1112     memory->destroy(recvproc);
1113     memory->destroy(recvindex);
1114     maxrecv = nfinal;
1115     memory->create(recvproc,maxrecv,"RCB:recvproc");
1116     memory->create(recvindex,maxrecv,"RCB:recvindex");
1117   }
1118 
1119   for (i = 0; i < nfinal; i++) {
1120     recvproc[i] = dots[i].proc;
1121     recvindex[i] = dots[i].index;
1122   }
1123 }
1124 
1125 /* ----------------------------------------------------------------------
1126    custom MPI reduce operation
1127    merge of each component of an RCB bounding box
1128 ------------------------------------------------------------------------- */
1129 
box_merge(void * in,void * inout,int *,MPI_Datatype *)1130 void box_merge(void *in, void *inout, int * /*len*/, MPI_Datatype * /*dptr*/)
1131 
1132 {
1133   RCB::BBox *box1 = (RCB::BBox *) in;
1134   RCB::BBox *box2 = (RCB::BBox *) inout;
1135 
1136   for (int i = 0; i < 3; i++) {
1137     if (box1->lo[i] < box2->lo[i]) box2->lo[i] = box1->lo[i];
1138     if (box1->hi[i] > box2->hi[i]) box2->hi[i] = box1->hi[i];
1139   }
1140 }
1141 
1142 /* ----------------------------------------------------------------------
1143    custom MPI reduce operation
1144    merge median data structure
1145    on input:
1146      in,inout->totallo, totalhi = weight in both partitions on this proc
1147                valuelo, valuehi = pos of nearest dot(s) to cut on this proc
1148                wtlo, wthi       = total wt of dot(s) at that pos on this proc
1149                countlo, counthi = # of dot(s) nearest to cut on this proc
1150                proclo, prochi   = not used
1151    on exit:
1152      inout->   totallo, totalhi = total # of active dots in both partitions
1153                valuelo, valuehi = pos of nearest dot(s) to cut
1154                wtlo, wthi       = total wt of dot(s) at that position
1155                countlo, counthi = total # of dot(s) nearest to cut
1156                proclo, prochi   = one unique proc who owns a nearest dot
1157                                   all procs must get same proclo,prochi
1158 ------------------------------------------------------------------------- */
1159 
median_merge(void * in,void * inout,int *,MPI_Datatype *)1160 void median_merge(void *in, void *inout, int * /*len*/, MPI_Datatype * /*dptr*/)
1161 
1162 {
1163   RCB::Median *med1 = (RCB::Median *) in;
1164   RCB::Median *med2 = (RCB::Median *) inout;
1165 
1166   med2->totallo += med1->totallo;
1167   if (med1->valuelo > med2->valuelo) {
1168     med2->valuelo = med1->valuelo;
1169     med2->wtlo = med1->wtlo;
1170     med2->countlo = med1->countlo;
1171     med2->proclo = med1->proclo;
1172   }
1173   else if (med1->valuelo == med2->valuelo) {
1174     med2->wtlo += med1->wtlo;
1175     med2->countlo += med1->countlo;
1176     if (med1->proclo < med2->proclo) med2->proclo = med1->proclo;
1177   }
1178 
1179   med2->totalhi += med1->totalhi;
1180   if (med1->valuehi < med2->valuehi) {
1181     med2->valuehi = med1->valuehi;
1182     med2->wthi = med1->wthi;
1183     med2->counthi = med1->counthi;
1184     med2->prochi = med1->prochi;
1185   }
1186   else if (med1->valuehi == med2->valuehi) {
1187     med2->wthi += med1->wthi;
1188     med2->counthi += med1->counthi;
1189     if (med1->prochi < med2->prochi) med2->prochi = med1->prochi;
1190   }
1191 }
1192 
1193 /* ----------------------------------------------------------------------
1194    invert the RCB rebalance result to convert receive info into send info
1195    sortflag = flag for sorting order of received messages by proc ID
1196 ------------------------------------------------------------------------- */
1197 
invert(int sortflag)1198 void RCB::invert(int sortflag)
1199 {
1200   // only create Irregular if not previously created
1201   // allows Irregular to persist for multiple RCB calls by fix balance
1202 
1203   if (!irregular) irregular = new Irregular(lmp);
1204 
1205   // nsend = # of dots to request from other procs
1206 
1207   int nsend = nfinal-nkeep;
1208 
1209   int *proclist;
1210   memory->create(proclist,nsend,"RCB:proclist");
1211 
1212   Invert *sinvert =
1213     (Invert *) memory->smalloc(nsend*sizeof(Invert),"RCB:sinvert");
1214 
1215   int m = 0;
1216   for (int i = nkeep; i < nfinal; i++) {
1217     proclist[m] = recvproc[i];
1218     sinvert[m].rindex = recvindex[i];
1219     sinvert[m].sproc = me;
1220     sinvert[m].sindex = i;
1221     m++;
1222   }
1223 
1224   // perform inversion via irregular comm
1225   // nrecv = # of my dots to send to other procs
1226 
1227   int nrecv = irregular->create_data(nsend,proclist,sortflag);
1228   Invert *rinvert =
1229     (Invert *) memory->smalloc(nrecv*sizeof(Invert),"RCB:rinvert");
1230   irregular->exchange_data((char *) sinvert,sizeof(Invert),(char *) rinvert);
1231   irregular->destroy_data();
1232 
1233   // set public variables from requests to send my dots
1234 
1235   if (noriginal > maxsend) {
1236     memory->destroy(sendproc);
1237     memory->destroy(sendindex);
1238     maxsend = noriginal;
1239     memory->create(sendproc,maxsend,"RCB:sendproc");
1240     memory->create(sendindex,maxsend,"RCB:sendindex");
1241   }
1242 
1243   for (int i = 0; i < nkeep; i++) {
1244     sendproc[recvindex[i]] = me;
1245     sendindex[recvindex[i]] = i;
1246   }
1247 
1248   for (int i = 0; i < nrecv; i++) {
1249     m = rinvert[i].rindex;
1250     sendproc[m] = rinvert[i].sproc;
1251     sendindex[m] = rinvert[i].sindex;
1252   }
1253 
1254   // clean-up
1255 
1256   memory->destroy(proclist);
1257   memory->destroy(sinvert);
1258   memory->destroy(rinvert);
1259 }
1260 
1261 /* ----------------------------------------------------------------------
1262    memory use of Irregular
1263 ------------------------------------------------------------------------- */
1264 
memory_usage()1265 double RCB::memory_usage()
1266 {
1267   double bytes = 0;
1268   if (irregular) bytes += irregular->memory_usage();
1269   return bytes;
1270 }
1271 
1272 // -----------------------------------------------------------------------
1273 // DEBUG methods
1274 // -----------------------------------------------------------------------
1275 /*
1276 // consistency checks on RCB results
1277 
1278 void RCB::check()
1279 {
1280   int i,iflag,total1,total2;
1281   double weight,wtmax,wtmin,wtone,tolerance;
1282 
1283   // check that total # of dots remained the same
1284 
1285   MPI_Allreduce(&ndotorig,&total1,1,MPI_INT,MPI_SUM,world);
1286   MPI_Allreduce(&ndot,&total2,1,MPI_INT,MPI_SUM,world);
1287   if (total1 != total2) {
1288     if (me == 0)
1289       printf("ERROR: Points before RCB = %d, Points after RCB = %d\n",
1290              total1,total2);
1291   }
1292 
1293   // check that result is load-balanced within log2(P)*max-wt
1294 
1295   weight = wtone = 0.0;
1296   for (i = 0; i < ndot; i++) {
1297     weight += dots[i].wt;
1298     if (dots[i].wt > wtone) wtone = dots[i].wt;
1299   }
1300 
1301   MPI_Allreduce(&weight,&wtmin,1,MPI_DOUBLE,MPI_MIN,world);
1302   MPI_Allreduce(&weight,&wtmax,1,MPI_DOUBLE,MPI_MAX,world);
1303   MPI_Allreduce(&wtone,&tolerance,1,MPI_DOUBLE,MPI_MAX,world);
1304 
1305   // i = smallest power-of-2 >= nprocs
1306   // tolerance = largest-single-weight*log2(nprocs)
1307 
1308   for (i = 0; (nprocs >> i) != 0; i++);
1309   tolerance = tolerance * i * (1.0 + TINY);
1310 
1311   if (wtmax - wtmin > tolerance) {
1312     if (me == 0)
1313       printf("ERROR: Load-imbalance > tolerance of %g\n",tolerance);
1314     MPI_Barrier(world);
1315     if (weight == wtmin) printf("  Proc %d has weight = %g\n",me,weight);
1316     if (weight == wtmax) printf("  Proc %d has weight = %g\n",me,weight);
1317   }
1318 
1319   MPI_Barrier(world);
1320 
1321   // check that final set of points is inside RCB box of each proc
1322 
1323   iflag = 0;
1324   for (i = 0; i < ndot; i++) {
1325     if (dots[i].x[0] < lo[0] || dots[i].x[0] > hi[0] ||
1326         dots[i].x[1] < lo[1] || dots[i].x[1] > hi[1] ||
1327         dots[i].x[2] < lo[2] || dots[i].x[2] > hi[2])
1328       iflag++;
1329   }
1330   if (iflag > 0)
1331     printf("ERROR: %d points are out-of-box on proc %d\n",iflag,me);
1332 }
1333 
1334 // stats for RCB decomposition
1335 
1336 void RCB::stats(int flag)
1337 {
1338   int i,iflag,sum,min,max;
1339   double ave,rsum,rmin,rmax;
1340   double weight,wttot,wtmin,wtmax;
1341 
1342   if (me == 0) printf("RCB Statistics:\n");
1343 
1344   // distribution info
1345 
1346   for (i = 0, weight = 0.0; i < ndot; i++) weight += dots[i].wt;
1347   MPI_Allreduce(&weight,&wttot,1,MPI_DOUBLE,MPI_SUM,world);
1348   MPI_Allreduce(&weight,&wtmin,1,MPI_DOUBLE,MPI_MIN,world);
1349   MPI_Allreduce(&weight,&wtmax,1,MPI_DOUBLE,MPI_MAX,world);
1350 
1351   if (me == 0) {
1352     printf(" Total weight of dots = %g\n",wttot);
1353     printf(" Weight on each proc: ave = %g, max = %g, min = %g\n",
1354            wttot/nprocs,wtmax,wtmin);
1355   }
1356   if (flag) {
1357     MPI_Barrier(world);
1358     printf("    Proc %d has weight = %g\n",me,weight);
1359   }
1360 
1361   for (i = 0, weight = 0.0; i < ndot; i++)
1362     if (dots[i].wt > weight) weight = dots[i].wt;
1363   MPI_Allreduce(&weight,&wtmax,1,MPI_DOUBLE,MPI_MAX,world);
1364 
1365   if (me == 0) printf(" Maximum weight of single dot = %g\n",wtmax);
1366   if (flag) {
1367     MPI_Barrier(world);
1368     printf("    Proc %d max weight = %g\n",me,weight);
1369   }
1370 
1371   // counter info
1372 
1373   MPI_Allreduce(&counters[0],&sum,1,MPI_INT,MPI_SUM,world);
1374   MPI_Allreduce(&counters[0],&min,1,MPI_INT,MPI_MIN,world);
1375   MPI_Allreduce(&counters[0],&max,1,MPI_INT,MPI_MAX,world);
1376   ave = ((double) sum)/nprocs;
1377   if (me == 0)
1378     printf(" Median iter: ave = %g, min = %d, max = %d\n",ave,min,max);
1379   if (flag) {
1380     MPI_Barrier(world);
1381     printf("    Proc %d median count = %d\n",me,counters[0]);
1382   }
1383 
1384   MPI_Allreduce(&counters[1],&sum,1,MPI_INT,MPI_SUM,world);
1385   MPI_Allreduce(&counters[1],&min,1,MPI_INT,MPI_MIN,world);
1386   MPI_Allreduce(&counters[1],&max,1,MPI_INT,MPI_MAX,world);
1387   ave = ((double) sum)/nprocs;
1388   if (me == 0)
1389     printf(" Send count: ave = %g, min = %d, max = %d\n",ave,min,max);
1390   if (flag) {
1391     MPI_Barrier(world);
1392     printf("    Proc %d send count = %d\n",me,counters[1]);
1393   }
1394 
1395   MPI_Allreduce(&counters[2],&sum,1,MPI_INT,MPI_SUM,world);
1396   MPI_Allreduce(&counters[2],&min,1,MPI_INT,MPI_MIN,world);
1397   MPI_Allreduce(&counters[2],&max,1,MPI_INT,MPI_MAX,world);
1398   ave = ((double) sum)/nprocs;
1399   if (me == 0)
1400     printf(" Recv count: ave = %g, min = %d, max = %d\n",ave,min,max);
1401   if (flag) {
1402     MPI_Barrier(world);
1403     printf("    Proc %d recv count = %d\n",me,counters[2]);
1404   }
1405 
1406   MPI_Allreduce(&counters[3],&sum,1,MPI_INT,MPI_SUM,world);
1407   MPI_Allreduce(&counters[3],&min,1,MPI_INT,MPI_MIN,world);
1408   MPI_Allreduce(&counters[3],&max,1,MPI_INT,MPI_MAX,world);
1409   ave = ((double) sum)/nprocs;
1410   if (me == 0)
1411     printf(" Max dots: ave = %g, min = %d, max = %d\n",ave,min,max);
1412   if (flag) {
1413     MPI_Barrier(world);
1414     printf("    Proc %d max dots = %d\n",me,counters[3]);
1415   }
1416 
1417   MPI_Allreduce(&counters[4],&sum,1,MPI_INT,MPI_SUM,world);
1418   MPI_Allreduce(&counters[4],&min,1,MPI_INT,MPI_MIN,world);
1419   MPI_Allreduce(&counters[4],&max,1,MPI_INT,MPI_MAX,world);
1420   ave = ((double) sum)/nprocs;
1421   if (me == 0)
1422     printf(" Max memory: ave = %g, min = %d, max = %d\n",ave,min,max);
1423   if (flag) {
1424     MPI_Barrier(world);
1425     printf("    Proc %d max memory = %d\n",me,counters[4]);
1426   }
1427 
1428   if (reuse) {
1429     MPI_Allreduce(&counters[5],&sum,1,MPI_INT,MPI_SUM,world);
1430     MPI_Allreduce(&counters[5],&min,1,MPI_INT,MPI_MIN,world);
1431     MPI_Allreduce(&counters[5],&max,1,MPI_INT,MPI_MAX,world);
1432     ave = ((double) sum)/nprocs;
1433     if (me == 0)
1434       printf(" # of Reuse: ave = %g, min = %d, max = %d\n",ave,min,max);
1435     if (flag) {
1436       MPI_Barrier(world);
1437       printf("    Proc %d # of Reuse = %d\n",me,counters[5]);
1438     }
1439   }
1440 
1441   MPI_Allreduce(&counters[6],&sum,1,MPI_INT,MPI_SUM,world);
1442   MPI_Allreduce(&counters[6],&min,1,MPI_INT,MPI_MIN,world);
1443   MPI_Allreduce(&counters[6],&max,1,MPI_INT,MPI_MAX,world);
1444   ave = ((double) sum)/nprocs;
1445   if (me == 0)
1446     printf(" # of OverAlloc: ave = %g, min = %d, max = %d\n",ave,min,max);
1447   if (flag) {
1448     MPI_Barrier(world);
1449     printf("    Proc %d # of OverAlloc = %d\n",me,counters[6]);
1450   }
1451 
1452   // RCB boxes for each proc
1453 
1454   if (flag) {
1455     if (me == 0) printf(" RCB sub-domain boxes:\n");
1456     for (i = 0; i < 3; i++) {
1457       MPI_Barrier(world);
1458       if (me == 0) printf("    Dimension %d\n",i+1);
1459       MPI_Barrier(world);
1460       printf("      Proc = %d: Box = %g %g\n",me,lo[i],hi[i]);
1461     }
1462   }
1463 }
1464 */
1465