1 // clang-format off
2 /* ----------------------------------------------------------------------
3 LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
4 https://www.lammps.org/, Sandia National Laboratories
5 Steve Plimpton, sjplimp@sandia.gov
6
7 Copyright (2003) Sandia Corporation. Under the terms of Contract
8 DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
9 certain rights in this software. This software is distributed under
10 the GNU General Public License.
11
12 See the README file in the top-level LAMMPS directory.
13 ------------------------------------------------------------------------- */
14
15 #include "rcb.h"
16
17 #include "irregular.h"
18 #include "memory.h"
19
20 #include <cstring>
21
22 using namespace LAMMPS_NS;
23
24 #define MYHUGE 1.0e30
25 #define TINY 1.0e-6
26 #define DELTA 16384
27
28 // prototypes for non-class functions
29
30 void box_merge(void *, void *, int *, MPI_Datatype *);
31 void median_merge(void *, void *, int *, MPI_Datatype *);
32
33 // NOTE: if want to have reuse flag, need to sum Tree across procs
34
35 /* ---------------------------------------------------------------------- */
36
RCB(LAMMPS * lmp)37 RCB::RCB(LAMMPS *lmp) : Pointers(lmp)
38 {
39 MPI_Comm_rank(world,&me);
40 MPI_Comm_size(world,&nprocs);
41
42 ndot = maxdot = 0;
43 dots = nullptr;
44
45 nlist = maxlist = 0;
46 dotlist = dotmark = dotmark_select = nullptr;
47
48 maxbuf = 0;
49 buf = nullptr;
50
51 maxrecv = maxsend = 0;
52 recvproc = recvindex = sendproc = sendindex = nullptr;
53
54 tree = nullptr;
55 irregular = nullptr;
56
57 // create MPI data and function types for box and median AllReduce ops
58
59 MPI_Type_contiguous(6,MPI_DOUBLE,&box_type);
60 MPI_Type_commit(&box_type);
61 MPI_Type_contiguous(sizeof(Median),MPI_CHAR,&med_type);
62 MPI_Type_commit(&med_type);
63
64 MPI_Op_create(box_merge,1,&box_op);
65 MPI_Op_create(median_merge,1,&med_op);
66
67 reuse = 0;
68 }
69
70 /* ---------------------------------------------------------------------- */
71
~RCB()72 RCB::~RCB()
73 {
74 memory->sfree(dots);
75 memory->destroy(dotlist);
76 memory->destroy(dotmark);
77 memory->destroy(dotmark_select);
78 memory->sfree(buf);
79
80 memory->destroy(recvproc);
81 memory->destroy(recvindex);
82 memory->destroy(sendproc);
83 memory->destroy(sendindex);
84
85 memory->sfree(tree);
86 delete irregular;
87
88 MPI_Type_free(&med_type);
89 MPI_Type_free(&box_type);
90 MPI_Op_free(&box_op);
91 MPI_Op_free(&med_op);
92 }
93
94 /* ----------------------------------------------------------------------
95 perform RCB balancing of N particles at coords X in bounding box LO/HI
96 NEW version: each RCB cut is tested in all dimensions
97 dimeension that produces 2 boxes with largest min size is selected
98 this is to prevent very narrow boxes from being produced
99 if wt = nullptr, ignore per-particle weights
100 if wt defined, per-particle weights > 0.0
101 dimension = 2 or 3
102 as documented in rcb.h:
103 sets noriginal,nfinal,nkeep,recvproc,recvindex,lo,hi
104 all proc particles will be inside or on surface of 3-d box
105 defined by final lo/hi
106 // NOTE: worry about re-use of data structs for fix balance?
107 ------------------------------------------------------------------------- */
108
compute(int dimension,int n,double ** x,double * wt,double * bboxlo,double * bboxhi)109 void RCB::compute(int dimension, int n, double **x, double *wt,
110 double *bboxlo, double *bboxhi)
111 {
112 int i,j,k;
113 int keep,outgoing,incoming,incoming2;
114 int dim,markactive;
115 int indexlo,indexhi;
116 int first_iteration,breakflag;
117 double wttot,wtlo,wthi,wtsum,wtok,wtupto,wtmax;
118 double targetlo,targethi;
119 double valuemin,valuemax,valuehalf,valuehalf_select,smaller;
120 double tolerance;
121 MPI_Comm comm,comm_half;
122 MPI_Request request,request2;
123 Median med,medme;
124
125 // create list of my Dots
126
127 ndot = nkeep = noriginal = n;
128
129 if (ndot > maxdot) {
130 maxdot = ndot;
131 memory->sfree(dots);
132 dots = (Dot *) memory->smalloc(ndot*sizeof(Dot),"RCB:dots");
133 }
134
135 for (i = 0; i < ndot; i++) {
136 dots[i].x[0] = x[i][0];
137 dots[i].x[1] = x[i][1];
138 dots[i].x[2] = x[i][2];
139 dots[i].proc = me;
140 dots[i].index = i;
141 }
142
143 if (wt)
144 for (i = 0; i < ndot; i++) dots[i].wt = wt[i];
145 else
146 for (i = 0; i < ndot; i++) dots[i].wt = 1.0;
147
148 // initial bounding box = simulation box
149 // includes periodic or shrink-wrapped boundaries
150
151 lo = bbox.lo;
152 hi = bbox.hi;
153
154 lo[0] = bboxlo[0];
155 lo[1] = bboxlo[1];
156 lo[2] = bboxlo[2];
157 hi[0] = bboxhi[0];
158 hi[1] = bboxhi[1];
159 hi[2] = bboxhi[2];
160
161 cut = 0.0;
162 cutdim = -1;
163
164 // initialize counters
165
166 counters[0] = 0;
167 counters[1] = 0;
168 counters[2] = 0;
169 counters[3] = ndot;
170 counters[4] = maxdot;
171 counters[5] = 0;
172 counters[6] = 0;
173
174 // create communicator for use in recursion
175
176 MPI_Comm_dup(world,&comm);
177
178 // recurse until partition is a single proc = me
179 // proclower,procupper = lower,upper procs in partition
180 // procmid = 1st proc in upper half of partition
181
182 int procpartner,procpartner2;
183
184 int procmid;
185 int proclower = 0;
186 int procupper = nprocs - 1;
187
188 while (proclower != procupper) {
189
190 // if odd # of procs, lower partition gets extra one
191
192 procmid = proclower + (procupper - proclower) / 2 + 1;
193
194 // determine communication partner(s)
195 // readnumber = # of proc partners to read from
196
197 if (me < procmid)
198 procpartner = me + (procmid - proclower);
199 else
200 procpartner = me - (procmid - proclower);
201
202 int readnumber = 1;
203 if (procpartner > procupper) {
204 readnumber = 0;
205 procpartner--;
206 }
207 if (me == procupper && procpartner != procmid - 1) {
208 readnumber = 2;
209 procpartner2 = procpartner + 1;
210 }
211
212 // wttot = summed weight of entire partition
213 // search tolerance = largest single weight (plus epsilon)
214 // targetlo = desired weight in lower half of partition
215 // targethi = desired weight in upper half of partition
216
217 wtmax = wtsum = 0.0;
218
219 if (wt) {
220 for (i = 0; i < ndot; i++) {
221 wtsum += dots[i].wt;
222 if (dots[i].wt > wtmax) wtmax = dots[i].wt;
223 }
224 } else {
225 for (i = 0; i < ndot; i++) wtsum += dots[i].wt;
226 wtmax = 1.0;
227 }
228
229 MPI_Allreduce(&wtsum,&wttot,1,MPI_DOUBLE,MPI_SUM,comm);
230 if (wt) MPI_Allreduce(&wtmax,&tolerance,1,MPI_DOUBLE,MPI_MAX,comm);
231 else tolerance = 1.0;
232
233 tolerance *= 1.0 + TINY;
234 targetlo = wttot * (procmid - proclower) / (procupper + 1 - proclower);
235 targethi = wttot - targetlo;
236
237 // attempt a cut in each dimension
238 // each cut produces 2 boxes, each with a reduced box length in that dim
239 // smaller = the smaller of the 2 reduced box lengths in that dimension
240 // choose to cut in dimension which produces largest smaller value
241 // this should induce final proc sub-boxes to be as cube-ish as possible
242 // dim_select = selected cut dimension
243 // valuehalf_select = valuehalf in that dimension
244 // dotmark_select = dot markings in that dimension
245 // initialize largest = -1.0 to insure a cut in some dim is accepted
246 // e.g. if current recursed box is size 0 in all dims
247
248 int dim_select = -1;
249 double largest = -1.0;
250
251 for (dim = 0; dim < dimension; dim++) {
252
253 // create active list and mark array for dots
254 // initialize active list to all dots
255
256 if (ndot > maxlist) {
257 memory->destroy(dotlist);
258 memory->destroy(dotmark);
259 memory->destroy(dotmark_select);
260 maxlist = maxdot;
261 memory->create(dotlist,maxlist,"RCB:dotlist");
262 memory->create(dotmark,maxlist,"RCB:dotmark");
263 memory->create(dotmark_select,maxlist,"RCB:dotmark_select");
264 }
265
266 nlist = ndot;
267 for (i = 0; i < nlist; i++) dotlist[i] = i;
268
269 // median iteration
270 // zoom in on bisector until correct # of dots in each half of partition
271 // as each iteration of median-loop begins, require:
272 // all non-active dots are marked with 0/1 in dotmark
273 // valuemin <= every active dot <= valuemax
274 // wtlo, wthi = total wt of non-active dots
275 // when leave median-loop, require only:
276 // valuehalf = correct cut position
277 // all dots <= valuehalf are marked with 0 in dotmark
278 // all dots >= valuehalf are marked with 1 in dotmark
279 // markactive = which side of cut is active = 0/1
280 // indexlo,indexhi = indices of dot closest to median
281
282 wtlo = wthi = 0.0;
283 valuemin = lo[dim];
284 valuemax = hi[dim];
285 first_iteration = 1;
286 indexlo = indexhi = 0;
287
288 while (1) {
289
290 // choose bisector value
291 // use old value on 1st iteration if old cut dimension is the same
292 // on 2nd option: could push valuehalf towards geometric center
293 // with "1.0-factor" to force overshoot
294
295 if (first_iteration && reuse && dim == tree[procmid].dim) {
296 counters[5]++;
297 valuehalf = tree[procmid].cut;
298 if (valuehalf < valuemin || valuehalf > valuemax)
299 valuehalf = 0.5 * (valuemin + valuemax);
300 } else if (wt)
301 valuehalf = valuemin + (targetlo - wtlo) /
302 (wttot - wtlo - wthi) * (valuemax - valuemin);
303 else
304 valuehalf = 0.5 * (valuemin + valuemax);
305
306 first_iteration = 0;
307
308 // initialize local median data structure
309
310 medme.totallo = medme.totalhi = 0.0;
311 medme.valuelo = -MYHUGE;
312 medme.valuehi = MYHUGE;
313 medme.wtlo = medme.wthi = 0.0;
314 medme.countlo = medme.counthi = 0;
315 medme.proclo = medme.prochi = me;
316
317 // mark all active dots on one side or other of bisector
318 // also set all fields in median data struct
319 // save indices of closest dots on either side
320
321 for (j = 0; j < nlist; j++) {
322 i = dotlist[j];
323 if (dots[i].x[dim] <= valuehalf) { // in lower part
324 medme.totallo += dots[i].wt;
325 dotmark[i] = 0;
326 if (dots[i].x[dim] > medme.valuelo) { // my closest dot
327 medme.valuelo = dots[i].x[dim];
328 medme.wtlo = dots[i].wt;
329 medme.countlo = 1;
330 indexlo = i;
331 } else if (dots[i].x[dim] == medme.valuelo) { // tied for closest
332 medme.wtlo += dots[i].wt;
333 medme.countlo++;
334 }
335 }
336 else { // in upper part
337 medme.totalhi += dots[i].wt;
338 dotmark[i] = 1;
339 if (dots[i].x[dim] < medme.valuehi) { // my closest dot
340 medme.valuehi = dots[i].x[dim];
341 medme.wthi = dots[i].wt;
342 medme.counthi = 1;
343 indexhi = i;
344 } else if (dots[i].x[dim] == medme.valuehi) { // tied for closest
345 medme.wthi += dots[i].wt;
346 medme.counthi++;
347 }
348 }
349 }
350
351 // combine median data struct across current subset of procs
352
353 counters[0]++;
354 MPI_Allreduce(&medme,&med,1,med_type,med_op,comm);
355
356 // test median guess for convergence
357 // move additional dots that are next to cut across it
358
359 if (wtlo + med.totallo < targetlo) { // lower half TOO SMALL
360
361 wtlo += med.totallo;
362 valuehalf = med.valuehi;
363
364 if (med.counthi == 1) { // only one dot to move
365 if (wtlo + med.wthi < targetlo) { // move it, keep iterating
366 if (me == med.prochi) dotmark[indexhi] = 0;
367 }
368 else { // only move if beneficial
369 if (wtlo + med.wthi - targetlo < targetlo - wtlo)
370 if (me == med.prochi) dotmark[indexhi] = 0;
371 break; // all done
372 }
373 }
374 else { // multiple dots to move
375 breakflag = 0;
376 wtok = 0.0;
377 if (medme.valuehi == med.valuehi) wtok = medme.wthi;
378 if (wtlo + med.wthi >= targetlo) { // all done
379 MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm);
380 wtmax = targetlo - wtlo;
381 if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
382 breakflag = 1;
383 } // wtok = most I can move
384 for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) {
385 i = dotlist[j];
386 if (dots[i].x[dim] == med.valuehi) { // only move if better
387 if (wtsum + dots[i].wt - wtok < wtok - wtsum)
388 dotmark[i] = 0;
389 wtsum += dots[i].wt;
390 }
391 }
392 if (breakflag) break; // done if moved enough
393 }
394
395 wtlo += med.wthi;
396 if (targetlo-wtlo <= tolerance) break; // close enough
397
398 valuemin = med.valuehi; // iterate again
399 markactive = 1;
400 }
401
402 else if (wthi + med.totalhi < targethi) { // upper half TOO SMALL
403
404 wthi += med.totalhi;
405 valuehalf = med.valuelo;
406
407 if (med.countlo == 1) { // only one dot to move
408 if (wthi + med.wtlo < targethi) { // move it, keep iterating
409 if (me == med.proclo) dotmark[indexlo] = 1;
410 }
411 else { // only move if beneficial
412 if (wthi + med.wtlo - targethi < targethi - wthi)
413 if (me == med.proclo) dotmark[indexlo] = 1;
414 break; // all done
415 }
416 }
417 else { // multiple dots to move
418 breakflag = 0;
419 wtok = 0.0;
420 if (medme.valuelo == med.valuelo) wtok = medme.wtlo;
421 if (wthi + med.wtlo >= targethi) { // all done
422 MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm);
423 wtmax = targethi - wthi;
424 if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
425 breakflag = 1;
426 } // wtok = most I can move
427 for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) {
428 i = dotlist[j];
429 if (dots[i].x[dim] == med.valuelo) { // only move if better
430 if (wtsum + dots[i].wt - wtok < wtok - wtsum)
431 dotmark[i] = 1;
432 wtsum += dots[i].wt;
433 }
434 }
435 if (breakflag) break; // done if moved enough
436 }
437
438 wthi += med.wtlo;
439 if (targethi-wthi <= tolerance) break; // close enough
440
441 valuemax = med.valuelo; // iterate again
442 markactive = 0;
443 }
444
445 else // Goldilocks result: both partitions just right
446 break;
447
448 // shrink the active list
449
450 k = 0;
451 for (j = 0; j < nlist; j++) {
452 i = dotlist[j];
453 if (dotmark[i] == markactive) dotlist[k++] = i;
454 }
455 nlist = k;
456 }
457
458 // cut produces 2 sub-boxes with reduced size in dim
459 // compare smaller of the 2 sizes to previous dims
460 // keep dim that has the largest smaller
461
462 smaller = MIN(valuehalf-lo[dim],hi[dim]-valuehalf);
463 if (smaller > largest) {
464 largest = smaller;
465 dim_select = dim;
466 valuehalf_select = valuehalf;
467 if (ndot > 0) memcpy(dotmark_select,dotmark,ndot*sizeof(int));
468 }
469 }
470
471 // copy results for best dim cut into dim,valuehalf,dotmark
472
473 dim = dim_select;
474 valuehalf = valuehalf_select;
475 if (ndot > 0) memcpy(dotmark,dotmark_select,ndot*sizeof(int));
476
477 // special case for zero box width
478 // can occur when all dots are on corner vertices of this sub-box
479 // split box on longest dimension
480 // reset dotmark for that cut
481
482 if (largest == 0.0) {
483 dim = 0;
484 if (hi[1]-lo[1] > hi[0]-lo[0]) dim = 1;
485 if (dimension == 3 && hi[2]-lo[2] > hi[dim]-lo[dim]) dim = 2;
486 valuehalf = 0.5* (lo[dim] + hi[dim]);
487
488 for (j = 0; j < nlist; j++) {
489 i = dotlist[j];
490 if (dots[i].x[dim] <= valuehalf) dotmark[i] = 0;
491 else dotmark[i] = 1;
492 }
493 }
494
495 // found median
496 // store cut info only if I am procmid
497
498 if (me == procmid) {
499 cut = valuehalf;
500 cutdim = dim;
501 }
502
503 // use cut to shrink my RCB bounding box
504
505 if (me < procmid) hi[dim] = valuehalf;
506 else lo[dim] = valuehalf;
507
508 // outgoing = number of dots to ship to partner
509 // nkeep = number of dots that have never migrated
510
511 markactive = (me < procpartner);
512 for (i = 0, keep = 0, outgoing = 0; i < ndot; i++)
513 if (dotmark[i] == markactive) outgoing++;
514 else if (i < nkeep) keep++;
515 nkeep = keep;
516
517 // alert partner how many dots I'll send, read how many I'll recv
518
519 MPI_Send(&outgoing,1,MPI_INT,procpartner,0,world);
520 incoming = 0;
521 if (readnumber) {
522 MPI_Recv(&incoming,1,MPI_INT,procpartner,0,world,MPI_STATUS_IGNORE);
523 if (readnumber == 2) {
524 MPI_Recv(&incoming2,1,MPI_INT,procpartner2,0,world,MPI_STATUS_IGNORE);
525 incoming += incoming2;
526 }
527 }
528
529 // check if need to alloc more space
530
531 int ndotnew = ndot - outgoing + incoming;
532 if (ndotnew > maxdot) {
533 while (maxdot < ndotnew) maxdot += DELTA;
534 dots = (Dot *) memory->srealloc(dots,maxdot*sizeof(Dot),"RCB::dots");
535 counters[6]++;
536 }
537
538 counters[1] += outgoing;
539 counters[2] += incoming;
540 if (ndotnew > counters[3]) counters[3] = ndotnew;
541 if (maxdot > counters[4]) counters[4] = maxdot;
542
543 // malloc comm send buffer
544
545 if (outgoing > maxbuf) {
546 memory->sfree(buf);
547 maxbuf = outgoing;
548 buf = (Dot *) memory->smalloc(maxbuf*sizeof(Dot),"RCB:buf");
549 }
550
551 // fill buffer with dots that are marked for sending
552 // pack down the unmarked ones
553
554 keep = outgoing = 0;
555 for (i = 0; i < ndot; i++) {
556 if (dotmark[i] == markactive)
557 memcpy(&buf[outgoing++],&dots[i],sizeof(Dot));
558 else
559 memmove(&dots[keep++],&dots[i],sizeof(Dot));
560 }
561
562 // post receives for dots
563
564 if (readnumber > 0) {
565 MPI_Irecv(&dots[keep],incoming*sizeof(Dot),MPI_CHAR,
566 procpartner,1,world,&request);
567 if (readnumber == 2) {
568 keep += incoming - incoming2;
569 MPI_Irecv(&dots[keep],incoming2*sizeof(Dot),MPI_CHAR,
570 procpartner2,1,world,&request2);
571 }
572 }
573
574 // handshake before sending dots to insure recvs have been posted
575
576 if (readnumber > 0) {
577 MPI_Send(nullptr,0,MPI_INT,procpartner,0,world);
578 if (readnumber == 2) MPI_Send(nullptr,0,MPI_INT,procpartner2,0,world);
579 }
580 MPI_Recv(nullptr,0,MPI_INT,procpartner,0,world,MPI_STATUS_IGNORE);
581
582 // send dots to partner
583
584 MPI_Rsend(buf,outgoing*sizeof(Dot),MPI_CHAR,procpartner,1,world);
585
586 // wait until all dots are received
587
588 if (readnumber > 0) {
589 MPI_Wait(&request,MPI_STATUS_IGNORE);
590 if (readnumber == 2) MPI_Wait(&request2,MPI_STATUS_IGNORE);
591 }
592
593 ndot = ndotnew;
594
595 // cut partition in half, create new communicators of 1/2 size
596
597 int split;
598 if (me < procmid) {
599 procupper = procmid - 1;
600 split = 0;
601 } else {
602 proclower = procmid;
603 split = 1;
604 }
605
606 MPI_Comm_split(comm,split,me,&comm_half);
607 MPI_Comm_free(&comm);
608 comm = comm_half;
609 }
610
611 // clean up
612
613 MPI_Comm_free(&comm);
614
615 // set public variables with results of rebalance
616
617 nfinal = ndot;
618
619 if (nfinal > maxrecv) {
620 memory->destroy(recvproc);
621 memory->destroy(recvindex);
622 maxrecv = nfinal;
623 memory->create(recvproc,maxrecv,"RCB:recvproc");
624 memory->create(recvindex,maxrecv,"RCB:recvindex");
625 }
626
627 for (i = 0; i < nfinal; i++) {
628 recvproc[i] = dots[i].proc;
629 recvindex[i] = dots[i].index;
630 }
631 }
632
633 /* ----------------------------------------------------------------------
634 perform RCB balancing of N particles at coords X in bounding box LO/HI
635 OLD version: each RCB cut is made in longest dimension of sub-box
636 if wt = nullptr, ignore per-particle weights
637 if wt defined, per-particle weights > 0.0
638 dimension = 2 or 3
639 as documented in rcb.h:
640 sets noriginal,nfinal,nkeep,recvproc,recvindex,lo,hi
641 all proc particles will be inside or on surface of 3-d box
642 defined by final lo/hi
643 // NOTE: worry about re-use of data structs for fix balance?
644 ------------------------------------------------------------------------- */
645
compute_old(int dimension,int n,double ** x,double * wt,double * bboxlo,double * bboxhi)646 void RCB::compute_old(int dimension, int n, double **x, double *wt,
647 double *bboxlo, double *bboxhi)
648 {
649 int i,j,k;
650 int keep,outgoing,incoming,incoming2;
651 int dim,markactive;
652 int indexlo,indexhi;
653 int first_iteration,breakflag;
654 double wttot,wtlo,wthi,wtsum,wtok,wtupto,wtmax;
655 double targetlo,targethi;
656 double valuemin,valuemax,valuehalf;
657 double tolerance;
658 MPI_Comm comm,comm_half;
659 MPI_Request request,request2;
660 Median med,medme;
661
662 // create list of my Dots
663
664 ndot = nkeep = noriginal = n;
665
666 if (ndot > maxdot) {
667 maxdot = ndot;
668 memory->sfree(dots);
669 dots = (Dot *) memory->smalloc(ndot*sizeof(Dot),"RCB:dots");
670 }
671
672 for (i = 0; i < ndot; i++) {
673 dots[i].x[0] = x[i][0];
674 dots[i].x[1] = x[i][1];
675 dots[i].x[2] = x[i][2];
676 dots[i].proc = me;
677 dots[i].index = i;
678 }
679
680 if (wt)
681 for (i = 0; i < ndot; i++) dots[i].wt = wt[i];
682 else
683 for (i = 0; i < ndot; i++) dots[i].wt = 1.0;
684
685 // initial bounding box = simulation box
686 // includes periodic or shrink-wrapped boundaries
687
688 lo = bbox.lo;
689 hi = bbox.hi;
690
691 lo[0] = bboxlo[0];
692 lo[1] = bboxlo[1];
693 lo[2] = bboxlo[2];
694 hi[0] = bboxhi[0];
695 hi[1] = bboxhi[1];
696 hi[2] = bboxhi[2];
697
698 cut = 0.0;
699 cutdim = -1;
700
701 // initialize counters
702
703 counters[0] = 0;
704 counters[1] = 0;
705 counters[2] = 0;
706 counters[3] = ndot;
707 counters[4] = maxdot;
708 counters[5] = 0;
709 counters[6] = 0;
710
711 // create communicator for use in recursion
712
713 MPI_Comm_dup(world,&comm);
714
715 // recurse until partition is a single proc = me
716 // proclower,procupper = lower,upper procs in partition
717 // procmid = 1st proc in upper half of partition
718
719 int procpartner,procpartner2;
720
721 int procmid;
722 int proclower = 0;
723 int procupper = nprocs - 1;
724
725 while (proclower != procupper) {
726
727 // if odd # of procs, lower partition gets extra one
728
729 procmid = proclower + (procupper - proclower) / 2 + 1;
730
731 // determine communication partner(s)
732 // readnumber = # of proc partners to read from
733
734 if (me < procmid)
735 procpartner = me + (procmid - proclower);
736 else
737 procpartner = me - (procmid - proclower);
738
739 int readnumber = 1;
740 if (procpartner > procupper) {
741 readnumber = 0;
742 procpartner--;
743 }
744 if (me == procupper && procpartner != procmid - 1) {
745 readnumber = 2;
746 procpartner2 = procpartner + 1;
747 }
748
749 // wttot = summed weight of entire partition
750 // search tolerance = largest single weight (plus epsilon)
751 // targetlo = desired weight in lower half of partition
752 // targethi = desired weight in upper half of partition
753
754 wtmax = wtsum = 0.0;
755
756 if (wt) {
757 for (i = 0; i < ndot; i++) {
758 wtsum += dots[i].wt;
759 if (dots[i].wt > wtmax) wtmax = dots[i].wt;
760 }
761 } else {
762 for (i = 0; i < ndot; i++) wtsum += dots[i].wt;
763 wtmax = 1.0;
764 }
765
766 MPI_Allreduce(&wtsum,&wttot,1,MPI_DOUBLE,MPI_SUM,comm);
767 if (wt) MPI_Allreduce(&wtmax,&tolerance,1,MPI_DOUBLE,MPI_MAX,comm);
768 else tolerance = 1.0;
769
770 tolerance *= 1.0 + TINY;
771 targetlo = wttot * (procmid - proclower) / (procupper + 1 - proclower);
772 targethi = wttot - targetlo;
773
774 // dim = dimension to bisect on
775 // do not allow choice of z dimension for 2d system
776
777 dim = 0;
778 if (hi[1]-lo[1] > hi[0]-lo[0]) dim = 1;
779 if (dimension == 3) {
780 if (dim == 0 && hi[2]-lo[2] > hi[0]-lo[0]) dim = 2;
781 if (dim == 1 && hi[2]-lo[2] > hi[1]-lo[1]) dim = 2;
782 }
783
784 // create active list and mark array for dots
785 // initialize active list to all dots
786
787 if (ndot > maxlist) {
788 memory->destroy(dotlist);
789 memory->destroy(dotmark);
790 maxlist = maxdot;
791 memory->create(dotlist,maxlist,"RCB:dotlist");
792 memory->create(dotmark,maxlist,"RCB:dotmark");
793 }
794
795 nlist = ndot;
796 for (i = 0; i < nlist; i++) dotlist[i] = i;
797
798 // median iteration
799 // zoom in on bisector until correct # of dots in each half of partition
800 // as each iteration of median-loop begins, require:
801 // all non-active dots are marked with 0/1 in dotmark
802 // valuemin <= every active dot <= valuemax
803 // wtlo, wthi = total wt of non-active dots
804 // when leave median-loop, require only:
805 // valuehalf = correct cut position
806 // all dots <= valuehalf are marked with 0 in dotmark
807 // all dots >= valuehalf are marked with 1 in dotmark
808 // markactive = which side of cut is active = 0/1
809 // indexlo,indexhi = indices of dot closest to median
810
811 wtlo = wthi = 0.0;
812 valuemin = lo[dim];
813 valuemax = hi[dim];
814 first_iteration = 1;
815 indexlo = indexhi = 0;
816
817 while (1) {
818
819 // choose bisector value
820 // use old value on 1st iteration if old cut dimension is the same
821 // on 2nd option: could push valuehalf towards geometric center
822 // with "1.0-factor" to force overshoot
823
824 if (first_iteration && reuse && dim == tree[procmid].dim) {
825 counters[5]++;
826 valuehalf = tree[procmid].cut;
827 if (valuehalf < valuemin || valuehalf > valuemax)
828 valuehalf = 0.5 * (valuemin + valuemax);
829 } else if (wt)
830 valuehalf = valuemin + (targetlo - wtlo) /
831 (wttot - wtlo - wthi) * (valuemax - valuemin);
832 else
833 valuehalf = 0.5 * (valuemin + valuemax);
834
835 first_iteration = 0;
836
837 // initialize local median data structure
838
839 medme.totallo = medme.totalhi = 0.0;
840 medme.valuelo = -MYHUGE;
841 medme.valuehi = MYHUGE;
842 medme.wtlo = medme.wthi = 0.0;
843 medme.countlo = medme.counthi = 0;
844 medme.proclo = medme.prochi = me;
845
846 // mark all active dots on one side or other of bisector
847 // also set all fields in median data struct
848 // save indices of closest dots on either side
849
850 for (j = 0; j < nlist; j++) {
851 i = dotlist[j];
852 if (dots[i].x[dim] <= valuehalf) { // in lower part
853 medme.totallo += dots[i].wt;
854 dotmark[i] = 0;
855 if (dots[i].x[dim] > medme.valuelo) { // my closest dot
856 medme.valuelo = dots[i].x[dim];
857 medme.wtlo = dots[i].wt;
858 medme.countlo = 1;
859 indexlo = i;
860 } else if (dots[i].x[dim] == medme.valuelo) { // tied for closest
861 medme.wtlo += dots[i].wt;
862 medme.countlo++;
863 }
864 }
865 else { // in upper part
866 medme.totalhi += dots[i].wt;
867 dotmark[i] = 1;
868 if (dots[i].x[dim] < medme.valuehi) { // my closest dot
869 medme.valuehi = dots[i].x[dim];
870 medme.wthi = dots[i].wt;
871 medme.counthi = 1;
872 indexhi = i;
873 } else if (dots[i].x[dim] == medme.valuehi) { // tied for closest
874 medme.wthi += dots[i].wt;
875 medme.counthi++;
876 }
877 }
878 }
879
880 // combine median data struct across current subset of procs
881
882 counters[0]++;
883 MPI_Allreduce(&medme,&med,1,med_type,med_op,comm);
884
885 // test median guess for convergence
886 // move additional dots that are next to cut across it
887
888 if (wtlo + med.totallo < targetlo) { // lower half TOO SMALL
889
890 wtlo += med.totallo;
891 valuehalf = med.valuehi;
892
893 if (med.counthi == 1) { // only one dot to move
894 if (wtlo + med.wthi < targetlo) { // move it, keep iterating
895 if (me == med.prochi) dotmark[indexhi] = 0;
896 }
897 else { // only move if beneficial
898 if (wtlo + med.wthi - targetlo < targetlo - wtlo)
899 if (me == med.prochi) dotmark[indexhi] = 0;
900 break; // all done
901 }
902 }
903 else { // multiple dots to move
904 breakflag = 0;
905 wtok = 0.0;
906 if (medme.valuehi == med.valuehi) wtok = medme.wthi;
907 if (wtlo + med.wthi >= targetlo) { // all done
908 MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm);
909 wtmax = targetlo - wtlo;
910 if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
911 breakflag = 1;
912 } // wtok = most I can move
913 for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) {
914 i = dotlist[j];
915 if (dots[i].x[dim] == med.valuehi) { // only move if better
916 if (wtsum + dots[i].wt - wtok < wtok - wtsum)
917 dotmark[i] = 0;
918 wtsum += dots[i].wt;
919 }
920 }
921 if (breakflag) break; // done if moved enough
922 }
923
924 wtlo += med.wthi;
925 if (targetlo-wtlo <= tolerance) break; // close enough
926
927 valuemin = med.valuehi; // iterate again
928 markactive = 1;
929 }
930
931 else if (wthi + med.totalhi < targethi) { // upper half TOO SMALL
932
933 wthi += med.totalhi;
934 valuehalf = med.valuelo;
935
936 if (med.countlo == 1) { // only one dot to move
937 if (wthi + med.wtlo < targethi) { // move it, keep iterating
938 if (me == med.proclo) dotmark[indexlo] = 1;
939 }
940 else { // only move if beneficial
941 if (wthi + med.wtlo - targethi < targethi - wthi)
942 if (me == med.proclo) dotmark[indexlo] = 1;
943 break; // all done
944 }
945 }
946 else { // multiple dots to move
947 breakflag = 0;
948 wtok = 0.0;
949 if (medme.valuelo == med.valuelo) wtok = medme.wtlo;
950 if (wthi + med.wtlo >= targethi) { // all done
951 MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm);
952 wtmax = targethi - wthi;
953 if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
954 breakflag = 1;
955 } // wtok = most I can move
956 for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) {
957 i = dotlist[j];
958 if (dots[i].x[dim] == med.valuelo) { // only move if better
959 if (wtsum + dots[i].wt - wtok < wtok - wtsum)
960 dotmark[i] = 1;
961 wtsum += dots[i].wt;
962 }
963 }
964 if (breakflag) break; // done if moved enough
965 }
966
967 wthi += med.wtlo;
968 if (targethi-wthi <= tolerance) break; // close enough
969
970 valuemax = med.valuelo; // iterate again
971 markactive = 0;
972 }
973
974 else // Goldilocks result: both partitions just right
975 break;
976
977 // shrink the active list
978
979 k = 0;
980 for (j = 0; j < nlist; j++) {
981 i = dotlist[j];
982 if (dotmark[i] == markactive) dotlist[k++] = i;
983 }
984 nlist = k;
985 }
986
987 // found median
988 // store cut info only if I am procmid
989
990 if (me == procmid) {
991 cut = valuehalf;
992 cutdim = dim;
993 }
994
995 // use cut to shrink my RCB bounding box
996
997 if (me < procmid) hi[dim] = valuehalf;
998 else lo[dim] = valuehalf;
999
1000 // outgoing = number of dots to ship to partner
1001 // nkeep = number of dots that have never migrated
1002
1003 markactive = (me < procpartner);
1004 for (i = 0, keep = 0, outgoing = 0; i < ndot; i++)
1005 if (dotmark[i] == markactive) outgoing++;
1006 else if (i < nkeep) keep++;
1007 nkeep = keep;
1008
1009 // alert partner how many dots I'll send, read how many I'll recv
1010
1011 MPI_Send(&outgoing,1,MPI_INT,procpartner,0,world);
1012 incoming = 0;
1013 if (readnumber) {
1014 MPI_Recv(&incoming,1,MPI_INT,procpartner,0,world,MPI_STATUS_IGNORE);
1015 if (readnumber == 2) {
1016 MPI_Recv(&incoming2,1,MPI_INT,procpartner2,0,world,MPI_STATUS_IGNORE);
1017 incoming += incoming2;
1018 }
1019 }
1020
1021 // check if need to alloc more space
1022
1023 int ndotnew = ndot - outgoing + incoming;
1024 if (ndotnew > maxdot) {
1025 while (maxdot < ndotnew) maxdot += DELTA;
1026 dots = (Dot *) memory->srealloc(dots,maxdot*sizeof(Dot),"RCB::dots");
1027 counters[6]++;
1028 }
1029
1030 counters[1] += outgoing;
1031 counters[2] += incoming;
1032 if (ndotnew > counters[3]) counters[3] = ndotnew;
1033 if (maxdot > counters[4]) counters[4] = maxdot;
1034
1035 // malloc comm send buffer
1036
1037 if (outgoing > maxbuf) {
1038 memory->sfree(buf);
1039 maxbuf = outgoing;
1040 buf = (Dot *) memory->smalloc(maxbuf*sizeof(Dot),"RCB:buf");
1041 }
1042
1043 // fill buffer with dots that are marked for sending
1044 // pack down the unmarked ones
1045
1046 keep = outgoing = 0;
1047 for (i = 0; i < ndot; i++) {
1048 if (dotmark[i] == markactive)
1049 memcpy(&buf[outgoing++],&dots[i],sizeof(Dot));
1050 else
1051 memmove(&dots[keep++],&dots[i],sizeof(Dot));
1052 }
1053
1054 // post receives for dots
1055
1056 if (readnumber > 0) {
1057 MPI_Irecv(&dots[keep],incoming*sizeof(Dot),MPI_CHAR,
1058 procpartner,1,world,&request);
1059 if (readnumber == 2) {
1060 keep += incoming - incoming2;
1061 MPI_Irecv(&dots[keep],incoming2*sizeof(Dot),MPI_CHAR,
1062 procpartner2,1,world,&request2);
1063 }
1064 }
1065
1066 // handshake before sending dots to insure recvs have been posted
1067
1068 if (readnumber > 0) {
1069 MPI_Send(nullptr,0,MPI_INT,procpartner,0,world);
1070 if (readnumber == 2) MPI_Send(nullptr,0,MPI_INT,procpartner2,0,world);
1071 }
1072 MPI_Recv(nullptr,0,MPI_INT,procpartner,0,world,MPI_STATUS_IGNORE);
1073
1074 // send dots to partner
1075
1076 MPI_Rsend(buf,outgoing*sizeof(Dot),MPI_CHAR,procpartner,1,world);
1077
1078 // wait until all dots are received
1079
1080 if (readnumber > 0) {
1081 MPI_Wait(&request,MPI_STATUS_IGNORE);
1082 if (readnumber == 2) MPI_Wait(&request2,MPI_STATUS_IGNORE);
1083 }
1084
1085 ndot = ndotnew;
1086
1087 // cut partition in half, create new communicators of 1/2 size
1088
1089 int split;
1090 if (me < procmid) {
1091 procupper = procmid - 1;
1092 split = 0;
1093 } else {
1094 proclower = procmid;
1095 split = 1;
1096 }
1097
1098 MPI_Comm_split(comm,split,me,&comm_half);
1099 MPI_Comm_free(&comm);
1100 comm = comm_half;
1101 }
1102
1103 // clean up
1104
1105 MPI_Comm_free(&comm);
1106
1107 // set public variables with results of rebalance
1108
1109 nfinal = ndot;
1110
1111 if (nfinal > maxrecv) {
1112 memory->destroy(recvproc);
1113 memory->destroy(recvindex);
1114 maxrecv = nfinal;
1115 memory->create(recvproc,maxrecv,"RCB:recvproc");
1116 memory->create(recvindex,maxrecv,"RCB:recvindex");
1117 }
1118
1119 for (i = 0; i < nfinal; i++) {
1120 recvproc[i] = dots[i].proc;
1121 recvindex[i] = dots[i].index;
1122 }
1123 }
1124
1125 /* ----------------------------------------------------------------------
1126 custom MPI reduce operation
1127 merge of each component of an RCB bounding box
1128 ------------------------------------------------------------------------- */
1129
box_merge(void * in,void * inout,int *,MPI_Datatype *)1130 void box_merge(void *in, void *inout, int * /*len*/, MPI_Datatype * /*dptr*/)
1131
1132 {
1133 RCB::BBox *box1 = (RCB::BBox *) in;
1134 RCB::BBox *box2 = (RCB::BBox *) inout;
1135
1136 for (int i = 0; i < 3; i++) {
1137 if (box1->lo[i] < box2->lo[i]) box2->lo[i] = box1->lo[i];
1138 if (box1->hi[i] > box2->hi[i]) box2->hi[i] = box1->hi[i];
1139 }
1140 }
1141
1142 /* ----------------------------------------------------------------------
1143 custom MPI reduce operation
1144 merge median data structure
1145 on input:
1146 in,inout->totallo, totalhi = weight in both partitions on this proc
1147 valuelo, valuehi = pos of nearest dot(s) to cut on this proc
1148 wtlo, wthi = total wt of dot(s) at that pos on this proc
1149 countlo, counthi = # of dot(s) nearest to cut on this proc
1150 proclo, prochi = not used
1151 on exit:
1152 inout-> totallo, totalhi = total # of active dots in both partitions
1153 valuelo, valuehi = pos of nearest dot(s) to cut
1154 wtlo, wthi = total wt of dot(s) at that position
1155 countlo, counthi = total # of dot(s) nearest to cut
1156 proclo, prochi = one unique proc who owns a nearest dot
1157 all procs must get same proclo,prochi
1158 ------------------------------------------------------------------------- */
1159
median_merge(void * in,void * inout,int *,MPI_Datatype *)1160 void median_merge(void *in, void *inout, int * /*len*/, MPI_Datatype * /*dptr*/)
1161
1162 {
1163 RCB::Median *med1 = (RCB::Median *) in;
1164 RCB::Median *med2 = (RCB::Median *) inout;
1165
1166 med2->totallo += med1->totallo;
1167 if (med1->valuelo > med2->valuelo) {
1168 med2->valuelo = med1->valuelo;
1169 med2->wtlo = med1->wtlo;
1170 med2->countlo = med1->countlo;
1171 med2->proclo = med1->proclo;
1172 }
1173 else if (med1->valuelo == med2->valuelo) {
1174 med2->wtlo += med1->wtlo;
1175 med2->countlo += med1->countlo;
1176 if (med1->proclo < med2->proclo) med2->proclo = med1->proclo;
1177 }
1178
1179 med2->totalhi += med1->totalhi;
1180 if (med1->valuehi < med2->valuehi) {
1181 med2->valuehi = med1->valuehi;
1182 med2->wthi = med1->wthi;
1183 med2->counthi = med1->counthi;
1184 med2->prochi = med1->prochi;
1185 }
1186 else if (med1->valuehi == med2->valuehi) {
1187 med2->wthi += med1->wthi;
1188 med2->counthi += med1->counthi;
1189 if (med1->prochi < med2->prochi) med2->prochi = med1->prochi;
1190 }
1191 }
1192
1193 /* ----------------------------------------------------------------------
1194 invert the RCB rebalance result to convert receive info into send info
1195 sortflag = flag for sorting order of received messages by proc ID
1196 ------------------------------------------------------------------------- */
1197
invert(int sortflag)1198 void RCB::invert(int sortflag)
1199 {
1200 // only create Irregular if not previously created
1201 // allows Irregular to persist for multiple RCB calls by fix balance
1202
1203 if (!irregular) irregular = new Irregular(lmp);
1204
1205 // nsend = # of dots to request from other procs
1206
1207 int nsend = nfinal-nkeep;
1208
1209 int *proclist;
1210 memory->create(proclist,nsend,"RCB:proclist");
1211
1212 Invert *sinvert =
1213 (Invert *) memory->smalloc(nsend*sizeof(Invert),"RCB:sinvert");
1214
1215 int m = 0;
1216 for (int i = nkeep; i < nfinal; i++) {
1217 proclist[m] = recvproc[i];
1218 sinvert[m].rindex = recvindex[i];
1219 sinvert[m].sproc = me;
1220 sinvert[m].sindex = i;
1221 m++;
1222 }
1223
1224 // perform inversion via irregular comm
1225 // nrecv = # of my dots to send to other procs
1226
1227 int nrecv = irregular->create_data(nsend,proclist,sortflag);
1228 Invert *rinvert =
1229 (Invert *) memory->smalloc(nrecv*sizeof(Invert),"RCB:rinvert");
1230 irregular->exchange_data((char *) sinvert,sizeof(Invert),(char *) rinvert);
1231 irregular->destroy_data();
1232
1233 // set public variables from requests to send my dots
1234
1235 if (noriginal > maxsend) {
1236 memory->destroy(sendproc);
1237 memory->destroy(sendindex);
1238 maxsend = noriginal;
1239 memory->create(sendproc,maxsend,"RCB:sendproc");
1240 memory->create(sendindex,maxsend,"RCB:sendindex");
1241 }
1242
1243 for (int i = 0; i < nkeep; i++) {
1244 sendproc[recvindex[i]] = me;
1245 sendindex[recvindex[i]] = i;
1246 }
1247
1248 for (int i = 0; i < nrecv; i++) {
1249 m = rinvert[i].rindex;
1250 sendproc[m] = rinvert[i].sproc;
1251 sendindex[m] = rinvert[i].sindex;
1252 }
1253
1254 // clean-up
1255
1256 memory->destroy(proclist);
1257 memory->destroy(sinvert);
1258 memory->destroy(rinvert);
1259 }
1260
1261 /* ----------------------------------------------------------------------
1262 memory use of Irregular
1263 ------------------------------------------------------------------------- */
1264
memory_usage()1265 double RCB::memory_usage()
1266 {
1267 double bytes = 0;
1268 if (irregular) bytes += irregular->memory_usage();
1269 return bytes;
1270 }
1271
1272 // -----------------------------------------------------------------------
1273 // DEBUG methods
1274 // -----------------------------------------------------------------------
1275 /*
1276 // consistency checks on RCB results
1277
1278 void RCB::check()
1279 {
1280 int i,iflag,total1,total2;
1281 double weight,wtmax,wtmin,wtone,tolerance;
1282
1283 // check that total # of dots remained the same
1284
1285 MPI_Allreduce(&ndotorig,&total1,1,MPI_INT,MPI_SUM,world);
1286 MPI_Allreduce(&ndot,&total2,1,MPI_INT,MPI_SUM,world);
1287 if (total1 != total2) {
1288 if (me == 0)
1289 printf("ERROR: Points before RCB = %d, Points after RCB = %d\n",
1290 total1,total2);
1291 }
1292
1293 // check that result is load-balanced within log2(P)*max-wt
1294
1295 weight = wtone = 0.0;
1296 for (i = 0; i < ndot; i++) {
1297 weight += dots[i].wt;
1298 if (dots[i].wt > wtone) wtone = dots[i].wt;
1299 }
1300
1301 MPI_Allreduce(&weight,&wtmin,1,MPI_DOUBLE,MPI_MIN,world);
1302 MPI_Allreduce(&weight,&wtmax,1,MPI_DOUBLE,MPI_MAX,world);
1303 MPI_Allreduce(&wtone,&tolerance,1,MPI_DOUBLE,MPI_MAX,world);
1304
1305 // i = smallest power-of-2 >= nprocs
1306 // tolerance = largest-single-weight*log2(nprocs)
1307
1308 for (i = 0; (nprocs >> i) != 0; i++);
1309 tolerance = tolerance * i * (1.0 + TINY);
1310
1311 if (wtmax - wtmin > tolerance) {
1312 if (me == 0)
1313 printf("ERROR: Load-imbalance > tolerance of %g\n",tolerance);
1314 MPI_Barrier(world);
1315 if (weight == wtmin) printf(" Proc %d has weight = %g\n",me,weight);
1316 if (weight == wtmax) printf(" Proc %d has weight = %g\n",me,weight);
1317 }
1318
1319 MPI_Barrier(world);
1320
1321 // check that final set of points is inside RCB box of each proc
1322
1323 iflag = 0;
1324 for (i = 0; i < ndot; i++) {
1325 if (dots[i].x[0] < lo[0] || dots[i].x[0] > hi[0] ||
1326 dots[i].x[1] < lo[1] || dots[i].x[1] > hi[1] ||
1327 dots[i].x[2] < lo[2] || dots[i].x[2] > hi[2])
1328 iflag++;
1329 }
1330 if (iflag > 0)
1331 printf("ERROR: %d points are out-of-box on proc %d\n",iflag,me);
1332 }
1333
1334 // stats for RCB decomposition
1335
1336 void RCB::stats(int flag)
1337 {
1338 int i,iflag,sum,min,max;
1339 double ave,rsum,rmin,rmax;
1340 double weight,wttot,wtmin,wtmax;
1341
1342 if (me == 0) printf("RCB Statistics:\n");
1343
1344 // distribution info
1345
1346 for (i = 0, weight = 0.0; i < ndot; i++) weight += dots[i].wt;
1347 MPI_Allreduce(&weight,&wttot,1,MPI_DOUBLE,MPI_SUM,world);
1348 MPI_Allreduce(&weight,&wtmin,1,MPI_DOUBLE,MPI_MIN,world);
1349 MPI_Allreduce(&weight,&wtmax,1,MPI_DOUBLE,MPI_MAX,world);
1350
1351 if (me == 0) {
1352 printf(" Total weight of dots = %g\n",wttot);
1353 printf(" Weight on each proc: ave = %g, max = %g, min = %g\n",
1354 wttot/nprocs,wtmax,wtmin);
1355 }
1356 if (flag) {
1357 MPI_Barrier(world);
1358 printf(" Proc %d has weight = %g\n",me,weight);
1359 }
1360
1361 for (i = 0, weight = 0.0; i < ndot; i++)
1362 if (dots[i].wt > weight) weight = dots[i].wt;
1363 MPI_Allreduce(&weight,&wtmax,1,MPI_DOUBLE,MPI_MAX,world);
1364
1365 if (me == 0) printf(" Maximum weight of single dot = %g\n",wtmax);
1366 if (flag) {
1367 MPI_Barrier(world);
1368 printf(" Proc %d max weight = %g\n",me,weight);
1369 }
1370
1371 // counter info
1372
1373 MPI_Allreduce(&counters[0],&sum,1,MPI_INT,MPI_SUM,world);
1374 MPI_Allreduce(&counters[0],&min,1,MPI_INT,MPI_MIN,world);
1375 MPI_Allreduce(&counters[0],&max,1,MPI_INT,MPI_MAX,world);
1376 ave = ((double) sum)/nprocs;
1377 if (me == 0)
1378 printf(" Median iter: ave = %g, min = %d, max = %d\n",ave,min,max);
1379 if (flag) {
1380 MPI_Barrier(world);
1381 printf(" Proc %d median count = %d\n",me,counters[0]);
1382 }
1383
1384 MPI_Allreduce(&counters[1],&sum,1,MPI_INT,MPI_SUM,world);
1385 MPI_Allreduce(&counters[1],&min,1,MPI_INT,MPI_MIN,world);
1386 MPI_Allreduce(&counters[1],&max,1,MPI_INT,MPI_MAX,world);
1387 ave = ((double) sum)/nprocs;
1388 if (me == 0)
1389 printf(" Send count: ave = %g, min = %d, max = %d\n",ave,min,max);
1390 if (flag) {
1391 MPI_Barrier(world);
1392 printf(" Proc %d send count = %d\n",me,counters[1]);
1393 }
1394
1395 MPI_Allreduce(&counters[2],&sum,1,MPI_INT,MPI_SUM,world);
1396 MPI_Allreduce(&counters[2],&min,1,MPI_INT,MPI_MIN,world);
1397 MPI_Allreduce(&counters[2],&max,1,MPI_INT,MPI_MAX,world);
1398 ave = ((double) sum)/nprocs;
1399 if (me == 0)
1400 printf(" Recv count: ave = %g, min = %d, max = %d\n",ave,min,max);
1401 if (flag) {
1402 MPI_Barrier(world);
1403 printf(" Proc %d recv count = %d\n",me,counters[2]);
1404 }
1405
1406 MPI_Allreduce(&counters[3],&sum,1,MPI_INT,MPI_SUM,world);
1407 MPI_Allreduce(&counters[3],&min,1,MPI_INT,MPI_MIN,world);
1408 MPI_Allreduce(&counters[3],&max,1,MPI_INT,MPI_MAX,world);
1409 ave = ((double) sum)/nprocs;
1410 if (me == 0)
1411 printf(" Max dots: ave = %g, min = %d, max = %d\n",ave,min,max);
1412 if (flag) {
1413 MPI_Barrier(world);
1414 printf(" Proc %d max dots = %d\n",me,counters[3]);
1415 }
1416
1417 MPI_Allreduce(&counters[4],&sum,1,MPI_INT,MPI_SUM,world);
1418 MPI_Allreduce(&counters[4],&min,1,MPI_INT,MPI_MIN,world);
1419 MPI_Allreduce(&counters[4],&max,1,MPI_INT,MPI_MAX,world);
1420 ave = ((double) sum)/nprocs;
1421 if (me == 0)
1422 printf(" Max memory: ave = %g, min = %d, max = %d\n",ave,min,max);
1423 if (flag) {
1424 MPI_Barrier(world);
1425 printf(" Proc %d max memory = %d\n",me,counters[4]);
1426 }
1427
1428 if (reuse) {
1429 MPI_Allreduce(&counters[5],&sum,1,MPI_INT,MPI_SUM,world);
1430 MPI_Allreduce(&counters[5],&min,1,MPI_INT,MPI_MIN,world);
1431 MPI_Allreduce(&counters[5],&max,1,MPI_INT,MPI_MAX,world);
1432 ave = ((double) sum)/nprocs;
1433 if (me == 0)
1434 printf(" # of Reuse: ave = %g, min = %d, max = %d\n",ave,min,max);
1435 if (flag) {
1436 MPI_Barrier(world);
1437 printf(" Proc %d # of Reuse = %d\n",me,counters[5]);
1438 }
1439 }
1440
1441 MPI_Allreduce(&counters[6],&sum,1,MPI_INT,MPI_SUM,world);
1442 MPI_Allreduce(&counters[6],&min,1,MPI_INT,MPI_MIN,world);
1443 MPI_Allreduce(&counters[6],&max,1,MPI_INT,MPI_MAX,world);
1444 ave = ((double) sum)/nprocs;
1445 if (me == 0)
1446 printf(" # of OverAlloc: ave = %g, min = %d, max = %d\n",ave,min,max);
1447 if (flag) {
1448 MPI_Barrier(world);
1449 printf(" Proc %d # of OverAlloc = %d\n",me,counters[6]);
1450 }
1451
1452 // RCB boxes for each proc
1453
1454 if (flag) {
1455 if (me == 0) printf(" RCB sub-domain boxes:\n");
1456 for (i = 0; i < 3; i++) {
1457 MPI_Barrier(world);
1458 if (me == 0) printf(" Dimension %d\n",i+1);
1459 MPI_Barrier(world);
1460 printf(" Proc = %d: Box = %g %g\n",me,lo[i],hi[i]);
1461 }
1462 }
1463 }
1464 */
1465