1 /*
2  * Copyright 1997, Regents of the University of Minnesota
3  *
4  * xyzpart.c
5  *
6  * This file contains code that implements a coordinate based partitioning
7  *
8  * Started 7/11/97
9  * George
10  *
11  * $Id: xyzpart.c 10755 2011-09-15 12:28:34Z karypis $
12  *
13  */
14 
15 #include <parmetislib.h>
16 
17 
18 /*************************************************************************/
19 /*! This function implements a simple coordinate based partitioning
20 */
21 /*************************************************************************/
Coordinate_Partition(ctrl_t * ctrl,graph_t * graph,idx_t ndims,real_t * xyz,idx_t setup)22 void Coordinate_Partition(ctrl_t *ctrl, graph_t *graph, idx_t ndims,
23          real_t *xyz, idx_t setup)
24 {
25   idx_t i, j, k, nvtxs, firstvtx, icoord, nbits;
26   idx_t *vtxdist, *bxyz;
27   ikv_t *cand;
28 
29   WCOREPUSH;
30 
31   if (setup)
32     CommSetup(ctrl, graph);
33   else
34     graph->nrecv = 0;
35 
36   nvtxs    = graph->nvtxs;
37   vtxdist  = graph->vtxdist;
38   firstvtx = vtxdist[ctrl->mype];
39 
40   cand = ikvwspacemalloc(ctrl, nvtxs);
41   bxyz = iwspacemalloc(ctrl, nvtxs*ndims);
42 
43   /* Assign the coordinates into bins */
44   nbits = 9;  /* 2^nbits # of bins */
45   IRBinCoordinates(ctrl, graph, ndims, xyz, 1<<nbits, bxyz);
46 
47   /* Z-ordering */
48   for (i=0; i<nvtxs; i++) {
49     for (icoord=0, j=nbits-1; j>=0; j--) {
50       for (k=0; k<ndims; k++)
51         icoord = (icoord<<1) + (bxyz[i*ndims+k]&(1<<j) ? 1 : 0);
52     }
53     cand[i].key = icoord;
54     cand[i].val = firstvtx+i;
55   }
56 
57   /* Partition using sorting */
58   PseudoSampleSort(ctrl, graph, cand);
59 
60   WCOREPOP;
61 }
62 
63 
64 /*************************************************************************/
65 /*! This function maps the coordinates into bin numbers.
66     It starts with a uniform distribution of the max-min range and then
67     performs a number of iterations that adjust the bucket boundaries based
68     on the actual bucket counts.
69 */
70 /*************************************************************************/
IRBinCoordinates(ctrl_t * ctrl,graph_t * graph,idx_t ndims,real_t * xyz,idx_t nbins,idx_t * bxyz)71 void IRBinCoordinates(ctrl_t *ctrl, graph_t *graph, idx_t ndims, real_t *xyz,
72          idx_t nbins, idx_t *bxyz)
73 {
74   idx_t npes=ctrl->npes, mype=ctrl->mype;
75   idx_t i, j, k, l, gnvtxs, nvtxs;
76   idx_t csize, psize;
77   idx_t *vtxdist, *lcounts, *gcounts;
78   real_t gmin, gmax, *emarkers, *nemarkers;
79   rkv_t *cand;
80 
81   WCOREPUSH;
82 
83   gnvtxs   = graph->gnvtxs;
84   nvtxs    = graph->nvtxs;
85 
86   cand      = rkvwspacemalloc(ctrl, nvtxs);
87   lcounts   = iwspacemalloc(ctrl, nbins);
88   gcounts   = iwspacemalloc(ctrl, nbins);
89   emarkers  = rwspacemalloc(ctrl, nbins+1);
90   nemarkers = rwspacemalloc(ctrl, nbins+1);
91 
92 
93   /* Go over each dimension */
94   for (k=0; k<ndims; k++) {
95     for (i=0; i<nvtxs; i++) {
96       cand[i].key = xyz[i*ndims+k];
97       cand[i].val = i;
98     }
99     rkvsorti(nvtxs, cand);
100 
101     /* determine initial range */
102     gkMPI_Allreduce((void *)&cand[0].key, (void *)&gmin, 1, REAL_T, MPI_MIN, ctrl->comm);
103     gkMPI_Allreduce((void *)&cand[nvtxs-1].key, (void *)&gmax, 1, REAL_T, MPI_MAX, ctrl->comm);
104 
105     for (i=0; i<nbins; i++)
106       emarkers[i] = gmin + (gmax-gmin)*i/nbins;
107     emarkers[nbins] = gmax*(1.0+copysign(1.0,gmax)*2.0*REAL_EPSILON);
108 
109     /* get into a iterative backet boundary refinement */
110     for (l=0; l<5; l++) {
111       /* determine bucket counts */
112       iset(nbins, 0, lcounts);
113       for (j=0, i=0; i<nvtxs;) {
114         if (cand[i].key <= emarkers[j+1]) {
115           lcounts[j]++;
116           i++;
117         }
118         else {
119           j++;
120         }
121       }
122 
123       gkMPI_Allreduce((void *)lcounts, (void *)gcounts, nbins, IDX_T, MPI_SUM, ctrl->comm);
124 
125       /*
126       if (mype == 0) {
127         printf("Distribution [%"PRIDX"]...\n", l);
128         for (i=0; i<nbins; i++)
129           printf("\t%"PRREAL" - %"PRREAL" => %"PRIDX"\n", emarkers[i], emarkers[i+1], gcounts[i]);
130       }
131       */
132 
133       /* break-out if things look reasonably balanced */
134       if (imax(nbins, gcounts) < 4*gnvtxs/nbins)
135         break;
136 
137       /* refine buckets */
138       rset(nbins, -1, nemarkers);
139       for (j=0, i=0; i<nbins; i++) {
140         for (csize=0; ; j++) {
141           if (csize+gcounts[j] < gnvtxs/nbins) {
142             csize += gcounts[j];
143           }
144           else {
145             psize = gnvtxs/nbins-csize;
146             emarkers[j] += (emarkers[j+1]-emarkers[j])*psize/gcounts[j];
147             gcounts[j]  -= psize;
148 
149             nemarkers[i+1] = emarkers[j];
150             break;
151           }
152         }
153       }
154       nemarkers[0]     = gmin;
155       nemarkers[nbins] = gmax*(1.0+copysign(1.0,gmax)*2.0*REAL_EPSILON);
156       rcopy(nbins+1, nemarkers, emarkers);
157     }
158 
159     /* assign the coordinate to the appropriate bin */
160     for (j=0, i=0; i<nvtxs;) {
161       if (cand[i].key <= emarkers[j+1]) {
162         bxyz[cand[i].val*ndims+k] = j;
163         i++;
164       }
165       else {
166         j++;
167       }
168     }
169   }
170 
171   WCOREPOP;
172 }
173 
174 
175 /*************************************************************************/
176 /*! This function maps the coordinates into bin numbers. It uses a per
177     dimension recursive center-of mass bisection approach.
178 */
179 /*************************************************************************/
RBBinCoordinates(ctrl_t * ctrl,graph_t * graph,idx_t ndims,real_t * xyz,idx_t nbins,idx_t * bxyz)180 void RBBinCoordinates(ctrl_t *ctrl, graph_t *graph, idx_t ndims, real_t *xyz,
181          idx_t nbins, idx_t *bxyz)
182 {
183   idx_t npes=ctrl->npes, mype=ctrl->mype;
184   idx_t i, j, k, l, gnvtxs, nvtxs, cnbins;
185   idx_t *vtxdist, *lcounts, *gcounts;
186   real_t sum, gmin, gmax, gsum, *emarkers, *nemarkers, *lsums, *gsums;
187   rkv_t *cand;
188   ikv_t *buckets;
189 
190   WCOREPUSH;
191 
192   gnvtxs   = graph->gnvtxs;
193   nvtxs    = graph->nvtxs;
194 
195   buckets   = ikvwspacemalloc(ctrl, nbins);
196   cand      = rkvwspacemalloc(ctrl, nvtxs);
197   lcounts   = iwspacemalloc(ctrl, nbins);
198   gcounts   = iwspacemalloc(ctrl, nbins);
199   lsums     = rwspacemalloc(ctrl, nbins);
200   gsums     = rwspacemalloc(ctrl, nbins);
201   emarkers  = rwspacemalloc(ctrl, nbins+1);
202   nemarkers = rwspacemalloc(ctrl, nbins+1);
203 
204 
205   /* Go over each dimension */
206   for (k=0; k<ndims; k++) {
207     for (sum=0.0, i=0; i<nvtxs; i++) {
208       cand[i].key = xyz[i*ndims+k];
209       cand[i].val = i;
210       sum += cand[i].key;
211     }
212     rkvsorti(nvtxs, cand);
213 
214     /* determine initial stats */
215     gkMPI_Allreduce((void *)&cand[0].key, (void *)&gmin, 1, REAL_T, MPI_MIN, ctrl->comm);
216     gkMPI_Allreduce((void *)&cand[nvtxs-1].key, (void *)&gmax, 1, REAL_T, MPI_MAX, ctrl->comm);
217     gkMPI_Allreduce((void *)&sum, (void *)&gsum, 1, REAL_T, MPI_MAX, ctrl->comm);
218 
219     emarkers[0] = gmin;
220     emarkers[1] = gsum/gnvtxs;
221     emarkers[2] = gmax*(1.0+(gmax < 0 ? -1. : 1.)*2.0*REAL_EPSILON);
222     cnbins = 2;
223 
224     /* get into a iterative backet boundary refinement */
225     while (cnbins < nbins) {
226       /* determine bucket counts */
227       iset(cnbins, 0, lcounts);
228       rset(cnbins, 0, lsums);
229       for (j=0, i=0; i<nvtxs;) {
230         if (cand[i].key <= emarkers[j+1]) {
231           lcounts[j]++;
232           lsums[j] += cand[i].key;
233           i++;
234         }
235         else {
236           j++;
237         }
238       }
239 
240       gkMPI_Allreduce((void *)lcounts, (void *)gcounts, cnbins, IDX_T, MPI_SUM, ctrl->comm);
241       gkMPI_Allreduce((void *)lsums, (void *)gsums, cnbins, REAL_T, MPI_SUM, ctrl->comm);
242 
243       /*
244       if (mype == 0) {
245         printf("Distribution [%"PRIDX"]...\n", cnbins);
246         for (i=0; i<cnbins; i++)
247           printf("\t%"PRREAL" - %"PRREAL" => %"PRIDX"\n", emarkers[i], emarkers[i+1], gcounts[i]);
248       }
249       */
250 
251 
252       /* split over-weight buckets */
253       for (i=0; i<cnbins; i++) {
254         buckets[i].key = gcounts[i];
255         buckets[i].val = i;
256       }
257       ikvsorti(cnbins, buckets);
258 
259       for (j=0, i=cnbins-1; i>=0; i--, j++) {
260         l = buckets[i].val;
261         if (buckets[i].key > gnvtxs/nbins && cnbins < nbins) {
262           /*
263           if (mype == 0)
264             printf("\t\t %f %f\n", (float)emarkers[l], (float)emarkers[l+1]);
265           */
266           nemarkers[j++] = (emarkers[l]+emarkers[l+1])/2;
267           cnbins++;
268         }
269         nemarkers[j] = emarkers[l];
270       }
271       PASSERT(ctrl, cnbins == j);
272 
273       rsorti(cnbins, nemarkers);
274       rcopy(cnbins, nemarkers, emarkers);
275       emarkers[cnbins] = gmax*(1.0+(gmax < 0 ? -1. : 1.)*2.0*REAL_EPSILON);
276     }
277 
278     /* assign the coordinate to the appropriate bin */
279     for (j=0, i=0; i<nvtxs;) {
280       if (cand[i].key <= emarkers[j+1]) {
281         bxyz[cand[i].val*ndims+k] = j;
282         i++;
283       }
284       else {
285         j++;
286       }
287     }
288   }
289 
290   WCOREPOP;
291 }
292 
293 
294 /**************************************************************************/
295 /*! This function sorts a distributed list of ikv_t in increasing
296     order, and uses it to compute a partition. It uses samplesort.
297 
298     This function is poorly implemented and makes the assumption that the
299     number of vertices in each processor is greater than npes.
300     This constraint is currently enforced by the calling functions.
301     \todo fix it in 4.0.
302 */
303 /**************************************************************************/
SampleSort(ctrl_t * ctrl,graph_t * graph,ikv_t * elmnts)304 void SampleSort(ctrl_t *ctrl, graph_t *graph, ikv_t *elmnts)
305 {
306   idx_t i, j, k, nvtxs, nrecv, npes=ctrl->npes, mype=ctrl->mype,
307         firstvtx, lastvtx;
308   idx_t *scounts, *rcounts, *vtxdist, *perm;
309   ikv_t *relmnts, *mypicks, *allpicks;
310 
311   WCOREPUSH;
312 
313   CommUpdateNnbrs(ctrl, npes);
314 
315   nvtxs   = graph->nvtxs;
316   vtxdist = graph->vtxdist;
317 
318   /* get memory for the counts */
319   scounts = iwspacemalloc(ctrl, npes+1);
320   rcounts = iwspacemalloc(ctrl, npes+1);
321 
322   /* get memory for the splitters */
323   mypicks  = ikvwspacemalloc(ctrl, npes+1);
324   WCOREPUSH; /* for freeing allpicks */
325   allpicks = ikvwspacemalloc(ctrl, npes*npes);
326 
327   /* Sort the local elements */
328   ikvsorti(nvtxs, elmnts);
329 
330   /* Select the local npes-1 equally spaced elements */
331   for (i=1; i<npes; i++) {
332     mypicks[i-1].key = elmnts[i*(nvtxs/npes)].key;
333     mypicks[i-1].val = elmnts[i*(nvtxs/npes)].val;
334   }
335 
336   /* PrintPairs(ctrl, npes-1, mypicks, "Mypicks"); */
337 
338   /* Gather the picks to all the processors */
339   gkMPI_Allgather((void *)mypicks, 2*(npes-1), IDX_T, (void *)allpicks,
340       2*(npes-1), IDX_T, ctrl->comm);
341 
342   /* PrintPairs(ctrl, npes*(npes-1), allpicks, "Allpicks"); */
343 
344   /* Sort all the picks */
345   ikvsortii(npes*(npes-1), allpicks);
346 
347   /* PrintPairs(ctrl, npes*(npes-1), allpicks, "Allpicks"); */
348 
349   /* Select the final splitters. Set the boundaries to simplify coding */
350   for (i=1; i<npes; i++)
351     mypicks[i] = allpicks[i*(npes-1)];
352   mypicks[0].key    = IDX_MIN;
353   mypicks[npes].key = IDX_MAX;
354 
355   /* PrintPairs(ctrl, npes+1, mypicks, "Mypicks"); */
356 
357   WCOREPOP;  /* free allpicks */
358 
359   /* Compute the number of elements that belong to each bucket */
360   iset(npes, 0, scounts);
361   for (j=i=0; i<nvtxs; i++) {
362     if (elmnts[i].key < mypicks[j+1].key ||
363         (elmnts[i].key == mypicks[j+1].key && elmnts[i].val < mypicks[j+1].val))
364       scounts[j]++;
365     else
366       scounts[++j]++;
367   }
368   gkMPI_Alltoall(scounts, 1, IDX_T, rcounts, 1, IDX_T, ctrl->comm);
369 
370   MAKECSR(i, npes, scounts);
371   MAKECSR(i, npes, rcounts);
372 
373 /*
374   PrintVector(ctrl, npes+1, 0, scounts, "Scounts");
375   PrintVector(ctrl, npes+1, 0, rcounts, "Rcounts");
376 */
377 
378   /* Allocate memory for sorted elements and receive them */
379   nrecv   = rcounts[npes];
380   relmnts = ikvwspacemalloc(ctrl, nrecv);
381 
382   /* Issue the receives first */
383   for (i=0; i<npes; i++)
384     gkMPI_Irecv((void *)(relmnts+rcounts[i]), 2*(rcounts[i+1]-rcounts[i]),
385         IDX_T, i, 1, ctrl->comm, ctrl->rreq+i);
386 
387   /* Issue the sends next */
388   for (i=0; i<npes; i++)
389     gkMPI_Isend((void *)(elmnts+scounts[i]), 2*(scounts[i+1]-scounts[i]),
390         IDX_T, i, 1, ctrl->comm, ctrl->sreq+i);
391 
392   gkMPI_Waitall(npes, ctrl->rreq, ctrl->statuses);
393   gkMPI_Waitall(npes, ctrl->sreq, ctrl->statuses);
394 
395 
396   /* OK, now do the local sort of the relmnts. Use perm to keep track original order */
397   perm = iwspacemalloc(ctrl, nrecv);
398   for (i=0; i<nrecv; i++) {
399     perm[i]        = relmnts[i].val;
400     relmnts[i].val = i;
401   }
402   ikvsorti(nrecv, relmnts);
403 
404 
405   /* Compute what needs to be shifted */
406   gkMPI_Scan((void *)(&nrecv), (void *)(&lastvtx), 1, IDX_T, MPI_SUM, ctrl->comm);
407   firstvtx = lastvtx-nrecv;
408 
409   /*myprintf(ctrl, "first, last: %"PRIDX" %"PRIDX"\n", firstvtx, lastvtx); */
410 
411   for (j=0, i=0; i<npes; i++) {
412     if (vtxdist[i+1] > firstvtx) {  /* Found the first PE that is passed me */
413       if (vtxdist[i+1] >= lastvtx) {
414         /* myprintf(ctrl, "Shifting %"PRIDX" elements to processor %"PRIDX"\n", lastvtx-firstvtx, i); */
415         for (k=0; k<lastvtx-firstvtx; k++, j++)
416           relmnts[relmnts[j].val].key = i;
417       }
418       else {
419         /* myprintf(ctrl, "Shifting %"PRIDX" elements to processor %"PRIDX"\n", vtxdist[i+1]-firstvtx, i); */
420         for (k=0; k<vtxdist[i+1]-firstvtx; k++, j++)
421           relmnts[relmnts[j].val].key = i;
422 
423         firstvtx = vtxdist[i+1];
424       }
425     }
426     if (vtxdist[i+1] >= lastvtx)
427       break;
428   }
429 
430   /* Reverse the ordering on the relmnts[].val */
431   for (i=0; i<nrecv; i++) {
432     PASSERTP(ctrl, relmnts[i].key>=0 && relmnts[i].key<npes,
433             (ctrl, "%"PRIDX" %"PRIDX"\n", i, relmnts[i].key));
434     relmnts[i].val = perm[i];
435   }
436 
437   /* OK, now sent it back */
438   /* Issue the receives first */
439   for (i=0; i<npes; i++)
440     gkMPI_Irecv((void *)(elmnts+scounts[i]), 2*(scounts[i+1]-scounts[i]), IDX_T,
441         i, 1, ctrl->comm, ctrl->rreq+i);
442 
443   /* Issue the sends next */
444   for (i=0; i<npes; i++)
445     gkMPI_Isend((void *)(relmnts+rcounts[i]), 2*(rcounts[i+1]-rcounts[i]), IDX_T,
446         i, 1, ctrl->comm, ctrl->sreq+i);
447 
448   gkMPI_Waitall(npes, ctrl->rreq, ctrl->statuses);
449   gkMPI_Waitall(npes, ctrl->sreq, ctrl->statuses);
450 
451 
452   /* Construct a partition for the graph */
453   graph->where = imalloc(graph->nvtxs+graph->nrecv, "PartSort: graph->where");
454   firstvtx = vtxdist[mype];
455   for (i=0; i<nvtxs; i++) {
456     PASSERTP(ctrl, elmnts[i].key>=0 && elmnts[i].key<npes,
457         (ctrl, "%"PRIDX" %"PRIDX"\n", i, elmnts[i].key));
458     PASSERTP(ctrl, elmnts[i].val>=vtxdist[mype] && elmnts[i].val<vtxdist[mype+1],
459         (ctrl, "%"PRIDX" %"PRIDX" %"PRIDX" %"PRIDX"\n", i, vtxdist[mype], vtxdist[mype+1], elmnts[i].val));
460     graph->where[elmnts[i].val-firstvtx] = elmnts[i].key;
461   }
462 
463   WCOREPOP;
464 }
465 
466 
467 /**************************************************************************/
468 /*! This function sorts a distributed list of ikv_t in increasing
469     order, and uses it to compute a partition. It uses a
470     samplesort variant whose number of local samples can potentially
471     be smaller than npes.
472 */
473 /**************************************************************************/
PseudoSampleSort(ctrl_t * ctrl,graph_t * graph,ikv_t * elmnts)474 void PseudoSampleSort(ctrl_t *ctrl, graph_t *graph, ikv_t *elmnts)
475 {
476   idx_t npes=ctrl->npes, mype=ctrl->mype;
477   idx_t i, j, k, nlsamples, ntsamples, nvtxs, nrecv, firstvtx, lastvtx;
478   idx_t *scounts, *rcounts, *sdispls, *rdispls, *vtxdist, *perm;
479   ikv_t *relmnts, *mypicks, *allpicks;
480 
481 STARTTIMER(ctrl, ctrl->AuxTmr1);
482 
483   WCOREPUSH;
484 
485   nvtxs   = graph->nvtxs;
486   vtxdist = graph->vtxdist;
487 
488   /* determine the number of local samples */
489   //nlsamples = (GlobalSESum(ctrl, graph->nedges) + graph->gnvtxs)/(npes*npes);
490   nlsamples = graph->gnvtxs/(npes*npes);
491   if (nlsamples > npes)
492     nlsamples = npes;
493   else if (nlsamples < 75)
494     nlsamples = gk_min(75, npes); /* the 'npes' in the min is to account for small graphs */
495 
496 
497   IFSET(ctrl->dbglvl, DBG_INFO,
498       rprintf(ctrl, "PseudoSampleSort: nlsamples=%"PRIDX" of %"PRIDX"\n", nlsamples, npes));
499 
500   /* get memory for the counts and displacements */
501   scounts = iwspacemalloc(ctrl, npes+1);
502   rcounts = iwspacemalloc(ctrl, npes+1);
503   sdispls = iwspacemalloc(ctrl, npes+1);
504   rdispls = iwspacemalloc(ctrl, npes+1);
505 
506   /* get memory for the splitters */
507   mypicks  = ikvwspacemalloc(ctrl, npes+1);
508 
509   WCOREPUSH; /* for freeing allpicks */
510   allpicks = ikvwspacemalloc(ctrl, npes*nlsamples);
511 
512   /* Sort the local elements */
513   ikvsorti(nvtxs, elmnts);
514 
515   /* Select the local nlsamples-1 equally spaced elements */
516   for (i=0; i<nlsamples-1; i++) {
517     if (nvtxs > 0) {
518       k = (nvtxs/(3*nlsamples)            /* initial offset */
519            + i*nvtxs/nlsamples            /* increament */
520            + mype*nvtxs/(npes*nlsamples)  /* per-pe shift for nlsamples<npes */
521           )%nvtxs;
522       mypicks[i].key = elmnts[k].key;
523       mypicks[i].val = elmnts[k].val;
524     }
525     else {
526       /* Take care the case in which a processor has no elements, at which
527          point we still select nlsamples-1, but we set their .val to -1 to be
528          removed later prior to sorting */
529       mypicks[i].val = -1;
530     }
531   }
532 
533   /* PrintPairs(ctrl, nlsamples-1, mypicks, "Mypicks"); */
534 
535 STOPTIMER(ctrl, ctrl->AuxTmr1);
536 STARTTIMER(ctrl, ctrl->AuxTmr2);
537 
538   /* Gather the picks to all the processors */
539   gkMPI_Allgather((void *)mypicks, 2*(nlsamples-1), IDX_T, (void *)allpicks,
540       2*(nlsamples-1), IDX_T, ctrl->comm);
541 
542   /* PrintPairs(ctrl, npes*(nlsamples-1), allpicks, "Allpicks"); */
543 
544   /* Remove any samples that have .val == -1 */
545   for (ntsamples=0, i=0; i<npes*(nlsamples-1); i++) {
546     if (allpicks[i].val != -1)
547       allpicks[ntsamples++] = allpicks[i];
548   }
549 
550   /* Sort all the picks */
551   ikvsortii(ntsamples, allpicks);
552 
553 
554   /* Select the final splitters. Set the boundaries to simplify coding */
555   for (i=1; i<npes; i++)
556     mypicks[i] = allpicks[i*ntsamples/npes];
557   mypicks[0].key    = IDX_MIN;
558   mypicks[npes].key = IDX_MAX;
559 
560 
561   WCOREPOP;  /* free allpicks */
562 
563 STOPTIMER(ctrl, ctrl->AuxTmr2);
564 STARTTIMER(ctrl, ctrl->AuxTmr3);
565 
566   /* Compute the number of elements that belong to each bucket */
567   iset(npes, 0, scounts);
568   for (j=i=0; i<nvtxs; i++) {
569     if (elmnts[i].key < mypicks[j+1].key ||
570         (elmnts[i].key == mypicks[j+1].key && elmnts[i].val < mypicks[j+1].val))
571       scounts[j]++;
572     else
573       scounts[++j]++;
574   }
575   PASSERT(ctrl, j < npes);
576   gkMPI_Alltoall(scounts, 1, IDX_T, rcounts, 1, IDX_T, ctrl->comm);
577 
578   /* multiply raw counts by 2 to account for the ikv_t type */
579   sdispls[0] = rdispls[0] = 0;
580   for (i=0; i<npes; i++) {
581     scounts[i] *= 2;
582     rcounts[i] *= 2;
583     sdispls[i+1] = sdispls[i] + scounts[i];
584     rdispls[i+1] = rdispls[i] + rcounts[i];
585   }
586 
587 STOPTIMER(ctrl, ctrl->AuxTmr3);
588 STARTTIMER(ctrl, ctrl->AuxTmr4);
589 
590   /*
591   PrintVector(ctrl, npes+1, 0, scounts, "Scounts");
592   PrintVector(ctrl, npes+1, 0, rcounts, "Rcounts");
593   */
594 
595   /* Allocate memory for sorted elements and receive them */
596   nrecv   = rdispls[npes]/2;  /* The divide by 2 is to get the # of ikv_t elements */
597   relmnts = ikvwspacemalloc(ctrl, nrecv);
598 
599   IFSET(ctrl->dbglvl, DBG_INFO,
600       rprintf(ctrl, "PseudoSampleSort: max_nrecv: %"PRIDX" of %"PRIDX"\n",
601         GlobalSEMax(ctrl, nrecv), graph->gnvtxs/npes));
602   if (mype == 0 || mype == npes-1)
603     IFSET(ctrl->dbglvl, DBG_INFO,
604         myprintf(ctrl, "PseudoSampleSort: nrecv: %"PRIDX" of %"PRIDX"\n",
605           nrecv, graph->gnvtxs/npes));
606 
607 
608   gkMPI_Alltoallv((void *)elmnts,  scounts, sdispls, IDX_T,
609                   (void *)relmnts, rcounts, rdispls, IDX_T,
610                   ctrl->comm);
611 
612 STOPTIMER(ctrl, ctrl->AuxTmr4);
613 STARTTIMER(ctrl, ctrl->AuxTmr5);
614 
615   /* OK, now do the local sort of the relmnts. Use perm to keep track original order */
616   perm = iwspacemalloc(ctrl, nrecv);
617   for (i=0; i<nrecv; i++) {
618     perm[i]        = relmnts[i].val;
619     relmnts[i].val = i;
620   }
621   ikvsorti(nrecv, relmnts);
622 
623   /* Compute what needs to be shifted */
624   gkMPI_Scan((void *)(&nrecv), (void *)(&lastvtx), 1, IDX_T, MPI_SUM, ctrl->comm);
625   firstvtx = lastvtx-nrecv;
626 
627   for (j=0, i=0; i<npes; i++) {
628     if (vtxdist[i+1] > firstvtx) {  /* Found the first PE that is passed me */
629       if (vtxdist[i+1] >= lastvtx) {
630         /* myprintf(ctrl, "Shifting %"PRIDX" elements to processor %"PRIDX"\n", lastvtx-firstvtx, i); */
631         for (k=0; k<lastvtx-firstvtx; k++, j++)
632           relmnts[relmnts[j].val].key = i;
633       }
634       else {
635         /* myprintf(ctrl, "Shifting %"PRIDX" elements to processor %"PRIDX"\n", vtxdist[i+1]-firstvtx, i); */
636         for (k=0; k<vtxdist[i+1]-firstvtx; k++, j++)
637           relmnts[relmnts[j].val].key = i;
638 
639         firstvtx = vtxdist[i+1];
640       }
641     }
642     if (vtxdist[i+1] >= lastvtx)
643       break;
644   }
645 
646   /* Reverse the ordering on the relmnts[].val */
647   for (i=0; i<nrecv; i++) {
648     PASSERTP(ctrl, relmnts[i].key>=0 && relmnts[i].key<npes,
649             (ctrl, "%"PRIDX" %"PRIDX"\n", i, relmnts[i].key));
650     relmnts[i].val = perm[i];
651   }
652 
653 STOPTIMER(ctrl, ctrl->AuxTmr5);
654 STARTTIMER(ctrl, ctrl->AuxTmr6);
655 
656   /* OK, now sent it back. The role of send/recv arrays is now reversed. */
657   gkMPI_Alltoallv((void *)relmnts, rcounts, rdispls, IDX_T,
658                   (void *)elmnts,  scounts, sdispls, IDX_T,
659                   ctrl->comm);
660 
661 
662   /* Construct a partition for the graph */
663   graph->where = imalloc(graph->nvtxs+graph->nrecv, "PartSort: graph->where");
664   firstvtx = vtxdist[mype];
665   for (i=0; i<nvtxs; i++) {
666     PASSERTP(ctrl, elmnts[i].key>=0 && elmnts[i].key<npes,
667         (ctrl, "%"PRIDX" %"PRIDX"\n", i, elmnts[i].key));
668     PASSERTP(ctrl, elmnts[i].val>=vtxdist[mype] && elmnts[i].val<vtxdist[mype+1],
669         (ctrl, "%"PRIDX" %"PRIDX" %"PRIDX" %"PRIDX"\n", i, vtxdist[mype], vtxdist[mype+1], elmnts[i].val));
670     graph->where[elmnts[i].val-firstvtx] = elmnts[i].key;
671   }
672 
673   WCOREPOP;
674 
675 STOPTIMER(ctrl, ctrl->AuxTmr6);
676 }
677 
678 
679