1 /*
2 * Copyright 1997, Regents of the University of Minnesota
3 *
4 * xyzpart.c
5 *
6 * This file contains code that implements a coordinate based partitioning
7 *
8 * Started 7/11/97
9 * George
10 *
11 * $Id: xyzpart.c 10755 2011-09-15 12:28:34Z karypis $
12 *
13 */
14
15 #include <parmetislib.h>
16
17
18 /*************************************************************************/
19 /*! This function implements a simple coordinate based partitioning
20 */
21 /*************************************************************************/
Coordinate_Partition(ctrl_t * ctrl,graph_t * graph,idx_t ndims,real_t * xyz,idx_t setup)22 void Coordinate_Partition(ctrl_t *ctrl, graph_t *graph, idx_t ndims,
23 real_t *xyz, idx_t setup)
24 {
25 idx_t i, j, k, nvtxs, firstvtx, icoord, nbits;
26 idx_t *vtxdist, *bxyz;
27 ikv_t *cand;
28
29 WCOREPUSH;
30
31 if (setup)
32 CommSetup(ctrl, graph);
33 else
34 graph->nrecv = 0;
35
36 nvtxs = graph->nvtxs;
37 vtxdist = graph->vtxdist;
38 firstvtx = vtxdist[ctrl->mype];
39
40 cand = ikvwspacemalloc(ctrl, nvtxs);
41 bxyz = iwspacemalloc(ctrl, nvtxs*ndims);
42
43 /* Assign the coordinates into bins */
44 nbits = 9; /* 2^nbits # of bins */
45 IRBinCoordinates(ctrl, graph, ndims, xyz, 1<<nbits, bxyz);
46
47 /* Z-ordering */
48 for (i=0; i<nvtxs; i++) {
49 for (icoord=0, j=nbits-1; j>=0; j--) {
50 for (k=0; k<ndims; k++)
51 icoord = (icoord<<1) + (bxyz[i*ndims+k]&(1<<j) ? 1 : 0);
52 }
53 cand[i].key = icoord;
54 cand[i].val = firstvtx+i;
55 }
56
57 /* Partition using sorting */
58 PseudoSampleSort(ctrl, graph, cand);
59
60 WCOREPOP;
61 }
62
63
64 /*************************************************************************/
65 /*! This function maps the coordinates into bin numbers.
66 It starts with a uniform distribution of the max-min range and then
67 performs a number of iterations that adjust the bucket boundaries based
68 on the actual bucket counts.
69 */
70 /*************************************************************************/
IRBinCoordinates(ctrl_t * ctrl,graph_t * graph,idx_t ndims,real_t * xyz,idx_t nbins,idx_t * bxyz)71 void IRBinCoordinates(ctrl_t *ctrl, graph_t *graph, idx_t ndims, real_t *xyz,
72 idx_t nbins, idx_t *bxyz)
73 {
74 idx_t npes=ctrl->npes, mype=ctrl->mype;
75 idx_t i, j, k, l, gnvtxs, nvtxs;
76 idx_t csize, psize;
77 idx_t *vtxdist, *lcounts, *gcounts;
78 real_t gmin, gmax, *emarkers, *nemarkers;
79 rkv_t *cand;
80
81 WCOREPUSH;
82
83 gnvtxs = graph->gnvtxs;
84 nvtxs = graph->nvtxs;
85
86 cand = rkvwspacemalloc(ctrl, nvtxs);
87 lcounts = iwspacemalloc(ctrl, nbins);
88 gcounts = iwspacemalloc(ctrl, nbins);
89 emarkers = rwspacemalloc(ctrl, nbins+1);
90 nemarkers = rwspacemalloc(ctrl, nbins+1);
91
92
93 /* Go over each dimension */
94 for (k=0; k<ndims; k++) {
95 for (i=0; i<nvtxs; i++) {
96 cand[i].key = xyz[i*ndims+k];
97 cand[i].val = i;
98 }
99 rkvsorti(nvtxs, cand);
100
101 /* determine initial range */
102 gkMPI_Allreduce((void *)&cand[0].key, (void *)&gmin, 1, REAL_T, MPI_MIN, ctrl->comm);
103 gkMPI_Allreduce((void *)&cand[nvtxs-1].key, (void *)&gmax, 1, REAL_T, MPI_MAX, ctrl->comm);
104
105 for (i=0; i<nbins; i++)
106 emarkers[i] = gmin + (gmax-gmin)*i/nbins;
107 emarkers[nbins] = gmax*(1.0+copysign(1.0,gmax)*2.0*REAL_EPSILON);
108
109 /* get into a iterative backet boundary refinement */
110 for (l=0; l<5; l++) {
111 /* determine bucket counts */
112 iset(nbins, 0, lcounts);
113 for (j=0, i=0; i<nvtxs;) {
114 if (cand[i].key <= emarkers[j+1]) {
115 lcounts[j]++;
116 i++;
117 }
118 else {
119 j++;
120 }
121 }
122
123 gkMPI_Allreduce((void *)lcounts, (void *)gcounts, nbins, IDX_T, MPI_SUM, ctrl->comm);
124
125 /*
126 if (mype == 0) {
127 printf("Distribution [%"PRIDX"]...\n", l);
128 for (i=0; i<nbins; i++)
129 printf("\t%"PRREAL" - %"PRREAL" => %"PRIDX"\n", emarkers[i], emarkers[i+1], gcounts[i]);
130 }
131 */
132
133 /* break-out if things look reasonably balanced */
134 if (imax(nbins, gcounts) < 4*gnvtxs/nbins)
135 break;
136
137 /* refine buckets */
138 rset(nbins, -1, nemarkers);
139 for (j=0, i=0; i<nbins; i++) {
140 for (csize=0; ; j++) {
141 if (csize+gcounts[j] < gnvtxs/nbins) {
142 csize += gcounts[j];
143 }
144 else {
145 psize = gnvtxs/nbins-csize;
146 emarkers[j] += (emarkers[j+1]-emarkers[j])*psize/gcounts[j];
147 gcounts[j] -= psize;
148
149 nemarkers[i+1] = emarkers[j];
150 break;
151 }
152 }
153 }
154 nemarkers[0] = gmin;
155 nemarkers[nbins] = gmax*(1.0+copysign(1.0,gmax)*2.0*REAL_EPSILON);
156 rcopy(nbins+1, nemarkers, emarkers);
157 }
158
159 /* assign the coordinate to the appropriate bin */
160 for (j=0, i=0; i<nvtxs;) {
161 if (cand[i].key <= emarkers[j+1]) {
162 bxyz[cand[i].val*ndims+k] = j;
163 i++;
164 }
165 else {
166 j++;
167 }
168 }
169 }
170
171 WCOREPOP;
172 }
173
174
175 /*************************************************************************/
176 /*! This function maps the coordinates into bin numbers. It uses a per
177 dimension recursive center-of mass bisection approach.
178 */
179 /*************************************************************************/
RBBinCoordinates(ctrl_t * ctrl,graph_t * graph,idx_t ndims,real_t * xyz,idx_t nbins,idx_t * bxyz)180 void RBBinCoordinates(ctrl_t *ctrl, graph_t *graph, idx_t ndims, real_t *xyz,
181 idx_t nbins, idx_t *bxyz)
182 {
183 idx_t npes=ctrl->npes, mype=ctrl->mype;
184 idx_t i, j, k, l, gnvtxs, nvtxs, cnbins;
185 idx_t *vtxdist, *lcounts, *gcounts;
186 real_t sum, gmin, gmax, gsum, *emarkers, *nemarkers, *lsums, *gsums;
187 rkv_t *cand;
188 ikv_t *buckets;
189
190 WCOREPUSH;
191
192 gnvtxs = graph->gnvtxs;
193 nvtxs = graph->nvtxs;
194
195 buckets = ikvwspacemalloc(ctrl, nbins);
196 cand = rkvwspacemalloc(ctrl, nvtxs);
197 lcounts = iwspacemalloc(ctrl, nbins);
198 gcounts = iwspacemalloc(ctrl, nbins);
199 lsums = rwspacemalloc(ctrl, nbins);
200 gsums = rwspacemalloc(ctrl, nbins);
201 emarkers = rwspacemalloc(ctrl, nbins+1);
202 nemarkers = rwspacemalloc(ctrl, nbins+1);
203
204
205 /* Go over each dimension */
206 for (k=0; k<ndims; k++) {
207 for (sum=0.0, i=0; i<nvtxs; i++) {
208 cand[i].key = xyz[i*ndims+k];
209 cand[i].val = i;
210 sum += cand[i].key;
211 }
212 rkvsorti(nvtxs, cand);
213
214 /* determine initial stats */
215 gkMPI_Allreduce((void *)&cand[0].key, (void *)&gmin, 1, REAL_T, MPI_MIN, ctrl->comm);
216 gkMPI_Allreduce((void *)&cand[nvtxs-1].key, (void *)&gmax, 1, REAL_T, MPI_MAX, ctrl->comm);
217 gkMPI_Allreduce((void *)&sum, (void *)&gsum, 1, REAL_T, MPI_MAX, ctrl->comm);
218
219 emarkers[0] = gmin;
220 emarkers[1] = gsum/gnvtxs;
221 emarkers[2] = gmax*(1.0+(gmax < 0 ? -1. : 1.)*2.0*REAL_EPSILON);
222 cnbins = 2;
223
224 /* get into a iterative backet boundary refinement */
225 while (cnbins < nbins) {
226 /* determine bucket counts */
227 iset(cnbins, 0, lcounts);
228 rset(cnbins, 0, lsums);
229 for (j=0, i=0; i<nvtxs;) {
230 if (cand[i].key <= emarkers[j+1]) {
231 lcounts[j]++;
232 lsums[j] += cand[i].key;
233 i++;
234 }
235 else {
236 j++;
237 }
238 }
239
240 gkMPI_Allreduce((void *)lcounts, (void *)gcounts, cnbins, IDX_T, MPI_SUM, ctrl->comm);
241 gkMPI_Allreduce((void *)lsums, (void *)gsums, cnbins, REAL_T, MPI_SUM, ctrl->comm);
242
243 /*
244 if (mype == 0) {
245 printf("Distribution [%"PRIDX"]...\n", cnbins);
246 for (i=0; i<cnbins; i++)
247 printf("\t%"PRREAL" - %"PRREAL" => %"PRIDX"\n", emarkers[i], emarkers[i+1], gcounts[i]);
248 }
249 */
250
251
252 /* split over-weight buckets */
253 for (i=0; i<cnbins; i++) {
254 buckets[i].key = gcounts[i];
255 buckets[i].val = i;
256 }
257 ikvsorti(cnbins, buckets);
258
259 for (j=0, i=cnbins-1; i>=0; i--, j++) {
260 l = buckets[i].val;
261 if (buckets[i].key > gnvtxs/nbins && cnbins < nbins) {
262 /*
263 if (mype == 0)
264 printf("\t\t %f %f\n", (float)emarkers[l], (float)emarkers[l+1]);
265 */
266 nemarkers[j++] = (emarkers[l]+emarkers[l+1])/2;
267 cnbins++;
268 }
269 nemarkers[j] = emarkers[l];
270 }
271 PASSERT(ctrl, cnbins == j);
272
273 rsorti(cnbins, nemarkers);
274 rcopy(cnbins, nemarkers, emarkers);
275 emarkers[cnbins] = gmax*(1.0+(gmax < 0 ? -1. : 1.)*2.0*REAL_EPSILON);
276 }
277
278 /* assign the coordinate to the appropriate bin */
279 for (j=0, i=0; i<nvtxs;) {
280 if (cand[i].key <= emarkers[j+1]) {
281 bxyz[cand[i].val*ndims+k] = j;
282 i++;
283 }
284 else {
285 j++;
286 }
287 }
288 }
289
290 WCOREPOP;
291 }
292
293
294 /**************************************************************************/
295 /*! This function sorts a distributed list of ikv_t in increasing
296 order, and uses it to compute a partition. It uses samplesort.
297
298 This function is poorly implemented and makes the assumption that the
299 number of vertices in each processor is greater than npes.
300 This constraint is currently enforced by the calling functions.
301 \todo fix it in 4.0.
302 */
303 /**************************************************************************/
SampleSort(ctrl_t * ctrl,graph_t * graph,ikv_t * elmnts)304 void SampleSort(ctrl_t *ctrl, graph_t *graph, ikv_t *elmnts)
305 {
306 idx_t i, j, k, nvtxs, nrecv, npes=ctrl->npes, mype=ctrl->mype,
307 firstvtx, lastvtx;
308 idx_t *scounts, *rcounts, *vtxdist, *perm;
309 ikv_t *relmnts, *mypicks, *allpicks;
310
311 WCOREPUSH;
312
313 CommUpdateNnbrs(ctrl, npes);
314
315 nvtxs = graph->nvtxs;
316 vtxdist = graph->vtxdist;
317
318 /* get memory for the counts */
319 scounts = iwspacemalloc(ctrl, npes+1);
320 rcounts = iwspacemalloc(ctrl, npes+1);
321
322 /* get memory for the splitters */
323 mypicks = ikvwspacemalloc(ctrl, npes+1);
324 WCOREPUSH; /* for freeing allpicks */
325 allpicks = ikvwspacemalloc(ctrl, npes*npes);
326
327 /* Sort the local elements */
328 ikvsorti(nvtxs, elmnts);
329
330 /* Select the local npes-1 equally spaced elements */
331 for (i=1; i<npes; i++) {
332 mypicks[i-1].key = elmnts[i*(nvtxs/npes)].key;
333 mypicks[i-1].val = elmnts[i*(nvtxs/npes)].val;
334 }
335
336 /* PrintPairs(ctrl, npes-1, mypicks, "Mypicks"); */
337
338 /* Gather the picks to all the processors */
339 gkMPI_Allgather((void *)mypicks, 2*(npes-1), IDX_T, (void *)allpicks,
340 2*(npes-1), IDX_T, ctrl->comm);
341
342 /* PrintPairs(ctrl, npes*(npes-1), allpicks, "Allpicks"); */
343
344 /* Sort all the picks */
345 ikvsortii(npes*(npes-1), allpicks);
346
347 /* PrintPairs(ctrl, npes*(npes-1), allpicks, "Allpicks"); */
348
349 /* Select the final splitters. Set the boundaries to simplify coding */
350 for (i=1; i<npes; i++)
351 mypicks[i] = allpicks[i*(npes-1)];
352 mypicks[0].key = IDX_MIN;
353 mypicks[npes].key = IDX_MAX;
354
355 /* PrintPairs(ctrl, npes+1, mypicks, "Mypicks"); */
356
357 WCOREPOP; /* free allpicks */
358
359 /* Compute the number of elements that belong to each bucket */
360 iset(npes, 0, scounts);
361 for (j=i=0; i<nvtxs; i++) {
362 if (elmnts[i].key < mypicks[j+1].key ||
363 (elmnts[i].key == mypicks[j+1].key && elmnts[i].val < mypicks[j+1].val))
364 scounts[j]++;
365 else
366 scounts[++j]++;
367 }
368 gkMPI_Alltoall(scounts, 1, IDX_T, rcounts, 1, IDX_T, ctrl->comm);
369
370 MAKECSR(i, npes, scounts);
371 MAKECSR(i, npes, rcounts);
372
373 /*
374 PrintVector(ctrl, npes+1, 0, scounts, "Scounts");
375 PrintVector(ctrl, npes+1, 0, rcounts, "Rcounts");
376 */
377
378 /* Allocate memory for sorted elements and receive them */
379 nrecv = rcounts[npes];
380 relmnts = ikvwspacemalloc(ctrl, nrecv);
381
382 /* Issue the receives first */
383 for (i=0; i<npes; i++)
384 gkMPI_Irecv((void *)(relmnts+rcounts[i]), 2*(rcounts[i+1]-rcounts[i]),
385 IDX_T, i, 1, ctrl->comm, ctrl->rreq+i);
386
387 /* Issue the sends next */
388 for (i=0; i<npes; i++)
389 gkMPI_Isend((void *)(elmnts+scounts[i]), 2*(scounts[i+1]-scounts[i]),
390 IDX_T, i, 1, ctrl->comm, ctrl->sreq+i);
391
392 gkMPI_Waitall(npes, ctrl->rreq, ctrl->statuses);
393 gkMPI_Waitall(npes, ctrl->sreq, ctrl->statuses);
394
395
396 /* OK, now do the local sort of the relmnts. Use perm to keep track original order */
397 perm = iwspacemalloc(ctrl, nrecv);
398 for (i=0; i<nrecv; i++) {
399 perm[i] = relmnts[i].val;
400 relmnts[i].val = i;
401 }
402 ikvsorti(nrecv, relmnts);
403
404
405 /* Compute what needs to be shifted */
406 gkMPI_Scan((void *)(&nrecv), (void *)(&lastvtx), 1, IDX_T, MPI_SUM, ctrl->comm);
407 firstvtx = lastvtx-nrecv;
408
409 /*myprintf(ctrl, "first, last: %"PRIDX" %"PRIDX"\n", firstvtx, lastvtx); */
410
411 for (j=0, i=0; i<npes; i++) {
412 if (vtxdist[i+1] > firstvtx) { /* Found the first PE that is passed me */
413 if (vtxdist[i+1] >= lastvtx) {
414 /* myprintf(ctrl, "Shifting %"PRIDX" elements to processor %"PRIDX"\n", lastvtx-firstvtx, i); */
415 for (k=0; k<lastvtx-firstvtx; k++, j++)
416 relmnts[relmnts[j].val].key = i;
417 }
418 else {
419 /* myprintf(ctrl, "Shifting %"PRIDX" elements to processor %"PRIDX"\n", vtxdist[i+1]-firstvtx, i); */
420 for (k=0; k<vtxdist[i+1]-firstvtx; k++, j++)
421 relmnts[relmnts[j].val].key = i;
422
423 firstvtx = vtxdist[i+1];
424 }
425 }
426 if (vtxdist[i+1] >= lastvtx)
427 break;
428 }
429
430 /* Reverse the ordering on the relmnts[].val */
431 for (i=0; i<nrecv; i++) {
432 PASSERTP(ctrl, relmnts[i].key>=0 && relmnts[i].key<npes,
433 (ctrl, "%"PRIDX" %"PRIDX"\n", i, relmnts[i].key));
434 relmnts[i].val = perm[i];
435 }
436
437 /* OK, now sent it back */
438 /* Issue the receives first */
439 for (i=0; i<npes; i++)
440 gkMPI_Irecv((void *)(elmnts+scounts[i]), 2*(scounts[i+1]-scounts[i]), IDX_T,
441 i, 1, ctrl->comm, ctrl->rreq+i);
442
443 /* Issue the sends next */
444 for (i=0; i<npes; i++)
445 gkMPI_Isend((void *)(relmnts+rcounts[i]), 2*(rcounts[i+1]-rcounts[i]), IDX_T,
446 i, 1, ctrl->comm, ctrl->sreq+i);
447
448 gkMPI_Waitall(npes, ctrl->rreq, ctrl->statuses);
449 gkMPI_Waitall(npes, ctrl->sreq, ctrl->statuses);
450
451
452 /* Construct a partition for the graph */
453 graph->where = imalloc(graph->nvtxs+graph->nrecv, "PartSort: graph->where");
454 firstvtx = vtxdist[mype];
455 for (i=0; i<nvtxs; i++) {
456 PASSERTP(ctrl, elmnts[i].key>=0 && elmnts[i].key<npes,
457 (ctrl, "%"PRIDX" %"PRIDX"\n", i, elmnts[i].key));
458 PASSERTP(ctrl, elmnts[i].val>=vtxdist[mype] && elmnts[i].val<vtxdist[mype+1],
459 (ctrl, "%"PRIDX" %"PRIDX" %"PRIDX" %"PRIDX"\n", i, vtxdist[mype], vtxdist[mype+1], elmnts[i].val));
460 graph->where[elmnts[i].val-firstvtx] = elmnts[i].key;
461 }
462
463 WCOREPOP;
464 }
465
466
467 /**************************************************************************/
468 /*! This function sorts a distributed list of ikv_t in increasing
469 order, and uses it to compute a partition. It uses a
470 samplesort variant whose number of local samples can potentially
471 be smaller than npes.
472 */
473 /**************************************************************************/
PseudoSampleSort(ctrl_t * ctrl,graph_t * graph,ikv_t * elmnts)474 void PseudoSampleSort(ctrl_t *ctrl, graph_t *graph, ikv_t *elmnts)
475 {
476 idx_t npes=ctrl->npes, mype=ctrl->mype;
477 idx_t i, j, k, nlsamples, ntsamples, nvtxs, nrecv, firstvtx, lastvtx;
478 idx_t *scounts, *rcounts, *sdispls, *rdispls, *vtxdist, *perm;
479 ikv_t *relmnts, *mypicks, *allpicks;
480
481 STARTTIMER(ctrl, ctrl->AuxTmr1);
482
483 WCOREPUSH;
484
485 nvtxs = graph->nvtxs;
486 vtxdist = graph->vtxdist;
487
488 /* determine the number of local samples */
489 //nlsamples = (GlobalSESum(ctrl, graph->nedges) + graph->gnvtxs)/(npes*npes);
490 nlsamples = graph->gnvtxs/(npes*npes);
491 if (nlsamples > npes)
492 nlsamples = npes;
493 else if (nlsamples < 75)
494 nlsamples = gk_min(75, npes); /* the 'npes' in the min is to account for small graphs */
495
496
497 IFSET(ctrl->dbglvl, DBG_INFO,
498 rprintf(ctrl, "PseudoSampleSort: nlsamples=%"PRIDX" of %"PRIDX"\n", nlsamples, npes));
499
500 /* get memory for the counts and displacements */
501 scounts = iwspacemalloc(ctrl, npes+1);
502 rcounts = iwspacemalloc(ctrl, npes+1);
503 sdispls = iwspacemalloc(ctrl, npes+1);
504 rdispls = iwspacemalloc(ctrl, npes+1);
505
506 /* get memory for the splitters */
507 mypicks = ikvwspacemalloc(ctrl, npes+1);
508
509 WCOREPUSH; /* for freeing allpicks */
510 allpicks = ikvwspacemalloc(ctrl, npes*nlsamples);
511
512 /* Sort the local elements */
513 ikvsorti(nvtxs, elmnts);
514
515 /* Select the local nlsamples-1 equally spaced elements */
516 for (i=0; i<nlsamples-1; i++) {
517 if (nvtxs > 0) {
518 k = (nvtxs/(3*nlsamples) /* initial offset */
519 + i*nvtxs/nlsamples /* increament */
520 + mype*nvtxs/(npes*nlsamples) /* per-pe shift for nlsamples<npes */
521 )%nvtxs;
522 mypicks[i].key = elmnts[k].key;
523 mypicks[i].val = elmnts[k].val;
524 }
525 else {
526 /* Take care the case in which a processor has no elements, at which
527 point we still select nlsamples-1, but we set their .val to -1 to be
528 removed later prior to sorting */
529 mypicks[i].val = -1;
530 }
531 }
532
533 /* PrintPairs(ctrl, nlsamples-1, mypicks, "Mypicks"); */
534
535 STOPTIMER(ctrl, ctrl->AuxTmr1);
536 STARTTIMER(ctrl, ctrl->AuxTmr2);
537
538 /* Gather the picks to all the processors */
539 gkMPI_Allgather((void *)mypicks, 2*(nlsamples-1), IDX_T, (void *)allpicks,
540 2*(nlsamples-1), IDX_T, ctrl->comm);
541
542 /* PrintPairs(ctrl, npes*(nlsamples-1), allpicks, "Allpicks"); */
543
544 /* Remove any samples that have .val == -1 */
545 for (ntsamples=0, i=0; i<npes*(nlsamples-1); i++) {
546 if (allpicks[i].val != -1)
547 allpicks[ntsamples++] = allpicks[i];
548 }
549
550 /* Sort all the picks */
551 ikvsortii(ntsamples, allpicks);
552
553
554 /* Select the final splitters. Set the boundaries to simplify coding */
555 for (i=1; i<npes; i++)
556 mypicks[i] = allpicks[i*ntsamples/npes];
557 mypicks[0].key = IDX_MIN;
558 mypicks[npes].key = IDX_MAX;
559
560
561 WCOREPOP; /* free allpicks */
562
563 STOPTIMER(ctrl, ctrl->AuxTmr2);
564 STARTTIMER(ctrl, ctrl->AuxTmr3);
565
566 /* Compute the number of elements that belong to each bucket */
567 iset(npes, 0, scounts);
568 for (j=i=0; i<nvtxs; i++) {
569 if (elmnts[i].key < mypicks[j+1].key ||
570 (elmnts[i].key == mypicks[j+1].key && elmnts[i].val < mypicks[j+1].val))
571 scounts[j]++;
572 else
573 scounts[++j]++;
574 }
575 PASSERT(ctrl, j < npes);
576 gkMPI_Alltoall(scounts, 1, IDX_T, rcounts, 1, IDX_T, ctrl->comm);
577
578 /* multiply raw counts by 2 to account for the ikv_t type */
579 sdispls[0] = rdispls[0] = 0;
580 for (i=0; i<npes; i++) {
581 scounts[i] *= 2;
582 rcounts[i] *= 2;
583 sdispls[i+1] = sdispls[i] + scounts[i];
584 rdispls[i+1] = rdispls[i] + rcounts[i];
585 }
586
587 STOPTIMER(ctrl, ctrl->AuxTmr3);
588 STARTTIMER(ctrl, ctrl->AuxTmr4);
589
590 /*
591 PrintVector(ctrl, npes+1, 0, scounts, "Scounts");
592 PrintVector(ctrl, npes+1, 0, rcounts, "Rcounts");
593 */
594
595 /* Allocate memory for sorted elements and receive them */
596 nrecv = rdispls[npes]/2; /* The divide by 2 is to get the # of ikv_t elements */
597 relmnts = ikvwspacemalloc(ctrl, nrecv);
598
599 IFSET(ctrl->dbglvl, DBG_INFO,
600 rprintf(ctrl, "PseudoSampleSort: max_nrecv: %"PRIDX" of %"PRIDX"\n",
601 GlobalSEMax(ctrl, nrecv), graph->gnvtxs/npes));
602 if (mype == 0 || mype == npes-1)
603 IFSET(ctrl->dbglvl, DBG_INFO,
604 myprintf(ctrl, "PseudoSampleSort: nrecv: %"PRIDX" of %"PRIDX"\n",
605 nrecv, graph->gnvtxs/npes));
606
607
608 gkMPI_Alltoallv((void *)elmnts, scounts, sdispls, IDX_T,
609 (void *)relmnts, rcounts, rdispls, IDX_T,
610 ctrl->comm);
611
612 STOPTIMER(ctrl, ctrl->AuxTmr4);
613 STARTTIMER(ctrl, ctrl->AuxTmr5);
614
615 /* OK, now do the local sort of the relmnts. Use perm to keep track original order */
616 perm = iwspacemalloc(ctrl, nrecv);
617 for (i=0; i<nrecv; i++) {
618 perm[i] = relmnts[i].val;
619 relmnts[i].val = i;
620 }
621 ikvsorti(nrecv, relmnts);
622
623 /* Compute what needs to be shifted */
624 gkMPI_Scan((void *)(&nrecv), (void *)(&lastvtx), 1, IDX_T, MPI_SUM, ctrl->comm);
625 firstvtx = lastvtx-nrecv;
626
627 for (j=0, i=0; i<npes; i++) {
628 if (vtxdist[i+1] > firstvtx) { /* Found the first PE that is passed me */
629 if (vtxdist[i+1] >= lastvtx) {
630 /* myprintf(ctrl, "Shifting %"PRIDX" elements to processor %"PRIDX"\n", lastvtx-firstvtx, i); */
631 for (k=0; k<lastvtx-firstvtx; k++, j++)
632 relmnts[relmnts[j].val].key = i;
633 }
634 else {
635 /* myprintf(ctrl, "Shifting %"PRIDX" elements to processor %"PRIDX"\n", vtxdist[i+1]-firstvtx, i); */
636 for (k=0; k<vtxdist[i+1]-firstvtx; k++, j++)
637 relmnts[relmnts[j].val].key = i;
638
639 firstvtx = vtxdist[i+1];
640 }
641 }
642 if (vtxdist[i+1] >= lastvtx)
643 break;
644 }
645
646 /* Reverse the ordering on the relmnts[].val */
647 for (i=0; i<nrecv; i++) {
648 PASSERTP(ctrl, relmnts[i].key>=0 && relmnts[i].key<npes,
649 (ctrl, "%"PRIDX" %"PRIDX"\n", i, relmnts[i].key));
650 relmnts[i].val = perm[i];
651 }
652
653 STOPTIMER(ctrl, ctrl->AuxTmr5);
654 STARTTIMER(ctrl, ctrl->AuxTmr6);
655
656 /* OK, now sent it back. The role of send/recv arrays is now reversed. */
657 gkMPI_Alltoallv((void *)relmnts, rcounts, rdispls, IDX_T,
658 (void *)elmnts, scounts, sdispls, IDX_T,
659 ctrl->comm);
660
661
662 /* Construct a partition for the graph */
663 graph->where = imalloc(graph->nvtxs+graph->nrecv, "PartSort: graph->where");
664 firstvtx = vtxdist[mype];
665 for (i=0; i<nvtxs; i++) {
666 PASSERTP(ctrl, elmnts[i].key>=0 && elmnts[i].key<npes,
667 (ctrl, "%"PRIDX" %"PRIDX"\n", i, elmnts[i].key));
668 PASSERTP(ctrl, elmnts[i].val>=vtxdist[mype] && elmnts[i].val<vtxdist[mype+1],
669 (ctrl, "%"PRIDX" %"PRIDX" %"PRIDX" %"PRIDX"\n", i, vtxdist[mype], vtxdist[mype+1], elmnts[i].val));
670 graph->where[elmnts[i].val-firstvtx] = elmnts[i].key;
671 }
672
673 WCOREPOP;
674
675 STOPTIMER(ctrl, ctrl->AuxTmr6);
676 }
677
678
679