1 /* ---------------------------------------------------------------- */
2 /* (C)Copyright IBM Corp. 2007, 2008 */
3 /* ---------------------------------------------------------------- */
4 /**
5 * \file ad_bg_aggrs.c
6 * \brief The externally used function from this file is is declared in ad_bg_aggrs.h
7 */
8
9 /* -*- Mode: C; c-basic-offset:4 ; -*- */
10 /*
11 * Copyright (C) 1997-2001 University of Chicago.
12 * See COPYRIGHT notice in top-level directory.
13 */
14
15 /*#define TRACE_ON */
16
17 // Uncomment this line to turn tracing on for the gpfsmpio_balancecontig aggr selection optimization
18 // #define balancecontigtrace 1
19 // #define bridgeringaggtrace 1
20
21 #include "adio.h"
22 #include "adio_cb_config_list.h"
23 #include "../ad_gpfs.h"
24 #include "ad_bg_pset.h"
25 #include "ad_bg_aggrs.h"
26 #ifdef AGGREGATION_PROFILE
27 #include "mpe.h"
28 #endif
29
30
31 #ifdef USE_DBG_LOGGING
32 #define AGG_DEBUG 1
33 #endif
34
35 #ifndef TRACE_ERR
36 # define TRACE_ERR(format...)
37 #endif
38
39 /* Comments copied from common:
40 * This file contains four functions:
41 *
42 * ADIOI_Calc_aggregator()
43 * ADIOI_Calc_file_domains()
44 * ADIOI_Calc_my_req()
45 * ADIOI_Calc_others_req()
46 *
47 * The last three of these were originally in ad_read_coll.c, but they are
48 * also shared with ad_write_coll.c. I felt that they were better kept with
49 * the rest of the shared aggregation code.
50 */
51
52 /* Discussion of values available from above:
53 *
54 * ADIO_Offset st_offsets[0..nprocs-1]
55 * ADIO_Offset end_offsets[0..nprocs-1]
56 * These contain a list of start and end offsets for each process in
57 * the communicator. For example, an access at loc 10, size 10 would
58 * have a start offset of 10 and end offset of 19.
59 * int nprocs
60 * number of processors in the collective I/O communicator
61 * ADIO_Offset min_st_offset
62 * ADIO_Offset fd_start[0..nprocs_for_coll-1]
63 * starting location of "file domain"; region that a given process will
64 * perform aggregation for (i.e. actually do I/O)
65 * ADIO_Offset fd_end[0..nprocs_for_coll-1]
66 * start + size - 1 roughly, but it can be less, or 0, in the case of
67 * uneven distributions
68 */
69
70 /* forward declaration */
71 static void
72 ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
73 const ADIOI_BG_ConfInfo_t *confInfo,
74 ADIOI_BG_ProcInfo_t *all_procInfo);
75
76 /*
77 * Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO.
78 * The parameters are
79 * . the number of aggregators (proxies) : fd->hints->cb_nodes
80 * . the ranks of the aggregators : fd->hints->ranklist
81 * By compute these two parameters in a BG-PSET-aware way, the default 2-phase collective IO of
82 * ADIO can work more efficiently.
83 */
84 int
ADIOI_BG_gen_agg_ranklist(ADIO_File fd,int n_aggrs_per_pset)85 ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
86 {
87 int r, s;
88 ADIOI_BG_ProcInfo_t *procInfo, *all_procInfo;
89 ADIOI_BG_ConfInfo_t *confInfo;
90 TRACE_ERR("Entering ADIOI_BG_gen_agg_ranklist\n");
91
92 MPI_Comm_size( fd->comm, &s );
93 MPI_Comm_rank( fd->comm, &r );
94
95 /* Collect individual BG personality information */
96 confInfo = ADIOI_BG_ConfInfo_new ();
97 procInfo = ADIOI_BG_ProcInfo_new ();
98 ADIOI_BG_persInfo_init( confInfo, procInfo, s, r, n_aggrs_per_pset, fd->comm);
99
100 /* Gather BG personality infomation onto process 0 */
101 /* if (r == 0) */
102 all_procInfo = ADIOI_BG_ProcInfo_new_n (s);
103
104 MPI_Gather( (void *)procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
105 (void *)all_procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
106 0,
107 fd->comm );
108
109 /* Compute a list of the ranks of chosen IO proxy CN on process 0 */
110 if (r == 0) {
111 ADIOI_BG_compute_agg_ranklist_serial (fd, confInfo, all_procInfo);
112 /* ADIOI_BG_ProcInfo_free (all_procInfo);*/
113 }
114 ADIOI_BG_ProcInfo_free (all_procInfo);
115
116 /* Send the info of IO proxy CN to all processes and keep the info in fd->hints struct.
117 Declared in adio_cb_config_list.h */
118 ADIOI_cb_bcast_rank_map(fd);
119 if (gpfsmpio_balancecontig == 1) { /* additionally need to send bridgelist,
120 bridgelistnum and numbridges to all
121 ranks */
122 if (r != 0) {
123 fd->hints->fs_hints.bg.bridgelist =
124 ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
125 if (fd->hints->fs_hints.bg.bridgelist == NULL) {
126 /* NEED TO HANDLE ENOMEM */
127 }
128 }
129 MPI_Bcast(fd->hints->fs_hints.bg.bridgelist, fd->hints->cb_nodes, MPI_INT, 0,
130 fd->comm);
131
132 if (r != 0) {
133 fd->hints->fs_hints.bg.bridgelistnum =
134 ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
135 if (fd->hints->fs_hints.bg.bridgelistnum == NULL) {
136 /* NEED TO HANDLE ENOMEM */
137 }
138 }
139 MPI_Bcast(fd->hints->fs_hints.bg.bridgelistnum, fd->hints->cb_nodes,
140 MPI_INT, 0, fd->comm);
141
142 MPI_Bcast(&fd->hints->fs_hints.bg.numbridges, 1, MPI_INT, 0,
143 fd->comm);
144
145 }
146
147
148 ADIOI_BG_persInfo_free( confInfo, procInfo );
149 TRACE_ERR("Leaving ADIOI_BG_gen_agg_ranklist\n");
150 return 0;
151 }
152
153
154 /* There are some number of bridge nodes (randomly) distributed through the job
155 * We need to split the nodes among the bridge nodes */
156 /* Maybe find which bridge node is closer (manhattan distance) and try to
157 * distribute evenly.
158 */
159 /*
160 * Pick IO aggregators based on the under PSET organization and stores the ranks of the proxy CNs in tmp_ranklist.
161 * The first order of tmp_ranklist is : PSET number
162 * The secondary order of the list is determined in ADIOI_BG_select_agg_in_pset() and thus adjustable.
163 */
164 typedef struct
165 {
166 int rank;
167 int bridge;
168 } sortstruct;
169
170 typedef struct
171 {
172 int bridgeRank;
173 int numAggsAssigned;
174 } bridgeAggAssignment;
175
intsort(const void * p1,const void * p2)176 static int intsort(const void *p1, const void *p2)
177 {
178 sortstruct *i1, *i2;
179 i1 = (sortstruct *)p1;
180 i2 = (sortstruct *)p2;
181 return(i1->bridge - i2->bridge);
182 }
183
184 static int
ADIOI_BG_compute_agg_ranklist_serial_do(const ADIOI_BG_ConfInfo_t * confInfo,ADIOI_BG_ProcInfo_t * all_procInfo,int * tmp_ranklist)185 ADIOI_BG_compute_agg_ranklist_serial_do (const ADIOI_BG_ConfInfo_t *confInfo,
186 ADIOI_BG_ProcInfo_t *all_procInfo,
187 int *tmp_ranklist)
188 {
189 TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial_do\n");
190 /* BES: This should be done in the init routines probably. */
191 int i, j;
192 int aggTotal;
193 int *aggList;
194
195 if (gpfsmpio_bridgeringagg > 0) {
196
197 int numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize*/;
198 /* the number of aggregators is (numAggs per bridgenode) */
199 if(numAggs == 1)
200 aggTotal = 1;
201 else
202 aggTotal = confInfo->numBridgeRanks * numAggs;
203
204 aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
205 if(aggTotal == 1) { /* special case when we only have one bridge node */
206
207 sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
208 for(i=0; i < confInfo->nProcs; i++)
209 {
210 bridgelist[i].bridge = all_procInfo[i].bridgeRank;
211 bridgelist[i].rank = i;
212 TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
213 }
214
215 /* This list contains rank->bridge info. Now, we need to sort this list. */
216 qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
217
218 aggList[0] = bridgelist[0].bridge;
219 ADIOI_Free(bridgelist);
220
221 }
222 else { // aggTotal > 1
223
224 int currentAggListSize = 0;
225 int numBridgesWithAggAssignments = 0;
226 bridgeAggAssignment *aggAssignments = (bridgeAggAssignment *)ADIOI_Malloc(confInfo->numBridgeRanks * sizeof(bridgeAggAssignment));
227
228 int partitionSize = all_procInfo[0].numNodesInPartition;
229 int *nodesAssigned = (int *)ADIOI_Malloc(partitionSize * sizeof(int));
230 for (i=0;i<partitionSize;i++)
231 nodesAssigned[i] = 0;
232
233 int currentNumHops = gpfsmpio_bridgeringagg;
234 int allAggsAssigned = 0;
235
236 /* Iterate thru the process infos and select aggregators starting at currentNumHops
237 away. Increase the currentNumHops until all bridges have numAggs assigned to them.
238 */
239 while (!allAggsAssigned) {
240 /* track whether any aggs are selected durng this round */
241 int startingCurrentAggListSize = currentAggListSize;
242 int numIterForHopsWithNoAggs = 0;
243 for (i=0;i<confInfo->nProcs;i++) {
244 if (all_procInfo[i].manhattanDistanceToBridge == currentNumHops) {
245 if (nodesAssigned[all_procInfo[i].nodeRank] == 0) { // node is not assigned as an agg yet
246 int foundBridge = 0;
247 for (j=0;(j<numBridgesWithAggAssignments && !foundBridge);j++) {
248 if (aggAssignments[j].bridgeRank == all_procInfo[i].bridgeRank) {
249 foundBridge = 1;
250 if (aggAssignments[j].numAggsAssigned < numAggs) {
251 aggAssignments[j].numAggsAssigned++;
252 nodesAssigned[all_procInfo[i].nodeRank] = 1;
253 aggList[currentAggListSize] = all_procInfo[i].rank;
254 currentAggListSize++;
255 #ifdef bridgeringaggtrace
256 printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
257 #endif
258 }
259 }
260 }
261 if (!foundBridge) {
262 aggAssignments[numBridgesWithAggAssignments].bridgeRank = all_procInfo[i].bridgeRank;
263 aggAssignments[numBridgesWithAggAssignments].numAggsAssigned = 1;
264 numBridgesWithAggAssignments++;
265 nodesAssigned[all_procInfo[i].nodeRank] = 1;
266 aggList[currentAggListSize] = all_procInfo[i].rank;
267 currentAggListSize++;
268 #ifdef bridgeringaggtrace
269 printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
270 #endif
271 }
272 }
273 }
274 }
275
276 if (numBridgesWithAggAssignments == confInfo->numBridgeRanks) {
277 allAggsAssigned = 1;
278 for (i=0;(i<numBridgesWithAggAssignments && allAggsAssigned);i++) {
279 if (aggAssignments[i].numAggsAssigned < numAggs)
280 allAggsAssigned = 0;
281 }
282 }
283 currentNumHops++;
284 /* If 3 rounds go by without selecting an agg abort to avoid
285 infinite loop.
286 */
287 if (startingCurrentAggListSize == currentAggListSize)
288 numIterForHopsWithNoAggs++;
289 else
290 numIterForHopsWithNoAggs = 0;
291 ADIOI_Assert(numIterForHopsWithNoAggs <= 3);
292 }
293
294 ADIOI_Free(aggAssignments);
295 ADIOI_Free(nodesAssigned);
296
297 } // else aggTotal > 1
298
299 memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
300 } // gpfsmpio_bridgeringagg > 0
301
302 else { // gpfsmpio_bridgeringagg unset - default code
303
304 int distance, numAggs;
305
306 /* Aggregators will be midpoints between sorted MPI rank lists of who shares a given
307 * bridge node */
308
309 sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
310 for(i=0; i < confInfo->nProcs; i++)
311 {
312 bridgelist[i].bridge = all_procInfo[i].bridgeRank;
313 bridgelist[i].rank = i;
314 TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
315 }
316
317 /* This list contains rank->bridge info. Now, we need to sort this list. */
318 qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
319
320 /* In this array, we can pick an appropriate number of midpoints based on
321 * our bridgenode index and the number of aggregators */
322
323 numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize*/;
324 if(numAggs == 1)
325 aggTotal = 1;
326 else
327 /* the number of aggregators is (numAggs per bridgenode) plus each
328 * bridge node is an aggregator */
329 aggTotal = confInfo->numBridgeRanks * (numAggs+1);
330
331 if(aggTotal>confInfo->nProcs) aggTotal=confInfo->nProcs;
332
333 TRACE_ERR("numBridgeRanks: %d, aggRatio: %f numBridge: %d pset size: %d/%d numAggs: %d, aggTotal: %d\n", confInfo->numBridgeRanks, confInfo->aggRatio, confInfo->numBridgeRanks, confInfo->ioMinSize, confInfo->ioMaxSize /*virtualPsetSize*/, numAggs, aggTotal);
334 aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
335
336
337 /* For each bridge node, determine who the aggregators will be */
338 /* basically, the n*distance and bridge node */
339 if(aggTotal == 1) /* special case when we only have one bridge node */
340 aggList[0] = bridgelist[0].bridge;
341 else
342 {
343 int lastBridge = bridgelist[confInfo->nProcs-1].bridge;
344 int nextBridge = 0, nextAggr = confInfo->numBridgeRanks;
345 int psetSize = 0;
346 int procIndex;
347 for(procIndex=confInfo->nProcs-1; procIndex>=0; procIndex--)
348 {
349 TRACE_ERR("bridgelist[%d].bridge %u/rank %u\n",procIndex, bridgelist[procIndex].bridge, bridgelist[procIndex].rank);
350 if(lastBridge == bridgelist[procIndex].bridge)
351 {
352 psetSize++;
353 if(procIndex) continue;
354 else procIndex--;/* procIndex == 0 */
355 }
356 /* Sets up a list of nodes which will act as aggregators. numAggs
357 * per bridge node total. The list of aggregators is
358 * bridgeNode 0
359 * bridgeNode 1
360 * bridgeNode ...
361 * bridgeNode N
362 * bridgeNode[0]aggr[0]
363 * bridgeNode[0]aggr[1]...
364 * bridgeNode[0]aggr[N]...
365 * ...
366 * bridgeNode[N]aggr[0]..
367 * bridgeNode[N]aggr[N]
368 */
369 aggList[nextBridge]=lastBridge;
370 distance = psetSize/numAggs;
371 TRACE_ERR("nextBridge %u is bridge %u, distance %u, size %u\n",nextBridge, aggList[nextBridge],distance,psetSize);
372 if(numAggs>1)
373 {
374 for(j = 0; j < numAggs; j++)
375 {
376 ADIOI_Assert(nextAggr<aggTotal);
377 aggList[nextAggr] = bridgelist[procIndex+j*distance+1].rank;
378 TRACE_ERR("agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+j*distance+1,aggList[nextAggr]);
379 if(aggList[nextAggr]==lastBridge) /* can't have bridge in the list twice */
380 {
381 aggList[nextAggr] = bridgelist[procIndex+psetSize].rank; /* take the last one in the pset */
382 TRACE_ERR("replacement agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+psetSize,aggList[nextAggr]);
383 }
384 nextAggr++;
385 }
386 }
387 if(procIndex<0) break;
388 lastBridge = bridgelist[procIndex].bridge;
389 psetSize = 1;
390 nextBridge++;
391 }
392 }
393
394 TRACE_ERR("memcpy(tmp_ranklist, aggList, (numAggs(%u)*confInfo->numBridgeRanks(%u)+numAggs(%u)) (%u) %u*sizeof(int))\n",numAggs,confInfo->numBridgeRanks,numAggs,(numAggs*confInfo->numBridgeRanks+numAggs),aggTotal);
395 memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
396 for(i=0;i<aggTotal;i++)
397 {
398 TRACE_ERR("tmp_ranklist[%d]: %d\n", i, tmp_ranklist[i]);
399 }
400
401
402 ADIOI_Free (bridgelist);
403
404 TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial_do\n");
405 }
406
407 ADIOI_Free (aggList);
408 return aggTotal;
409
410 }
411
412 /*
413 * compute aggregators ranklist and put it into fd->hints struct
414 */
415 static void
ADIOI_BG_compute_agg_ranklist_serial(ADIO_File fd,const ADIOI_BG_ConfInfo_t * confInfo,ADIOI_BG_ProcInfo_t * all_procInfo)416 ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
417 const ADIOI_BG_ConfInfo_t *confInfo,
418 ADIOI_BG_ProcInfo_t *all_procInfo)
419 {
420 TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial\n");
421 int i;
422 int naggs;
423 int size;
424 int *tmp_ranklist;
425
426 /* compute the ranklist of IO aggregators and put into tmp_ranklist */
427 tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));
428
429 # if AGG_DEBUG
430 for (i=0; i<confInfo->nProcs; i++) {
431 DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].coreID, all_procInfo[i].rank );
432 }
433 # endif
434
435 naggs=
436 ADIOI_BG_compute_agg_ranklist_serial_do (confInfo, all_procInfo, tmp_ranklist);
437
438 # define VERIFY 1
439 # if VERIFY
440 DBG_FPRINTF(stderr, "\tconfInfo = min: %3d, max: %3d, naggrs: %3d, bridge: %3d, nprocs: %3d, vpset: %3d, tsize: %3d, ratio: %.4f; naggs = %d\n",
441 confInfo->ioMinSize ,
442 confInfo->ioMaxSize ,
443 confInfo->nAggrs ,
444 confInfo->numBridgeRanks ,
445 confInfo->nProcs ,
446 confInfo->ioMaxSize /*virtualPsetSize*/ ,
447 confInfo->cpuIDsize,
448 confInfo->aggRatio ,
449 naggs );
450 # endif
451 MPI_Comm_size( fd->comm, &size );
452 /* This fix is for when the bridgenode rnk is not part of the particular
453 * subcomm associated with this MPI File operation. I don't know if
454 * this is the best/right answer but it passes the test cases at least.
455 * I don't know how common file IO in subcomms is anyway... */
456 for(i=0;i<naggs;i++)
457 {
458 if(tmp_ranklist[i] > size)
459 {
460 TRACE_ERR("Using 0 as tmp_ranklist[%d] instead of %d for comm %x\n",
461 i, tmp_ranklist[i], fd->comm);
462 tmp_ranklist[i] = 0;
463 }
464 }
465
466 # if AGG_DEBUG
467 for (i=0; i<naggs; i++) {
468 DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
469 }
470 # endif
471 if (gpfsmpio_balancecontig == 1) {
472 /* what comes out of this code block is the agg ranklist sorted by
473 * bridge set and ion id with associated bridge info stored in the
474 * hints structure for later access during file domain assignment */
475
476 // sort the agg ranklist by ions and bridges
477
478 int *interleavedbridgeranklist = (int *) ADIOI_Malloc (naggs * sizeof(int)); // resorted agg rank list
479 /* list of all bridge ranks */
480 int *bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
481
482 /* each entry here is the number of aggregators associated with the
483 * bridge rank of the same index in bridgelist */
484 int *bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
485 /* list of all ion IDs corresponding with bridgelist entries of same index */
486 int *ionlist = (int *) ADIOI_Malloc (naggs * sizeof(int));
487
488 int numbridges = 0;
489
490 for (i=0;i<naggs;i++)
491 bridgelistnum[i] = 0;
492
493 /* Each entry in this list corresponds with the bridgelist and will contain the lowest bridge
494 * agg rank on that ion. */
495 int *summarybridgeminionaggrank = (int *) ADIOI_Malloc (naggs * sizeof(int));
496 for (i=0;i<naggs;i++)
497 summarybridgeminionaggrank[i] = -1;
498
499 /* build the bridgelist, ionlist and bridgelistnum data by going thru each agg
500 * entry and find the associated bridge list index - at the end we will
501 * know how many aggs belong to each bridge in each ion */
502 for (i=0;i<naggs;i++) {
503 int aggbridgerank = all_procInfo[tmp_ranklist[i]].bridgeRank;
504 int aggionid = all_procInfo[tmp_ranklist[i]].ionID;
505 int foundrank = 0;
506 int summaryranklistbridgeindex = 0;
507 int j;
508 for (j=0;(j<numbridges && !foundrank);j++) {
509 if (bridgelist[j] == aggbridgerank) {
510 foundrank = 1;
511 summaryranklistbridgeindex = j;
512 }
513 else
514 summaryranklistbridgeindex++;
515 }
516 if (!foundrank) {
517 bridgelist[summaryranklistbridgeindex] = aggbridgerank;
518 ionlist[summaryranklistbridgeindex] = aggionid;
519
520 if (summarybridgeminionaggrank[summaryranklistbridgeindex] == -1)
521 summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
522 else if (summarybridgeminionaggrank[summaryranklistbridgeindex] > aggbridgerank)
523 summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
524 numbridges++;
525 }
526
527 bridgelistnum[summaryranklistbridgeindex]++;
528 }
529
530 /* at this point summarybridgeminionaggrank has the agg rank of the bridge for entries,
531 * need to make each entry the minimum bridge rank for the entire ion. */
532 for (i=0;i<numbridges;i++) {
533 int aggIonId = ionlist[i];
534 int j;
535 for (j=0;j<numbridges;j++) {
536 if (ionlist[j] == aggIonId) {
537 if (summarybridgeminionaggrank[j] < summarybridgeminionaggrank[i])
538 summarybridgeminionaggrank[i] = summarybridgeminionaggrank[j];
539 }
540 }
541 }
542
543 // resort by io node minimum bridge rank
544 int x;
545 for (x=0;x<numbridges;x++) {
546 for (i=0;i<(numbridges-1);i++) {
547 if (summarybridgeminionaggrank[i] > summarybridgeminionaggrank[i+1]) {
548 int tmpminionaggrank = summarybridgeminionaggrank[i];
549 summarybridgeminionaggrank[i] = summarybridgeminionaggrank[i+1];
550 summarybridgeminionaggrank[i+1] = tmpminionaggrank;
551 int tmpionid = ionlist[i];
552 ionlist[i] = ionlist[i+1];
553 ionlist[i+1] = tmpionid;
554 int tmpbridgerank = bridgelist[i];
555 bridgelist[i] = bridgelist[i+1];
556 bridgelist[i+1] = tmpbridgerank;
557 int tmpbridgeranknum = bridgelistnum[i];
558 bridgelistnum[i] = bridgelistnum[i+1];
559 bridgelistnum[i+1] = tmpbridgeranknum;
560 }
561 }
562 }
563
564 // for each io node make sure bridgelist is in rank order
565 int startSortIndex = -1;
566 int endSortIndex = -1;
567 int currentBridgeIndex = 0;
568
569 while (currentBridgeIndex < numbridges) {
570 int currentIonId = ionlist[currentBridgeIndex];
571 startSortIndex = currentBridgeIndex;
572 while (ionlist[currentBridgeIndex] == currentIonId)
573 currentBridgeIndex++;
574 endSortIndex = currentBridgeIndex-1;
575 for (x=startSortIndex;x<=endSortIndex;x++) {
576 for (i=startSortIndex;i<endSortIndex;i++) {
577 if (bridgelist[i] > bridgelist[i+1]) {
578 int tmpbridgerank = bridgelist[i];
579 bridgelist[i] = bridgelist[i+1];
580 bridgelist[i+1] = tmpbridgerank;
581 int tmpbridgeranknum = bridgelistnum[i];
582 bridgelistnum[i] = bridgelistnum[i+1];
583 bridgelistnum[i+1] = tmpbridgeranknum;
584 }
585 }
586 }
587 }
588
589
590 /* populate interleavedbridgeranklist - essentially the agg rank list
591 * is now sorted by the ion minimum bridge rank and bridge node */
592 int currentrankoffset = 0;
593 for (i=0;i<numbridges;i++) {
594 int *thisBridgeAggList = (int *) ADIOI_Malloc (naggs * sizeof(int));
595 int numAggsForThisBridge = 0;
596
597 int k;
598 for (k=0;k<naggs;k++) {
599 int aggbridgerank = all_procInfo[tmp_ranklist[k]].bridgeRank;
600 if (aggbridgerank == bridgelist[i]) {
601 thisBridgeAggList[numAggsForThisBridge] = tmp_ranklist[k];
602 numAggsForThisBridge++;
603 }
604 }
605
606 // sort thisBridgeAggList
607 for (x=0;x<numAggsForThisBridge;x++) {
608 int n;
609 for (n=0;n<(numAggsForThisBridge-1);n++) {
610 if (thisBridgeAggList[n] > thisBridgeAggList[n+1]) {
611 int tmpthisBridgeAggList = thisBridgeAggList[n];
612 thisBridgeAggList[n] = thisBridgeAggList[n+1];
613 thisBridgeAggList[n+1] = tmpthisBridgeAggList;
614 }
615 }
616 }
617 int n;
618 for (n=0;n<numAggsForThisBridge;n++) {
619 interleavedbridgeranklist[currentrankoffset] = thisBridgeAggList[n];
620 currentrankoffset++;
621 }
622 ADIOI_Free(thisBridgeAggList);
623 }
624
625 #ifdef balancecontigtrace
626 fprintf(stderr,"Interleaved aggregator list:\n");
627 for (i=0;i<naggs;i++) {
628 fprintf(stderr,"Agg: %d Agg rank: %d with bridge rank %d and ion ID %d\n",i,interleavedbridgeranklist[i],all_procInfo[interleavedbridgeranklist[i]].bridgeRank,all_procInfo[interleavedbridgeranklist[i]].ionID);
629 }
630 fprintf(stderr,"Bridges list:\n");
631 for (i=0;i<numbridges;i++) {
632 fprintf(stderr,"bridge %d ion min rank %d rank %d number of aggs %d ion id %d\n",i,summarybridgeminionaggrank[i],bridgelist[i],bridgelistnum[i],ionlist[i]);
633 }
634
635 #endif
636 /* copy the ranklist of IO aggregators to fd->hints */
637 if(fd->hints->ranklist != NULL)
638 ADIOI_Free (fd->hints->ranklist);
639 if(fd->hints->fs_hints.bg.bridgelist != NULL)
640 ADIOI_Free (fd->hints->fs_hints.bg.bridgelist);
641 if(fd->hints->fs_hints.bg.bridgelistnum != NULL)
642 ADIOI_Free (fd->hints->fs_hints.bg.bridgelistnum);
643
644 fd->hints->cb_nodes = naggs;
645 fd->hints->fs_hints.bg.numbridges = numbridges;
646 fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
647 memcpy( fd->hints->ranklist, interleavedbridgeranklist, naggs*sizeof(int) );
648
649 fd->hints->fs_hints.bg.bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
650 memcpy( fd->hints->fs_hints.bg.bridgelist, bridgelist, naggs*sizeof(int) );
651
652 fd->hints->fs_hints.bg.bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
653 memcpy( fd->hints->fs_hints.bg.bridgelistnum, bridgelistnum, naggs*sizeof(int) );
654
655 ADIOI_Free(summarybridgeminionaggrank);
656 ADIOI_Free( tmp_ranklist );
657 ADIOI_Free( bridgelistnum );
658 ADIOI_Free( bridgelist );
659 ADIOI_Free( interleavedbridgeranklist );
660 ADIOI_Free(ionlist);
661
662 } else {
663 /* classic topology-agnostic copy of the ranklist of IO aggregators to
664 * fd->hints */
665 if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);
666
667 fd->hints->cb_nodes = naggs;
668 fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
669 memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );
670
671 ADIOI_Free( tmp_ranklist );
672 }
673 TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial\n");
674 return;
675 }
676