1 /* ---------------------------------------------------------------- */
2 /* (C)Copyright IBM Corp.  2007, 2008                               */
3 /* ---------------------------------------------------------------- */
4 /**
5  * \file ad_bg_aggrs.c
6  * \brief The externally used function from this file is is declared in ad_bg_aggrs.h
7  */
8 
9 /* -*- Mode: C; c-basic-offset:4 ; -*- */
10 /*
11  *   Copyright (C) 1997-2001 University of Chicago.
12  *   See COPYRIGHT notice in top-level directory.
13  */
14 
15 /*#define TRACE_ON */
16 
17 // Uncomment this line to turn tracing on for the gpfsmpio_balancecontig aggr selection optimization
18 // #define balancecontigtrace 1
19 // #define bridgeringaggtrace 1
20 
21 #include "adio.h"
22 #include "adio_cb_config_list.h"
23 #include "../ad_gpfs.h"
24 #include "ad_bg_pset.h"
25 #include "ad_bg_aggrs.h"
26 #ifdef AGGREGATION_PROFILE
27 #include "mpe.h"
28 #endif
29 
30 
31 #ifdef USE_DBG_LOGGING
32   #define AGG_DEBUG 1
33 #endif
34 
35 #ifndef TRACE_ERR
36 #  define TRACE_ERR(format...)
37 #endif
38 
39 /* Comments copied from common:
40  * This file contains four functions:
41  *
42  * ADIOI_Calc_aggregator()
43  * ADIOI_Calc_file_domains()
44  * ADIOI_Calc_my_req()
45  * ADIOI_Calc_others_req()
46  *
47  * The last three of these were originally in ad_read_coll.c, but they are
48  * also shared with ad_write_coll.c.  I felt that they were better kept with
49  * the rest of the shared aggregation code.
50  */
51 
52 /* Discussion of values available from above:
53  *
54  * ADIO_Offset st_offsets[0..nprocs-1]
55  * ADIO_Offset end_offsets[0..nprocs-1]
56  *    These contain a list of start and end offsets for each process in
57  *    the communicator.  For example, an access at loc 10, size 10 would
58  *    have a start offset of 10 and end offset of 19.
59  * int nprocs
60  *    number of processors in the collective I/O communicator
61  * ADIO_Offset min_st_offset
62  * ADIO_Offset fd_start[0..nprocs_for_coll-1]
63  *    starting location of "file domain"; region that a given process will
64  *    perform aggregation for (i.e. actually do I/O)
65  * ADIO_Offset fd_end[0..nprocs_for_coll-1]
66  *    start + size - 1 roughly, but it can be less, or 0, in the case of
67  *    uneven distributions
68  */
69 
70 /* forward declaration */
71 static void
72 ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
73 					const ADIOI_BG_ConfInfo_t *confInfo,
74 					ADIOI_BG_ProcInfo_t *all_procInfo);
75 
76 /*
77  * Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO.
78  * The parameters are
79  * 	. the number of aggregators (proxies) : fd->hints->cb_nodes
80  *	. the ranks of the aggregators :        fd->hints->ranklist
81  * By compute these two parameters in a BG-PSET-aware way, the default 2-phase collective IO of
82  *	ADIO can work more efficiently.
83  */
84 int
ADIOI_BG_gen_agg_ranklist(ADIO_File fd,int n_aggrs_per_pset)85 ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_aggrs_per_pset)
86 {
87     int r, s;
88     ADIOI_BG_ProcInfo_t  *procInfo, *all_procInfo;
89     ADIOI_BG_ConfInfo_t  *confInfo;
90     TRACE_ERR("Entering ADIOI_BG_gen_agg_ranklist\n");
91 
92     MPI_Comm_size( fd->comm, &s );
93     MPI_Comm_rank( fd->comm, &r );
94 
95   /* Collect individual BG personality information */
96     confInfo = ADIOI_BG_ConfInfo_new ();
97     procInfo = ADIOI_BG_ProcInfo_new ();
98     ADIOI_BG_persInfo_init( confInfo, procInfo, s, r, n_aggrs_per_pset, fd->comm);
99 
100   /* Gather BG personality infomation onto process 0 */
101     /* if (r == 0) */
102     all_procInfo  = ADIOI_BG_ProcInfo_new_n  (s);
103 
104     MPI_Gather( (void *)procInfo,     sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
105 		(void *)all_procInfo, sizeof(ADIOI_BG_ProcInfo_t), MPI_BYTE,
106 		0,
107 		fd->comm );
108 
109   /* Compute a list of the ranks of chosen IO proxy CN on process 0 */
110     if (r == 0) {
111 	ADIOI_BG_compute_agg_ranklist_serial (fd, confInfo, all_procInfo);
112 	/* ADIOI_BG_ProcInfo_free (all_procInfo);*/
113     }
114     ADIOI_BG_ProcInfo_free (all_procInfo);
115 
116   /* Send the info of IO proxy CN to all processes and keep the info in fd->hints struct.
117      Declared in adio_cb_config_list.h */
118     ADIOI_cb_bcast_rank_map(fd);
119     if (gpfsmpio_balancecontig == 1) { /* additionally need to send bridgelist,
120 					bridgelistnum and numbridges to all
121 					ranks */
122 	if (r != 0) {
123 	    fd->hints->fs_hints.bg.bridgelist =
124 		ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
125 	    if (fd->hints->fs_hints.bg.bridgelist == NULL) {
126 		/* NEED TO HANDLE ENOMEM */
127 	    }
128 	}
129 	MPI_Bcast(fd->hints->fs_hints.bg.bridgelist, fd->hints->cb_nodes, MPI_INT, 0,
130 		fd->comm);
131 
132 	if (r != 0) {
133 	    fd->hints->fs_hints.bg.bridgelistnum =
134 		ADIOI_Malloc(fd->hints->cb_nodes*sizeof(int));
135 	    if (fd->hints->fs_hints.bg.bridgelistnum == NULL) {
136 		/* NEED TO HANDLE ENOMEM */
137 	    }
138 	}
139 	MPI_Bcast(fd->hints->fs_hints.bg.bridgelistnum, fd->hints->cb_nodes,
140 		MPI_INT, 0, fd->comm);
141 
142 	MPI_Bcast(&fd->hints->fs_hints.bg.numbridges, 1, MPI_INT, 0,
143 		fd->comm);
144 
145     }
146 
147 
148     ADIOI_BG_persInfo_free( confInfo, procInfo );
149     TRACE_ERR("Leaving ADIOI_BG_gen_agg_ranklist\n");
150     return 0;
151 }
152 
153 
154 /* There are some number of bridge nodes (randomly) distributed through the job
155  * We need to split the nodes among the bridge nodes */
156 /* Maybe find which bridge node is closer (manhattan distance) and try to
157  * distribute evenly.
158  */
159 /*
160  * Pick IO aggregators based on the under PSET organization and stores the ranks of the proxy CNs in tmp_ranklist.
161  * The first order of tmp_ranklist is : PSET number
162  * The secondary order of the list is determined in ADIOI_BG_select_agg_in_pset() and thus adjustable.
163  */
164 typedef struct
165 {
166    int rank;
167    int bridge;
168 } sortstruct;
169 
170 typedef struct
171 {
172    int bridgeRank;
173    int numAggsAssigned;
174 } bridgeAggAssignment;
175 
intsort(const void * p1,const void * p2)176 static int intsort(const void *p1, const void *p2)
177 {
178    sortstruct *i1, *i2;
179    i1 = (sortstruct *)p1;
180    i2 = (sortstruct *)p2;
181    return(i1->bridge - i2->bridge);
182 }
183 
184 static int
ADIOI_BG_compute_agg_ranklist_serial_do(const ADIOI_BG_ConfInfo_t * confInfo,ADIOI_BG_ProcInfo_t * all_procInfo,int * tmp_ranklist)185 ADIOI_BG_compute_agg_ranklist_serial_do (const ADIOI_BG_ConfInfo_t *confInfo,
186 					  ADIOI_BG_ProcInfo_t       *all_procInfo,
187 					  int *tmp_ranklist)
188 {
189     TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial_do\n");
190    /* BES: This should be done in the init routines probably. */
191     int i, j;
192     int aggTotal;
193     int *aggList;
194 
195     if (gpfsmpio_bridgeringagg > 0) {
196 
197       int numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize*/;
198         /* the number of aggregators is (numAggs per bridgenode) */
199       if(numAggs == 1)
200         aggTotal = 1;
201       else
202         aggTotal = confInfo->numBridgeRanks * numAggs;
203 
204       aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
205       if(aggTotal == 1) { /* special case when we only have one bridge node */
206 
207         sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
208         for(i=0; i < confInfo->nProcs; i++)
209         {
210           bridgelist[i].bridge = all_procInfo[i].bridgeRank;
211           bridgelist[i].rank = i;
212           TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
213         }
214 
215         /* This list contains rank->bridge info. Now, we need to sort this list. */
216         qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
217 
218         aggList[0] = bridgelist[0].bridge;
219         ADIOI_Free(bridgelist);
220 
221       }
222       else { // aggTotal > 1
223 
224         int currentAggListSize = 0;
225         int numBridgesWithAggAssignments = 0;
226         bridgeAggAssignment *aggAssignments = (bridgeAggAssignment *)ADIOI_Malloc(confInfo->numBridgeRanks * sizeof(bridgeAggAssignment));
227 
228         int partitionSize = all_procInfo[0].numNodesInPartition;
229         int *nodesAssigned = (int *)ADIOI_Malloc(partitionSize * sizeof(int));
230         for (i=0;i<partitionSize;i++)
231           nodesAssigned[i] = 0;
232 
233         int currentNumHops = gpfsmpio_bridgeringagg;
234         int allAggsAssigned = 0;
235 
236         /* Iterate thru the process infos and select aggregators starting at currentNumHops
237            away.  Increase the currentNumHops until all bridges have numAggs assigned to them.
238         */
239         while (!allAggsAssigned) {
240           /* track whether any aggs are selected durng this round */
241           int startingCurrentAggListSize = currentAggListSize;
242           int numIterForHopsWithNoAggs = 0;
243           for (i=0;i<confInfo->nProcs;i++) {
244           if (all_procInfo[i].manhattanDistanceToBridge == currentNumHops) {
245             if (nodesAssigned[all_procInfo[i].nodeRank] == 0) { // node is not assigned as an agg yet
246               int foundBridge = 0;
247               for (j=0;(j<numBridgesWithAggAssignments && !foundBridge);j++) {
248                 if (aggAssignments[j].bridgeRank == all_procInfo[i].bridgeRank) {
249                   foundBridge = 1;
250                   if (aggAssignments[j].numAggsAssigned < numAggs) {
251                     aggAssignments[j].numAggsAssigned++;
252                     nodesAssigned[all_procInfo[i].nodeRank] = 1;
253                     aggList[currentAggListSize] = all_procInfo[i].rank;
254                     currentAggListSize++;
255 #ifdef bridgeringaggtrace
256                 printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
257 #endif
258                   }
259                 }
260               }
261               if (!foundBridge) {
262                 aggAssignments[numBridgesWithAggAssignments].bridgeRank = all_procInfo[i].bridgeRank;
263                 aggAssignments[numBridgesWithAggAssignments].numAggsAssigned = 1;
264                 numBridgesWithAggAssignments++;
265                 nodesAssigned[all_procInfo[i].nodeRank] = 1;
266                 aggList[currentAggListSize] = all_procInfo[i].rank;
267                 currentAggListSize++;
268 #ifdef bridgeringaggtrace
269                 printf("Assigned agg rank %d at nodeRank %d to bridge rank %d at a distance of %d hops\n",all_procInfo[i].rank,all_procInfo[i].nodeRank,all_procInfo[i].bridgeRank,currentNumHops);
270 #endif
271               }
272             }
273           }
274         }
275 
276         if (numBridgesWithAggAssignments == confInfo->numBridgeRanks) {
277           allAggsAssigned = 1;
278           for (i=0;(i<numBridgesWithAggAssignments && allAggsAssigned);i++) {
279             if (aggAssignments[i].numAggsAssigned < numAggs)
280               allAggsAssigned = 0;
281           }
282         }
283         currentNumHops++;
284         /* If 3 rounds go by without selecting an agg abort to avoid
285            infinite loop.
286         */
287         if (startingCurrentAggListSize == currentAggListSize)
288           numIterForHopsWithNoAggs++;
289         else
290           numIterForHopsWithNoAggs = 0;
291         ADIOI_Assert(numIterForHopsWithNoAggs <= 3);
292         }
293 
294         ADIOI_Free(aggAssignments);
295         ADIOI_Free(nodesAssigned);
296 
297       } // else aggTotal  > 1
298 
299        memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
300     } // gpfsmpio_bridgeringagg > 0
301 
302     else { // gpfsmpio_bridgeringagg unset - default code
303 
304     int distance, numAggs;
305 
306     /* Aggregators will be midpoints between sorted MPI rank lists of who shares a given
307      * bridge node */
308 
309    sortstruct *bridgelist = (sortstruct *)ADIOI_Malloc(confInfo->nProcs * sizeof(sortstruct));
310    for(i=0; i < confInfo->nProcs; i++)
311    {
312       bridgelist[i].bridge = all_procInfo[i].bridgeRank;
313       bridgelist[i].rank = i;
314       TRACE_ERR("bridgelist[%d].bridge: %d .rank: %d\n", i, bridgelist[i].bridge, i);
315    }
316 
317    /* This list contains rank->bridge info. Now, we need to sort this list. */
318    qsort(bridgelist, confInfo->nProcs, sizeof(sortstruct), intsort);
319 
320    /* In this array, we can pick an appropriate number of midpoints based on
321     * our bridgenode index and the number of aggregators */
322 
323    numAggs = confInfo->aggRatio * confInfo->ioMinSize /*virtualPsetSize*/;
324    if(numAggs == 1)
325       aggTotal = 1;
326    else
327    /* the number of aggregators is (numAggs per bridgenode) plus each
328     * bridge node is an aggregator */
329       aggTotal = confInfo->numBridgeRanks * (numAggs+1);
330 
331    if(aggTotal>confInfo->nProcs) aggTotal=confInfo->nProcs;
332 
333    TRACE_ERR("numBridgeRanks: %d, aggRatio: %f numBridge: %d pset size: %d/%d numAggs: %d, aggTotal: %d\n", confInfo->numBridgeRanks, confInfo->aggRatio, confInfo->numBridgeRanks,  confInfo->ioMinSize, confInfo->ioMaxSize /*virtualPsetSize*/, numAggs, aggTotal);
334    aggList = (int *)ADIOI_Malloc(aggTotal * sizeof(int));
335 
336 
337    /* For each bridge node, determine who the aggregators will be */
338    /* basically, the n*distance and bridge node */
339    if(aggTotal == 1) /* special case when we only have one bridge node */
340       aggList[0] = bridgelist[0].bridge;
341    else
342    {
343      int lastBridge = bridgelist[confInfo->nProcs-1].bridge;
344      int nextBridge = 0, nextAggr = confInfo->numBridgeRanks;
345      int psetSize = 0;
346      int procIndex;
347      for(procIndex=confInfo->nProcs-1; procIndex>=0; procIndex--)
348      {
349        TRACE_ERR("bridgelist[%d].bridge %u/rank %u\n",procIndex,  bridgelist[procIndex].bridge, bridgelist[procIndex].rank);
350        if(lastBridge == bridgelist[procIndex].bridge)
351        {
352          psetSize++;
353          if(procIndex) continue;
354          else procIndex--;/* procIndex == 0 */
355        }
356        /* Sets up a list of nodes which will act as aggregators. numAggs
357         * per bridge node total. The list of aggregators is
358         * bridgeNode 0
359         * bridgeNode 1
360         * bridgeNode ...
361         * bridgeNode N
362         * bridgeNode[0]aggr[0]
363         * bridgeNode[0]aggr[1]...
364         * bridgeNode[0]aggr[N]...
365         * ...
366         * bridgeNode[N]aggr[0]..
367         * bridgeNode[N]aggr[N]
368         */
369        aggList[nextBridge]=lastBridge;
370        distance = psetSize/numAggs;
371        TRACE_ERR("nextBridge %u is bridge %u, distance %u, size %u\n",nextBridge, aggList[nextBridge],distance,psetSize);
372        if(numAggs>1)
373        {
374          for(j = 0; j < numAggs; j++)
375          {
376            ADIOI_Assert(nextAggr<aggTotal);
377            aggList[nextAggr] = bridgelist[procIndex+j*distance+1].rank;
378            TRACE_ERR("agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+j*distance+1,aggList[nextAggr]);
379            if(aggList[nextAggr]==lastBridge) /* can't have bridge in the list twice */
380            {
381              aggList[nextAggr] = bridgelist[procIndex+psetSize].rank; /* take the last one in the pset */
382              TRACE_ERR("replacement agglist[%d] -> bridgelist[%d] = %d\n", nextAggr, procIndex+psetSize,aggList[nextAggr]);
383            }
384            nextAggr++;
385          }
386        }
387        if(procIndex<0) break;
388        lastBridge = bridgelist[procIndex].bridge;
389        psetSize = 1;
390        nextBridge++;
391      }
392    }
393 
394    TRACE_ERR("memcpy(tmp_ranklist, aggList, (numAggs(%u)*confInfo->numBridgeRanks(%u)+numAggs(%u)) (%u) %u*sizeof(int))\n",numAggs,confInfo->numBridgeRanks,numAggs,(numAggs*confInfo->numBridgeRanks+numAggs),aggTotal);
395    memcpy(tmp_ranklist, aggList, aggTotal*sizeof(int));
396    for(i=0;i<aggTotal;i++)
397    {
398       TRACE_ERR("tmp_ranklist[%d]: %d\n", i, tmp_ranklist[i]);
399    }
400 
401 
402    ADIOI_Free (bridgelist);
403 
404    TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial_do\n");
405    }
406 
407    ADIOI_Free (aggList);
408    return aggTotal;
409 
410 }
411 
412 /*
413  * compute aggregators ranklist and put it into fd->hints struct
414  */
415 static void
ADIOI_BG_compute_agg_ranklist_serial(ADIO_File fd,const ADIOI_BG_ConfInfo_t * confInfo,ADIOI_BG_ProcInfo_t * all_procInfo)416 ADIOI_BG_compute_agg_ranklist_serial ( ADIO_File fd,
417 					const ADIOI_BG_ConfInfo_t *confInfo,
418 					ADIOI_BG_ProcInfo_t *all_procInfo)
419 {
420     TRACE_ERR("Entering ADIOI_BG_compute_agg_ranklist_serial\n");
421     int i;
422     int naggs;
423     int size;
424     int *tmp_ranklist;
425 
426   /* compute the ranklist of IO aggregators and put into tmp_ranklist */
427     tmp_ranklist = (int *) ADIOI_Malloc (confInfo->nProcs * sizeof(int));
428 
429 #   if AGG_DEBUG
430     for (i=0; i<confInfo->nProcs; i++) {
431       DBG_FPRINTF(stderr, "\tcpuid %1d, rank = %6d\n", all_procInfo[i].coreID, all_procInfo[i].rank );
432     }
433 #   endif
434 
435     naggs=
436     ADIOI_BG_compute_agg_ranklist_serial_do (confInfo, all_procInfo, tmp_ranklist);
437 
438 #   define VERIFY 1
439 #   if VERIFY
440     DBG_FPRINTF(stderr, "\tconfInfo = min: %3d, max: %3d, naggrs: %3d, bridge: %3d, nprocs: %3d, vpset: %3d, tsize: %3d, ratio: %.4f; naggs = %d\n",
441 	    confInfo->ioMinSize        ,
442 	    confInfo->ioMaxSize        ,
443 	    confInfo->nAggrs           ,
444 	    confInfo->numBridgeRanks ,
445 	    confInfo->nProcs          ,
446 	    confInfo->ioMaxSize /*virtualPsetSize*/          ,
447       confInfo->cpuIDsize,
448 	    confInfo->aggRatio        ,
449 	    naggs );
450 #   endif
451     MPI_Comm_size( fd->comm, &size );
452     /* This fix is for when the bridgenode rnk is not part of the particular
453      * subcomm associated with this MPI File operation. I don't know if
454      * this is the best/right answer but it passes the test cases at least.
455      * I don't know how common file IO in subcomms is anyway... */
456     for(i=0;i<naggs;i++)
457     {
458       if(tmp_ranklist[i] > size)
459       {
460          TRACE_ERR("Using 0 as tmp_ranklist[%d] instead of %d for comm %x\n",
461                i, tmp_ranklist[i], fd->comm);
462          tmp_ranklist[i] = 0;
463       }
464    }
465 
466 #   if AGG_DEBUG
467     for (i=0; i<naggs; i++) {
468       DBG_FPRINTF(stderr, "\taggr %-4d = %6d\n", i, tmp_ranklist[i] );
469     }
470 #   endif
471     if (gpfsmpio_balancecontig == 1) {
472 	/* what comes out of this code block is the agg ranklist sorted by
473 	 * bridge set and ion id with associated bridge info stored in the
474 	 * hints structure for later access during file domain assignment */
475 
476 	// sort the agg ranklist by ions and bridges
477 
478 	int *interleavedbridgeranklist = (int *) ADIOI_Malloc (naggs * sizeof(int)); // resorted agg rank list
479 	/* list of all bridge ranks */
480 	int *bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
481 
482 	/* each entry here is the number of aggregators associated with the
483 	 * bridge rank of the same index in bridgelist */
484 	int *bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
485 	/* list of all ion IDs corresponding with bridgelist entries of same index */
486 	int *ionlist = (int *) ADIOI_Malloc (naggs * sizeof(int));
487 
488 	int numbridges = 0;
489 
490 	for (i=0;i<naggs;i++)
491 	    bridgelistnum[i] = 0;
492 
493 	/* Each entry in this list corresponds with the bridgelist and will contain the lowest bridge
494 	 * agg rank on that ion. */
495 	int *summarybridgeminionaggrank = (int *) ADIOI_Malloc (naggs * sizeof(int));
496 	for (i=0;i<naggs;i++)
497 	    summarybridgeminionaggrank[i] = -1;
498 
499 	/* build the bridgelist, ionlist and bridgelistnum data by going thru each agg
500 	 * entry and find the associated bridge list index - at the end we will
501 	 * know how many aggs belong to each bridge in each ion */
502 	for (i=0;i<naggs;i++) {
503 	    int aggbridgerank = all_procInfo[tmp_ranklist[i]].bridgeRank;
504 	    int aggionid = all_procInfo[tmp_ranklist[i]].ionID;
505 	    int foundrank = 0;
506 	    int summaryranklistbridgeindex = 0;
507 	    int j;
508 	    for (j=0;(j<numbridges && !foundrank);j++) {
509 		if (bridgelist[j] == aggbridgerank) {
510 		    foundrank = 1;
511 		    summaryranklistbridgeindex = j;
512 		}
513 		else
514 		    summaryranklistbridgeindex++;
515 	    }
516 	    if (!foundrank) {
517 		bridgelist[summaryranklistbridgeindex] = aggbridgerank;
518 		ionlist[summaryranklistbridgeindex] = aggionid;
519 
520 		if (summarybridgeminionaggrank[summaryranklistbridgeindex] == -1)
521 		    summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
522 		else if (summarybridgeminionaggrank[summaryranklistbridgeindex] > aggbridgerank)
523 		    summarybridgeminionaggrank[summaryranklistbridgeindex] = aggbridgerank;
524 		numbridges++;
525 	    }
526 
527 	    bridgelistnum[summaryranklistbridgeindex]++;
528 	}
529 
530     /* at this point summarybridgeminionaggrank has the agg rank of the bridge for entries,
531      * need to make each entry the minimum bridge rank for the entire ion. */
532     for (i=0;i<numbridges;i++) {
533         int aggIonId = ionlist[i];
534         int j;
535         for (j=0;j<numbridges;j++) {
536           if (ionlist[j] == aggIonId) {
537             if (summarybridgeminionaggrank[j] < summarybridgeminionaggrank[i])
538               summarybridgeminionaggrank[i] = summarybridgeminionaggrank[j];
539           }
540         }
541     }
542 
543 	// resort by io node minimum bridge rank
544 	int x;
545 	for (x=0;x<numbridges;x++) {
546 	    for (i=0;i<(numbridges-1);i++) {
547 		if (summarybridgeminionaggrank[i] > summarybridgeminionaggrank[i+1]) {
548 		    int tmpminionaggrank = summarybridgeminionaggrank[i];
549 		    summarybridgeminionaggrank[i] = summarybridgeminionaggrank[i+1];
550 		    summarybridgeminionaggrank[i+1] = tmpminionaggrank;
551 		    int tmpionid = ionlist[i];
552 		    ionlist[i] = ionlist[i+1];
553 		    ionlist[i+1] = tmpionid;
554 		    int tmpbridgerank = bridgelist[i];
555 		    bridgelist[i] = bridgelist[i+1];
556 		    bridgelist[i+1] = tmpbridgerank;
557 		    int tmpbridgeranknum = bridgelistnum[i];
558 		    bridgelistnum[i] = bridgelistnum[i+1];
559 		    bridgelistnum[i+1] = tmpbridgeranknum;
560 		  }
561 	    }
562 	}
563 
564 	// for each io node make sure bridgelist is in rank order
565 	int startSortIndex = -1;
566 	int endSortIndex = -1;
567 	int currentBridgeIndex = 0;
568 
569 	while (currentBridgeIndex < numbridges) {
570 	    int currentIonId = ionlist[currentBridgeIndex];
571 	    startSortIndex = currentBridgeIndex;
572 	    while (ionlist[currentBridgeIndex] == currentIonId)
573 		  currentBridgeIndex++;
574 	    endSortIndex = currentBridgeIndex-1;
575 	    for (x=startSortIndex;x<=endSortIndex;x++) {
576 		  for (i=startSortIndex;i<endSortIndex;i++) {
577 		    if (bridgelist[i] > bridgelist[i+1]) {
578 			  int tmpbridgerank = bridgelist[i];
579 			  bridgelist[i] = bridgelist[i+1];
580 			  bridgelist[i+1] = tmpbridgerank;
581 			  int tmpbridgeranknum = bridgelistnum[i];
582 			  bridgelistnum[i] = bridgelistnum[i+1];
583 			  bridgelistnum[i+1] = tmpbridgeranknum;
584 		    }
585 		  }
586 	    }
587 	}
588 
589 
590 	/* populate interleavedbridgeranklist - essentially the agg rank list
591 	 * is now sorted by the ion minimum bridge rank and bridge node */
592 	int currentrankoffset = 0;
593 	for (i=0;i<numbridges;i++) {
594 	    int *thisBridgeAggList = (int *) ADIOI_Malloc (naggs * sizeof(int));
595 	    int numAggsForThisBridge = 0;
596 
597 	    int k;
598 	    for (k=0;k<naggs;k++) {
599 		int aggbridgerank = all_procInfo[tmp_ranklist[k]].bridgeRank;
600 		if (aggbridgerank == bridgelist[i]) {
601 		    thisBridgeAggList[numAggsForThisBridge] = tmp_ranklist[k];
602 		    numAggsForThisBridge++;
603 		}
604 	    }
605 
606 	    // sort thisBridgeAggList
607 	    for (x=0;x<numAggsForThisBridge;x++) {
608 		int n;
609 		for (n=0;n<(numAggsForThisBridge-1);n++) {
610 		    if (thisBridgeAggList[n] > thisBridgeAggList[n+1]) {
611 			int tmpthisBridgeAggList = thisBridgeAggList[n];
612 			thisBridgeAggList[n] = thisBridgeAggList[n+1];
613 			thisBridgeAggList[n+1] = tmpthisBridgeAggList;
614 		    }
615 		}
616 	    }
617 	    int n;
618 	    for (n=0;n<numAggsForThisBridge;n++) {
619 		interleavedbridgeranklist[currentrankoffset] = thisBridgeAggList[n];
620 		currentrankoffset++;
621 	    }
622 	    ADIOI_Free(thisBridgeAggList);
623 	}
624 
625 #ifdef balancecontigtrace
626 	fprintf(stderr,"Interleaved aggregator list:\n");
627 	for (i=0;i<naggs;i++) {
628 	    fprintf(stderr,"Agg: %d Agg rank: %d with bridge rank %d and ion ID %d\n",i,interleavedbridgeranklist[i],all_procInfo[interleavedbridgeranklist[i]].bridgeRank,all_procInfo[interleavedbridgeranklist[i]].ionID);
629 	}
630 	fprintf(stderr,"Bridges list:\n");
631 	for (i=0;i<numbridges;i++) {
632 	    fprintf(stderr,"bridge %d ion min rank %d rank %d number of aggs %d ion id %d\n",i,summarybridgeminionaggrank[i],bridgelist[i],bridgelistnum[i],ionlist[i]);
633 	}
634 
635 #endif
636 	/* copy the ranklist of IO aggregators to fd->hints */
637 	if(fd->hints->ranklist != NULL)
638 	    ADIOI_Free (fd->hints->ranklist);
639 	if(fd->hints->fs_hints.bg.bridgelist != NULL)
640 	    ADIOI_Free (fd->hints->fs_hints.bg.bridgelist);
641 	if(fd->hints->fs_hints.bg.bridgelistnum != NULL)
642 	    ADIOI_Free (fd->hints->fs_hints.bg.bridgelistnum);
643 
644 	fd->hints->cb_nodes = naggs;
645 	fd->hints->fs_hints.bg.numbridges = numbridges;
646 	fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
647 	memcpy( fd->hints->ranklist, interleavedbridgeranklist, naggs*sizeof(int) );
648 
649 	fd->hints->fs_hints.bg.bridgelist = (int *) ADIOI_Malloc (naggs * sizeof(int));
650 	memcpy( fd->hints->fs_hints.bg.bridgelist, bridgelist, naggs*sizeof(int) );
651 
652 	fd->hints->fs_hints.bg.bridgelistnum = (int *) ADIOI_Malloc (naggs * sizeof(int));
653 	memcpy( fd->hints->fs_hints.bg.bridgelistnum, bridgelistnum, naggs*sizeof(int) );
654 
655 	ADIOI_Free(summarybridgeminionaggrank);
656 	ADIOI_Free( tmp_ranklist );
657 	ADIOI_Free( bridgelistnum );
658 	ADIOI_Free( bridgelist );
659 	ADIOI_Free( interleavedbridgeranklist );
660 	ADIOI_Free(ionlist);
661 
662     }  else {
663 	/* classic topology-agnostic copy of the ranklist of IO aggregators to
664 	 * fd->hints */
665 	if(fd->hints->ranklist != NULL) ADIOI_Free (fd->hints->ranklist);
666 
667 	fd->hints->cb_nodes = naggs;
668 	fd->hints->ranklist = (int *) ADIOI_Malloc (naggs * sizeof(int));
669 	memcpy( fd->hints->ranklist, tmp_ranklist, naggs*sizeof(int) );
670 
671 	ADIOI_Free( tmp_ranklist );
672     }
673     TRACE_ERR("Leaving ADIOI_BG_compute_agg_ranklist_serial\n");
674     return;
675 }
676