/**
  @file parUtils.C
  @author Rahul S. Sampath, rahul.sampath@gmail.com
  @author Hari Sundar, hsundar@gmail.com
  */

#include "mpi.h"
#include "binUtils.h"
#include "dtypes.h"
#include "parUtils.h"

#ifdef __DEBUG__
#ifndef __DEBUG_PAR__
#define __DEBUG_PAR__
#endif
#endif

namespace par {

  unsigned int splitCommBinary(MPI_Comm orig_comm, MPI_Comm* new_comm) {
    int npes, rank;

    MPI_Group orig_group, new_group;

    MPI_Comm_size(orig_comm, &npes);
    MPI_Comm_rank(orig_comm, &rank);

    unsigned int splitterRank = binOp::getPrevHighestPowerOfTwo(npes);

    int *ranksAsc, *ranksDesc;
    //Determine sizes for the 2 groups
    ranksAsc = new int[splitterRank];
    ranksDesc = new int[(npes - splitterRank)];

    int numAsc = 0;
    int numDesc = (npes - splitterRank - 1);

    //This is the main mapping between old ranks and new ranks.
    for(int i = 0; i < npes; i++) {
      if(static_cast<unsigned int>(i) < splitterRank) {
        ranksAsc[numAsc] = i;
        numAsc++;
      } else {
        ranksDesc[numDesc] = i;
        numDesc--;
      }
    }//end for i

    MPI_Comm_group(orig_comm, &orig_group);

    /* Divide tasks into two distinct groups based upon rank */
    if (static_cast<unsigned int>(rank) < splitterRank) {
      MPI_Group_incl(orig_group, splitterRank, ranksAsc, &new_group);
    } else {
      MPI_Group_incl(orig_group, (npes - splitterRank), ranksDesc, &new_group);
    }

    MPI_Comm_create(orig_comm, new_group, new_comm);

    delete [] ranksAsc;
    ranksAsc = NULL;

    delete [] ranksDesc;
    ranksDesc = NULL;

    return splitterRank;
  }//end function
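
  // Usage sketch for splitCommBinary (illustrative only, not part of the
  // library). The communicator is split at the power of two returned by
  // binOp::getPrevHighestPowerOfTwo(npes): ranks below the splitter keep
  // their ascending order in the new communicator, while the remaining
  // ranks are enumerated in reverse, so that group's rank order is flipped.
  //
  //   MPI_Comm half_comm;
  //   unsigned int splitter = par::splitCommBinary(MPI_COMM_WORLD, &half_comm);
  //   int myRank;
  //   MPI_Comm_rank(MPI_COMM_WORLD, &myRank);
  //   if (static_cast<unsigned int>(myRank) < splitter) {
  //     // this rank belongs to the power-of-two group
  //   }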

  unsigned int splitCommBinaryNoFlip(MPI_Comm orig_comm, MPI_Comm* new_comm) {
    int npes, rank;

    MPI_Group orig_group, new_group;

    MPI_Comm_size(orig_comm, &npes);
    MPI_Comm_rank(orig_comm, &rank);

    unsigned int splitterRank = binOp::getPrevHighestPowerOfTwo(npes);

    int *ranksAsc, *ranksDesc;
    //Determine sizes for the 2 groups
    ranksAsc = new int[splitterRank];
    ranksDesc = new int[(npes - splitterRank)];

    int numAsc = 0;
    int numDesc = 0;

    //This is the main mapping between old ranks and new ranks.
    for(int i = 0; i < npes; i++) {
      if(static_cast<unsigned int>(i) < splitterRank) {
        ranksAsc[numAsc] = i;
        numAsc++;
      } else {
        ranksDesc[numDesc] = i;
        numDesc++;
      }
    }//end for i

    MPI_Comm_group(orig_comm, &orig_group);

    /* Divide tasks into two distinct groups based upon rank */
    if (static_cast<unsigned int>(rank) < splitterRank) {
      MPI_Group_incl(orig_group, splitterRank, ranksAsc, &new_group);
    } else {
      MPI_Group_incl(orig_group, (npes - splitterRank), ranksDesc, &new_group);
    }

    MPI_Comm_create(orig_comm, new_group, new_comm);

    delete [] ranksAsc;
    ranksAsc = NULL;

    delete [] ranksDesc;
    ranksDesc = NULL;

    return splitterRank;
  }//end function
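
  // The only difference from splitCommBinary above: here both groups list
  // their ranks in ascending order, so neither half has its rank order
  // reversed in the new communicator.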

  //create Comm groups and remove empty processors...
  int splitComm2way(bool iAmEmpty, MPI_Comm* new_comm, MPI_Comm comm) {
#ifdef __PROFILE_WITH_BARRIER__
    MPI_Barrier(comm);
#endif

    MPI_Group orig_group, new_group;
    int size;
    MPI_Comm_size(comm, &size);

    bool* isEmptyList = new bool[size];
    par::Mpi_Allgather<bool>(&iAmEmpty, isEmptyList, 1, comm);

    int numActive = 0, numIdle = 0;
    for(int i = 0; i < size; i++) {
      if(isEmptyList[i]) {
        numIdle++;
      } else {
        numActive++;
      }
    }//end for i

    int* ranksActive = new int[numActive];
    int* ranksIdle = new int[numIdle];

    numActive = 0;
    numIdle = 0;
    for(int i = 0; i < size; i++) {
      if(isEmptyList[i]) {
        ranksIdle[numIdle] = i;
        numIdle++;
      } else {
        ranksActive[numActive] = i;
        numActive++;
      }
    }//end for i

    delete [] isEmptyList;
    isEmptyList = NULL;

    /* Extract the original group handle */
    MPI_Comm_group(comm, &orig_group);

    /* Divide tasks into two distinct groups based upon rank */
    if (!iAmEmpty) {
      MPI_Group_incl(orig_group, numActive, ranksActive, &new_group);
    } else {
      MPI_Group_incl(orig_group, numIdle, ranksIdle, &new_group);
    }

    /* Create new communicator */
    MPI_Comm_create(comm, new_group, new_comm);

    delete [] ranksActive;
    ranksActive = NULL;

    delete [] ranksIdle;
    ranksIdle = NULL;

    return 0;
  }//end function
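
  // Usage sketch (illustrative, not part of the library): each rank passes
  // whether it is idle and ends up in a communicator containing only ranks
  // of its own kind. `myData` is a hypothetical per-rank container.
  //
  //   MPI_Comm active_comm;
  //   bool iAmEmpty = myData.empty();
  //   par::splitComm2way(iAmEmpty, &active_comm, MPI_COMM_WORLD);
  //   if (!iAmEmpty) {
  //     // ... continue the computation on active_comm ...
  //   }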

  int splitCommUsingSplittingRank(int splittingRank, MPI_Comm* new_comm,
      MPI_Comm comm) {
#ifdef __PROFILE_WITH_BARRIER__
    MPI_Barrier(comm);
#endif

    MPI_Group orig_group, new_group;
    int size;
    int rank;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    int* ranksActive = new int[splittingRank];
    int* ranksIdle = new int[size - splittingRank];

    for(int i = 0; i < splittingRank; i++) {
      ranksActive[i] = i;
    }

    for(int i = splittingRank; i < size; i++) {
      ranksIdle[i - splittingRank] = i;
    }

    /* Extract the original group handle */
    MPI_Comm_group(comm, &orig_group);

    /* Divide tasks into two distinct groups based upon rank */
    if (rank < splittingRank) {
      MPI_Group_incl(orig_group, splittingRank, ranksActive, &new_group);
    } else {
      MPI_Group_incl(orig_group, (size - splittingRank), ranksIdle, &new_group);
    }

    /* Create new communicator */
    MPI_Comm_create(comm, new_group, new_comm);

    delete [] ranksActive;
    ranksActive = NULL;

    delete [] ranksIdle;
    ranksIdle = NULL;

    return 0;
  }//end function
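
  // Usage sketch (illustrative): the first `splittingRank` ranks form the
  // active group and the rest form a separate idle communicator.
  //
  //   int npes;
  //   MPI_Comm_size(MPI_COMM_WORLD, &npes);
  //   MPI_Comm sub_comm;
  //   par::splitCommUsingSplittingRank(npes/2, &sub_comm, MPI_COMM_WORLD);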

  //create Comm groups and remove empty processors...
  int splitComm2way(const bool* isEmptyList, MPI_Comm* new_comm, MPI_Comm comm) {

    MPI_Group orig_group, new_group;
    int size, rank;
    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    int numActive = 0, numIdle = 0;
    for(int i = 0; i < size; i++) {
      if(isEmptyList[i]) {
        numIdle++;
      } else {
        numActive++;
      }
    }//end for i

    int* ranksActive = new int[numActive];
    int* ranksIdle = new int[numIdle];

    numActive = 0;
    numIdle = 0;
    for(int i = 0; i < size; i++) {
      if(isEmptyList[i]) {
        ranksIdle[numIdle] = i;
        numIdle++;
      } else {
        ranksActive[numActive] = i;
        numActive++;
      }
    }//end for i

    /* Extract the original group handle */
    MPI_Comm_group(comm, &orig_group);

    /* Divide tasks into two distinct groups based upon rank */
    if (!isEmptyList[rank]) {
      MPI_Group_incl(orig_group, numActive, ranksActive, &new_group);
    } else {
      MPI_Group_incl(orig_group, numIdle, ranksIdle, &new_group);
    }

    /* Create new communicator */
    MPI_Comm_create(comm, new_group, new_comm);

    delete [] ranksActive;
    ranksActive = NULL;

    delete [] ranksIdle;
    ranksIdle = NULL;

    return 0;
  }//end function
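
  // This overload is handy when the empty/non-empty flags have already been
  // gathered, e.g. (illustrative, with hypothetical variable names):
  //
  //   bool* flags = new bool[npes];
  //   par::Mpi_Allgather<bool>(&iAmEmpty, flags, 1, comm);
  //   par::splitComm2way(flags, &new_comm, comm);
  //   delete [] flags;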


  int AdjustCommunicationPattern(std::vector<int>& send_sizes, std::vector<int>& send_partners,
      std::vector<int>& recv_sizes, std::vector<int>& recv_partners, MPI_Comm comm)
  {
    int npes;
    int rank;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &npes);

    unsigned int k = send_sizes.size();

    // do scans ...
    // (std::vector rather than variable-length arrays, which are a compiler
    // extension and not standard C++)
    std::vector<DendroIntL> lsz(k);
    std::vector<DendroIntL> gsz(k), gscan(k);

    for(size_t i = 0; i < send_sizes.size(); ++i) {
      lsz[i] = send_sizes[i];
    }
    par::Mpi_Scan<DendroIntL>(&(lsz[0]), &(gscan[0]), k, MPI_SUM, comm);

    if (rank == (npes - 1)) {
      for(size_t i = 0; i < k; ++i) {
        gsz[i] = gscan[i];
      }
    }
    // broadcast from the last proc to get the total counts, per segment ...
    par::Mpi_Bcast<DendroIntL>(&(gsz[0]), k, npes-1, comm);

    std::vector<DendroIntL> segment_p0(k);
    for(size_t i = 0; i < k; ++i) {
      segment_p0[i] = (i*npes)/k;
    }

    /*
     * -- Dividing into k segments, so segment i owns the ranks
     *    [segment_p0[i], segment_p0[i+1]).
     * -- Each proc in segment i will hold roughly gsz[i]/(npes/k) elements.
     * -- The rank that receives the i-th send buffer is segment_p0[i] plus
     *    the midpoint of this proc's scan range, gscan[i] - lsz[i]/2,
     *    divided by the per-proc share.
     */
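
    /*
     * Worked example (illustrative): suppose npes = 8 and k = 2, so
     * segment_p0 = {0, 4} and each segment owns seg_npes = 4 procs. If
     * gsz[1] = 10, then gsz[1]/seg_npes = 2 with overhang = 2: procs 4 and 5
     * receive 3 elements each, procs 6 and 7 receive 2. A sender whose scan
     * midpoint in segment 1 is rank_mid = 7 lies past the overhang region
     * (2*3 = 6), so it targets proc 4 + (7 - 2)/2 = 6.
     */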

    // figure out send_partners for the k sends
    for(size_t i = 0; i < k; ++i) {
      int new_part;
      int seg_npes  = ( (i == (k-1)) ? (npes - segment_p0[i]) : (segment_p0[i+1] - segment_p0[i]) );
      int overhang  = gsz[i] % seg_npes;
      DendroIntL rank_mid = gscan[i] - (lsz[i]/2);
      if ( rank_mid < (overhang*((gsz[i]/seg_npes) + 1)) ) {
        new_part = segment_p0[i] + (rank_mid/((gsz[i]/seg_npes) + 1));
      } else {
        new_part = segment_p0[i] + ((rank_mid - overhang)/(gsz[i]/seg_npes));
      }
      send_partners[i] = new_part;
    }

    // merge consecutive sends that target the same partner and zero out
    // self-sends; idx tracks the surviving entry for each partner.
    int idx = 0;
    if (send_partners[0] == rank) {
      send_sizes[0] = 0;
    }
    for(size_t i = 1; i < k; ++i) {
      if (send_partners[i] == rank) {
        send_sizes[i] = 0;
        idx = i;
        continue;
      }
      if (send_partners[i] == send_partners[i-1]) {
        send_sizes[idx] += lsz[i];
        send_sizes[i] = 0;
      } else {
        idx = i;
      }
    }

    // let procs know you will be sending to them,
    // using MPI one-sided communication (RMA)
    MPI_Win win;
    int* rcv;
    MPI_Alloc_mem(sizeof(int)*npes, MPI_INFO_NULL, &rcv);
    for(int i = 0; i < npes; ++i) {
      rcv[i] = 0;
    }

    // the window size is in bytes, and the window must live on the same
    // communicator that the ranks computed above refer to
    MPI_Win_create(rcv, npes*sizeof(int), sizeof(int), MPI_INFO_NULL, comm, &win);

    MPI_Win_fence(MPI_MODE_NOPRECEDE, win);
    for (size_t i = 0; i < send_sizes.size(); i++) {
      if (send_sizes[i]) {
        MPI_Put(&(send_sizes[i]), 1, MPI_INT, send_partners[i], rank, 1, MPI_INT, win);
      }
    }
    MPI_Win_fence((MPI_MODE_NOSTORE | MPI_MODE_NOSUCCEED), win);

    // figure out recv partners and sizes ...
    recv_sizes.clear();
    recv_partners.clear();
    for(int i = 0; i < npes; ++i) {
      if (rcv[i]) {
        recv_partners.push_back(i);
        recv_sizes.push_back(rcv[i]);
      }
    }

    MPI_Win_free(&win);
    MPI_Free_mem(rcv);

    return 1;
  }
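
  // Usage sketch (illustrative, assuming k send buffers per rank): on entry,
  // send_sizes holds the size of each buffer and send_partners is sized k;
  // on return, send_partners[i] names the rank inside the i-th of k equal
  // segments of the communicator that should receive buffer i, and
  // recv_partners/recv_sizes describe the matching incoming messages.
  //
  //   std::vector<int> send_sizes(k), send_partners(k);
  //   std::vector<int> recv_sizes, recv_partners;
  //   // ... fill send_sizes ...
  //   par::AdjustCommunicationPattern(send_sizes, send_partners,
  //       recv_sizes, recv_partners, comm);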

}// end namespace