/**
  @file parUtils.C
  @author Rahul S. Sampath, rahul.sampath@gmail.com
  @author Hari Sundar, hsundar@gmail.com
  */

#include "mpi.h"
#include "binUtils.h"
#include "dtypes.h"
#include "parUtils.h"

#ifdef __DEBUG__
#ifndef __DEBUG_PAR__
#define __DEBUG_PAR__
#endif
#endif

namespace par {

  unsigned int splitCommBinary(MPI_Comm orig_comm, MPI_Comm* new_comm) {
    int npes, rank;

    MPI_Group orig_group, new_group;

    MPI_Comm_size(orig_comm, &npes);
    MPI_Comm_rank(orig_comm, &rank);

    unsigned int splitterRank = binOp::getPrevHighestPowerOfTwo(npes);

    int *ranksAsc, *ranksDesc;
    //Determine sizes for the 2 groups
    ranksAsc = new int[splitterRank];
    ranksDesc = new int[(npes - splitterRank)];

    int numAsc = 0;
    int numDesc = (npes - splitterRank - 1);

    //This is the main mapping between old ranks and new ranks.
    for(int i = 0; i < npes; i++) {
      if(static_cast<unsigned int>(i) < splitterRank) {
        ranksAsc[numAsc] = i;
        numAsc++;
      } else {
        ranksDesc[numDesc] = i;
        numDesc--;
      }
    }//end for i

    MPI_Comm_group(orig_comm, &orig_group);

    /* Divide tasks into two distinct groups based upon rank */
    if (static_cast<unsigned int>(rank) < splitterRank) {
      MPI_Group_incl(orig_group, splitterRank, ranksAsc, &new_group);
    } else {
      MPI_Group_incl(orig_group, (npes - splitterRank), ranksDesc, &new_group);
    }

    MPI_Comm_create(orig_comm, new_group, new_comm);

    delete [] ranksAsc;
    ranksAsc = NULL;

    delete [] ranksDesc;
    ranksDesc = NULL;

    return splitterRank;
  }//end function

  unsigned int splitCommBinaryNoFlip(MPI_Comm orig_comm, MPI_Comm* new_comm) {
    int npes, rank;

    MPI_Group orig_group, new_group;

    MPI_Comm_size(orig_comm, &npes);
    MPI_Comm_rank(orig_comm, &rank);

    unsigned int splitterRank = binOp::getPrevHighestPowerOfTwo(npes);

    int *ranksAsc, *ranksDesc;
    //Determine sizes for the 2 groups
    ranksAsc = new int[splitterRank];
    ranksDesc = new int[(npes - splitterRank)];

    int numAsc = 0;
    int numDesc = 0; //( npes - splitterRank - 1);

    //This is the main mapping between old ranks and new ranks.
    for(int i = 0; i < npes; i++) {
      if(static_cast<unsigned int>(i) < splitterRank) {
        ranksAsc[numAsc] = i;
        numAsc++;
      } else {
        ranksDesc[numDesc] = i;
        numDesc++;
      }
    }//end for i

    MPI_Comm_group(orig_comm, &orig_group);

    /* Divide tasks into two distinct groups based upon rank */
    if (static_cast<unsigned int>(rank) < splitterRank) {
      MPI_Group_incl(orig_group, splitterRank, ranksAsc, &new_group);
    } else {
      MPI_Group_incl(orig_group, (npes - splitterRank), ranksDesc, &new_group);
    }

    MPI_Comm_create(orig_comm, new_group, new_comm);

    delete [] ranksAsc;
    ranksAsc = NULL;

    delete [] ranksDesc;
    ranksDesc = NULL;

    return splitterRank;
  }//end function
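  /*
   * A minimal usage sketch for the two splitters above; the variable names
   * are illustrative only. Ranks below the largest power of two smaller
   * than or equal to npes form the first group, the remaining ranks form
   * the second:
   *
   * @code
   *   MPI_Comm half_comm;
   *   unsigned int splitter = par::splitCommBinary(MPI_COMM_WORLD, &half_comm);
   *   int rank;
   *   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   *   if (static_cast<unsigned int>(rank) < splitter) {
   *     // this process belongs to the power-of-two group in half_comm
   *   }
   * @endcode
   */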
  //create Comm groups and remove empty processors...
  int splitComm2way(bool iAmEmpty, MPI_Comm* new_comm, MPI_Comm comm) {
#ifdef __PROFILE_WITH_BARRIER__
    MPI_Barrier(comm);
#endif

    MPI_Group orig_group, new_group;
    int size;
    MPI_Comm_size(comm, &size);

    bool* isEmptyList = new bool[size];
    par::Mpi_Allgather<bool>(&iAmEmpty, isEmptyList, 1, comm);

    int numActive = 0, numIdle = 0;
    for(int i = 0; i < size; i++) {
      if(isEmptyList[i]) {
        numIdle++;
      } else {
        numActive++;
      }
    }//end for i

    int* ranksActive = new int[numActive];
    int* ranksIdle = new int[numIdle];

    numActive = 0;
    numIdle = 0;
    for(int i = 0; i < size; i++) {
      if(isEmptyList[i]) {
        ranksIdle[numIdle] = i;
        numIdle++;
      } else {
        ranksActive[numActive] = i;
        numActive++;
      }
    }//end for i

    delete [] isEmptyList;
    isEmptyList = NULL;

    /* Extract the original group handle */
    MPI_Comm_group(comm, &orig_group);

    /* Divide tasks into two distinct groups based upon rank */
    if (!iAmEmpty) {
      MPI_Group_incl(orig_group, numActive, ranksActive, &new_group);
    } else {
      MPI_Group_incl(orig_group, numIdle, ranksIdle, &new_group);
    }

    /* Create new communicator */
    MPI_Comm_create(comm, new_group, new_comm);

    delete [] ranksActive;
    ranksActive = NULL;

    delete [] ranksIdle;
    ranksIdle = NULL;

    return 0;
  }//end function

  int splitCommUsingSplittingRank(int splittingRank, MPI_Comm* new_comm,
      MPI_Comm comm) {
#ifdef __PROFILE_WITH_BARRIER__
    MPI_Barrier(comm);
#endif

    MPI_Group orig_group, new_group;
    int size;
    int rank;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    int* ranksActive = new int[splittingRank];
    int* ranksIdle = new int[size - splittingRank];

    for(int i = 0; i < splittingRank; i++) {
      ranksActive[i] = i;
    }

    for(int i = splittingRank; i < size; i++) {
      ranksIdle[i - splittingRank] = i;
    }

    /* Extract the original group handle */
    MPI_Comm_group(comm, &orig_group);

    /* Divide tasks into two distinct groups based upon rank */
    if (rank < splittingRank) {
      MPI_Group_incl(orig_group, splittingRank, ranksActive, &new_group);
    } else {
      MPI_Group_incl(orig_group, (size - splittingRank), ranksIdle, &new_group);
    }

    /* Create new communicator */
    MPI_Comm_create(comm, new_group, new_comm);

    delete [] ranksActive;
    ranksActive = NULL;

    delete [] ranksIdle;
    ranksIdle = NULL;

    return 0;
  }//end function
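  /*
   * A minimal sketch of how splitComm2way can be used to drop idle
   * processes from a communicator; the predicate and variable names here
   * are illustrative, not taken from the library:
   *
   * @code
   *   bool iAmEmpty = (localElementCount == 0);  // hypothetical predicate
   *   MPI_Comm activeComm;
   *   par::splitComm2way(iAmEmpty, &activeComm, MPI_COMM_WORLD);
   *   if (!iAmEmpty) {
   *     // continue the computation on activeComm only
   *   }
   * @endcode
   */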
  //create Comm groups and remove empty processors...
  int splitComm2way(const bool* isEmptyList, MPI_Comm* new_comm, MPI_Comm comm) {

    MPI_Group orig_group, new_group;
    int size, rank;
    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    int numActive = 0, numIdle = 0;
    for(int i = 0; i < size; i++) {
      if(isEmptyList[i]) {
        numIdle++;
      } else {
        numActive++;
      }
    }//end for i

    int* ranksActive = new int[numActive];
    int* ranksIdle = new int[numIdle];

    numActive = 0;
    numIdle = 0;
    for(int i = 0; i < size; i++) {
      if(isEmptyList[i]) {
        ranksIdle[numIdle] = i;
        numIdle++;
      } else {
        ranksActive[numActive] = i;
        numActive++;
      }
    }//end for i

    /* Extract the original group handle */
    MPI_Comm_group(comm, &orig_group);

    /* Divide tasks into two distinct groups based upon rank */
    if (!isEmptyList[rank]) {
      MPI_Group_incl(orig_group, numActive, ranksActive, &new_group);
    } else {
      MPI_Group_incl(orig_group, numIdle, ranksIdle, &new_group);
    }

    /* Create new communicator */
    MPI_Comm_create(comm, new_group, new_comm);

    delete [] ranksActive;
    ranksActive = NULL;

    delete [] ranksIdle;
    ranksIdle = NULL;

    return 0;
  }//end function


  int AdjustCommunicationPattern(std::vector<int>& send_sizes, std::vector<int>& send_partners,
      std::vector<int>& recv_sizes, std::vector<int>& recv_partners, MPI_Comm comm)
  {
    int npes;
    int rank;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &npes);

    unsigned int k = send_sizes.size();

    // do scans ...
    DendroIntL lsz[k];
    DendroIntL gsz[k], gscan[k];

    for(size_t i = 0; i < send_sizes.size(); ++i) {
      lsz[i] = send_sizes[i];
    }
    par::Mpi_Scan<DendroIntL>(lsz, gscan, k, MPI_SUM, comm);

    if (rank == npes-1) {
      for(size_t i = 0; i < k; ++i) {
        gsz[i] = gscan[i];
      }
    }
    // broadcast from last proc to get total counts, per segment ...
    par::Mpi_Bcast<DendroIntL>(gsz, k, npes-1, comm);

    DendroIntL segment_p0[k];
    for(size_t i = 0; i < k; ++i) {
      segment_p0[i] = (i*npes)/k;
    }

    /*
     * -- Dividing into k segments, so each segment will have npes/k procs.
     * -- Each proc will have gsz[i]/(npes/k) elements.
     * -- rank of proc which will get i-th send_buff is,
     * -- segment_p0[i] + gscan[i]
     */
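    /*
     * An illustrative example of the mapping above (numbers chosen only to
     * show the arithmetic): with npes = 8 and k = 2, segment_p0 = {0, 4},
     * so segment 0 is served by ranks 0..3 and segment 1 by ranks 4..7.
     * If gsz[0] = 10, then gsz[0]/seg_npes = 2 with an overhang of 2, so
     * ranks 0 and 1 receive 3 elements each and ranks 2 and 3 receive 2;
     * a send whose rank_mid is 7 therefore maps to new_part = (7 - 2)/2 = 2.
     */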
    // figure out send_partners for k sends
    // send_partners.clear();
    for(size_t i = 0; i < k; ++i) {
      int new_part;
      int seg_npes = ( (i == k-1) ? npes - segment_p0[i] : segment_p0[i+1] - segment_p0[i] );
      int overhang = gsz[i] % seg_npes;
      DendroIntL rank_mid = gscan[i] - lsz[i]/2;
      if ( rank_mid < overhang*(gsz[i]/seg_npes + 1) ) {
        new_part = segment_p0[i] + rank_mid/(gsz[i]/seg_npes + 1);
      } else {
        new_part = segment_p0[i] + (rank_mid - overhang)/(gsz[i]/seg_npes);
      }
      send_partners[i] = new_part;
    }

    // merge consecutive sends that map to the same partner; local sends get size 0
    int idx = 0;
    if (send_partners[0] == rank) {
      send_sizes[0] = 0;
    }
    for(size_t i = 1; i < k; ++i)
    {
      if (send_partners[i] == rank) {
        send_sizes[i] = 0;
        idx = i;
        continue;
      }
      if (send_partners[i] == send_partners[i-1]) {
        send_sizes[idx] += lsz[i];
        send_sizes[i] = 0;
      } else {
        idx = i;
      }
    }

    // let procs know you will be sending to them ...

    // try MPI one sided comm
    MPI_Win win;
    int *rcv;
    MPI_Alloc_mem(sizeof(int)*npes, MPI_INFO_NULL, &rcv);
    for(int i = 0; i < npes; ++i) rcv[i] = 0;

    // The window spans npes ints (the size argument is in bytes) and is
    // exposed on comm, since rank and send_partners are ranks in comm.
    MPI_Win_create(rcv, npes*sizeof(int), sizeof(int), MPI_INFO_NULL, comm, &win);

    MPI_Win_fence(MPI_MODE_NOPRECEDE, win);
    for (size_t i = 0; i < send_sizes.size(); i++)
    {
      if (send_sizes[i]) {
        MPI_Put(&(send_sizes[i]), 1, MPI_INT, send_partners[i], rank, 1, MPI_INT, win);
      }
    }
    MPI_Win_fence((MPI_MODE_NOSTORE | MPI_MODE_NOSUCCEED), win);
    // figure out recv partners and sizes ...
    recv_sizes.clear(); recv_partners.clear();
    for(int i = 0; i < npes; ++i)
    {
      if (rcv[i]) {
        recv_partners.push_back(i);
        recv_sizes.push_back(rcv[i]);
      }
    }

    MPI_Win_free(&win);
    MPI_Free_mem(rcv);

    return 1;
  }

}// end namespace
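/*
 * A minimal sketch of how AdjustCommunicationPattern can be called; the
 * buffer names and sizes are illustrative only. Each entry i of
 * send_sizes/send_partners describes one outgoing message; on return the
 * partners have been remapped onto the k rank segments and the matching
 * receive pattern has been gathered through the one-sided window:
 *
 * @code
 *   std::vector<int> send_sizes(2), send_partners(2);
 *   std::vector<int> recv_sizes, recv_partners;
 *   send_sizes[0] = 100;  send_sizes[1] = 50;   // hypothetical message sizes
 *   par::AdjustCommunicationPattern(send_sizes, send_partners,
 *                                   recv_sizes, recv_partners, MPI_COMM_WORLD);
 *   // recv_partners[j] now lists a rank that will send to this process,
 *   // and recv_sizes[j] the corresponding message size.
 * @endcode
 */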