/**
  @file parUtils.h
  @brief A set of parallel utilities.
  @author Hari Sundar, hsundar@gmail.com
  */

#ifndef __PAR_UTILS_H_
#define __PAR_UTILS_H_

#define KEEP_HIGH 100
#define KEEP_LOW 101

#ifdef __DEBUG__
#ifndef __DEBUG_PAR__
#define __DEBUG_PAR__
#endif
#endif

#include "mpi.h"
#include <vector>

#ifdef __USE_64_BIT_INT__
#define DendroIntL long long
#define DendroIntLSpecifier %lld
#define DendroUIntLSpecifier %llu
#else
#define DendroIntL int
#define DendroIntLSpecifier %d
#define DendroUIntLSpecifier %u
#endif

/**
  @namespace par
  @author Hari Sundar hsundar@gmail.com
  @brief Collection of Generic Parallel Functions: Sorting, Partitioning, Searching, ...
  */
namespace par {

  template <typename T>
    int Mpi_Isend(T* buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request* request);

  template <typename T>
    int Mpi_Issend(T* buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request* request);

  template <typename T>
    int Mpi_Recv(T* buf, int count, int source, int tag, MPI_Comm comm, MPI_Status* status);

  template <typename T>
    int Mpi_Irecv(T* buf, int count, int source, int tag, MPI_Comm comm, MPI_Request* request);

  template <typename T>
    int Mpi_Gather(T* sendBuffer, T* recvBuffer, int count, int root, MPI_Comm comm);

  template <typename T, typename S>
    int Mpi_Sendrecv(T* sendBuf, int sendCount, int dest, int sendTag,
        S* recvBuf, int recvCount, int source, int recvTag,
        MPI_Comm comm, MPI_Status* status);

  template <typename T>
    int Mpi_Bcast(T* buffer, int count, int root, MPI_Comm comm);

  template <typename T>
    int Mpi_Scan(T* sendbuf, T* recvbuf, int count, MPI_Op op, MPI_Comm comm);

  template <typename T>
    int Mpi_Reduce(T* sendbuf, T* recvbuf, int count, MPI_Op op, int root, MPI_Comm comm);

  template <typename T>
    int Mpi_Allreduce(T* sendbuf, T* recvbuf, int count, MPI_Op op, MPI_Comm comm);

  template <typename T>
    int Mpi_Alltoall(T* sendbuf, T* recvbuf, int count, MPI_Comm comm);

  template <typename T>
    int Mpi_Allgatherv(T* sendbuf, int sendcount, T* recvbuf,
        int* recvcounts, int* displs, MPI_Comm comm);

  template <typename T>
    int Mpi_Allgather(T* sendbuf, T* recvbuf, int count, MPI_Comm comm);

  template <typename T>
    int Mpi_Alltoallv_sparse(T* sendbuf, int* sendcnts, int* sdispls,
        T* recvbuf, int* recvcnts, int* rdispls, MPI_Comm comm);

  template <typename T>
    int Mpi_Alltoallv_dense(T* sendbuf, int* sendcnts, int* sdispls,
        T* recvbuf, int* recvcnts, int* rdispls, MPI_Comm comm);


  template<typename T>
    unsigned int defaultWeight(const T *a);

  /**
    @brief A parallel weighted partitioning function. In our implementation, we do not impose any
    restriction on the input or the number of processors. This function can be used with an odd number of processors as well.
    Some processors can pass an empty vector as input. The relative ordering of the elements is preserved.
    @author Hari Sundar
    @author Rahul Sampath
    @param vec the input vector
    @param getWeight function pointer to compute the weight of each element. If you pass NULL,
    then every element will get a weight equal to 1.
    @param comm the communicator
    */
  template<typename T>
    int partitionW(std::vector<T>& vec,
        unsigned int (*getWeight)(const T *), MPI_Comm comm);
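  /*
    Example (illustrative sketch only, not part of the library): repartitioning a
    distributed vector by weight with partitionW. The element type 'Particle' and the
    weight function 'particleWeight' below are hypothetical placeholders, and the
    sketch assumes T is a fixed-size type that the library can communicate.

      #include <vector>
      #include "mpi.h"
      #include "parUtils.h"

      struct Particle {
        double x, y, z;
        unsigned int work;   // hypothetical per-element cost
      };

      unsigned int particleWeight(const Particle* p) {
        return p->work;      // heavier elements count more towards the balance
      }

      int main(int argc, char** argv) {
        MPI_Init(&argc, &argv);
        std::vector<Particle> myParticles;   // local chunk; may be empty on some ranks
        // ... fill myParticles ...

        // Redistribute so that the total weight per processor is roughly equal,
        // preserving the global relative order of the elements.
        par::partitionW<Particle>(myParticles, particleWeight, MPI_COMM_WORLD);

        // Passing NULL instead of particleWeight gives every element a weight of 1.
        MPI_Finalize();
        return 0;
      }
  */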
  template<typename T>
    void rankSamples(std::vector<T>& arr, std::vector<T> samples, MPI_Comm comm);

  template<typename T>
    std::vector<T> Sorted_Sample_Select(std::vector<T>& arr, unsigned int kway, std::vector<unsigned int>& min_idx, std::vector<unsigned int>& max_idx, std::vector<DendroIntL>& splitter_ranks, MPI_Comm comm);

  template<typename T>
    std::vector<T> Sorted_approx_Select(std::vector<T>& arr, unsigned int k, MPI_Comm comm);

  //! Variant of Sorted_approx_Select that handles skewed distributions.
  template<typename T>
    std::vector<std::pair<T, DendroIntL> > Sorted_approx_Select_skewed(std::vector<T>& arr, unsigned int k, MPI_Comm comm);

  template<typename T>
    std::vector<T> GetRangeMean(std::vector<T>& arr, std::vector<unsigned int> range_min, std::vector<unsigned int> range_max, MPI_Comm comm);

  template<typename T>
    std::vector<T> GuessRangeMedian(std::vector<T>& arr, std::vector<unsigned int> range_min, std::vector<unsigned int> range_max, MPI_Comm comm);

  /**
    @brief A parallel k-selection algorithm.
    @author Hari Sundar
    @date 2013-01-10
    @param arr the array from which samples are to be selected
    @param k number of samples
    @param isSorted if false, arr is locally sorted first.
    @return list of k keys.
    **/
  template<typename T>
    std::vector<T> Sorted_k_Select(std::vector<T>& arr, std::vector<unsigned int> min_idx, std::vector<unsigned int> max_idx, std::vector<DendroIntL> K, std::vector<T> guess, MPI_Comm comm);

  /**
    @brief A parallel hyper quick sort implementation.
    @author Dhairya Malhotra
    @param in the input vector
    @param out the output vector
    @param comm the communicator
    */
  template<typename T>
    int HyperQuickSort(std::vector<T>& in, std::vector<T> & out, MPI_Comm comm);

  template<typename T>
    int HyperQuickSort_kway(std::vector<T>& in, std::vector<T> & out, MPI_Comm comm);

  template<typename T>
    int HyperQuickSort_cav(std::vector<T>& in, std::vector<T> & out, MPI_Comm comm);

  /* memory-efficient (in-place) versions */
  template<typename T>
    int HyperQuickSort(std::vector<T>& arr, MPI_Comm comm);

  template<typename T>
    int HyperQuickSort_kway(std::vector<T>& in, MPI_Comm comm);

  template<typename T>
    int HyperQuickSort_cav(std::vector<T>& in, MPI_Comm comm);

  /**
    @brief A parallel sample sort implementation. In our implementation, we do not impose any
    restriction on the input or the number of processors. This function can be used with an odd number of processors as well.
    Some processors can pass an empty vector as input. If the total number of elements in the vector (globally) is fewer
    than 10*p^2, where p is the number of processors, then we will use bitonic sort instead of sample sort to sort the vector.
    We use a parallel bitonic sort to sort the samples in the sample sort algorithm. Hence, the complexity of the algorithm
    is O((n/p) log(n/p)) + O(p log p). Here, n is the global length of the vector and p is the number of processors.
    @author Hari Sundar
    @param in the input vector
    @param out the output vector
    @param comm the communicator
    */
  template<typename T>
    int sampleSort(std::vector<T>& in, std::vector<T> & out, MPI_Comm comm);

  template<typename T>
    int sampleSort(std::vector<T>& in, MPI_Comm comm);
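  /*
    Example (illustrative sketch only): sorting a distributed array of keys with
    sampleSort. The sketch assumes a built-in type such as 'long' can be sorted
    directly; it says nothing about how the library handles user-defined types.

      #include <vector>
      #include "mpi.h"
      #include "parUtils.h"

      int main(int argc, char** argv) {
        MPI_Init(&argc, &argv);

        std::vector<long> in;      // local keys; may be empty on some ranks
        // ... fill 'in' ...

        std::vector<long> out;     // receives this rank's slice of the globally sorted sequence
        par::sampleSort<long>(in, out, MPI_COMM_WORLD);

        // In-place variant (memory-efficient overload):
        //   par::sampleSort<long>(in, MPI_COMM_WORLD);
        // HyperQuickSort offers the same calling convention:
        //   par::HyperQuickSort<long>(in, out, MPI_COMM_WORLD);

        MPI_Finalize();
        return 0;
      }
  */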
  /**
    @brief Splits a communication group into two, one containing processors that passed a value of 'false'
    for the parameter 'iAmEmpty' and the other containing processors that passed a value of 'true'
    for the parameter. Both groups are sorted in the ascending order of their ranks in the old comm.
    @author Rahul Sampath
    @param iAmEmpty Flag that determines which group the calling processor will be placed in.
    @param orig_comm The comm group that needs to be split.
    @param new_comm The new comm group.
    */
  int splitComm2way(bool iAmEmpty, MPI_Comm* new_comm, MPI_Comm orig_comm);

  /**
    @brief Splits a communication group into two depending on the values in isEmptyList.
    Both groups are sorted in the ascending order of their ranks in the old comm.
    All processors must call this function with the same 'isEmptyList' array.
    @author Rahul Sampath
    @param isEmptyList flags (of length equal to the number of processors) that determine whether each processor is active or not.
    @param orig_comm The comm group that needs to be split.
    @param new_comm The new comm group.
    */
  int splitComm2way(const bool* isEmptyList, MPI_Comm* new_comm, MPI_Comm orig_comm);

  /**
    @author Rahul Sampath
    @brief Splits a communication group into two: processors with
    ranks less than splittingRank form one group and the remaining
    processors form the second group. Both groups are sorted in
    the ascending order of their ranks in the old comm.
    @param splittingRank The rank used for splitting the communicator.
    @param orig_comm The comm group that needs to be split.
    @param new_comm The new comm group.
    */
  int splitCommUsingSplittingRank(int splittingRank, MPI_Comm* new_comm, MPI_Comm orig_comm);

  /**
   * @brief Splits a communication group into two, the first having a power of 2
   * number of processors and the other having the remainder. The first group
   * is sorted in the ascending order of their ranks in the old comm and the second group
   * is sorted in the descending order of their ranks in the old comm.
   * @author Hari Sundar
   * @param orig_comm The comm group that needs to be split.
   * @param new_comm The new comm group.
   */
  unsigned int splitCommBinary( MPI_Comm orig_comm, MPI_Comm* new_comm);


  /**
   * @brief Splits a communication group into two, the first having a power of 2
   * number of processors and the other having the remainder. Both groups
   * are sorted in the ascending order of their ranks in the old comm.
   * @author Hari Sundar
   * @param orig_comm The comm group that needs to be split.
   * @param new_comm The new comm group.
   */
  unsigned int splitCommBinaryNoFlip( MPI_Comm orig_comm, MPI_Comm* new_comm);
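  /*
    Example (illustrative sketch only): separating idle processors from busy ones
    with splitComm2way. What counts as "empty" is up to the caller; here it is
    simply whether the local vector has any elements. The sketch assumes that each
    rank's new communicator refers to the group it was placed in.

      #include <vector>
      #include "mpi.h"
      #include "parUtils.h"

      void workOnActiveRanks(std::vector<double>& localData, MPI_Comm comm) {
        bool iAmEmpty = localData.empty();

        MPI_Comm newComm;
        par::splitComm2way(iAmEmpty, &newComm, comm);

        // The non-empty ranks form one group and the empty ranks form the other,
        // each ordered by the ranks in the original communicator.
        if (!iAmEmpty) {
          // ... collective work among the non-empty processors using newComm ...
        }
      }

    splitCommUsingSplittingRank(splittingRank, &newComm, comm) can be used in the
    same way when the split is simply "ranks below splittingRank" versus the rest.
  */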
  /**
    @author Hari Sundar
    @brief Merges lists A and B, retaining either the low or the high values in list A.

    @param listA Input list, and where the output is stored.
    @param listB Second input list.
    @param KEEP_WHAT determines whether to retain the high or the low values
    from A and B. One of KEEP_HIGH or KEEP_LOW.

    Merging the two lists when their sizes are not the same is a bit involved.
    The major condition that needs to be used is that all elements that are less
    than max(min(A), min(B)) are retained by the KEEP_LOW processor, and
    similarly all elements that are larger than min(max(A), max(B)) are
    retained by the KEEP_HIGH processor.

    The reason for this is that, on the KEEP_LOW side,

      max(min(A), min(B)) > min(A) > max(A-)

    and similarly on the KEEP_HIGH side,

      min(max(A), max(B)) < max(A) < min(A+)

    which guarantees that the merged lists remain bitonic.
    */
  template <typename T>
    void MergeLists( std::vector<T> &listA, std::vector<T> &listB, int KEEP_WHAT) ;

  /**
    @author Hari Sundar
    @brief The main operation in the parallel bitonic sort algorithm. This implements the compare-split operation.
    @param which_keys is one of KEEP_HIGH or KEEP_LOW
    @param partner is the processor with which to Merge and Split.
    @param local_list the input vector
    @param comm the communicator
    */
  template <typename T>
    void MergeSplit( std::vector<T> &local_list, int which_keys, int partner, MPI_Comm comm);

  /**
    @author Hari Sundar
    */
  template <typename T>
    void Par_bitonic_sort_incr( std::vector<T> &local_list, int proc_set_size, MPI_Comm comm );

  /**
    @author Hari Sundar
    */
  template <typename T>
    void Par_bitonic_sort_decr( std::vector<T> &local_list, int proc_set_size, MPI_Comm comm);

  /**
    @author Hari Sundar
    */
  template <typename T>
    void Par_bitonic_merge_incr( std::vector<T> &local_list, int proc_set_size, MPI_Comm comm );

  /**
    @brief An implementation of parallel bitonic sort that expects the number of processors to be a power of 2.
    However, unlike most implementations, we do not expect the length of the vector
    (either locally or globally) to be a power of 2, or even to be an even number. Moreover, each processor can call
    this with a different number of elements. However, we do expect 'in' to have at least 1
    element on each processor.
    @param in the vector to be sorted
    @author Hari Sundar
    */
  template <typename T>
    void bitonicSort_binary(std::vector<T> & in, MPI_Comm comm) ;

  /**
    @brief An implementation of parallel bitonic sort that does not expect the number of processors to
    be a power of 2. In fact, the number of processors can even be odd.
    We also do not expect the length of the vector
    (either locally or globally) to be a power of 2, or even to be an even number. Moreover, each processor can call
    this with a different number of elements. However, we do expect 'in' to have at least 1
    element on each processor. This recursively calls the function bitonicSort_binary, followed by a
    special parallel merge.
    @param in the vector to be sorted
    @author Hari Sundar
    @see bitonicSort_binary
    */
  template <typename T>
    void bitonicSort(std::vector<T> & in, MPI_Comm comm) ;

}//end namespace

#include "parUtils.tcc"

#endif