
/**
  @file parUtils.h
  @brief A set of parallel utilities.
  @author Hari Sundar, hsundar@gmail.com
  */

#ifndef __PAR_UTILS_H_
#define __PAR_UTILS_H_

#define KEEP_HIGH 100
#define KEEP_LOW  101

#ifdef __DEBUG__
#ifndef __DEBUG_PAR__
#define __DEBUG_PAR__
#endif
#endif

#include "mpi.h"
#include <vector>

#ifdef __USE_64_BIT_INT__
#define DendroIntL long long
#define DendroIntLSpecifier %lld
#define DendroUIntLSpecifier %llu
#else
#define DendroIntL int
#define DendroIntLSpecifier %d
#define DendroUIntLSpecifier %u
#endif

/**
  @namespace par
  @author Hari Sundar   hsundar@gmail.com
  @brief Collection of Generic Parallel Functions: Sorting, Partitioning, Searching,...
  */
namespace par {

  template <typename T>
    int Mpi_Isend(T* buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request* request);

  template <typename T>
    int Mpi_Issend(T* buf, int count, int dest, int tag, MPI_Comm comm, MPI_Request* request);

  template <typename T>
    int Mpi_Recv(T* buf, int count, int source, int tag, MPI_Comm comm, MPI_Status* status);

  template <typename T>
    int Mpi_Irecv(T* buf, int count, int source, int tag, MPI_Comm comm, MPI_Request* request);

  template <typename T>
    int Mpi_Gather( T* sendBuffer, T* recvBuffer, int count, int root, MPI_Comm comm);

  template <typename T, typename S>
    int Mpi_Sendrecv( T* sendBuf, int sendCount, int dest, int sendTag,
        S* recvBuf, int recvCount, int source, int recvTag,
        MPI_Comm comm, MPI_Status* status);

  template <typename T>
    int Mpi_Bcast( T* buffer, int count, int root, MPI_Comm comm);

  template <typename T>
    int Mpi_Scan( T* sendbuf, T* recvbuf, int count, MPI_Op op, MPI_Comm comm);

  template <typename T>
    int Mpi_Reduce( T* sendbuf, T* recvbuf, int count, MPI_Op op, int root, MPI_Comm comm);

  template <typename T>
    int Mpi_Allreduce( T* sendbuf, T* recvbuf, int count, MPI_Op op, MPI_Comm comm);

  template <typename T>
    int Mpi_Alltoall(T* sendbuf, T* recvbuf, int count, MPI_Comm comm);

  template <typename T>
    int Mpi_Allgatherv(T* sendbuf, int sendcount, T* recvbuf,
        int* recvcounts, int* displs, MPI_Comm comm);

  template <typename T>
    int Mpi_Allgather(T* sendbuf, T* recvbuf, int count, MPI_Comm comm);

  template <typename T>
    int Mpi_Alltoallv_sparse(T* sendbuf, int* sendcnts, int* sdispls,
        T* recvbuf, int* recvcnts, int* rdispls, MPI_Comm comm);

  template <typename T>
    int Mpi_Alltoallv_dense(T* sendbuf, int* sendcnts, int* sdispls,
        T* recvbuf, int* recvcnts, int* rdispls, MPI_Comm comm);
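
  /*
    The Mpi_* templates above are typed wrappers around the MPI calls of the
    same name: the MPI datatype does not appear in the argument list and is
    derived from the template parameter T, so they are invoked like the
    underlying MPI routines minus the datatype argument. A minimal usage
    sketch (an illustration only; assumes MPI has been initialized and that
    the wrappers are instantiated for the types used):

      int npes;
      MPI_Comm_size(MPI_COMM_WORLD, &npes);

      double localSum = 1.0, globalSum = 0.0;
      par::Mpi_Allreduce<double>(&localSum, &globalSum, 1, MPI_SUM, MPI_COMM_WORLD);

      std::vector<int> sendBuf(npes, 1), recvBuf(npes);
      par::Mpi_Alltoall<int>(&(*(sendBuf.begin())), &(*(recvBuf.begin())), 1, MPI_COMM_WORLD);
  */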


  template<typename T>
    unsigned int defaultWeight(const T *a);

  /**
    @brief A parallel weighted partitioning function. In our implementation, we do not pose any
    restriction on the input or the number of processors. This function can be used with an odd number of processors as well.
    Some processors can pass an empty vector as input. The relative ordering of the elements is preserved.
    @author Hari Sundar
    @author Rahul Sampath
    @param vec the input vector
    @param getWeight function pointer to compute the weight of each element. If you pass NULL,
    then every element will get a weight equal to 1.
    @param comm the communicator
    */
  template<typename T>
    int partitionW(std::vector<T>& vec,
        unsigned int (*getWeight)(const T *), MPI_Comm comm);
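
  /*
    Usage sketch for partitionW (an illustration; 'Item' and 'itemWeight' are
    hypothetical caller-side definitions, not part of this header):

      struct Item { double key; unsigned int work; };

      unsigned int itemWeight(const Item* it) { return it->work; }

      std::vector<Item> myItems;                 // this rank's local elements
      // ... fill 'myItems' ...
      par::partitionW<Item>(myItems, itemWeight, MPI_COMM_WORLD);
      // 'myItems' is repartitioned so that the total weight per process is
      // approximately equal; passing NULL for the weight function gives every
      // element a weight of 1, i.e. a plain element-count partition.
  */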


  template<typename T>
    void rankSamples(std::vector<T>& arr, std::vector<T> samples, MPI_Comm comm);

  template<typename T>
    std::vector<T> Sorted_Sample_Select(std::vector<T>& arr, unsigned int kway,
        std::vector<unsigned int>& min_idx, std::vector<unsigned int>& max_idx,
        std::vector<DendroIntL>& splitter_ranks, MPI_Comm comm);

  template<typename T>
    std::vector<T> Sorted_approx_Select(std::vector<T>& arr, unsigned int k, MPI_Comm comm);

  //! Variant of Sorted_approx_Select that handles skewed input distributions.
  template<typename T>
    std::vector<std::pair<T, DendroIntL> > Sorted_approx_Select_skewed(std::vector<T>& arr,
        unsigned int k, MPI_Comm comm);

  template<typename T>
    std::vector<T> GetRangeMean(std::vector<T>& arr, std::vector<unsigned int> range_min,
        std::vector<unsigned int> range_max, MPI_Comm comm);

  template<typename T>
    std::vector<T> GuessRangeMedian(std::vector<T>& arr, std::vector<unsigned int> range_min,
        std::vector<unsigned int> range_max, MPI_Comm comm);

  /**
    @brief A parallel k-selection algorithm.
    @author Hari Sundar
    @date 2013-01-10
    @param arr      the (locally sorted) array from which the keys are selected
    @param min_idx  per-key lower bounds of the local search ranges
    @param max_idx  per-key upper bounds of the local search ranges
    @param K        the ranks of the keys to be selected
    @param guess    initial guesses for the selected keys
    @param comm     the communicator
    @return the list of selected keys.
    */
  template<typename T>
    std::vector<T> Sorted_k_Select(std::vector<T>& arr, std::vector<unsigned int> min_idx,
        std::vector<unsigned int> max_idx, std::vector<DendroIntL> K, std::vector<T> guess,
        MPI_Comm comm);

  /**
    @brief A parallel hyper quick sort implementation.
    @author Dhairya Malhotra
    @param in the input vector
    @param out the output vector
    @param comm the communicator
    */
  template<typename T>
    int HyperQuickSort(std::vector<T>& in, std::vector<T> & out, MPI_Comm comm);
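
  /*
    Usage sketch (an illustration; assumes the element type supports the
    comparison operators and that MPI has been initialized):

      std::vector<double> in;                    // this rank's unsorted keys
      // ... fill 'in' ...
      std::vector<double> out;
      par::HyperQuickSort<double>(in, out, MPI_COMM_WORLD);
      // 'out' holds this rank's portion of the globally sorted sequence.
  */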

  template<typename T>
    int HyperQuickSort_kway(std::vector<T>& in, std::vector<T> & out, MPI_Comm comm);

  template<typename T>
    int HyperQuickSort_cav(std::vector<T>& in, std::vector<T> & out, MPI_Comm comm);

  /* Memory-efficient versions: the input vector is sorted in place. */
  template<typename T>
    int HyperQuickSort(std::vector<T>& arr, MPI_Comm comm);

  template<typename T>
    int HyperQuickSort_kway(std::vector<T>& in, MPI_Comm comm);

  template<typename T>
    int HyperQuickSort_cav(std::vector<T>& in, MPI_Comm comm);
  /**
    @brief A parallel sample sort implementation. In our implementation, we do not pose any
    restriction on the input or the number of processors. This function can be used with an odd number of processors as well.
    Some processors can pass an empty vector as input. If the total (global) number of elements in the vector is fewer
    than 10*p^2, where p is the number of processors, then we use bitonic sort instead of sample sort to sort the vector.
    We use a parallel bitonic sort to sort the samples in the sample sort algorithm. Hence, the complexity of the algorithm
    is O((n/p) log(n/p)) + O(p log p), where n is the global length of the vector and p is the number of processors.
    @author Hari Sundar
    @param in the input vector
    @param out the output vector
    @param comm the communicator
    */
  template<typename T>
    int sampleSort(std::vector<T>& in, std::vector<T> & out, MPI_Comm comm);

  template<typename T>
    int sampleSort(std::vector<T>& in, MPI_Comm comm);
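
  /*
    Usage sketch (an illustration):

      std::vector<DendroIntL> in;                // this rank's unsorted keys
      // ... fill 'in' ...
      std::vector<DendroIntL> out;
      par::sampleSort<DendroIntL>(in, out, MPI_COMM_WORLD);
      // After the call, every key in 'out' on rank r is <= every key held on
      // rank r+1, so the concatenation across ranks is globally sorted.

      // The single-vector overload declared above operates on 'in' directly:
      par::sampleSort<DendroIntL>(in, MPI_COMM_WORLD);
  */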

  /**
    @brief Splits a communication group into two: one containing the processors that passed 'false'
    for the parameter 'iAmEmpty' and the other containing the processors that passed 'true'
    for the parameter. Both groups are sorted in the ascending order of their ranks in the old comm.
    @author Rahul Sampath
    @param iAmEmpty     Flag that determines which of the two groups the calling processor joins.
    @param orig_comm    The comm group that needs to be split.
    @param new_comm     The new comm group.
    */
  int splitComm2way(bool iAmEmpty, MPI_Comm* new_comm, MPI_Comm orig_comm);
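
  /*
    Usage sketch (an illustration): ranks that currently hold no elements drop
    into their own sub-communicator so the active ranks can continue alone.
    'myVec' is a placeholder for this rank's local data.

      bool iAmEmpty = myVec.empty();
      MPI_Comm newComm;
      par::splitComm2way(iAmEmpty, &newComm, MPI_COMM_WORLD);
      // 'newComm' now contains only the ranks that passed the same value of
      // 'iAmEmpty' as this process, ordered by their original ranks.
  */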

  /**
    @brief Splits a communication group into two depending on the values in isEmptyList.
    Both groups are sorted in the ascending order of their ranks in the old comm.
    All processors must call this function with the same 'isEmptyList' array.
    @author Rahul Sampath
    @param isEmptyList  flags (of length equal to the number of processors) that determine whether each processor is active or not.
    @param orig_comm    The comm group that needs to be split.
    @param new_comm     The new comm group.
    */
  int splitComm2way(const bool* isEmptyList, MPI_Comm* new_comm, MPI_Comm orig_comm);

  /**
     @author Rahul Sampath
     @brief Splits a communication group into two: processors with
     ranks less than splittingRank form one group and the remaining
     processors form the second group. Both groups are sorted in
     the ascending order of their ranks in the old comm.
     @param splittingRank The rank used for splitting the communicator
     @param orig_comm    The comm group that needs to be split.
     @param new_comm     The new comm group.
     */
  int splitCommUsingSplittingRank(int splittingRank, MPI_Comm* new_comm, MPI_Comm orig_comm);
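
  /*
    Usage sketch (an illustration): keep only the first 'activeNpes' ranks
    active, where 'activeNpes' is chosen by the caller.

      int rank, npes;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &npes);

      int activeNpes = npes / 2;               // example splitting rank
      MPI_Comm newComm;
      par::splitCommUsingSplittingRank(activeNpes, &newComm, MPI_COMM_WORLD);
      // Ranks [0, activeNpes) form one sub-communicator and ranks
      // [activeNpes, npes) the other; 'newComm' on this process is the
      // sub-communicator that contains its own rank.
  */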

  /**
   * @brief Splits a communication group into two, the first having a power-of-2
   * number of processors and the other having the remaining processors. The first group
   * is sorted in the ascending order of their ranks in the old comm and the second group
   * is sorted in the descending order of their ranks in the old comm.
   * @author Hari Sundar
   * @param orig_comm    The comm group that needs to be split.
   * @param new_comm     The new comm group.
   */
  unsigned int splitCommBinary( MPI_Comm orig_comm, MPI_Comm* new_comm);


  /**
   * @brief Splits a communication group into two, the first having a power-of-2
   * number of processors and the other having the remaining processors. Both groups
   * are sorted in the ascending order of their ranks in the old comm.
   * @author Hari Sundar
   * @param orig_comm    The comm group that needs to be split.
   * @param new_comm     The new comm group.
   */
  unsigned int splitCommBinaryNoFlip( MPI_Comm orig_comm, MPI_Comm* new_comm);

  /**
   * @author Hari Sundar
   * @brief Merges lists A and B, retaining either the low or the high values in list A.
   *
   * @param listA     Input list, and where the output is stored.
   * @param listB     Second input list.
   * @param KEEP_WHAT Determines whether to retain the high or the low values
   *                  from A and B. One of KEEP_HIGH or KEEP_LOW.
   *
   * Merging the two lists when their sizes are not the same is a bit involved.
   * The main condition that needs to be used is that all elements that are less
   * than max(min(A), min(B)) are retained by the KEEP_LOW processor, and
   * similarly all elements that are larger than min(max(A), max(B)) are
   * retained by the KEEP_HIGH processor.
   *
   * The reason for this is that, on the KEEP_LOW side,
   *
   *   max(min(A), min(B)) > min(A) > max(A-)
   *
   * and similarly on the KEEP_HIGH side,
   *
   *   min(max(A), max(B)) < max(A) < min(A+)
   *
   * which guarantees that the merged lists remain bitonic.
   */
  template <typename T>
    void MergeLists( std::vector<T> &listA, std::vector<T> &listB, int KEEP_WHAT);
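
  /*
    Worked illustration of the rule above (an example, not taken from the code):
    suppose A = {1, 4, 7, 9} and B = {2, 3, 8, 10}.

      max(min(A), min(B)) = max(1, 2) = 2   -> 1 is certainly kept on the KEEP_LOW side
      min(max(A), max(B)) = min(9, 10) = 9  -> 10 is certainly kept on the KEEP_HIGH side

    After the merge, the KEEP_LOW call ends up with {1, 2, 3, 4} in listA and
    the KEEP_HIGH call with {7, 8, 9, 10}, i.e. the |A| smallest and the |A|
    largest of the combined elements, respectively.
  */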

  /**
    @author Hari Sundar
    @brief The main operation in the parallel bitonic sort algorithm. This implements the compare-split operation.
    @param which_keys is one of KEEP_HIGH or KEEP_LOW
    @param partner    is the processor with which to merge and split.
    @param local_list the input vector
    @param comm the communicator
    */
  template <typename T>
    void MergeSplit( std::vector<T> &local_list, int which_keys, int partner, MPI_Comm  comm);

  /**
    @author Hari Sundar
    */
  template <typename T>
    void Par_bitonic_sort_incr( std::vector<T> &local_list, int proc_set_size, MPI_Comm  comm );

  /**
    @author Hari Sundar
    */
  template <typename T>
    void Par_bitonic_sort_decr( std::vector<T> &local_list, int proc_set_size, MPI_Comm  comm);

  /**
    @author Hari Sundar
    */
  template <typename T>
    void Par_bitonic_merge_incr( std::vector<T> &local_list, int proc_set_size, MPI_Comm  comm );
  /**
    @brief An implementation of parallel bitonic sort that expects the number of processors to be a power of 2.
    However, unlike most implementations, we do not expect the length of the vector
    (either locally or globally) to be a power of 2 or even. Moreover, each processor can call
    this with a different number of elements. However, we do expect 'in' to have at least 1
    element on each processor.
    @param in the vector to be sorted
    @author Hari Sundar
    */
  template <typename T>
    void bitonicSort_binary(std::vector<T> & in, MPI_Comm comm);

  /**
    @brief An implementation of parallel bitonic sort that does not expect the number of processors to
    be a power of 2. In fact, the number of processors can even be odd.
    We do not expect the length of the vector
    (either locally or globally) to be a power of 2 or even, and each processor can call
    this with a different number of elements. However, we do expect 'in' to have at least 1
    element on each processor. This recursively calls the function bitonicSort_binary, followed by a
    special parallel merge.
    @param in  the vector to be sorted
    @author Hari Sundar
    @see bitonicSort_binary
    */
  template <typename T>
    void bitonicSort(std::vector<T> & in, MPI_Comm comm);
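
  /*
    Usage sketch (an illustration; every rank must contribute at least one
    element, as noted above):

      std::vector<unsigned int> in;              // this rank's keys
      // ... fill 'in' ...
      par::bitonicSort<unsigned int>(in, MPI_COMM_WORLD);
      // 'in' is sorted in place on each rank, and the concatenation of the
      // per-rank vectors (in rank order) is globally sorted.
  */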

}//end namespace

#include "parUtils.tcc"

#endif
