1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 */
5
6 #include "mpiimpl.h"
7 #include "recexchalgo.h"
8
MPII_Recexchalgo_init(void)9 int MPII_Recexchalgo_init(void)
10 {
11 int mpi_errno = MPI_SUCCESS;
12
13 return mpi_errno;
14 }
15
16
MPII_Recexchalgo_comm_init(MPIR_Comm * comm)17 int MPII_Recexchalgo_comm_init(MPIR_Comm * comm)
18 {
19 int mpi_errno = MPI_SUCCESS;
20
21 return mpi_errno;
22 }
23
24
MPII_Recexchalgo_comm_cleanup(MPIR_Comm * comm)25 int MPII_Recexchalgo_comm_cleanup(MPIR_Comm * comm)
26 {
27 int mpi_errno = MPI_SUCCESS;
28
29 return mpi_errno;
30 }
31
32
33 /* This function calculates the ranks to/from which the
34 * data is sent/recvd in various steps/phases of recursive exchange
35 * algorithm. Recursive exchange is divided into three steps - Step 1, 2 and 3.
36 * Step 2 is the main step that does the recursive exchange algorithm.
37 * Step 1 and Step 3 are done to handle non-power-of-k number of ranks.
38 * Only p_of_k (largest power of k that is less than n) ranks participate in Step 2.
39 * In Step 1, the non partcipating ranks send their data to ranks
40 * participating in Step 2. In Step 3, the ranks that participated in Step 2 send
41 * the final data to non-partcipating ranks.
42 */
MPII_Recexchalgo_get_neighbors(int rank,int nranks,int * k_,int * step1_sendto,int ** step1_recvfrom_,int * step1_nrecvs,int *** step2_nbrs_,int * step2_nphases,int * p_of_k_,int * T_)43 int MPII_Recexchalgo_get_neighbors(int rank, int nranks, int *k_,
44 int *step1_sendto, int **step1_recvfrom_, int *step1_nrecvs,
45 int ***step2_nbrs_, int *step2_nphases, int *p_of_k_, int *T_)
46 {
47 int mpi_errno = MPI_SUCCESS;
48 int i, j, k;
49 int p_of_k = 1, log_p_of_k = 0, rem, T, newrank;
50 int **step2_nbrs;
51 int *step1_recvfrom;
52
53 MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_RECEXCHALGO_GET_NEIGHBORS);
54 MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_RECEXCHALGO_GET_NEIGHBORS);
55
56 k = *k_;
57 if (nranks < k) /* If size of the communicator is less than k, reduce the value of k */
58 k = (nranks > 2) ? nranks : 2;
59 *k_ = k;
60
61 /* Calculate p_of_k, p_of_k is the largest power of k that is less than nranks */
62 while (p_of_k <= nranks) {
63 p_of_k *= k;
64 log_p_of_k++;
65 }
66 p_of_k /= k;
67 log_p_of_k--;
68
69 MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE,
70 (MPL_DBG_FDEST, "allocate memory for storing communication pattern"));
71 step1_recvfrom = *step1_recvfrom_ = (int *) MPL_malloc(sizeof(int) * (k - 1), MPL_MEM_COLL);
72 step2_nbrs = *step2_nbrs_ = (int **) MPL_malloc(sizeof(int *) * log_p_of_k, MPL_MEM_COLL);
73 MPIR_Assert(step1_recvfrom != NULL && *step1_recvfrom_ != NULL && step2_nbrs != NULL &&
74 *step2_nbrs_ != NULL);
75
76 for (i = 0; i < log_p_of_k; i++) {
77 (*step2_nbrs_)[i] = (int *) MPL_malloc(sizeof(int) * (k - 1), MPL_MEM_COLL);
78 }
79
80 *step2_nphases = log_p_of_k;
81
82 rem = nranks - p_of_k;
83 /* rem is the number of ranks that do not particpate in Step 2
84 * We need to identify these non-participating ranks. This is done in the following way.
85 * The first T ranks are divided into sets of k consecutive ranks each.
86 * In each of these sets, the first k-1 ranks are the non-participating
87 * ranks while the last rank is the participating rank.
88 * The non-participating ranks send their data to the participating rank
89 * in their corresponding set.
90 */
91 T = (rem * k) / (k - 1);
92 *T_ = T;
93 *p_of_k_ = p_of_k;
94
95
96 MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE,
97 (MPL_DBG_FDEST, "step 1 nbr calculation started. T is %d", T));
98 *step1_nrecvs = 0;
99 *step1_sendto = -1;
100
101 /* Step 1 */
102 if (rank < T) {
103 if (rank % k != (k - 1)) { /* I am a non-participating rank */
104 *step1_sendto = rank + (k - 1 - rank % k); /* partipating rank to send the data to */
105 /* if the corresponding participating rank is not in T,
106 * then send to the Tth rank to preserve non-commutativity */
107 if (*step1_sendto > T - 1)
108 *step1_sendto = T;
109 newrank = -1; /* tag this rank as non-participating */
110 } else { /* participating rank */
111 for (i = 0; i < k - 1; i++) {
112 step1_recvfrom[i] = rank - i - 1;
113 }
114 *step1_nrecvs = k - 1;
115 newrank = rank / k; /* this is the new rank amongst the set of participating ranks */
116 }
117 } else { /* rank >= T */
118 newrank = rank - rem;
119
120 if (rank == T && (T - 1) % k != k - 1 && T >= 1) {
121 int nsenders = (T - 1) % k + 1; /* number of ranks sending their data to me in Step 1 */
122
123 for (j = nsenders - 1; j >= 0; j--) {
124 step1_recvfrom[nsenders - 1 - j] = T - nsenders + j;
125 }
126 *step1_nrecvs = nsenders;
127 }
128 }
129
130 MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, "step 1 nbr computation completed"));
131
132 /* Step 2 */
133 if (*step1_sendto == -1) { /* calulate step2_nbrs only for participating ranks */
134 int *digit = (int *) MPL_malloc(sizeof(int) * log_p_of_k, MPL_MEM_COLL);
135 MPIR_Assert(digit != NULL);
136 int temprank = newrank;
137 int mask = 0x1;
138 int phase = 0, cbit, cnt, nbr, power;
139
140 /* calculate the digits in base k representation of newrank */
141 for (i = 0; i < log_p_of_k; i++)
142 digit[i] = 0;
143
144 int remainder, i_digit = 0;
145 while (temprank != 0) {
146 remainder = temprank % k;
147 temprank = temprank / k;
148 digit[i_digit] = remainder;
149 i_digit++;
150 }
151
152 while (mask < p_of_k) {
153 cbit = digit[phase]; /* phase_th digit changes in this phase, obtain its original value */
154 cnt = 0;
155 for (i = 0; i < k; i++) { /* there are k-1 neighbors */
156 if (i != cbit) { /* do not generate yourself as your nieighbor */
157 digit[phase] = i; /* this gets us the base k representation of the neighbor */
158
159 /* calculate the base 10 value of the neighbor rank */
160 nbr = 0;
161 power = 1;
162 for (j = 0; j < log_p_of_k; j++) {
163 nbr += digit[j] * power;
164 power *= k;
165 }
166
167 /* calculate its real rank and store it */
168 step2_nbrs[phase][cnt] =
169 (nbr < rem / (k - 1)) ? (nbr * k) + (k - 1) : nbr + rem;
170 MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE,
171 (MPL_DBG_FDEST, "step2_nbrs[%d][%d] is %d", phase, cnt,
172 step2_nbrs[phase][cnt]));
173 cnt++;
174 }
175 }
176 MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE,
177 (MPL_DBG_FDEST, "step 2, phase %d nbr calculation completed", phase));
178 digit[phase] = cbit; /* reset the digit to original value */
179 phase++;
180 mask *= k;
181 }
182
183 MPL_free(digit);
184 }
185
186 MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_RECEXCHALGO_GET_NEIGHBORS);
187
188 return mpi_errno;
189 }
190
191
MPII_Recexchalgo_origrank_to_step2rank(int rank,int rem,int T,int k)192 int MPII_Recexchalgo_origrank_to_step2rank(int rank, int rem, int T, int k)
193 {
194 int step2rank;
195
196 MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_RECEXCHALGO_ORIGRANK_TO_STEP2RANK);
197 MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_RECEXCHALGO_ORIGRANK_TO_STEP2RANK);
198
199 step2rank = (rank < T) ? rank / k : rank - rem;
200
201 MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_RECEXCHALGO_ORIGRANK_TO_STEP2RANK);
202
203 return step2rank;
204 }
205
206
MPII_Recexchalgo_step2rank_to_origrank(int rank,int rem,int T,int k)207 int MPII_Recexchalgo_step2rank_to_origrank(int rank, int rem, int T, int k)
208 {
209 int orig_rank;
210
211 MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_RECEXCHALGO_STEP2RANK_TO_ORIGRANK);
212 MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_RECEXCHALGO_STEP2RANK_TO_ORIGRANK);
213
214 orig_rank = (rank < rem / (k - 1)) ? (rank * k) + (k - 1) : rank + rem;
215
216 MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_RECEXCHALGO_STEP2RANK_TO_ORIGRANK);
217
218 return orig_rank;
219 }
220
221
222 /* This function calculates the offset and count for send and receive for a given
223 * phase in recursive exchange algorithms in collective operations like Allgather,
224 * Allgatherv, Reducescatter.
225 */
MPII_Recexchalgo_get_count_and_offset(int rank,int phase,int k,int nranks,int * count,int * offset)226 int MPII_Recexchalgo_get_count_and_offset(int rank, int phase, int k, int nranks, int *count,
227 int *offset)
228 {
229 int mpi_errno = MPI_SUCCESS;
230 int step2rank, min, max, orig_max, orig_min;
231 int k_power_phase = 1;
232 int p_of_k = 1, rem, T;
233
234 MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_RECEXCHALGO_GET_COUNT_AND_OFFSET);
235 MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_RECEXCHALGO_GET_COUNT_AND_OFFSET);
236
237 /* p_of_k is the largest power of k that is less than nranks */
238 while (p_of_k <= nranks) {
239 p_of_k *= k;
240 }
241 p_of_k /= k;
242
243 rem = nranks - p_of_k;
244 T = (rem * k) / (k - 1);
245
246 /* k_power_phase is k^phase */
247 while (phase > 0) {
248 k_power_phase *= k;
249 phase--;
250 }
251 /* Calculate rank in step2 */
252 step2rank = MPII_Recexchalgo_origrank_to_step2rank(rank, rem, T, k);
253 /* Calculate min and max ranks of the range of ranks that 'rank'
254 * represents in phase 'phase' */
255 min = ((step2rank / k_power_phase) * k_power_phase) - 1;
256 max = min + k_power_phase;
257 /* convert (min,max] to their original ranks */
258 orig_min = (min >= 0) ? MPII_Recexchalgo_step2rank_to_origrank(min, rem, T, k) : min;
259 orig_max = MPII_Recexchalgo_step2rank_to_origrank(max, rem, T, k);
260 *count = orig_max - orig_min;
261 *offset = orig_min + 1;
262
263 MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_RECEXCHALGO_GET_COUNT_AND_OFFSET);
264
265 return mpi_errno;
266 }
267
268
269 /* This function calculates the digit reversed (in base 'k' representation) rank of a given rank participating in Step 2. It does so in the following steps:
270 * 1. Converts the given rank to its Step2 rank
271 * 2. Calculates the digit reversed (in base 'k' representation) rank of the Step 2 rank
272 * 3. Convert the digit reversed rank in the previous Step to the original rank.
273 */
MPII_Recexchalgo_reverse_digits_step2(int rank,int comm_size,int k)274 int MPII_Recexchalgo_reverse_digits_step2(int rank, int comm_size, int k)
275 {
276 int i, T, rem, power, step2rank, step2_reverse_rank = 0;
277 int pofk = 1, log_pofk = 0;
278 int *digit, *digit_reverse;
279 int mpi_errno = MPI_SUCCESS;
280 MPIR_CHKLMEM_DECL(2);
281
282 MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_RECEXCHALGO_REVERSE_DIGITS_STEP2);
283 MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_RECEXCHALGO_REVERSE_DIGITS_STEP2);
284
285 while (pofk <= comm_size) {
286 pofk *= k;
287 log_pofk++;
288 }
289 MPIR_Assert(log_pofk > 0);
290 pofk /= k;
291 log_pofk--;
292
293 rem = comm_size - pofk;
294 T = (rem * k) / (k - 1);
295
296 /* step2rank is the rank in the particiapting ranks group of recursive exchange step2 */
297 step2rank = MPII_Recexchalgo_origrank_to_step2rank(rank, rem, T, k);
298
299 /* calculate the digits in base k representation of step2rank */
300 MPIR_CHKLMEM_MALLOC(digit, int *, sizeof(int) * log_pofk,
301 mpi_errno, "digit buffer", MPL_MEM_COLL);
302 MPIR_CHKLMEM_MALLOC(digit_reverse, int *, sizeof(int) * log_pofk,
303 mpi_errno, "digit_reverse buffer", MPL_MEM_COLL);
304 for (i = 0; i < log_pofk; i++)
305 digit[i] = 0;
306
307 int remainder, i_digit = 0;
308 while (step2rank != 0) {
309 remainder = step2rank % k;
310 step2rank = step2rank / k;
311 digit[i_digit] = remainder;
312 i_digit++;
313 }
314
315 /* reverse the number in base k representation to get the step2_reverse_rank
316 * which is the reversed rank in the participating ranks group of recursive exchange step2
317 */
318 for (i = 0; i < log_pofk; i++)
319 digit_reverse[i] = digit[log_pofk - 1 - i];
320 /*calculate the base 10 value of the reverse rank */
321 step2_reverse_rank = 0;
322 power = 1;
323 for (i = 0; i < log_pofk; i++) {
324 step2_reverse_rank += digit_reverse[i] * power;
325 power *= k;
326 }
327
328 /* calculate the actual rank from logical rank */
329 step2_reverse_rank = MPII_Recexchalgo_step2rank_to_origrank(step2_reverse_rank, rem, T, k);
330 MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE,
331 (MPL_DBG_FDEST, "reverse_rank is %d", step2_reverse_rank));
332
333 MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_RECEXCHALGO_REVERSE_DIGITS_STEP2);
334
335 fn_exit:
336 MPIR_CHKLMEM_FREEALL();
337 return step2_reverse_rank;
338 fn_fail:
339 /* TODO - Replace this with real error handling */
340 MPIR_Assert(MPI_SUCCESS == mpi_errno);
341 goto fn_exit;
342 }
343