1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #include "mpiimpl.h"
7 #include "recexchalgo.h"
8 
MPII_Recexchalgo_init(void)9 int MPII_Recexchalgo_init(void)
10 {
11     int mpi_errno = MPI_SUCCESS;
12 
13     return mpi_errno;
14 }
15 
16 
MPII_Recexchalgo_comm_init(MPIR_Comm * comm)17 int MPII_Recexchalgo_comm_init(MPIR_Comm * comm)
18 {
19     int mpi_errno = MPI_SUCCESS;
20 
21     return mpi_errno;
22 }
23 
24 
MPII_Recexchalgo_comm_cleanup(MPIR_Comm * comm)25 int MPII_Recexchalgo_comm_cleanup(MPIR_Comm * comm)
26 {
27     int mpi_errno = MPI_SUCCESS;
28 
29     return mpi_errno;
30 }
31 
32 
33 /* This function calculates the ranks to/from which the
34  * data is sent/recvd in various steps/phases of recursive exchange
35  * algorithm. Recursive exchange is divided into three steps - Step 1, 2 and 3.
36  * Step 2 is the main step that does the recursive exchange algorithm.
37  * Step 1 and Step 3 are done to handle non-power-of-k number of ranks.
38  * Only p_of_k (largest power of k that is less than n) ranks participate in Step 2.
39  * In Step 1, the non partcipating ranks send their data to ranks
40  * participating in Step 2. In Step 3, the ranks that participated in Step 2 send
41  * the final data to non-partcipating ranks.
42 */
MPII_Recexchalgo_get_neighbors(int rank,int nranks,int * k_,int * step1_sendto,int ** step1_recvfrom_,int * step1_nrecvs,int *** step2_nbrs_,int * step2_nphases,int * p_of_k_,int * T_)43 int MPII_Recexchalgo_get_neighbors(int rank, int nranks, int *k_,
44                                    int *step1_sendto, int **step1_recvfrom_, int *step1_nrecvs,
45                                    int ***step2_nbrs_, int *step2_nphases, int *p_of_k_, int *T_)
46 {
47     int mpi_errno = MPI_SUCCESS;
48     int i, j, k;
49     int p_of_k = 1, log_p_of_k = 0, rem, T, newrank;
50     int **step2_nbrs;
51     int *step1_recvfrom;
52 
53     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_RECEXCHALGO_GET_NEIGHBORS);
54     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_RECEXCHALGO_GET_NEIGHBORS);
55 
56     k = *k_;
57     if (nranks < k)     /* If size of the communicator is less than k, reduce the value of k */
58         k = (nranks > 2) ? nranks : 2;
59     *k_ = k;
60 
61     /* Calculate p_of_k, p_of_k is the largest power of k that is less than nranks */
62     while (p_of_k <= nranks) {
63         p_of_k *= k;
64         log_p_of_k++;
65     }
66     p_of_k /= k;
67     log_p_of_k--;
68 
69     MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE,
70                     (MPL_DBG_FDEST, "allocate memory for storing communication pattern"));
71     step1_recvfrom = *step1_recvfrom_ = (int *) MPL_malloc(sizeof(int) * (k - 1), MPL_MEM_COLL);
72     step2_nbrs = *step2_nbrs_ = (int **) MPL_malloc(sizeof(int *) * log_p_of_k, MPL_MEM_COLL);
73     MPIR_Assert(step1_recvfrom != NULL && *step1_recvfrom_ != NULL && step2_nbrs != NULL &&
74                 *step2_nbrs_ != NULL);
75 
76     for (i = 0; i < log_p_of_k; i++) {
77         (*step2_nbrs_)[i] = (int *) MPL_malloc(sizeof(int) * (k - 1), MPL_MEM_COLL);
78     }
79 
80     *step2_nphases = log_p_of_k;
81 
82     rem = nranks - p_of_k;
83     /* rem is the number of ranks that do not particpate in Step 2
84      * We need to identify these non-participating ranks. This is done in the following way.
85      * The first T ranks are divided into sets of k consecutive ranks each.
86      * In each of these sets, the first k-1 ranks are the non-participating
87      * ranks while the last rank is the participating rank.
88      * The non-participating ranks send their data to the participating rank
89      * in their corresponding set.
90      */
91     T = (rem * k) / (k - 1);
92     *T_ = T;
93     *p_of_k_ = p_of_k;
94 
95 
96     MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE,
97                     (MPL_DBG_FDEST, "step 1 nbr calculation started. T is %d", T));
98     *step1_nrecvs = 0;
99     *step1_sendto = -1;
100 
101     /* Step 1 */
102     if (rank < T) {
103         if (rank % k != (k - 1)) {      /* I am a non-participating rank */
104             *step1_sendto = rank + (k - 1 - rank % k);  /* partipating rank to send the data to */
105             /* if the corresponding participating rank is not in T,
106              * then send to the Tth rank to preserve non-commutativity */
107             if (*step1_sendto > T - 1)
108                 *step1_sendto = T;
109             newrank = -1;       /* tag this rank as non-participating */
110         } else {        /* participating rank */
111             for (i = 0; i < k - 1; i++) {
112                 step1_recvfrom[i] = rank - i - 1;
113             }
114             *step1_nrecvs = k - 1;
115             newrank = rank / k; /* this is the new rank amongst the set of participating ranks */
116         }
117     } else {    /* rank >= T */
118         newrank = rank - rem;
119 
120         if (rank == T && (T - 1) % k != k - 1 && T >= 1) {
121             int nsenders = (T - 1) % k + 1;     /* number of ranks sending their data to me in Step 1 */
122 
123             for (j = nsenders - 1; j >= 0; j--) {
124                 step1_recvfrom[nsenders - 1 - j] = T - nsenders + j;
125             }
126             *step1_nrecvs = nsenders;
127         }
128     }
129 
130     MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, "step 1 nbr computation completed"));
131 
132     /* Step 2 */
133     if (*step1_sendto == -1) {  /* calulate step2_nbrs only for participating ranks */
134         int *digit = (int *) MPL_malloc(sizeof(int) * log_p_of_k, MPL_MEM_COLL);
135         MPIR_Assert(digit != NULL);
136         int temprank = newrank;
137         int mask = 0x1;
138         int phase = 0, cbit, cnt, nbr, power;
139 
140         /* calculate the digits in base k representation of newrank */
141         for (i = 0; i < log_p_of_k; i++)
142             digit[i] = 0;
143 
144         int remainder, i_digit = 0;
145         while (temprank != 0) {
146             remainder = temprank % k;
147             temprank = temprank / k;
148             digit[i_digit] = remainder;
149             i_digit++;
150         }
151 
152         while (mask < p_of_k) {
153             cbit = digit[phase];        /* phase_th digit changes in this phase, obtain its original value */
154             cnt = 0;
155             for (i = 0; i < k; i++) {   /* there are k-1 neighbors */
156                 if (i != cbit) {        /* do not generate yourself as your nieighbor */
157                     digit[phase] = i;   /* this gets us the base k representation of the neighbor */
158 
159                     /* calculate the base 10 value of the neighbor rank */
160                     nbr = 0;
161                     power = 1;
162                     for (j = 0; j < log_p_of_k; j++) {
163                         nbr += digit[j] * power;
164                         power *= k;
165                     }
166 
167                     /* calculate its real rank and store it */
168                     step2_nbrs[phase][cnt] =
169                         (nbr < rem / (k - 1)) ? (nbr * k) + (k - 1) : nbr + rem;
170                     MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE,
171                                     (MPL_DBG_FDEST, "step2_nbrs[%d][%d] is %d", phase, cnt,
172                                      step2_nbrs[phase][cnt]));
173                     cnt++;
174                 }
175             }
176             MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE,
177                             (MPL_DBG_FDEST, "step 2, phase %d nbr calculation completed", phase));
178             digit[phase] = cbit;        /* reset the digit to original value */
179             phase++;
180             mask *= k;
181         }
182 
183         MPL_free(digit);
184     }
185 
186     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_RECEXCHALGO_GET_NEIGHBORS);
187 
188     return mpi_errno;
189 }
190 
191 
MPII_Recexchalgo_origrank_to_step2rank(int rank,int rem,int T,int k)192 int MPII_Recexchalgo_origrank_to_step2rank(int rank, int rem, int T, int k)
193 {
194     int step2rank;
195 
196     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_RECEXCHALGO_ORIGRANK_TO_STEP2RANK);
197     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_RECEXCHALGO_ORIGRANK_TO_STEP2RANK);
198 
199     step2rank = (rank < T) ? rank / k : rank - rem;
200 
201     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_RECEXCHALGO_ORIGRANK_TO_STEP2RANK);
202 
203     return step2rank;
204 }
205 
206 
MPII_Recexchalgo_step2rank_to_origrank(int rank,int rem,int T,int k)207 int MPII_Recexchalgo_step2rank_to_origrank(int rank, int rem, int T, int k)
208 {
209     int orig_rank;
210 
211     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_RECEXCHALGO_STEP2RANK_TO_ORIGRANK);
212     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_RECEXCHALGO_STEP2RANK_TO_ORIGRANK);
213 
214     orig_rank = (rank < rem / (k - 1)) ? (rank * k) + (k - 1) : rank + rem;
215 
216     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_RECEXCHALGO_STEP2RANK_TO_ORIGRANK);
217 
218     return orig_rank;
219 }
220 
221 
222 /* This function calculates the offset and count for send and receive for a given
223  * phase in recursive exchange algorithms in collective operations like Allgather,
224  * Allgatherv, Reducescatter.
225 */
MPII_Recexchalgo_get_count_and_offset(int rank,int phase,int k,int nranks,int * count,int * offset)226 int MPII_Recexchalgo_get_count_and_offset(int rank, int phase, int k, int nranks, int *count,
227                                           int *offset)
228 {
229     int mpi_errno = MPI_SUCCESS;
230     int step2rank, min, max, orig_max, orig_min;
231     int k_power_phase = 1;
232     int p_of_k = 1, rem, T;
233 
234     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_RECEXCHALGO_GET_COUNT_AND_OFFSET);
235     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_RECEXCHALGO_GET_COUNT_AND_OFFSET);
236 
237     /* p_of_k is the largest power of k that is less than nranks */
238     while (p_of_k <= nranks) {
239         p_of_k *= k;
240     }
241     p_of_k /= k;
242 
243     rem = nranks - p_of_k;
244     T = (rem * k) / (k - 1);
245 
246     /* k_power_phase is k^phase */
247     while (phase > 0) {
248         k_power_phase *= k;
249         phase--;
250     }
251     /* Calculate rank in step2 */
252     step2rank = MPII_Recexchalgo_origrank_to_step2rank(rank, rem, T, k);
253     /* Calculate min and max ranks of the range of ranks that 'rank'
254      * represents in phase 'phase' */
255     min = ((step2rank / k_power_phase) * k_power_phase) - 1;
256     max = min + k_power_phase;
257     /* convert (min,max] to their original ranks */
258     orig_min = (min >= 0) ? MPII_Recexchalgo_step2rank_to_origrank(min, rem, T, k) : min;
259     orig_max = MPII_Recexchalgo_step2rank_to_origrank(max, rem, T, k);
260     *count = orig_max - orig_min;
261     *offset = orig_min + 1;
262 
263     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_RECEXCHALGO_GET_COUNT_AND_OFFSET);
264 
265     return mpi_errno;
266 }
267 
268 
269 /* This function calculates the digit reversed (in base 'k' representation) rank of a given rank participating in Step 2. It does so in the following steps:
270  * 1. Converts the given rank to its Step2 rank
271  * 2. Calculates the digit reversed (in base 'k' representation) rank of the Step 2 rank
272  * 3. Convert the digit reversed rank in the previous Step to the original rank.
273 */
MPII_Recexchalgo_reverse_digits_step2(int rank,int comm_size,int k)274 int MPII_Recexchalgo_reverse_digits_step2(int rank, int comm_size, int k)
275 {
276     int i, T, rem, power, step2rank, step2_reverse_rank = 0;
277     int pofk = 1, log_pofk = 0;
278     int *digit, *digit_reverse;
279     int mpi_errno = MPI_SUCCESS;
280     MPIR_CHKLMEM_DECL(2);
281 
282     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_RECEXCHALGO_REVERSE_DIGITS_STEP2);
283     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_RECEXCHALGO_REVERSE_DIGITS_STEP2);
284 
285     while (pofk <= comm_size) {
286         pofk *= k;
287         log_pofk++;
288     }
289     MPIR_Assert(log_pofk > 0);
290     pofk /= k;
291     log_pofk--;
292 
293     rem = comm_size - pofk;
294     T = (rem * k) / (k - 1);
295 
296     /* step2rank is the rank in the particiapting ranks group of recursive exchange step2 */
297     step2rank = MPII_Recexchalgo_origrank_to_step2rank(rank, rem, T, k);
298 
299     /* calculate the digits in base k representation of step2rank */
300     MPIR_CHKLMEM_MALLOC(digit, int *, sizeof(int) * log_pofk,
301                         mpi_errno, "digit buffer", MPL_MEM_COLL);
302     MPIR_CHKLMEM_MALLOC(digit_reverse, int *, sizeof(int) * log_pofk,
303                         mpi_errno, "digit_reverse buffer", MPL_MEM_COLL);
304     for (i = 0; i < log_pofk; i++)
305         digit[i] = 0;
306 
307     int remainder, i_digit = 0;
308     while (step2rank != 0) {
309         remainder = step2rank % k;
310         step2rank = step2rank / k;
311         digit[i_digit] = remainder;
312         i_digit++;
313     }
314 
315     /* reverse the number in base k representation to get the step2_reverse_rank
316      * which is the reversed rank in the participating ranks group of recursive exchange step2
317      */
318     for (i = 0; i < log_pofk; i++)
319         digit_reverse[i] = digit[log_pofk - 1 - i];
320     /*calculate the base 10 value of the reverse rank */
321     step2_reverse_rank = 0;
322     power = 1;
323     for (i = 0; i < log_pofk; i++) {
324         step2_reverse_rank += digit_reverse[i] * power;
325         power *= k;
326     }
327 
328     /* calculate the actual rank from logical rank */
329     step2_reverse_rank = MPII_Recexchalgo_step2rank_to_origrank(step2_reverse_rank, rem, T, k);
330     MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE,
331                     (MPL_DBG_FDEST, "reverse_rank is %d", step2_reverse_rank));
332 
333     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_RECEXCHALGO_REVERSE_DIGITS_STEP2);
334 
335   fn_exit:
336     MPIR_CHKLMEM_FREEALL();
337     return step2_reverse_rank;
338   fn_fail:
339     /* TODO - Replace this with real error handling */
340     MPIR_Assert(MPI_SUCCESS == mpi_errno);
341     goto fn_exit;
342 }
343