1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 /* Header protection (i.e., IGATHER_TSP_TREE_ALGOS_H_INCLUDED) is
7  * intentionally omitted since this header might get included multiple
8  * times within the same .c file. */
9 
10 #include "algo_common.h"
11 #include "treealgo.h"
12 #include "tsp_namespace_def.h"
13 
14 /* Routine to schedule a tree based gather */
MPIR_TSP_Igather_sched_intra_tree(const void * sendbuf,int sendcount,MPI_Datatype sendtype,void * recvbuf,int recvcount,MPI_Datatype recvtype,int root,MPIR_Comm * comm,int k,MPIR_TSP_sched_t * sched)15 int MPIR_TSP_Igather_sched_intra_tree(const void *sendbuf, int sendcount,
16                                       MPI_Datatype sendtype, void *recvbuf, int recvcount,
17                                       MPI_Datatype recvtype, int root, MPIR_Comm * comm,
18                                       int k, MPIR_TSP_sched_t * sched)
19 {
20     int mpi_errno = MPI_SUCCESS;
21     int size, rank, lrank;
22     int i, j, tag, is_inplace = false;
23     MPI_Aint sendtype_lb, sendtype_extent, sendtype_true_extent;
24     MPI_Aint recvtype_lb, recvtype_extent, recvtype_true_extent;
25     int dtcopy_id, *recv_id = NULL;
26     void *tmp_buf = NULL;
27     const void *data_buf = NULL;
28     int tree_type;
29     MPIR_Treealgo_tree_t my_tree, parents_tree;
30     int next_child, num_children, *child_subtree_size = NULL, *child_data_offset = NULL;
31     int offset, recv_size, num_dependencies;
32     MPIR_CHKLMEM_DECL(3);
33 
34     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIR_TSP_IGATHER_SCHED_INTRA_TREE);
35     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIR_TSP_IGATHER_SCHED_INTRA_TREE);
36 
37 
38     size = MPIR_Comm_size(comm);
39     rank = MPIR_Comm_rank(comm);
40     lrank = (rank - root + size) % size;        /* logical rank when root is non-zero */
41 
42     if (rank == root)
43         is_inplace = (sendbuf == MPI_IN_PLACE); /* For gather, MPI_IN_PLACE is significant only at root */
44 
45     tree_type = MPIR_TREE_TYPE_KNOMIAL_1;       /* currently only tree_type=MPIR_TREE_TYPE_KNOMIAL_1 is supported for gather */
46     mpi_errno = MPIR_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree);
47     MPIR_ERR_CHECK(mpi_errno);
48     num_children = my_tree.num_children;
49 
50     /* For correctness, transport based collectives need to get the
51      * tag from the same pool as schedule based collectives */
52     mpi_errno = MPIR_Sched_next_tag(comm, &tag);
53     MPIR_ERR_CHECK(mpi_errno);
54 
55     if (rank == root && is_inplace) {
56         sendtype = recvtype;
57         sendcount = recvcount;
58     } else if (rank != root) {  /* all non root ranks will forward data in recvtype format */
59         recvtype = sendtype;
60         recvcount = sendcount;
61     }
62 
63     MPIR_Datatype_get_extent_macro(sendtype, sendtype_extent);
64     MPIR_Type_get_true_extent_impl(sendtype, &sendtype_lb, &sendtype_true_extent);
65     sendtype_extent = MPL_MAX(sendtype_extent, sendtype_true_extent);
66 
67     MPIR_Datatype_get_extent_macro(recvtype, recvtype_extent);
68     MPIR_Type_get_true_extent_impl(recvtype, &recvtype_lb, &recvtype_true_extent);
69     recvtype_extent = MPL_MAX(recvtype_extent, recvtype_true_extent);
70 
71     num_children = my_tree.num_children;
72     MPIR_CHKLMEM_MALLOC(child_subtree_size, int *, sizeof(int) * num_children, mpi_errno, "child_subtree_size buffer", MPL_MEM_COLL);   /* to store size of subtree of each child */
73     MPIR_CHKLMEM_MALLOC(child_data_offset, int *, sizeof(int) * num_children, mpi_errno, "child_data_offset buffer", MPL_MEM_COLL);     /* to store the offset of the data to be sent to each child  */
74 
75     /* calculate size of subtree of each child */
76 
77     /* get tree information of the parent */
78     if (my_tree.parent != -1) {
79         MPIR_Treealgo_tree_create(my_tree.parent, size, tree_type, k, root, &parents_tree);
80     } else {    /* initialize an empty children array */
81         utarray_new(parents_tree.children, &ut_int_icd, MPL_MEM_COLL);
82         parents_tree.num_children = 0;
83     }
84 
85     recv_size = 1;      /* total size of the data to be received from the parent.
86                          * 1 is to count yourself, now add the size of each child's subtree */
87     for (i = 0; i < num_children; i++) {
88         int current_child = (*(int *) utarray_eltptr(my_tree.children, i) - root + size) % size;
89 
90         if (i < num_children - 1) {
91             next_child = (*(int *) utarray_eltptr(my_tree.children, i + 1) - root + size) % size;
92         } else {
93             next_child = size;
94             for (j = 0; j < parents_tree.num_children; j++) {
95                 int sibling =
96                     (*(int *) utarray_eltptr(parents_tree.children, j) - root + size) % size;
97                 if (sibling > lrank) {  /* This is the first sibling larger than lrank */
98                     next_child = sibling;
99                     break;
100                 }
101             }
102         }
103 
104         child_subtree_size[i] = next_child - current_child;
105         MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE,
106                         (MPL_DBG_FDEST,
107                          "i:%d rank:%d current_child:%d, next_child:%d, child_subtree_size[i]:%d, recv_size:%d\n",
108                          i, rank, current_child, next_child, child_subtree_size[i], recv_size));
109         recv_size += child_subtree_size[i];
110     }
111 
112     MPIR_Treealgo_tree_free(&parents_tree);
113 
114     recv_size *= (lrank == 0) ? recvcount : sendcount;
115     offset = (lrank == 0) ? recvcount : sendcount;
116 
117     for (i = 0; i < num_children; i++) {
118         child_data_offset[i] = offset;
119         offset += (child_subtree_size[i] * ((lrank == 0) ? recvcount : sendcount));
120     }
121 
122 
123     if (root != 0 && lrank == 0) {
124         tmp_buf = MPIR_TSP_sched_malloc(recv_size * recvtype_extent, sched);
125     } else if (root == 0 && lrank == 0) {
126         tmp_buf = (void *) recvbuf;
127     } else if (num_children > 0 && lrank != 0) {        /* intermediate ranks in the tree */
128         tmp_buf = MPIR_TSP_sched_malloc(recv_size * sendtype_extent, sched);
129     } else {    /* leaf ranks in the tree */
130         tmp_buf = (void *) sendbuf;
131     }
132 
133     MPIR_CHKLMEM_MALLOC(recv_id, int *, sizeof(int) * num_children,
134                         mpi_errno, "recv_id buffer", MPL_MEM_COLL);
135     /* Leaf nodes send to parent */
136     if (num_children == 0) {
137         MPIR_TSP_sched_isend(tmp_buf, sendcount, sendtype, my_tree.parent, tag, comm, sched, 0,
138                              NULL);
139         MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, "rank:%d posts recv\n", rank));
140     } else {
141         num_dependencies = 0;
142         if (tmp_buf != recvbuf || (lrank == 0 && !is_inplace)) {        /* copy data into tmp buf unless tmp_buf is recvbuf and inplace
143                                                                          * in which case data is already located in the correct position */
144             if (lrank == 0 && root != 0 && is_inplace)
145                 data_buf = (char *) recvbuf + root * recvcount * recvtype_extent;
146             else
147                 data_buf = (void *) sendbuf;
148 
149             dtcopy_id =
150                 MPIR_TSP_sched_localcopy(data_buf, sendcount, sendtype,
151                                          tmp_buf, recvcount, recvtype, sched, 0, NULL);
152             num_dependencies++;
153         }
154 
155         for (i = 0; i < num_children; i++) {
156             int child = *(int *) utarray_eltptr(my_tree.children, i);
157             recv_id[i] =
158                 MPIR_TSP_sched_irecv((char *) tmp_buf + child_data_offset[i] * recvtype_extent,
159                                      child_subtree_size[i] * recvcount, recvtype, child, tag, comm,
160                                      sched, num_dependencies, &dtcopy_id);
161         }
162 
163         if (my_tree.parent != -1)
164             MPIR_TSP_sched_isend(tmp_buf, recv_size, recvtype, my_tree.parent,
165                                  tag, comm, sched, num_children, recv_id);
166 
167     }
168 
169     if (lrank == 0 && root != 0) {      /* root puts data in recvbuf */
170         dtcopy_id = MPIR_TSP_sched_localcopy(tmp_buf, (size - root) * recvcount, recvtype,
171                                              (char *) recvbuf + root * recvcount * recvtype_extent,
172                                              (size - root) * recvcount, recvtype, sched,
173                                              num_children, recv_id);
174 
175         MPIR_TSP_sched_localcopy((char *) tmp_buf + (size - root) * recvcount * recvtype_extent,
176                                  root * recvcount, recvtype, recvbuf,
177                                  root * recvcount, recvtype, sched, 1, &dtcopy_id);
178 
179     }
180 
181     MPIR_Treealgo_tree_free(&my_tree);
182 
183   fn_exit:
184     MPIR_CHKLMEM_FREEALL();
185     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIR_TSP_IGATHER_SCHED_INTRA_TREE);
186     return mpi_errno;
187   fn_fail:
188     goto fn_exit;
189 }
190 
191 
192 /* Non-blocking tree based gather */
MPIR_TSP_Igather_intra_tree(const void * sendbuf,int sendcount,MPI_Datatype sendtype,void * recvbuf,int recvcount,MPI_Datatype recvtype,int root,MPIR_Comm * comm,MPIR_Request ** req,int k)193 int MPIR_TSP_Igather_intra_tree(const void *sendbuf, int sendcount,
194                                 MPI_Datatype sendtype, void *recvbuf, int recvcount,
195                                 MPI_Datatype recvtype, int root, MPIR_Comm * comm,
196                                 MPIR_Request ** req, int k)
197 {
198     int mpi_errno = MPI_SUCCESS;
199     MPIR_TSP_sched_t *sched;
200 
201     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIR_TSP_IGATHER_INTRA_TREE);
202     MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIR_TSP_IGATHER_INTRA_TREE);
203 
204     *req = NULL;
205 
206     /* generate the schedule */
207     sched = MPL_malloc(sizeof(MPIR_TSP_sched_t), MPL_MEM_COLL);
208     MPIR_Assert(sched != NULL);
209     MPIR_TSP_sched_create(sched);
210 
211     /* schedule tree algo */
212     mpi_errno = MPIR_TSP_Igather_sched_intra_tree(sendbuf, sendcount, sendtype,
213                                                   recvbuf, recvcount, recvtype, root, comm, k,
214                                                   sched);
215     MPIR_ERR_CHECK(mpi_errno);
216 
217     /* start and register the schedule */
218     mpi_errno = MPIR_TSP_sched_start(sched, comm, req);
219     MPIR_ERR_CHECK(mpi_errno);
220 
221   fn_exit:
222     MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIR_TSP_IGATHER_INTRA_TREE);
223     return mpi_errno;
224   fn_fail:
225     goto fn_exit;
226 }
227