1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 */
5
6 /* Header protection (i.e., IGATHER_TSP_TREE_ALGOS_H_INCLUDED) is
7 * intentionally omitted since this header might get included multiple
8 * times within the same .c file. */
9
10 #include "algo_common.h"
11 #include "treealgo.h"
12 #include "tsp_namespace_def.h"
13
14 /* Routine to schedule a tree based gather */
MPIR_TSP_Igather_sched_intra_tree(const void * sendbuf,int sendcount,MPI_Datatype sendtype,void * recvbuf,int recvcount,MPI_Datatype recvtype,int root,MPIR_Comm * comm,int k,MPIR_TSP_sched_t * sched)15 int MPIR_TSP_Igather_sched_intra_tree(const void *sendbuf, int sendcount,
16 MPI_Datatype sendtype, void *recvbuf, int recvcount,
17 MPI_Datatype recvtype, int root, MPIR_Comm * comm,
18 int k, MPIR_TSP_sched_t * sched)
19 {
20 int mpi_errno = MPI_SUCCESS;
21 int size, rank, lrank;
22 int i, j, tag, is_inplace = false;
23 MPI_Aint sendtype_lb, sendtype_extent, sendtype_true_extent;
24 MPI_Aint recvtype_lb, recvtype_extent, recvtype_true_extent;
25 int dtcopy_id, *recv_id = NULL;
26 void *tmp_buf = NULL;
27 const void *data_buf = NULL;
28 int tree_type;
29 MPIR_Treealgo_tree_t my_tree, parents_tree;
30 int next_child, num_children, *child_subtree_size = NULL, *child_data_offset = NULL;
31 int offset, recv_size, num_dependencies;
32 MPIR_CHKLMEM_DECL(3);
33
34 MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIR_TSP_IGATHER_SCHED_INTRA_TREE);
35 MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIR_TSP_IGATHER_SCHED_INTRA_TREE);
36
37
38 size = MPIR_Comm_size(comm);
39 rank = MPIR_Comm_rank(comm);
40 lrank = (rank - root + size) % size; /* logical rank when root is non-zero */
41
42 if (rank == root)
43 is_inplace = (sendbuf == MPI_IN_PLACE); /* For gather, MPI_IN_PLACE is significant only at root */
44
45 tree_type = MPIR_TREE_TYPE_KNOMIAL_1; /* currently only tree_type=MPIR_TREE_TYPE_KNOMIAL_1 is supported for gather */
46 mpi_errno = MPIR_Treealgo_tree_create(rank, size, tree_type, k, root, &my_tree);
47 MPIR_ERR_CHECK(mpi_errno);
48 num_children = my_tree.num_children;
49
50 /* For correctness, transport based collectives need to get the
51 * tag from the same pool as schedule based collectives */
52 mpi_errno = MPIR_Sched_next_tag(comm, &tag);
53 MPIR_ERR_CHECK(mpi_errno);
54
55 if (rank == root && is_inplace) {
56 sendtype = recvtype;
57 sendcount = recvcount;
58 } else if (rank != root) { /* all non root ranks will forward data in recvtype format */
59 recvtype = sendtype;
60 recvcount = sendcount;
61 }
62
63 MPIR_Datatype_get_extent_macro(sendtype, sendtype_extent);
64 MPIR_Type_get_true_extent_impl(sendtype, &sendtype_lb, &sendtype_true_extent);
65 sendtype_extent = MPL_MAX(sendtype_extent, sendtype_true_extent);
66
67 MPIR_Datatype_get_extent_macro(recvtype, recvtype_extent);
68 MPIR_Type_get_true_extent_impl(recvtype, &recvtype_lb, &recvtype_true_extent);
69 recvtype_extent = MPL_MAX(recvtype_extent, recvtype_true_extent);
70
71 num_children = my_tree.num_children;
72 MPIR_CHKLMEM_MALLOC(child_subtree_size, int *, sizeof(int) * num_children, mpi_errno, "child_subtree_size buffer", MPL_MEM_COLL); /* to store size of subtree of each child */
73 MPIR_CHKLMEM_MALLOC(child_data_offset, int *, sizeof(int) * num_children, mpi_errno, "child_data_offset buffer", MPL_MEM_COLL); /* to store the offset of the data to be sent to each child */
74
75 /* calculate size of subtree of each child */
76
77 /* get tree information of the parent */
78 if (my_tree.parent != -1) {
79 MPIR_Treealgo_tree_create(my_tree.parent, size, tree_type, k, root, &parents_tree);
80 } else { /* initialize an empty children array */
81 utarray_new(parents_tree.children, &ut_int_icd, MPL_MEM_COLL);
82 parents_tree.num_children = 0;
83 }
84
85 recv_size = 1; /* total size of the data to be received from the parent.
86 * 1 is to count yourself, now add the size of each child's subtree */
87 for (i = 0; i < num_children; i++) {
88 int current_child = (*(int *) utarray_eltptr(my_tree.children, i) - root + size) % size;
89
90 if (i < num_children - 1) {
91 next_child = (*(int *) utarray_eltptr(my_tree.children, i + 1) - root + size) % size;
92 } else {
93 next_child = size;
94 for (j = 0; j < parents_tree.num_children; j++) {
95 int sibling =
96 (*(int *) utarray_eltptr(parents_tree.children, j) - root + size) % size;
97 if (sibling > lrank) { /* This is the first sibling larger than lrank */
98 next_child = sibling;
99 break;
100 }
101 }
102 }
103
104 child_subtree_size[i] = next_child - current_child;
105 MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE,
106 (MPL_DBG_FDEST,
107 "i:%d rank:%d current_child:%d, next_child:%d, child_subtree_size[i]:%d, recv_size:%d\n",
108 i, rank, current_child, next_child, child_subtree_size[i], recv_size));
109 recv_size += child_subtree_size[i];
110 }
111
112 MPIR_Treealgo_tree_free(&parents_tree);
113
114 recv_size *= (lrank == 0) ? recvcount : sendcount;
115 offset = (lrank == 0) ? recvcount : sendcount;
116
117 for (i = 0; i < num_children; i++) {
118 child_data_offset[i] = offset;
119 offset += (child_subtree_size[i] * ((lrank == 0) ? recvcount : sendcount));
120 }
121
122
123 if (root != 0 && lrank == 0) {
124 tmp_buf = MPIR_TSP_sched_malloc(recv_size * recvtype_extent, sched);
125 } else if (root == 0 && lrank == 0) {
126 tmp_buf = (void *) recvbuf;
127 } else if (num_children > 0 && lrank != 0) { /* intermediate ranks in the tree */
128 tmp_buf = MPIR_TSP_sched_malloc(recv_size * sendtype_extent, sched);
129 } else { /* leaf ranks in the tree */
130 tmp_buf = (void *) sendbuf;
131 }
132
133 MPIR_CHKLMEM_MALLOC(recv_id, int *, sizeof(int) * num_children,
134 mpi_errno, "recv_id buffer", MPL_MEM_COLL);
135 /* Leaf nodes send to parent */
136 if (num_children == 0) {
137 MPIR_TSP_sched_isend(tmp_buf, sendcount, sendtype, my_tree.parent, tag, comm, sched, 0,
138 NULL);
139 MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE, (MPL_DBG_FDEST, "rank:%d posts recv\n", rank));
140 } else {
141 num_dependencies = 0;
142 if (tmp_buf != recvbuf || (lrank == 0 && !is_inplace)) { /* copy data into tmp buf unless tmp_buf is recvbuf and inplace
143 * in which case data is already located in the correct position */
144 if (lrank == 0 && root != 0 && is_inplace)
145 data_buf = (char *) recvbuf + root * recvcount * recvtype_extent;
146 else
147 data_buf = (void *) sendbuf;
148
149 dtcopy_id =
150 MPIR_TSP_sched_localcopy(data_buf, sendcount, sendtype,
151 tmp_buf, recvcount, recvtype, sched, 0, NULL);
152 num_dependencies++;
153 }
154
155 for (i = 0; i < num_children; i++) {
156 int child = *(int *) utarray_eltptr(my_tree.children, i);
157 recv_id[i] =
158 MPIR_TSP_sched_irecv((char *) tmp_buf + child_data_offset[i] * recvtype_extent,
159 child_subtree_size[i] * recvcount, recvtype, child, tag, comm,
160 sched, num_dependencies, &dtcopy_id);
161 }
162
163 if (my_tree.parent != -1)
164 MPIR_TSP_sched_isend(tmp_buf, recv_size, recvtype, my_tree.parent,
165 tag, comm, sched, num_children, recv_id);
166
167 }
168
169 if (lrank == 0 && root != 0) { /* root puts data in recvbuf */
170 dtcopy_id = MPIR_TSP_sched_localcopy(tmp_buf, (size - root) * recvcount, recvtype,
171 (char *) recvbuf + root * recvcount * recvtype_extent,
172 (size - root) * recvcount, recvtype, sched,
173 num_children, recv_id);
174
175 MPIR_TSP_sched_localcopy((char *) tmp_buf + (size - root) * recvcount * recvtype_extent,
176 root * recvcount, recvtype, recvbuf,
177 root * recvcount, recvtype, sched, 1, &dtcopy_id);
178
179 }
180
181 MPIR_Treealgo_tree_free(&my_tree);
182
183 fn_exit:
184 MPIR_CHKLMEM_FREEALL();
185 MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIR_TSP_IGATHER_SCHED_INTRA_TREE);
186 return mpi_errno;
187 fn_fail:
188 goto fn_exit;
189 }
190
191
192 /* Non-blocking tree based gather */
MPIR_TSP_Igather_intra_tree(const void * sendbuf,int sendcount,MPI_Datatype sendtype,void * recvbuf,int recvcount,MPI_Datatype recvtype,int root,MPIR_Comm * comm,MPIR_Request ** req,int k)193 int MPIR_TSP_Igather_intra_tree(const void *sendbuf, int sendcount,
194 MPI_Datatype sendtype, void *recvbuf, int recvcount,
195 MPI_Datatype recvtype, int root, MPIR_Comm * comm,
196 MPIR_Request ** req, int k)
197 {
198 int mpi_errno = MPI_SUCCESS;
199 MPIR_TSP_sched_t *sched;
200
201 MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIR_TSP_IGATHER_INTRA_TREE);
202 MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIR_TSP_IGATHER_INTRA_TREE);
203
204 *req = NULL;
205
206 /* generate the schedule */
207 sched = MPL_malloc(sizeof(MPIR_TSP_sched_t), MPL_MEM_COLL);
208 MPIR_Assert(sched != NULL);
209 MPIR_TSP_sched_create(sched);
210
211 /* schedule tree algo */
212 mpi_errno = MPIR_TSP_Igather_sched_intra_tree(sendbuf, sendcount, sendtype,
213 recvbuf, recvcount, recvtype, root, comm, k,
214 sched);
215 MPIR_ERR_CHECK(mpi_errno);
216
217 /* start and register the schedule */
218 mpi_errno = MPIR_TSP_sched_start(sched, comm, req);
219 MPIR_ERR_CHECK(mpi_errno);
220
221 fn_exit:
222 MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIR_TSP_IGATHER_INTRA_TREE);
223 return mpi_errno;
224 fn_fail:
225 goto fn_exit;
226 }
227