1 /*
2 * Copyright (C) Huawei Technologies Co., Ltd. 2019. ALL RIGHTS RESERVED.
3 * See file LICENSE for terms.
4 */
5
6 #include <math.h>
7 #include <ucs/debug/log.h>
8 #include <ucs/debug/assert.h>
9 #include <ucs/debug/memtrack.h>
10 #include <uct/api/uct_def.h>
11
12 #include "builtin_plan.h"
13
14 #define MAX_PHASES (5)
15 #define ALLOC_SIZE(ep_cnt) (sizeof(ucg_builtin_plan_t) + (MAX_PHASES * \
16 (sizeof(ucg_builtin_plan_phase_t) + ((ep_cnt) * sizeof(uct_ep_h)))))
17
18 ucs_config_field_t ucg_builtin_tree_config_table[] = {
19 {"RADIX", "8", "Tree radix, for inter-node trees.\n",
20 ucs_offsetof(ucg_builtin_tree_config_t, radix), UCS_CONFIG_TYPE_UINT},
21
22 {"SOCKET_LEVEL_PPN_THRESH", "16",
23 "Threshold for switching from 1-level to 2-level intra-node tree.\n",
24 ucs_offsetof(ucg_builtin_tree_config_t, sock_thresh), UCS_CONFIG_TYPE_UINT},
25
26 // TODO: add multi-root configuration
27
28 {NULL}
29 };
30
ucg_builtin_tree_connect_phase(ucg_builtin_plan_phase_t * phase,const ucg_builtin_tree_params_t * params,ucg_step_idx_t step_index,uct_ep_h ** eps,ucg_group_member_index_t * peers,unsigned peer_cnt,enum ucg_builtin_plan_method_type method,enum ucg_plan_connect_flags coll_flags)31 static inline ucs_status_t ucg_builtin_tree_connect_phase(ucg_builtin_plan_phase_t *phase,
32 const ucg_builtin_tree_params_t *params,
33 ucg_step_idx_t step_index,
34 uct_ep_h **eps,
35 ucg_group_member_index_t *peers,
36 unsigned peer_cnt,
37 enum ucg_builtin_plan_method_type method,
38 enum ucg_plan_connect_flags coll_flags)
39 {
40 int is_mock = params->coll_type->modifiers & UCG_GROUP_COLLECTIVE_MODIFIER_MOCK_EPS;
41
42 ucs_assert(peer_cnt > 0);
43 if ((peer_cnt == 1) || coll_flags) {
44 ucg_group_member_index_t peer = peers[0];
45 if (coll_flags & (UCG_PLAN_CONNECT_FLAG_WANT_INCAST |
46 UCG_PLAN_CONNECT_FLAG_WANT_BCAST)) {
47 /* For some methods, e.g. REDUCE_TERMINAL, the peer is the message
48 * source and not the destination, so we need to switch to "loopback" */
49 if (((coll_flags & UCG_PLAN_CONNECT_FLAG_WANT_INCAST) &&
50 (method != UCG_PLAN_METHOD_SEND_TERMINAL)) ||
51 ((coll_flags & UCG_PLAN_CONNECT_FLAG_WANT_BCAST) &&
52 (method != UCG_PLAN_METHOD_RECV_TERMINAL))) {
53 peer = params->root; /* Should be myself... TODO: validate! */
54 }
55 }
56
57 ucs_status_t status = ucg_builtin_single_connection_phase(params->ctx,
58 peer, step_index, method, coll_flags, phase, is_mock);
59 if ((status == UCS_OK) || !coll_flags) {
60 return status;
61 }
62 }
63
64
65 if (peer_cnt > 1) {
66 phase->multi_eps = *eps;
67 *eps += peer_cnt;
68 }
69 phase->ep_cnt = peer_cnt;
70 phase->step_index = step_index;
71 phase->method = method;
72
73 #if ENABLE_DEBUG_DATA || ENABLE_FAULT_TOLERANCE
74 phase->indexes = UCS_ALLOC_CHECK(peer_cnt * sizeof(*peers),
75 "tree topology indexes");
76 #endif
77
78 /* connect every endpoint, by group member index */
79 unsigned idx;
80 ucs_status_t status = UCS_OK;
81 for (idx = 0; (idx < peer_cnt) && (status == UCS_OK); idx++, peers++) {
82 status = ucg_builtin_connect(params->ctx, *peers, phase, idx, 0, is_mock);
83 }
84 return status;
85 }
86
ucg_builtin_tree_connect(ucg_builtin_plan_t * tree,ucg_builtin_topo_tree_root_phase_t * root,const ucg_builtin_tree_params_t * params,ucg_step_idx_t step_offset,uct_ep_h * first_ep,ucg_group_member_index_t * host_up,unsigned host_up_cnt,ucg_group_member_index_t * net_up,unsigned net_up_cnt,ucg_group_member_index_t * net_down,unsigned net_down_cnt,ucg_group_member_index_t * host_down,unsigned host_down_cnt)87 ucs_status_t ucg_builtin_tree_connect(ucg_builtin_plan_t *tree,
88 ucg_builtin_topo_tree_root_phase_t *root,
89 const ucg_builtin_tree_params_t *params,
90 ucg_step_idx_t step_offset,
91 uct_ep_h *first_ep,
92 ucg_group_member_index_t *host_up,
93 unsigned host_up_cnt,
94 ucg_group_member_index_t *net_up,
95 unsigned net_up_cnt,
96 ucg_group_member_index_t *net_down,
97 unsigned net_down_cnt,
98 ucg_group_member_index_t *host_down,
99 unsigned host_down_cnt)
100 {
101 unsigned idx;
102 ucs_status_t status = UCS_OK;
103 enum ucg_collective_modifiers mod = params->coll_type->modifiers;
104 uct_ep_h *iter_eps = first_ep;
105 ucg_builtin_plan_phase_t *phase = root ? root->phss : tree->phss;
106 ucg_step_idx_t *phs_cnt = root ? &root->phs_cnt : &tree->phs_cnt;
107 phase += *phs_cnt;
108
109 enum ucg_builtin_plan_method_type fanin_method, fanout_method;
110 ucs_assert(host_up_cnt + host_down_cnt + net_up_cnt + net_down_cnt < UCG_BUILTIN_TREE_MAX_RADIX);
111 ucs_assert(host_up_cnt + host_down_cnt + net_up_cnt + net_down_cnt > 0);
112 // TODO: ucs_assert(up_cnt == (params->coll_type->root == params->me));
113
114 enum ucg_builtin_plan_topology_type topo_type = params->topology->type;
115 switch (topo_type) {
116 case UCG_PLAN_TREE_FANIN:
117 case UCG_PLAN_TREE_FANIN_FANOUT:
118 /* Create a phase for inter-node communication ("up the tree") */
119 if (host_down_cnt) {
120 fanin_method = (mod & UCG_GROUP_COLLECTIVE_MODIFIER_AGGREGATE) ?
121 (host_up_cnt ? UCG_PLAN_METHOD_REDUCE_WAYPOINT :
122 UCG_PLAN_METHOD_REDUCE_TERMINAL):
123 (host_up_cnt ? UCG_PLAN_METHOD_GATHER_WAYPOINT :
124 UCG_PLAN_METHOD_RECV_TERMINAL);
125 } else {
126 fanin_method = UCG_PLAN_METHOD_SEND_TERMINAL;
127 }
128
129 if (host_up_cnt + host_down_cnt) {
130 /* Add the host-level parent to the end of the host-array, so that
131 * forwarding the message to the parent is all included in one step */
132 if (host_up_cnt) host_down[host_down_cnt++] = host_up[0];
133 status = ucg_builtin_tree_connect_phase(phase++, params, step_offset,
134 &iter_eps, host_down, host_down_cnt, fanin_method,
135 UCG_PLAN_CONNECT_FLAG_WANT_INTRANODE |
136 UCG_PLAN_CONNECT_FLAG_WANT_INCAST);
137 (*phs_cnt)++;
138 if (status != UCS_OK) {
139 break;
140 }
141 }
142
143 /* Create a phase for intra-node communication ("up the tree") */
144 if (net_down_cnt) {
145 fanin_method = (mod & UCG_GROUP_COLLECTIVE_MODIFIER_AGGREGATE) ?
146 (net_up_cnt ? UCG_PLAN_METHOD_REDUCE_WAYPOINT :
147 UCG_PLAN_METHOD_REDUCE_TERMINAL):
148 (net_up_cnt ? UCG_PLAN_METHOD_GATHER_WAYPOINT :
149 UCG_PLAN_METHOD_RECV_TERMINAL);
150 } else {
151 fanin_method = UCG_PLAN_METHOD_SEND_TERMINAL;
152 }
153
154 if (net_up_cnt + net_down_cnt) {
155 /* Add the network-level parent to the end of the host-array, so that
156 * forwarding the message to the parent is all included in one step */
157 if (net_up_cnt) net_down[net_down_cnt++] = net_up[0];
158 status = ucg_builtin_tree_connect_phase(phase++, params, step_offset + 1,
159 &iter_eps, net_down, net_down_cnt, fanin_method,
160 UCG_PLAN_CONNECT_FLAG_WANT_INTERNODE |
161 UCG_PLAN_CONNECT_FLAG_WANT_INCAST);
162 (*phs_cnt)++;
163 }
164
165 if ((topo_type == UCG_PLAN_TREE_FANIN) || (status != UCS_OK)) {
166 break;
167 }
168
169 /* If I have a host-level or network-level parent (it was added to
170 * the end of the host-array) - now is the time to discard it by
171 * decrementing the counter for each host-array */
172 if (net_down_cnt) net_down_cnt--;
173 if (host_down_cnt) host_down_cnt--;
174 /* conditional break */
175 case UCG_PLAN_TREE_FANOUT:
176 /* Create a phase for inter-node communication ("up the tree") */
177 if (net_down_cnt) {
178 fanout_method = (mod & UCG_GROUP_COLLECTIVE_MODIFIER_BROADCAST) ?
179 (net_up_cnt ? UCG_PLAN_METHOD_BCAST_WAYPOINT :
180 UCG_PLAN_METHOD_SEND_TERMINAL):
181 (net_up_cnt ? UCG_PLAN_METHOD_SCATTER_WAYPOINT:
182 UCG_PLAN_METHOD_SCATTER_TERMINAL);
183 } else {
184 fanout_method = UCG_PLAN_METHOD_RECV_TERMINAL;
185 }
186
187 if (net_up_cnt + net_down_cnt) {
188 for (idx = 0; idx < net_down_cnt; idx++, net_up_cnt++) {
189 if (net_up_cnt == UCG_BUILTIN_TREE_MAX_RADIX) {
190 return UCS_ERR_BUFFER_TOO_SMALL;
191 }
192 net_up[net_up_cnt] = net_down[idx];
193 }
194 status = ucg_builtin_tree_connect_phase(phase++, params, step_offset + 2,
195 &iter_eps, net_up, net_up_cnt, fanout_method,
196 UCG_PLAN_CONNECT_FLAG_WANT_INTERNODE |
197 UCG_PLAN_CONNECT_FLAG_WANT_BCAST);
198 (*phs_cnt)++;
199 if (status != UCS_OK) {
200 break;
201 }
202 }
203
204 /* Create a phase for intra-node communication ("down the tree") */
205 if (host_down_cnt) {
206 fanout_method = (mod & UCG_GROUP_COLLECTIVE_MODIFIER_BROADCAST) ?
207 (host_up_cnt ? UCG_PLAN_METHOD_BCAST_WAYPOINT :
208 UCG_PLAN_METHOD_SEND_TERMINAL):
209 (host_up_cnt ? UCG_PLAN_METHOD_SCATTER_WAYPOINT:
210 UCG_PLAN_METHOD_SCATTER_TERMINAL);
211 } else {
212 fanout_method = UCG_PLAN_METHOD_RECV_TERMINAL;
213 }
214
215 if (host_up_cnt + host_down_cnt) {
216 for (idx = 0; idx < host_down_cnt; idx++, host_up_cnt++) {
217 if (host_up_cnt == UCG_BUILTIN_TREE_MAX_RADIX) {
218 return UCS_ERR_BUFFER_TOO_SMALL;
219 }
220 host_up[host_up_cnt] = host_down[idx];
221 }
222 status = ucg_builtin_tree_connect_phase(phase++, params, step_offset + 3,
223 &iter_eps, host_up, host_up_cnt, fanout_method,
224 UCG_PLAN_CONNECT_FLAG_WANT_INTRANODE |
225 UCG_PLAN_CONNECT_FLAG_WANT_BCAST);
226 (*phs_cnt)++;
227 }
228 break;
229 default:
230 status = UCS_ERR_INVALID_PARAM;
231 break;
232 }
233
234 if (status != UCS_OK) {
235 free(tree);
236 return status;
237 }
238
239 /* For the sake of later creation of non-zero-rooted trees - store params */
240 if (!step_offset) {
241 memcpy(phase, params, sizeof(*params));
242 tree->ep_cnt = iter_eps - first_ep;
243 }
244 return UCS_OK;
245 }
246
ucg_builtin_tree_add_intra(const ucg_builtin_tree_params_t * params,ucg_group_member_index_t * my_idx,unsigned * ppn,ucg_group_member_index_t * up,unsigned * final_up_cnt,ucg_group_member_index_t * down,unsigned * final_down_cnt,enum ucg_group_member_distance * master_phase)247 ucs_status_t ucg_builtin_tree_add_intra(const ucg_builtin_tree_params_t *params,
248 ucg_group_member_index_t *my_idx,
249 unsigned *ppn,
250 ucg_group_member_index_t *up,
251 unsigned *final_up_cnt,
252 ucg_group_member_index_t *down,
253 unsigned *final_down_cnt,
254 enum ucg_group_member_distance *master_phase)
255 {
256 unsigned up_cnt = 0;
257 unsigned down_cnt = 0;
258 ucg_group_member_index_t member_idx = (ucg_group_member_index_t)-1;
259 ucg_group_member_index_t socket_threshold = params->config->sock_thresh;
260 enum ucg_group_member_distance up_distance = UCG_GROUP_MEMBER_DISTANCE_LAST;
261 enum ucg_group_member_distance down_distance = UCG_GROUP_MEMBER_DISTANCE_SELF;
262 enum ucg_group_member_distance first_distance = UCG_GROUP_MEMBER_DISTANCE_SELF;
263
264 /* Go over member distances, filling the per-phase member lists */
265 for (member_idx = 0; member_idx < params->group_params->member_count; member_idx++) {
266 enum ucg_group_member_distance next_distance =
267 params->group_params->distance[member_idx];
268 ucs_assert(next_distance < UCG_GROUP_MEMBER_DISTANCE_LAST);
269 if (next_distance <= UCG_GROUP_MEMBER_DISTANCE_HOST) {
270 (*ppn)++; // TODO: fix support for "non-full-nodes" allocation...
271 if (ucs_unlikely(next_distance == UCG_GROUP_MEMBER_DISTANCE_SELF)) {
272 ucs_assert(member_idx == params->group_params->member_index);
273 *my_idx = member_idx;
274 }
275 }
276 }
277 ucs_assert(member_idx != (ucg_group_member_index_t)-1);
278
279 /* If there's a small number of cores per socket - no use in 2-levels... */
280 int is_single_level = (*ppn < socket_threshold);
281
282 /* Go over potential parents, filling the per-phase member lists */
283 for (member_idx = 0; member_idx < *my_idx; member_idx++) {
284 enum ucg_group_member_distance next_distance =
285 params->group_params->distance[member_idx];
286
287 /* If per-socket level is disabled - treat all local ranks the same */
288 if (is_single_level && (next_distance == UCG_GROUP_MEMBER_DISTANCE_SOCKET)) {
289 next_distance = UCG_GROUP_MEMBER_DISTANCE_HOST;
290 }
291
292 if (up_distance > next_distance) {
293 /* Replace parent, possibly "demoting" myself */
294 up_distance = next_distance;
295 *master_phase = next_distance - 1;
296 up[0] = member_idx;
297 up_cnt = 1;
298 /**
299 * Note: in the "multi-root" case, members #1,2,3... are likely
300 * on the same host, and do not make for good tree roots. To
301 * address this we change the root selection - inside the call
302 * to @ref ucg_topo_fabric_calc .
303 */
304 }
305 }
306
307 /* Go over potential children, filling the per-phase member lists */
308 for (member_idx++; member_idx < params->group_params->member_count; member_idx++) {
309 enum ucg_group_member_distance next_distance =
310 params->group_params->distance[member_idx];
311 /* If per-socket level is disabled - treat all local ranks the same */
312 if (is_single_level && (next_distance == UCG_GROUP_MEMBER_DISTANCE_SOCKET)) {
313 next_distance = UCG_GROUP_MEMBER_DISTANCE_HOST;
314 }
315
316 /* Possibly add the next member to my list according to its distance */
317 if ((next_distance > down_distance) &&
318 (next_distance <= *master_phase) &&
319 (next_distance < UCG_GROUP_MEMBER_DISTANCE_NET)) {
320 down_distance = next_distance;
321 if (first_distance == UCG_GROUP_MEMBER_DISTANCE_SELF) {
322 first_distance = down_distance;
323 } else {
324 first_distance = UCG_GROUP_MEMBER_DISTANCE_LAST;
325 }
326 down[down_cnt++] = down[0];
327 down[0] = member_idx;
328 if (down_cnt == UCG_BUILTIN_TREE_MAX_RADIX) {
329 goto limit_exceeded;
330 }
331 } else if (next_distance == first_distance) {
332 down[down_cnt++] = member_idx;
333 if (down_cnt == UCG_BUILTIN_TREE_MAX_RADIX) {
334 goto limit_exceeded;
335 }
336 }
337 }
338
339 /* Make corrections for non-zero root */
340 if (params->root != 0) {
341 /* If the new root is my child - omit him */
342 for (member_idx = 0; member_idx < down_cnt; member_idx++) {
343 if (down[member_idx] == params->root) {
344 down[member_idx] = down[--down_cnt];
345 break;
346 }
347 }
348
349 /* If I'm the new root - expect also a message from the "old" root (0) */
350 if (*my_idx == params->root) {
351 up_cnt = 0;
352 if (params->group_params->distance[0] < UCG_GROUP_MEMBER_DISTANCE_NET) {
353 down[down_cnt++] = 0;
354 }
355 }
356 }
357
358 *final_up_cnt = up_cnt;
359 *final_down_cnt = down_cnt;
360 return UCS_OK;
361
362 limit_exceeded:
363 ucs_error("Internal PPN limit (%i) exceeded", UCG_BUILTIN_TREE_MAX_RADIX);
364 return UCS_ERR_UNSUPPORTED;
365 }
366
ucg_builtin_tree_add_inter(const ucg_builtin_tree_params_t * params,ucg_group_member_index_t my_idx,ucg_group_member_index_t ppn,ucg_group_member_index_t * up,unsigned * up_cnt,ucg_group_member_index_t * down,unsigned * down_cnt)367 static ucs_status_t ucg_builtin_tree_add_inter(const ucg_builtin_tree_params_t *params,
368 ucg_group_member_index_t my_idx,
369 ucg_group_member_index_t ppn,
370 ucg_group_member_index_t *up,
371 unsigned *up_cnt,
372 ucg_group_member_index_t *down,
373 unsigned *down_cnt)
374 {
375 /* Calculate fabric peers up and down, assuming uniform distribution */
376 ucg_group_member_index_t limit = params->group_params->member_count;
377 ucg_group_member_index_t radix = params->config->radix;
378 ucg_group_member_index_t inner_range = ppn;
379 ucg_group_member_index_t outer_range = ppn * radix;
380 ucg_group_member_index_t root, inner_idx, outer_idx;
381
382 /* The OpenMPI default is to allocate "by node", so that each host gets
383 * <PPN> consequtive rank numbers assigned to it. Alternatively, it is
384 * possible to support different patterns, where there rank numbers are
385 * assigned in a "round-robin" fashion. */
386 int consecutive_ranks = 1;
387 if (!consecutive_ranks) {
388 limit = limit / ppn;
389 ppn = 1; /* from now on - ignore non-host-master members. */
390 }
391
392 /* Calculate the tree */
393 do {
394 for (outer_idx = 0; outer_idx < limit; outer_idx += outer_range) {
395 root = (outer_range < limit) ? outer_idx : params->root;
396 for (inner_idx = outer_idx;
397 (inner_idx < outer_idx + outer_range) && (inner_idx < limit);
398 inner_idx += inner_range) {
399 if (my_idx == inner_idx) {
400 if (my_idx == root) {
401 continue;
402 }
403 up[(*up_cnt)++] = root;
404 if (*up_cnt == UCG_BUILTIN_TREE_MAX_RADIX) {
405 goto limit_exceeded;
406 }
407 } else if (my_idx == root) {
408 down[(*down_cnt)++] = inner_idx;
409 if (*down_cnt == UCG_BUILTIN_TREE_MAX_RADIX) {
410 goto limit_exceeded;
411 }
412 }
413 }
414 }
415 inner_range *= radix;
416 outer_range *= radix;
417 } while (outer_range < (limit * radix));
418 return UCS_OK;
419
420 limit_exceeded:
421 ucs_error("Internal PPN limit (%i) exceeded", UCG_BUILTIN_TREE_MAX_RADIX);
422 return UCS_ERR_UNSUPPORTED;
423 }
424
425
ucg_builtin_tree_build(const ucg_builtin_tree_params_t * params,ucg_builtin_topo_tree_root_phase_t * root,ucg_builtin_plan_t * tree)426 static ucs_status_t ucg_builtin_tree_build(const ucg_builtin_tree_params_t *params,
427 ucg_builtin_topo_tree_root_phase_t *root,
428 ucg_builtin_plan_t *tree)
429 {
430 ucg_group_member_index_t net_up[UCG_BUILTIN_TREE_MAX_RADIX] = {0};
431 ucg_group_member_index_t net_down[UCG_BUILTIN_TREE_MAX_RADIX] = {0};
432 ucg_group_member_index_t host_up[UCG_BUILTIN_TREE_MAX_RADIX] = {0};
433 ucg_group_member_index_t host_down[UCG_BUILTIN_TREE_MAX_RADIX] = {0};
434
435 unsigned ppn = 0;
436 unsigned net_up_cnt = 0;
437 unsigned net_down_cnt = 0;
438 unsigned host_up_cnt = 0;
439 unsigned host_down_cnt = 0;
440
441 /**
442 * "Master phase" would be the highest phase this member would be the master of.
443 * By the end of this function, the master_phase represents the type of node
444 * in the topology tree - one of the following:
445 *
446 * UCG_GROUP_MEMBER_DISTANCE_SELF:
447 * no children at all, father is socket-master.
448 *
449 * UCG_GROUP_MEMBER_DISTANCE_SOCKET:
450 * socket-master, possible children on socket, father is host-master.
451 *
452 * UCG_GROUP_MEMBER_DISTANCE_HOST:
453 * socket-and-host-master, possible socket-master children
454 * (one on each of the rest of the sockets) and other host-masters,
455 * father is a (single) host-master.
456 *
457 * UCG_GROUP_MEMBER_DISTANCE_NET:
458 * fabric(-and-host-and-socket)-master, possible children of all
459 * types (but it's his own socket-and-host-master), fathers can be
460 * a list of fabric-masters in a multi-root formation (topmost
461 * phase of each collective is all-to-all between fabric-masters).
462 */
463 enum ucg_group_member_distance master_phase = UCG_GROUP_MEMBER_DISTANCE_NET;
464 ucs_status_t status = ucg_builtin_tree_add_intra(params, &tree->super.my_index,
465 &ppn, host_up, &host_up_cnt, host_down, &host_down_cnt, &master_phase);
466 if (ucs_unlikely(status != UCS_OK)) {
467 return status;
468 }
469
470 /* Network peers calculation */
471 if ((master_phase >= UCG_GROUP_MEMBER_DISTANCE_HOST) &&
472 (ppn < params->group_params->member_count)) {
473 host_up_cnt = 0; /* ignore fake parent of index 0 */
474 status = ucg_builtin_tree_add_inter(params,
475 tree->super.my_index, ppn, net_up, &net_up_cnt,
476 net_down, &net_down_cnt);
477 if (ucs_unlikely(status != UCS_OK)) {
478 return status;
479 }
480 }
481
482 /* Some output, for informational purposes */
483 unsigned member_idx;
484 for (member_idx = 0; member_idx < host_up_cnt; member_idx++) {
485 ucs_info("%lu's tree (host) parent #%u/%u: %lu ", tree->super.my_index,
486 member_idx + 1, host_up_cnt, host_up[member_idx]);
487 }
488 for (member_idx = 0; member_idx < net_up_cnt; member_idx++) {
489 ucs_info("%lu's tree (net) parent #%u/%u: %lu ", tree->super.my_index,
490 member_idx + 1, net_up_cnt, net_up[member_idx]);
491 }
492 for (member_idx = 0; member_idx < net_down_cnt; member_idx++) {
493 ucs_info("%lu's tree (net) child #%u/%u: %lu ", tree->super.my_index,
494 member_idx + 1, net_down_cnt, net_down[member_idx]);
495 }
496 for (member_idx = 0; member_idx < host_down_cnt; member_idx++) {
497 ucs_info("%lu's tree (host) child #%u/%u: %lu ", tree->super.my_index,
498 member_idx + 1, host_down_cnt, host_down[member_idx]);
499 }
500
501 /* fill in the tree phases while establishing the connections */
502 return ucg_builtin_tree_connect(tree, root, params, 0,
503 (uct_ep_h*)(&tree->phss[MAX_PHASES]),
504 host_up, host_up_cnt, net_up, net_up_cnt,
505 net_down, net_down_cnt, host_down, host_down_cnt);
506 }
507
ucg_builtin_tree_create(ucg_builtin_group_ctx_t * ctx,const ucg_builtin_plan_topology_t * topology,const ucg_builtin_config_t * config,const ucg_group_params_t * group_params,const ucg_collective_type_t * coll_type,ucg_builtin_plan_t ** plan_p)508 ucs_status_t ucg_builtin_tree_create(ucg_builtin_group_ctx_t *ctx,
509 const ucg_builtin_plan_topology_t *topology,
510 const ucg_builtin_config_t *config,
511 const ucg_group_params_t *group_params,
512 const ucg_collective_type_t *coll_type,
513 ucg_builtin_plan_t **plan_p)
514 {
515 /* Allocate worst-case memory footprint, resized down later */
516 ucg_builtin_plan_t *tree =
517 (ucg_builtin_plan_t*)UCS_ALLOC_CHECK(ALLOC_SIZE(UCG_BUILTIN_TREE_MAX_RADIX), "buitin_tree");
518 ucs_list_head_init(&tree->by_root);
519 tree->phs_cnt = 0;
520 tree->ep_cnt = 0;
521
522 /* tree discovery and construction, by phase */
523 ucg_builtin_tree_params_t params = {
524 .ctx = ctx,
525 .topology = topology,
526 .coll_type = coll_type,
527 .group_params = group_params,
528 .config = &config->tree,
529 .root = 0
530 };
531 ucs_status_t status = ucg_builtin_tree_build(¶ms, NULL, tree);
532 if (status != UCS_OK) {
533 return status;
534 }
535
536 /* Reduce the allocation size according to actual usage */
537 *plan_p = (ucg_builtin_plan_t*)ucs_realloc(tree, ALLOC_SIZE(tree->ep_cnt), "buitin_tree");
538 ucs_assert(*plan_p != NULL); /* only reduces size - should never fail */
539 return UCS_OK;
540 }
541
ucg_builtin_topo_tree_set_root(ucg_group_member_index_t root,ucg_group_member_index_t my_index,ucg_builtin_plan_t * plan,ucg_builtin_plan_phase_t ** first_phase_p,unsigned * phase_count_p)542 ucs_status_t ucg_builtin_topo_tree_set_root(ucg_group_member_index_t root,
543 ucg_group_member_index_t my_index,
544 ucg_builtin_plan_t *plan,
545 ucg_builtin_plan_phase_t **first_phase_p,
546 unsigned *phase_count_p)
547 {
548 ucs_assert(root != 0);
549
550 /* Check if I'm rank #0, the original root of the plan */
551 ucg_builtin_topo_tree_root_phase_t *root_phase;
552 /* Search for a previously prepared step - by root rank number */
553 ucs_list_for_each(root_phase, &plan->by_root, list) {
554 if (root_phase->root == root) {
555 goto phase_found;
556 }
557 }
558
559 /* Extend the tree to allow for additional endpoints */
560 ucg_builtin_plan_t *tree = ucs_realloc(plan,
561 ALLOC_SIZE(UCG_BUILTIN_TREE_MAX_RADIX), "nonzero_root");
562 if (tree == NULL) {
563 return UCS_ERR_NO_MEMORY;
564 }
565
566 /* Create a new descriptor for this phase */
567 root_phase = UCS_ALLOC_CHECK(sizeof(*root_phase), "ucg_builtin_root_phase");
568 ucs_list_add_head(&plan->by_root, &root_phase->list);
569 root_phase->root = root;
570
571 /* Build new phases in addition to the existing tree, for this new root */
572 const ucg_builtin_tree_params_t *params =
573 (ucg_builtin_tree_params_t*)&plan->phss[plan->phs_cnt-1];
574 ucs_status_t status = ucg_builtin_tree_build(params, root_phase, plan);
575 if (status != UCS_OK) {
576 return UCS_OK;
577 }
578
579 /* Reduce the allocation size according to actual usage */
580 tree = (ucg_builtin_plan_t*)ucs_realloc(tree, ALLOC_SIZE(tree->ep_cnt), "buitin_tree");
581 ucs_assert(tree != NULL); /* only reduces size - should never fail */
582
583 phase_found:
584 *first_phase_p = &root_phase->phss[0];
585 *phase_count_p = root_phase->phs_cnt;
586 return UCS_OK;
587 }
588