1 /*
2  * Copyright (C) Huawei Technologies Co., Ltd. 2019.  ALL RIGHTS RESERVED.
3  * See file LICENSE for terms.
4  */
5 
6 #include <math.h>
7 #include <ucs/debug/log.h>
8 #include <ucs/debug/assert.h>
9 #include <ucs/debug/memtrack.h>
10 #include <uct/api/uct_def.h>
11 
12 #include "builtin_plan.h"
13 
14 #define MAX_PHASES (5)
15 #define ALLOC_SIZE(ep_cnt) (sizeof(ucg_builtin_plan_t) + (MAX_PHASES * \
16     (sizeof(ucg_builtin_plan_phase_t) + ((ep_cnt) * sizeof(uct_ep_h)))))
17 
18 ucs_config_field_t ucg_builtin_tree_config_table[] = {
19     {"RADIX", "8", "Tree radix, for inter-node trees.\n",
20      ucs_offsetof(ucg_builtin_tree_config_t, radix), UCS_CONFIG_TYPE_UINT},
21 
22     {"SOCKET_LEVEL_PPN_THRESH", "16",
23      "Threshold for switching from 1-level to 2-level intra-node tree.\n",
24      ucs_offsetof(ucg_builtin_tree_config_t, sock_thresh), UCS_CONFIG_TYPE_UINT},
25 
26     // TODO: add multi-root configuration
27 
28     {NULL}
29 };
30 
ucg_builtin_tree_connect_phase(ucg_builtin_plan_phase_t * phase,const ucg_builtin_tree_params_t * params,ucg_step_idx_t step_index,uct_ep_h ** eps,ucg_group_member_index_t * peers,unsigned peer_cnt,enum ucg_builtin_plan_method_type method,enum ucg_plan_connect_flags coll_flags)31 static inline ucs_status_t ucg_builtin_tree_connect_phase(ucg_builtin_plan_phase_t *phase,
32                                                           const ucg_builtin_tree_params_t *params,
33                                                           ucg_step_idx_t step_index,
34                                                           uct_ep_h **eps,
35                                                           ucg_group_member_index_t *peers,
36                                                           unsigned peer_cnt,
37                                                           enum ucg_builtin_plan_method_type method,
38                                                           enum ucg_plan_connect_flags coll_flags)
39 {
40     int is_mock = params->coll_type->modifiers & UCG_GROUP_COLLECTIVE_MODIFIER_MOCK_EPS;
41 
42     ucs_assert(peer_cnt > 0);
43     if ((peer_cnt == 1) || coll_flags) {
44         ucg_group_member_index_t peer = peers[0];
45         if (coll_flags & (UCG_PLAN_CONNECT_FLAG_WANT_INCAST |
46                           UCG_PLAN_CONNECT_FLAG_WANT_BCAST)) {
47             /* For some methods, e.g. REDUCE_TERMINAL, the peer is the message
48              * source and not the destination, so we need to switch to "loopback" */
49             if (((coll_flags & UCG_PLAN_CONNECT_FLAG_WANT_INCAST) &&
50                  (method != UCG_PLAN_METHOD_SEND_TERMINAL)) ||
51                 ((coll_flags & UCG_PLAN_CONNECT_FLAG_WANT_BCAST) &&
52                  (method != UCG_PLAN_METHOD_RECV_TERMINAL))) {
53                 peer = params->root; /* Should be myself... TODO: validate! */
54             }
55         }
56 
57         ucs_status_t status = ucg_builtin_single_connection_phase(params->ctx,
58                 peer, step_index, method, coll_flags, phase, is_mock);
59         if ((status == UCS_OK) || !coll_flags) {
60             return status;
61         }
62     }
63 
64 
65     if (peer_cnt > 1) {
66         phase->multi_eps = *eps;
67         *eps            += peer_cnt;
68     }
69     phase->ep_cnt        = peer_cnt;
70     phase->step_index    = step_index;
71     phase->method        = method;
72 
73 #if ENABLE_DEBUG_DATA || ENABLE_FAULT_TOLERANCE
74     phase->indexes       = UCS_ALLOC_CHECK(peer_cnt * sizeof(*peers),
75                                            "tree topology indexes");
76 #endif
77 
78     /* connect every endpoint, by group member index */
79     unsigned idx;
80     ucs_status_t status = UCS_OK;
81     for (idx = 0; (idx < peer_cnt) && (status == UCS_OK); idx++, peers++) {
82         status = ucg_builtin_connect(params->ctx, *peers, phase, idx, 0, is_mock);
83     }
84     return status;
85 }
86 
ucg_builtin_tree_connect(ucg_builtin_plan_t * tree,ucg_builtin_topo_tree_root_phase_t * root,const ucg_builtin_tree_params_t * params,ucg_step_idx_t step_offset,uct_ep_h * first_ep,ucg_group_member_index_t * host_up,unsigned host_up_cnt,ucg_group_member_index_t * net_up,unsigned net_up_cnt,ucg_group_member_index_t * net_down,unsigned net_down_cnt,ucg_group_member_index_t * host_down,unsigned host_down_cnt)87 ucs_status_t ucg_builtin_tree_connect(ucg_builtin_plan_t *tree,
88                                       ucg_builtin_topo_tree_root_phase_t *root,
89                                       const ucg_builtin_tree_params_t *params,
90                                       ucg_step_idx_t step_offset,
91                                       uct_ep_h *first_ep,
92                                       ucg_group_member_index_t *host_up,
93                                       unsigned host_up_cnt,
94                                       ucg_group_member_index_t *net_up,
95                                       unsigned net_up_cnt,
96                                       ucg_group_member_index_t *net_down,
97                                       unsigned net_down_cnt,
98                                       ucg_group_member_index_t *host_down,
99                                       unsigned host_down_cnt)
100 {
101     unsigned idx;
102     ucs_status_t status               = UCS_OK;
103     enum ucg_collective_modifiers mod = params->coll_type->modifiers;
104     uct_ep_h *iter_eps                = first_ep;
105     ucg_builtin_plan_phase_t *phase   = root ? root->phss : tree->phss;
106     ucg_step_idx_t *phs_cnt           = root ? &root->phs_cnt : &tree->phs_cnt;
107     phase                            += *phs_cnt;
108 
109     enum ucg_builtin_plan_method_type fanin_method, fanout_method;
110     ucs_assert(host_up_cnt + host_down_cnt + net_up_cnt + net_down_cnt < UCG_BUILTIN_TREE_MAX_RADIX);
111     ucs_assert(host_up_cnt + host_down_cnt + net_up_cnt + net_down_cnt > 0);
112     // TODO: ucs_assert(up_cnt == (params->coll_type->root == params->me));
113 
114     enum ucg_builtin_plan_topology_type topo_type = params->topology->type;
115     switch (topo_type) {
116     case UCG_PLAN_TREE_FANIN:
117     case UCG_PLAN_TREE_FANIN_FANOUT:
118         /* Create a phase for inter-node communication ("up the tree") */
119         if (host_down_cnt) {
120             fanin_method =  (mod & UCG_GROUP_COLLECTIVE_MODIFIER_AGGREGATE) ?
121                     (host_up_cnt ? UCG_PLAN_METHOD_REDUCE_WAYPOINT :
122                                    UCG_PLAN_METHOD_REDUCE_TERMINAL):
123                     (host_up_cnt ? UCG_PLAN_METHOD_GATHER_WAYPOINT :
124                                    UCG_PLAN_METHOD_RECV_TERMINAL);
125         } else {
126             fanin_method  = UCG_PLAN_METHOD_SEND_TERMINAL;
127         }
128 
129         if (host_up_cnt + host_down_cnt) {
130             /* Add the host-level parent to the end of the host-array, so that
131              * forwarding the message to the parent is all included in one step */
132             if (host_up_cnt) host_down[host_down_cnt++] = host_up[0];
133             status = ucg_builtin_tree_connect_phase(phase++, params, step_offset,
134                     &iter_eps, host_down, host_down_cnt, fanin_method,
135                     UCG_PLAN_CONNECT_FLAG_WANT_INTRANODE |
136                     UCG_PLAN_CONNECT_FLAG_WANT_INCAST);
137             (*phs_cnt)++;
138             if (status != UCS_OK) {
139                 break;
140             }
141         }
142 
143         /* Create a phase for intra-node communication ("up the tree") */
144         if (net_down_cnt) {
145             fanin_method = (mod & UCG_GROUP_COLLECTIVE_MODIFIER_AGGREGATE) ?
146                     (net_up_cnt ? UCG_PLAN_METHOD_REDUCE_WAYPOINT :
147                                   UCG_PLAN_METHOD_REDUCE_TERMINAL):
148                     (net_up_cnt ? UCG_PLAN_METHOD_GATHER_WAYPOINT :
149                                   UCG_PLAN_METHOD_RECV_TERMINAL);
150         } else {
151             fanin_method  = UCG_PLAN_METHOD_SEND_TERMINAL;
152         }
153 
154         if (net_up_cnt + net_down_cnt) {
155             /* Add the network-level parent to the end of the host-array, so that
156              * forwarding the message to the parent is all included in one step */
157             if (net_up_cnt) net_down[net_down_cnt++] = net_up[0];
158             status = ucg_builtin_tree_connect_phase(phase++, params, step_offset + 1,
159                     &iter_eps, net_down, net_down_cnt, fanin_method,
160                     UCG_PLAN_CONNECT_FLAG_WANT_INTERNODE |
161                     UCG_PLAN_CONNECT_FLAG_WANT_INCAST);
162             (*phs_cnt)++;
163         }
164 
165         if ((topo_type == UCG_PLAN_TREE_FANIN) || (status != UCS_OK)) {
166             break;
167         }
168 
169         /* If I have a host-level or network-level parent (it was added to
170          * the end of the host-array) - now is the time to discard it by
171          * decrementing the counter for each host-array */
172         if (net_down_cnt) net_down_cnt--;
173         if (host_down_cnt) host_down_cnt--;
174         /* conditional break */
175     case UCG_PLAN_TREE_FANOUT:
176         /* Create a phase for inter-node communication ("up the tree") */
177         if (net_down_cnt) {
178             fanout_method = (mod & UCG_GROUP_COLLECTIVE_MODIFIER_BROADCAST) ?
179                      (net_up_cnt ? UCG_PLAN_METHOD_BCAST_WAYPOINT :
180                                    UCG_PLAN_METHOD_SEND_TERMINAL):
181                      (net_up_cnt ? UCG_PLAN_METHOD_SCATTER_WAYPOINT:
182                                    UCG_PLAN_METHOD_SCATTER_TERMINAL);
183         } else {
184             fanout_method = UCG_PLAN_METHOD_RECV_TERMINAL;
185         }
186 
187         if (net_up_cnt + net_down_cnt) {
188             for (idx = 0; idx < net_down_cnt; idx++, net_up_cnt++) {
189                 if (net_up_cnt == UCG_BUILTIN_TREE_MAX_RADIX) {
190                     return UCS_ERR_BUFFER_TOO_SMALL;
191                 }
192                 net_up[net_up_cnt] = net_down[idx];
193             }
194             status = ucg_builtin_tree_connect_phase(phase++, params, step_offset + 2,
195                     &iter_eps, net_up, net_up_cnt, fanout_method,
196                     UCG_PLAN_CONNECT_FLAG_WANT_INTERNODE |
197                     UCG_PLAN_CONNECT_FLAG_WANT_BCAST);
198             (*phs_cnt)++;
199             if (status != UCS_OK) {
200                 break;
201             }
202         }
203 
204         /* Create a phase for intra-node communication ("down the tree") */
205         if (host_down_cnt) {
206             fanout_method = (mod & UCG_GROUP_COLLECTIVE_MODIFIER_BROADCAST) ?
207                     (host_up_cnt ? UCG_PLAN_METHOD_BCAST_WAYPOINT :
208                                    UCG_PLAN_METHOD_SEND_TERMINAL):
209                     (host_up_cnt ? UCG_PLAN_METHOD_SCATTER_WAYPOINT:
210                                    UCG_PLAN_METHOD_SCATTER_TERMINAL);
211         } else {
212             fanout_method = UCG_PLAN_METHOD_RECV_TERMINAL;
213         }
214 
215         if (host_up_cnt + host_down_cnt) {
216             for (idx = 0; idx < host_down_cnt; idx++, host_up_cnt++) {
217                 if (host_up_cnt == UCG_BUILTIN_TREE_MAX_RADIX) {
218                     return UCS_ERR_BUFFER_TOO_SMALL;
219                 }
220                 host_up[host_up_cnt] = host_down[idx];
221             }
222             status = ucg_builtin_tree_connect_phase(phase++, params, step_offset + 3,
223                     &iter_eps, host_up, host_up_cnt, fanout_method,
224                     UCG_PLAN_CONNECT_FLAG_WANT_INTRANODE |
225                     UCG_PLAN_CONNECT_FLAG_WANT_BCAST);
226             (*phs_cnt)++;
227         }
228         break;
229     default:
230         status = UCS_ERR_INVALID_PARAM;
231         break;
232     }
233 
234     if (status != UCS_OK) {
235         free(tree);
236         return status;
237     }
238 
239     /* For the sake of later creation of non-zero-rooted trees - store params */
240     if (!step_offset) {
241         memcpy(phase, params, sizeof(*params));
242         tree->ep_cnt = iter_eps - first_ep;
243     }
244     return UCS_OK;
245 }
246 
ucg_builtin_tree_add_intra(const ucg_builtin_tree_params_t * params,ucg_group_member_index_t * my_idx,unsigned * ppn,ucg_group_member_index_t * up,unsigned * final_up_cnt,ucg_group_member_index_t * down,unsigned * final_down_cnt,enum ucg_group_member_distance * master_phase)247 ucs_status_t ucg_builtin_tree_add_intra(const ucg_builtin_tree_params_t *params,
248                                         ucg_group_member_index_t *my_idx,
249                                         unsigned *ppn,
250                                         ucg_group_member_index_t *up,
251                                         unsigned *final_up_cnt,
252                                         ucg_group_member_index_t *down,
253                                         unsigned *final_down_cnt,
254                                         enum ucg_group_member_distance *master_phase)
255 {
256     unsigned up_cnt = 0;
257     unsigned down_cnt = 0;
258     ucg_group_member_index_t member_idx = (ucg_group_member_index_t)-1;
259     ucg_group_member_index_t socket_threshold = params->config->sock_thresh;
260     enum ucg_group_member_distance up_distance = UCG_GROUP_MEMBER_DISTANCE_LAST;
261     enum ucg_group_member_distance down_distance = UCG_GROUP_MEMBER_DISTANCE_SELF;
262     enum ucg_group_member_distance first_distance = UCG_GROUP_MEMBER_DISTANCE_SELF;
263 
264     /* Go over member distances, filling the per-phase member lists */
265     for (member_idx = 0; member_idx < params->group_params->member_count; member_idx++) {
266         enum ucg_group_member_distance next_distance =
267                 params->group_params->distance[member_idx];
268         ucs_assert(next_distance < UCG_GROUP_MEMBER_DISTANCE_LAST);
269         if (next_distance <= UCG_GROUP_MEMBER_DISTANCE_HOST) {
270             (*ppn)++; // TODO: fix support for "non-full-nodes" allocation...
271             if (ucs_unlikely(next_distance == UCG_GROUP_MEMBER_DISTANCE_SELF)) {
272                 ucs_assert(member_idx == params->group_params->member_index);
273                 *my_idx = member_idx;
274             }
275         }
276     }
277     ucs_assert(member_idx != (ucg_group_member_index_t)-1);
278 
279     /* If there's a small number of cores per socket - no use in 2-levels... */
280     int is_single_level = (*ppn < socket_threshold);
281 
282     /* Go over potential parents, filling the per-phase member lists */
283     for (member_idx = 0; member_idx < *my_idx; member_idx++) {
284         enum ucg_group_member_distance next_distance =
285                 params->group_params->distance[member_idx];
286 
287         /* If per-socket level is disabled - treat all local ranks the same */
288         if (is_single_level && (next_distance == UCG_GROUP_MEMBER_DISTANCE_SOCKET)) {
289             next_distance = UCG_GROUP_MEMBER_DISTANCE_HOST;
290         }
291 
292         if (up_distance > next_distance) {
293             /* Replace parent, possibly "demoting" myself */
294             up_distance  = next_distance;
295             *master_phase = next_distance - 1;
296             up[0] = member_idx;
297             up_cnt = 1;
298             /**
299              * Note: in the "multi-root" case, members #1,2,3... are likely
300              * on the same host, and do not make for good tree roots. To
301              * address this we change the root selection - inside the call
302              * to @ref ucg_topo_fabric_calc .
303              */
304         }
305     }
306 
307     /* Go over potential children, filling the per-phase member lists */
308     for (member_idx++; member_idx < params->group_params->member_count; member_idx++) {
309         enum ucg_group_member_distance next_distance =
310                 params->group_params->distance[member_idx];
311         /* If per-socket level is disabled - treat all local ranks the same */
312         if (is_single_level && (next_distance == UCG_GROUP_MEMBER_DISTANCE_SOCKET)) {
313             next_distance = UCG_GROUP_MEMBER_DISTANCE_HOST;
314         }
315 
316         /* Possibly add the next member to my list according to its distance */
317         if ((next_distance > down_distance) &&
318             (next_distance <= *master_phase) &&
319             (next_distance < UCG_GROUP_MEMBER_DISTANCE_NET)) {
320             down_distance = next_distance;
321             if (first_distance == UCG_GROUP_MEMBER_DISTANCE_SELF) {
322                 first_distance = down_distance;
323             } else {
324                 first_distance = UCG_GROUP_MEMBER_DISTANCE_LAST;
325             }
326             down[down_cnt++] = down[0];
327             down[0]          = member_idx;
328             if (down_cnt == UCG_BUILTIN_TREE_MAX_RADIX) {
329                 goto limit_exceeded;
330             }
331         } else if (next_distance == first_distance) {
332             down[down_cnt++] = member_idx;
333             if (down_cnt == UCG_BUILTIN_TREE_MAX_RADIX) {
334                 goto limit_exceeded;
335             }
336         }
337     }
338 
339     /* Make corrections for non-zero root */
340     if (params->root != 0) {
341         /* If the new root is my child - omit him */
342         for (member_idx = 0; member_idx < down_cnt; member_idx++) {
343             if (down[member_idx] == params->root) {
344                 down[member_idx] = down[--down_cnt];
345                 break;
346             }
347         }
348 
349         /* If I'm the new root - expect also a message from the "old" root (0) */
350         if (*my_idx == params->root) {
351             up_cnt = 0;
352             if (params->group_params->distance[0] < UCG_GROUP_MEMBER_DISTANCE_NET) {
353                 down[down_cnt++] = 0;
354             }
355         }
356     }
357 
358     *final_up_cnt = up_cnt;
359     *final_down_cnt = down_cnt;
360     return UCS_OK;
361 
362 limit_exceeded:
363     ucs_error("Internal PPN limit (%i) exceeded", UCG_BUILTIN_TREE_MAX_RADIX);
364     return UCS_ERR_UNSUPPORTED;
365 }
366 
ucg_builtin_tree_add_inter(const ucg_builtin_tree_params_t * params,ucg_group_member_index_t my_idx,ucg_group_member_index_t ppn,ucg_group_member_index_t * up,unsigned * up_cnt,ucg_group_member_index_t * down,unsigned * down_cnt)367 static ucs_status_t ucg_builtin_tree_add_inter(const ucg_builtin_tree_params_t *params,
368                                                ucg_group_member_index_t my_idx,
369                                                ucg_group_member_index_t ppn,
370                                                ucg_group_member_index_t *up,
371                                                unsigned *up_cnt,
372                                                ucg_group_member_index_t *down,
373                                                unsigned *down_cnt)
374 {
375     /* Calculate fabric peers up and down, assuming uniform distribution */
376     ucg_group_member_index_t limit       = params->group_params->member_count;
377     ucg_group_member_index_t radix       = params->config->radix;
378     ucg_group_member_index_t inner_range = ppn;
379     ucg_group_member_index_t outer_range = ppn * radix;
380     ucg_group_member_index_t root, inner_idx, outer_idx;
381 
382     /* The OpenMPI default is to allocate "by node", so that each host gets
383      * <PPN> consequtive rank numbers assigned to it. Alternatively, it is
384      * possible to support different patterns, where there rank numbers are
385      * assigned in a "round-robin" fashion. */
386     int consecutive_ranks = 1;
387     if (!consecutive_ranks) {
388         limit = limit / ppn;
389         ppn   = 1; /* from now on - ignore non-host-master members. */
390     }
391 
392     /* Calculate the tree */
393     do {
394         for (outer_idx = 0; outer_idx < limit; outer_idx += outer_range) {
395             root = (outer_range < limit) ? outer_idx : params->root;
396             for (inner_idx = outer_idx;
397                 (inner_idx < outer_idx + outer_range) && (inner_idx < limit);
398                  inner_idx += inner_range) {
399                 if (my_idx == inner_idx) {
400                     if (my_idx == root) {
401                         continue;
402                     }
403                     up[(*up_cnt)++] = root;
404                     if (*up_cnt == UCG_BUILTIN_TREE_MAX_RADIX) {
405                         goto limit_exceeded;
406                     }
407                 } else if (my_idx == root) {
408                     down[(*down_cnt)++] = inner_idx;
409                     if (*down_cnt == UCG_BUILTIN_TREE_MAX_RADIX) {
410                         goto limit_exceeded;
411                     }
412                 }
413             }
414         }
415         inner_range *= radix;
416         outer_range *= radix;
417     } while (outer_range < (limit * radix));
418     return UCS_OK;
419 
420 limit_exceeded:
421     ucs_error("Internal PPN limit (%i) exceeded", UCG_BUILTIN_TREE_MAX_RADIX);
422     return UCS_ERR_UNSUPPORTED;
423 }
424 
425 
ucg_builtin_tree_build(const ucg_builtin_tree_params_t * params,ucg_builtin_topo_tree_root_phase_t * root,ucg_builtin_plan_t * tree)426 static ucs_status_t ucg_builtin_tree_build(const ucg_builtin_tree_params_t *params,
427                                            ucg_builtin_topo_tree_root_phase_t *root,
428                                            ucg_builtin_plan_t *tree)
429 {
430     ucg_group_member_index_t net_up[UCG_BUILTIN_TREE_MAX_RADIX] = {0};
431     ucg_group_member_index_t net_down[UCG_BUILTIN_TREE_MAX_RADIX] = {0};
432     ucg_group_member_index_t host_up[UCG_BUILTIN_TREE_MAX_RADIX] = {0};
433     ucg_group_member_index_t host_down[UCG_BUILTIN_TREE_MAX_RADIX] = {0};
434 
435     unsigned ppn = 0;
436     unsigned net_up_cnt = 0;
437     unsigned net_down_cnt = 0;
438     unsigned host_up_cnt = 0;
439     unsigned host_down_cnt = 0;
440 
441     /**
442      * "Master phase" would be the highest phase this member would be the master of.
443      * By the end of this function, the master_phase represents the type of node
444      * in the topology tree - one of the following:
445      *
446      * UCG_GROUP_MEMBER_DISTANCE_SELF:
447      *         no children at all, father is socket-master.
448      *
449      * UCG_GROUP_MEMBER_DISTANCE_SOCKET:
450      *         socket-master, possible children on socket, father is host-master.
451      *
452      * UCG_GROUP_MEMBER_DISTANCE_HOST:
453      *         socket-and-host-master, possible socket-master children
454      *         (one on each of the rest of the sockets) and other host-masters,
455      *         father is a (single) host-master.
456      *
457      * UCG_GROUP_MEMBER_DISTANCE_NET:
458      *         fabric(-and-host-and-socket)-master, possible children of all
459      *         types (but it's his own socket-and-host-master), fathers can be
460      *         a list of fabric-masters in a multi-root formation (topmost
461      *         phase of each collective is all-to-all between fabric-masters).
462      */
463     enum ucg_group_member_distance master_phase = UCG_GROUP_MEMBER_DISTANCE_NET;
464     ucs_status_t status = ucg_builtin_tree_add_intra(params, &tree->super.my_index,
465             &ppn, host_up, &host_up_cnt, host_down, &host_down_cnt, &master_phase);
466     if (ucs_unlikely(status != UCS_OK)) {
467         return status;
468     }
469 
470     /* Network peers calculation */
471     if ((master_phase >= UCG_GROUP_MEMBER_DISTANCE_HOST) &&
472         (ppn < params->group_params->member_count)) {
473         host_up_cnt = 0; /* ignore fake parent of index 0 */
474         status = ucg_builtin_tree_add_inter(params,
475                 tree->super.my_index, ppn, net_up, &net_up_cnt,
476                 net_down, &net_down_cnt);
477         if (ucs_unlikely(status != UCS_OK)) {
478             return status;
479         }
480     }
481 
482     /* Some output, for informational purposes */
483     unsigned member_idx;
484     for (member_idx = 0; member_idx < host_up_cnt; member_idx++) {
485         ucs_info("%lu's tree (host) parent #%u/%u: %lu ", tree->super.my_index,
486                 member_idx + 1, host_up_cnt, host_up[member_idx]);
487     }
488     for (member_idx = 0; member_idx < net_up_cnt; member_idx++) {
489         ucs_info("%lu's tree (net) parent #%u/%u: %lu ", tree->super.my_index,
490                 member_idx + 1, net_up_cnt, net_up[member_idx]);
491     }
492     for (member_idx = 0; member_idx < net_down_cnt; member_idx++) {
493         ucs_info("%lu's tree (net) child  #%u/%u: %lu ", tree->super.my_index,
494                 member_idx + 1, net_down_cnt, net_down[member_idx]);
495     }
496     for (member_idx = 0; member_idx < host_down_cnt; member_idx++) {
497         ucs_info("%lu's tree (host) child  #%u/%u: %lu ", tree->super.my_index,
498                 member_idx + 1, host_down_cnt, host_down[member_idx]);
499     }
500 
501     /* fill in the tree phases while establishing the connections */
502     return ucg_builtin_tree_connect(tree, root, params, 0,
503             (uct_ep_h*)(&tree->phss[MAX_PHASES]),
504             host_up, host_up_cnt, net_up, net_up_cnt,
505             net_down, net_down_cnt, host_down, host_down_cnt);
506 }
507 
ucg_builtin_tree_create(ucg_builtin_group_ctx_t * ctx,const ucg_builtin_plan_topology_t * topology,const ucg_builtin_config_t * config,const ucg_group_params_t * group_params,const ucg_collective_type_t * coll_type,ucg_builtin_plan_t ** plan_p)508 ucs_status_t ucg_builtin_tree_create(ucg_builtin_group_ctx_t *ctx,
509         const ucg_builtin_plan_topology_t *topology,
510         const ucg_builtin_config_t *config,
511         const ucg_group_params_t *group_params,
512         const ucg_collective_type_t *coll_type,
513         ucg_builtin_plan_t **plan_p)
514 {
515     /* Allocate worst-case memory footprint, resized down later */
516     ucg_builtin_plan_t *tree =
517             (ucg_builtin_plan_t*)UCS_ALLOC_CHECK(ALLOC_SIZE(UCG_BUILTIN_TREE_MAX_RADIX), "buitin_tree");
518     ucs_list_head_init(&tree->by_root);
519     tree->phs_cnt = 0;
520     tree->ep_cnt  = 0;
521 
522     /* tree discovery and construction, by phase */
523     ucg_builtin_tree_params_t params = {
524             .ctx          = ctx,
525             .topology     = topology,
526             .coll_type    = coll_type,
527             .group_params = group_params,
528             .config       = &config->tree,
529             .root         = 0
530     };
531     ucs_status_t status = ucg_builtin_tree_build(&params, NULL, tree);
532     if (status != UCS_OK) {
533         return status;
534     }
535 
536     /* Reduce the allocation size according to actual usage */
537     *plan_p = (ucg_builtin_plan_t*)ucs_realloc(tree, ALLOC_SIZE(tree->ep_cnt), "buitin_tree");
538     ucs_assert(*plan_p != NULL); /* only reduces size - should never fail */
539     return UCS_OK;
540 }
541 
ucg_builtin_topo_tree_set_root(ucg_group_member_index_t root,ucg_group_member_index_t my_index,ucg_builtin_plan_t * plan,ucg_builtin_plan_phase_t ** first_phase_p,unsigned * phase_count_p)542 ucs_status_t ucg_builtin_topo_tree_set_root(ucg_group_member_index_t root,
543                                             ucg_group_member_index_t my_index,
544                                             ucg_builtin_plan_t *plan,
545                                             ucg_builtin_plan_phase_t **first_phase_p,
546                                             unsigned *phase_count_p)
547 {
548     ucs_assert(root != 0);
549 
550     /* Check if I'm rank #0, the original root of the plan */
551     ucg_builtin_topo_tree_root_phase_t *root_phase;
552     /* Search for a previously prepared step - by root rank number */
553     ucs_list_for_each(root_phase, &plan->by_root, list) {
554         if (root_phase->root == root) {
555             goto phase_found;
556         }
557     }
558 
559     /* Extend the tree to allow for additional endpoints */
560     ucg_builtin_plan_t *tree = ucs_realloc(plan,
561             ALLOC_SIZE(UCG_BUILTIN_TREE_MAX_RADIX), "nonzero_root");
562     if (tree == NULL) {
563         return UCS_ERR_NO_MEMORY;
564     }
565 
566     /* Create a new descriptor for this phase */
567     root_phase = UCS_ALLOC_CHECK(sizeof(*root_phase), "ucg_builtin_root_phase");
568     ucs_list_add_head(&plan->by_root, &root_phase->list);
569     root_phase->root = root;
570 
571     /* Build new phases in addition to the existing tree, for this new root */
572     const ucg_builtin_tree_params_t *params =
573             (ucg_builtin_tree_params_t*)&plan->phss[plan->phs_cnt-1];
574     ucs_status_t status = ucg_builtin_tree_build(params, root_phase, plan);
575     if (status != UCS_OK) {
576         return UCS_OK;
577     }
578 
579     /* Reduce the allocation size according to actual usage */
580     tree = (ucg_builtin_plan_t*)ucs_realloc(tree, ALLOC_SIZE(tree->ep_cnt), "buitin_tree");
581     ucs_assert(tree != NULL); /* only reduces size - should never fail */
582 
583 phase_found:
584     *first_phase_p = &root_phase->phss[0];
585     *phase_count_p = root_phase->phs_cnt;
586     return UCS_OK;
587 }
588