1 /*
2  * Copyright (c) 2007-2011 Los Alamos National Security, LLC.
3  *                         All rights reserved.
4  * Copyright (c) 2004-2011 The University of Tennessee and The University
5  *                         of Tennessee Research Foundation.  All rights
6  *                         reserved.
7  * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
8  * $COPYRIGHT$
9  *
10  * Additional copyrights may follow
11  *
12  * $HEADER$
13  */
14 
15 #include "orte_config.h"
16 #include "orte/constants.h"
17 
18 #include "opal/dss/dss.h"
19 #include "opal/util/output.h"
20 
21 #include "orte/mca/errmgr/errmgr.h"
22 #include "orte/mca/rml/rml.h"
23 #include "orte/util/name_fns.h"
24 #include "orte/util/proc_info.h"
25 #include "orte/runtime/orte_globals.h"
26 #include "orte/runtime/data_type_support/orte_dt_support.h"
27 #include "orte/runtime/orte_wait.h"
28 
29 #include "orte/mca/rml/base/rml_contact.h"
30 
31 #include "orte/mca/routed/base/base.h"
32 #include "routed_direct.h"
33 
34 static int init(void);
35 static int finalize(void);
36 static int delete_route(orte_process_name_t *proc);
37 static int update_route(orte_process_name_t *target,
38                         orte_process_name_t *route);
39 static orte_process_name_t get_route(orte_process_name_t *target);
40 static int route_lost(const orte_process_name_t *route);
41 static bool route_is_defined(const orte_process_name_t *target);
42 static void update_routing_plan(void);
43 static void get_routing_list(opal_list_t *coll);
44 static int set_lifeline(orte_process_name_t *proc);
45 static size_t num_routes(void);
46 
47 #if OPAL_ENABLE_FT_CR == 1
48 static int direct_ft_event(int state);
49 #endif
50 
51 orte_routed_module_t orte_routed_direct_module = {
52     .initialize = init,
53     .finalize = finalize,
54     .delete_route = delete_route,
55     .update_route = update_route,
56     .get_route = get_route,
57     .route_lost = route_lost,
58     .route_is_defined = route_is_defined,
59     .set_lifeline = set_lifeline,
60     .update_routing_plan = update_routing_plan,
61     .get_routing_list = get_routing_list,
62     .num_routes = num_routes,
63 #if OPAL_ENABLE_FT_CR == 1
64     .ft_event = direct_ft_event
65 #else
66     NULL
67 #endif
68 };
69 
70 static orte_process_name_t mylifeline;
71 static orte_process_name_t *lifeline = NULL;
72 static opal_list_t my_children;
73 
init(void)74 static int init(void)
75 {
76     lifeline = NULL;
77 
78     if (ORTE_PROC_IS_DAEMON) {
79         ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid;
80         /* if we are using static ports, set my lifeline to point at my parent */
81         if (orte_static_ports) {
82             /* we will have been given our parent's vpid by MCA param */
83             lifeline = ORTE_PROC_MY_PARENT;
84         } else {
85             /* set our lifeline to the HNP - we will abort if that connection is lost */
86             lifeline = ORTE_PROC_MY_HNP;
87             ORTE_PROC_MY_PARENT->vpid = 0;
88         }
89     } else if (ORTE_PROC_IS_APP) {
90         /* if we don't have a designated daemon, just
91          * disqualify ourselves */
92         if (NULL == orte_process_info.my_daemon_uri) {
93             return ORTE_ERR_TAKE_NEXT_OPTION;
94         }
95         /* set our lifeline to the local daemon - we will abort if this connection is lost */
96         lifeline = ORTE_PROC_MY_DAEMON;
97         orte_routing_is_enabled = true;
98     }
99 
100     /* setup the list of children */
101     OBJ_CONSTRUCT(&my_children, opal_list_t);
102 
103     return ORTE_SUCCESS;
104 }
105 
finalize(void)106 static int finalize(void)
107 {
108     OPAL_LIST_DESTRUCT(&my_children);
109     return ORTE_SUCCESS;
110 }
111 
/*
 * Remove the route to proc.
 *
 * This module stores nothing per route, so the request is only traced.
 * Always returns ORTE_SUCCESS.
 */
static int delete_route(orte_process_name_t *proc)
{
    int rc = ORTE_SUCCESS;

    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                         "%s routed_direct_delete_route for %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));

    /* no state to clean up */
    return rc;
}
123 
/*
 * Record that messages for target should travel via route.
 *
 * This module stores nothing per route, so the request is only traced.
 * Always returns ORTE_SUCCESS.
 */
static int update_route(orte_process_name_t *target,
                        orte_process_name_t *route)
{
    int rc = ORTE_SUCCESS;

    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                         "%s routed_direct_update: %s --> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(target),
                         ORTE_NAME_PRINT(route)));

    /* no state to update */
    return rc;
}
137 
138 
/*
 * Determine the next hop for a message addressed to target.
 *
 * The decision is made in this order:
 *   - invalid jobid or vpid in target      -> invalid name
 *   - application process                  -> my daemon if a daemon URI
 *                                             is known, else the target
 *   - tool                                 -> the target when it is in my
 *                                             job family, else the HNP
 *                                             name derived from its jobid
 *   - HNP / daemon, target is the HNP      -> the HNP
 *   - HNP / daemon, hosting daemon unknown -> invalid name
 *   - HNP / daemon, I host the target      -> the target
 *   - otherwise                            -> the hosting daemon
 *
 * The chosen name is returned by value.
 */
static orte_process_name_t get_route(orte_process_name_t *target)
{
    orte_process_name_t hop;     /* scratch name for a daemon/HNP next hop */
    orte_process_name_t *next;   /* whichever name ends up being returned */

    /* a target without a valid jobid and vpid cannot be routed */
    if (ORTE_JOBID_INVALID == target->jobid ||
        ORTE_VPID_INVALID == target->vpid) {
        next = ORTE_NAME_INVALID;
        goto done;
    }

    /* seed the scratch name with my own daemon */
    hop.jobid = ORTE_PROC_MY_DAEMON->jobid;
    hop.vpid = ORTE_PROC_MY_DAEMON->vpid;

    if (ORTE_PROC_IS_APP) {
        /* an application that knows its daemon (i.e., a daemon launched
         * it) always goes thru that daemon; a direct-launched one has no
         * daemon and must talk to the target itself */
        next = (NULL != orte_process_info.my_daemon_uri) ? ORTE_PROC_MY_DAEMON
                                                         : target;
        goto done;
    }

    if (ORTE_PROC_IS_TOOL) {
        /* a tool goes direct within its own job family, and to the
         * target's HNP otherwise */
        if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) == ORTE_JOB_FAMILY(target->jobid)) {
            next = target;
        } else {
            ORTE_HNP_NAME_FROM_JOB(&hop, target->jobid);
            next = &hop;
        }
        goto done;
    }

    /******     HNP AND DAEMONS ONLY     ******/
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                    "%s routing direct to the HNP",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        next = ORTE_PROC_MY_HNP;
        goto done;
    }

    /* look up which daemon hosts the target */
    hop.jobid = ORTE_PROC_MY_NAME->jobid;
    hop.vpid = orte_get_proc_daemon_vpid(target);
    if (ORTE_VPID_INVALID == hop.vpid) {
        next = ORTE_NAME_INVALID;
        goto done;
    }

    if (hop.vpid == ORTE_PROC_MY_NAME->vpid) {
        /* I am the hosting daemon, so send straight to the target */
        next = target;
    } else {
        /* go directly to the hosting daemon */
        next = &hop;
    }

 done:
    OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                         "%s routed_direct_get(%s) --> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(target),
                         ORTE_NAME_PRINT(next)));

    return *next;
}
215 
route_lost(const orte_process_name_t * route)216 static int route_lost(const orte_process_name_t *route)
217 {
218     opal_list_item_t *item;
219     orte_routed_tree_t *child;
220 
221     OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
222                          "%s route to %s lost",
223                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
224                          ORTE_NAME_PRINT(route)));
225 
226     /* if we lose the connection to the lifeline and we are NOT already,
227      * in finalize, tell the OOB to abort.
228      * NOTE: we cannot call abort from here as the OOB needs to first
229      * release a thread-lock - otherwise, we will hang!!
230      */
231     if (!orte_finalizing &&
232         NULL != lifeline &&
233         OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) {
234         OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
235                              "%s routed:direct: Connection to lifeline %s lost",
236                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
237                              ORTE_NAME_PRINT(lifeline)));
238         return ORTE_ERR_FATAL;
239     }
240 
241     /* if we are the HNP, and the route is a daemon,
242      * see if it is one of our children - if so, remove it
243      */
244     if (ORTE_PROC_IS_HNP &&
245         route->jobid == ORTE_PROC_MY_NAME->jobid) {
246         for (item = opal_list_get_first(&my_children);
247              item != opal_list_get_end(&my_children);
248              item = opal_list_get_next(item)) {
249             child = (orte_routed_tree_t*)item;
250             if (child->vpid == route->vpid) {
251                 opal_list_remove_item(&my_children, item);
252                 OBJ_RELEASE(item);
253                 return ORTE_SUCCESS;
254             }
255         }
256     }
257 
258     /* we don't care about this one, so return success */
259     return ORTE_SUCCESS;
260 }
261 
262 
/*
 * Report whether a route to target exists.
 *
 * The argument is not examined: this module answers true for every target.
 */
static bool route_is_defined(const orte_process_name_t *target)
{
    (void)target;

    return true;
}
268 
/*
 * Make proc the lifeline.
 *
 * The name is copied into module-owned storage, so the caller's object
 * does not need to outlive this call. Always returns ORTE_SUCCESS.
 */
static int set_lifeline(orte_process_name_t *proc)
{
    OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                         "%s routed:direct: set lifeline to %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));

    /* keep a private copy and point the lifeline at it */
    mylifeline = *proc;
    lifeline = &mylifeline;

    return ORTE_SUCCESS;
}
279 
update_routing_plan(void)280 static void update_routing_plan(void)
281 {
282     orte_routed_tree_t *child;
283     int32_t i;
284     orte_job_t *jdata;
285     orte_proc_t *proc;
286 
287     OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
288                          "%s routed:direct: update routing plan",
289                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
290 
291     if (!ORTE_PROC_IS_HNP) {
292         /* nothing to do */
293         return;
294     }
295 
296     /* clear the current list */
297     OPAL_LIST_DESTRUCT(&my_children);
298     OBJ_CONSTRUCT(&my_children, opal_list_t);
299 
300     /* HNP is directly connected to each daemon */
301     if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
302         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
303         return;
304     }
305     for (i=1; i < jdata->procs->size; i++) {
306         if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
307             continue;
308         }
309         child = OBJ_NEW(orte_routed_tree_t);
310         child->vpid = proc->name.vpid;
311         opal_list_append(&my_children, &child->super);
312     }
313 
314     return;
315 }
316 
/*
 * Fill coll with the routing list for an xcast.
 *
 * Only daemons and the HNP are allowed to route; for every other
 * process type the request is meaningless and coll is left untouched.
 */
static void get_routing_list(opal_list_t *coll)
{
    OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                         "%s routed:direct: get routing list",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
        /* hand the work to the base helper, using our children list */
        orte_routed_base_xcast_routing(coll, &my_children);
    }
}
333 
num_routes(void)334 static size_t num_routes(void)
335 {
336     if (!ORTE_PROC_IS_HNP) {
337         return 0;
338     }
339     return opal_list_get_size(&my_children);
340 }
341 
342 #if OPAL_ENABLE_FT_CR == 1
direct_ft_event(int state)343 static int direct_ft_event(int state)
344 {
345     int ret, exit_status = ORTE_SUCCESS;
346 
347     /******** Checkpoint Prep ********/
348     if(OPAL_CRS_CHECKPOINT == state) {
349     }
350     /******** Continue Recovery ********/
351     else if (OPAL_CRS_CONTINUE == state ) {
352     }
353     else if (OPAL_CRS_TERM == state ) {
354         /* Nothing */
355     }
356     else {
357         /* Error state = Nothing */
358     }
359 
360  cleanup:
361     return exit_status;
362 }
363 #endif
364