1 /*
2 * Copyright (c) 2007-2011 Los Alamos National Security, LLC.
3 * All rights reserved.
4 * Copyright (c) 2004-2011 The University of Tennessee and The University
5 * of Tennessee Research Foundation. All rights
6 * reserved.
7 * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
8 * $COPYRIGHT$
9 *
10 * Additional copyrights may follow
11 *
12 * $HEADER$
13 */
14
15 #include "orte_config.h"
16 #include "orte/constants.h"
17
18 #include "opal/dss/dss.h"
19 #include "opal/util/output.h"
20
21 #include "orte/mca/errmgr/errmgr.h"
22 #include "orte/mca/rml/rml.h"
23 #include "orte/util/name_fns.h"
24 #include "orte/util/proc_info.h"
25 #include "orte/runtime/orte_globals.h"
26 #include "orte/runtime/data_type_support/orte_dt_support.h"
27 #include "orte/runtime/orte_wait.h"
28
29 #include "orte/mca/rml/base/rml_contact.h"
30
31 #include "orte/mca/routed/base/base.h"
32 #include "routed_direct.h"
33
/*
 * Forward declarations of the module entry points. Each one is
 * installed in the orte_routed_direct_module table below and is
 * defined later in this file.
 */
static int init(void);
static int finalize(void);
static int delete_route(orte_process_name_t *proc);
static int update_route(orte_process_name_t *target,
                        orte_process_name_t *route);
static orte_process_name_t get_route(orte_process_name_t *target);
static int route_lost(const orte_process_name_t *route);
static bool route_is_defined(const orte_process_name_t *target);
static void update_routing_plan(void);
static void get_routing_list(opal_list_t *coll);
static int set_lifeline(orte_process_name_t *proc);
static size_t num_routes(void);

/* the fault-tolerance hook only exists when FT/CR support is compiled in */
#if OPAL_ENABLE_FT_CR == 1
static int direct_ft_event(int state);
#endif
50
51 orte_routed_module_t orte_routed_direct_module = {
52 .initialize = init,
53 .finalize = finalize,
54 .delete_route = delete_route,
55 .update_route = update_route,
56 .get_route = get_route,
57 .route_lost = route_lost,
58 .route_is_defined = route_is_defined,
59 .set_lifeline = set_lifeline,
60 .update_routing_plan = update_routing_plan,
61 .get_routing_list = get_routing_list,
62 .num_routes = num_routes,
63 #if OPAL_ENABLE_FT_CR == 1
64 .ft_event = direct_ft_event
65 #else
66 NULL
67 #endif
68 };
69
/* backing storage for a lifeline name handed to set_lifeline() */
static orte_process_name_t mylifeline;
/* name compared against lost routes in route_lost(); NULL means no
 * lifeline has been selected */
static orte_process_name_t *lifeline = NULL;
/* list of orte_routed_tree_t entries; constructed in init() and
 * repopulated by update_routing_plan() when running as the HNP */
static opal_list_t my_children;
73
init(void)74 static int init(void)
75 {
76 lifeline = NULL;
77
78 if (ORTE_PROC_IS_DAEMON) {
79 ORTE_PROC_MY_PARENT->jobid = ORTE_PROC_MY_NAME->jobid;
80 /* if we are using static ports, set my lifeline to point at my parent */
81 if (orte_static_ports) {
82 /* we will have been given our parent's vpid by MCA param */
83 lifeline = ORTE_PROC_MY_PARENT;
84 } else {
85 /* set our lifeline to the HNP - we will abort if that connection is lost */
86 lifeline = ORTE_PROC_MY_HNP;
87 ORTE_PROC_MY_PARENT->vpid = 0;
88 }
89 } else if (ORTE_PROC_IS_APP) {
90 /* if we don't have a designated daemon, just
91 * disqualify ourselves */
92 if (NULL == orte_process_info.my_daemon_uri) {
93 return ORTE_ERR_TAKE_NEXT_OPTION;
94 }
95 /* set our lifeline to the local daemon - we will abort if this connection is lost */
96 lifeline = ORTE_PROC_MY_DAEMON;
97 orte_routing_is_enabled = true;
98 }
99
100 /* setup the list of children */
101 OBJ_CONSTRUCT(&my_children, opal_list_t);
102
103 return ORTE_SUCCESS;
104 }
105
finalize(void)106 static int finalize(void)
107 {
108 OPAL_LIST_DESTRUCT(&my_children);
109 return ORTE_SUCCESS;
110 }
111
/*
 * Remove the route to a process.
 *
 * This module keeps no per-process routing state, so the request is
 * only traced at verbosity level 1.
 *
 * proc - name of the process whose route is being deleted
 *
 * Always returns ORTE_SUCCESS.
 */
static int delete_route(orte_process_name_t *proc)
{
    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                         "%s routed_direct_delete_route for %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));

    /* no state to update */
    return ORTE_SUCCESS;
}
123
/*
 * Record that messages for 'target' should travel via 'route'.
 *
 * This module keeps no routing table, so the request is only traced
 * at verbosity level 1.
 *
 * target - name of the destination process
 * route  - name of the proposed next hop
 *
 * Always returns ORTE_SUCCESS.
 */
static int update_route(orte_process_name_t *target,
                        orte_process_name_t *route)
{
    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                         "%s routed_direct_update: %s --> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(target),
                         ORTE_NAME_PRINT(route)));

    /* no state to update */
    return ORTE_SUCCESS;
}
137
138
/*
 * Pick the next hop for 'target'.
 *
 * target  - name of the destination process
 * scratch - caller-owned storage used whenever the hop is a name that
 *           has to be built here rather than one that already exists
 *
 * Returns a pointer to the chosen name: ORTE_NAME_INVALID, 'target'
 * itself, one of the process-info names, or 'scratch'.
 */
static orte_process_name_t *direct_select_route(orte_process_name_t *target,
                                                orte_process_name_t *scratch)
{
    /* an invalid destination has no route */
    if (ORTE_JOBID_INVALID == target->jobid ||
        ORTE_VPID_INVALID == target->vpid) {
        return ORTE_NAME_INVALID;
    }

    /* initialize the scratch name with our daemon */
    scratch->jobid = ORTE_PROC_MY_DAEMON->jobid;
    scratch->vpid = ORTE_PROC_MY_DAEMON->vpid;

    if (ORTE_PROC_IS_APP) {
        /* an application that knows its daemon (i.e., a daemon
         * launched it) always goes through that daemon; a direct
         * launched application has no daemon and must go direct */
        return (NULL != orte_process_info.my_daemon_uri) ? ORTE_PROC_MY_DAEMON
                                                         : target;
    }

    if (ORTE_PROC_IS_TOOL) {
        /* a tool goes direct inside its own job family ... */
        if (ORTE_JOB_FAMILY(target->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
            return target;
        }
        /* ... and to the target's HNP otherwise */
        ORTE_HNP_NAME_FROM_JOB(scratch, target->jobid);
        return scratch;
    }

    /****** HNP AND DAEMONS ONLY ******/
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                             "%s routing direct to the HNP",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return ORTE_PROC_MY_HNP;
    }

    /* look up the daemon that hosts the target */
    scratch->jobid = ORTE_PROC_MY_NAME->jobid;
    scratch->vpid = orte_get_proc_daemon_vpid(target);
    if (ORTE_VPID_INVALID == scratch->vpid) {
        return ORTE_NAME_INVALID;
    }

    /* we host the target ourselves, so send direct to it */
    if (ORTE_PROC_MY_NAME->vpid == scratch->vpid) {
        return target;
    }

    /* otherwise go straight to the hosting daemon */
    return scratch;
}

/*
 * Return the name of the process a message for 'target' should be
 * sent to.
 *
 * The selection is made by direct_select_route(); the outcome is
 * traced at verbosity level 2 and returned by value, so the result
 * never refers to storage local to this function.
 *
 * target - name of the destination process
 */
static orte_process_name_t get_route(orte_process_name_t *target)
{
    orte_process_name_t hop_storage;
    orte_process_name_t *hop = direct_select_route(target, &hop_storage);

    OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                         "%s routed_direct_get(%s) --> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(target),
                         ORTE_NAME_PRINT(hop)));

    return *hop;
}
215
route_lost(const orte_process_name_t * route)216 static int route_lost(const orte_process_name_t *route)
217 {
218 opal_list_item_t *item;
219 orte_routed_tree_t *child;
220
221 OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
222 "%s route to %s lost",
223 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
224 ORTE_NAME_PRINT(route)));
225
226 /* if we lose the connection to the lifeline and we are NOT already,
227 * in finalize, tell the OOB to abort.
228 * NOTE: we cannot call abort from here as the OOB needs to first
229 * release a thread-lock - otherwise, we will hang!!
230 */
231 if (!orte_finalizing &&
232 NULL != lifeline &&
233 OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) {
234 OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
235 "%s routed:direct: Connection to lifeline %s lost",
236 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
237 ORTE_NAME_PRINT(lifeline)));
238 return ORTE_ERR_FATAL;
239 }
240
241 /* if we are the HNP, and the route is a daemon,
242 * see if it is one of our children - if so, remove it
243 */
244 if (ORTE_PROC_IS_HNP &&
245 route->jobid == ORTE_PROC_MY_NAME->jobid) {
246 for (item = opal_list_get_first(&my_children);
247 item != opal_list_get_end(&my_children);
248 item = opal_list_get_next(item)) {
249 child = (orte_routed_tree_t*)item;
250 if (child->vpid == route->vpid) {
251 opal_list_remove_item(&my_children, item);
252 OBJ_RELEASE(item);
253 return ORTE_SUCCESS;
254 }
255 }
256 }
257
258 /* we don't care about this one, so return success */
259 return ORTE_SUCCESS;
260 }
261
262
/*
 * Report whether a route to 'target' exists.
 *
 * The argument is not examined: this module answers true for every
 * target.
 */
static bool route_is_defined(const orte_process_name_t *target)
{
    /* every route is considered defined */
    return true;
}
268
/*
 * Make 'proc' the lifeline of this process.
 *
 * The name is copied into module-owned storage, so the caller's
 * object does not have to outlive this call.
 *
 * proc - name of the new lifeline
 *
 * Always returns ORTE_SUCCESS.
 */
static int set_lifeline(orte_process_name_t *proc)
{
    OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                         "%s routed:direct: set lifeline to %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));

    /* keep a private copy and point the lifeline at it */
    mylifeline = *proc;
    lifeline = &mylifeline;

    return ORTE_SUCCESS;
}
279
update_routing_plan(void)280 static void update_routing_plan(void)
281 {
282 orte_routed_tree_t *child;
283 int32_t i;
284 orte_job_t *jdata;
285 orte_proc_t *proc;
286
287 OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
288 "%s routed:direct: update routing plan",
289 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
290
291 if (!ORTE_PROC_IS_HNP) {
292 /* nothing to do */
293 return;
294 }
295
296 /* clear the current list */
297 OPAL_LIST_DESTRUCT(&my_children);
298 OBJ_CONSTRUCT(&my_children, opal_list_t);
299
300 /* HNP is directly connected to each daemon */
301 if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
302 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
303 return;
304 }
305 for (i=1; i < jdata->procs->size; i++) {
306 if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
307 continue;
308 }
309 child = OBJ_NEW(orte_routed_tree_t);
310 child->vpid = proc->name.vpid;
311 opal_list_append(&my_children, &child->super);
312 }
313
314 return;
315 }
316
/*
 * Fill 'coll' with the routing list used for xcast.
 *
 * Only daemons and the HNP are allowed to route; for any other kind
 * of process the request is traced and 'coll' is left untouched.
 *
 * coll - list handed to orte_routed_base_xcast_routing() together
 *        with our list of children
 */
static void get_routing_list(opal_list_t *coll)
{
    OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                         "%s routed:direct: get routing list",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
        orte_routed_base_xcast_routing(coll, &my_children);
    }
}
333
num_routes(void)334 static size_t num_routes(void)
335 {
336 if (!ORTE_PROC_IS_HNP) {
337 return 0;
338 }
339 return opal_list_get_size(&my_children);
340 }
341
#if OPAL_ENABLE_FT_CR == 1
/*
 * Fault-tolerance (checkpoint/restart) notification hook.
 *
 * This module has no work to do for any checkpoint/restart state;
 * the states are spelled out only to mark where handling would go.
 *
 * state - one of the OPAL_CRS_* state values
 *
 * Always returns ORTE_SUCCESS.
 */
static int direct_ft_event(int state)
{
    if (OPAL_CRS_CHECKPOINT == state) {
        /* Checkpoint Prep: nothing */
    } else if (OPAL_CRS_CONTINUE == state) {
        /* Continue Recovery: nothing */
    } else if (OPAL_CRS_TERM == state) {
        /* Nothing */
    } else {
        /* Error state = Nothing */
    }

    return ORTE_SUCCESS;
}
#endif
364