1 /*
2 * Copyright (c) 2009-2011 The Trustees of Indiana University.
3 * All rights reserved.
4 * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
5 * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
6 * Copyright (c) 2004-2011 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
10 * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
11 * All rights reserved.
12 * Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
13 * Copyright (c) 2017 IBM Corporation. All rights reserved.
14 * Copyright (c) 2018 Research Organization for Information Science
15 * and Technology (RIST). All rights reserved.
16 * $COPYRIGHT$
17 *
18 * Additional copyrights may follow
19 *
20 * $HEADER$
21 */
22
23 #include "orte_config.h"
24
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
33
34 #include "opal/util/output.h"
35 #include "opal/dss/dss.h"
36
37 #include "orte/mca/iof/base/base.h"
38 #include "orte/mca/rml/rml.h"
39 #include "orte/mca/odls/odls.h"
40 #include "orte/mca/odls/base/base.h"
41 #include "orte/mca/odls/base/odls_private.h"
42 #include "orte/mca/plm/base/plm_private.h"
43 #include "orte/mca/plm/plm.h"
44 #include "orte/mca/rmaps/rmaps_types.h"
45 #include "orte/mca/routed/routed.h"
46 #include "orte/mca/grpcomm/grpcomm.h"
47 #include "orte/mca/ess/ess.h"
48 #include "orte/mca/state/state.h"
49
50 #include "orte/util/error_strings.h"
51 #include "orte/util/name_fns.h"
52 #include "orte/util/proc_info.h"
53 #include "orte/util/show_help.h"
54 #include "orte/util/threads.h"
55
56 #include "orte/runtime/orte_globals.h"
57 #include "orte/runtime/orte_locks.h"
58 #include "orte/runtime/orte_quit.h"
59 #include "orte/runtime/data_type_support/orte_dt_support.h"
60
61 #include "orte/mca/errmgr/errmgr.h"
62 #include "orte/mca/errmgr/base/base.h"
63 #include "orte/mca/errmgr/base/errmgr_private.h"
64
65 #include "errmgr_default_hnp.h"
66
67 static int init(void);
68 static int finalize(void);
69 static void hnp_abort(int error_code, char *fmt, ...);
70
71 /******************
72 * default_hnp module
73 ******************/
74 orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
75 .init = init,
76 .finalize = finalize,
77 .logfn = orte_errmgr_base_log,
78 .abort = hnp_abort,
79 .abort_peers = orte_errmgr_base_abort_peers
80 };
81
82
83 /*
84 * Local functions
85 */
86 static void default_hnp_abort(orte_job_t *jdata);
87 static void job_errors(int fd, short args, void *cbdata);
88 static void proc_errors(int fd, short args, void *cbdata);
89
90 /**********************
91 * From DEFAULT_HNP
92 **********************/
init(void)93 static int init(void)
94 {
95 /* setup state machine to trap job errors */
96 orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI);
97
98 /* set the lost connection state to run at MSG priority so
99 * we can process any last messages from the proc
100 */
101 orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI);
102
103 /* setup state machine to trap proc errors */
104 orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
105
106 return ORTE_SUCCESS;
107 }
108
finalize(void)109 static int finalize(void)
110 {
111 return ORTE_SUCCESS;
112 }
113
wakeup(int sd,short args,void * cbdata)114 static void wakeup(int sd, short args, void *cbdata)
115 {
116 /* nothing more we can do */
117 ORTE_ACQUIRE_OBJECT(cbdata);
118 orte_quit(0, 0, NULL);
119 }
120
121 /* this function only gets called when FORCED_TERMINATE
122 * has been invoked, which means that there is some
123 * internal failure (e.g., to pack/unpack a correct value).
124 * We could just exit, but that doesn't result in any
125 * meaningful error message to the user. Likewise, just
126 * printing something to stdout/stderr won't necessarily
127 * get back to the user. Instead, we will send an error
128 * report to mpirun and give it a chance to order our
129 * termination. In order to ensure we _do_ terminate,
130 * we set a timer - if it fires before we receive the
131 * termination command, then we will exit on our own. This
132 * protects us in the case that the failure is in the
133 * messaging system itself */
hnp_abort(int error_code,char * fmt,...)134 static void hnp_abort(int error_code, char *fmt, ...)
135 {
136 va_list arglist;
137 char *outmsg = NULL;
138 orte_timer_t *timer;
139
140 /* only do this once */
141 if (orte_abnormal_term_ordered) {
142 return;
143 }
144
145 /* ensure we exit with non-zero status */
146 ORTE_UPDATE_EXIT_STATUS(error_code);
147
148 /* set the aborting flag */
149 orte_abnormal_term_ordered = true;
150
151 /* If there was a message, construct it */
152 va_start(arglist, fmt);
153 if (NULL != fmt) {
154 vasprintf(&outmsg, fmt, arglist);
155 }
156 va_end(arglist);
157
158 /* use the show-help system to get the message out */
159 orte_show_help("help-errmgr-base.txt", "simple-message", true, outmsg);
160
161 /* this could have happened very early, so see if it happened
162 * before we started anything - if so, we can just finalize */
163 if (orte_never_launched) {
164 orte_quit(0, 0, NULL);
165 return;
166 }
167
168 /* tell the daemons to terminate */
169 if (ORTE_SUCCESS != orte_plm.terminate_orteds()) {
170 orte_quit(0, 0, NULL);
171 return;
172 }
173
174 /* set a timer for exiting - this also gives the message a chance
175 * to get out! */
176 if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
177 ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
178 return;
179 }
180 timer->tv.tv_sec = 5;
181 timer->tv.tv_usec = 0;
182 opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
183 opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
184 ORTE_POST_OBJECT(timer);
185 opal_event_evtimer_add(timer->ev, &timer->tv);
186 }
187
188
job_errors(int fd,short args,void * cbdata)189 static void job_errors(int fd, short args, void *cbdata)
190 {
191 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
192 orte_job_t *jdata;
193 orte_job_state_t jobstate;
194 orte_exit_code_t sts;
195 orte_proc_t *aborted_proc;
196 opal_buffer_t *answer;
197 int32_t rc, ret;
198 int room, *rmptr;
199
200 ORTE_ACQUIRE_OBJECT(caddy);
201
202 /*
203 * if orte is trying to shutdown, just let it
204 */
205 if (orte_finalizing) {
206 return;
207 }
208
209 /* ensure we have an error exit status */
210 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
211
212 /* if the jdata is NULL, then we abort as this
213 * is reporting an unrecoverable error
214 */
215 if (NULL == caddy->jdata) {
216 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
217 OBJ_RELEASE(caddy);
218 return;
219 }
220
221 /* update the state */
222 jdata = caddy->jdata;
223 jobstate = caddy->job_state;
224 jdata->state = jobstate;
225
226 OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
227 "%s errmgr:default_hnp: job %s reported state %s",
228 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
229 ORTE_JOBID_PRINT(jdata->jobid),
230 orte_job_state_to_str(jobstate)));
231
232 if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
233 ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
234 ORTE_JOB_STATE_MAP_FAILED == jobstate ||
235 ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
236 if (1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
237 /* this is the primary job */
238 orte_never_launched = true;
239 }
240 /* disable routing as we may not have performed the daemon
241 * wireup - e.g., in a managed environment, all the daemons
242 * "phone home", but don't actually wireup into the routed
243 * network until they receive the launch message
244 */
245 orte_routing_is_enabled = false;
246 jdata->num_terminated = jdata->num_procs;
247 /* activate the terminated state so we can exit */
248 ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
249 /* if it was a dynamic spawn, then we better tell them this didn't work */
250 if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
251 rc = jobstate;
252 answer = OBJ_NEW(opal_buffer_t);
253 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
254 ORTE_ERROR_LOG(ret);
255 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
256 OBJ_RELEASE(caddy);
257 return;
258 }
259 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
260 ORTE_ERROR_LOG(ret);
261 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
262 OBJ_RELEASE(caddy);
263 return;
264 }
265 /* pack the room number */
266 rmptr = &room;
267 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
268 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
269 ORTE_ERROR_LOG(ret);
270 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
271 OBJ_RELEASE(caddy);
272 return;
273 }
274 }
275 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
276 "%s errmgr:hnp sending dyn error release of job %s to %s",
277 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
278 ORTE_JOBID_PRINT(jdata->jobid),
279 ORTE_NAME_PRINT(&jdata->originator)));
280 if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
281 &jdata->originator, answer,
282 ORTE_RML_TAG_LAUNCH_RESP,
283 orte_rml_send_callback, NULL))) {
284 ORTE_ERROR_LOG(ret);
285 OBJ_RELEASE(answer);
286 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
287 }
288 }
289 OBJ_RELEASE(caddy);
290 return;
291 }
292
293 if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
294 ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
295 /* the job object for this job will have been NULL'd
296 * in the array if the job was solely local. If it isn't
297 * NULL, then we need to tell everyone else to die
298 */
299 aborted_proc = NULL;
300 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
301 sts = aborted_proc->exit_code;
302 if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
303 if (WIFSIGNALED(sts)) { /* died on signal */
304 #ifdef WCOREDUMP
305 if (WCOREDUMP(sts)) {
306 orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
307 WTERMSIG(sts));
308 sts = WTERMSIG(sts);
309 } else {
310 orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
311 WTERMSIG(sts));
312 sts = WTERMSIG(sts);
313 }
314 #else
315 orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
316 WTERMSIG(sts));
317 sts = WTERMSIG(sts);
318 #endif /* WCOREDUMP */
319 } else {
320 orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
321 WEXITSTATUS(sts));
322 sts = WEXITSTATUS(sts);
323 }
324 }
325 }
326 /* if this is the daemon job, then we need to ensure we
327 * output an error message indicating we couldn't launch the
328 * daemons */
329 if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
330 orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
331 }
332 }
333
334 /* if the daemon job aborted and we haven't heard from everyone yet,
335 * then this could well have been caused by a daemon not finding
336 * a way back to us. In this case, output a message indicating a daemon
337 * died without reporting. Otherwise, say nothing as we
338 * likely already output an error message */
339 if (ORTE_JOB_STATE_ABORTED == jobstate &&
340 jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
341 jdata->num_procs != jdata->num_reported) {
342 orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
343 }
344
345 /* abort the job */
346 ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
347 /* set the global abnormal exit flag */
348 orte_abnormal_term_ordered = true;
349 OBJ_RELEASE(caddy);
350 }
351
proc_errors(int fd,short args,void * cbdata)352 static void proc_errors(int fd, short args, void *cbdata)
353 {
354 orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
355 orte_job_t *jdata;
356 orte_proc_t *pptr, *proct;
357 orte_process_name_t *proc = &caddy->name;
358 orte_proc_state_t state = caddy->proc_state;
359 int i;
360 int32_t i32, *i32ptr;
361 char *rtmod;
362
363 ORTE_ACQUIRE_OBJECT(caddy);
364
365 OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
366 "%s errmgr:default_hnp: for proc %s state %s",
367 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
368 ORTE_NAME_PRINT(proc),
369 orte_proc_state_to_str(state)));
370
371 /*
372 * if orte is trying to shutdown, just let it
373 */
374 if (orte_finalizing) {
375 goto cleanup;
376 }
377
378 /* get the job object */
379 if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
380 /* could be a race condition */
381 goto cleanup;
382 }
383 pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
384 rtmod = orte_rml.get_routed(orte_mgmt_conduit);
385
386 /* we MUST handle a communication failure before doing anything else
387 * as it requires some special care to avoid normal termination issues
388 * for local application procs
389 */
390 if (ORTE_PROC_STATE_COMM_FAILED == state) {
391 /* is this to a daemon? */
392 if (ORTE_PROC_MY_NAME->jobid != proc->jobid) {
393 /* nope - ignore it */
394 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
395 "%s Comm failure to non-daemon proc - ignoring it",
396 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
397 goto cleanup;
398 }
399 /* if this is my own connection, ignore it */
400 if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
401 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
402 "%s Comm failure on my own connection - ignoring it",
403 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
404 goto cleanup;
405 }
406 /* mark the daemon as gone */
407 ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
408 /* if we have ordered orteds to terminate or abort
409 * is in progress, record it */
410 if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
411 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
412 "%s Comm failure: daemons terminating - recording daemon %s as gone",
413 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
414 /* remove from dependent routes, if it is one */
415 orte_routed.route_lost(rtmod, proc);
416 /* if all my routes and local children are gone, then terminate ourselves */
417 if (0 == orte_routed.num_routes(rtmod)) {
418 for (i=0; i < orte_local_children->size; i++) {
419 if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
420 ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
421 /* at least one is still alive */
422 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
423 "%s Comm failure: at least one proc (%s) still alive",
424 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
425 ORTE_NAME_PRINT(&proct->name)));
426 goto cleanup;
427 }
428 }
429 /* call our appropriate exit procedure */
430 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
431 "%s errmgr_hnp: all routes and children gone - ordering exit",
432 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
433 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
434 } else {
435 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
436 "%s Comm failure: %d routes remain alive",
437 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
438 (int)orte_routed.num_routes(rtmod)));
439 }
440 goto cleanup;
441 }
442 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
443 "%s Comm failure: daemon %s - aborting",
444 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
445 /* record the first one to fail */
446 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
447 /* mark the daemon job as failed */
448 jdata->state = ORTE_JOB_STATE_COMM_FAILED;
449 /* point to the lowest rank to cause the problem */
450 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
451 /* retain the object so it doesn't get free'd */
452 OBJ_RETAIN(pptr);
453 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
454 if (!orte_enable_recovery) {
455 /* output an error message so the user knows what happened */
456 orte_show_help("help-errmgr-base.txt", "node-died", true,
457 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
458 orte_process_info.nodename,
459 ORTE_NAME_PRINT(proc),
460 pptr->node->name);
461 /* update our exit code */
462 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
463 /* just in case the exit code hadn't been set, do it here - this
464 * won't override any reported exit code */
465 ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
466 }
467 }
468 /* if recovery is enabled, then we are done - otherwise,
469 * abort the system */
470 if (!orte_enable_recovery) {
471 default_hnp_abort(jdata);
472 }
473 goto cleanup;
474 }
475
476 /* update the proc state - can get multiple reports on a proc
477 * depending on circumstances, so ensure we only do this once
478 */
479 if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
480 pptr->state = state;
481 }
482
483 /* if we were ordered to terminate, mark this proc as dead and see if
484 * any of our routes or local children remain alive - if not, then
485 * terminate ourselves. */
486 if (orte_orteds_term_ordered) {
487 for (i=0; i < orte_local_children->size; i++) {
488 if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
489 if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
490 goto keep_going;
491 }
492 }
493 }
494 /* if all my routes and children are gone, then terminate
495 ourselves nicely (i.e., this is a normal termination) */
496 if (0 == orte_routed.num_routes(rtmod)) {
497 OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
498 "%s errmgr:default:hnp all routes gone - exiting",
499 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
500 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
501 }
502 }
503
504 keep_going:
505 /* if this is a continuously operating job, then there is nothing more
506 * to do - we let the job continue to run */
507 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
508 ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE)) {
509 /* always mark the waitpid as having fired */
510 ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
511 /* if this is a remote proc, we won't hear anything more about it
512 * as the default behavior would be to terminate the job. So be sure to
513 * mark the IOF as having completed too so we correctly mark this proc
514 * as dead and notify everyone as required */
515 if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
516 ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_IOF_COMPLETE);
517 }
518 goto cleanup;
519 }
520
521 /* ensure we record the failed proc properly so we can report
522 * the error once we terminate
523 */
524 switch (state) {
525 case ORTE_PROC_STATE_KILLED_BY_CMD:
526 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
527 "%s errmgr:hnp: proc %s killed by cmd",
528 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
529 ORTE_NAME_PRINT(proc)));
530 /* we ordered this proc to die, so it isn't an abnormal termination
531 * and we don't flag it as such
532 */
533 if (jdata->num_terminated >= jdata->num_procs) {
534 /* this job has terminated */
535 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
536 }
537 /* don't abort the job as this isn't an abnormal termination */
538 break;
539
540 case ORTE_PROC_STATE_ABORTED:
541 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
542 "%s errmgr:hnp: proc %s aborted",
543 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
544 ORTE_NAME_PRINT(proc)));
545 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
546 jdata->state = ORTE_JOB_STATE_ABORTED;
547 /* point to the first rank to cause the problem */
548 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
549 /* retain the object so it doesn't get free'd */
550 OBJ_RETAIN(pptr);
551 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
552 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
553 /* abnormal termination - abort, but only do it once
554 * to avoid creating a lot of confusion */
555 default_hnp_abort(jdata);
556 }
557 break;
558
559 case ORTE_PROC_STATE_ABORTED_BY_SIG:
560 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
561 "%s errmgr:hnp: proc %s aborted by signal",
562 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
563 ORTE_NAME_PRINT(proc)));
564
565 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
566 /* track the number of non-zero exits */
567 i32 = 0;
568 i32ptr = &i32;
569 orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
570 ++i32;
571 orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
572 if (orte_abort_non_zero_exit) {
573
574 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
575 jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
576 /* point to the first rank to cause the problem */
577 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
578 /* retain the object so it doesn't get free'd */
579 OBJ_RETAIN(pptr);
580 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
581 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
582 /* abnormal termination - abort, but only do it once
583 * to avoid creating a lot of confusion */
584 default_hnp_abort(jdata);
585 }
586 } else {
587 /* user requested we consider this normal termination */
588 if (jdata->num_terminated >= jdata->num_procs) {
589 /* this job has terminated */
590 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
591 }
592 }
593 break;
594
595 case ORTE_PROC_STATE_TERM_WO_SYNC:
596 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
597 "%s errmgr:hnp: proc %s terminated without sync",
598 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
599 ORTE_NAME_PRINT(proc)));
600 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
601 jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
602 /* point to the first rank to cause the problem */
603 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
604 /* retain the object so it doesn't get free'd */
605 OBJ_RETAIN(pptr);
606 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
607 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
608 /* now treat a special case - if the proc exit'd without a required
609 * sync, it may have done so with a zero exit code. We want to ensure
610 * that the user realizes there was an error, so in this -one- case,
611 * we overwrite the process' exit code with the default error code
612 */
613 ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
614 /* abnormal termination - abort, but only do it once
615 * to avoid creating a lot of confusion */
616 default_hnp_abort(jdata);
617 }
618 break;
619
620 case ORTE_PROC_STATE_FAILED_TO_START:
621 case ORTE_PROC_STATE_FAILED_TO_LAUNCH:
622 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
623 "%s errmgr:hnp: proc %s %s",
624 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
625 ORTE_NAME_PRINT(proc),
626 orte_proc_state_to_str(state)));
627 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
628 if (ORTE_PROC_STATE_FAILED_TO_START) {
629 jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
630 } else {
631 jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH;
632 }
633 /* point to the first rank to cause the problem */
634 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
635 /* retain the object so it doesn't get free'd */
636 OBJ_RETAIN(pptr);
637 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
638 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
639 /* abnormal termination - abort, but only do it once
640 * to avoid creating a lot of confusion */
641 default_hnp_abort(jdata);
642 }
643 /* if this was a daemon, report it */
644 if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
645 /* output a message indicating we failed to launch a daemon */
646 orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
647 }
648 break;
649
650 case ORTE_PROC_STATE_CALLED_ABORT:
651 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
652 "%s errmgr:hnp: proc %s called abort with exit code %d",
653 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
654 ORTE_NAME_PRINT(proc), pptr->exit_code));
655 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
656 jdata->state = ORTE_JOB_STATE_CALLED_ABORT;
657 /* point to the first proc to cause the problem */
658 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
659 /* retain the object so it doesn't get free'd */
660 OBJ_RETAIN(pptr);
661 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
662 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
663 /* abnormal termination - abort, but only do it once
664 * to avoid creating a lot of confusion */
665 default_hnp_abort(jdata);
666 }
667 break;
668
669 case ORTE_PROC_STATE_TERM_NON_ZERO:
670 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
671 "%s errmgr:hnp: proc %s exited with non-zero status %d",
672 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
673 ORTE_NAME_PRINT(proc),
674 pptr->exit_code));
675 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
676 /* track the number of non-zero exits */
677 i32 = 0;
678 i32ptr = &i32;
679 orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
680 ++i32;
681 orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
682 if (orte_abort_non_zero_exit) {
683 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
684 jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
685 /* point to the first rank to cause the problem */
686 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
687 /* retain the object so it doesn't get free'd */
688 OBJ_RETAIN(pptr);
689 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
690 /* abnormal termination - abort, but only do it once
691 * to avoid creating a lot of confusion */
692 default_hnp_abort(jdata);
693 }
694 } else {
695 /* user requested we consider this normal termination */
696 if (jdata->num_terminated >= jdata->num_procs) {
697 /* this job has terminated */
698 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
699 }
700 }
701 break;
702
703 case ORTE_PROC_STATE_HEARTBEAT_FAILED:
704 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
705 "%s errmgr:hnp: proc %s heartbeat failed",
706 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
707 ORTE_NAME_PRINT(proc)));
708 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
709 jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED;
710 /* point to the first rank to cause the problem */
711 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
712 /* retain the object so it doesn't get free'd */
713 OBJ_RETAIN(pptr);
714 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
715 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
716 /* abnormal termination - abort, but only do it once
717 * to avoid creating a lot of confusion */
718 default_hnp_abort(jdata);
719 }
720 /* remove from dependent routes, if it is one */
721 orte_routed.route_lost(rtmod, proc);
722 break;
723
724 case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
725 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
726 "%s errmgr:hnp: unable to send message to proc %s",
727 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
728 ORTE_NAME_PRINT(proc)));
729 /* if this proc is one of my daemons, then we are truly
730 * hosed - so just exit out
731 */
732 if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
733 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
734 break;
735 }
736 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
737 /* abnormal termination - abort, but only do it once
738 * to avoid creating a lot of confusion */
739 default_hnp_abort(jdata);
740 }
741 break;
742
743 case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
744 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
745 "%s errmgr:hnp: no message path to proc %s",
746 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
747 ORTE_NAME_PRINT(proc)));
748 orte_show_help("help-errmgr-base.txt", "no-path", true,
749 orte_process_info.nodename, pptr->node->name);
750 /* if this proc is one of my daemons, then we are truly
751 * hosed - so just exit out
752 */
753 if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
754 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
755 break;
756 }
757 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
758 /* abnormal termination - abort, but only do it once
759 * to avoid creating a lot of confusion */
760 default_hnp_abort(jdata);
761 }
762 break;
763
764 case ORTE_PROC_STATE_FAILED_TO_CONNECT:
765 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
766 "%s errmgr:hnp: cannot connect to proc %s",
767 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
768 ORTE_NAME_PRINT(proc)));
769 orte_show_help("help-errmgr-base.txt", "no-connect", true,
770 orte_process_info.nodename, pptr->node->name);
771 /* if this proc is one of my daemons, then we are truly
772 * hosed - so just exit out
773 */
774 if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
775 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
776 break;
777 }
778 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
779 /* abnormal termination - abort, but only do it once
780 * to avoid creating a lot of confusion */
781 default_hnp_abort(jdata);
782 }
783 break;
784
785 default:
786 /* shouldn't get this, but terminate job if required */
787 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
788 "%s errmgr:hnp: proc %s default error %s",
789 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
790 ORTE_NAME_PRINT(proc),
791 orte_proc_state_to_str(state)));
792 if (jdata->num_terminated == jdata->num_procs) {
793 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
794 }
795 break;
796 }
797 /* if the waitpid fired, be sure to let the state machine know */
798 if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
799 ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
800 }
801
802 cleanup:
803 OBJ_RELEASE(caddy);
804 }
805
806 /*****************
807 * Local Functions
808 *****************/
/* Start an orderly abort on behalf of jdata.
 *
 * Only the caller that wins the try-lock on orte_abort_inprogress_lock
 * proceeds; every other call just logs and returns.  The winner flags
 * job termination, switches recovery off, warns the user if the job
 * recorded non-zero exits, and asks the PLM to terminate the orteds.
 */
static void default_hnp_abort(orte_job_t *jdata)
{
    int32_t nonzero_exits = 0;
    int32_t *nonzero_ptr = &nonzero_exits;
    int status;

    /* the trylock returns 1 when the lock is already held, which
     * means an abort is already in progress - ignore this call */
    if (opal_atomic_trylock(&orte_abort_inprogress_lock)) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_hnp: abort in progress, ignoring abort on job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_hnp: abort called on job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* from here on we are terminating, and recovery is off the table */
    orte_job_term_ordered = true;
    orte_enable_recovery = false;

    /* an abort of the daemon job itself counts as an abnormal
     * termination; any other job is simply aborted cleanly */
    if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
        orte_abnormal_term_ordered = true;
    }

    /* let the user know when procs of this job returned non-zero */
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT,
                           (void**)&nonzero_ptr, OPAL_INT32)) {
        orte_show_help("help-errmgr-base.txt", "normal-termination-but", true,
                       (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "Primary" : "Child",
                       (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
                       nonzero_exits,
                       (1 == nonzero_exits) ? "process returned\na non-zero exit code" :
                                              "processes returned\nnon-zero exit codes");
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_hnp: ordering orted termination",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* have the plm terminate the orteds - they will automatically
     * kill their local procs */
    status = orte_plm.terminate_orteds();
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
    }
}
862