1 /*
2  * Copyright (c) 2009-2011 The Trustees of Indiana University.
3  *                         All rights reserved.
4  * Copyright (c) 2010      Cisco Systems, Inc.  All rights reserved.
5  * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
6  * Copyright (c) 2004-2011 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2011      Oracle and/or all its affiliates.  All rights reserved.
10  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
11  *                         All rights reserved.
12  * Copyright (c) 2014-2018 Intel, Inc.  All rights reserved.
13  * Copyright (c) 2017      IBM Corporation.  All rights reserved.
14  * Copyright (c) 2018      Research Organization for Information Science
15  *                         and Technology (RIST).  All rights reserved.
16  * $COPYRIGHT$
17  *
18  * Additional copyrights may follow
19  *
20  * $HEADER$
21  */
22 
23 #include "orte_config.h"
24 
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif  /* HAVE_UNISTD_H */
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif
33 
34 #include "opal/util/output.h"
35 #include "opal/dss/dss.h"
36 
37 #include "orte/mca/iof/base/base.h"
38 #include "orte/mca/rml/rml.h"
39 #include "orte/mca/odls/odls.h"
40 #include "orte/mca/odls/base/base.h"
41 #include "orte/mca/odls/base/odls_private.h"
42 #include "orte/mca/plm/base/plm_private.h"
43 #include "orte/mca/plm/plm.h"
44 #include "orte/mca/rmaps/rmaps_types.h"
45 #include "orte/mca/routed/routed.h"
46 #include "orte/mca/grpcomm/grpcomm.h"
47 #include "orte/mca/ess/ess.h"
48 #include "orte/mca/state/state.h"
49 
50 #include "orte/util/error_strings.h"
51 #include "orte/util/name_fns.h"
52 #include "orte/util/proc_info.h"
53 #include "orte/util/show_help.h"
54 #include "orte/util/threads.h"
55 
56 #include "orte/runtime/orte_globals.h"
57 #include "orte/runtime/orte_locks.h"
58 #include "orte/runtime/orte_quit.h"
59 #include "orte/runtime/data_type_support/orte_dt_support.h"
60 
61 #include "orte/mca/errmgr/errmgr.h"
62 #include "orte/mca/errmgr/base/base.h"
63 #include "orte/mca/errmgr/base/errmgr_private.h"
64 
65 #include "errmgr_default_hnp.h"
66 
/* Entry points referenced by the module function table that follows. */
static void hnp_abort(int error_code, char *fmt, ...);
static int  finalize(void);
static int  init(void);
70 
71 /******************
72  * default_hnp module
73  ******************/
74 orte_errmgr_base_module_t orte_errmgr_default_hnp_module = {
75     .init = init,
76     .finalize = finalize,
77     .logfn = orte_errmgr_base_log,
78     .abort = hnp_abort,
79     .abort_peers = orte_errmgr_base_abort_peers
80 };
81 
82 
83 /*
84  * Local functions
85  */
86 static void default_hnp_abort(orte_job_t *jdata);
87 static void job_errors(int fd, short args, void *cbdata);
88 static void proc_errors(int fd, short args, void *cbdata);
89 
90 /**********************
91  * From DEFAULT_HNP
92  **********************/
init(void)93 static int init(void)
94 {
95     /* setup state machine to trap job errors */
96     orte_state.add_job_state(ORTE_JOB_STATE_ERROR, job_errors, ORTE_ERROR_PRI);
97 
98     /* set the lost connection state to run at MSG priority so
99      * we can process any last messages from the proc
100      */
101     orte_state.add_proc_state(ORTE_PROC_STATE_COMM_FAILED, proc_errors, ORTE_MSG_PRI);
102 
103     /* setup state machine to trap proc errors */
104     orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
105 
106     return ORTE_SUCCESS;
107 }
108 
finalize(void)109 static int finalize(void)
110 {
111     return ORTE_SUCCESS;
112 }
113 
wakeup(int sd,short args,void * cbdata)114 static void wakeup(int sd, short args, void *cbdata)
115 {
116     /* nothing more we can do */
117     ORTE_ACQUIRE_OBJECT(cbdata);
118     orte_quit(0, 0, NULL);
119 }
120 
121 /* this function only gets called when FORCED_TERMINATE
122  * has been invoked, which means that there is some
123  * internal failure (e.g., to pack/unpack a correct value).
124  * We could just exit, but that doesn't result in any
125  * meaningful error message to the user. Likewise, just
126  * printing something to stdout/stderr won't necessarily
127  * get back to the user. Instead, we will send an error
128  * report to mpirun and give it a chance to order our
129  * termination. In order to ensure we _do_ terminate,
130  * we set a timer - if it fires before we receive the
131  * termination command, then we will exit on our own. This
132  * protects us in the case that the failure is in the
133  * messaging system itself */
hnp_abort(int error_code,char * fmt,...)134 static void hnp_abort(int error_code, char *fmt, ...)
135 {
136     va_list arglist;
137     char *outmsg = NULL;
138     orte_timer_t *timer;
139 
140     /* only do this once */
141     if (orte_abnormal_term_ordered) {
142         return;
143     }
144 
145     /* ensure we exit with non-zero status */
146     ORTE_UPDATE_EXIT_STATUS(error_code);
147 
148     /* set the aborting flag */
149     orte_abnormal_term_ordered = true;
150 
151     /* If there was a message, construct it */
152     va_start(arglist, fmt);
153     if (NULL != fmt) {
154         vasprintf(&outmsg, fmt, arglist);
155     }
156     va_end(arglist);
157 
158     /* use the show-help system to get the message out */
159     orte_show_help("help-errmgr-base.txt", "simple-message", true, outmsg);
160 
161     /* this could have happened very early, so see if it happened
162      * before we started anything - if so, we can just finalize */
163     if (orte_never_launched) {
164         orte_quit(0, 0, NULL);
165         return;
166     }
167 
168     /* tell the daemons to terminate */
169     if (ORTE_SUCCESS != orte_plm.terminate_orteds()) {
170         orte_quit(0, 0, NULL);
171         return;
172     }
173 
174     /* set a timer for exiting - this also gives the message a chance
175      * to get out! */
176     if (NULL == (timer = OBJ_NEW(orte_timer_t))) {
177         ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
178         return;
179     }
180     timer->tv.tv_sec = 5;
181     timer->tv.tv_usec = 0;
182     opal_event_evtimer_set(orte_event_base, timer->ev, wakeup, NULL);
183     opal_event_set_priority(timer->ev, ORTE_ERROR_PRI);
184     ORTE_POST_OBJECT(timer);
185     opal_event_evtimer_add(timer->ev, &timer->tv);
186 }
187 
188 
job_errors(int fd,short args,void * cbdata)189 static void job_errors(int fd, short args, void *cbdata)
190 {
191     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
192     orte_job_t *jdata;
193     orte_job_state_t jobstate;
194     orte_exit_code_t sts;
195     orte_proc_t *aborted_proc;
196     opal_buffer_t *answer;
197     int32_t rc, ret;
198     int room, *rmptr;
199 
200     ORTE_ACQUIRE_OBJECT(caddy);
201 
202     /*
203      * if orte is trying to shutdown, just let it
204      */
205     if (orte_finalizing) {
206         return;
207     }
208 
209     /* ensure we have an error exit status */
210     ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
211 
212     /* if the jdata is NULL, then we abort as this
213      * is reporting an unrecoverable error
214      */
215     if (NULL == caddy->jdata) {
216         ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
217         OBJ_RELEASE(caddy);
218         return;
219     }
220 
221     /* update the state */
222     jdata = caddy->jdata;
223     jobstate = caddy->job_state;
224     jdata->state = jobstate;
225 
226     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
227                          "%s errmgr:default_hnp: job %s reported state %s",
228                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
229                          ORTE_JOBID_PRINT(jdata->jobid),
230                          orte_job_state_to_str(jobstate)));
231 
232     if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
233         ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
234         ORTE_JOB_STATE_MAP_FAILED == jobstate ||
235         ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
236         if (1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
237             /* this is the primary job */
238             orte_never_launched = true;
239         }
240         /* disable routing as we may not have performed the daemon
241          * wireup - e.g., in a managed environment, all the daemons
242          * "phone home", but don't actually wireup into the routed
243          * network until they receive the launch message
244          */
245         orte_routing_is_enabled = false;
246         jdata->num_terminated = jdata->num_procs;
247         /* activate the terminated state so we can exit */
248         ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
249         /* if it was a dynamic spawn, then we better tell them this didn't work */
250         if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
251             rc = jobstate;
252             answer = OBJ_NEW(opal_buffer_t);
253             if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
254                 ORTE_ERROR_LOG(ret);
255                 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
256                 OBJ_RELEASE(caddy);
257                 return;
258             }
259             if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
260                 ORTE_ERROR_LOG(ret);
261                 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
262                 OBJ_RELEASE(caddy);
263                 return;
264             }
265             /* pack the room number */
266             rmptr = &room;
267             if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
268                 if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
269                     ORTE_ERROR_LOG(ret);
270                     ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
271                     OBJ_RELEASE(caddy);
272                     return;
273                 }
274             }
275             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
276                                  "%s errmgr:hnp sending dyn error release of job %s to %s",
277                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
278                                  ORTE_JOBID_PRINT(jdata->jobid),
279                                  ORTE_NAME_PRINT(&jdata->originator)));
280             if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
281                                                    &jdata->originator, answer,
282                                                    ORTE_RML_TAG_LAUNCH_RESP,
283                                                    orte_rml_send_callback, NULL))) {
284                 ORTE_ERROR_LOG(ret);
285                 OBJ_RELEASE(answer);
286                 ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
287             }
288         }
289         OBJ_RELEASE(caddy);
290         return;
291     }
292 
293     if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
294         ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
295         /* the job object for this job will have been NULL'd
296          * in the array if the job was solely local. If it isn't
297          * NULL, then we need to tell everyone else to die
298          */
299         aborted_proc = NULL;
300         if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
301             sts = aborted_proc->exit_code;
302             if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
303                 if (WIFSIGNALED(sts)) { /* died on signal */
304 #ifdef WCOREDUMP
305                     if (WCOREDUMP(sts)) {
306                         orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
307                                        WTERMSIG(sts));
308                         sts = WTERMSIG(sts);
309                     } else {
310                         orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
311                                        WTERMSIG(sts));
312                         sts = WTERMSIG(sts);
313                     }
314 #else
315                     orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
316                                    WTERMSIG(sts));
317                     sts = WTERMSIG(sts);
318 #endif /* WCOREDUMP */
319                 } else {
320                     orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
321                                    WEXITSTATUS(sts));
322                     sts = WEXITSTATUS(sts);
323                 }
324             }
325         }
326         /* if this is the daemon job, then we need to ensure we
327          * output an error message indicating we couldn't launch the
328          * daemons */
329         if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
330             orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
331         }
332     }
333 
334     /* if the daemon job aborted and we haven't heard from everyone yet,
335      * then this could well have been caused by a daemon not finding
336      * a way back to us. In this case, output a message indicating a daemon
337      * died without reporting. Otherwise, say nothing as we
338      * likely already output an error message */
339     if (ORTE_JOB_STATE_ABORTED == jobstate &&
340         jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
341         jdata->num_procs != jdata->num_reported) {
342         orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
343     }
344 
345     /* abort the job */
346     ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_FORCED_EXIT);
347     /* set the global abnormal exit flag  */
348     orte_abnormal_term_ordered = true;
349     OBJ_RELEASE(caddy);
350 }
351 
proc_errors(int fd,short args,void * cbdata)352 static void proc_errors(int fd, short args, void *cbdata)
353 {
354     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
355     orte_job_t *jdata;
356     orte_proc_t *pptr, *proct;
357     orte_process_name_t *proc = &caddy->name;
358     orte_proc_state_t state = caddy->proc_state;
359     int i;
360     int32_t i32, *i32ptr;
361     char *rtmod;
362 
363     ORTE_ACQUIRE_OBJECT(caddy);
364 
365     OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
366                          "%s errmgr:default_hnp: for proc %s state %s",
367                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
368                          ORTE_NAME_PRINT(proc),
369                          orte_proc_state_to_str(state)));
370 
371     /*
372      * if orte is trying to shutdown, just let it
373      */
374     if (orte_finalizing) {
375         goto cleanup;
376     }
377 
378     /* get the job object */
379     if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
380         /* could be a race condition */
381         goto cleanup;
382     }
383     pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
384     rtmod = orte_rml.get_routed(orte_mgmt_conduit);
385 
386     /* we MUST handle a communication failure before doing anything else
387      * as it requires some special care to avoid normal termination issues
388      * for local application procs
389      */
390     if (ORTE_PROC_STATE_COMM_FAILED == state) {
391         /* is this to a daemon? */
392         if (ORTE_PROC_MY_NAME->jobid != proc->jobid) {
393             /* nope - ignore it */
394             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
395                                  "%s Comm failure to non-daemon proc - ignoring it",
396                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
397             goto cleanup;
398         }
399         /* if this is my own connection, ignore it */
400         if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
401             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
402                                  "%s Comm failure on my own connection - ignoring it",
403                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
404             goto cleanup;
405         }
406         /* mark the daemon as gone */
407         ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
408         /* if we have ordered orteds to terminate or abort
409          * is in progress, record it */
410         if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
411             OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
412                                  "%s Comm failure: daemons terminating - recording daemon %s as gone",
413                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
414             /* remove from dependent routes, if it is one */
415             orte_routed.route_lost(rtmod, proc);
416             /* if all my routes and local children are gone, then terminate ourselves */
417             if (0 == orte_routed.num_routes(rtmod)) {
418                 for (i=0; i < orte_local_children->size; i++) {
419                     if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
420                         ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
421                         /* at least one is still alive */
422                         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
423                                              "%s Comm failure: at least one proc (%s) still alive",
424                                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
425                                              ORTE_NAME_PRINT(&proct->name)));
426                         goto cleanup;
427                     }
428                 }
429                 /* call our appropriate exit procedure */
430                 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
431                                      "%s errmgr_hnp: all routes and children gone - ordering exit",
432                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
433                 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
434             } else {
435                 OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
436                                      "%s Comm failure: %d routes remain alive",
437                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
438                                      (int)orte_routed.num_routes(rtmod)));
439             }
440             goto cleanup;
441         }
442         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
443                              "%s Comm failure: daemon %s - aborting",
444                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
445         /* record the first one to fail */
446         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
447             /* mark the daemon job as failed */
448             jdata->state = ORTE_JOB_STATE_COMM_FAILED;
449             /* point to the lowest rank to cause the problem */
450             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
451             /* retain the object so it doesn't get free'd */
452             OBJ_RETAIN(pptr);
453             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
454             if (!orte_enable_recovery) {
455                 /* output an error message so the user knows what happened */
456                 orte_show_help("help-errmgr-base.txt", "node-died", true,
457                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
458                                orte_process_info.nodename,
459                                ORTE_NAME_PRINT(proc),
460                                pptr->node->name);
461                 /* update our exit code */
462                 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
463                 /* just in case the exit code hadn't been set, do it here - this
464                  * won't override any reported exit code */
465                 ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
466             }
467         }
468         /* if recovery is enabled, then we are done - otherwise,
469          * abort the system */
470         if (!orte_enable_recovery) {
471             default_hnp_abort(jdata);
472         }
473         goto cleanup;
474     }
475 
476     /* update the proc state - can get multiple reports on a proc
477      * depending on circumstances, so ensure we only do this once
478      */
479     if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
480         pptr->state = state;
481     }
482 
483     /* if we were ordered to terminate, mark this proc as dead and see if
484      * any of our routes or local children remain alive - if not, then
485      * terminate ourselves. */
486     if (orte_orteds_term_ordered) {
487         for (i=0; i < orte_local_children->size; i++) {
488             if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
489                 if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
490                     goto keep_going;
491                 }
492             }
493         }
494         /* if all my routes and children are gone, then terminate
495            ourselves nicely (i.e., this is a normal termination) */
496         if (0 == orte_routed.num_routes(rtmod)) {
497             OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
498                                  "%s errmgr:default:hnp all routes gone - exiting",
499                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
500             ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
501         }
502     }
503 
504   keep_going:
505     /* if this is a continuously operating job, then there is nothing more
506      * to do - we let the job continue to run */
507     if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
508         ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE)) {
509         /* always mark the waitpid as having fired */
510         ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
511         /* if this is a remote proc, we won't hear anything more about it
512          * as the default behavior would be to terminate the job. So be sure to
513          * mark the IOF as having completed too so we correctly mark this proc
514          * as dead and notify everyone as required */
515         if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
516             ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_IOF_COMPLETE);
517         }
518         goto cleanup;
519     }
520 
521     /* ensure we record the failed proc properly so we can report
522      * the error once we terminate
523      */
524     switch (state) {
525     case ORTE_PROC_STATE_KILLED_BY_CMD:
526         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
527                              "%s errmgr:hnp: proc %s killed by cmd",
528                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
529                              ORTE_NAME_PRINT(proc)));
530         /* we ordered this proc to die, so it isn't an abnormal termination
531          * and we don't flag it as such
532          */
533         if (jdata->num_terminated >= jdata->num_procs) {
534             /* this job has terminated */
535             ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
536         }
537         /* don't abort the job as this isn't an abnormal termination */
538         break;
539 
540     case ORTE_PROC_STATE_ABORTED:
541         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
542                              "%s errmgr:hnp: proc %s aborted",
543                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
544                              ORTE_NAME_PRINT(proc)));
545         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
546             jdata->state = ORTE_JOB_STATE_ABORTED;
547             /* point to the first rank to cause the problem */
548             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
549             /* retain the object so it doesn't get free'd */
550             OBJ_RETAIN(pptr);
551             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
552             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
553             /* abnormal termination - abort, but only do it once
554              * to avoid creating a lot of confusion */
555             default_hnp_abort(jdata);
556         }
557         break;
558 
559     case ORTE_PROC_STATE_ABORTED_BY_SIG:
560         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
561                              "%s errmgr:hnp: proc %s aborted by signal",
562                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
563                              ORTE_NAME_PRINT(proc)));
564 
565         ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
566         /* track the number of non-zero exits */
567         i32 = 0;
568         i32ptr = &i32;
569         orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
570         ++i32;
571         orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
572         if (orte_abort_non_zero_exit) {
573 
574             if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
575                 jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
576                 /* point to the first rank to cause the problem */
577                 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
578                 /* retain the object so it doesn't get free'd */
579                 OBJ_RETAIN(pptr);
580                 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
581                 ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
582                 /* abnormal termination - abort, but only do it once
583                  * to avoid creating a lot of confusion */
584                 default_hnp_abort(jdata);
585             }
586         } else {
587             /* user requested we consider this normal termination */
588             if (jdata->num_terminated >= jdata->num_procs) {
589                 /* this job has terminated */
590                 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
591             }
592         }
593         break;
594 
595     case ORTE_PROC_STATE_TERM_WO_SYNC:
596         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
597                              "%s errmgr:hnp: proc %s terminated without sync",
598                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
599                              ORTE_NAME_PRINT(proc)));
600         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
601             jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
602             /* point to the first rank to cause the problem */
603             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
604             /* retain the object so it doesn't get free'd */
605             OBJ_RETAIN(pptr);
606             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
607             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
608             /* now treat a special case - if the proc exit'd without a required
609              * sync, it may have done so with a zero exit code. We want to ensure
610              * that the user realizes there was an error, so in this -one- case,
611              * we overwrite the process' exit code with the default error code
612              */
613             ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
614             /* abnormal termination - abort, but only do it once
615              * to avoid creating a lot of confusion */
616             default_hnp_abort(jdata);
617         }
618         break;
619 
620     case ORTE_PROC_STATE_FAILED_TO_START:
621     case ORTE_PROC_STATE_FAILED_TO_LAUNCH:
622         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
623                              "%s errmgr:hnp: proc %s %s",
624                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
625                              ORTE_NAME_PRINT(proc),
626                              orte_proc_state_to_str(state)));
627         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
628             if (ORTE_PROC_STATE_FAILED_TO_START) {
629                 jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
630             } else {
631                 jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH;
632             }
633             /* point to the first rank to cause the problem */
634             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
635             /* retain the object so it doesn't get free'd */
636             OBJ_RETAIN(pptr);
637             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
638             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
639             /* abnormal termination - abort, but only do it once
640              * to avoid creating a lot of confusion */
641             default_hnp_abort(jdata);
642         }
643         /* if this was a daemon, report it */
644         if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
645             /* output a message indicating we failed to launch a daemon */
646             orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
647         }
648         break;
649 
650     case ORTE_PROC_STATE_CALLED_ABORT:
651         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
652                              "%s errmgr:hnp: proc %s called abort with exit code %d",
653                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
654                              ORTE_NAME_PRINT(proc), pptr->exit_code));
655         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
656             jdata->state = ORTE_JOB_STATE_CALLED_ABORT;
657             /* point to the first proc to cause the problem */
658             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
659             /* retain the object so it doesn't get free'd */
660             OBJ_RETAIN(pptr);
661             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
662             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
663             /* abnormal termination - abort, but only do it once
664              * to avoid creating a lot of confusion */
665             default_hnp_abort(jdata);
666         }
667         break;
668 
669     case ORTE_PROC_STATE_TERM_NON_ZERO:
670         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
671                              "%s errmgr:hnp: proc %s exited with non-zero status %d",
672                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
673                              ORTE_NAME_PRINT(proc),
674                              pptr->exit_code));
675         ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
676         /* track the number of non-zero exits */
677         i32 = 0;
678         i32ptr = &i32;
679         orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
680         ++i32;
681         orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
682         if (orte_abort_non_zero_exit) {
683             if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
684                 jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
685                 /* point to the first rank to cause the problem */
686                 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
687                 /* retain the object so it doesn't get free'd */
688                 OBJ_RETAIN(pptr);
689                 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
690                 /* abnormal termination - abort, but only do it once
691                  * to avoid creating a lot of confusion */
692                 default_hnp_abort(jdata);
693             }
694         } else {
695             /* user requested we consider this normal termination */
696             if (jdata->num_terminated >= jdata->num_procs) {
697                 /* this job has terminated */
698                 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
699             }
700         }
701         break;
702 
703     case ORTE_PROC_STATE_HEARTBEAT_FAILED:
704         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
705                              "%s errmgr:hnp: proc %s heartbeat failed",
706                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
707                              ORTE_NAME_PRINT(proc)));
708         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
709             jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED;
710             /* point to the first rank to cause the problem */
711             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
712             /* retain the object so it doesn't get free'd */
713             OBJ_RETAIN(pptr);
714             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
715             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
716             /* abnormal termination - abort, but only do it once
717              * to avoid creating a lot of confusion */
718             default_hnp_abort(jdata);
719         }
720         /* remove from dependent routes, if it is one */
721         orte_routed.route_lost(rtmod, proc);
722         break;
723 
724     case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
725         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
726                              "%s errmgr:hnp: unable to send message to proc %s",
727                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
728                              ORTE_NAME_PRINT(proc)));
729         /* if this proc is one of my daemons, then we are truly
730          * hosed - so just exit out
731          */
732         if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
733             ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
734             break;
735         }
736         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
737             /* abnormal termination - abort, but only do it once
738              * to avoid creating a lot of confusion */
739             default_hnp_abort(jdata);
740         }
741         break;
742 
743     case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
744         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
745                              "%s errmgr:hnp: no message path to proc %s",
746                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
747                              ORTE_NAME_PRINT(proc)));
748         orte_show_help("help-errmgr-base.txt", "no-path", true,
749                        orte_process_info.nodename, pptr->node->name);
750         /* if this proc is one of my daemons, then we are truly
751          * hosed - so just exit out
752          */
753         if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
754             ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
755             break;
756         }
757         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
758             /* abnormal termination - abort, but only do it once
759              * to avoid creating a lot of confusion */
760             default_hnp_abort(jdata);
761         }
762         break;
763 
764     case ORTE_PROC_STATE_FAILED_TO_CONNECT:
765         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
766                              "%s errmgr:hnp: cannot connect to proc %s",
767                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
768                              ORTE_NAME_PRINT(proc)));
769         orte_show_help("help-errmgr-base.txt", "no-connect", true,
770                        orte_process_info.nodename, pptr->node->name);
771         /* if this proc is one of my daemons, then we are truly
772          * hosed - so just exit out
773          */
774         if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
775             ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
776             break;
777         }
778         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
779             /* abnormal termination - abort, but only do it once
780              * to avoid creating a lot of confusion */
781             default_hnp_abort(jdata);
782         }
783         break;
784 
785     default:
786         /* shouldn't get this, but terminate job if required */
787         OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
788                              "%s errmgr:hnp: proc %s default error %s",
789                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
790                              ORTE_NAME_PRINT(proc),
791                              orte_proc_state_to_str(state)));
792         if (jdata->num_terminated == jdata->num_procs) {
793             ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
794         }
795         break;
796     }
797     /* if the waitpid fired, be sure to let the state machine know */
798     if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
799         ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
800     }
801 
802  cleanup:
803     OBJ_RELEASE(caddy);
804 }
805 
806 /*****************
807  * Local Functions
808  *****************/
/*
 * Abort handling for the HNP.
 *
 * Only the first caller gets past the abort-in-progress lock; any later
 * call just logs that an abort is already underway and returns.  The
 * first caller marks the run as terminating, disables recovery, flags an
 * abnormal termination when the aborting job is our own (daemon) job,
 * shows the "normal-termination-but" help message when the job carries
 * the non-zero-exit attribute, and finally asks the PLM to terminate the
 * orteds.
 *
 * @param jdata  job on whose behalf the abort is being ordered
 */
static void default_hnp_abort(orte_job_t *jdata)
{
    int32_t nonzero_exits = 0;
    int32_t *nonzero_exits_ptr = &nonzero_exits;
    const char *job_kind;
    const char *job_label;
    const char *exit_phrase;
    int rc;

    /* the trylock returns 1 if the lock is already held, which means
     * an abort is already being processed - nothing more to do here */
    if (0 != opal_atomic_trylock(&orte_abort_inprogress_lock)) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_hnp: abort in progress, ignoring abort on job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_hnp: abort called on job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* record that termination has been ordered and turn recovery off */
    orte_job_term_ordered = true;
    orte_enable_recovery = false;

    /* an abort of our own job (the daemon job) counts as an abnormal
     * termination; any other job is simply aborted cleanly */
    if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        orte_abnormal_term_ordered = true;
    }

    /* if the non-zero-exit attribute is present on this job, tell the
     * user how many procs were counted */
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT,
                           (void**)&nonzero_exits_ptr, OPAL_INT32)) {
        /* local jobid 1 is reported as the "Primary" job with no label;
         * everything else is a "Child" job labelled with its local jobid */
        if (1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
            job_kind = "Primary";
            job_label = "";
        } else {
            job_kind = "Child";
            job_label = ORTE_LOCAL_JOBID_PRINT(jdata->jobid);
        }

        /* singular vs. plural wording */
        if (1 == nonzero_exits) {
            exit_phrase = "process returned\na non-zero exit code";
        } else {
            exit_phrase = "processes returned\nnon-zero exit codes";
        }

        orte_show_help("help-errmgr-base.txt", "normal-termination-but", true,
                       job_kind, job_label, nonzero_exits, exit_phrase);
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_hnp: ordering orted termination",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* have the plm take down the orteds - they will automatically
     * kill their local procs on the way out */
    rc = orte_plm.terminate_orteds();
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
}
862